Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/backref.c           |  72
-rw-r--r--  fs/btrfs/backref.h           |   2
-rw-r--r--  fs/btrfs/ctree.c             | 118
-rw-r--r--  fs/btrfs/ctree.h             | 105
-rw-r--r--  fs/btrfs/delayed-inode.c     |  14
-rw-r--r--  fs/btrfs/dev-replace.c       |   6
-rw-r--r--  fs/btrfs/disk-io.c           | 483
-rw-r--r--  fs/btrfs/disk-io.h           |  32
-rw-r--r--  fs/btrfs/export.c            |   5
-rw-r--r--  fs/btrfs/extent-tree.c       | 315
-rw-r--r--  fs/btrfs/extent_io.c         |  41
-rw-r--r--  fs/btrfs/extent_io.h         |   1
-rw-r--r--  fs/btrfs/file-item.c         | 144
-rw-r--r--  fs/btrfs/file.c              | 150
-rw-r--r--  fs/btrfs/free-space-cache.c  | 103
-rw-r--r--  fs/btrfs/free-space-cache.h  |   2
-rw-r--r--  fs/btrfs/inode.c             | 501
-rw-r--r--  fs/btrfs/ioctl.c             |  74
-rw-r--r--  fs/btrfs/lzo.c               |   4
-rw-r--r--  fs/btrfs/ordered-data.c      | 128
-rw-r--r--  fs/btrfs/ordered-data.h      |  27
-rw-r--r--  fs/btrfs/qgroup.c            | 283
-rw-r--r--  fs/btrfs/relocation.c        | 102
-rw-r--r--  fs/btrfs/root-tree.c         | 201
-rw-r--r--  fs/btrfs/scrub.c             |  90
-rw-r--r--  fs/btrfs/send.c              | 235
-rw-r--r--  fs/btrfs/super.c             |  25
-rw-r--r--  fs/btrfs/transaction.c       | 322
-rw-r--r--  fs/btrfs/transaction.h       |  50
-rw-r--r--  fs/btrfs/tree-log.c          |  41
-rw-r--r--  fs/btrfs/ulist.c             |  15
-rw-r--r--  fs/btrfs/version.h           |   4
-rw-r--r--  fs/btrfs/volumes.c           | 351
-rw-r--r--  fs/btrfs/volumes.h           |   7
34 files changed, 2327 insertions, 1726 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
  * to a logical address
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
-				  int search_commit_root,
-				  u64 time_seq,
+				  struct btrfs_path *path, u64 time_seq,
 				  struct __prelim_ref *ref,
 				  struct ulist *parents,
 				  const u64 *extent_item_pos)
 {
-	struct btrfs_path *path;
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
 	struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	int root_level;
 	int level = ref->level;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
-
 	root_key.objectid = ref->root_id;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				time_seq, ref->wanted_disk_byte,
 				extent_item_pos);
 out:
-	btrfs_free_path(path);
+	path->lowest_level = 0;
+	btrfs_release_path(path);
 	return ret;
 }
 
@@ -322,7 +316,7 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   int search_commit_root, u64 time_seq,
+				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
 				   const u64 *extent_item_pos)
 {
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
-		err = __resolve_indirect_ref(fs_info, search_commit_root,
-					     time_seq, ref, parents,
-					     extent_item_pos);
+		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+					     parents, extent_item_pos);
 		if (err == -ENOMEM)
 			goto out;
 		if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	unsigned long ptr;
 	unsigned long end;
 	struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
+	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+		*info_level = found_key.offset;
 	} else {
 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
 	struct list_head prefs_delayed;
 	struct list_head prefs;
 	struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&prefs_delayed);
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = (u64)-1;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
+	if (!trans)
+		path->search_commit_root = 1;
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
-	if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+	if (trans) {
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
@@ -869,7 +870,8 @@ again:
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
-		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs);
 			if (ret)
@@ -890,8 +892,8 @@ again:
 
 	__merge_refs(&prefs, 1);
 
-	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
-				      &prefs, extent_item_pos);
+	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+				      extent_item_pos);
 	if (ret)
 		goto out;
 
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 {
 	int ret;
 	u64 flags;
+	u64 size = 0;
 	u32 item_size;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.objectid = logical;
 	key.offset = (u64)-1;
 
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		return ret;
 
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
-	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+		size = fs_info->extent_root->leafsize;
+	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+		size = found_key->offset;
+
+	if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+	     found_key->type != BTRFS_METADATA_ITEM_KEY) ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical) {
+	    found_key->objectid + size <= logical) {
 		pr_debug("logical %llu is not within any extent\n",
 			 (unsigned long long)logical);
 		return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			  iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct ulist *refs = NULL;
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	pr_debug("resolving all inodes for extent %llu\n",
 		 extent_item_objectid);
 
-	if (search_commit_root) {
-		trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
-	} else {
+	if (!search_commit_root) {
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
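A minimal sketch (not part of the patch) of the key setup the SKINNY_METADATA hunks above converge on: with the incompat bit set, the tree level is carried directly in the key offset of a BTRFS_METADATA_ITEM_KEY, so no btrfs_tree_block_info payload follows the extent item. The helper name is hypothetical; the body is assembled from find_parent_nodes() and extent_from_logical() above.

	static void init_extent_search_key(struct btrfs_fs_info *fs_info,
					   struct btrfs_key *key, u64 bytenr)
	{
		key->objectid = bytenr;
		/* search backwards from the highest possible offset */
		key->offset = (u64)-1;
		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
			key->type = BTRFS_METADATA_ITEM_KEY;
		else
			key->type = BTRFS_EXTENT_ITEM_KEY;
	}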
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
 #include "ulist.h"
 #include "extent_io.h"
 
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
 	struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 17dffe33e8d0..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		tree_mod_log_free_eb(root->fs_info, buf);
+		if (last_ref)
+			tree_mod_log_free_eb(root->fs_info, buf);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
 				      last_ref);
 	}
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
  * time_seq).
  */
 static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
-		      struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		      u64 time_seq, struct tree_mod_elem *first_tm)
 {
 	u32 n;
 	struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
 	n = btrfs_header_nritems(eb);
+	tree_mod_log_read_lock(fs_info);
 	while (tm && tm->seq >= time_seq) {
 		/*
 		 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		if (tm->index != first_tm->index)
 			break;
 	}
+	tree_mod_log_read_unlock(fs_info);
 	btrfs_set_header_nritems(eb, n);
 }
 
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
 	extent_buffer_get(eb_rewin);
 	btrfs_tree_read_lock(eb_rewin);
-	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
 		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_set_header_generation(eb, old_generation);
 	}
 	if (tm)
-		__tree_mod_log_rewind(eb, time_seq, tm);
+		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
 	}
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-				      struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+				       struct btrfs_path *path, int level)
 {
 	int slot;
 	int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	u64 gen;
 	u64 block1 = 0;
 	u64 block2 = 0;
-	int ret = 0;
 	int blocksize;
 
 	parent = path->nodes[level + 1];
 	if (!parent)
-		return 0;
+		return;
 
 	nritems = btrfs_header_nritems(parent);
 	slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@
 			block2 = 0;
 		free_extent_buffer(eb);
 	}
-	if (block1 || block2) {
-		ret = -EAGAIN;
-
-		/* release the whole path */
-		btrfs_release_path(path);
-
-		/* read the blocks */
-		if (block1)
-			readahead_tree_block(root, block1, blocksize, 0);
-		if (block2)
-			readahead_tree_block(root, block2, blocksize, 0);
 
-		if (block1) {
-			eb = read_tree_block(root, block1, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-		if (block2) {
-			eb = read_tree_block(root, block2, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-		return ret;
-	}
+	if (block1)
+		readahead_tree_block(root, block1, blocksize, 0);
+	if (block2)
+		readahead_tree_block(root, block2, blocksize, 0);
 }
 
 
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
-			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				/*
-				 * we found an up to date block without
-				 * sleeping, return
-				 * right away
-				 */
-				*eb_ret = tmp;
-				return 0;
-			}
-			/* the pages were up to date, but we failed
-			 * the generation number check. Do a full
-			 * read for the generation number that is correct.
-			 * We must do this without dropping locks so
-			 * we can trust our generation number
-			 */
-			free_extent_buffer(tmp);
-			btrfs_set_path_blocking(p);
+		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			*eb_ret = tmp;
+			return 0;
+		}
 
-			/* now we're allowed to do a blocking uptodate check */
-			tmp = read_tree_block(root, blocknr, blocksize, gen);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
-				*eb_ret = tmp;
-				return 0;
-			}
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return -EIO;
+		/* the pages were up to date, but we failed
+		 * the generation number check. Do a full
+		 * read for the generation number that is correct.
+		 * We must do this without dropping locks so
+		 * we can trust our generation number
+		 */
+		btrfs_set_path_blocking(p);
+
+		/* now we're allowed to do a blocking uptodate check */
+		ret = btrfs_read_buffer(tmp, gen);
+		if (!ret) {
+			*eb_ret = tmp;
+			return 0;
 		}
+		free_extent_buffer(tmp);
+		btrfs_release_path(p);
+		return -EIO;
 	}
 
 	/*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = split_node(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = balance_level(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
-			   struct btrfs_path *path, int level, int log_removal)
+			   struct btrfs_path *path, int level)
 {
 	u64 lower_gen;
 	struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(c);
 
 	old = root->node;
-	tree_mod_log_set_root_pointer(root, c, log_removal);
+	tree_mod_log_set_root_pointer(root, c, 0);
 	rcu_assign_pointer(root->node, c);
 
 	/* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		/*
 		 * trying to split the root, lets make a new one
 		 *
-		 * tree mod log: We pass 0 as log_removal parameter to
+		 * tree mod log: We don't log_removal old root in
 		 * insert_new_root, because that root buffer will be kept as a
 		 * normal node. We are going to log removal of half of the
 		 * elements below with tree_mod_log_eb_copy. We're holding a
 		 * tree lock on the buffer, which is why we cannot race with
 		 * other tree_mod_log users.
 		 */
-		ret = insert_new_root(trans, root, path, level + 1, 0);
+		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
 	} else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size) {
+	if (data_size && path->nodes[1]) {
 		wret = push_leaf_right(trans, root, path, data_size,
 				       data_size, 0, 0);
 		if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	}
 
 	if (!path->nodes[1]) {
-		ret = insert_new_root(trans, root, path, 1, 1);
+		ret = insert_new_root(trans, root, path, 1);
 		if (ret)
 			return ret;
 	}
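Since reada_for_balance() can no longer fail (it returns void and never drops the path), both setup_nodes_for_search() call sites above collapse to the same shape. A condensed sketch of that calling pattern, with error handling elided:

	btrfs_set_path_blocking(p);
	reada_for_balance(root, p, level);
	sret = split_node(trans, root, p, level);	/* or balance_level() */
	btrfs_clear_path_blocking(p, NULL, 0);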
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5    (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1ULL << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
 
 enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
 				   account */
 
 	/*
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter. However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is inc'ed everytime we call
+	 * btrfs_free_extent so it is a realtime count of what will be freed
+	 * once the transaction is committed. It will be zero'ed everytime the
+	 * transaction commits.
+	 */
+	struct percpu_counter total_bytes_pinned;
+
+	/*
 	 * we bump reservation progress every time we decrement
 	 * bytes_reserved. This way people waiting for reservations
 	 * know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
 	atomic_t open_ioctl_trans;
 
 	/*
-	 * this is used by the balancing code to wait for all the pending
-	 * ordered extents
+	 * this is used to protect the following list -- ordered_roots.
 	 */
-	spinlock_t ordered_extent_lock;
+	spinlock_t ordered_root_lock;
 
 	/*
-	 * all of the data=ordered extents pending writeback
+	 * all fs/file tree roots in which there are data=ordered extents
+	 * pending writeback are added into this list.
+	 *
 	 * these can span multiple transactions and basically include
 	 * every dirty data page that isn't from nodatacow
 	 */
-	struct list_head ordered_extents;
+	struct list_head ordered_roots;
 
-	spinlock_t delalloc_lock;
-	/*
-	 * all of the inodes that have delalloc bytes. It is possible for
-	 * this list to be empty even when there is still dirty data=ordered
-	 * extents waiting to finish IO.
-	 */
-	struct list_head delalloc_inodes;
+	spinlock_t delalloc_root_lock;
+	/* all fs/file tree roots that have delalloc inodes. */
+	struct list_head delalloc_roots;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	int enospc_unlink;
-	int trans_no_join;
 
 	u64 total_pinned;
 
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
 	struct rb_root qgroup_tree;
 	spinlock_t qgroup_lock;
 
+	/*
+	 * used to avoid frequently calling ulist_alloc()/ulist_free()
+	 * when doing qgroup accounting, it must be protected by qgroup_lock.
+	 */
+	struct ulist *qgroup_ulist;
+
 	/* protect user change for quota operations */
 	struct mutex qgroup_ioctl_lock;
 
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
 	struct btrfs_workers qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
+	struct btrfs_work qgroup_rescan_work;
 
 	/* filesystem state */
 	unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
 	int force_cow;
 
 	spinlock_t root_item_lock;
+	atomic_t refs;
+
+	spinlock_t delalloc_lock;
+	/*
+	 * all of the inodes that have delalloc bytes. It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
+	struct list_head delalloc_inodes;
+	struct list_head delalloc_root;
+	u64 nr_delalloc_inodes;
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_extents;
+	struct list_head ordered_root;
+	u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 		num_items;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+				 struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+	return (root->fs_info->sb->s_flags & MS_RDONLY ||
+		btrfs_fs_closing(root->fs_info));
+}
+
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
 		   struct btrfs_root_item *item);
 void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 			  struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-			 btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 			 struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
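The flat fs_info-wide delalloc/ordered lists become a two-level structure: fs_info->delalloc_roots/ordered_roots link the roots, and each btrfs_root now carries its own inode/extent lists, counters, and locks. Assuming only the fields declared above, a whole-fs walk would look roughly like this sketch (the helper name is hypothetical, not from the patch):

	static void for_each_ordered_root(struct btrfs_fs_info *fs_info,
					  void (*fn)(struct btrfs_root *root))
	{
		struct btrfs_root *root;

		spin_lock(&fs_info->ordered_root_lock);
		list_for_each_entry(root, &fs_info->ordered_roots, ordered_root)
			fn(root);	/* real callers would grab a root ref and
					 * drop the lock before blocking work */
		spin_unlock(&fs_info->ordered_root_lock);
	}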
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb34438ddedb..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
 	return next;
 }
 
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
-						   u64 root_id)
-{
-	struct btrfs_key root_key;
-
-	if (root->objectid == root_id)
-		return root;
-
-	root_key.objectid = root_id;
-	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	root_key.offset = (u64)-1;
-	return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 					       struct btrfs_root *root,
 					       struct btrfs_delayed_item *item)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_wait_all_ordered_extents(root->fs_info, 0);
 
 	/* force writing the updated state information to disk */
 	trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_delalloc_inodes(root, 0);
+	ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
 	}
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_wait_all_ordered_extents(root->fs_info, 0);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0292b3ead54..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1192 root->objectid = objectid; 1192 root->objectid = objectid;
1193 root->last_trans = 0; 1193 root->last_trans = 0;
1194 root->highest_objectid = 0; 1194 root->highest_objectid = 0;
1195 root->nr_delalloc_inodes = 0;
1196 root->nr_ordered_extents = 0;
1195 root->name = NULL; 1197 root->name = NULL;
1196 root->inode_tree = RB_ROOT; 1198 root->inode_tree = RB_ROOT;
1197 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1199 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1200,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1200 1202
1201 INIT_LIST_HEAD(&root->dirty_list); 1203 INIT_LIST_HEAD(&root->dirty_list);
1202 INIT_LIST_HEAD(&root->root_list); 1204 INIT_LIST_HEAD(&root->root_list);
1205 INIT_LIST_HEAD(&root->delalloc_inodes);
1206 INIT_LIST_HEAD(&root->delalloc_root);
1207 INIT_LIST_HEAD(&root->ordered_extents);
1208 INIT_LIST_HEAD(&root->ordered_root);
1203 INIT_LIST_HEAD(&root->logged_list[0]); 1209 INIT_LIST_HEAD(&root->logged_list[0]);
1204 INIT_LIST_HEAD(&root->logged_list[1]); 1210 INIT_LIST_HEAD(&root->logged_list[1]);
1205 spin_lock_init(&root->orphan_lock); 1211 spin_lock_init(&root->orphan_lock);
1206 spin_lock_init(&root->inode_lock); 1212 spin_lock_init(&root->inode_lock);
1213 spin_lock_init(&root->delalloc_lock);
1214 spin_lock_init(&root->ordered_extent_lock);
1207 spin_lock_init(&root->accounting_lock); 1215 spin_lock_init(&root->accounting_lock);
1208 spin_lock_init(&root->log_extents_lock[0]); 1216 spin_lock_init(&root->log_extents_lock[0]);
1209 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
@@ -1217,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1217 atomic_set(&root->log_writers, 0); 1225 atomic_set(&root->log_writers, 0);
1218 atomic_set(&root->log_batch, 0); 1226 atomic_set(&root->log_batch, 0);
1219 atomic_set(&root->orphan_inodes, 0); 1227 atomic_set(&root->orphan_inodes, 0);
1228 atomic_set(&root->refs, 1);
1220 root->log_transid = 0; 1229 root->log_transid = 0;
1221 root->last_log_commit = 0; 1230 root->last_log_commit = 0;
1222 extent_io_tree_init(&root->dirty_log_pages, 1231 extent_io_tree_init(&root->dirty_log_pages,
@@ -1235,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1235 spin_lock_init(&root->root_item_lock); 1244 spin_lock_init(&root->root_item_lock);
1236} 1245}
1237 1246
1238static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
1239 struct btrfs_fs_info *fs_info,
1240 u64 objectid,
1241 struct btrfs_root *root)
1242{
1243 int ret;
1244 u32 blocksize;
1245 u64 generation;
1246
1247 __setup_root(tree_root->nodesize, tree_root->leafsize,
1248 tree_root->sectorsize, tree_root->stripesize,
1249 root, fs_info, objectid);
1250 ret = btrfs_find_last_root(tree_root, objectid,
1251 &root->root_item, &root->root_key);
1252 if (ret > 0)
1253 return -ENOENT;
1254 else if (ret < 0)
1255 return ret;
1256
1257 generation = btrfs_root_generation(&root->root_item);
1258 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1259 root->commit_root = NULL;
1260 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1261 blocksize, generation);
1262 if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
1263 free_extent_buffer(root->node);
1264 root->node = NULL;
1265 return -EIO;
1266 }
1267 root->commit_root = btrfs_root_node(root);
1268 return 0;
1269}
1270
1271static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1247static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1272{ 1248{
1273 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1249 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1452,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1452 return 0; 1428 return 0;
1453} 1429}
1454 1430
1455struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1431struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1456 struct btrfs_key *location) 1432 struct btrfs_key *key)
1457{ 1433{
1458 struct btrfs_root *root; 1434 struct btrfs_root *root;
1459 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1435 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1460 struct btrfs_path *path; 1436 struct btrfs_path *path;
1461 struct extent_buffer *l;
1462 u64 generation; 1437 u64 generation;
1463 u32 blocksize; 1438 u32 blocksize;
1464 int ret = 0; 1439 int ret;
1465 int slot;
1466 1440
1467 root = btrfs_alloc_root(fs_info); 1441 path = btrfs_alloc_path();
1468 if (!root) 1442 if (!path)
1469 return ERR_PTR(-ENOMEM); 1443 return ERR_PTR(-ENOMEM);
1470 if (location->offset == (u64)-1) { 1444
1471 ret = find_and_setup_root(tree_root, fs_info, 1445 root = btrfs_alloc_root(fs_info);
1472 location->objectid, root); 1446 if (!root) {
1473 if (ret) { 1447 ret = -ENOMEM;
1474 kfree(root); 1448 goto alloc_fail;
1475 return ERR_PTR(ret);
1476 }
1477 goto out;
1478 } 1449 }
1479 1450
1480 __setup_root(tree_root->nodesize, tree_root->leafsize, 1451 __setup_root(tree_root->nodesize, tree_root->leafsize,
1481 tree_root->sectorsize, tree_root->stripesize, 1452 tree_root->sectorsize, tree_root->stripesize,
1482 root, fs_info, location->objectid); 1453 root, fs_info, key->objectid);
1483 1454
1484 path = btrfs_alloc_path(); 1455 ret = btrfs_find_root(tree_root, key, path,
1485 if (!path) { 1456 &root->root_item, &root->root_key);
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1490 if (ret == 0) {
1491 l = path->nodes[0];
1492 slot = path->slots[0];
1493 btrfs_read_root_item(l, slot, &root->root_item);
1494 memcpy(&root->root_key, location, sizeof(*location));
1495 }
1496 btrfs_free_path(path);
1497 if (ret) { 1457 if (ret) {
1498 kfree(root);
1499 if (ret > 0) 1458 if (ret > 0)
1500 ret = -ENOENT; 1459 ret = -ENOENT;
1501 return ERR_PTR(ret); 1460 goto find_fail;
1502 } 1461 }
1503 1462
1504 generation = btrfs_root_generation(&root->root_item); 1463 generation = btrfs_root_generation(&root->root_item);
1505 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1464 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1506 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1465 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1507 blocksize, generation); 1466 blocksize, generation);
1508 if (!root->node || !extent_buffer_uptodate(root->node)) { 1467 if (!root->node) {
1509 ret = (!root->node) ? -ENOMEM : -EIO; 1468 ret = -ENOMEM;
1510 1469 goto find_fail;
1511 free_extent_buffer(root->node); 1470 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1512 kfree(root); 1471 ret = -EIO;
1513 return ERR_PTR(ret); 1472 goto read_fail;
1514 } 1473 }
1515
1516 root->commit_root = btrfs_root_node(root); 1474 root->commit_root = btrfs_root_node(root);
1517out: 1475out:
1518 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1476 btrfs_free_path(path);
1477 return root;
1478
1479read_fail:
1480 free_extent_buffer(root->node);
1481find_fail:
1482 kfree(root);
1483alloc_fail:
1484 root = ERR_PTR(ret);
1485 goto out;
1486}
1487
1488struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1489 struct btrfs_key *location)
1490{
1491 struct btrfs_root *root;
1492
1493 root = btrfs_read_tree_root(tree_root, location);
1494 if (IS_ERR(root))
1495 return root;
1496
1497 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1519 root->ref_cows = 1; 1498 root->ref_cows = 1;
1520 btrfs_check_and_init_root_item(&root->root_item); 1499 btrfs_check_and_init_root_item(&root->root_item);
1521 } 1500 }
@@ -1523,6 +1502,66 @@ out:
1523 return root; 1502 return root;
1524} 1503}
1525 1504
1505int btrfs_init_fs_root(struct btrfs_root *root)
1506{
1507 int ret;
1508
1509 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1510 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1511 GFP_NOFS);
1512 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1513 ret = -ENOMEM;
1514 goto fail;
1515 }
1516
1517 btrfs_init_free_ino_ctl(root);
1518 mutex_init(&root->fs_commit_mutex);
1519 spin_lock_init(&root->cache_lock);
1520 init_waitqueue_head(&root->cache_wait);
1521
1522 ret = get_anon_bdev(&root->anon_dev);
1523 if (ret)
1524 goto fail;
1525 return 0;
1526fail:
1527 kfree(root->free_ino_ctl);
1528 kfree(root->free_ino_pinned);
1529 return ret;
1530}
1531
1532struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1533 u64 root_id)
1534{
1535 struct btrfs_root *root;
1536
1537 spin_lock(&fs_info->fs_roots_radix_lock);
1538 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1539 (unsigned long)root_id);
1540 spin_unlock(&fs_info->fs_roots_radix_lock);
1541 return root;
1542}
1543
1544int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1545 struct btrfs_root *root)
1546{
1547 int ret;
1548
1549 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1550 if (ret)
1551 return ret;
1552
1553 spin_lock(&fs_info->fs_roots_radix_lock);
1554 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1555 (unsigned long)root->root_key.objectid,
1556 root);
1557 if (ret == 0)
1558 root->in_radix = 1;
1559 spin_unlock(&fs_info->fs_roots_radix_lock);
1560 radix_tree_preload_end();
1561
1562 return ret;
1563}
1564
1526struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1565struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1527 struct btrfs_key *location) 1566 struct btrfs_key *location)
1528{ 1567{
@@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1543 return fs_info->quota_root ? fs_info->quota_root : 1582 return fs_info->quota_root ? fs_info->quota_root :
1544 ERR_PTR(-ENOENT); 1583 ERR_PTR(-ENOENT);
1545again: 1584again:
1546 spin_lock(&fs_info->fs_roots_radix_lock); 1585 root = btrfs_lookup_fs_root(fs_info, location->objectid);
1547 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1548 (unsigned long)location->objectid);
1549 spin_unlock(&fs_info->fs_roots_radix_lock);
1550 if (root) 1586 if (root)
1551 return root; 1587 return root;
1552 1588
1553 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1589 root = btrfs_read_fs_root(fs_info->tree_root, location);
1554 if (IS_ERR(root)) 1590 if (IS_ERR(root))
1555 return root; 1591 return root;
1556 1592
1557 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1593 if (btrfs_root_refs(&root->root_item) == 0) {
1558 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1594 ret = -ENOENT;
1559 GFP_NOFS);
1560 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1561 ret = -ENOMEM;
1562 goto fail; 1595 goto fail;
1563 } 1596 }
1564 1597
1565 btrfs_init_free_ino_ctl(root); 1598 ret = btrfs_init_fs_root(root);
1566 mutex_init(&root->fs_commit_mutex);
1567 spin_lock_init(&root->cache_lock);
1568 init_waitqueue_head(&root->cache_wait);
1569
1570 ret = get_anon_bdev(&root->anon_dev);
1571 if (ret) 1599 if (ret)
1572 goto fail; 1600 goto fail;
1573 1601
1574 if (btrfs_root_refs(&root->root_item) == 0) {
1575 ret = -ENOENT;
1576 goto fail;
1577 }
1578
1579 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1580 if (ret < 0) 1603 if (ret < 0)
1581 goto fail; 1604 goto fail;
1582 if (ret == 0) 1605 if (ret == 0)
1583 root->orphan_item_inserted = 1; 1606 root->orphan_item_inserted = 1;
1584 1607
1585 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1608 ret = btrfs_insert_fs_root(fs_info, root);
1586 if (ret)
1587 goto fail;
1588
1589 spin_lock(&fs_info->fs_roots_radix_lock);
1590 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1591 (unsigned long)root->root_key.objectid,
1592 root);
1593 if (ret == 0)
1594 root->in_radix = 1;
1595
1596 spin_unlock(&fs_info->fs_roots_radix_lock);
1597 radix_tree_preload_end();
1598 if (ret) { 1609 if (ret) {
1599 if (ret == -EEXIST) { 1610 if (ret == -EEXIST) {
1600 free_fs_root(root); 1611 free_fs_root(root);
@@ -1602,10 +1613,6 @@ again:
1602 } 1613 }
1603 goto fail; 1614 goto fail;
1604 } 1615 }
1605
1606 ret = btrfs_find_dead_roots(fs_info->tree_root,
1607 root->root_key.objectid);
1608 WARN_ON(ret);
1609 return root; 1616 return root;
1610fail: 1617fail:
1611 free_fs_root(root); 1618 free_fs_root(root);
@@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
1677static int cleaner_kthread(void *arg) 1684static int cleaner_kthread(void *arg)
1678{ 1685{
1679 struct btrfs_root *root = arg; 1686 struct btrfs_root *root = arg;
1687 int again;
1680 1688
1681 do { 1689 do {
1682 int again = 0; 1690 again = 0;
1683 1691
1684 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1692 /* Make the cleaner go to sleep early. */
1685 down_read_trylock(&root->fs_info->sb->s_umount)) { 1693 if (btrfs_need_cleaner_sleep(root))
1686 if (mutex_trylock(&root->fs_info->cleaner_mutex)) { 1694 goto sleep;
1687 btrfs_run_delayed_iputs(root); 1695
1688 again = btrfs_clean_one_deleted_snapshot(root); 1696 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1689 mutex_unlock(&root->fs_info->cleaner_mutex); 1697 goto sleep;
1690 } 1698
1691 btrfs_run_defrag_inodes(root->fs_info); 1699 /*
1692 up_read(&root->fs_info->sb->s_umount); 1700 * Avoid the problem that we change the status of the fs
1701 * during the above check and trylock.
1702 */
1703 if (btrfs_need_cleaner_sleep(root)) {
1704 mutex_unlock(&root->fs_info->cleaner_mutex);
1705 goto sleep;
1693 } 1706 }
1694 1707
1708 btrfs_run_delayed_iputs(root);
1709 again = btrfs_clean_one_deleted_snapshot(root);
1710 mutex_unlock(&root->fs_info->cleaner_mutex);
1711
1712 /*
1713 * The defragger has dealt with the R/O remount and umount,
1714 * needn't do anything special here.
1715 */
1716 btrfs_run_defrag_inodes(root->fs_info);
1717sleep:
1695 if (!try_to_freeze() && !again) { 1718 if (!try_to_freeze() && !again) {
1696 set_current_state(TASK_INTERRUPTIBLE); 1719 set_current_state(TASK_INTERRUPTIBLE);
1697 if (!kthread_should_stop()) 1720 if (!kthread_should_stop())
@@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg)
1725 } 1748 }
1726 1749
1727 now = get_seconds(); 1750 now = get_seconds();
1728 if (!cur->blocked && 1751 if (cur->state < TRANS_STATE_BLOCKED &&
1729 (now < cur->start_time || now - cur->start_time < 30)) { 1752 (now < cur->start_time || now - cur->start_time < 30)) {
1730 spin_unlock(&root->fs_info->trans_lock); 1753 spin_unlock(&root->fs_info->trans_lock);
1731 delay = HZ * 5; 1754 delay = HZ * 5;
@@ -2035,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2035 list_del(&gang[0]->root_list); 2058 list_del(&gang[0]->root_list);
2036 2059
2037 if (gang[0]->in_radix) { 2060 if (gang[0]->in_radix) {
2038 btrfs_free_fs_root(fs_info, gang[0]); 2061 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2039 } else { 2062 } else {
2040 free_extent_buffer(gang[0]->node); 2063 free_extent_buffer(gang[0]->node);
2041 free_extent_buffer(gang[0]->commit_root); 2064 free_extent_buffer(gang[0]->commit_root);
2042 kfree(gang[0]); 2065 btrfs_put_fs_root(gang[0]);
2043 } 2066 }
2044 } 2067 }
2045 2068
@@ -2050,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2050 if (!ret) 2073 if (!ret)
2051 break; 2074 break;
2052 for (i = 0; i < ret; i++) 2075 for (i = 0; i < ret; i++)
2053 btrfs_free_fs_root(fs_info, gang[i]); 2076 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2054 } 2077 }
2055} 2078}
2056 2079
@@ -2082,14 +2105,8 @@ int open_ctree(struct super_block *sb,
2082 int backup_index = 0; 2105 int backup_index = 0;
2083 2106
2084 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2107 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2085 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
2086 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
2087 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2108 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2088 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 2109 if (!tree_root || !chunk_root) {
2089 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
2090
2091 if (!tree_root || !extent_root || !csum_root ||
2092 !chunk_root || !dev_root || !quota_root) {
2093 err = -ENOMEM; 2110 err = -ENOMEM;
2094 goto fail; 2111 goto fail;
2095 } 2112 }
@@ -2132,9 +2149,9 @@ int open_ctree(struct super_block *sb,
2132 INIT_LIST_HEAD(&fs_info->trans_list); 2149 INIT_LIST_HEAD(&fs_info->trans_list);
2133 INIT_LIST_HEAD(&fs_info->dead_roots); 2150 INIT_LIST_HEAD(&fs_info->dead_roots);
2134 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2151 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2135 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2152 INIT_LIST_HEAD(&fs_info->delalloc_roots);
2136 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2153 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2137 spin_lock_init(&fs_info->delalloc_lock); 2154 spin_lock_init(&fs_info->delalloc_root_lock);
2138 spin_lock_init(&fs_info->trans_lock); 2155 spin_lock_init(&fs_info->trans_lock);
2139 spin_lock_init(&fs_info->fs_roots_radix_lock); 2156 spin_lock_init(&fs_info->fs_roots_radix_lock);
2140 spin_lock_init(&fs_info->delayed_iput_lock); 2157 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2170,7 +2187,6 @@ int open_ctree(struct super_block *sb,
2170 fs_info->max_inline = 8192 * 1024; 2187 fs_info->max_inline = 8192 * 1024;
2171 fs_info->metadata_ratio = 0; 2188 fs_info->metadata_ratio = 0;
2172 fs_info->defrag_inodes = RB_ROOT; 2189 fs_info->defrag_inodes = RB_ROOT;
2173 fs_info->trans_no_join = 0;
2174 fs_info->free_chunk_space = 0; 2190 fs_info->free_chunk_space = 0;
2175 fs_info->tree_mod_log = RB_ROOT; 2191 fs_info->tree_mod_log = RB_ROOT;
2176 2192
@@ -2181,8 +2197,8 @@ int open_ctree(struct super_block *sb,
2181 fs_info->thread_pool_size = min_t(unsigned long, 2197 fs_info->thread_pool_size = min_t(unsigned long,
2182 num_online_cpus() + 2, 8); 2198 num_online_cpus() + 2, 8);
2183 2199
2184 INIT_LIST_HEAD(&fs_info->ordered_extents); 2200 INIT_LIST_HEAD(&fs_info->ordered_roots);
2185 spin_lock_init(&fs_info->ordered_extent_lock); 2201 spin_lock_init(&fs_info->ordered_root_lock);
2186 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2202 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2187 GFP_NOFS); 2203 GFP_NOFS);
2188 if (!fs_info->delayed_root) { 2204 if (!fs_info->delayed_root) {
@@ -2275,6 +2291,7 @@ int open_ctree(struct super_block *sb,
2275 fs_info->qgroup_seq = 1; 2291 fs_info->qgroup_seq = 1;
2276 fs_info->quota_enabled = 0; 2292 fs_info->quota_enabled = 0;
2277 fs_info->pending_quota_state = 0; 2293 fs_info->pending_quota_state = 0;
2294 fs_info->qgroup_ulist = NULL;
2278 mutex_init(&fs_info->qgroup_rescan_lock); 2295 mutex_init(&fs_info->qgroup_rescan_lock);
2279 2296
2280 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2297 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2639,33 +2656,44 @@ retry_root_backup:
2639 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2656 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2640 tree_root->commit_root = btrfs_root_node(tree_root); 2657 tree_root->commit_root = btrfs_root_node(tree_root);
2641 2658
2642 ret = find_and_setup_root(tree_root, fs_info, 2659 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2643 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2660 location.type = BTRFS_ROOT_ITEM_KEY;
2644 if (ret) 2661 location.offset = 0;
2662
2663 extent_root = btrfs_read_tree_root(tree_root, &location);
2664 if (IS_ERR(extent_root)) {
2665 ret = PTR_ERR(extent_root);
2645 goto recovery_tree_root; 2666 goto recovery_tree_root;
2667 }
2646 extent_root->track_dirty = 1; 2668 extent_root->track_dirty = 1;
2669 fs_info->extent_root = extent_root;
2647 2670
2648 ret = find_and_setup_root(tree_root, fs_info, 2671 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2649 BTRFS_DEV_TREE_OBJECTID, dev_root); 2672 dev_root = btrfs_read_tree_root(tree_root, &location);
2650 if (ret) 2673 if (IS_ERR(dev_root)) {
2674 ret = PTR_ERR(dev_root);
2651 goto recovery_tree_root; 2675 goto recovery_tree_root;
2676 }
2652 dev_root->track_dirty = 1; 2677 dev_root->track_dirty = 1;
2678 fs_info->dev_root = dev_root;
2679 btrfs_init_devices_late(fs_info);
2653 2680
2654 ret = find_and_setup_root(tree_root, fs_info, 2681 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2655 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2682 csum_root = btrfs_read_tree_root(tree_root, &location);
2656 if (ret) 2683 if (IS_ERR(csum_root)) {
2684 ret = PTR_ERR(csum_root);
2657 goto recovery_tree_root; 2685 goto recovery_tree_root;
2686 }
2658 csum_root->track_dirty = 1; 2687 csum_root->track_dirty = 1;
2688 fs_info->csum_root = csum_root;
2659 2689
2660 ret = find_and_setup_root(tree_root, fs_info, 2690 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2661 BTRFS_QUOTA_TREE_OBJECTID, quota_root); 2691 quota_root = btrfs_read_tree_root(tree_root, &location);
2662 if (ret) { 2692 if (!IS_ERR(quota_root)) {
2663 kfree(quota_root);
2664 quota_root = fs_info->quota_root = NULL;
2665 } else {
2666 quota_root->track_dirty = 1; 2693 quota_root->track_dirty = 1;
2667 fs_info->quota_enabled = 1; 2694 fs_info->quota_enabled = 1;
2668 fs_info->pending_quota_state = 1; 2695 fs_info->pending_quota_state = 1;
2696 fs_info->quota_root = quota_root;
2669 } 2697 }
2670 2698
2671 fs_info->generation = generation; 2699 fs_info->generation = generation;
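The hunk above converts tree-root loading from find_and_setup_root(), which returned an int and filled an out-parameter, to btrfs_read_tree_root(), which returns either the root or an errno encoded in the pointer. A minimal user-space sketch of that ERR_PTR/IS_ERR/PTR_ERR idiom, with hypothetical names and a simplified encoding, not the kernel implementation:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errnos live in the last page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct tree_root { int objectid; };

/* Returns a root on success, or an encoded errno on failure. */
static struct tree_root *read_tree_root(int objectid)
{
	struct tree_root *root;

	if (objectid < 0)
		return ERR_PTR(-ENOENT);
	root = malloc(sizeof(*root));
	if (!root)
		return ERR_PTR(-ENOMEM);
	root->objectid = objectid;
	return root;
}

int main(void)
{
	struct tree_root *root = read_tree_root(-1);

	if (IS_ERR(root))
		printf("read failed: %ld\n", PTR_ERR(root)); /* -ENOENT */
	else
		free(root);
	return 0;
}

One return value then carries both outcomes, which is why the quota-root branch above can simply test !IS_ERR(quota_root) and treat a missing quota tree as "quotas disabled" rather than as a mount failure.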
@@ -2818,11 +2846,9 @@ retry_root_backup:
2818 2846
2819 location.objectid = BTRFS_FS_TREE_OBJECTID; 2847 location.objectid = BTRFS_FS_TREE_OBJECTID;
2820 location.type = BTRFS_ROOT_ITEM_KEY; 2848 location.type = BTRFS_ROOT_ITEM_KEY;
2821 location.offset = (u64)-1; 2849 location.offset = 0;
2822 2850
2823 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2851 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2824 if (!fs_info->fs_root)
2825 goto fail_qgroup;
2826 if (IS_ERR(fs_info->fs_root)) { 2852 if (IS_ERR(fs_info->fs_root)) {
2827 err = PTR_ERR(fs_info->fs_root); 2853 err = PTR_ERR(fs_info->fs_root);
2828 goto fail_qgroup; 2854 goto fail_qgroup;
@@ -2854,6 +2880,8 @@ retry_root_backup:
2854 return ret; 2880 return ret;
2855 } 2881 }
2856 2882
2883 btrfs_qgroup_rescan_resume(fs_info);
2884
2857 return 0; 2885 return 0;
2858 2886
2859fail_qgroup: 2887fail_qgroup:
@@ -3259,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3259 BTRFS_BLOCK_GROUP_RAID10)) { 3287 BTRFS_BLOCK_GROUP_RAID10)) {
3260 num_tolerated_disk_barrier_failures = 1; 3288 num_tolerated_disk_barrier_failures = 1;
3261 } else if (flags & 3289 } else if (flags &
3262 BTRFS_BLOCK_GROUP_RAID5) { 3290 BTRFS_BLOCK_GROUP_RAID6) {
3263 num_tolerated_disk_barrier_failures = 2; 3291 num_tolerated_disk_barrier_failures = 2;
3264 } 3292 }
3265 } 3293 }
@@ -3367,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
3367 return ret; 3395 return ret;
3368} 3396}
3369 3397
3370void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3398/* Drop a fs root from the radix tree and free it. */
3399void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3400 struct btrfs_root *root)
3371{ 3401{
3372 spin_lock(&fs_info->fs_roots_radix_lock); 3402 spin_lock(&fs_info->fs_roots_radix_lock);
3373 radix_tree_delete(&fs_info->fs_roots_radix, 3403 radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3398,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
3398 kfree(root->free_ino_ctl); 3428 kfree(root->free_ino_ctl);
3399 kfree(root->free_ino_pinned); 3429 kfree(root->free_ino_pinned);
3400 kfree(root->name); 3430 kfree(root->name);
3401 kfree(root); 3431 btrfs_put_fs_root(root);
3432}
3433
3434void btrfs_free_fs_root(struct btrfs_root *root)
3435{
3436 free_fs_root(root);
3402} 3437}
3403 3438
3404int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 3439int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3654,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3654 INIT_LIST_HEAD(&splice); 3689 INIT_LIST_HEAD(&splice);
3655 3690
3656 mutex_lock(&root->fs_info->ordered_operations_mutex); 3691 mutex_lock(&root->fs_info->ordered_operations_mutex);
3657 spin_lock(&root->fs_info->ordered_extent_lock); 3692 spin_lock(&root->fs_info->ordered_root_lock);
3658 3693
3659 list_splice_init(&t->ordered_operations, &splice); 3694 list_splice_init(&t->ordered_operations, &splice);
3660 while (!list_empty(&splice)) { 3695 while (!list_empty(&splice)) {
@@ -3662,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3662 ordered_operations); 3697 ordered_operations);
3663 3698
3664 list_del_init(&btrfs_inode->ordered_operations); 3699 list_del_init(&btrfs_inode->ordered_operations);
3665 spin_unlock(&root->fs_info->ordered_extent_lock); 3700 spin_unlock(&root->fs_info->ordered_root_lock);
3666 3701
3667 btrfs_invalidate_inodes(btrfs_inode->root); 3702 btrfs_invalidate_inodes(btrfs_inode->root);
3668 3703
3669 spin_lock(&root->fs_info->ordered_extent_lock); 3704 spin_lock(&root->fs_info->ordered_root_lock);
3670 } 3705 }
3671 3706
3672 spin_unlock(&root->fs_info->ordered_extent_lock); 3707 spin_unlock(&root->fs_info->ordered_root_lock);
3673 mutex_unlock(&root->fs_info->ordered_operations_mutex); 3708 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3674} 3709}
3675 3710
@@ -3677,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3677{ 3712{
3678 struct btrfs_ordered_extent *ordered; 3713 struct btrfs_ordered_extent *ordered;
3679 3714
3680 spin_lock(&root->fs_info->ordered_extent_lock); 3715 spin_lock(&root->ordered_extent_lock);
3681 /* 3716 /*
3682 * This will just short circuit the ordered completion stuff which will 3717 * This will just short circuit the ordered completion stuff which will
3683 * make sure the ordered extent gets properly cleaned up. 3718 * make sure the ordered extent gets properly cleaned up.
3684 */ 3719 */
3685 list_for_each_entry(ordered, &root->fs_info->ordered_extents, 3720 list_for_each_entry(ordered, &root->ordered_extents,
3686 root_extent_list) 3721 root_extent_list)
3687 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 3722 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3688 spin_unlock(&root->fs_info->ordered_extent_lock); 3723 spin_unlock(&root->ordered_extent_lock);
3724}
3725
3726static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3727{
3728 struct btrfs_root *root;
3729 struct list_head splice;
3730
3731 INIT_LIST_HEAD(&splice);
3732
3733 spin_lock(&fs_info->ordered_root_lock);
3734 list_splice_init(&fs_info->ordered_roots, &splice);
3735 while (!list_empty(&splice)) {
3736 root = list_first_entry(&splice, struct btrfs_root,
3737 ordered_root);
3738 list_del_init(&root->ordered_root);
3739
3740 btrfs_destroy_ordered_extents(root);
3741
3742 cond_resched_lock(&fs_info->ordered_root_lock);
3743 }
3744 spin_unlock(&fs_info->ordered_root_lock);
3689} 3745}
3690 3746
3691int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 3747int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
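The new btrfs_destroy_all_ordered_extents() uses the splice-then-drain shape that recurs throughout this patch: move the whole shared list onto a private head under the lock, then detach and process one entry at a time. A minimal pthreads sketch of the pattern, with hypothetical node names and the per-entry work stubbed out:

#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_head;	/* guarded by list_lock */

static void drain_all(void)
{
	struct node *splice, *n;

	pthread_mutex_lock(&list_lock);
	splice = shared_head;		/* "list_splice_init" */
	shared_head = NULL;
	while (splice) {
		n = splice;		/* "list_first_entry" */
		splice = n->next;
		n->next = NULL;		/* "list_del_init" */

		/* the lock can be released per entry, as the kernel
		 * code does via cond_resched_lock() */
		pthread_mutex_unlock(&list_lock);
		printf("processing node %d\n", n->id);
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct node b = { NULL, 2 }, a = { &b, 1 };

	shared_head = &a;
	drain_all();
	return 0;
}

New producers that race with the drain land on the now-empty shared head, so the teardown loop never chases a list that keeps growing under it.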
@@ -3707,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3707 3763
3708 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3764 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3709 struct btrfs_delayed_ref_head *head = NULL; 3765 struct btrfs_delayed_ref_head *head = NULL;
3766 bool pin_bytes = false;
3710 3767
3711 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3768 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3712 atomic_set(&ref->refs, 1); 3769 atomic_set(&ref->refs, 1);
@@ -3727,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3727 } 3784 }
3728 3785
3729 if (head->must_insert_reserved) 3786 if (head->must_insert_reserved)
3730 btrfs_pin_extent(root, ref->bytenr, 3787 pin_bytes = true;
3731 ref->num_bytes, 1);
3732 btrfs_free_delayed_extent_op(head->extent_op); 3788 btrfs_free_delayed_extent_op(head->extent_op);
3733 delayed_refs->num_heads--; 3789 delayed_refs->num_heads--;
3734 if (list_empty(&head->cluster)) 3790 if (list_empty(&head->cluster))
@@ -3739,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3739 ref->in_tree = 0; 3795 ref->in_tree = 0;
3740 rb_erase(&ref->rb_node, &delayed_refs->root); 3796 rb_erase(&ref->rb_node, &delayed_refs->root);
3741 delayed_refs->num_entries--; 3797 delayed_refs->num_entries--;
3742 if (head)
3743 mutex_unlock(&head->mutex);
3744 spin_unlock(&delayed_refs->lock); 3798 spin_unlock(&delayed_refs->lock);
3799 if (head) {
3800 if (pin_bytes)
3801 btrfs_pin_extent(root, ref->bytenr,
3802 ref->num_bytes, 1);
3803 mutex_unlock(&head->mutex);
3804 }
3745 btrfs_put_delayed_ref(ref); 3805 btrfs_put_delayed_ref(ref);
3746 3806
3747 cond_resched(); 3807 cond_resched();
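The reworked delayed-ref teardown above only records the decision (pin_bytes) while delayed_refs->lock is held and calls btrfs_pin_extent() after spin_unlock(), since pinning may take other locks. A generic user-space sketch of that "decide under the lock, act after dropping it" shape, with hypothetical names and a plain mutex standing in for the spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int reserved_heads = 1;		/* guarded by lock */

static void pin_extent(void)		/* may block; never call locked */
{
	printf("pinning extent\n");
}

static void cleanup_one(void)
{
	bool pin_bytes = false;

	pthread_mutex_lock(&lock);
	if (reserved_heads > 0) {
		reserved_heads--;
		pin_bytes = true;	/* only the decision crosses over */
	}
	pthread_mutex_unlock(&lock);

	if (pin_bytes)			/* heavy work outside the lock */
		pin_extent();
}

int main(void)
{
	cleanup_one();
	return 0;
}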
@@ -3778,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3778 3838
3779 INIT_LIST_HEAD(&splice); 3839 INIT_LIST_HEAD(&splice);
3780 3840
3781 spin_lock(&root->fs_info->delalloc_lock); 3841 spin_lock(&root->delalloc_lock);
3782 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 3842 list_splice_init(&root->delalloc_inodes, &splice);
3783 3843
3784 while (!list_empty(&splice)) { 3844 while (!list_empty(&splice)) {
3785 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3845 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3786 delalloc_inodes); 3846 delalloc_inodes);
3787 3847
3788 list_del_init(&btrfs_inode->delalloc_inodes); 3848 list_del_init(&btrfs_inode->delalloc_inodes);
3789 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 3849 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3790 &btrfs_inode->runtime_flags); 3850 &btrfs_inode->runtime_flags);
3791 spin_unlock(&root->fs_info->delalloc_lock); 3851 spin_unlock(&root->delalloc_lock);
3792 3852
3793 btrfs_invalidate_inodes(btrfs_inode->root); 3853 btrfs_invalidate_inodes(btrfs_inode->root);
3794 3854
3795 spin_lock(&root->fs_info->delalloc_lock); 3855 spin_lock(&root->delalloc_lock);
3796 } 3856 }
3797 3857
3798 spin_unlock(&root->fs_info->delalloc_lock); 3858 spin_unlock(&root->delalloc_lock);
3859}
3860
3861static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3862{
3863 struct btrfs_root *root;
3864 struct list_head splice;
3865
3866 INIT_LIST_HEAD(&splice);
3867
3868 spin_lock(&fs_info->delalloc_root_lock);
3869 list_splice_init(&fs_info->delalloc_roots, &splice);
3870 while (!list_empty(&splice)) {
3871 root = list_first_entry(&splice, struct btrfs_root,
3872 delalloc_root);
3873 list_del_init(&root->delalloc_root);
3874 root = btrfs_grab_fs_root(root);
3875 BUG_ON(!root);
3876 spin_unlock(&fs_info->delalloc_root_lock);
3877
3878 btrfs_destroy_delalloc_inodes(root);
3879 btrfs_put_fs_root(root);
3880
3881 spin_lock(&fs_info->delalloc_root_lock);
3882 }
3883 spin_unlock(&fs_info->delalloc_root_lock);
3799} 3884}
3800 3885
3801static int btrfs_destroy_marked_extents(struct btrfs_root *root, 3886static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3879,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3879 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 3964 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
3880 cur_trans->dirty_pages.dirty_bytes); 3965 cur_trans->dirty_pages.dirty_bytes);
3881 3966
3882 /* FIXME: cleanup wait for commit */ 3967 cur_trans->state = TRANS_STATE_COMMIT_START;
3883 cur_trans->in_commit = 1;
3884 cur_trans->blocked = 1;
3885 wake_up(&root->fs_info->transaction_blocked_wait); 3968 wake_up(&root->fs_info->transaction_blocked_wait);
3886 3969
3887 btrfs_evict_pending_snapshots(cur_trans); 3970 btrfs_evict_pending_snapshots(cur_trans);
3888 3971
3889 cur_trans->blocked = 0; 3972 cur_trans->state = TRANS_STATE_UNBLOCKED;
3890 wake_up(&root->fs_info->transaction_wait); 3973 wake_up(&root->fs_info->transaction_wait);
3891 3974
3892 cur_trans->commit_done = 1;
3893 wake_up(&cur_trans->commit_wait);
3894
3895 btrfs_destroy_delayed_inodes(root); 3975 btrfs_destroy_delayed_inodes(root);
3896 btrfs_assert_delayed_root_empty(root); 3976 btrfs_assert_delayed_root_empty(root);
3897 3977
@@ -3900,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3900 btrfs_destroy_pinned_extent(root, 3980 btrfs_destroy_pinned_extent(root,
3901 root->fs_info->pinned_extents); 3981 root->fs_info->pinned_extents);
3902 3982
 3983 cur_trans->state = TRANS_STATE_COMPLETED;
3984 wake_up(&cur_trans->commit_wait);
3985
3903 /* 3986 /*
3904 memset(cur_trans, 0, sizeof(*cur_trans)); 3987 memset(cur_trans, 0, sizeof(*cur_trans));
3905 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 3988 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3915,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3915 3998
3916 spin_lock(&root->fs_info->trans_lock); 3999 spin_lock(&root->fs_info->trans_lock);
3917 list_splice_init(&root->fs_info->trans_list, &list); 4000 list_splice_init(&root->fs_info->trans_list, &list);
3918 root->fs_info->trans_no_join = 1; 4001 root->fs_info->running_transaction = NULL;
3919 spin_unlock(&root->fs_info->trans_lock); 4002 spin_unlock(&root->fs_info->trans_lock);
3920 4003
3921 while (!list_empty(&list)) { 4004 while (!list_empty(&list)) {
@@ -3923,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3923 4006
3924 btrfs_destroy_ordered_operations(t, root); 4007 btrfs_destroy_ordered_operations(t, root);
3925 4008
3926 btrfs_destroy_ordered_extents(root); 4009 btrfs_destroy_all_ordered_extents(root->fs_info);
3927 4010
3928 btrfs_destroy_delayed_refs(t, root); 4011 btrfs_destroy_delayed_refs(t, root);
3929 4012
3930 /* FIXME: cleanup wait for commit */ 4013 /*
3931 t->in_commit = 1; 4014 * FIXME: cleanup wait for commit
 3932 t->blocked = 1; 4015 * We needn't acquire the lock here: we are in the middle of
 4016 * umount, so no other task can change the state.
4017 */
4018 t->state = TRANS_STATE_COMMIT_START;
3933 smp_mb(); 4019 smp_mb();
3934 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 4020 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3935 wake_up(&root->fs_info->transaction_blocked_wait); 4021 wake_up(&root->fs_info->transaction_blocked_wait);
3936 4022
3937 btrfs_evict_pending_snapshots(t); 4023 btrfs_evict_pending_snapshots(t);
3938 4024
3939 t->blocked = 0; 4025 t->state = TRANS_STATE_UNBLOCKED;
3940 smp_mb(); 4026 smp_mb();
3941 if (waitqueue_active(&root->fs_info->transaction_wait)) 4027 if (waitqueue_active(&root->fs_info->transaction_wait))
3942 wake_up(&root->fs_info->transaction_wait); 4028 wake_up(&root->fs_info->transaction_wait);
3943 4029
3944 t->commit_done = 1;
3945 smp_mb();
3946 if (waitqueue_active(&t->commit_wait))
3947 wake_up(&t->commit_wait);
3948
3949 btrfs_destroy_delayed_inodes(root); 4030 btrfs_destroy_delayed_inodes(root);
3950 btrfs_assert_delayed_root_empty(root); 4031 btrfs_assert_delayed_root_empty(root);
3951 4032
3952 btrfs_destroy_delalloc_inodes(root); 4033 btrfs_destroy_all_delalloc_inodes(root->fs_info);
3953
3954 spin_lock(&root->fs_info->trans_lock);
3955 root->fs_info->running_transaction = NULL;
3956 spin_unlock(&root->fs_info->trans_lock);
3957 4034
3958 btrfs_destroy_marked_extents(root, &t->dirty_pages, 4035 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3959 EXTENT_DIRTY); 4036 EXTENT_DIRTY);
@@ -3961,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3961 btrfs_destroy_pinned_extent(root, 4038 btrfs_destroy_pinned_extent(root,
3962 root->fs_info->pinned_extents); 4039 root->fs_info->pinned_extents);
3963 4040
4041 t->state = TRANS_STATE_COMPLETED;
4042 smp_mb();
4043 if (waitqueue_active(&t->commit_wait))
4044 wake_up(&t->commit_wait);
4045
3964 atomic_set(&t->use_count, 0); 4046 atomic_set(&t->use_count, 0);
3965 list_del_init(&t->list); 4047 list_del_init(&t->list);
3966 memset(t, 0, sizeof(*t)); 4048 memset(t, 0, sizeof(*t));
3967 kmem_cache_free(btrfs_transaction_cachep, t); 4049 kmem_cache_free(btrfs_transaction_cachep, t);
3968 } 4050 }
3969 4051
3970 spin_lock(&root->fs_info->trans_lock);
3971 root->fs_info->trans_no_join = 0;
3972 spin_unlock(&root->fs_info->trans_lock);
3973 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 4052 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3974 4053
3975 return 0; 4054 return 0;
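Both cleanup paths above replace the scattered in_commit/blocked/commit_done flags with a single state field stepped through TRANS_STATE_COMMIT_START, TRANS_STATE_UNBLOCKED, and TRANS_STATE_COMPLETED, waking waiters at each transition. A user-space model of such a monotonic state machine, using a condition variable in place of the kernel wait queues (the TRANS_STATE_* names are taken from the diff; TRANS_STATE_RUNNING is an assumed initial state):

#include <pthread.h>
#include <stdio.h>

enum trans_state {
	TRANS_STATE_RUNNING,
	TRANS_STATE_COMMIT_START,
	TRANS_STATE_UNBLOCKED,
	TRANS_STATE_COMPLETED,
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static enum trans_state state = TRANS_STATE_RUNNING;

static void set_state(enum trans_state new_state)
{
	pthread_mutex_lock(&lock);
	state = new_state;
	pthread_cond_broadcast(&cond);	/* "wake_up(...)" */
	pthread_mutex_unlock(&lock);
}

/* A waiter needs one ordered predicate instead of three flags. */
static void wait_for_state(enum trans_state wanted)
{
	pthread_mutex_lock(&lock);
	while (state < wanted)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	set_state(TRANS_STATE_COMMIT_START);
	set_state(TRANS_STATE_UNBLOCKED);
	set_state(TRANS_STATE_COMPLETED);
	wait_for_state(TRANS_STATE_COMPLETED);
	printf("commit complete\n");
	return 0;
}

Because the states are ordered, a waiter for UNBLOCKED is also satisfied by COMPLETED, which the old three-flag scheme could only express with careful flag-setting order.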
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr, u32 blocksize);
66struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root);
68struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 71struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
69 struct btrfs_key *location); 72 struct btrfs_key *location);
70int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 73int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
71void btrfs_btree_balance_dirty(struct btrfs_root *root); 74void btrfs_btree_balance_dirty(struct btrfs_root *root);
72void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 75void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
73void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 76void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
77 struct btrfs_root *root);
78void btrfs_free_fs_root(struct btrfs_root *root);
79
80/*
81 * This function is used to grab the root and keep it from being freed
82 * while we access it. But it doesn't ensure that the tree is not dropped.
83 *
84 * If you want to ensure the whole tree is safe, you should use
85 * fs_info->subvol_srcu
86 */
87static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
88{
89 if (atomic_inc_not_zero(&root->refs))
90 return root;
91 return NULL;
92}
93
94static inline void btrfs_put_fs_root(struct btrfs_root *root)
95{
96 if (atomic_dec_and_test(&root->refs))
97 kfree(root);
98}
99
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 100void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 101int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
76 int atomic); 102 int atomic);
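The btrfs_grab_fs_root()/btrfs_put_fs_root() pair above is the classic inc-not-zero refcount: a lookup may only take a new reference while the count is still non-zero, so a root that has started dying cannot be resurrected, and the last put frees it. A runnable user-space sketch of the same idiom with C11 atomics (simplified; the kernel uses atomic_inc_not_zero() and its allocator):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct root {
	atomic_int refs;
};

static struct root *grab_root(struct root *root)
{
	int old = atomic_load(&root->refs);

	/* increment unless the count already reached zero */
	while (old != 0) {
		if (atomic_compare_exchange_weak(&root->refs, &old, old + 1))
			return root;
	}
	return NULL;	/* object is dying; caller must not use it */
}

static void put_root(struct root *root)
{
	if (atomic_fetch_sub(&root->refs, 1) == 1)	/* dropped last ref */
		free(root);
}

int main(void)
{
	struct root *r = malloc(sizeof(*r));

	atomic_init(&r->refs, 1);
	if (grab_root(r))
		put_root(r);	/* release the grabbed reference */
	put_root(r);		/* release the initial reference; frees */
	printf("done\n");
	return 0;
}

As the header comment notes, this only pins the struct itself; protecting the tree contents across a drop still needs fs_info->subvol_srcu.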
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 goto fail; 82 goto fail;
83 } 83 }
84 84
85 if (btrfs_root_refs(&root->root_item) == 0) {
86 err = -ENOENT;
87 goto fail;
88 }
89
90 key.objectid = objectid; 85 key.objectid = objectid;
91 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 86 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
92 key.offset = 0; 87 key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..0236de711989 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h>
27#include "compat.h" 28#include "compat.h"
28#include "hash.h" 29#include "hash.h"
29#include "ctree.h" 30#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2526 return 0; 2527 return 0;
2527} 2528}
2528 2529
2530static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2531{
2532 u64 num_bytes;
2533
2534 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2535 sizeof(struct btrfs_extent_inline_ref));
2536 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2537 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2538
2539 /*
2540 * We don't ever fill up leaves all the way so multiply by 2 just to be
2541 * closer to what we're really going to want to ouse.
2542 */
2543 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2544}
2545
2546int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2547 struct btrfs_root *root)
2548{
2549 struct btrfs_block_rsv *global_rsv;
2550 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2551 u64 num_bytes;
2552 int ret = 0;
2553
2554 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2555 num_heads = heads_to_leaves(root, num_heads);
2556 if (num_heads > 1)
2557 num_bytes += (num_heads - 1) * root->leafsize;
2558 num_bytes <<= 1;
2559 global_rsv = &root->fs_info->global_block_rsv;
2560
2561 /*
2562 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2563 * wiggle room since running delayed refs can create more delayed refs.
2564 */
2565 if (global_rsv->space_info->full)
2566 num_bytes <<= 1;
2567
2568 spin_lock(&global_rsv->lock);
2569 if (global_rsv->reserved <= num_bytes)
2570 ret = 1;
2571 spin_unlock(&global_rsv->lock);
2572 return ret;
2573}
2574
2529/* 2575/*
2530 * this starts processing the delayed reference count updates and 2576 * this starts processing the delayed reference count updates and
2531 * extent insertions we have queued up so far. count can be 2577 * extent insertions we have queued up so far. count can be
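heads_to_leaves() above estimates how many leaves the queued delayed-ref heads will dirty: bytes per head times the number of heads, divided by the usable leaf size. (Note the comment mentions a 2x fudge factor while the code as shown divides by the full leaf size; the sketch below mirrors the code.) A worked example with illustrative struct sizes, which are assumptions rather than values read from a real superblock:

#include <stdint.h>
#include <stdio.h>

#define EXTENT_ITEM_SIZE	24	/* assumed sizeof(btrfs_extent_item) */
#define INLINE_REF_SIZE		9	/* assumed sizeof(btrfs_extent_inline_ref) */
#define TREE_BLOCK_INFO_SIZE	24	/* assumed; non-skinny metadata only */
#define LEAF_DATA_SIZE		16000	/* roughly BTRFS_LEAF_DATA_SIZE */

static uint64_t heads_to_leaves(uint64_t heads, int skinny_metadata)
{
	uint64_t num_bytes = heads * (EXTENT_ITEM_SIZE + INLINE_REF_SIZE);

	if (!skinny_metadata)
		num_bytes += heads * TREE_BLOCK_INFO_SIZE;
	return num_bytes / LEAF_DATA_SIZE;
}

int main(void)
{
	/* 10000 ready heads, no skinny metadata:
	 * 10000 * (24 + 9 + 24) = 570000 bytes -> 35 leaves */
	printf("%llu leaves\n",
	       (unsigned long long)heads_to_leaves(10000, 0));
	return 0;
}

btrfs_should_throttle_delayed_refs() then converts that leaf count into a byte reservation (doubling it, and doubling again when no new chunks can be allocated) and throttles once the global reserve could no longer cover it, which replaces the old fixed cutoffs of 16348 entries and 64 ready heads.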
@@ -2573,7 +2619,8 @@ progress:
2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2619 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 if (old) { 2620 if (old) {
2575 DEFINE_WAIT(__wait); 2621 DEFINE_WAIT(__wait);
2576 if (delayed_refs->num_entries < 16348) 2622 if (delayed_refs->flushing ||
2623 !btrfs_should_throttle_delayed_refs(trans, root))
2577 return 0; 2624 return 0;
2578 2625
2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2626 prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
2608 2655
2609 while (1) { 2656 while (1) {
2610 if (!(run_all || run_most) && 2657 if (!(run_all || run_most) &&
2611 delayed_refs->num_heads_ready < 64) 2658 !btrfs_should_throttle_delayed_refs(trans, root))
2612 break; 2659 break;
2613 2660
2614 /* 2661 /*
@@ -2629,6 +2676,7 @@ again:
2629 spin_unlock(&delayed_refs->lock); 2676 spin_unlock(&delayed_refs->lock);
2630 btrfs_abort_transaction(trans, root, ret); 2677 btrfs_abort_transaction(trans, root, ret);
2631 atomic_dec(&delayed_refs->procs_running_refs); 2678 atomic_dec(&delayed_refs->procs_running_refs);
2679 wake_up(&delayed_refs->wait);
2632 return ret; 2680 return ret;
2633 } 2681 }
2634 2682
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3310 struct btrfs_space_info *found; 3358 struct btrfs_space_info *found;
3311 int i; 3359 int i;
3312 int factor; 3360 int factor;
3361 int ret;
3313 3362
3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3363 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 BTRFS_BLOCK_GROUP_RAID10)) 3364 BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3333 if (!found) 3382 if (!found)
3334 return -ENOMEM; 3383 return -ENOMEM;
3335 3384
3385 ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3386 if (ret) {
3387 kfree(found);
3388 return ret;
3389 }
3390
3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3391 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 INIT_LIST_HEAD(&found->block_groups[i]); 3392 INIT_LIST_HEAD(&found->block_groups[i]);
3338 init_rwsem(&found->groups_sem); 3393 init_rwsem(&found->groups_sem);
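The new total_bytes_pinned field is a percpu_counter so that the hot free/pin paths only bump a CPU-local slot, while the rare "should we commit?" check pays the cost of summing. A rough user-space analogue with per-shard atomics; the shard count and names are illustrative, not the kernel's percpu implementation:

#include <stdatomic.h>
#include <stdio.h>

#define NR_SHARDS 4	/* stands in for the number of CPUs */

struct sharded_counter {
	atomic_llong shard[NR_SHARDS];
};

static void counter_add(struct sharded_counter *c, int cpu, long long v)
{
	atomic_fetch_add(&c->shard[cpu % NR_SHARDS], v);
}

/* Like percpu_counter_compare(): returns <0, 0, or >0 versus rhs. */
static int counter_compare(struct sharded_counter *c, long long rhs)
{
	long long sum = 0;
	int i;

	for (i = 0; i < NR_SHARDS; i++)
		sum += atomic_load(&c->shard[i]);
	return (sum > rhs) - (sum < rhs);
}

static struct sharded_counter pinned;	/* zero-initialized */

int main(void)
{
	counter_add(&pinned, 0, 1 << 20);
	counter_add(&pinned, 3, 1 << 20);
	/* commit the transaction only if enough bytes are pinned */
	if (counter_compare(&pinned, 1 << 20) >= 0)
		printf("enough pinned space, commit\n");
	return 0;
}

This is why the later hunks swap the bytes_pinned comparisons in may_commit_transaction() and the data-allocation path over to percpu_counter_compare(), and why update_space_info() must now handle an allocation failure from percpu_counter_init().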
@@ -3565,10 +3620,11 @@ alloc:
3565 } 3620 }
3566 3621
3567 /* 3622 /*
3568 * If we have less pinned bytes than we want to allocate then 3623 * If we don't have enough pinned space to deal with this
3569 * don't bother committing the transaction, it won't help us. 3624 * allocation don't bother committing the transaction.
3570 */ 3625 */
3571 if (data_sinfo->bytes_pinned < bytes) 3626 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3627 bytes) < 0)
3572 committed = 1; 3628 committed = 1;
3573 spin_unlock(&data_sinfo->lock); 3629 spin_unlock(&data_sinfo->lock);
3574 3630
@@ -3577,6 +3633,7 @@ commit_trans:
3577 if (!committed && 3633 if (!committed &&
3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3634 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 committed = 1; 3635 committed = 1;
3636
3580 trans = btrfs_join_transaction(root); 3637 trans = btrfs_join_transaction(root);
3581 if (IS_ERR(trans)) 3638 if (IS_ERR(trans))
3582 return PTR_ERR(trans); 3639 return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3609 3666
3610 data_sinfo = root->fs_info->data_sinfo; 3667 data_sinfo = root->fs_info->data_sinfo;
3611 spin_lock(&data_sinfo->lock); 3668 spin_lock(&data_sinfo->lock);
3669 WARN_ON(data_sinfo->bytes_may_use < bytes);
3612 data_sinfo->bytes_may_use -= bytes; 3670 data_sinfo->bytes_may_use -= bytes;
3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3671 trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 data_sinfo->flags, bytes, 0); 3672 data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 unsigned long nr_pages) 3944 unsigned long nr_pages)
3887{ 3945{
3888 struct super_block *sb = root->fs_info->sb; 3946 struct super_block *sb = root->fs_info->sb;
3889 int started;
3890 3947
3891 /* If we can not start writeback, just sync all the delalloc file. */ 3948 if (down_read_trylock(&sb->s_umount)) {
3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3949 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3893 WB_REASON_FS_FREE_SPACE); 3950 up_read(&sb->s_umount);
3894 if (!started) { 3951 } else {
3895 /* 3952 /*
3896 * We needn't worry the filesystem going from r/w to r/o though 3953 * We needn't worry the filesystem going from r/w to r/o though
3897 * we don't acquire ->s_umount mutex, because the filesystem 3954 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 3899 * the filesystem is readonly (all dirty pages are written to 3956 * the filesystem is readonly (all dirty pages are written to
3900 * the disk). 3957 * the disk).
3901 */ 3958 */
3902 btrfs_start_delalloc_inodes(root, 0); 3959 btrfs_start_all_delalloc_inodes(root->fs_info, 0);
3903 if (!current->journal_info) 3960 if (!current->journal_info)
3904 btrfs_wait_ordered_extents(root, 0); 3961 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3905 } 3962 }
3906} 3963}
3907 3964
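The rewritten btrfs_writeback_inodes_sb_nr() takes sb->s_umount with a trylock and, when the semaphore is contended (for example by a concurrent unmount or remount), falls back to flushing delalloc inodes itself. A sketch of that trylock-with-fallback shape using a pthreads rwlock; both path bodies are hypothetical stand-ins:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;

static void writeback_inodes(void) { printf("fast: writeback under s_umount\n"); }
static void start_delalloc(void)   { printf("slow: flush delalloc directly\n"); }

static void writeback_or_fallback(void)
{
	if (pthread_rwlock_tryrdlock(&s_umount) == 0) {
		writeback_inodes();
		pthread_rwlock_unlock(&s_umount);
	} else {
		/* writer holds the lock (unmount in flight): take the
		 * fallback path that is safe without it */
		start_delalloc();
	}
}

int main(void)
{
	writeback_or_fallback();
	return 0;
}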
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3931 if (delalloc_bytes == 0) { 3988 if (delalloc_bytes == 0) {
3932 if (trans) 3989 if (trans)
3933 return; 3990 return;
3934 btrfs_wait_ordered_extents(root, 0); 3991 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3935 return; 3992 return;
3936 } 3993 }
3937 3994
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3959 4016
3960 loops++; 4017 loops++;
3961 if (wait_ordered && !trans) { 4018 if (wait_ordered && !trans) {
3962 btrfs_wait_ordered_extents(root, 0); 4019 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3963 } else { 4020 } else {
3964 time_left = schedule_timeout_killable(1); 4021 time_left = schedule_timeout_killable(1);
3965 if (time_left) 4022 if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
3997 4054
3998 /* See if there is enough pinned space to make this reservation */ 4055 /* See if there is enough pinned space to make this reservation */
3999 spin_lock(&space_info->lock); 4056 spin_lock(&space_info->lock);
4000 if (space_info->bytes_pinned >= bytes) { 4057 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4058 bytes) >= 0) {
4001 spin_unlock(&space_info->lock); 4059 spin_unlock(&space_info->lock);
4002 goto commit; 4060 goto commit;
4003 } 4061 }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
4012 4070
4013 spin_lock(&space_info->lock); 4071 spin_lock(&space_info->lock);
4014 spin_lock(&delayed_rsv->lock); 4072 spin_lock(&delayed_rsv->lock);
4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4073 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4074 bytes - delayed_rsv->size) >= 0) {
4016 spin_unlock(&delayed_rsv->lock); 4075 spin_unlock(&delayed_rsv->lock);
4017 spin_unlock(&space_info->lock); 4076 spin_unlock(&space_info->lock);
4018 return -ENOSPC; 4077 return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4297 spin_unlock(&block_rsv->lock); 4356 spin_unlock(&block_rsv->lock);
4298} 4357}
4299 4358
4359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4360 struct btrfs_block_rsv *dest, u64 num_bytes,
4361 int min_factor)
4362{
4363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4364 u64 min_bytes;
4365
4366 if (global_rsv->space_info != dest->space_info)
4367 return -ENOSPC;
4368
4369 spin_lock(&global_rsv->lock);
4370 min_bytes = div_factor(global_rsv->size, min_factor);
4371 if (global_rsv->reserved < min_bytes + num_bytes) {
4372 spin_unlock(&global_rsv->lock);
4373 return -ENOSPC;
4374 }
4375 global_rsv->reserved -= num_bytes;
4376 if (global_rsv->reserved < global_rsv->size)
4377 global_rsv->full = 0;
4378 spin_unlock(&global_rsv->lock);
4379
4380 block_rsv_add_bytes(dest, num_bytes, 1);
4381 return 0;
4382}
4383
4300static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4384static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 struct btrfs_block_rsv *block_rsv, 4385 struct btrfs_block_rsv *block_rsv,
4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4386 struct btrfs_block_rsv *dest, u64 num_bytes)
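btrfs_cond_migrate_bytes() above refuses to drain the global reserve below min_factor tenths of its size. A worked example, assuming div_factor(size, factor) computes size * factor / 10 as the kernel helper does; the numbers are illustrative:

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;	/* assumed kernel semantics */
}

/* Returns 0 and migrates, or -1 (ENOSPC) if it would drain the rsv. */
static int cond_migrate(uint64_t *reserved, uint64_t size,
			uint64_t num_bytes, int min_factor)
{
	uint64_t min_bytes = div_factor(size, min_factor);

	if (*reserved < min_bytes + num_bytes)
		return -1;
	*reserved -= num_bytes;
	return 0;
}

int main(void)
{
	uint64_t reserved = 90 << 20, size = 100 << 20;

	/* floor is 5/10 of 100M = 50M; 90M covers 50M + 16M, so the
	 * 16M migration succeeds and leaves 74M reserved */
	if (cond_migrate(&reserved, size, 16 << 20, 5) == 0)
		printf("migrated, %llu bytes left\n",
		       (unsigned long long)reserved);
	return 0;
}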
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
5030 int factor; 5114 int factor;
5031 5115
5032 /* block accounting for super block */ 5116 /* block accounting for super block */
5033 spin_lock(&info->delalloc_lock); 5117 spin_lock(&info->delalloc_root_lock);
5034 old_val = btrfs_super_bytes_used(info->super_copy); 5118 old_val = btrfs_super_bytes_used(info->super_copy);
5035 if (alloc) 5119 if (alloc)
5036 old_val += num_bytes; 5120 old_val += num_bytes;
5037 else 5121 else
5038 old_val -= num_bytes; 5122 old_val -= num_bytes;
5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5123 btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 spin_unlock(&info->delalloc_lock); 5124 spin_unlock(&info->delalloc_root_lock);
5041 5125
5042 while (total) { 5126 while (total) {
5043 cache = btrfs_lookup_block_group(info, bytenr); 5127 cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5189 return ret; 5273 return ret;
5190} 5274}
5191 5275
5276static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5277{
5278 int ret;
5279 struct btrfs_block_group_cache *block_group;
5280 struct btrfs_caching_control *caching_ctl;
5281
5282 block_group = btrfs_lookup_block_group(root->fs_info, start);
5283 if (!block_group)
5284 return -EINVAL;
5285
5286 cache_block_group(block_group, 0);
5287 caching_ctl = get_caching_control(block_group);
5288
5289 if (!caching_ctl) {
5290 /* Logic error */
5291 BUG_ON(!block_group_cache_done(block_group));
5292 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5293 } else {
5294 mutex_lock(&caching_ctl->mutex);
5295
5296 if (start >= caching_ctl->progress) {
5297 ret = add_excluded_extent(root, start, num_bytes);
5298 } else if (start + num_bytes <= caching_ctl->progress) {
5299 ret = btrfs_remove_free_space(block_group,
5300 start, num_bytes);
5301 } else {
5302 num_bytes = caching_ctl->progress - start;
5303 ret = btrfs_remove_free_space(block_group,
5304 start, num_bytes);
5305 if (ret)
5306 goto out_lock;
5307
5308 num_bytes = (start + num_bytes) -
5309 caching_ctl->progress;
5310 start = caching_ctl->progress;
5311 ret = add_excluded_extent(root, start, num_bytes);
5312 }
5313out_lock:
5314 mutex_unlock(&caching_ctl->mutex);
5315 put_caching_control(caching_ctl);
5316 }
5317 btrfs_put_block_group(block_group);
5318 return ret;
5319}
5320
5321int btrfs_exclude_logged_extents(struct btrfs_root *log,
5322 struct extent_buffer *eb)
5323{
5324 struct btrfs_file_extent_item *item;
5325 struct btrfs_key key;
5326 int found_type;
5327 int i;
5328
5329 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5330 return 0;
5331
5332 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5333 btrfs_item_key_to_cpu(eb, &key, i);
5334 if (key.type != BTRFS_EXTENT_DATA_KEY)
5335 continue;
5336 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5337 found_type = btrfs_file_extent_type(eb, item);
5338 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5339 continue;
5340 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5341 continue;
5342 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5343 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5344 __exclude_logged_extent(log, key.objectid, key.offset);
5345 }
5346
5347 return 0;
5348}
5349
5192/** 5350/**
5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5351 * btrfs_update_reserved_bytes - update the block_group and space info counters
5194 * @cache: The cache we are manipulating 5352 * @cache: The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5251 struct btrfs_caching_control *next; 5409 struct btrfs_caching_control *next;
5252 struct btrfs_caching_control *caching_ctl; 5410 struct btrfs_caching_control *caching_ctl;
5253 struct btrfs_block_group_cache *cache; 5411 struct btrfs_block_group_cache *cache;
5412 struct btrfs_space_info *space_info;
5254 5413
5255 down_write(&fs_info->extent_commit_sem); 5414 down_write(&fs_info->extent_commit_sem);
5256 5415
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5273 5432
5274 up_write(&fs_info->extent_commit_sem); 5433 up_write(&fs_info->extent_commit_sem);
5275 5434
5435 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5436 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5437
5276 update_global_block_rsv(fs_info); 5438 update_global_block_rsv(fs_info);
5277} 5439}
5278 5440
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5370 return 0; 5532 return 0;
5371} 5533}
5372 5534
5535static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5536 u64 owner, u64 root_objectid)
5537{
5538 struct btrfs_space_info *space_info;
5539 u64 flags;
5540
5541 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5542 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5543 flags = BTRFS_BLOCK_GROUP_SYSTEM;
5544 else
5545 flags = BTRFS_BLOCK_GROUP_METADATA;
5546 } else {
5547 flags = BTRFS_BLOCK_GROUP_DATA;
5548 }
5549
5550 space_info = __find_space_info(fs_info, flags);
5551 BUG_ON(!space_info); /* Logic bug */
5552 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5553}
5554
5555
5373static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5556static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 struct btrfs_root *root, 5557 struct btrfs_root *root,
5375 u64 bytenr, u64 num_bytes, u64 parent, 5558 u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5590 goto out; 5773 goto out;
5591 } 5774 }
5592 } 5775 }
5776 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5777 root_objectid);
5593 } else { 5778 } else {
5594 if (found_extent) { 5779 if (found_extent) {
5595 BUG_ON(is_data && refs_to_drop != 5780 BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5713 u64 parent, int last_ref) 5898 u64 parent, int last_ref)
5714{ 5899{
5715 struct btrfs_block_group_cache *cache = NULL; 5900 struct btrfs_block_group_cache *cache = NULL;
5901 int pin = 1;
5716 int ret; 5902 int ret;
5717 5903
5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5904 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5745 5931
5746 btrfs_add_free_space(cache, buf->start, buf->len); 5932 btrfs_add_free_space(cache, buf->start, buf->len);
5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5933 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5934 pin = 0;
5748 } 5935 }
5749out: 5936out:
5937 if (pin)
5938 add_pinned_bytes(root->fs_info, buf->len,
5939 btrfs_header_level(buf),
5940 root->root_key.objectid);
5941
5750 /* 5942 /*
5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5943 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 * anymore. 5944 * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5763 int ret; 5955 int ret;
5764 struct btrfs_fs_info *fs_info = root->fs_info; 5956 struct btrfs_fs_info *fs_info = root->fs_info;
5765 5957
5958 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
5959
5766 /* 5960 /*
5767 * tree log blocks never actually go into the extent allocation 5961 * tree log blocks never actually go into the extent allocation
5768 * tree, just update pinning info and exit early. 5962 * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6560{ 6754{
6561 int ret; 6755 int ret;
6562 struct btrfs_block_group_cache *block_group; 6756 struct btrfs_block_group_cache *block_group;
6563 struct btrfs_caching_control *caching_ctl;
6564 u64 start = ins->objectid;
6565 u64 num_bytes = ins->offset;
6566
6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 cache_block_group(block_group, 0);
6569 caching_ctl = get_caching_control(block_group);
6570
6571 if (!caching_ctl) {
6572 BUG_ON(!block_group_cache_done(block_group));
6573 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 if (ret)
6575 goto out;
6576 } else {
6577 mutex_lock(&caching_ctl->mutex);
6578 6757
6579 if (start >= caching_ctl->progress) { 6758 /*
6580 ret = add_excluded_extent(root, start, num_bytes); 6759 * Mixed block groups will exclude before processing the log so we only
 6581 } else if (start + num_bytes <= caching_ctl->progress) { 6760 * need to do the exclude dance if this fs isn't mixed.
6582 ret = btrfs_remove_free_space(block_group, 6761 */
6583 start, num_bytes); 6762 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6584 } else { 6763 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6585 num_bytes = caching_ctl->progress - start;
6586 ret = btrfs_remove_free_space(block_group,
6587 start, num_bytes);
6588 if (ret)
6589 goto out_lock;
6590
6591 start = caching_ctl->progress;
6592 num_bytes = ins->objectid + ins->offset -
6593 caching_ctl->progress;
6594 ret = add_excluded_extent(root, start, num_bytes);
6595 }
6596out_lock:
6597 mutex_unlock(&caching_ctl->mutex);
6598 put_caching_control(caching_ctl);
6599 if (ret) 6764 if (ret)
6600 goto out; 6765 return ret;
6601 } 6766 }
6602 6767
6768 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6769 if (!block_group)
6770 return -EINVAL;
6771
6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6772 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 RESERVE_ALLOC_NO_ACCOUNT); 6773 RESERVE_ALLOC_NO_ACCOUNT);
6605 BUG_ON(ret); /* logic error */ 6774 BUG_ON(ret); /* logic error */
6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6775 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 0, owner, offset, ins, 1); 6776 0, owner, offset, ins, 1);
6608out:
6609 btrfs_put_block_group(block_group); 6777 btrfs_put_block_group(block_group);
6610 return ret; 6778 return ret;
6611} 6779}
@@ -7384,7 +7552,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7552 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 7553
7386 while (1) { 7554 while (1) {
7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) { 7555 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7388 pr_debug("btrfs: drop snapshot early exit\n"); 7556 pr_debug("btrfs: drop snapshot early exit\n");
7389 err = -EAGAIN; 7557 err = -EAGAIN;
7390 goto out_end_trans; 7558 goto out_end_trans;
@@ -7447,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7447 } 7615 }
7448 7616
7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7617 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7618 ret = btrfs_find_root(tree_root, &root->root_key, path,
7451 NULL, NULL); 7619 NULL, NULL);
7452 if (ret < 0) { 7620 if (ret < 0) {
7453 btrfs_abort_transaction(trans, tree_root, ret); 7621 btrfs_abort_transaction(trans, tree_root, ret);
7454 err = ret; 7622 err = ret;
@@ -7465,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7465 } 7633 }
7466 7634
7467 if (root->in_radix) { 7635 if (root->in_radix) {
7468 btrfs_free_fs_root(tree_root->fs_info, root); 7636 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7469 } else { 7637 } else {
7470 free_extent_buffer(root->node); 7638 free_extent_buffer(root->node);
7471 free_extent_buffer(root->commit_root); 7639 free_extent_buffer(root->commit_root);
7472 kfree(root); 7640 btrfs_put_fs_root(root);
7473 } 7641 }
7474out_end_trans: 7642out_end_trans:
7475 btrfs_end_transaction_throttle(trans, tree_root); 7643 btrfs_end_transaction_throttle(trans, tree_root);
@@ -7782,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7782 struct btrfs_space_info *space_info; 7950 struct btrfs_space_info *space_info;
7783 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7951 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 struct btrfs_device *device; 7952 struct btrfs_device *device;
7953 struct btrfs_trans_handle *trans;
7785 u64 min_free; 7954 u64 min_free;
7786 u64 dev_min = 1; 7955 u64 dev_min = 1;
7787 u64 dev_nr = 0; 7956 u64 dev_nr = 0;
@@ -7868,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7868 do_div(min_free, dev_min); 8037 do_div(min_free, dev_min);
7869 } 8038 }
7870 8039
8040 /* We need to do this so that we can look at pending chunks */
8041 trans = btrfs_join_transaction(root);
8042 if (IS_ERR(trans)) {
8043 ret = PTR_ERR(trans);
8044 goto out;
8045 }
8046
7871 mutex_lock(&root->fs_info->chunk_mutex); 8047 mutex_lock(&root->fs_info->chunk_mutex);
7872 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8048 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 u64 dev_offset; 8049 u64 dev_offset;
@@ -7878,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7878 */ 8054 */
7879 if (device->total_bytes > device->bytes_used + min_free && 8055 if (device->total_bytes > device->bytes_used + min_free &&
7880 !device->is_tgtdev_for_dev_replace) { 8056 !device->is_tgtdev_for_dev_replace) {
7881 ret = find_free_dev_extent(device, min_free, 8057 ret = find_free_dev_extent(trans, device, min_free,
7882 &dev_offset, NULL); 8058 &dev_offset, NULL);
7883 if (!ret) 8059 if (!ret)
7884 dev_nr++; 8060 dev_nr++;
@@ -7890,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7890 } 8066 }
7891 } 8067 }
7892 mutex_unlock(&root->fs_info->chunk_mutex); 8068 mutex_unlock(&root->fs_info->chunk_mutex);
8069 btrfs_end_transaction(trans, root);
7893out: 8070out:
7894 btrfs_put_block_group(block_group); 8071 btrfs_put_block_group(block_group);
7895 return ret; 8072 return ret;
@@ -8032,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8032 dump_space_info(space_info, 0, 0); 8209 dump_space_info(space_info, 0, 0);
8033 } 8210 }
8034 } 8211 }
8212 percpu_counter_destroy(&space_info->total_bytes_pinned);
8035 list_del(&space_info->list); 8213 list_del(&space_info->list);
8036 kfree(space_info); 8214 kfree(space_info);
8037 } 8215 }
@@ -8254,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8254 sizeof(item)); 8432 sizeof(item));
8255 if (ret) 8433 if (ret)
8256 btrfs_abort_transaction(trans, extent_root, ret); 8434 btrfs_abort_transaction(trans, extent_root, ret);
8435 ret = btrfs_finish_chunk_alloc(trans, extent_root,
8436 key.objectid, key.offset);
8437 if (ret)
8438 btrfs_abort_transaction(trans, extent_root, ret);
8257 } 8439 }
8258} 8440}
8259 8441
@@ -8591,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8591 if (end - start >= range->minlen) { 8773 if (end - start >= range->minlen) {
8592 if (!block_group_cache_done(cache)) { 8774 if (!block_group_cache_done(cache)) {
8593 ret = cache_block_group(cache, 0); 8775 ret = cache_block_group(cache, 0);
8594 if (!ret) 8776 if (ret) {
8595 wait_block_group_cache_done(cache); 8777 btrfs_put_block_group(cache);
8778 break;
8779 }
8780 ret = wait_block_group_cache_done(cache);
8781 if (ret) {
8782 btrfs_put_block_group(cache);
8783 break;
8784 }
8596 } 8785 }
8597 ret = btrfs_trim_block_group(cache, 8786 ret = btrfs_trim_block_group(cache,
8598 &group_trimmed, 8787 &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6bca9472f313..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
77 kmem_cache_free(extent_buffer_cache, eb); 77 kmem_cache_free(extent_buffer_cache, eb);
78 } 78 }
79} 79}
80
81#define btrfs_debug_check_extent_io_range(inode, start, end) \
82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
83static inline void __btrfs_debug_check_extent_io_range(const char *caller,
84 struct inode *inode, u64 start, u64 end)
85{
86 u64 isize = i_size_read(inode);
87
88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
89 printk_ratelimited(KERN_DEBUG
90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
91 caller,
92 (unsigned long long)btrfs_ino(inode),
93 (unsigned long long)isize,
94 (unsigned long long)start,
95 (unsigned long long)end);
96 }
97}
80#else 98#else
81#define btrfs_leak_debug_add(new, head) do {} while (0) 99#define btrfs_leak_debug_add(new, head) do {} while (0)
82#define btrfs_leak_debug_del(entry) do {} while (0) 100#define btrfs_leak_debug_del(entry) do {} while (0)
83#define btrfs_leak_debug_check() do {} while (0) 101#define btrfs_leak_debug_check() do {} while (0)
102#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
84#endif 103#endif
85 104
86#define BUFFER_LRU_MAX 64 105#define BUFFER_LRU_MAX 64
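Extent-io ranges are byte-inclusive [start, end], so for the sector-multiple lengths the tree works with, a well-formed end is start + len - 1 and therefore odd; the new debug helper flags even ends (past the first page and not at i_size - 1) as likely off-by-one callers. A tiny demonstration of the invariant:

#include <stdio.h>

int main(void)
{
	unsigned long long start = 0, len = 4096;
	unsigned long long end = start + len - 1;	/* 4095, odd: ok */

	printf("[%llu, %llu] %s\n", start, end,
	       (end % 2) ? "well-formed" : "suspicious");

	end = start + len;				/* 4096, even */
	printf("[%llu, %llu] %s\n", start, end,
	       (end % 2) ? "well-formed" : "suspicious");
	return 0;
}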
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
522 int err; 541 int err;
523 int clear = 0; 542 int clear = 0;
524 543
544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
545
546 if (bits & EXTENT_DELALLOC)
547 bits |= EXTENT_NORESERVE;
548
525 if (delete) 549 if (delete)
526 bits |= ~EXTENT_CTLBITS; 550 bits |= ~EXTENT_CTLBITS;
527 bits |= EXTENT_FIRST_DELALLOC; 551 bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
677 struct extent_state *state; 701 struct extent_state *state;
678 struct rb_node *node; 702 struct rb_node *node;
679 703
704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
705
680 spin_lock(&tree->lock); 706 spin_lock(&tree->lock);
681again: 707again:
682 while (1) { 708 while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
769 u64 last_start; 795 u64 last_start;
770 u64 last_end; 796 u64 last_end;
771 797
798 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
799
772 bits |= EXTENT_FIRST_DELALLOC; 800 bits |= EXTENT_FIRST_DELALLOC;
773again: 801again:
774 if (!prealloc && (mask & __GFP_WAIT)) { 802 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
989 u64 last_start; 1017 u64 last_start;
990 u64 last_end; 1018 u64 last_end;
991 1019
1020 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
1021
992again: 1022again:
993 if (!prealloc && (mask & __GFP_WAIT)) { 1023 if (!prealloc && (mask & __GFP_WAIT)) {
994 prealloc = alloc_extent_state(mask); 1024 prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2450 struct extent_state *cached = NULL; 2480 struct extent_state *cached = NULL;
2451 struct extent_state *state; 2481 struct extent_state *state;
2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2482 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2483 struct inode *inode = page->mapping->host;
2453 2484
2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2485 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2486 "mirror=%lu\n", (u64)bio->bi_sector, err,
2456 io_bio->mirror_num); 2487 io_bio->mirror_num);
2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2488 tree = &BTRFS_I(inode)->io_tree;
2458 2489
2459 /* We always issue full-page reads, but if some block 2490 /* We always issue full-page reads, but if some block
2460 * in a page fails to read, blk_update_request() will 2491 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2559 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2529 2560
2530 if (uptodate) { 2561 if (uptodate) {
2562 loff_t i_size = i_size_read(inode);
2563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2564 unsigned offset;
2565
2566 /* Zero out the end if this page straddles i_size */
2567 offset = i_size & (PAGE_CACHE_SIZE-1);
2568 if (page->index == end_index && offset)
2569 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2531 SetPageUptodate(page); 2570 SetPageUptodate(page);
2532 } else { 2571 } else {
2533 ClearPageUptodate(page); 2572 ClearPageUptodate(page);
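The hunk above zeroes the tail of the last page of a read when that page straddles i_size, so stale bytes past EOF never become visible. A worked example of the index/offset arithmetic, assuming 4096-byte pages and stubbing the zeroing itself:

#include <stdio.h>

#define PAGE_CACHE_SIZE  4096ULL
#define PAGE_CACHE_SHIFT 12

int main(void)
{
	unsigned long long i_size = 10000;	/* file is 10000 bytes */
	unsigned long long end_index = i_size >> PAGE_CACHE_SHIFT;  /* 2 */
	unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);	  /* 1808 */
	unsigned long long page_index = 2;	/* the page just read */

	/* only the page containing EOF, and only if EOF is mid-page */
	if (page_index == end_index && offset)
		printf("zero bytes [%u, %llu) of page %llu\n",
		       offset, PAGE_CACHE_SIZE, page_index);
	return 0;
}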
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14) 21#define EXTENT_DAMAGED (1 << 14)
22#define EXTENT_NORESERVE (1 << 15)
22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
24 25
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
34 34
35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
36 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
37 sizeof(struct btrfs_sector_sum) * \ 37 sizeof(u32) * (r)->sectorsize)
38 (r)->sectorsize - (r)->sectorsize)
39 38
40int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 39int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 40 struct btrfs_root *root,
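With checksums stored as a bare u32 array instead of btrfs_sector_sum pairs, one page-sized allocation covers (PAGE_SIZE - header) / 4 sectors, which is what the simplified MAX_ORDERED_SUM_BYTES() expresses. Worked numbers, assuming a 4096-byte page, 4096-byte sectors, and an illustrative 64-byte struct btrfs_ordered_sum header (the real size depends on the kernel config):

#include <stdio.h>

#define PAGE_SIZE	4096u
#define SECTORSIZE	4096u
#define SUM_HEADER	64u	/* assumed sizeof(struct btrfs_ordered_sum) */

int main(void)
{
	/* one u32 csum per sector fits (PAGE_SIZE - header) / 4 sectors */
	unsigned sectors = (PAGE_SIZE - SUM_HEADER) / sizeof(unsigned);
	unsigned max_bytes = sectors * SECTORSIZE;

	/* 1008 sectors, about 4 MB of data per ordered sum */
	printf("%u sectors, %u bytes per ordered sum\n", sectors, max_bytes);
	return 0;
}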
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
297 struct btrfs_path *path; 296 struct btrfs_path *path;
298 struct extent_buffer *leaf; 297 struct extent_buffer *leaf;
299 struct btrfs_ordered_sum *sums; 298 struct btrfs_ordered_sum *sums;
300 struct btrfs_sector_sum *sector_sum;
301 struct btrfs_csum_item *item; 299 struct btrfs_csum_item *item;
302 LIST_HEAD(tmplist); 300 LIST_HEAD(tmplist);
303 unsigned long offset; 301 unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
368 struct btrfs_csum_item); 366 struct btrfs_csum_item);
369 while (start < csum_end) { 367 while (start < csum_end) {
370 size = min_t(size_t, csum_end - start, 368 size = min_t(size_t, csum_end - start,
371 MAX_ORDERED_SUM_BYTES(root)); 369 MAX_ORDERED_SUM_BYTES(root));
372 sums = kzalloc(btrfs_ordered_sum_size(root, size), 370 sums = kzalloc(btrfs_ordered_sum_size(root, size),
373 GFP_NOFS); 371 GFP_NOFS);
374 if (!sums) { 372 if (!sums) {
375 ret = -ENOMEM; 373 ret = -ENOMEM;
376 goto fail; 374 goto fail;
377 } 375 }
378 376
379 sector_sum = sums->sums;
380 sums->bytenr = start; 377 sums->bytenr = start;
381 sums->len = size; 378 sums->len = (int)size;
382 379
383 offset = (start - key.offset) >> 380 offset = (start - key.offset) >>
384 root->fs_info->sb->s_blocksize_bits; 381 root->fs_info->sb->s_blocksize_bits;
385 offset *= csum_size; 382 offset *= csum_size;
383 size >>= root->fs_info->sb->s_blocksize_bits;
386 384
387 while (size > 0) { 385 read_extent_buffer(path->nodes[0],
388 read_extent_buffer(path->nodes[0], 386 sums->sums,
389 &sector_sum->sum, 387 ((unsigned long)item) + offset,
390 ((unsigned long)item) + 388 csum_size * size);
391 offset, csum_size); 389
392 sector_sum->bytenr = start; 390 start += root->sectorsize * size;
393
394 size -= root->sectorsize;
395 start += root->sectorsize;
396 offset += csum_size;
397 sector_sum++;
398 }
399 list_add_tail(&sums->list, &tmplist); 391 list_add_tail(&sums->list, &tmplist);
400 } 392 }
401 path->slots[0]++; 393 path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
417 struct bio *bio, u64 file_start, int contig) 409 struct bio *bio, u64 file_start, int contig)
418{ 410{
419 struct btrfs_ordered_sum *sums; 411 struct btrfs_ordered_sum *sums;
420 struct btrfs_sector_sum *sector_sum;
421 struct btrfs_ordered_extent *ordered; 412 struct btrfs_ordered_extent *ordered;
422 char *data; 413 char *data;
423 struct bio_vec *bvec = bio->bi_io_vec; 414 struct bio_vec *bvec = bio->bi_io_vec;
424 int bio_index = 0; 415 int bio_index = 0;
416 int index;
425 unsigned long total_bytes = 0; 417 unsigned long total_bytes = 0;
426 unsigned long this_sum_bytes = 0; 418 unsigned long this_sum_bytes = 0;
427 u64 offset; 419 u64 offset;
428 u64 disk_bytenr;
429 420
430 WARN_ON(bio->bi_vcnt <= 0); 421 WARN_ON(bio->bi_vcnt <= 0);
431 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 422 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
432 if (!sums) 423 if (!sums)
433 return -ENOMEM; 424 return -ENOMEM;
434 425
435 sector_sum = sums->sums;
436 disk_bytenr = (u64)bio->bi_sector << 9;
437 sums->len = bio->bi_size; 426 sums->len = bio->bi_size;
438 INIT_LIST_HEAD(&sums->list); 427 INIT_LIST_HEAD(&sums->list);
439 428
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
444 433
445 ordered = btrfs_lookup_ordered_extent(inode, offset); 434 ordered = btrfs_lookup_ordered_extent(inode, offset);
446 BUG_ON(!ordered); /* Logic error */ 435 BUG_ON(!ordered); /* Logic error */
447 sums->bytenr = ordered->start; 436 sums->bytenr = (u64)bio->bi_sector << 9;
437 index = 0;
448 438
449 while (bio_index < bio->bi_vcnt) { 439 while (bio_index < bio->bi_vcnt) {
450 if (!contig) 440 if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
463 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 453 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
464 GFP_NOFS); 454 GFP_NOFS);
465 BUG_ON(!sums); /* -ENOMEM */ 455 BUG_ON(!sums); /* -ENOMEM */
466 sector_sum = sums->sums;
467 sums->len = bytes_left; 456 sums->len = bytes_left;
468 ordered = btrfs_lookup_ordered_extent(inode, offset); 457 ordered = btrfs_lookup_ordered_extent(inode, offset);
469 BUG_ON(!ordered); /* Logic error */ 458 BUG_ON(!ordered); /* Logic error */
470 sums->bytenr = ordered->start; 459 sums->bytenr = ((u64)bio->bi_sector << 9) +
460 total_bytes;
461 index = 0;
471 } 462 }
472 463
473 data = kmap_atomic(bvec->bv_page); 464 data = kmap_atomic(bvec->bv_page);
474 sector_sum->sum = ~(u32)0; 465 sums->sums[index] = ~(u32)0;
475 sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, 466 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
476 sector_sum->sum, 467 sums->sums[index],
477 bvec->bv_len); 468 bvec->bv_len);
478 kunmap_atomic(data); 469 kunmap_atomic(data);
479 btrfs_csum_final(sector_sum->sum, 470 btrfs_csum_final(sums->sums[index],
480 (char *)&sector_sum->sum); 471 (char *)(sums->sums + index));
481 sector_sum->bytenr = disk_bytenr;
482 472
483 sector_sum++;
484 bio_index++; 473 bio_index++;
474 index++;
485 total_bytes += bvec->bv_len; 475 total_bytes += bvec->bv_len;
486 this_sum_bytes += bvec->bv_len; 476 this_sum_bytes += bvec->bv_len;
487 disk_bytenr += bvec->bv_len;
488 offset += bvec->bv_len; 477 offset += bvec->bv_len;
489 bvec++; 478 bvec++;
490 } 479 }
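The same simplification shows in btrfs_csum_one_bio(): sums->bytenr is seeded from the bio's starting sector, and each checksum's disk address is reconstructed rather than stored. A hypothetical helper (not in the source; for illustration only) states the invariant:

```c
/*
 * Disk byte covered by sums->sums[i]. Holds because an ordered sum
 * describes one physically contiguous run starting at sums->bytenr.
 */
static inline u64 ordered_sum_bytenr_of(const struct btrfs_ordered_sum *sums,
					u32 sectorsize, int i)
{
	return sums->bytenr + (u64)i * sectorsize;
}
```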
@@ -672,62 +661,46 @@ out:
672 return ret; 661 return ret;
673} 662}
674 663
675static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
676 struct btrfs_sector_sum *sector_sum,
677 u64 total_bytes, u64 sectorsize)
678{
679 u64 tmp = sectorsize;
680 u64 next_sector = sector_sum->bytenr;
681 struct btrfs_sector_sum *next = sector_sum + 1;
682
683 while ((tmp + total_bytes) < sums->len) {
684 if (next_sector + sectorsize != next->bytenr)
685 break;
686 tmp += sectorsize;
687 next_sector = next->bytenr;
688 next++;
689 }
690 return tmp;
691}
692
693int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 664int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, 665 struct btrfs_root *root,
695 struct btrfs_ordered_sum *sums) 666 struct btrfs_ordered_sum *sums)
696{ 667{
697 u64 bytenr;
698 int ret;
699 struct btrfs_key file_key; 668 struct btrfs_key file_key;
700 struct btrfs_key found_key; 669 struct btrfs_key found_key;
701 u64 next_offset;
702 u64 total_bytes = 0;
703 int found_next;
704 struct btrfs_path *path; 670 struct btrfs_path *path;
705 struct btrfs_csum_item *item; 671 struct btrfs_csum_item *item;
706 struct btrfs_csum_item *item_end; 672 struct btrfs_csum_item *item_end;
707 struct extent_buffer *leaf = NULL; 673 struct extent_buffer *leaf = NULL;
674 u64 next_offset;
675 u64 total_bytes = 0;
708 u64 csum_offset; 676 u64 csum_offset;
709 struct btrfs_sector_sum *sector_sum; 677 u64 bytenr;
710 u32 nritems; 678 u32 nritems;
711 u32 ins_size; 679 u32 ins_size;
680 int index = 0;
681 int found_next;
682 int ret;
712 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 683 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
713 684
714 path = btrfs_alloc_path(); 685 path = btrfs_alloc_path();
715 if (!path) 686 if (!path)
716 return -ENOMEM; 687 return -ENOMEM;
717
718 sector_sum = sums->sums;
719again: 688again:
720 next_offset = (u64)-1; 689 next_offset = (u64)-1;
721 found_next = 0; 690 found_next = 0;
691 bytenr = sums->bytenr + total_bytes;
722 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 692 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
723 file_key.offset = sector_sum->bytenr; 693 file_key.offset = bytenr;
724 bytenr = sector_sum->bytenr;
725 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 694 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
726 695
727 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); 696 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
728 if (!IS_ERR(item)) { 697 if (!IS_ERR(item)) {
729 leaf = path->nodes[0];
730 ret = 0; 698 ret = 0;
699 leaf = path->nodes[0];
700 item_end = btrfs_item_ptr(leaf, path->slots[0],
701 struct btrfs_csum_item);
702 item_end = (struct btrfs_csum_item *)((char *)item_end +
703 btrfs_item_size_nr(leaf, path->slots[0]));
731 goto found; 704 goto found;
732 } 705 }
733 ret = PTR_ERR(item); 706 ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
807 780
808 free_space = btrfs_leaf_free_space(root, leaf) - 781 free_space = btrfs_leaf_free_space(root, leaf) -
809 sizeof(struct btrfs_item) - csum_size; 782 sizeof(struct btrfs_item) - csum_size;
810 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 783 tmp = sums->len - total_bytes;
811 root->sectorsize);
812 tmp >>= root->fs_info->sb->s_blocksize_bits; 784 tmp >>= root->fs_info->sb->s_blocksize_bits;
813 WARN_ON(tmp < 1); 785 WARN_ON(tmp < 1);
814 786
@@ -822,6 +794,7 @@ again:
822 diff *= csum_size; 794 diff *= csum_size;
823 795
824 btrfs_extend_item(root, path, diff); 796 btrfs_extend_item(root, path, diff);
797 ret = 0;
825 goto csum; 798 goto csum;
826 } 799 }
827 800
@@ -831,8 +804,7 @@ insert:
831 if (found_next) { 804 if (found_next) {
832 u64 tmp; 805 u64 tmp;
833 806
834 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 807 tmp = sums->len - total_bytes;
835 root->sectorsize);
836 tmp >>= root->fs_info->sb->s_blocksize_bits; 808 tmp >>= root->fs_info->sb->s_blocksize_bits;
837 tmp = min(tmp, (next_offset - file_key.offset) >> 809 tmp = min(tmp, (next_offset - file_key.offset) >>
838 root->fs_info->sb->s_blocksize_bits); 810 root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
853 WARN_ON(1); 825 WARN_ON(1);
854 goto fail_unlock; 826 goto fail_unlock;
855 } 827 }
856csum:
857 leaf = path->nodes[0]; 828 leaf = path->nodes[0];
829csum:
858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 830 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
859 ret = 0; 831 item_end = (struct btrfs_csum_item *)((unsigned char *)item +
832 btrfs_item_size_nr(leaf, path->slots[0]));
860 item = (struct btrfs_csum_item *)((unsigned char *)item + 833 item = (struct btrfs_csum_item *)((unsigned char *)item +
861 csum_offset * csum_size); 834 csum_offset * csum_size);
862found: 835found:
863 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 836 ins_size = (u32)(sums->len - total_bytes) >>
864 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 837 root->fs_info->sb->s_blocksize_bits;
865 btrfs_item_size_nr(leaf, path->slots[0])); 838 ins_size *= csum_size;
866next_sector: 839 ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
867 840 ins_size);
868 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); 841 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
869 842 ins_size);
870 total_bytes += root->sectorsize; 843
871 sector_sum++; 844 ins_size /= csum_size;
872 if (total_bytes < sums->len) { 845 total_bytes += ins_size * root->sectorsize;
873 item = (struct btrfs_csum_item *)((char *)item + 846 index += ins_size;
874 csum_size);
875 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
876 sector_sum->bytenr) {
877 bytenr = sector_sum->bytenr;
878 goto next_sector;
879 }
880 }
881 847
882 btrfs_mark_buffer_dirty(path->nodes[0]); 848 btrfs_mark_buffer_dirty(path->nodes[0]);
883 if (total_bytes < sums->len) { 849 if (total_bytes < sums->len) {
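The rewritten tail of btrfs_csum_file_blocks() batches the copy-out the same way: compute how many checksums remain, convert that to bytes, clamp to the room left in the csum item, and let one write_extent_buffer() replace the old per-sector next_sector loop. Rough numbers, again assuming 4KiB blocks and 4-byte checksums (illustrative):

```c
u32 left = (u32)(sums->len - total_bytes);	/* say 24KiB still unwritten */
u32 ins_size = (left >> 12) * 4;		/* 6 csums -> 24 bytes       */

/* never write past the end of the item we landed in */
ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
		 ins_size);

/* advance by the number of whole checksums actually written */
total_bytes += (ins_size / 4) * 4096;
```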
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89da56a58b63..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 ret = PTR_ERR(inode_root); 309 ret = PTR_ERR(inode_root);
310 goto cleanup; 310 goto cleanup;
311 } 311 }
312 if (btrfs_root_refs(&inode_root->root_item) == 0) {
313 ret = -ENOENT;
314 goto cleanup;
315 }
316 312
317 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
318 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
1317 1313
1318} 1314}
1319 1315
1316static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1317 size_t *write_bytes)
1318{
1319 struct btrfs_trans_handle *trans;
1320 struct btrfs_root *root = BTRFS_I(inode)->root;
1321 struct btrfs_ordered_extent *ordered;
1322 u64 lockstart, lockend;
1323 u64 num_bytes;
1324 int ret;
1325
1326 lockstart = round_down(pos, root->sectorsize);
1327 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
1328
1329 while (1) {
1330 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1331 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1332 lockend - lockstart + 1);
1333 if (!ordered) {
1334 break;
1335 }
1336 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1337 btrfs_start_ordered_extent(inode, ordered, 1);
1338 btrfs_put_ordered_extent(ordered);
1339 }
1340
1341 trans = btrfs_join_transaction(root);
1342 if (IS_ERR(trans)) {
1343 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1344 return PTR_ERR(trans);
1345 }
1346
1347 num_bytes = lockend - lockstart + 1;
1348 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
1349 NULL);
1350 btrfs_end_transaction(trans, root);
1351 if (ret <= 0) {
1352 ret = 0;
1353 } else {
1354 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1355 EXTENT_DIRTY | EXTENT_DELALLOC |
1356 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1357 NULL, GFP_NOFS);
1358 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1359 }
1360
1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1362
1363 return ret;
1364}
1365
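The retry loop at the top of check_can_nocow() is the usual btrfs idiom for quiescing a file range: take the extent lock, probe for a pending ordered extent, and if one is found drop the lock before waiting on it, since completing the ordered extent needs that same range locked. Reduced to its skeleton:

```c
/* Sketch of the drain-then-lock idiom used above. */
while (1) {
	lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
	ordered = btrfs_lookup_ordered_range(inode, lockstart,
					     lockend - lockstart + 1);
	if (!ordered)
		break;		/* range is locked and quiescent */
	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
	btrfs_start_ordered_extent(inode, ordered, 1);	/* wait for the IO */
	btrfs_put_ordered_extent(ordered);
}
```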
1320static noinline ssize_t __btrfs_buffered_write(struct file *file, 1366static noinline ssize_t __btrfs_buffered_write(struct file *file,
1321 struct iov_iter *i, 1367 struct iov_iter *i,
1322 loff_t pos) 1368 loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1324 struct inode *inode = file_inode(file); 1370 struct inode *inode = file_inode(file);
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1371 struct btrfs_root *root = BTRFS_I(inode)->root;
1326 struct page **pages = NULL; 1372 struct page **pages = NULL;
1373 u64 release_bytes = 0;
1327 unsigned long first_index; 1374 unsigned long first_index;
1328 size_t num_written = 0; 1375 size_t num_written = 0;
1329 int nrptrs; 1376 int nrptrs;
1330 int ret = 0; 1377 int ret = 0;
1378 bool only_release_metadata = false;
1331 bool force_page_uptodate = false; 1379 bool force_page_uptodate = false;
1332 1380
1333 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1381 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 offset); 1396 offset);
1349 size_t num_pages = (write_bytes + offset + 1397 size_t num_pages = (write_bytes + offset +
1350 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1398 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1399 size_t reserve_bytes;
1351 size_t dirty_pages; 1400 size_t dirty_pages;
1352 size_t copied; 1401 size_t copied;
1353 1402
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1362 break; 1411 break;
1363 } 1412 }
1364 1413
1365 ret = btrfs_delalloc_reserve_space(inode, 1414 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1366 num_pages << PAGE_CACHE_SHIFT); 1415 ret = btrfs_check_data_free_space(inode, reserve_bytes);
1416 if (ret == -ENOSPC &&
1417 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1418 BTRFS_INODE_PREALLOC))) {
1419 ret = check_can_nocow(inode, pos, &write_bytes);
1420 if (ret > 0) {
1421 only_release_metadata = true;
1422 /*
1423 * our prealloc extent may be smaller than
1424 * write_bytes, so scale down.
1425 */
1426 num_pages = (write_bytes + offset +
1427 PAGE_CACHE_SIZE - 1) >>
1428 PAGE_CACHE_SHIFT;
1429 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1430 ret = 0;
1431 } else {
1432 ret = -ENOSPC;
1433 }
1434 }
1435
1367 if (ret) 1436 if (ret)
1368 break; 1437 break;
1369 1438
1439 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1440 if (ret) {
1441 if (!only_release_metadata)
1442 btrfs_free_reserved_data_space(inode,
1443 reserve_bytes);
1444 break;
1445 }
1446
1447 release_bytes = reserve_bytes;
1448
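The scale-down matters because can_nocow_extent() may approve only part of the requested range. Illustrative numbers, with 4KiB pages and a page-aligned pos:

```c
size_t write_bytes = 20480;	/* caller asked to write 20KiB            */
u64 num_bytes = 12288;		/* nocow-able span per can_nocow_extent() */
size_t num_pages;

write_bytes = min_t(size_t, write_bytes, num_bytes);	/* now 12KiB      */
num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				/* 3 pages, so only 12KiB of metadata     */
				/* gets reserved for this pass            */
```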
1370 /* 1449 /*
1371 * This is going to setup the pages array with the number of 1450 * This is going to setup the pages array with the number of
1372 * pages we want, so we don't really need to worry about the 1451 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1375 ret = prepare_pages(root, file, pages, num_pages, 1454 ret = prepare_pages(root, file, pages, num_pages,
1376 pos, first_index, write_bytes, 1455 pos, first_index, write_bytes,
1377 force_page_uptodate); 1456 force_page_uptodate);
1378 if (ret) { 1457 if (ret)
1379 btrfs_delalloc_release_space(inode,
1380 num_pages << PAGE_CACHE_SHIFT);
1381 break; 1458 break;
1382 }
1383 1459
1384 copied = btrfs_copy_from_user(pos, num_pages, 1460 copied = btrfs_copy_from_user(pos, num_pages,
1385 write_bytes, pages, i); 1461 write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1409 * managed to copy. 1485 * managed to copy.
1410 */ 1486 */
1411 if (num_pages > dirty_pages) { 1487 if (num_pages > dirty_pages) {
1488 release_bytes = (num_pages - dirty_pages) <<
1489 PAGE_CACHE_SHIFT;
1412 if (copied > 0) { 1490 if (copied > 0) {
1413 spin_lock(&BTRFS_I(inode)->lock); 1491 spin_lock(&BTRFS_I(inode)->lock);
1414 BTRFS_I(inode)->outstanding_extents++; 1492 BTRFS_I(inode)->outstanding_extents++;
1415 spin_unlock(&BTRFS_I(inode)->lock); 1493 spin_unlock(&BTRFS_I(inode)->lock);
1416 } 1494 }
1417 btrfs_delalloc_release_space(inode, 1495 if (only_release_metadata)
1418 (num_pages - dirty_pages) << 1496 btrfs_delalloc_release_metadata(inode,
1419 PAGE_CACHE_SHIFT); 1497 release_bytes);
1498 else
1499 btrfs_delalloc_release_space(inode,
1500 release_bytes);
1420 } 1501 }
1421 1502
1503 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1422 if (copied > 0) { 1504 if (copied > 0) {
1423 ret = btrfs_dirty_pages(root, inode, pages, 1505 ret = btrfs_dirty_pages(root, inode, pages,
1424 dirty_pages, pos, copied, 1506 dirty_pages, pos, copied,
1425 NULL); 1507 NULL);
1426 if (ret) { 1508 if (ret) {
1427 btrfs_delalloc_release_space(inode,
1428 dirty_pages << PAGE_CACHE_SHIFT);
1429 btrfs_drop_pages(pages, num_pages); 1509 btrfs_drop_pages(pages, num_pages);
1430 break; 1510 break;
1431 } 1511 }
1432 } 1512 }
1433 1513
1514 release_bytes = 0;
1434 btrfs_drop_pages(pages, num_pages); 1515 btrfs_drop_pages(pages, num_pages);
1435 1516
1517 if (only_release_metadata && copied > 0) {
1518 u64 lockstart = round_down(pos, root->sectorsize);
1519 u64 lockend = lockstart +
1520 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1521
1522 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1523 lockend, EXTENT_NORESERVE, NULL,
1524 NULL, GFP_NOFS);
1525 only_release_metadata = false;
1526 }
1527
1436 cond_resched(); 1528 cond_resched();
1437 1529
1438 balance_dirty_pages_ratelimited(inode->i_mapping); 1530 balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1445 1537
1446 kfree(pages); 1538 kfree(pages);
1447 1539
1540 if (release_bytes) {
1541 if (only_release_metadata)
1542 btrfs_delalloc_release_metadata(inode, release_bytes);
1543 else
1544 btrfs_delalloc_release_space(inode, release_bytes);
1545 }
1546
1448 return num_written ? num_written : ret; 1547 return num_written ? num_written : ret;
1449} 1548}
1450 1549
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2175 goto out_reserve_fail; 2274 goto out_reserve_fail;
2176 } 2275 }
2177 2276
2178 /*
2179 * wait for ordered IO before we have any locks. We'll loop again
2180 * below with the locks held.
2181 */
2182 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2183
2184 mutex_lock(&inode->i_mutex); 2277 mutex_lock(&inode->i_mutex);
2185 ret = inode_newsize_ok(inode, alloc_end); 2278 ret = inode_newsize_ok(inode, alloc_end);
2186 if (ret) 2279 if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
2191 alloc_start); 2284 alloc_start);
2192 if (ret) 2285 if (ret)
2193 goto out; 2286 goto out;
2287 } else {
2288 /*
2289 * If we are fallocating from the end of the file onward, we
2290 * need to zero out the end of the page if i_size lands in the
2291 * middle of a page.
2292 */
2293 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2294 if (ret)
2295 goto out;
2194 } 2296 }
2195 2297
2298 /*
2299 * wait for ordered IO before we have any locks. We'll loop again
2300 * below with the locks held.
2301 */
2302 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2303
2196 locked_end = alloc_end - 1; 2304 locked_end = alloc_end - 1;
2197 while (1) { 2305 while (1) {
2198 struct btrfs_ordered_extent *ordered; 2306 struct btrfs_ordered_extent *ordered;
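The new else branch calls btrfs_truncate_page(inode, inode->i_size, 0, 0), which zeroes from i_size to the end of its block so an extending fallocate cannot expose stale bytes between the old EOF and the newly allocated space. Quick arithmetic, assuming 4KiB pages (illustrative):

```c
loff_t isize = 6000;				/* old EOF, mid-page */
u64 page_start = isize & ~((u64)PAGE_CACHE_SIZE - 1);	/* 4096      */
u64 zero_from = isize;				/* 6000              */
u64 zero_to = page_start + PAGE_CACHE_SIZE - 1;	/* 8191              */
/* bytes 6000..8191 get zeroed and dirtied in the page cache, so
 * reads below the enlarged i_size can never see stale data */
```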
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2750b5023526..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
213 else 213 else
214 ret = 0; 214 ret = 0;
215 spin_unlock(&rsv->lock); 215 spin_unlock(&rsv->lock);
216 return 0; 216 return ret;
217} 217}
218 218
219int btrfs_truncate_free_space_cache(struct btrfs_root *root, 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
3150 return 0; 3150 return 0;
3151} 3151}
3152 3152
3153#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
3154
3153/* 3155/*
3154 * This test just does basic sanity checking, making sure we can add an extent 3156 * This test just does basic sanity checking, making sure we can add an extent
3155 * entry and remove space from either end and the middle, and make sure we can 3157 * entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
3159{ 3161{
3160 int ret = 0; 3162 int ret = 0;
3161 3163
3162 printk(KERN_ERR "Running extent only tests\n"); 3164 test_msg("Running extent only tests\n");
3163 3165
3164 /* First just make sure we can remove an entire entry */ 3166 /* First just make sure we can remove an entire entry */
3165 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3167 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3166 if (ret) { 3168 if (ret) {
3167 printk(KERN_ERR "Error adding initial extents %d\n", ret); 3169 test_msg("Error adding initial extents %d\n", ret);
3168 return ret; 3170 return ret;
3169 } 3171 }
3170 3172
3171 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3173 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3172 if (ret) { 3174 if (ret) {
3173 printk(KERN_ERR "Error removing extent %d\n", ret); 3175 test_msg("Error removing extent %d\n", ret);
3174 return ret; 3176 return ret;
3175 } 3177 }
3176 3178
3177 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3179 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3178 printk(KERN_ERR "Full remove left some lingering space\n"); 3180 test_msg("Full remove left some lingering space\n");
3179 return -1; 3181 return -1;
3180 } 3182 }
3181 3183
3182 /* Ok edge and middle cases now */ 3184 /* Ok edge and middle cases now */
3183 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3185 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3184 if (ret) { 3186 if (ret) {
3185 printk(KERN_ERR "Error adding half extent %d\n", ret); 3187 test_msg("Error adding half extent %d\n", ret);
3186 return ret; 3188 return ret;
3187 } 3189 }
3188 3190
3189 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); 3191 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
3190 if (ret) { 3192 if (ret) {
3191 printk(KERN_ERR "Error removing tail end %d\n", ret); 3193 test_msg("Error removing tail end %d\n", ret);
3192 return ret; 3194 return ret;
3193 } 3195 }
3194 3196
3195 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3197 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3196 if (ret) { 3198 if (ret) {
3197 printk(KERN_ERR "Error removing front end %d\n", ret); 3199 test_msg("Error removing front end %d\n", ret);
3198 return ret; 3200 return ret;
3199 } 3201 }
3200 3202
3201 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); 3203 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
3202 if (ret) { 3204 if (ret) {
3203 printk(KERN_ERR "Error removing middle piece %d\n", ret); 3205 test_msg("Error removing middle piece %d\n", ret);
3204 return ret; 3206 return ret;
3205 } 3207 }
3206 3208
3207 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3209 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3208 printk(KERN_ERR "Still have space at the front\n"); 3210 test_msg("Still have space at the front\n");
3209 return -1; 3211 return -1;
3210 } 3212 }
3211 3213
3212 if (check_exists(cache, 2 * 1024 * 1024, 4096)) { 3214 if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
3213 printk(KERN_ERR "Still have space in the middle\n"); 3215 test_msg("Still have space in the middle\n");
3214 return -1; 3216 return -1;
3215 } 3217 }
3216 3218
3217 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { 3219 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
3218 printk(KERN_ERR "Still have space at the end\n"); 3220 test_msg("Still have space at the end\n");
3219 return -1; 3221 return -1;
3220 } 3222 }
3221 3223
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3230 u64 next_bitmap_offset; 3232 u64 next_bitmap_offset;
3231 int ret; 3233 int ret;
3232 3234
3233 printk(KERN_ERR "Running bitmap only tests\n"); 3235 test_msg("Running bitmap only tests\n");
3234 3236
3235 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3237 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3236 if (ret) { 3238 if (ret) {
3237 printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); 3239 test_msg("Couldn't create a bitmap entry %d\n", ret);
3238 return ret; 3240 return ret;
3239 } 3241 }
3240 3242
3241 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3243 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3242 if (ret) { 3244 if (ret) {
3243 printk(KERN_ERR "Error removing bitmap full range %d\n", ret); 3245 test_msg("Error removing bitmap full range %d\n", ret);
3244 return ret; 3246 return ret;
3245 } 3247 }
3246 3248
3247 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3249 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3248 printk(KERN_ERR "Left some space in bitmap\n"); 3250 test_msg("Left some space in bitmap\n");
3249 return -1; 3251 return -1;
3250 } 3252 }
3251 3253
3252 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3254 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3253 if (ret) { 3255 if (ret) {
3254 printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); 3256 test_msg("Couldn't add to our bitmap entry %d\n", ret);
3255 return ret; 3257 return ret;
3256 } 3258 }
3257 3259
3258 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); 3260 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
3259 if (ret) { 3261 if (ret) {
3260 printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); 3262 test_msg("Couldn't remove middle chunk %d\n", ret);
3261 return ret; 3263 return ret;
3262 } 3264 }
3263 3265
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3271 ret = add_free_space_entry(cache, next_bitmap_offset - 3273 ret = add_free_space_entry(cache, next_bitmap_offset -
3272 (2 * 1024 * 1024), 4 * 1024 * 1024, 1); 3274 (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
3273 if (ret) { 3275 if (ret) {
3274 printk(KERN_ERR "Couldn't add space that straddles two bitmaps" 3276 test_msg("Couldn't add space that straddles two bitmaps %d\n",
3275 " %d\n", ret); 3277 ret);
3276 return ret; 3278 return ret;
3277 } 3279 }
3278 3280
3279 ret = btrfs_remove_free_space(cache, next_bitmap_offset - 3281 ret = btrfs_remove_free_space(cache, next_bitmap_offset -
3280 (1 * 1024 * 1024), 2 * 1024 * 1024); 3282 (1 * 1024 * 1024), 2 * 1024 * 1024);
3281 if (ret) { 3283 if (ret) {
3282 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3284 test_msg("Couldn't remove overlapping space %d\n", ret);
3283 return ret; 3285 return ret;
3284 } 3286 }
3285 3287
3286 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), 3288 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
3287 2 * 1024 * 1024)) { 3289 2 * 1024 * 1024)) {
3288 printk(KERN_ERR "Left some space when removing overlapping\n"); 3290 test_msg("Left some space when removing overlapping\n");
3289 return -1; 3291 return -1;
3290 } 3292 }
3291 3293
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3300 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); 3302 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
3301 int ret; 3303 int ret;
3302 3304
3303 printk(KERN_ERR "Running bitmap and extent tests\n"); 3305 test_msg("Running bitmap and extent tests\n");
3304 3306
3305 /* 3307 /*
3306 * First let's do something simple, an extent at the same offset as the 3308 * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3309 */ 3311 */
3310 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); 3312 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
3311 if (ret) { 3313 if (ret) {
3312 printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); 3314 test_msg("Couldn't create bitmap entry %d\n", ret);
3313 return ret; 3315 return ret;
3314 } 3316 }
3315 3317
3316 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3318 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3317 if (ret) { 3319 if (ret) {
3318 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3320 test_msg("Couldn't add extent entry %d\n", ret);
3319 return ret; 3321 return ret;
3320 } 3322 }
3321 3323
3322 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3324 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3323 if (ret) { 3325 if (ret) {
3324 printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); 3326 test_msg("Couldn't remove extent entry %d\n", ret);
3325 return ret; 3327 return ret;
3326 } 3328 }
3327 3329
3328 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3330 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3329 printk(KERN_ERR "Left remnants after our remove\n"); 3331 test_msg("Left remnants after our remove\n");
3330 return -1; 3332 return -1;
3331 } 3333 }
3332 3334
3333 /* Now to add back the extent entry and remove from the bitmap */ 3335 /* Now to add back the extent entry and remove from the bitmap */
3334 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3336 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3335 if (ret) { 3337 if (ret) {
3336 printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); 3338 test_msg("Couldn't re-add extent entry %d\n", ret);
3337 return ret; 3339 return ret;
3338 } 3340 }
3339 3341
3340 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); 3342 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
3341 if (ret) { 3343 if (ret) {
3342 printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); 3344 test_msg("Couldn't remove from bitmap %d\n", ret);
3343 return ret; 3345 return ret;
3344 } 3346 }
3345 3347
3346 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { 3348 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
3347 printk(KERN_ERR "Left remnants in the bitmap\n"); 3349 test_msg("Left remnants in the bitmap\n");
3348 return -1; 3350 return -1;
3349 } 3351 }
3350 3352
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3354 */ 3356 */
3355 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); 3357 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
3356 if (ret) { 3358 if (ret) {
3357 printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); 3359 test_msg("Couldn't add to a bitmap %d\n", ret);
3358 return ret; 3360 return ret;
3359 } 3361 }
3360 3362
3361 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); 3363 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
3362 if (ret) { 3364 if (ret) {
3363 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3365 test_msg("Couldn't remove overlapping space %d\n", ret);
3364 return ret; 3366 return ret;
3365 } 3367 }
3366 3368
3367 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { 3369 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
3368 printk(KERN_ERR "Left over peices after removing " 3370 test_msg("Left over peices after removing overlapping\n");
3369 "overlapping\n");
3370 return -1; 3371 return -1;
3371 } 3372 }
3372 3373
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3375 /* Now with the extent entry offset into the bitmap */ 3376 /* Now with the extent entry offset into the bitmap */
3376 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); 3377 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
3377 if (ret) { 3378 if (ret) {
3378 printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); 3379 test_msg("Couldn't add space to the bitmap %d\n", ret);
3379 return ret; 3380 return ret;
3380 } 3381 }
3381 3382
3382 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); 3383 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
3383 if (ret) { 3384 if (ret) {
3384 printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); 3385 test_msg("Couldn't add extent to the cache %d\n", ret);
3385 return ret; 3386 return ret;
3386 } 3387 }
3387 3388
3388 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); 3389 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
3389 if (ret) { 3390 if (ret) {
3390 printk(KERN_ERR "Problem removing overlapping space %d\n", ret); 3391 test_msg("Problem removing overlapping space %d\n", ret);
3391 return ret; 3392 return ret;
3392 } 3393 }
3393 3394
3394 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { 3395 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
3395 printk(KERN_ERR "Left something behind when removing space"); 3396 test_msg("Left something behind when removing space");
3396 return -1; 3397 return -1;
3397 } 3398 }
3398 3399
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3410 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, 3411 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
3411 4 * 1024 * 1024, 1); 3412 4 * 1024 * 1024, 1);
3412 if (ret) { 3413 if (ret) {
3413 printk(KERN_ERR "Couldn't add bitmap %d\n", ret); 3414 test_msg("Couldn't add bitmap %d\n", ret);
3414 return ret; 3415 return ret;
3415 } 3416 }
3416 3417
3417 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, 3418 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
3418 5 * 1024 * 1024, 0); 3419 5 * 1024 * 1024, 0);
3419 if (ret) { 3420 if (ret) {
3420 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3421 test_msg("Couldn't add extent entry %d\n", ret);
3421 return ret; 3422 return ret;
3422 } 3423 }
3423 3424
3424 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, 3425 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
3425 5 * 1024 * 1024); 3426 5 * 1024 * 1024);
3426 if (ret) { 3427 if (ret) {
3427 printk(KERN_ERR "Failed to free our space %d\n", ret); 3428 test_msg("Failed to free our space %d\n", ret);
3428 return ret; 3429 return ret;
3429 } 3430 }
3430 3431
3431 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, 3432 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
3432 5 * 1024 * 1024)) { 3433 5 * 1024 * 1024)) {
3433 printk(KERN_ERR "Left stuff over\n"); 3434 test_msg("Left stuff over\n");
3434 return -1; 3435 return -1;
3435 } 3436 }
3436 3437
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3444 */ 3445 */
3445 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); 3446 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
3446 if (ret) { 3447 if (ret) {
3447 printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); 3448 test_msg("Couldn't add bitmap entry %d\n", ret);
3448 return ret; 3449 return ret;
3449 } 3450 }
3450 3451
3451 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); 3452 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
3452 if (ret) { 3453 if (ret) {
3453 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3454 test_msg("Couldn't add extent entry %d\n", ret);
3454 return ret; 3455 return ret;
3455 } 3456 }
3456 3457
3457 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); 3458 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
3458 if (ret) { 3459 if (ret) {
3459 printk(KERN_ERR "Error removing bitmap and extent " 3460 test_msg("Error removing bitmap and extent overlapping %d\n", ret);
3460 "overlapping %d\n", ret);
3461 return ret; 3461 return ret;
3462 } 3462 }
3463 3463
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
3469{ 3469{
3470 struct btrfs_block_group_cache *cache; 3470 struct btrfs_block_group_cache *cache;
3471 3471
3472 printk(KERN_ERR "Running btrfs free space cache tests\n"); 3472 test_msg("Running btrfs free space cache tests\n");
3473 3473
3474 cache = init_test_block_group(); 3474 cache = init_test_block_group();
3475 if (!cache) { 3475 if (!cache) {
3476 printk(KERN_ERR "Couldn't run the tests\n"); 3476 test_msg("Couldn't run the tests\n");
3477 return; 3477 return;
3478 } 3478 }
3479 3479
@@ -3487,6 +3487,9 @@ out:
3487 __btrfs_remove_free_space_cache(cache->free_space_ctl); 3487 __btrfs_remove_free_space_cache(cache->free_space_ctl);
3488 kfree(cache->free_space_ctl); 3488 kfree(cache->free_space_ctl);
3489 kfree(cache); 3489 kfree(cache);
3490 printk(KERN_ERR "Free space cache tests finished\n"); 3490 test_msg("Free space cache tests finished\n");
3491} 3491}
3492#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3492#undef test_msg
3493#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3494void btrfs_test_free_space_cache(void) {}
3495#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
114 u64 *trimmed, u64 start, u64 end, u64 minlen); 114 u64 *trimmed, u64 start, u64 end, u64 minlen);
115 115
116#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
117void btrfs_test_free_space_cache(void); 116void btrfs_test_free_space_cache(void);
118#endif
119 117
120#endif 118#endif
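Dropping the #ifdef from the header works because free-space-cache.c now supplies an empty definition when the tests are compiled out, the usual kernel pattern for CONFIG-gated self-tests: declare unconditionally, stub the disabled build, and keep call sites guard-free. The call site (btrfs' module init in super.c, sketched here) then reads simply:

```c
static int __init init_btrfs_fs(void)
{
	/* ... the usual cache and module registration ... */

	btrfs_test_free_space_cache();	/* no-op unless
					 * CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
	return 0;
}
```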
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f9d16b70d3d..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/btrfs.h> 43#include <linux/btrfs.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/posix_acl_xattr.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
@@ -57,6 +58,7 @@
57#include "free-space-cache.h" 58#include "free-space-cache.h"
58#include "inode-map.h" 59#include "inode-map.h"
59#include "backref.h" 60#include "backref.h"
61#include "hash.h"
60 62
61struct btrfs_iget_args { 63struct btrfs_iget_args {
62 u64 ino; 64 u64 ino;
@@ -701,8 +703,12 @@ retry:
701 async_extent->nr_pages = 0; 703 async_extent->nr_pages = 0;
702 async_extent->pages = NULL; 704 async_extent->pages = NULL;
703 705
704 if (ret == -ENOSPC) 706 if (ret == -ENOSPC) {
707 unlock_extent(io_tree, async_extent->start,
708 async_extent->start +
709 async_extent->ram_size - 1);
705 goto retry; 710 goto retry;
711 }
706 goto out_free; 712 goto out_free;
707 } 713 }
708 714
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1529 spin_unlock(&BTRFS_I(inode)->lock); 1535 spin_unlock(&BTRFS_I(inode)->lock);
1530} 1536}
1531 1537
1538static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1539 struct inode *inode)
1540{
1541 spin_lock(&root->delalloc_lock);
1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 root->nr_delalloc_inodes++;
1548 if (root->nr_delalloc_inodes == 1) {
1549 spin_lock(&root->fs_info->delalloc_root_lock);
1550 BUG_ON(!list_empty(&root->delalloc_root));
1551 list_add_tail(&root->delalloc_root,
1552 &root->fs_info->delalloc_roots);
1553 spin_unlock(&root->fs_info->delalloc_root_lock);
1554 }
1555 }
1556 spin_unlock(&root->delalloc_lock);
1557}
1558
1559static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1560 struct inode *inode)
1561{
1562 spin_lock(&root->delalloc_lock);
1563 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1564 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1565 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1566 &BTRFS_I(inode)->runtime_flags);
1567 root->nr_delalloc_inodes--;
1568 if (!root->nr_delalloc_inodes) {
1569 spin_lock(&root->fs_info->delalloc_root_lock);
1570 BUG_ON(list_empty(&root->delalloc_root));
1571 list_del_init(&root->delalloc_root);
1572 spin_unlock(&root->fs_info->delalloc_root_lock);
1573 }
1574 }
1575 spin_unlock(&root->delalloc_lock);
1576}
1577
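These helpers are the writer half of a new two-level scheme: each root keeps its own delalloc_inodes list under its own delalloc_lock, and a root with at least one delalloc inode sits on fs_info->delalloc_roots. Flushers can then do most of their list walking under per-root locks instead of one global delalloc lock. A sketch of the reader side, using the names introduced here (the real walker in this series also takes references and splices the lists):

```c
struct btrfs_root *root;
struct btrfs_inode *binode;

spin_lock(&fs_info->delalloc_root_lock);
list_for_each_entry(root, &fs_info->delalloc_roots, delalloc_root) {
	spin_lock(&root->delalloc_lock);
	list_for_each_entry(binode, &root->delalloc_inodes, delalloc_inodes) {
		/* queue writeback for &binode->vfs_inode */
	}
	spin_unlock(&root->delalloc_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
```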
1532/* 1578/*
1533 * extent_io.c set_bit_hook, used to track delayed allocation 1579 * extent_io.c set_bit_hook, used to track delayed allocation
1534 * bytes in this file, and to maintain the list of inodes that 1580 * bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1561 spin_lock(&BTRFS_I(inode)->lock); 1607 spin_lock(&BTRFS_I(inode)->lock);
1562 BTRFS_I(inode)->delalloc_bytes += len; 1608 BTRFS_I(inode)->delalloc_bytes += len;
1563 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1609 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1564 &BTRFS_I(inode)->runtime_flags)) { 1610 &BTRFS_I(inode)->runtime_flags))
1565 spin_lock(&root->fs_info->delalloc_lock); 1611 btrfs_add_delalloc_inodes(root, inode);
1566 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1567 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1568 &root->fs_info->delalloc_inodes);
1569 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1570 &BTRFS_I(inode)->runtime_flags);
1571 }
1572 spin_unlock(&root->fs_info->delalloc_lock);
1573 }
1574 spin_unlock(&BTRFS_I(inode)->lock); 1612 spin_unlock(&BTRFS_I(inode)->lock);
1575 } 1613 }
1576} 1614}
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1604 btrfs_delalloc_release_metadata(inode, len); 1642 btrfs_delalloc_release_metadata(inode, len);
1605 1643
1606 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1644 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1607 && do_list) 1645 && do_list && !(state->state & EXTENT_NORESERVE))
1608 btrfs_free_reserved_data_space(inode, len); 1646 btrfs_free_reserved_data_space(inode, len);
1609 1647
1610 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1648 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1613 BTRFS_I(inode)->delalloc_bytes -= len; 1651 BTRFS_I(inode)->delalloc_bytes -= len;
1614 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1652 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1615 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1653 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1616 &BTRFS_I(inode)->runtime_flags)) { 1654 &BTRFS_I(inode)->runtime_flags))
1617 spin_lock(&root->fs_info->delalloc_lock); 1655 btrfs_del_delalloc_inode(root, inode);
1618 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1619 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1620 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1621 &BTRFS_I(inode)->runtime_flags);
1622 }
1623 spin_unlock(&root->fs_info->delalloc_lock);
1624 }
1625 spin_unlock(&BTRFS_I(inode)->lock); 1656 spin_unlock(&BTRFS_I(inode)->lock);
1626 } 1657 }
1627} 1658}
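The EXTENT_NORESERVE test pairs this hook with the nocow write path added in file.c: those writes reserve only metadata, so the ranges they dirty must not refund data space when the delalloc bits clear. The bit is a new extent-state flag (the one-line extent_io.h change in this diffstat); the two sides fit together like this (sketch):

```c
/* Writer (file.c above): data space was never reserved, tag the range. */
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
	       EXTENT_NORESERVE, NULL, NULL, GFP_NOFS);

/* Cleaner (this hunk, plus its reloc-root check): skip the refund. */
if (do_list && !(state->state & EXTENT_NORESERVE))
	btrfs_free_reserved_data_space(inode, len);
```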
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2263 return 0; 2294 return 0;
2264 return PTR_ERR(root); 2295 return PTR_ERR(root);
2265 } 2296 }
2266 if (btrfs_root_refs(&root->root_item) == 0) {
2267 srcu_read_unlock(&fs_info->subvol_srcu, index);
2268 /* parse ENOENT to 0 */
2269 return 0;
2270 }
2271 2297
2272 /* step 2: get inode */ 2298 /* step 2: get inode */
2273 key.objectid = backref->inum; 2299 key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3215 /* 1 for the orphan item deletion. */ 3241 /* 1 for the orphan item deletion. */
3216 trans = btrfs_start_transaction(root, 1); 3242 trans = btrfs_start_transaction(root, 1);
3217 if (IS_ERR(trans)) { 3243 if (IS_ERR(trans)) {
3244 iput(inode);
3218 ret = PTR_ERR(trans); 3245 ret = PTR_ERR(trans);
3219 goto out; 3246 goto out;
3220 } 3247 }
3221 ret = btrfs_orphan_add(trans, inode); 3248 ret = btrfs_orphan_add(trans, inode);
3222 btrfs_end_transaction(trans, root); 3249 btrfs_end_transaction(trans, root);
3223 if (ret) 3250 if (ret) {
3251 iput(inode);
3224 goto out; 3252 goto out;
3253 }
3225 3254
3226 ret = btrfs_truncate(inode); 3255 ret = btrfs_truncate(inode);
3227 if (ret) 3256 if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3274{ 3303{
3275 u32 nritems = btrfs_header_nritems(leaf); 3304 u32 nritems = btrfs_header_nritems(leaf);
3276 struct btrfs_key found_key; 3305 struct btrfs_key found_key;
3306 static u64 xattr_access = 0;
3307 static u64 xattr_default = 0;
3277 int scanned = 0; 3308 int scanned = 0;
3278 3309
3310 if (!xattr_access) {
3311 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3312 strlen(POSIX_ACL_XATTR_ACCESS));
3313 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3314 strlen(POSIX_ACL_XATTR_DEFAULT));
3315 }
3316
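An xattr item's key offset is the hash of its name, so precomputing the two POSIX ACL name hashes lets this scan reject unrelated xattrs by key alone, without reading names out of the leaf. The hash, per fs/btrfs/hash.h (pulled in by the new include at the top of this diff):

```c
#include <linux/crc32c.h>

/* btrfs dir-item / xattr name hash: crc32c seeded with (u32)~1. */
static inline u64 btrfs_name_hash(const char *name, int len)
{
	return crc32c((u32)~1, name, len);
}
```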
3279 slot++; 3317 slot++;
3280 while (slot < nritems) { 3318 while (slot < nritems) {
3281 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3319 btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3285 return 0; 3323 return 0;
3286 3324
3287 /* we found an xattr, assume we've got an acl */ 3325 /* we found an xattr, assume we've got an acl */
3288 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 3326 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3289 return 1; 3327 if (found_key.offset == xattr_access ||
3328 found_key.offset == xattr_default)
3329 return 1;
3330 }
3290 3331
3291 /* 3332 /*
3292 * we found a key greater than an xattr key, there can't 3333 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3660 } 3701 }
3661 return ret; 3702 return ret;
3662} 3703}
3663
3664
3665/* helper to check if there is any shared block in the path */
3666static int check_path_shared(struct btrfs_root *root,
3667 struct btrfs_path *path)
3668{
3669 struct extent_buffer *eb;
3670 int level;
3671 u64 refs = 1;
3672
3673 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3674 int ret;
3675
3676 if (!path->nodes[level])
3677 break;
3678 eb = path->nodes[level];
3679 if (!btrfs_block_can_be_shared(root, eb))
3680 continue;
3681 ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
3682 &refs, NULL);
3683 if (refs > 1)
3684 return 1;
3685 }
3686 return 0;
3687}
3688 3704
3689/* 3705/*
3690 * helper to start transaction for unlink and rmdir. 3706 * helper to start transaction for unlink and rmdir.
3691 * 3707 *
3692 * unlink and rmdir are special in btrfs, they do not always free space. 3708 * unlink and rmdir are special in btrfs; they do not always free space, so
3693 * so in enospc case, we should make sure they will free space before 3709 * if we cannot make our reservations the normal way, try and see if there is
3694 * allowing them to use the global metadata reservation. 3710 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3711 * allow the unlink to occur.
3695 */ 3712 */
3696static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 3713static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3697 struct dentry *dentry)
3698{ 3714{
3699 struct btrfs_trans_handle *trans; 3715 struct btrfs_trans_handle *trans;
3700 struct btrfs_root *root = BTRFS_I(dir)->root; 3716 struct btrfs_root *root = BTRFS_I(dir)->root;
3701 struct btrfs_path *path;
3702 struct btrfs_dir_item *di;
3703 struct inode *inode = dentry->d_inode;
3704 u64 index;
3705 int check_link = 1;
3706 int err = -ENOSPC;
3707 int ret; 3717 int ret;
3708 u64 ino = btrfs_ino(inode);
3709 u64 dir_ino = btrfs_ino(dir);
3710 3718
3711 /* 3719 /*
3712 * 1 for the possible orphan item 3720 * 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3719 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 3727 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3720 return trans; 3728 return trans;
3721 3729
3722 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 3730 if (PTR_ERR(trans) == -ENOSPC) {
3723 return ERR_PTR(-ENOSPC); 3731 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3724
3725 /* check if there is someone else holds reference */
3726 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
3727 return ERR_PTR(-ENOSPC);
3728
3729 if (atomic_read(&inode->i_count) > 2)
3730 return ERR_PTR(-ENOSPC);
3731
3732 if (xchg(&root->fs_info->enospc_unlink, 1))
3733 return ERR_PTR(-ENOSPC);
3734
3735 path = btrfs_alloc_path();
3736 if (!path) {
3737 root->fs_info->enospc_unlink = 0;
3738 return ERR_PTR(-ENOMEM);
3739 }
3740 3732
3741 /* 1 for the orphan item */ 3733 trans = btrfs_start_transaction(root, 0);
3742 trans = btrfs_start_transaction(root, 1); 3734 if (IS_ERR(trans))
3743 if (IS_ERR(trans)) { 3735 return trans;
3744 btrfs_free_path(path); 3736 ret = btrfs_cond_migrate_bytes(root->fs_info,
3745 root->fs_info->enospc_unlink = 0; 3737 &root->fs_info->trans_block_rsv,
3746 return trans; 3738 num_bytes, 5);
3747 } 3739 if (ret) {
3748 3740 btrfs_end_transaction(trans, root);
3749 path->skip_locking = 1; 3741 return ERR_PTR(ret);
3750 path->search_commit_root = 1;
3751
3752 ret = btrfs_lookup_inode(trans, root, path,
3753 &BTRFS_I(dir)->location, 0);
3754 if (ret < 0) {
3755 err = ret;
3756 goto out;
3757 }
3758 if (ret == 0) {
3759 if (check_path_shared(root, path))
3760 goto out;
3761 } else {
3762 check_link = 0;
3763 }
3764 btrfs_release_path(path);
3765
3766 ret = btrfs_lookup_inode(trans, root, path,
3767 &BTRFS_I(inode)->location, 0);
3768 if (ret < 0) {
3769 err = ret;
3770 goto out;
3771 }
3772 if (ret == 0) {
3773 if (check_path_shared(root, path))
3774 goto out;
3775 } else {
3776 check_link = 0;
3777 }
3778 btrfs_release_path(path);
3779
3780 if (ret == 0 && S_ISREG(inode->i_mode)) {
3781 ret = btrfs_lookup_file_extent(trans, root, path,
3782 ino, (u64)-1, 0);
3783 if (ret < 0) {
3784 err = ret;
3785 goto out;
3786 } 3742 }
3787 BUG_ON(ret == 0); /* Corruption */
3788 if (check_path_shared(root, path))
3789 goto out;
3790 btrfs_release_path(path);
3791 }
3792
3793 if (!check_link) {
3794 err = 0;
3795 goto out;
3796 }
3797
3798 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3799 dentry->d_name.name, dentry->d_name.len, 0);
3800 if (IS_ERR(di)) {
3801 err = PTR_ERR(di);
3802 goto out;
3803 }
3804 if (di) {
3805 if (check_path_shared(root, path))
3806 goto out;
3807 } else {
3808 err = 0;
3809 goto out;
3810 }
3811 btrfs_release_path(path);
3812
3813 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3814 dentry->d_name.len, ino, dir_ino, 0,
3815 &index);
3816 if (ret) {
3817 err = ret;
3818 goto out;
3819 }
3820
3821 if (check_path_shared(root, path))
3822 goto out;
3823
3824 btrfs_release_path(path);
3825
3826 /*
3827 * This is a commit root search, if we can lookup inode item and other
3828 * relative items in the commit root, it means the transaction of
3829 * dir/file creation has been committed, and the dir index item that we
3830 * delay to insert has also been inserted into the commit root. So
3831 * we needn't worry about the delayed insertion of the dir index item
3832 * here.
3833 */
3834 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
3835 dentry->d_name.name, dentry->d_name.len, 0);
3836 if (IS_ERR(di)) {
3837 err = PTR_ERR(di);
3838 goto out;
3839 }
3840 BUG_ON(ret == -ENOENT);
3841 if (check_path_shared(root, path))
3842 goto out;
3843
3844 err = 0;
3845out:
3846 btrfs_free_path(path);
3847 /* Migrate the orphan reservation over */
3848 if (!err)
3849 err = btrfs_block_rsv_migrate(trans->block_rsv,
3850 &root->fs_info->global_block_rsv,
3851 trans->bytes_reserved);
3852
3853 if (err) {
3854 btrfs_end_transaction(trans, root);
3855 root->fs_info->enospc_unlink = 0;
3856 return ERR_PTR(err);
3857 }
3858
3859 trans->block_rsv = &root->fs_info->global_block_rsv;
3860 return trans;
3861}
3862
3863static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3864 struct btrfs_root *root)
3865{
3866 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3867 btrfs_block_rsv_release(root, trans->block_rsv,
3868 trans->bytes_reserved);
3869 trans->block_rsv = &root->fs_info->trans_block_rsv; 3743 trans->block_rsv = &root->fs_info->trans_block_rsv;
3870 BUG_ON(!root->fs_info->enospc_unlink); 3744 trans->bytes_reserved = num_bytes;
3871 root->fs_info->enospc_unlink = 0;
3872 } 3745 }
3873 btrfs_end_transaction(trans, root); 3746 return trans;
3874} 3747}
3875 3748
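btrfs_cond_migrate_bytes() is introduced elsewhere in this series (extent-tree.c in the diffstat). The shape this caller assumes: migrate num_bytes from the global block reserve into the destination reserve, succeeding only while the global reserve keeps a comfortable margin, with the last argument as the slack factor:

```c
/* Assumed prototype, matching the call above; the slack test itself
 * lives in extent-tree.c. */
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_rsv *dest_rsv,
			     u64 num_bytes, int min_factor);
```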
3876static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3749static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3880 struct inode *inode = dentry->d_inode; 3753 struct inode *inode = dentry->d_inode;
3881 int ret; 3754 int ret;
3882 3755
3883 trans = __unlink_start_trans(dir, dentry); 3756 trans = __unlink_start_trans(dir);
3884 if (IS_ERR(trans)) 3757 if (IS_ERR(trans))
3885 return PTR_ERR(trans); 3758 return PTR_ERR(trans);
3886 3759
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3898 } 3771 }
3899 3772
3900out: 3773out:
3901 __unlink_end_trans(trans, root); 3774 btrfs_end_transaction(trans, root);
3902 btrfs_btree_balance_dirty(root); 3775 btrfs_btree_balance_dirty(root);
3903 return ret; 3776 return ret;
3904} 3777}
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3995 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3868 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3996 return -EPERM; 3869 return -EPERM;
3997 3870
3998 trans = __unlink_start_trans(dir, dentry); 3871 trans = __unlink_start_trans(dir);
3999 if (IS_ERR(trans)) 3872 if (IS_ERR(trans))
4000 return PTR_ERR(trans); 3873 return PTR_ERR(trans);
4001 3874
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4017 if (!err) 3890 if (!err)
4018 btrfs_i_size_write(inode, 0); 3891 btrfs_i_size_write(inode, 0);
4019out: 3892out:
4020 __unlink_end_trans(trans, root); 3893 btrfs_end_transaction(trans, root);
4021 btrfs_btree_balance_dirty(root); 3894 btrfs_btree_balance_dirty(root);
4022 3895
4023 return err; 3896 return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4395 u64 hole_size; 4268 u64 hole_size;
4396 int err = 0; 4269 int err = 0;
4397 4270
4271 /*
4272 * If our size started in the middle of a page, we need to zero out the
4273 * rest of the page before we expand the i_size; otherwise we could
4274 * expose stale data.
4275 */
4276 err = btrfs_truncate_page(inode, oldsize, 0, 0);
4277 if (err)
4278 return err;
4279
4398 if (size <= hole_start) 4280 if (size <= hole_start)
4399 return 0; 4281 return 0;
4400 4282
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
4822 goto out; 4704 goto out;
4823 } 4705 }
4824 4706
4825 if (btrfs_root_refs(&new_root->root_item) == 0) {
4826 err = -ENOENT;
4827 goto out;
4828 }
4829
4830 *sub_root = new_root; 4707 *sub_root = new_root;
4831 location->objectid = btrfs_root_dirid(&new_root->root_item); 4708 location->objectid = btrfs_root_dirid(&new_root->root_item);
4832 location->type = BTRFS_INODE_ITEM_KEY; 4709 location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5092 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4969 if (!(inode->i_sb->s_flags & MS_RDONLY))
5093 ret = btrfs_orphan_cleanup(sub_root); 4970 ret = btrfs_orphan_cleanup(sub_root);
5094 up_read(&root->fs_info->cleanup_work_sem); 4971 up_read(&root->fs_info->cleanup_work_sem);
5095 if (ret) 4972 if (ret) {
4973 iput(inode);
5096 inode = ERR_PTR(ret); 4974 inode = ERR_PTR(ret);
4975 }
5097 } 4976 }
5098 4977
5099 return inode; 4978 return inode;
@@ -6501,10 +6380,10 @@ out:
6501 * returns 1 when the nocow is safe, < 1 on error, 0 if the 6380 * returns 1 when the nocow is safe, < 1 on error, 0 if the
6502 * block must be cow'd 6381 * block must be cow'd
6503 */ 6382 */
6504static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 6383noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
6505 struct inode *inode, u64 offset, u64 *len, 6384 struct inode *inode, u64 offset, u64 *len,
6506 u64 *orig_start, u64 *orig_block_len, 6385 u64 *orig_start, u64 *orig_block_len,
6507 u64 *ram_bytes) 6386 u64 *ram_bytes)
6508{ 6387{
6509 struct btrfs_path *path; 6388 struct btrfs_path *path;
6510 int ret; 6389 int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6518 u64 num_bytes; 6397 u64 num_bytes;
6519 int slot; 6398 int slot;
6520 int found_type; 6399 int found_type;
6521 6400 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6522 path = btrfs_alloc_path(); 6401 path = btrfs_alloc_path();
6523 if (!path) 6402 if (!path)
6524 return -ENOMEM; 6403 return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6558 /* not a regular extent, must cow */ 6437 /* not a regular extent, must cow */
6559 goto out; 6438 goto out;
6560 } 6439 }
6440
6441 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6442 goto out;
6443
6561 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6444 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6445 if (disk_bytenr == 0)
6446 goto out;
6447
6448 if (btrfs_file_extent_compression(leaf, fi) ||
6449 btrfs_file_extent_encryption(leaf, fi) ||
6450 btrfs_file_extent_other_encoding(leaf, fi))
6451 goto out;
6452
6562 backref_offset = btrfs_file_extent_offset(leaf, fi); 6453 backref_offset = btrfs_file_extent_offset(leaf, fi);
6563 6454
6564 *orig_start = key.offset - backref_offset; 6455 if (orig_start) {
6565 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 6456 *orig_start = key.offset - backref_offset;
6566 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6457 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6458 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6459 }
6567 6460
6568 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 6461 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6569 if (extent_end < offset + *len) {
6570 /* extent doesn't include our full range, must cow */
6571 goto out;
6572 }
6573 6462
6574 if (btrfs_extent_readonly(root, disk_bytenr)) 6463 if (btrfs_extent_readonly(root, disk_bytenr))
6575 goto out; 6464 goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6813 if (IS_ERR(trans)) 6702 if (IS_ERR(trans))
6814 goto must_cow; 6703 goto must_cow;
6815 6704
6816 if (can_nocow_odirect(trans, inode, start, &len, &orig_start, 6705 if (can_nocow_extent(trans, inode, start, &len, &orig_start,
6817 &orig_block_len, &ram_bytes) == 1) { 6706 &orig_block_len, &ram_bytes) == 1) {
6818 if (type == BTRFS_ORDERED_PREALLOC) { 6707 if (type == BTRFS_ORDERED_PREALLOC) {
6819 free_extent_map(em); 6708 free_extent_map(em);
6820 em = create_pinned_em(inode, start, len, 6709 em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7243{ 7132{
7244 struct btrfs_root *root = BTRFS_I(inode)->root; 7133 struct btrfs_root *root = BTRFS_I(inode)->root;
7245 struct btrfs_dio_private *dip; 7134 struct btrfs_dio_private *dip;
7246 struct bio_vec *bvec = dio_bio->bi_io_vec;
7247 struct bio *io_bio; 7135 struct bio *io_bio;
7248 int skip_sum; 7136 int skip_sum;
7249 int write = rw & REQ_WRITE; 7137 int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7265 } 7153 }
7266 7154
7267 dip->private = dio_bio->bi_private; 7155 dip->private = dio_bio->bi_private;
7268 io_bio->bi_private = dio_bio->bi_private;
7269 dip->inode = inode; 7156 dip->inode = inode;
7270 dip->logical_offset = file_offset; 7157 dip->logical_offset = file_offset;
7271 7158 dip->bytes = dio_bio->bi_size;
7272 dip->bytes = 0;
7273 do {
7274 dip->bytes += bvec->bv_len;
7275 bvec++;
7276 } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
7277
7278 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7159 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7279 io_bio->bi_private = dip; 7160 io_bio->bi_private = dip;
7280 dip->errors = 0; 7161 dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7373 atomic_inc(&inode->i_dio_count); 7254 atomic_inc(&inode->i_dio_count);
7374 smp_mb__after_atomic_inc(); 7255 smp_mb__after_atomic_inc();
7375 7256
7257 /*
7258 * The generic stuff only does filemap_write_and_wait_range, which isn't
7259 * enough if we've written compressed pages to this area, so we need to
7260 * call btrfs_wait_ordered_range to make absolutely sure that any
7261 * outstanding dirty pages are on disk.
7262 */
7263 count = iov_length(iov, nr_segs);
7264 btrfs_wait_ordered_range(inode, offset, count);
7265
7376 if (rw & WRITE) { 7266 if (rw & WRITE) {
7377 count = iov_length(iov, nr_segs);
7378 /* 7267 /*
7379 * If the write DIO is beyond the EOF, we need update 7268 * If the write DIO is beyond the EOF, we need update
7380 * the isize, but it is protected by i_mutex. So we can 7269 * the isize, but it is protected by i_mutex. So we can
@@ -7694,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
7694{ 7583{
7695 struct btrfs_root *root = BTRFS_I(inode)->root; 7584 struct btrfs_root *root = BTRFS_I(inode)->root;
7696 struct btrfs_block_rsv *rsv; 7585 struct btrfs_block_rsv *rsv;
7697 int ret; 7586 int ret = 0;
7698 int err = 0; 7587 int err = 0;
7699 struct btrfs_trans_handle *trans; 7588 struct btrfs_trans_handle *trans;
7700 u64 mask = root->sectorsize - 1; 7589 u64 mask = root->sectorsize - 1;
7701 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 7590 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7702 7591
7703 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
7704 if (ret)
7705 return ret;
7706
7707 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 7592 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7708 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 7593 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
7709 7594
@@ -7961,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
7961 */ 7846 */
7962 smp_mb(); 7847 smp_mb();
7963 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7848 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7964 spin_lock(&root->fs_info->ordered_extent_lock); 7849 spin_lock(&root->fs_info->ordered_root_lock);
7965 list_del_init(&BTRFS_I(inode)->ordered_operations); 7850 list_del_init(&BTRFS_I(inode)->ordered_operations);
7966 spin_unlock(&root->fs_info->ordered_extent_lock); 7851 spin_unlock(&root->fs_info->ordered_root_lock);
7967 } 7852 }
7968 7853
7969 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7854 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8333,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8333 * some fairly slow code that needs optimization. This walks the list 8218 * some fairly slow code that needs optimization. This walks the list
8334 * of all the inodes with pending delalloc and forces them to disk. 8219 * of all the inodes with pending delalloc and forces them to disk.
8335 */ 8220 */
8336int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8221static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8337{ 8222{
8338 struct btrfs_inode *binode; 8223 struct btrfs_inode *binode;
8339 struct inode *inode; 8224 struct inode *inode;
@@ -8342,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8342 struct list_head splice; 8227 struct list_head splice;
8343 int ret = 0; 8228 int ret = 0;
8344 8229
8345 if (root->fs_info->sb->s_flags & MS_RDONLY)
8346 return -EROFS;
8347
8348 INIT_LIST_HEAD(&works); 8230 INIT_LIST_HEAD(&works);
8349 INIT_LIST_HEAD(&splice); 8231 INIT_LIST_HEAD(&splice);
8350 8232
8351 spin_lock(&root->fs_info->delalloc_lock); 8233 spin_lock(&root->delalloc_lock);
8352 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 8234 list_splice_init(&root->delalloc_inodes, &splice);
8353 while (!list_empty(&splice)) { 8235 while (!list_empty(&splice)) {
8354 binode = list_entry(splice.next, struct btrfs_inode, 8236 binode = list_entry(splice.next, struct btrfs_inode,
8355 delalloc_inodes); 8237 delalloc_inodes);
8356 8238
8357 list_del_init(&binode->delalloc_inodes); 8239 list_move_tail(&binode->delalloc_inodes,
8358 8240 &root->delalloc_inodes);
8359 inode = igrab(&binode->vfs_inode); 8241 inode = igrab(&binode->vfs_inode);
8360 if (!inode) { 8242 if (!inode) {
8361 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 8243 cond_resched_lock(&root->delalloc_lock);
8362 &binode->runtime_flags);
8363 continue; 8244 continue;
8364 } 8245 }
8365 8246 spin_unlock(&root->delalloc_lock);
8366 list_add_tail(&binode->delalloc_inodes,
8367 &root->fs_info->delalloc_inodes);
8368 spin_unlock(&root->fs_info->delalloc_lock);
8369 8247
8370 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8248 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8371 if (unlikely(!work)) { 8249 if (unlikely(!work)) {
@@ -8377,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8377 &work->work); 8255 &work->work);
8378 8256
8379 cond_resched(); 8257 cond_resched();
8380 spin_lock(&root->fs_info->delalloc_lock); 8258 spin_lock(&root->delalloc_lock);
8381 } 8259 }
8382 spin_unlock(&root->fs_info->delalloc_lock); 8260 spin_unlock(&root->delalloc_lock);
8383 8261
8384 list_for_each_entry_safe(work, next, &works, list) { 8262 list_for_each_entry_safe(work, next, &works, list) {
8385 list_del_init(&work->list); 8263 list_del_init(&work->list);
8386 btrfs_wait_and_free_delalloc_work(work); 8264 btrfs_wait_and_free_delalloc_work(work);
8387 } 8265 }
8266 return 0;
8267out:
8268 list_for_each_entry_safe(work, next, &works, list) {
8269 list_del_init(&work->list);
8270 btrfs_wait_and_free_delalloc_work(work);
8271 }
8272
8273 if (!list_empty_careful(&splice)) {
8274 spin_lock(&root->delalloc_lock);
8275 list_splice_tail(&splice, &root->delalloc_inodes);
8276 spin_unlock(&root->delalloc_lock);
8277 }
8278 return ret;
8279}
8280
8281int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8282{
8283 int ret;
8388 8284
8389 /* the filemap_flush will queue IO into the worker threads, but 8285 if (root->fs_info->sb->s_flags & MS_RDONLY)
8286 return -EROFS;
8287
8288 ret = __start_delalloc_inodes(root, delay_iput);
8289 /*
8290 * the filemap_flush will queue IO into the worker threads, but
8390 * we have to make sure the IO is actually started and that 8291 * we have to make sure the IO is actually started and that
8391 * ordered extents get created before we return 8292 * ordered extents get created before we return
8392 */ 8293 */
@@ -8398,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8398 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 8299 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8399 } 8300 }
8400 atomic_dec(&root->fs_info->async_submit_draining); 8301 atomic_dec(&root->fs_info->async_submit_draining);
8401 return 0; 8302 return ret;
8402out: 8303}
8403 list_for_each_entry_safe(work, next, &works, list) { 8304
8404 list_del_init(&work->list); 8305int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8405 btrfs_wait_and_free_delalloc_work(work); 8306 int delay_iput)
8307{
8308 struct btrfs_root *root;
8309 struct list_head splice;
8310 int ret;
8311
8312 if (fs_info->sb->s_flags & MS_RDONLY)
8313 return -EROFS;
8314
8315 INIT_LIST_HEAD(&splice);
8316
8317 spin_lock(&fs_info->delalloc_root_lock);
8318 list_splice_init(&fs_info->delalloc_roots, &splice);
8319 while (!list_empty(&splice)) {
8320 root = list_first_entry(&splice, struct btrfs_root,
8321 delalloc_root);
8322 root = btrfs_grab_fs_root(root);
8323 BUG_ON(!root);
8324 list_move_tail(&root->delalloc_root,
8325 &fs_info->delalloc_roots);
8326 spin_unlock(&fs_info->delalloc_root_lock);
8327
8328 ret = __start_delalloc_inodes(root, delay_iput);
8329 btrfs_put_fs_root(root);
8330 if (ret)
8331 goto out;
8332
8333 spin_lock(&fs_info->delalloc_root_lock);
8406 } 8334 }
8335 spin_unlock(&fs_info->delalloc_root_lock);
8407 8336
8337 atomic_inc(&fs_info->async_submit_draining);
8338 while (atomic_read(&fs_info->nr_async_submits) ||
8339 atomic_read(&fs_info->async_delalloc_pages)) {
8340 wait_event(fs_info->async_submit_wait,
8341 (atomic_read(&fs_info->nr_async_submits) == 0 &&
8342 atomic_read(&fs_info->async_delalloc_pages) == 0));
8343 }
8344 atomic_dec(&fs_info->async_submit_draining);
8345 return 0;
8346out:
8408 if (!list_empty_careful(&splice)) { 8347 if (!list_empty_careful(&splice)) {
8409 spin_lock(&root->fs_info->delalloc_lock); 8348 spin_lock(&fs_info->delalloc_root_lock);
8410 list_splice_tail(&splice, &root->fs_info->delalloc_inodes); 8349 list_splice_tail(&splice, &fs_info->delalloc_roots);
8411 spin_unlock(&root->fs_info->delalloc_lock); 8350 spin_unlock(&fs_info->delalloc_root_lock);
8412 } 8351 }
8413 return ret; 8352 return ret;
8414} 8353}
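
Aside: both __start_delalloc_inodes and btrfs_start_all_delalloc_inodes above follow the same drain discipline: detach work from the shared list under the spinlock, then drop the lock before doing anything slow. A hedged userspace sketch of that discipline (pthread names stand in for the kernel primitives):

        #include <pthread.h>

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static int pending[64];         /* stand-in for the delalloc list */
        static int npending;

        static void flush_one(int inode_id)
        {
                (void)inode_id;         /* queue the flush work here */
        }

        static void drain_pending(void)
        {
                pthread_mutex_lock(&lock);
                while (npending > 0) {
                        int item = pending[--npending];

                        pthread_mutex_unlock(&lock); /* never flush under the lock */
                        flush_one(item);
                        pthread_mutex_lock(&lock);
                }
                pthread_mutex_unlock(&lock);
        }
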
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd7e96c73cb7..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
555 if (!root->ref_cows) 555 if (!root->ref_cows)
556 return -EINVAL; 556 return -EINVAL;
557 557
558 ret = btrfs_start_delalloc_inodes(root, 0);
559 if (ret)
560 return ret;
561
562 btrfs_wait_ordered_extents(root, 0);
563
558 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 564 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
559 if (!pending_snapshot) 565 if (!pending_snapshot)
560 return -ENOMEM; 566 return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2354 if (ret) 2360 if (ret)
2355 return ret; 2361 return ret;
2356 2362
2357 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2358 1)) {
2359 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2360 mnt_drop_write_file(file);
2361 return -EINVAL;
2362 }
2363
2364 mutex_lock(&root->fs_info->volume_mutex);
2365 vol_args = memdup_user(arg, sizeof(*vol_args)); 2363 vol_args = memdup_user(arg, sizeof(*vol_args));
2366 if (IS_ERR(vol_args)) { 2364 if (IS_ERR(vol_args)) {
2367 ret = PTR_ERR(vol_args); 2365 ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2369 } 2367 }
2370 2368
2371 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2369 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2372 ret = btrfs_rm_device(root, vol_args->name);
2373 2370
2374 kfree(vol_args); 2371 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2375out: 2372 1)) {
2373 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
2374 goto out;
2375 }
2376
2377 mutex_lock(&root->fs_info->volume_mutex);
2378 ret = btrfs_rm_device(root, vol_args->name);
2376 mutex_unlock(&root->fs_info->volume_mutex); 2379 mutex_unlock(&root->fs_info->volume_mutex);
2377 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2380 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2381
2382out:
2383 kfree(vol_args);
2378 mnt_drop_write_file(file); 2384 mnt_drop_write_file(file);
2379 return ret; 2385 return ret;
2380} 2386}
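
Aside: the reordered ioctl above still serializes device operations through a single atomic flag: atomic_xchg() either wins it or fails fast, and atomic_set() releases it. A minimal C11 sketch of that test-and-set exclusion (illustrative names, not the btrfs API):

        #include <stdatomic.h>
        #include <stdbool.h>

        static atomic_int op_running;

        /* true if we won the flag; a loser returns at once, it never blocks */
        static bool try_start_exclusive_op(void)
        {
                return atomic_exchange(&op_running, 1) == 0;
        }

        static void end_exclusive_op(void)
        {
                atomic_store(&op_running, 0);
        }
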
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2480 int ret; 2486 int ret;
2481 u64 len = olen; 2487 u64 len = olen;
2482 u64 bs = root->fs_info->sb->s_blocksize; 2488 u64 bs = root->fs_info->sb->s_blocksize;
2489 int same_inode = 0;
2483 2490
2484 /* 2491 /*
2485 * TODO: 2492 * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2516 2523
2517 ret = -EINVAL; 2524 ret = -EINVAL;
2518 if (src == inode) 2525 if (src == inode)
2519 goto out_fput; 2526 same_inode = 1;
2520 2527
2521 /* the src must be open for reading */ 2528 /* the src must be open for reading */
2522 if (!(src_file.file->f_mode & FMODE_READ)) 2529 if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2547 } 2554 }
2548 path->reada = 2; 2555 path->reada = 2;
2549 2556
2550 if (inode < src) { 2557 if (!same_inode) {
2551 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 2558 if (inode < src) {
2552 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 2559 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
2560 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
2561 } else {
2562 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
2563 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2564 }
2553 } else { 2565 } else {
2554 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 2566 mutex_lock(&src->i_mutex);
2555 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2556 } 2567 }
2557 2568
2558 /* determine range to clone */ 2569 /* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2570 !IS_ALIGNED(destoff, bs)) 2581 !IS_ALIGNED(destoff, bs))
2571 goto out_unlock; 2582 goto out_unlock;
2572 2583
2584 /* verify if ranges are overlapped within the same file */
2585 if (same_inode) {
2586 if (destoff + len > off && destoff < off + len)
2587 goto out_unlock;
2588 }
2589
2573 if (destoff > inode->i_size) { 2590 if (destoff > inode->i_size) {
2574 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 2591 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2575 if (ret) 2592 if (ret)
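
Aside: the same-inode check above is the standard half-open interval test: [off, off + len) and [destoff, destoff + len) overlap exactly when each range starts before the other one ends. A self-contained sketch:

        #include <stdbool.h>
        #include <stdint.h>

        static bool ranges_overlap(uint64_t a, uint64_t b, uint64_t len)
        {
                return a < b + len && b < a + len;
        }
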
@@ -2846,7 +2863,8 @@ out:
2846 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 2863 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2847out_unlock: 2864out_unlock:
2848 mutex_unlock(&src->i_mutex); 2865 mutex_unlock(&src->i_mutex);
2849 mutex_unlock(&inode->i_mutex); 2866 if (!same_inode)
2867 mutex_unlock(&inode->i_mutex);
2850 vfree(buf); 2868 vfree(buf);
2851 btrfs_free_path(path); 2869 btrfs_free_path(path);
2852out_fput: 2870out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2951 goto out; 2969 goto out;
2952 } 2970 }
2953 2971
2954 if (btrfs_root_refs(&new_root->root_item) == 0) {
2955 ret = -ENOENT;
2956 goto out;
2957 }
2958
2959 path = btrfs_alloc_path(); 2972 path = btrfs_alloc_path();
2960 if (!path) { 2973 if (!path) {
2961 ret = -ENOMEM; 2974 ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3719 break; 3732 break;
3720 } 3733 }
3721 3734
3722 if (copy_to_user(arg, sa, sizeof(*sa)))
3723 ret = -EFAULT;
3724
3725 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 3735 err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
3726 if (err && !ret) 3736 if (err && !ret)
3727 ret = err; 3737 ret = err;
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3937 return ret; 3947 return ret;
3938} 3948}
3939 3949
3950static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
3951{
3952 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3953
3954 if (!capable(CAP_SYS_ADMIN))
3955 return -EPERM;
3956
3957 return btrfs_qgroup_wait_for_completion(root->fs_info);
3958}
3959
3940static long btrfs_ioctl_set_received_subvol(struct file *file, 3960static long btrfs_ioctl_set_received_subvol(struct file *file,
3941 void __user *arg) 3961 void __user *arg)
3942{ 3962{
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4179 return btrfs_ioctl_quota_rescan(file, argp); 4199 return btrfs_ioctl_quota_rescan(file, argp);
4180 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 4200 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
4181 return btrfs_ioctl_quota_rescan_status(file, argp); 4201 return btrfs_ioctl_quota_rescan_status(file, argp);
4202 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
4203 return btrfs_ioctl_quota_rescan_wait(file, argp);
4182 case BTRFS_IOC_DEV_REPLACE: 4204 case BTRFS_IOC_DEV_REPLACE:
4183 return btrfs_ioctl_dev_replace(root, argp); 4205 return btrfs_ioctl_dev_replace(root, argp);
4184 case BTRFS_IOC_GET_FSLABEL: 4206 case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
31 31
32struct workspace { 32struct workspace {
33 void *mem; 33 void *mem;
34 void *buf; /* where compressed data goes */ 34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where decompressed data goes */ 35 void *cbuf; /* where compressed data goes */
36 struct list_head list; 36 struct list_head list;
37}; 37};
38 38
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
24#include "transaction.h" 24#include "transaction.h"
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h"
27 28
28static struct kmem_cache *btrfs_ordered_extent_cache; 29static struct kmem_cache *btrfs_ordered_extent_cache;
29 30
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 u64 start, u64 len, u64 disk_len, 185 u64 start, u64 len, u64 disk_len,
185 int type, int dio, int compress_type) 186 int type, int dio, int compress_type)
186{ 187{
188 struct btrfs_root *root = BTRFS_I(inode)->root;
187 struct btrfs_ordered_inode_tree *tree; 189 struct btrfs_ordered_inode_tree *tree;
188 struct rb_node *node; 190 struct rb_node *node;
189 struct btrfs_ordered_extent *entry; 191 struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
227 ordered_data_tree_panic(inode, -EEXIST, file_offset); 229 ordered_data_tree_panic(inode, -EEXIST, file_offset);
228 spin_unlock_irq(&tree->lock); 230 spin_unlock_irq(&tree->lock);
229 231
230 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 232 spin_lock(&root->ordered_extent_lock);
231 list_add_tail(&entry->root_extent_list, 233 list_add_tail(&entry->root_extent_list,
232 &BTRFS_I(inode)->root->fs_info->ordered_extents); 234 &root->ordered_extents);
233 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 235 root->nr_ordered_extents++;
236 if (root->nr_ordered_extents == 1) {
237 spin_lock(&root->fs_info->ordered_root_lock);
238 BUG_ON(!list_empty(&root->ordered_root));
239 list_add_tail(&root->ordered_root,
240 &root->fs_info->ordered_roots);
241 spin_unlock(&root->fs_info->ordered_root_lock);
242 }
243 spin_unlock(&root->ordered_extent_lock);
234 244
235 return 0; 245 return 0;
236} 246}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
516 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 526 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517 spin_unlock_irq(&tree->lock); 527 spin_unlock_irq(&tree->lock);
518 528
519 spin_lock(&root->fs_info->ordered_extent_lock); 529 spin_lock(&root->ordered_extent_lock);
520 list_del_init(&entry->root_extent_list); 530 list_del_init(&entry->root_extent_list);
531 root->nr_ordered_extents--;
521 532
522 trace_btrfs_ordered_extent_remove(inode, entry); 533 trace_btrfs_ordered_extent_remove(inode, entry);
523 534
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
530 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 541 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
531 list_del_init(&BTRFS_I(inode)->ordered_operations); 542 list_del_init(&BTRFS_I(inode)->ordered_operations);
532 } 543 }
533 spin_unlock(&root->fs_info->ordered_extent_lock); 544
545 if (!root->nr_ordered_extents) {
546 spin_lock(&root->fs_info->ordered_root_lock);
547 BUG_ON(list_empty(&root->ordered_root));
548 list_del_init(&root->ordered_root);
549 spin_unlock(&root->fs_info->ordered_root_lock);
550 }
551 spin_unlock(&root->ordered_extent_lock);
534 wake_up(&entry->wait); 552 wake_up(&entry->wait);
535} 553}
536 554
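
Aside: the two ordered-data hunks above keep a simple invariant: a root sits on the fs-wide ordered_roots list exactly while nr_ordered_extents is nonzero, joining on the first add and leaving on the last remove. A hedged sketch of that membership rule (locking omitted, names illustrative):

        #include <stdbool.h>

        struct root_state {
                int  nr_ordered;        /* per-root ordered extent count */
                bool on_fs_list;        /* membership in the fs-wide list */
        };

        static void add_extent(struct root_state *r)
        {
                if (++r->nr_ordered == 1)
                        r->on_fs_list = true;   /* list_add_tail in the kernel */
        }

        static void remove_extent(struct root_state *r)
        {
                if (--r->nr_ordered == 0)
                        r->on_fs_list = false;  /* list_del_init in the kernel */
        }
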
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
550void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 568void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
551{ 569{
552 struct list_head splice, works; 570 struct list_head splice, works;
553 struct list_head *cur;
554 struct btrfs_ordered_extent *ordered, *next; 571 struct btrfs_ordered_extent *ordered, *next;
555 struct inode *inode; 572 struct inode *inode;
556 573
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
558 INIT_LIST_HEAD(&works); 575 INIT_LIST_HEAD(&works);
559 576
560 mutex_lock(&root->fs_info->ordered_operations_mutex); 577 mutex_lock(&root->fs_info->ordered_operations_mutex);
561 spin_lock(&root->fs_info->ordered_extent_lock); 578 spin_lock(&root->ordered_extent_lock);
562 list_splice_init(&root->fs_info->ordered_extents, &splice); 579 list_splice_init(&root->ordered_extents, &splice);
563 while (!list_empty(&splice)) { 580 while (!list_empty(&splice)) {
564 cur = splice.next; 581 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
565 ordered = list_entry(cur, struct btrfs_ordered_extent, 582 root_extent_list);
566 root_extent_list); 583 list_move_tail(&ordered->root_extent_list,
567 list_del_init(&ordered->root_extent_list); 584 &root->ordered_extents);
568 atomic_inc(&ordered->refs);
569
570 /* 585 /*
571 * the inode may be getting freed (in sys_unlink path). 586 * the inode may be getting freed (in sys_unlink path).
572 */ 587 */
573 inode = igrab(ordered->inode); 588 inode = igrab(ordered->inode);
589 if (!inode) {
590 cond_resched_lock(&root->ordered_extent_lock);
591 continue;
592 }
574 593
575 spin_unlock(&root->fs_info->ordered_extent_lock); 594 atomic_inc(&ordered->refs);
595 spin_unlock(&root->ordered_extent_lock);
576 596
577 if (inode) { 597 ordered->flush_work.func = btrfs_run_ordered_extent_work;
578 ordered->flush_work.func = btrfs_run_ordered_extent_work; 598 list_add_tail(&ordered->work_list, &works);
579 list_add_tail(&ordered->work_list, &works); 599 btrfs_queue_worker(&root->fs_info->flush_workers,
580 btrfs_queue_worker(&root->fs_info->flush_workers, 600 &ordered->flush_work);
581 &ordered->flush_work);
582 } else {
583 btrfs_put_ordered_extent(ordered);
584 }
585 601
586 cond_resched(); 602 cond_resched();
587 spin_lock(&root->fs_info->ordered_extent_lock); 603 spin_lock(&root->ordered_extent_lock);
588 } 604 }
589 spin_unlock(&root->fs_info->ordered_extent_lock); 605 spin_unlock(&root->ordered_extent_lock);
590 606
591 list_for_each_entry_safe(ordered, next, &works, work_list) { 607 list_for_each_entry_safe(ordered, next, &works, work_list) {
592 list_del_init(&ordered->work_list); 608 list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
604 mutex_unlock(&root->fs_info->ordered_operations_mutex); 620 mutex_unlock(&root->fs_info->ordered_operations_mutex);
605} 621}
606 622
623void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
624 int delay_iput)
625{
626 struct btrfs_root *root;
627 struct list_head splice;
628
629 INIT_LIST_HEAD(&splice);
630
631 spin_lock(&fs_info->ordered_root_lock);
632 list_splice_init(&fs_info->ordered_roots, &splice);
633 while (!list_empty(&splice)) {
634 root = list_first_entry(&splice, struct btrfs_root,
635 ordered_root);
636 root = btrfs_grab_fs_root(root);
637 BUG_ON(!root);
638 list_move_tail(&root->ordered_root,
639 &fs_info->ordered_roots);
640 spin_unlock(&fs_info->ordered_root_lock);
641
642 btrfs_wait_ordered_extents(root, delay_iput);
643 btrfs_put_fs_root(root);
644
645 spin_lock(&fs_info->ordered_root_lock);
646 }
647 spin_unlock(&fs_info->ordered_root_lock);
648}
649
607/* 650/*
608 * this is used during transaction commit to write all the inodes 651 * this is used during transaction commit to write all the inodes
609 * added to the ordered operation list. These files must be fully on 652 * added to the ordered operation list. These files must be fully on
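
Aside: btrfs_wait_all_ordered_extents above pins each root with btrfs_grab_fs_root() before dropping the list lock, so the root cannot be freed while it is being processed. A minimal C11 refcount sketch of that pin/unpin idea (illustrative, not the kernel helpers):

        #include <stdatomic.h>

        struct obj {
                atomic_int refs;
        };

        static struct obj *grab(struct obj *o)
        {
                atomic_fetch_add(&o->refs, 1); /* pin before unlocking the list */
                return o;
        }

        static void put(struct obj *o)
        {
                if (atomic_fetch_sub(&o->refs, 1) == 1) {
                        /* last reference dropped: safe to free the object */
                }
        }
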
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
629 INIT_LIST_HEAD(&works); 672 INIT_LIST_HEAD(&works);
630 673
631 mutex_lock(&root->fs_info->ordered_operations_mutex); 674 mutex_lock(&root->fs_info->ordered_operations_mutex);
632 spin_lock(&root->fs_info->ordered_extent_lock); 675 spin_lock(&root->fs_info->ordered_root_lock);
633 list_splice_init(&cur_trans->ordered_operations, &splice); 676 list_splice_init(&cur_trans->ordered_operations, &splice);
634 while (!list_empty(&splice)) { 677 while (!list_empty(&splice)) {
635 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 678 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
648 if (!wait) 691 if (!wait)
649 list_add_tail(&BTRFS_I(inode)->ordered_operations, 692 list_add_tail(&BTRFS_I(inode)->ordered_operations,
650 &cur_trans->ordered_operations); 693 &cur_trans->ordered_operations);
651 spin_unlock(&root->fs_info->ordered_extent_lock); 694 spin_unlock(&root->fs_info->ordered_root_lock);
652 695
653 work = btrfs_alloc_delalloc_work(inode, wait, 1); 696 work = btrfs_alloc_delalloc_work(inode, wait, 1);
654 if (!work) { 697 if (!work) {
655 spin_lock(&root->fs_info->ordered_extent_lock); 698 spin_lock(&root->fs_info->ordered_root_lock);
656 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 699 if (list_empty(&BTRFS_I(inode)->ordered_operations))
657 list_add_tail(&btrfs_inode->ordered_operations, 700 list_add_tail(&btrfs_inode->ordered_operations,
658 &splice); 701 &splice);
659 list_splice_tail(&splice, 702 list_splice_tail(&splice,
660 &cur_trans->ordered_operations); 703 &cur_trans->ordered_operations);
661 spin_unlock(&root->fs_info->ordered_extent_lock); 704 spin_unlock(&root->fs_info->ordered_root_lock);
662 ret = -ENOMEM; 705 ret = -ENOMEM;
663 goto out; 706 goto out;
664 } 707 }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
667 &work->work); 710 &work->work);
668 711
669 cond_resched(); 712 cond_resched();
670 spin_lock(&root->fs_info->ordered_extent_lock); 713 spin_lock(&root->fs_info->ordered_root_lock);
671 } 714 }
672 spin_unlock(&root->fs_info->ordered_extent_lock); 715 spin_unlock(&root->fs_info->ordered_root_lock);
673out: 716out:
674 list_for_each_entry_safe(work, next, &works, list) { 717 list_for_each_entry_safe(work, next, &works, list) {
675 list_del_init(&work->list); 718 list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
989 u32 *sum, int len) 1032 u32 *sum, int len)
990{ 1033{
991 struct btrfs_ordered_sum *ordered_sum; 1034 struct btrfs_ordered_sum *ordered_sum;
992 struct btrfs_sector_sum *sector_sums;
993 struct btrfs_ordered_extent *ordered; 1035 struct btrfs_ordered_extent *ordered;
994 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 1036 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
995 unsigned long num_sectors; 1037 unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1007 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { 1049 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1008 i = (disk_bytenr - ordered_sum->bytenr) >> 1050 i = (disk_bytenr - ordered_sum->bytenr) >>
1009 inode->i_sb->s_blocksize_bits; 1051 inode->i_sb->s_blocksize_bits;
1010 sector_sums = ordered_sum->sums + i;
1011 num_sectors = ordered_sum->len >> 1052 num_sectors = ordered_sum->len >>
1012 inode->i_sb->s_blocksize_bits; 1053 inode->i_sb->s_blocksize_bits;
1013 for (; i < num_sectors; i++) { 1054 num_sectors = min_t(int, len - index, num_sectors - i);
1014 if (sector_sums[i].bytenr == disk_bytenr) { 1055 memcpy(sum + index, ordered_sum->sums + i,
1015 sum[index] = sector_sums[i].sum; 1056 num_sectors);
1016 index++; 1057
1017 if (index == len) 1058 index += (int)num_sectors;
1018 goto out; 1059 if (index == len)
1019 disk_bytenr += sectorsize; 1060 goto out;
1020 } 1061 disk_bytenr += num_sectors * sectorsize;
1021 }
1022 } 1062 }
1023 } 1063 }
1024out: 1064out:
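
Aside: with the per-sector {bytenr, sum} pairs gone, the lookup above can copy a whole run of contiguous u32 checksums at once instead of matching one sector per loop iteration. A hedged sketch of that batched copy:

        #include <stdint.h>
        #include <string.h>

        /* copy min(dst_left, src_left) checksums in one step; returns the count */
        static int copy_csum_run(uint32_t *dst, int dst_left,
                                 const uint32_t *src, int src_left)
        {
                int n = dst_left < src_left ? dst_left : src_left;

                memcpy(dst, src, n * sizeof(*dst));
                return n;
        }
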
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1055 if (last_mod < root->fs_info->last_trans_committed) 1095 if (last_mod < root->fs_info->last_trans_committed)
1056 return; 1096 return;
1057 1097
1058 spin_lock(&root->fs_info->ordered_extent_lock); 1098 spin_lock(&root->fs_info->ordered_root_lock);
1059 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1099 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1060 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1100 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1061 &cur_trans->ordered_operations); 1101 &cur_trans->ordered_operations);
1062 } 1102 }
1063 spin_unlock(&root->fs_info->ordered_extent_lock); 1103 spin_unlock(&root->fs_info->ordered_root_lock);
1064} 1104}
1065 1105
1066int __init ordered_data_init(void) 1106int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
28 28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum { 29struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */ 30 /* bytenr is the start of this extent on disk */
43 u64 bytenr; 31 u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
45 /* 33 /*
46 * this is the length in bytes covered by the sums array below. 34 * this is the length in bytes covered by the sums array below.
47 */ 35 */
48 unsigned long len; 36 int len;
49 struct list_head list; 37 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */ 38 /* last field is a variable length array of csums */
51 struct btrfs_sector_sum sums[]; 39 u32 sums[];
52}; 40};
53 41
54/* 42/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
149static inline int btrfs_ordered_sum_size(struct btrfs_root *root, 137static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
150 unsigned long bytes) 138 unsigned long bytes)
151{ 139{
152 unsigned long num_sectors = (bytes + root->sectorsize - 1) / 140 int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
153 root->sectorsize; 141 return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
154 num_sectors++;
155 return sizeof(struct btrfs_ordered_sum) +
156 num_sectors * sizeof(struct btrfs_sector_sum);
157} 142}
158 143
159static inline void 144static inline void
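
Aside: a worked example of the new btrfs_ordered_sum_size() formula: an 8 KiB range with a 4 KiB sector size needs two u32 checksums, where the old layout stored a {bytenr, sum} pair per sector plus one spare slot. A self-contained check (the struct header size is left out for brevity):

        #include <stdio.h>

        #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

        int main(void)
        {
                unsigned long bytes = 8192, sectorsize = 4096;
                int num_sectors = (int)DIV_ROUND_UP(bytes, sectorsize);

                printf("checksums: %d, payload: %zu bytes\n",
                       num_sectors, num_sectors * sizeof(unsigned int));
                return 0;
        }
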
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
204 struct btrfs_root *root, 189 struct btrfs_root *root,
205 struct inode *inode); 190 struct inode *inode);
206void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 191void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
192void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
193 int delay_iput);
207void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 194void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
208void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 195void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 196void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 98 struct btrfs_qgroup *member;
99}; 99};
100 100
101struct qgroup_rescan { 101static int
102 struct btrfs_work work; 102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 struct btrfs_fs_info *fs_info; 103 int init_flags);
104}; 104static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
105
106static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
107 struct qgroup_rescan *qscan);
108 105
109/* must be called with qgroup_ioctl_lock held */ 106/* must be called with qgroup_ioctl_lock held */
110static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 107static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
255 int slot; 252 int slot;
256 int ret = 0; 253 int ret = 0;
257 u64 flags = 0; 254 u64 flags = 0;
255 u64 rescan_progress = 0;
258 256
259 if (!fs_info->quota_enabled) 257 if (!fs_info->quota_enabled)
260 return 0; 258 return 0;
261 259
260 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
261 if (!fs_info->qgroup_ulist) {
262 ret = -ENOMEM;
263 goto out;
264 }
265
262 path = btrfs_alloc_path(); 266 path = btrfs_alloc_path();
263 if (!path) { 267 if (!path) {
264 ret = -ENOMEM; 268 ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
306 } 310 }
307 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 311 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
308 ptr); 312 ptr);
309 fs_info->qgroup_rescan_progress.objectid = 313 rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
310 btrfs_qgroup_status_rescan(l, ptr);
311 if (fs_info->qgroup_flags &
312 BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
313 struct qgroup_rescan *qscan =
314 kmalloc(sizeof(*qscan), GFP_NOFS);
315 if (!qscan) {
316 ret = -ENOMEM;
317 goto out;
318 }
319 fs_info->qgroup_rescan_progress.type = 0;
320 fs_info->qgroup_rescan_progress.offset = 0;
321 qgroup_rescan_start(fs_info, qscan);
322 }
323 goto next1; 314 goto next1;
324 } 315 }
325 316
@@ -421,9 +412,18 @@ out:
421 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { 412 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
422 fs_info->quota_enabled = 0; 413 fs_info->quota_enabled = 0;
423 fs_info->pending_quota_state = 0; 414 fs_info->pending_quota_state = 0;
415 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
416 ret >= 0) {
417 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
424 } 418 }
425 btrfs_free_path(path); 419 btrfs_free_path(path);
426 420
421 if (ret < 0) {
422 ulist_free(fs_info->qgroup_ulist);
423 fs_info->qgroup_ulist = NULL;
424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
425 }
426
427 return ret < 0 ? ret : 0; 427 return ret < 0 ? ret : 0;
428} 428}
429 429
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
460 } 460 }
461 kfree(qgroup); 461 kfree(qgroup);
462 } 462 }
463 ulist_free(fs_info->qgroup_ulist);
463} 464}
464 465
465static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, 466static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
819 goto out; 820 goto out;
820 } 821 }
821 822
823 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
824 if (!fs_info->qgroup_ulist) {
825 ret = -ENOMEM;
826 goto out;
827 }
828
822 /* 829 /*
823 * initially create the quota tree 830 * initially create the quota tree
824 */ 831 */
@@ -916,6 +923,10 @@ out_free_root:
916 kfree(quota_root); 923 kfree(quota_root);
917 } 924 }
918out: 925out:
926 if (ret) {
927 ulist_free(fs_info->qgroup_ulist);
928 fs_info->qgroup_ulist = NULL;
929 }
919 mutex_unlock(&fs_info->qgroup_ioctl_lock); 930 mutex_unlock(&fs_info->qgroup_ioctl_lock);
920 return ret; 931 return ret;
921} 932}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1355 u64 ref_root; 1366 u64 ref_root;
1356 struct btrfs_qgroup *qgroup; 1367 struct btrfs_qgroup *qgroup;
1357 struct ulist *roots = NULL; 1368 struct ulist *roots = NULL;
1358 struct ulist *tmp = NULL;
1359 u64 seq; 1369 u64 seq;
1360 int ret = 0; 1370 int ret = 0;
1361 int sgn; 1371 int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1428 if (ret < 0) 1438 if (ret < 0)
1429 return ret; 1439 return ret;
1430 1440
1431 mutex_lock(&fs_info->qgroup_rescan_lock);
1432 spin_lock(&fs_info->qgroup_lock); 1441 spin_lock(&fs_info->qgroup_lock);
1433 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1434 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
1435 ret = 0;
1436 goto unlock;
1437 }
1438 }
1439 1442
1440 quota_root = fs_info->quota_root; 1443 quota_root = fs_info->quota_root;
1441 if (!quota_root) 1444 if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1448 /* 1451 /*
1449 * step 1: for each old ref, visit all nodes once and inc refcnt 1452 * step 1: for each old ref, visit all nodes once and inc refcnt
1450 */ 1453 */
1451 tmp = ulist_alloc(GFP_ATOMIC); 1454 ulist_reinit(fs_info->qgroup_ulist);
1452 if (!tmp) {
1453 ret = -ENOMEM;
1454 goto unlock;
1455 }
1456 seq = fs_info->qgroup_seq; 1455 seq = fs_info->qgroup_seq;
1457 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1456 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1458 1457
1459 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 1458 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
1459 seq);
1460 if (ret) 1460 if (ret)
1461 goto unlock; 1461 goto unlock;
1462 1462
1463 /* 1463 /*
1464 * step 2: walk from the new root 1464 * step 2: walk from the new root
1465 */ 1465 */
1466 ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, 1466 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
1467 node->num_bytes, qgroup); 1467 seq, sgn, node->num_bytes, qgroup);
1468 if (ret) 1468 if (ret)
1469 goto unlock; 1469 goto unlock;
1470 1470
1471 /* 1471 /*
1472 * step 3: walk again from old refs 1472 * step 3: walk again from old refs
1473 */ 1473 */
1474 ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, 1474 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
1475 node->num_bytes); 1475 seq, sgn, node->num_bytes);
1476 if (ret) 1476 if (ret)
1477 goto unlock; 1477 goto unlock;
1478 1478
1479unlock: 1479unlock:
1480 spin_unlock(&fs_info->qgroup_lock); 1480 spin_unlock(&fs_info->qgroup_lock);
1481 mutex_unlock(&fs_info->qgroup_rescan_lock);
1482 ulist_free(roots); 1481 ulist_free(roots);
1483 ulist_free(tmp);
1484 1482
1485 return ret; 1483 return ret;
1486} 1484}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1527 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1525 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1528 1526
1529 if (!ret && start_rescan_worker) { 1527 if (!ret && start_rescan_worker) {
1530 ret = btrfs_qgroup_rescan(fs_info); 1528 ret = qgroup_rescan_init(fs_info, 0, 1);
1531 if (ret) 1529 if (!ret) {
1532 pr_err("btrfs: start rescan quota failed: %d\n", ret); 1530 qgroup_rescan_zero_tracking(fs_info);
1531 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
1532 &fs_info->qgroup_rescan_work);
1533 }
1533 ret = 0; 1534 ret = 0;
1534 } 1535 }
1535 1536
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1720 struct btrfs_fs_info *fs_info = root->fs_info; 1721 struct btrfs_fs_info *fs_info = root->fs_info;
1721 u64 ref_root = root->root_key.objectid; 1722 u64 ref_root = root->root_key.objectid;
1722 int ret = 0; 1723 int ret = 0;
1723 struct ulist *ulist = NULL;
1724 struct ulist_node *unode; 1724 struct ulist_node *unode;
1725 struct ulist_iterator uiter; 1725 struct ulist_iterator uiter;
1726 1726
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1743 * in a first step, we check all affected qgroups if any limits would 1743 * in a first step, we check all affected qgroups if any limits would
1744 * be exceeded 1744 * be exceeded
1745 */ 1745 */
1746 ulist = ulist_alloc(GFP_ATOMIC); 1746 ulist_reinit(fs_info->qgroup_ulist);
1747 if (!ulist) { 1747 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1748 ret = -ENOMEM;
1749 goto out;
1750 }
1751 ret = ulist_add(ulist, qgroup->qgroupid,
1752 (uintptr_t)qgroup, GFP_ATOMIC); 1748 (uintptr_t)qgroup, GFP_ATOMIC);
1753 if (ret < 0) 1749 if (ret < 0)
1754 goto out; 1750 goto out;
1755 ULIST_ITER_INIT(&uiter); 1751 ULIST_ITER_INIT(&uiter);
1756 while ((unode = ulist_next(ulist, &uiter))) { 1752 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1757 struct btrfs_qgroup *qg; 1753 struct btrfs_qgroup *qg;
1758 struct btrfs_qgroup_list *glist; 1754 struct btrfs_qgroup_list *glist;
1759 1755
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1774 } 1770 }
1775 1771
1776 list_for_each_entry(glist, &qg->groups, next_group) { 1772 list_for_each_entry(glist, &qg->groups, next_group) {
1777 ret = ulist_add(ulist, glist->group->qgroupid, 1773 ret = ulist_add(fs_info->qgroup_ulist,
1774 glist->group->qgroupid,
1778 (uintptr_t)glist->group, GFP_ATOMIC); 1775 (uintptr_t)glist->group, GFP_ATOMIC);
1779 if (ret < 0) 1776 if (ret < 0)
1780 goto out; 1777 goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1785 * no limits exceeded, now record the reservation into all qgroups 1782 * no limits exceeded, now record the reservation into all qgroups
1786 */ 1783 */
1787 ULIST_ITER_INIT(&uiter); 1784 ULIST_ITER_INIT(&uiter);
1788 while ((unode = ulist_next(ulist, &uiter))) { 1785 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1789 struct btrfs_qgroup *qg; 1786 struct btrfs_qgroup *qg;
1790 1787
1791 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1788 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1795 1792
1796out: 1793out:
1797 spin_unlock(&fs_info->qgroup_lock); 1794 spin_unlock(&fs_info->qgroup_lock);
1798 ulist_free(ulist);
1799
1800 return ret; 1795 return ret;
1801} 1796}
1802 1797
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1805 struct btrfs_root *quota_root; 1800 struct btrfs_root *quota_root;
1806 struct btrfs_qgroup *qgroup; 1801 struct btrfs_qgroup *qgroup;
1807 struct btrfs_fs_info *fs_info = root->fs_info; 1802 struct btrfs_fs_info *fs_info = root->fs_info;
1808 struct ulist *ulist = NULL;
1809 struct ulist_node *unode; 1803 struct ulist_node *unode;
1810 struct ulist_iterator uiter; 1804 struct ulist_iterator uiter;
1811 u64 ref_root = root->root_key.objectid; 1805 u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1827 if (!qgroup) 1821 if (!qgroup)
1828 goto out; 1822 goto out;
1829 1823
1830 ulist = ulist_alloc(GFP_ATOMIC); 1824 ulist_reinit(fs_info->qgroup_ulist);
1831 if (!ulist) { 1825 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1832 btrfs_std_error(fs_info, -ENOMEM);
1833 goto out;
1834 }
1835 ret = ulist_add(ulist, qgroup->qgroupid,
1836 (uintptr_t)qgroup, GFP_ATOMIC); 1826 (uintptr_t)qgroup, GFP_ATOMIC);
1837 if (ret < 0) 1827 if (ret < 0)
1838 goto out; 1828 goto out;
1839 ULIST_ITER_INIT(&uiter); 1829 ULIST_ITER_INIT(&uiter);
1840 while ((unode = ulist_next(ulist, &uiter))) { 1830 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1841 struct btrfs_qgroup *qg; 1831 struct btrfs_qgroup *qg;
1842 struct btrfs_qgroup_list *glist; 1832 struct btrfs_qgroup_list *glist;
1843 1833
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1846 qg->reserved -= num_bytes; 1836 qg->reserved -= num_bytes;
1847 1837
1848 list_for_each_entry(glist, &qg->groups, next_group) { 1838 list_for_each_entry(glist, &qg->groups, next_group) {
1849 ret = ulist_add(ulist, glist->group->qgroupid, 1839 ret = ulist_add(fs_info->qgroup_ulist,
1840 glist->group->qgroupid,
1850 (uintptr_t)glist->group, GFP_ATOMIC); 1841 (uintptr_t)glist->group, GFP_ATOMIC);
1851 if (ret < 0) 1842 if (ret < 0)
1852 goto out; 1843 goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1855 1846
1856out: 1847out:
1857 spin_unlock(&fs_info->qgroup_lock); 1848 spin_unlock(&fs_info->qgroup_lock);
1858 ulist_free(ulist);
1859} 1849}
1860 1850
1861void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 1851void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1874 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 1864 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
1875 */ 1865 */
1876static int 1866static int
1877qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, 1867qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1878 struct btrfs_trans_handle *trans, struct ulist *tmp, 1868 struct btrfs_trans_handle *trans, struct ulist *tmp,
1879 struct extent_buffer *scratch_leaf) 1869 struct extent_buffer *scratch_leaf)
1880{ 1870{
1881 struct btrfs_key found; 1871 struct btrfs_key found;
1882 struct btrfs_fs_info *fs_info = qscan->fs_info;
1883 struct ulist *roots = NULL; 1872 struct ulist *roots = NULL;
1884 struct ulist_node *unode; 1873 struct ulist_node *unode;
1885 struct ulist_iterator uiter; 1874 struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
2007 1996
2008static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 1997static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2009{ 1998{
2010 struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, 1999 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2011 work); 2000 qgroup_rescan_work);
2012 struct btrfs_path *path; 2001 struct btrfs_path *path;
2013 struct btrfs_trans_handle *trans = NULL; 2002 struct btrfs_trans_handle *trans = NULL;
2014 struct btrfs_fs_info *fs_info = qscan->fs_info;
2015 struct ulist *tmp = NULL; 2003 struct ulist *tmp = NULL;
2016 struct extent_buffer *scratch_leaf = NULL; 2004 struct extent_buffer *scratch_leaf = NULL;
2017 int err = -ENOMEM; 2005 int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2036 if (!fs_info->quota_enabled) { 2024 if (!fs_info->quota_enabled) {
2037 err = -EINTR; 2025 err = -EINTR;
2038 } else { 2026 } else {
2039 err = qgroup_rescan_leaf(qscan, path, trans, 2027 err = qgroup_rescan_leaf(fs_info, path, trans,
2040 tmp, scratch_leaf); 2028 tmp, scratch_leaf);
2041 } 2029 }
2042 if (err > 0) 2030 if (err > 0)
@@ -2049,7 +2037,6 @@ out:
2049 kfree(scratch_leaf); 2037 kfree(scratch_leaf);
2050 ulist_free(tmp); 2038 ulist_free(tmp);
2051 btrfs_free_path(path); 2039 btrfs_free_path(path);
2052 kfree(qscan);
2053 2040
2054 mutex_lock(&fs_info->qgroup_rescan_lock); 2041 mutex_lock(&fs_info->qgroup_rescan_lock);
2055 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2042 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
2068 } else { 2055 } else {
2069 pr_err("btrfs: qgroup scan failed with %d\n", err); 2056 pr_err("btrfs: qgroup scan failed with %d\n", err);
2070 } 2057 }
2071}
2072 2058
2073static void 2059 complete_all(&fs_info->qgroup_rescan_completion);
2074qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
2075{
2076 memset(&qscan->work, 0, sizeof(qscan->work));
2077 qscan->work.func = btrfs_qgroup_rescan_worker;
2078 qscan->fs_info = fs_info;
2079
2080 pr_info("btrfs: qgroup scan started\n");
2081 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
2082} 2060}
2083 2061
2084int 2062/*
2085btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 2063 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
2064 * memory required for the rescan context.
2065 */
2066static int
2067qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2068 int init_flags)
2086{ 2069{
2087 int ret = 0; 2070 int ret = 0;
2088 struct rb_node *n;
2089 struct btrfs_qgroup *qgroup;
2090 struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
2091 2071
2092 if (!qscan) 2072 if (!init_flags &&
2093 return -ENOMEM; 2073 (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2074 !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2075 ret = -EINVAL;
2076 goto err;
2077 }
2094 2078
2095 mutex_lock(&fs_info->qgroup_rescan_lock); 2079 mutex_lock(&fs_info->qgroup_rescan_lock);
2096 spin_lock(&fs_info->qgroup_lock); 2080 spin_lock(&fs_info->qgroup_lock);
2097 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2081
2098 ret = -EINPROGRESS; 2082 if (init_flags) {
2099 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 2083 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2100 ret = -EINVAL; 2084 ret = -EINPROGRESS;
2101 if (ret) { 2085 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2102 spin_unlock(&fs_info->qgroup_lock); 2086 ret = -EINVAL;
2103 mutex_unlock(&fs_info->qgroup_rescan_lock); 2087
2104 kfree(qscan); 2088 if (ret) {
2105 return ret; 2089 spin_unlock(&fs_info->qgroup_lock);
2090 mutex_unlock(&fs_info->qgroup_rescan_lock);
2091 goto err;
2092 }
2093
2094 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2106 } 2095 }
2107 2096
2108 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2109 memset(&fs_info->qgroup_rescan_progress, 0, 2097 memset(&fs_info->qgroup_rescan_progress, 0,
2110 sizeof(fs_info->qgroup_rescan_progress)); 2098 sizeof(fs_info->qgroup_rescan_progress));
2099 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2100
2101 spin_unlock(&fs_info->qgroup_lock);
2102 mutex_unlock(&fs_info->qgroup_rescan_lock);
2103
2104 init_completion(&fs_info->qgroup_rescan_completion);
2105
2106 memset(&fs_info->qgroup_rescan_work, 0,
2107 sizeof(fs_info->qgroup_rescan_work));
2108 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
2109
2110 if (ret) {
2111err:
2112 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
2113 return ret;
2114 }
2115
2116 return 0;
2117}
2118
2119static void
2120qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2121{
2122 struct rb_node *n;
2123 struct btrfs_qgroup *qgroup;
2111 2124
2125 spin_lock(&fs_info->qgroup_lock);
2112 /* clear all current qgroup tracking information */ 2126 /* clear all current qgroup tracking information */
2113 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 2127 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2114 qgroup = rb_entry(n, struct btrfs_qgroup, node); 2128 qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2118 qgroup->excl_cmpr = 0; 2132 qgroup->excl_cmpr = 0;
2119 } 2133 }
2120 spin_unlock(&fs_info->qgroup_lock); 2134 spin_unlock(&fs_info->qgroup_lock);
2121 mutex_unlock(&fs_info->qgroup_rescan_lock); 2135}
2136
2137int
2138btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2139{
2140 int ret = 0;
2141 struct btrfs_trans_handle *trans;
2122 2142
2123 qgroup_rescan_start(fs_info, qscan); 2143 ret = qgroup_rescan_init(fs_info, 0, 1);
2144 if (ret)
2145 return ret;
2146
2147 /*
2148 * We have set the rescan_progress to 0, which means no more
2149 * delayed refs will be accounted by btrfs_qgroup_account_ref.
 2150 * However, btrfs_qgroup_account_ref may already be past its call
2151 * to btrfs_find_all_roots, in which case it would still do the
2152 * accounting.
2153 * To solve this, we're committing the transaction, which will
2154 * ensure we run all delayed refs and only after that, we are
2155 * going to clear all tracking information for a clean start.
2156 */
2157
2158 trans = btrfs_join_transaction(fs_info->fs_root);
2159 if (IS_ERR(trans)) {
2160 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2161 return PTR_ERR(trans);
2162 }
2163 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2164 if (ret) {
2165 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2166 return ret;
2167 }
2168
2169 qgroup_rescan_zero_tracking(fs_info);
2170
2171 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2172 &fs_info->qgroup_rescan_work);
2124 2173
2125 return 0; 2174 return 0;
2126} 2175}
2176
2177int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2178{
2179 int running;
2180 int ret = 0;
2181
2182 mutex_lock(&fs_info->qgroup_rescan_lock);
2183 spin_lock(&fs_info->qgroup_lock);
2184 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2185 spin_unlock(&fs_info->qgroup_lock);
2186 mutex_unlock(&fs_info->qgroup_rescan_lock);
2187
2188 if (running)
2189 ret = wait_for_completion_interruptible(
2190 &fs_info->qgroup_rescan_completion);
2191
2192 return ret;
2193}
2194
2195/*
2196 * this is only called from open_ctree where we're still single threaded, thus
2197 * locking is omitted here.
2198 */
2199void
2200btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2201{
2202 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2203 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2204 &fs_info->qgroup_rescan_work);
2205}
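
Aside: btrfs_qgroup_wait_for_completion above is built on a one-shot completion: the rescan worker calls complete_all() and any number of waiters block until then. A hedged pthread analogue of that primitive (the kernel's struct completion, reimplemented for illustration; initialization omitted):

        #include <pthread.h>
        #include <stdbool.h>

        struct completion {
                pthread_mutex_t lock;
                pthread_cond_t  cond;
                bool            done;
        };

        static void complete_all(struct completion *c)
        {
                pthread_mutex_lock(&c->lock);
                c->done = true;
                pthread_cond_broadcast(&c->cond); /* wake every waiter */
                pthread_mutex_unlock(&c->lock);
        }

        static void wait_for_completion(struct completion *c)
        {
                pthread_mutex_lock(&c->lock);
                while (!c->done)
                        pthread_cond_wait(&c->cond, &c->lock);
                pthread_mutex_unlock(&c->lock);
        }
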
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1305 struct extent_buffer *eb; 1305 struct extent_buffer *eb;
1306 struct btrfs_root_item *root_item; 1306 struct btrfs_root_item *root_item;
1307 struct btrfs_key root_key; 1307 struct btrfs_key root_key;
1308 u64 last_snap = 0;
1308 int ret; 1309 int ret;
1309 1310
1310 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1311 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1320 BTRFS_TREE_RELOC_OBJECTID); 1321 BTRFS_TREE_RELOC_OBJECTID);
1321 BUG_ON(ret); 1322 BUG_ON(ret);
1322 1323
1324 last_snap = btrfs_root_last_snapshot(&root->root_item);
1323 btrfs_set_root_last_snapshot(&root->root_item, 1325 btrfs_set_root_last_snapshot(&root->root_item,
1324 trans->transid - 1); 1326 trans->transid - 1);
1325 } else { 1327 } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1345 memset(&root_item->drop_progress, 0, 1347 memset(&root_item->drop_progress, 0,
1346 sizeof(struct btrfs_disk_key)); 1348 sizeof(struct btrfs_disk_key));
1347 root_item->drop_level = 0; 1349 root_item->drop_level = 0;
1350 /*
 1351 * abuse rtransid; it is safe because it is impossible to
1352 * receive data into a relocation tree.
1353 */
1354 btrfs_set_root_rtransid(root_item, last_snap);
1355 btrfs_set_root_otransid(root_item, trans->transid);
1348 } 1356 }
1349 1357
1350 btrfs_tree_unlock(eb); 1358 btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1355 BUG_ON(ret); 1363 BUG_ON(ret);
1356 kfree(root_item); 1364 kfree(root_item);
1357 1365
1358 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 1366 reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1359 &root_key);
1360 BUG_ON(IS_ERR(reloc_root)); 1367 BUG_ON(IS_ERR(reloc_root));
1361 reloc_root->last_trans = trans->transid; 1368 reloc_root->last_trans = trans->transid;
1362 return reloc_root; 1369 return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
2273static noinline_for_stack 2280static noinline_for_stack
2274int merge_reloc_roots(struct reloc_control *rc) 2281int merge_reloc_roots(struct reloc_control *rc)
2275{ 2282{
2283 struct btrfs_trans_handle *trans;
2276 struct btrfs_root *root; 2284 struct btrfs_root *root;
2277 struct btrfs_root *reloc_root; 2285 struct btrfs_root *reloc_root;
2286 u64 last_snap;
2287 u64 otransid;
2288 u64 objectid;
2278 LIST_HEAD(reloc_roots); 2289 LIST_HEAD(reloc_roots);
2279 int found = 0; 2290 int found = 0;
2280 int ret = 0; 2291 int ret = 0;
@@ -2308,12 +2319,44 @@ again:
2308 } else { 2319 } else {
2309 list_del_init(&reloc_root->root_list); 2320 list_del_init(&reloc_root->root_list);
2310 } 2321 }
2322
2323 /*
 2324 * we kept the old last snapshot transid in rtransid when we
2325 * created the relocation tree.
2326 */
2327 last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328 otransid = btrfs_root_otransid(&reloc_root->root_item);
2329 objectid = reloc_root->root_key.offset;
2330
2311 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2331 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2312 if (ret < 0) { 2332 if (ret < 0) {
2313 if (list_empty(&reloc_root->root_list)) 2333 if (list_empty(&reloc_root->root_list))
2314 list_add_tail(&reloc_root->root_list, 2334 list_add_tail(&reloc_root->root_list,
2315 &reloc_roots); 2335 &reloc_roots);
2316 goto out; 2336 goto out;
2337 } else if (!ret) {
2338 /*
 2339 * recover the last snapshot transid so that
 2340 * space balance does not break NOCOW.
2341 */
2342 root = read_fs_root(rc->extent_root->fs_info,
2343 objectid);
2344 if (IS_ERR(root))
2345 continue;
2346
2347 if (btrfs_root_refs(&root->root_item) == 0)
2348 continue;
2349
2350 trans = btrfs_join_transaction(root);
2351 BUG_ON(IS_ERR(trans));
2352
 2353 /* Check if the fs/file tree was snapshotted or not. */
2354 if (btrfs_root_last_snapshot(&root->root_item) ==
2355 otransid - 1)
2356 btrfs_set_root_last_snapshot(&root->root_item,
2357 last_snap);
2358
2359 btrfs_end_transaction(trans, root);
2317 } 2360 }
2318 } 2361 }
2319 2362
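
The two relocation hunks above stash the pre-balance last_snapshot in the otherwise unused rtransid field when the reloc root is created, then restore it at merge time only if last_snapshot still equals otransid - 1, i.e. nobody snapshotted the tree in between. A standalone sketch of that stash/restore check; the struct and function names are invented for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct root_item {
        uint64_t last_snapshot; /* transid of the last snapshot */
        uint64_t rtransid;      /* unused for reloc trees: stash goes here */
        uint64_t otransid;      /* transid the reloc tree was created in */
};

/* On reloc-tree creation: remember the old value, then bump it. */
static void stash_last_snapshot(struct root_item *ri, uint64_t transid)
{
        ri->rtransid = ri->last_snapshot;
        ri->otransid = transid;
        ri->last_snapshot = transid - 1;
}

/* On merge: restore only if no snapshot was taken in between. */
static void restore_last_snapshot(struct root_item *ri)
{
        if (ri->last_snapshot == ri->otransid - 1)
                ri->last_snapshot = ri->rtransid;
}

int main(void)
{
        struct root_item ri = { .last_snapshot = 100 };

        stash_last_snapshot(&ri, 200);
        restore_last_snapshot(&ri);
        assert(ri.last_snapshot == 100);        /* NOCOW checks still pass */
        printf("restored last_snapshot=%llu\n",
               (unsigned long long)ri.last_snapshot);
        return 0;
}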
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
3266 struct btrfs_path *path; 3309 struct btrfs_path *path;
3267 struct btrfs_key key; 3310 struct btrfs_key key;
3268 int ret; 3311 int ret;
3312 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313 SKINNY_METADATA);
3269 3314
3270 if (tree_block_processed(bytenr, blocksize, rc)) 3315 if (tree_block_processed(bytenr, blocksize, rc))
3271 return 0; 3316 return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
3276 path = btrfs_alloc_path(); 3321 path = btrfs_alloc_path();
3277 if (!path) 3322 if (!path)
3278 return -ENOMEM; 3323 return -ENOMEM;
3279 3324again:
3280 key.objectid = bytenr; 3325 key.objectid = bytenr;
3281 key.type = BTRFS_EXTENT_ITEM_KEY; 3326 if (skinny) {
3282 key.offset = blocksize; 3327 key.type = BTRFS_METADATA_ITEM_KEY;
3328 key.offset = (u64)-1;
3329 } else {
3330 key.type = BTRFS_EXTENT_ITEM_KEY;
3331 key.offset = blocksize;
3332 }
3283 3333
3284 path->search_commit_root = 1; 3334 path->search_commit_root = 1;
3285 path->skip_locking = 1; 3335 path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
3287 if (ret < 0) 3337 if (ret < 0)
3288 goto out; 3338 goto out;
3289 3339
3290 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3340 if (ret > 0 && skinny) {
3291 if (ret > 0) { 3341 if (path->slots[0]) {
3292 if (key.objectid == bytenr && 3342 path->slots[0]--;
3293 key.type == BTRFS_METADATA_ITEM_KEY) 3343 btrfs_item_key_to_cpu(path->nodes[0], &key,
3294 ret = 0; 3344 path->slots[0]);
3345 if (key.objectid == bytenr &&
3346 (key.type == BTRFS_METADATA_ITEM_KEY ||
3347 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348 key.offset == blocksize)))
3349 ret = 0;
3350 }
3351
3352 if (ret) {
3353 skinny = false;
3354 btrfs_release_path(path);
3355 goto again;
3356 }
3295 } 3357 }
3296 BUG_ON(ret); 3358 BUG_ON(ret);
3297 3359
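
With the SKINNY_METADATA incompat flag, tree blocks are recorded as METADATA_ITEM keys (offset is the tree level) instead of EXTENT_ITEM keys (offset is the block size), so __add_tree_block now searches with offset -1, steps back one slot, and falls back to the fat key on a miss, since mixed filesystems can hold both forms. A standalone sketch of that lookup over a sorted key array standing in for the extent tree; the helper names are invented:

#include <stdint.h>
#include <stdio.h>

enum { EXTENT_ITEM = 168, METADATA_ITEM = 169 };        /* btrfs key types */

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_cmp(const struct key *a, const struct key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}

/* Returns the first slot whose key is >= *k, like btrfs_search_slot. */
static int search_slot(const struct key *tree, int n, const struct key *k)
{
        int i;

        for (i = 0; i < n; i++)
                if (key_cmp(&tree[i], k) >= 0)
                        return i;
        return n;
}

static int find_tree_block(const struct key *tree, int n,
                           uint64_t bytenr, uint64_t blocksize, int skinny)
{
        struct key k = { bytenr, skinny ? METADATA_ITEM : EXTENT_ITEM,
                         skinny ? UINT64_MAX : blocksize };
        int slot = search_slot(tree, n, &k);

        if (skinny) {
                if (slot == 0)
                        goto fallback;
                slot--;                         /* step back to the candidate */
                if (tree[slot].objectid == bytenr &&
                    (tree[slot].type == METADATA_ITEM ||
                     (tree[slot].type == EXTENT_ITEM &&
                      tree[slot].offset == blocksize)))
                        return slot;
fallback:       /* mixed fs: retry with the fat key */
                return find_tree_block(tree, n, bytenr, blocksize, 0);
        }
        return slot < n && key_cmp(&tree[slot], &k) == 0 ? slot : -1;
}

int main(void)
{
        const struct key tree[] = {
                { 4096, EXTENT_ITEM, 4096 },    /* fat entry    */
                { 8192, METADATA_ITEM, 1 },     /* skinny entry */
        };

        printf("skinny block: slot %d\n", find_tree_block(tree, 2, 8192, 4096, 1));
        printf("fat block:    slot %d\n", find_tree_block(tree, 2, 4096, 4096, 1));
        return 0;
}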
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4160 (unsigned long long)rc->block_group->key.objectid, 4222 (unsigned long long)rc->block_group->key.objectid,
4161 (unsigned long long)rc->block_group->flags); 4223 (unsigned long long)rc->block_group->flags);
4162 4224
4163 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4225 ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4164 if (ret < 0) { 4226 if (ret < 0) {
4165 err = ret; 4227 err = ret;
4166 goto out; 4228 goto out;
4167 } 4229 }
4168 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4230 btrfs_wait_all_ordered_extents(fs_info, 0);
4169 4231
4170 while (1) { 4232 while (1) {
4171 mutex_lock(&fs_info->cleaner_mutex); 4233 mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4277 key.type != BTRFS_ROOT_ITEM_KEY) 4339 key.type != BTRFS_ROOT_ITEM_KEY)
4278 break; 4340 break;
4279 4341
4280 reloc_root = btrfs_read_fs_root_no_radix(root, &key); 4342 reloc_root = btrfs_read_fs_root(root, &key);
4281 if (IS_ERR(reloc_root)) { 4343 if (IS_ERR(reloc_root)) {
4282 err = PTR_ERR(reloc_root); 4344 err = PTR_ERR(reloc_root);
4283 goto out; 4345 goto out;
@@ -4396,10 +4458,8 @@ out:
4396int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4397{ 4459{
4398 struct btrfs_ordered_sum *sums; 4460 struct btrfs_ordered_sum *sums;
4399 struct btrfs_sector_sum *sector_sum;
4400 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_ordered_extent *ordered;
4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4462 struct btrfs_root *root = BTRFS_I(inode)->root;
4402 size_t offset;
4403 int ret; 4463 int ret;
4404 u64 disk_bytenr; 4464 u64 disk_bytenr;
4405 LIST_HEAD(list); 4465 LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4413 if (ret) 4473 if (ret)
4414 goto out; 4474 goto out;
4415 4475
4476 disk_bytenr = ordered->start;
4416 while (!list_empty(&list)) { 4477 while (!list_empty(&list)) {
4417 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4478 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4418 list_del_init(&sums->list); 4479 list_del_init(&sums->list);
4419 4480
4420 sector_sum = sums->sums; 4481 sums->bytenr = disk_bytenr;
4421 sums->bytenr = ordered->start; 4482 disk_bytenr += sums->len;
4422
4423 offset = 0;
4424 while (offset < sums->len) {
4425 sector_sum->bytenr += ordered->start - disk_bytenr;
4426 sector_sum++;
4427 offset += root->sectorsize;
4428 }
4429 4483
4430 btrfs_add_ordered_sum(inode, ordered, sums); 4484 btrfs_add_ordered_sum(inode, ordered, sums);
4431 } 4485 }
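
Since btrfs_ordered_sum now carries a single bytenr for a contiguous run of checksums instead of a per-sector btrfs_sector_sum array, relocation only needs to rebase each block's start and advance by its length. A standalone sketch of the rebasing arithmetic; the names are invented:

#include <stdint.h>
#include <stdio.h>

struct ordered_sum {
        uint64_t bytenr;        /* disk start this block of csums covers */
        uint64_t len;           /* bytes covered */
};

static void rebase_sums(struct ordered_sum *sums, int n, uint64_t disk_bytenr)
{
        int i;

        for (i = 0; i < n; i++) {
                sums[i].bytenr = disk_bytenr;   /* rebase the whole block */
                disk_bytenr += sums[i].len;     /* blocks are contiguous  */
        }
}

int main(void)
{
        struct ordered_sum sums[] = { { 0, 65536 }, { 0, 32768 } };

        rebase_sums(sums, 2, 1048576);
        printf("%llu %llu\n",
               (unsigned long long)sums[0].bytenr,
               (unsigned long long)sums[1].bytenr);     /* 1048576 1114112 */
        return 0;
}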
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
64} 64}
65 65
66/* 66/*
67 * lookup the root with the highest offset for a given objectid. The key we do 67 * btrfs_find_root - lookup the root by the key.
68 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 68 * root: the root of the root tree
69 * on error. 69 * search_key: the key to search
70 * path: the path we search
71 * root_item: the root item of the tree we look for
 72 * root_key: the real key of the tree we look for
73 *
 74 * If ->offset of 'search_key' is -1ULL, the offset of the search key is
 75 * unknown; just look up the root with the highest offset for the
 76 * given objectid.
77 *
 78 * If we find something, return 0; otherwise return > 0, or < 0 on error.
70 */ 79 */
71int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 80int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
72 struct btrfs_root_item *item, struct btrfs_key *key) 81 struct btrfs_path *path, struct btrfs_root_item *root_item,
82 struct btrfs_key *root_key)
73{ 83{
74 struct btrfs_path *path;
75 struct btrfs_key search_key;
76 struct btrfs_key found_key; 84 struct btrfs_key found_key;
77 struct extent_buffer *l; 85 struct extent_buffer *l;
78 int ret; 86 int ret;
79 int slot; 87 int slot;
80 88
81 search_key.objectid = objectid; 89 ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
82 search_key.type = BTRFS_ROOT_ITEM_KEY;
83 search_key.offset = (u64)-1;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
89 if (ret < 0) 90 if (ret < 0)
90 goto out; 91 return ret;
91 92
92 BUG_ON(ret == 0); 93 if (search_key->offset != -1ULL) { /* the search key is exact */
93 if (path->slots[0] == 0) { 94 if (ret > 0)
94 ret = 1; 95 goto out;
95 goto out; 96 } else {
97 BUG_ON(ret == 0); /* Logical error */
98 if (path->slots[0] == 0)
99 goto out;
100 path->slots[0]--;
101 ret = 0;
96 } 102 }
103
97 l = path->nodes[0]; 104 l = path->nodes[0];
98 slot = path->slots[0] - 1; 105 slot = path->slots[0];
106
99 btrfs_item_key_to_cpu(l, &found_key, slot); 107 btrfs_item_key_to_cpu(l, &found_key, slot);
100 if (found_key.objectid != objectid || 108 if (found_key.objectid != search_key->objectid ||
101 found_key.type != BTRFS_ROOT_ITEM_KEY) { 109 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 110 ret = 1;
103 goto out; 111 goto out;
104 } 112 }
105 if (item)
106 btrfs_read_root_item(l, slot, item);
107 if (key)
108 memcpy(key, &found_key, sizeof(found_key));
109 113
110 ret = 0; 114 if (root_item)
115 btrfs_read_root_item(l, slot, root_item);
116 if (root_key)
117 memcpy(root_key, &found_key, sizeof(found_key));
111out: 118out:
112 btrfs_free_path(path); 119 btrfs_release_path(path);
113 return ret; 120 return ret;
114} 121}
115 122
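
btrfs_find_root therefore has two modes: an exact lookup when the caller knows the offset, and a "highest offset for this objectid" lookup when search_key->offset is -1ULL, where the search lands just past the last candidate and steps back one slot. A standalone sketch of the two modes over a sorted array standing in for the root tree; note the kernel returns 0 / > 0 / < 0 rather than a slot:

#include <stdint.h>
#include <stdio.h>

struct rkey { uint64_t objectid, offset; };

static int find_root(const struct rkey *t, int n, struct rkey want)
{
        int i, best = -1;

        for (i = 0; i < n; i++) {
                if (t[i].objectid != want.objectid)
                        continue;
                if (want.offset == UINT64_MAX)
                        best = i;               /* keep the highest offset */
                else if (t[i].offset == want.offset)
                        return i;               /* exact match */
        }
        return best;    /* -1 means "not found" (> 0 in the kernel) */
}

int main(void)
{
        const struct rkey tree[] = { { 5, 10 }, { 5, 20 }, { 7, 3 } };
        struct rkey latest = { 5, UINT64_MAX }, exact = { 5, 10 };

        printf("latest slot: %d\n", find_root(tree, 3, latest));       /* 1 */
        printf("exact slot:  %d\n", find_root(tree, 3, exact));        /* 0 */
        return 0;
}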
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
212 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 219 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
213} 220}
214 221
215/*
216 * at mount time we want to find all the old transaction snapshots that were in
217 * the process of being deleted if we crashed. This is any root item with an
218 * offset lower than the latest root. They need to be queued for deletion to
219 * finish what was happening when we crashed.
220 */
221int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
222{
223 struct btrfs_root *dead_root;
224 struct btrfs_root_item *ri;
225 struct btrfs_key key;
226 struct btrfs_key found_key;
227 struct btrfs_path *path;
228 int ret;
229 u32 nritems;
230 struct extent_buffer *leaf;
231 int slot;
232
233 key.objectid = objectid;
234 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
235 key.offset = 0;
236 path = btrfs_alloc_path();
237 if (!path)
238 return -ENOMEM;
239
240again:
241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
242 if (ret < 0)
243 goto err;
244 while (1) {
245 leaf = path->nodes[0];
246 nritems = btrfs_header_nritems(leaf);
247 slot = path->slots[0];
248 if (slot >= nritems) {
249 ret = btrfs_next_leaf(root, path);
250 if (ret)
251 break;
252 leaf = path->nodes[0];
253 nritems = btrfs_header_nritems(leaf);
254 slot = path->slots[0];
255 }
256 btrfs_item_key_to_cpu(leaf, &key, slot);
257 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
258 goto next;
259
260 if (key.objectid < objectid)
261 goto next;
262
263 if (key.objectid > objectid)
264 break;
265
266 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
267 if (btrfs_disk_root_refs(leaf, ri) != 0)
268 goto next;
269
270 memcpy(&found_key, &key, sizeof(key));
271 key.offset++;
272 btrfs_release_path(path);
273 dead_root =
274 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
275 &found_key);
276 if (IS_ERR(dead_root)) {
277 ret = PTR_ERR(dead_root);
278 goto err;
279 }
280
281 ret = btrfs_add_dead_root(dead_root);
282 if (ret)
283 goto err;
284 goto again;
285next:
286 slot++;
287 path->slots[0]++;
288 }
289 ret = 0;
290err:
291 btrfs_free_path(path);
292 return ret;
293}
294
295int btrfs_find_orphan_roots(struct btrfs_root *tree_root) 222int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
296{ 223{
297 struct extent_buffer *leaf; 224 struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
301 struct btrfs_root *root; 228 struct btrfs_root *root;
302 int err = 0; 229 int err = 0;
303 int ret; 230 int ret;
231 bool can_recover = true;
232
233 if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
234 can_recover = false;
304 235
305 path = btrfs_alloc_path(); 236 path = btrfs_alloc_path();
306 if (!path) 237 if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
340 root_key.objectid = key.offset; 271 root_key.objectid = key.offset;
341 key.offset++; 272 key.offset++;
342 273
343 root = btrfs_read_fs_root_no_name(tree_root->fs_info, 274 root = btrfs_read_fs_root(tree_root, &root_key);
344 &root_key); 275 err = PTR_RET(root);
345 if (!IS_ERR(root)) 276 if (err && err != -ENOENT) {
277 break;
278 } else if (err == -ENOENT) {
279 struct btrfs_trans_handle *trans;
280
281 btrfs_release_path(path);
282
283 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err,
287 "Failed to start trans to delete "
288 "orphan item");
289 break;
290 }
291 err = btrfs_del_orphan_item(trans, tree_root,
292 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root);
294 if (err) {
295 btrfs_error(tree_root->fs_info, err,
296 "Failed to delete root orphan "
297 "item");
298 break;
299 }
346 continue; 300 continue;
301 }
347 302
348 ret = PTR_ERR(root); 303 if (btrfs_root_refs(&root->root_item) == 0) {
349 if (ret != -ENOENT) { 304 btrfs_add_dead_root(root);
350 err = ret; 305 continue;
306 }
307
308 err = btrfs_init_fs_root(root);
309 if (err) {
310 btrfs_free_fs_root(root);
351 break; 311 break;
352 } 312 }
353 313
354 ret = btrfs_find_dead_roots(tree_root, root_key.objectid); 314 root->orphan_item_inserted = 1;
355 if (ret) { 315
356 err = ret; 316 err = btrfs_insert_fs_root(root->fs_info, root);
317 if (err) {
318 BUG_ON(err == -EEXIST);
319 btrfs_free_fs_root(root);
357 break; 320 break;
358 } 321 }
359 } 322 }
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
368{ 331{
369 struct btrfs_path *path; 332 struct btrfs_path *path;
370 int ret; 333 int ret;
371 struct btrfs_root_item *ri;
372 struct extent_buffer *leaf;
373 334
374 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
375 if (!path) 336 if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
379 goto out; 340 goto out;
380 341
381 BUG_ON(ret != 0); 342 BUG_ON(ret != 0);
382 leaf = path->nodes[0];
383 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
384 343
385 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
386out: 345out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..4ba2a69a60ad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126 u8 *csum) 2126 u8 *csum)
2127{ 2127{
2128 struct btrfs_ordered_sum *sum = NULL; 2128 struct btrfs_ordered_sum *sum = NULL;
2129 int ret = 0; 2129 unsigned long index;
2130 unsigned long i;
2131 unsigned long num_sectors; 2130 unsigned long num_sectors;
2132 2131
2133 while (!list_empty(&sctx->csum_list)) { 2132 while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2146 if (!sum) 2145 if (!sum)
2147 return 0; 2146 return 0;
2148 2147
2148 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149 num_sectors = sum->len / sctx->sectorsize; 2149 num_sectors = sum->len / sctx->sectorsize;
2150 for (i = 0; i < num_sectors; ++i) { 2150 memcpy(csum, sum->sums + index, sctx->csum_size);
2151 if (sum->sums[i].bytenr == logical) { 2151 if (index == num_sectors - 1) {
2152 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2153 ret = 1;
2154 break;
2155 }
2156 }
2157 if (ret && i == num_sectors - 1) {
2158 list_del(&sum->list); 2152 list_del(&sum->list);
2159 kfree(sum); 2153 kfree(sum);
2160 } 2154 }
2161 return ret; 2155 return 1;
2162} 2156}
2163 2157
2164/* scrub extent tries to collect up to 64 kB for each bio */ 2158/* scrub extent tries to collect up to 64 kB for each bio */
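
With the flat checksum layout, scrub computes the sector's position in the buffer directly instead of scanning per-sector records. A standalone sketch of the index arithmetic; unlike the hunk above, the byte offset is scaled by csum_size explicitly here, which is equivalent only when one array element is csum_size bytes wide:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        enum { SECTORSIZE = 4096, CSUM_SIZE = 4 /* crc32c */ };
        uint8_t sums[8 * CSUM_SIZE];    /* csums for 8 contiguous sectors */
        uint8_t csum[CSUM_SIZE];
        uint64_t bytenr = 1048576;      /* disk start the buffer covers */
        uint64_t logical = 1048576 + 3 * SECTORSIZE;
        unsigned long index;

        memset(sums, 0, sizeof(sums));
        sums[3 * CSUM_SIZE] = 0xab;     /* mark sector 3's csum */

        index = (uint32_t)(logical - bytenr) / SECTORSIZE;
        memcpy(csum, sums + index * CSUM_SIZE, CSUM_SIZE);

        printf("sector %lu csum byte 0 = 0x%02x\n", index, csum[0]);
        return 0;
}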
@@ -2505,6 +2499,7 @@ again:
2505 if (ret) 2499 if (ret)
2506 goto out; 2500 goto out;
2507 2501
2502 scrub_free_csums(sctx);
2508 if (extent_logical + extent_len < 2503 if (extent_logical + extent_len <
2509 key.objectid + bytes) { 2504 key.objectid + bytes) {
2510 logical += increment; 2505 logical += increment;
@@ -3204,16 +3199,18 @@ out:
3204 3199
3205static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3206{ 3201{
3207 unsigned long index;
3208 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3202 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3209 int ret = 0; 3203 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3210 struct btrfs_key key; 3204 struct btrfs_key key;
3211 struct inode *inode = NULL; 3205 struct inode *inode;
3206 struct page *page;
3212 struct btrfs_root *local_root; 3207 struct btrfs_root *local_root;
3213 u64 physical_for_dev_replace; 3208 u64 physical_for_dev_replace;
3214 u64 len; 3209 u64 len;
3215 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3210 unsigned long index;
3216 int srcu_index; 3211 int srcu_index;
3212 int ret;
3213 int err;
3217 3214
3218 key.objectid = root; 3215 key.objectid = root;
3219 key.type = BTRFS_ROOT_ITEM_KEY; 3216 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3227 return PTR_ERR(local_root); 3224 return PTR_ERR(local_root);
3228 } 3225 }
3229 3226
3227 if (btrfs_root_refs(&local_root->root_item) == 0) {
3228 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229 return -ENOENT;
3230 }
3231
3230 key.type = BTRFS_INODE_ITEM_KEY; 3232 key.type = BTRFS_INODE_ITEM_KEY;
3231 key.objectid = inum; 3233 key.objectid = inum;
3232 key.offset = 0; 3234 key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3235 if (IS_ERR(inode)) 3237 if (IS_ERR(inode))
3236 return PTR_ERR(inode); 3238 return PTR_ERR(inode);
3237 3239
 3240 /* Avoid truncate/dio/punch hole... */
3241 mutex_lock(&inode->i_mutex);
3242 inode_dio_wait(inode);
3243
3244 ret = 0;
3238 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3245 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3239 len = nocow_ctx->len; 3246 len = nocow_ctx->len;
3240 while (len >= PAGE_CACHE_SIZE) { 3247 while (len >= PAGE_CACHE_SIZE) {
3241 struct page *page = NULL;
3242 int ret_sub;
3243
3244 index = offset >> PAGE_CACHE_SHIFT; 3248 index = offset >> PAGE_CACHE_SHIFT;
3245 3249again:
3246 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3250 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3247 if (!page) { 3251 if (!page) {
3248 pr_err("find_or_create_page() failed\n"); 3252 pr_err("find_or_create_page() failed\n");
3249 ret = -ENOMEM; 3253 ret = -ENOMEM;
3250 goto next_page; 3254 goto out;
3251 } 3255 }
3252 3256
3253 if (PageUptodate(page)) { 3257 if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3255 goto next_page; 3259 goto next_page;
3256 } else { 3260 } else {
3257 ClearPageError(page); 3261 ClearPageError(page);
3258 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3262 err = extent_read_full_page(&BTRFS_I(inode)->
3259 io_tree, 3263 io_tree,
3260 page, btrfs_get_extent, 3264 page, btrfs_get_extent,
3261 nocow_ctx->mirror_num); 3265 nocow_ctx->mirror_num);
3262 if (ret_sub) { 3266 if (err) {
3263 ret = ret_sub; 3267 ret = err;
3264 goto next_page; 3268 goto next_page;
3265 } 3269 }
3266 wait_on_page_locked(page); 3270
3271 lock_page(page);
3272 /*
3273 * If the page has been remove from the page cache,
3274 * the data on it is meaningless, because it may be
3275 * old one, the new data may be written into the new
3276 * page in the page cache.
3277 */
3278 if (page->mapping != inode->i_mapping) {
3279 page_cache_release(page);
3280 goto again;
3281 }
3267 if (!PageUptodate(page)) { 3282 if (!PageUptodate(page)) {
3268 ret = -EIO; 3283 ret = -EIO;
3269 goto next_page; 3284 goto next_page;
3270 } 3285 }
3271 } 3286 }
3272 ret_sub = write_page_nocow(nocow_ctx->sctx, 3287 err = write_page_nocow(nocow_ctx->sctx,
3273 physical_for_dev_replace, page); 3288 physical_for_dev_replace, page);
3274 if (ret_sub) { 3289 if (err)
3275 ret = ret_sub; 3290 ret = err;
3276 goto next_page;
3277 }
3278
3279next_page: 3291next_page:
3280 if (page) { 3292 unlock_page(page);
3281 unlock_page(page); 3293 page_cache_release(page);
3282 put_page(page); 3294
3283 } 3295 if (ret)
3296 break;
3297
3284 offset += PAGE_CACHE_SIZE; 3298 offset += PAGE_CACHE_SIZE;
3285 physical_for_dev_replace += PAGE_CACHE_SIZE; 3299 physical_for_dev_replace += PAGE_CACHE_SIZE;
3286 len -= PAGE_CACHE_SIZE; 3300 len -= PAGE_CACHE_SIZE;
3287 } 3301 }
3288 3302out:
3289 if (inode) 3303 mutex_unlock(&inode->i_mutex);
3290 iput(inode); 3304 iput(inode);
3291 return ret; 3305 return ret;
3292} 3306}
3293 3307
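
The lock_page()/mapping recheck added above is the generic "lock, revalidate, retry" idiom: an object found without its lock held may have been replaced by the time the lock is taken, so the lookup must be redone. A standalone userspace sketch of the idiom, with a generation counter standing in for page->mapping:

#include <pthread.h>
#include <stdio.h>

struct slot {
        pthread_mutex_t lock;
        int generation;         /* bumped when the cached object is replaced */
};

static struct slot cache = { PTHREAD_MUTEX_INITIALIZER, 1 };

static int use_current_object(void)
{
        int gen;

again:
        gen = cache.generation;         /* lockless lookup snapshot */

        pthread_mutex_lock(&cache.lock);
        if (cache.generation != gen) {  /* object changed under us */
                pthread_mutex_unlock(&cache.lock);
                goto again;             /* retry with the fresh object */
        }
        /* ... safe to use the object here ... */
        pthread_mutex_unlock(&cache.lock);
        return gen;
}

int main(void)
{
        printf("used generation %d\n", use_current_object());
        return 0;
}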
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
158 } 158 }
159} 159}
160 160
161static struct fs_path *fs_path_alloc(struct send_ctx *sctx) 161static struct fs_path *fs_path_alloc(void)
162{ 162{
163 struct fs_path *p; 163 struct fs_path *p;
164 164
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
173 return p; 173 return p;
174} 174}
175 175
176static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) 176static struct fs_path *fs_path_alloc_reversed(void)
177{ 177{
178 struct fs_path *p; 178 struct fs_path *p;
179 179
180 p = fs_path_alloc(sctx); 180 p = fs_path_alloc();
181 if (!p) 181 if (!p)
182 return NULL; 182 return NULL;
183 p->reversed = 1; 183 p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
185 return p; 185 return p;
186} 186}
187 187
188static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) 188static void fs_path_free(struct fs_path *p)
189{ 189{
190 if (!p) 190 if (!p)
191 return; 191 return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
753 * 753 *
754 * path must point to the INODE_REF or INODE_EXTREF when called. 754 * path must point to the INODE_REF or INODE_EXTREF when called.
755 */ 755 */
756static int iterate_inode_ref(struct send_ctx *sctx, 756static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
757 struct btrfs_root *root, struct btrfs_path *path,
758 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
759 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
760{ 759{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
777 unsigned long elem_size; 776 unsigned long elem_size;
778 unsigned long ptr; 777 unsigned long ptr;
779 778
780 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed();
781 if (!p) 780 if (!p)
782 return -ENOMEM; 781 return -ENOMEM;
783 782
784 tmp_path = alloc_path_for_send(); 783 tmp_path = alloc_path_for_send();
785 if (!tmp_path) { 784 if (!tmp_path) {
786 fs_path_free(sctx, p); 785 fs_path_free(p);
787 return -ENOMEM; 786 return -ENOMEM;
788 } 787 }
789 788
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
858 857
859out: 858out:
860 btrfs_free_path(tmp_path); 859 btrfs_free_path(tmp_path);
861 fs_path_free(sctx, p); 860 fs_path_free(p);
862 return ret; 861 return ret;
863} 862}
864 863
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
874 * 873 *
875 * path must point to the dir item when called. 874 * path must point to the dir item when called.
876 */ 875 */
877static int iterate_dir_item(struct send_ctx *sctx, 876static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
878 struct btrfs_root *root, struct btrfs_path *path,
879 struct btrfs_key *found_key, 877 struct btrfs_key *found_key,
880 iterate_dir_item_t iterate, void *ctx) 878 iterate_dir_item_t iterate, void *ctx)
881{ 879{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
 990 * Retrieve the first path of an inode. If an inode has more than one 988 * Retrieve the first path of an inode. If an inode has more than one
991 * ref/hardlink, this is ignored. 989 * ref/hardlink, this is ignored.
992 */ 990 */
993static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, 991static int get_inode_path(struct btrfs_root *root,
994 u64 ino, struct fs_path *path) 992 u64 ino, struct fs_path *path)
995{ 993{
996 int ret; 994 int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
1022 goto out; 1020 goto out;
1023 } 1021 }
1024 1022
1025 ret = iterate_inode_ref(sctx, root, p, &found_key, 1, 1023 ret = iterate_inode_ref(root, p, &found_key, 1,
1026 __copy_first_ref, path); 1024 __copy_first_ref, path);
1027 if (ret < 0) 1025 if (ret < 0)
1028 goto out; 1026 goto out;
1029 ret = 0; 1027 ret = 0;
@@ -1314,8 +1312,7 @@ out:
1314 return ret; 1312 return ret;
1315} 1313}
1316 1314
1317static int read_symlink(struct send_ctx *sctx, 1315static int read_symlink(struct btrfs_root *root,
1318 struct btrfs_root *root,
1319 u64 ino, 1316 u64 ino,
1320 struct fs_path *dest) 1317 struct fs_path *dest)
1321{ 1318{
@@ -1562,8 +1559,7 @@ out:
1562 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1559 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1563 * generation of the parent dir and the name of the dir entry. 1560 * generation of the parent dir and the name of the dir entry.
1564 */ 1561 */
1565static int get_first_ref(struct send_ctx *sctx, 1562static int get_first_ref(struct btrfs_root *root, u64 ino,
1566 struct btrfs_root *root, u64 ino,
1567 u64 *dir, u64 *dir_gen, struct fs_path *name) 1563 u64 *dir, u64 *dir_gen, struct fs_path *name)
1568{ 1564{
1569 int ret; 1565 int ret;
@@ -1628,8 +1624,7 @@ out:
1628 return ret; 1624 return ret;
1629} 1625}
1630 1626
1631static int is_first_ref(struct send_ctx *sctx, 1627static int is_first_ref(struct btrfs_root *root,
1632 struct btrfs_root *root,
1633 u64 ino, u64 dir, 1628 u64 ino, u64 dir,
1634 const char *name, int name_len) 1629 const char *name, int name_len)
1635{ 1630{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
1638 u64 tmp_dir; 1633 u64 tmp_dir;
1639 u64 tmp_dir_gen; 1634 u64 tmp_dir_gen;
1640 1635
1641 tmp_name = fs_path_alloc(sctx); 1636 tmp_name = fs_path_alloc();
1642 if (!tmp_name) 1637 if (!tmp_name)
1643 return -ENOMEM; 1638 return -ENOMEM;
1644 1639
1645 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1640 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1646 if (ret < 0) 1641 if (ret < 0)
1647 goto out; 1642 goto out;
1648 1643
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
1654 ret = !memcmp(tmp_name->start, name, name_len); 1649 ret = !memcmp(tmp_name->start, name, name_len);
1655 1650
1656out: 1651out:
1657 fs_path_free(sctx, tmp_name); 1652 fs_path_free(tmp_name);
1658 return ret; 1653 return ret;
1659} 1654}
1660 1655
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1783 if (!sctx->parent_root) 1778 if (!sctx->parent_root)
1784 goto out; 1779 goto out;
1785 1780
1786 name = fs_path_alloc(sctx); 1781 name = fs_path_alloc();
1787 if (!name) 1782 if (!name)
1788 return -ENOMEM; 1783 return -ENOMEM;
1789 1784
1790 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); 1785 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
1791 if (ret < 0) 1786 if (ret < 0)
1792 goto out; 1787 goto out;
1793 1788
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1795 name->start, fs_path_len(name)); 1790 name->start, fs_path_len(name));
1796 1791
1797out: 1792out:
1798 fs_path_free(sctx, name); 1793 fs_path_free(name);
1799 return ret; 1794 return ret;
1800} 1795}
1801 1796
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 * send_root or parent_root for ref lookup. 1974 * send_root or parent_root for ref lookup.
1980 */ 1975 */
1981 if (ino < sctx->send_progress) 1976 if (ino < sctx->send_progress)
1982 ret = get_first_ref(sctx, sctx->send_root, ino, 1977 ret = get_first_ref(sctx->send_root, ino,
1983 parent_ino, parent_gen, dest); 1978 parent_ino, parent_gen, dest);
1984 else 1979 else
1985 ret = get_first_ref(sctx, sctx->parent_root, ino, 1980 ret = get_first_ref(sctx->parent_root, ino,
1986 parent_ino, parent_gen, dest); 1981 parent_ino, parent_gen, dest);
1987 if (ret < 0) 1982 if (ret < 0)
1988 goto out; 1983 goto out;
1989 1984
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2070 u64 parent_gen = 0; 2065 u64 parent_gen = 0;
2071 int stop = 0; 2066 int stop = 0;
2072 2067
2073 name = fs_path_alloc(sctx); 2068 name = fs_path_alloc();
2074 if (!name) { 2069 if (!name) {
2075 ret = -ENOMEM; 2070 ret = -ENOMEM;
2076 goto out; 2071 goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2098 } 2093 }
2099 2094
2100out: 2095out:
2101 fs_path_free(sctx, name); 2096 fs_path_free(name);
2102 if (!ret) 2097 if (!ret)
2103 fs_path_unreverse(dest); 2098 fs_path_unreverse(dest);
2104 return ret; 2099 return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2263 2258
2264verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); 2259verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2265 2260
2266 p = fs_path_alloc(sctx); 2261 p = fs_path_alloc();
2267 if (!p) 2262 if (!p)
2268 return -ENOMEM; 2263 return -ENOMEM;
2269 2264
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2281 2276
2282tlv_put_failure: 2277tlv_put_failure:
2283out: 2278out:
2284 fs_path_free(sctx, p); 2279 fs_path_free(p);
2285 return ret; 2280 return ret;
2286} 2281}
2287 2282
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2292 2287
2293verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); 2288verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2294 2289
2295 p = fs_path_alloc(sctx); 2290 p = fs_path_alloc();
2296 if (!p) 2291 if (!p)
2297 return -ENOMEM; 2292 return -ENOMEM;
2298 2293
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2310 2305
2311tlv_put_failure: 2306tlv_put_failure:
2312out: 2307out:
2313 fs_path_free(sctx, p); 2308 fs_path_free(p);
2314 return ret; 2309 return ret;
2315} 2310}
2316 2311
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2321 2316
2322verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); 2317verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2323 2318
2324 p = fs_path_alloc(sctx); 2319 p = fs_path_alloc();
2325 if (!p) 2320 if (!p)
2326 return -ENOMEM; 2321 return -ENOMEM;
2327 2322
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2340 2335
2341tlv_put_failure: 2336tlv_put_failure:
2342out: 2337out:
2343 fs_path_free(sctx, p); 2338 fs_path_free(p);
2344 return ret; 2339 return ret;
2345} 2340}
2346 2341
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2356 2351
2357verbose_printk("btrfs: send_utimes %llu\n", ino); 2352verbose_printk("btrfs: send_utimes %llu\n", ino);
2358 2353
2359 p = fs_path_alloc(sctx); 2354 p = fs_path_alloc();
2360 if (!p) 2355 if (!p)
2361 return -ENOMEM; 2356 return -ENOMEM;
2362 2357
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2397 2392
2398tlv_put_failure: 2393tlv_put_failure:
2399out: 2394out:
2400 fs_path_free(sctx, p); 2395 fs_path_free(p);
2401 btrfs_free_path(path); 2396 btrfs_free_path(path);
2402 return ret; 2397 return ret;
2403} 2398}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
2418 2413
2419verbose_printk("btrfs: send_create_inode %llu\n", ino); 2414verbose_printk("btrfs: send_create_inode %llu\n", ino);
2420 2415
2421 p = fs_path_alloc(sctx); 2416 p = fs_path_alloc();
2422 if (!p) 2417 if (!p)
2423 return -ENOMEM; 2418 return -ENOMEM;
2424 2419
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2459 2454
2460 if (S_ISLNK(mode)) { 2455 if (S_ISLNK(mode)) {
2461 fs_path_reset(p); 2456 fs_path_reset(p);
2462 ret = read_symlink(sctx, sctx->send_root, ino, p); 2457 ret = read_symlink(sctx->send_root, ino, p);
2463 if (ret < 0) 2458 if (ret < 0)
2464 goto out; 2459 goto out;
2465 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2460 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2476 2471
2477tlv_put_failure: 2472tlv_put_failure:
2478out: 2473out:
2479 fs_path_free(sctx, p); 2474 fs_path_free(p);
2480 return ret; 2475 return ret;
2481} 2476}
2482 2477
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
2615 return 0; 2610 return 0;
2616} 2611}
2617 2612
2618static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2613static void __free_recorded_refs(struct list_head *head)
2619{ 2614{
2620 struct recorded_ref *cur; 2615 struct recorded_ref *cur;
2621 2616
2622 while (!list_empty(head)) { 2617 while (!list_empty(head)) {
2623 cur = list_entry(head->next, struct recorded_ref, list); 2618 cur = list_entry(head->next, struct recorded_ref, list);
2624 fs_path_free(sctx, cur->full_path); 2619 fs_path_free(cur->full_path);
2625 list_del(&cur->list); 2620 list_del(&cur->list);
2626 kfree(cur); 2621 kfree(cur);
2627 } 2622 }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2629 2624
2630static void free_recorded_refs(struct send_ctx *sctx) 2625static void free_recorded_refs(struct send_ctx *sctx)
2631{ 2626{
2632 __free_recorded_refs(sctx, &sctx->new_refs); 2627 __free_recorded_refs(&sctx->new_refs);
2633 __free_recorded_refs(sctx, &sctx->deleted_refs); 2628 __free_recorded_refs(&sctx->deleted_refs);
2634} 2629}
2635 2630
2636/* 2631/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2644 int ret; 2639 int ret;
2645 struct fs_path *orphan; 2640 struct fs_path *orphan;
2646 2641
2647 orphan = fs_path_alloc(sctx); 2642 orphan = fs_path_alloc();
2648 if (!orphan) 2643 if (!orphan)
2649 return -ENOMEM; 2644 return -ENOMEM;
2650 2645
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2655 ret = send_rename(sctx, path, orphan); 2650 ret = send_rename(sctx, path, orphan);
2656 2651
2657out: 2652out:
2658 fs_path_free(sctx, orphan); 2653 fs_path_free(orphan);
2659 return ret; 2654 return ret;
2660} 2655}
2661 2656
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2746 */ 2741 */
2747 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 2742 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2748 2743
2749 valid_path = fs_path_alloc(sctx); 2744 valid_path = fs_path_alloc();
2750 if (!valid_path) { 2745 if (!valid_path) {
2751 ret = -ENOMEM; 2746 ret = -ENOMEM;
2752 goto out; 2747 goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2843 if (ret < 0) 2838 if (ret < 0)
2844 goto out; 2839 goto out;
2845 if (ret) { 2840 if (ret) {
2846 ret = is_first_ref(sctx, sctx->parent_root, 2841 ret = is_first_ref(sctx->parent_root,
2847 ow_inode, cur->dir, cur->name, 2842 ow_inode, cur->dir, cur->name,
2848 cur->name_len); 2843 cur->name_len);
2849 if (ret < 0) 2844 if (ret < 0)
2850 goto out; 2845 goto out;
2851 if (ret) { 2846 if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3024out: 3019out:
3025 free_recorded_refs(sctx); 3020 free_recorded_refs(sctx);
3026 ulist_free(check_dirs); 3021 ulist_free(check_dirs);
3027 fs_path_free(sctx, valid_path); 3022 fs_path_free(valid_path);
3028 return ret; 3023 return ret;
3029} 3024}
3030 3025
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3037 struct fs_path *p; 3032 struct fs_path *p;
3038 u64 gen; 3033 u64 gen;
3039 3034
3040 p = fs_path_alloc(sctx); 3035 p = fs_path_alloc();
3041 if (!p) 3036 if (!p)
3042 return -ENOMEM; 3037 return -ENOMEM;
3043 3038
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3057 3052
3058out: 3053out:
3059 if (ret) 3054 if (ret)
3060 fs_path_free(sctx, p); 3055 fs_path_free(p);
3061 return ret; 3056 return ret;
3062} 3057}
3063 3058
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3070 struct fs_path *p; 3065 struct fs_path *p;
3071 u64 gen; 3066 u64 gen;
3072 3067
3073 p = fs_path_alloc(sctx); 3068 p = fs_path_alloc();
3074 if (!p) 3069 if (!p)
3075 return -ENOMEM; 3070 return -ENOMEM;
3076 3071
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3090 3085
3091out: 3086out:
3092 if (ret) 3087 if (ret)
3093 fs_path_free(sctx, p); 3088 fs_path_free(p);
3094 return ret; 3089 return ret;
3095} 3090}
3096 3091
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
3098{ 3093{
3099 int ret; 3094 int ret;
3100 3095
3101 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3096 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3102 sctx->cmp_key, 0, __record_new_ref, sctx); 3097 sctx->cmp_key, 0, __record_new_ref, sctx);
3103 if (ret < 0) 3098 if (ret < 0)
3104 goto out; 3099 goto out;
3105 ret = 0; 3100 ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
3112{ 3107{
3113 int ret; 3108 int ret;
3114 3109
3115 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3110 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3116 sctx->cmp_key, 0, __record_deleted_ref, sctx); 3111 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3117 if (ret < 0) 3112 if (ret < 0)
3118 goto out; 3113 goto out;
3119 ret = 0; 3114 ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
3142 return 0; 3137 return 0;
3143} 3138}
3144 3139
3145static int find_iref(struct send_ctx *sctx, 3140static int find_iref(struct btrfs_root *root,
3146 struct btrfs_root *root,
3147 struct btrfs_path *path, 3141 struct btrfs_path *path,
3148 struct btrfs_key *key, 3142 struct btrfs_key *key,
3149 u64 dir, struct fs_path *name) 3143 u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
3155 ctx.name = name; 3149 ctx.name = name;
3156 ctx.found_idx = -1; 3150 ctx.found_idx = -1;
3157 3151
3158 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); 3152 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
3159 if (ret < 0) 3153 if (ret < 0)
3160 return ret; 3154 return ret;
3161 3155
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
3172 int ret; 3166 int ret;
3173 struct send_ctx *sctx = ctx; 3167 struct send_ctx *sctx = ctx;
3174 3168
3175 ret = find_iref(sctx, sctx->parent_root, sctx->right_path, 3169 ret = find_iref(sctx->parent_root, sctx->right_path,
3176 sctx->cmp_key, dir, name); 3170 sctx->cmp_key, dir, name);
3177 if (ret == -ENOENT) 3171 if (ret == -ENOENT)
3178 ret = __record_new_ref(num, dir, index, name, sctx); 3172 ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
3189 int ret; 3183 int ret;
3190 struct send_ctx *sctx = ctx; 3184 struct send_ctx *sctx = ctx;
3191 3185
3192 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3186 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
3193 dir, name); 3187 dir, name);
3194 if (ret == -ENOENT) 3188 if (ret == -ENOENT)
3195 ret = __record_deleted_ref(num, dir, index, name, sctx); 3189 ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
3203{ 3197{
3204 int ret = 0; 3198 int ret = 0;
3205 3199
3206 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3200 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3207 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 3201 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3208 if (ret < 0) 3202 if (ret < 0)
3209 goto out; 3203 goto out;
3210 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3204 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3211 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 3205 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3212 if (ret < 0) 3206 if (ret < 0)
3213 goto out; 3207 goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
3266 found_key.type != BTRFS_INODE_EXTREF_KEY)) 3260 found_key.type != BTRFS_INODE_EXTREF_KEY))
3267 break; 3261 break;
3268 3262
3269 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, 3263 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3270 sctx);
3271 btrfs_release_path(path); 3264 btrfs_release_path(path);
3272 if (ret < 0) 3265 if (ret < 0)
3273 goto out; 3266 goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3335 struct fs_path *p; 3328 struct fs_path *p;
3336 posix_acl_xattr_header dummy_acl; 3329 posix_acl_xattr_header dummy_acl;
3337 3330
3338 p = fs_path_alloc(sctx); 3331 p = fs_path_alloc();
3339 if (!p) 3332 if (!p)
3340 return -ENOMEM; 3333 return -ENOMEM;
3341 3334
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3362 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 3355 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3363 3356
3364out: 3357out:
3365 fs_path_free(sctx, p); 3358 fs_path_free(p);
3366 return ret; 3359 return ret;
3367} 3360}
3368 3361
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3375 struct send_ctx *sctx = ctx; 3368 struct send_ctx *sctx = ctx;
3376 struct fs_path *p; 3369 struct fs_path *p;
3377 3370
3378 p = fs_path_alloc(sctx); 3371 p = fs_path_alloc();
3379 if (!p) 3372 if (!p)
3380 return -ENOMEM; 3373 return -ENOMEM;
3381 3374
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3386 ret = send_remove_xattr(sctx, p, name, name_len); 3379 ret = send_remove_xattr(sctx, p, name, name_len);
3387 3380
3388out: 3381out:
3389 fs_path_free(sctx, p); 3382 fs_path_free(p);
3390 return ret; 3383 return ret;
3391} 3384}
3392 3385
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
3394{ 3387{
3395 int ret = 0; 3388 int ret = 0;
3396 3389
3397 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3390 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3398 sctx->cmp_key, __process_new_xattr, sctx); 3391 sctx->cmp_key, __process_new_xattr, sctx);
3399 3392
3400 return ret; 3393 return ret;
3401} 3394}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
3404{ 3397{
3405 int ret; 3398 int ret;
3406 3399
3407 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3400 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3408 sctx->cmp_key, __process_deleted_xattr, sctx); 3401 sctx->cmp_key, __process_deleted_xattr, sctx);
3409 3402
3410 return ret; 3403 return ret;
3411} 3404}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
3429 strncmp(name, ctx->name, name_len) == 0) { 3422 strncmp(name, ctx->name, name_len) == 0) {
3430 ctx->found_idx = num; 3423 ctx->found_idx = num;
3431 ctx->found_data_len = data_len; 3424 ctx->found_data_len = data_len;
3432 ctx->found_data = kmalloc(data_len, GFP_NOFS); 3425 ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
3433 if (!ctx->found_data) 3426 if (!ctx->found_data)
3434 return -ENOMEM; 3427 return -ENOMEM;
3435 memcpy(ctx->found_data, data, data_len);
3436 return 1; 3428 return 1;
3437 } 3429 }
3438 return 0; 3430 return 0;
3439} 3431}
3440 3432
3441static int find_xattr(struct send_ctx *sctx, 3433static int find_xattr(struct btrfs_root *root,
3442 struct btrfs_root *root,
3443 struct btrfs_path *path, 3434 struct btrfs_path *path,
3444 struct btrfs_key *key, 3435 struct btrfs_key *key,
3445 const char *name, int name_len, 3436 const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
3454 ctx.found_data = NULL; 3445 ctx.found_data = NULL;
3455 ctx.found_data_len = 0; 3446 ctx.found_data_len = 0;
3456 3447
3457 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); 3448 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
3458 if (ret < 0) 3449 if (ret < 0)
3459 return ret; 3450 return ret;
3460 3451
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3480 char *found_data = NULL; 3471 char *found_data = NULL;
3481 int found_data_len = 0; 3472 int found_data_len = 0;
3482 3473
3483 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, 3474 ret = find_xattr(sctx->parent_root, sctx->right_path,
3484 sctx->cmp_key, name, name_len, &found_data, 3475 sctx->cmp_key, name, name_len, &found_data,
3485 &found_data_len); 3476 &found_data_len);
3486 if (ret == -ENOENT) { 3477 if (ret == -ENOENT) {
3487 ret = __process_new_xattr(num, di_key, name, name_len, data, 3478 ret = __process_new_xattr(num, di_key, name, name_len, data,
3488 data_len, type, ctx); 3479 data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3508 int ret; 3499 int ret;
3509 struct send_ctx *sctx = ctx; 3500 struct send_ctx *sctx = ctx;
3510 3501
3511 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3502 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
3512 name, name_len, NULL, NULL); 3503 name, name_len, NULL, NULL);
3513 if (ret == -ENOENT) 3504 if (ret == -ENOENT)
3514 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 3505 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3515 data_len, type, ctx); 3506 data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
3523{ 3514{
3524 int ret = 0; 3515 int ret = 0;
3525 3516
3526 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3517 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3527 sctx->cmp_key, __process_changed_new_xattr, sctx); 3518 sctx->cmp_key, __process_changed_new_xattr, sctx);
3528 if (ret < 0) 3519 if (ret < 0)
3529 goto out; 3520 goto out;
3530 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3521 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3531 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 3522 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3532 3523
3533out: 3524out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3572 goto out; 3563 goto out;
3573 } 3564 }
3574 3565
3575 ret = iterate_dir_item(sctx, root, path, &found_key, 3566 ret = iterate_dir_item(root, path, &found_key,
3576 __process_new_xattr, sctx); 3567 __process_new_xattr, sctx);
3577 if (ret < 0) 3568 if (ret < 0)
3578 goto out; 3569 goto out;
3579 3570
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3598 int num_read = 0; 3589 int num_read = 0;
3599 mm_segment_t old_fs; 3590 mm_segment_t old_fs;
3600 3591
3601 p = fs_path_alloc(sctx); 3592 p = fs_path_alloc();
3602 if (!p) 3593 if (!p)
3603 return -ENOMEM; 3594 return -ENOMEM;
3604 3595
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3640 3631
3641tlv_put_failure: 3632tlv_put_failure:
3642out: 3633out:
3643 fs_path_free(sctx, p); 3634 fs_path_free(p);
3644 set_fs(old_fs); 3635 set_fs(old_fs);
3645 if (ret < 0) 3636 if (ret < 0)
3646 return ret; 3637 return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3663 clone_root->root->objectid, clone_root->ino, 3654 clone_root->root->objectid, clone_root->ino,
3664 clone_root->offset); 3655 clone_root->offset);
3665 3656
3666 p = fs_path_alloc(sctx); 3657 p = fs_path_alloc();
3667 if (!p) 3658 if (!p)
3668 return -ENOMEM; 3659 return -ENOMEM;
3669 3660
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3686 goto out; 3677 goto out;
3687 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3678 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3688 } else { 3679 } else {
3689 ret = get_inode_path(sctx, clone_root->root, 3680 ret = get_inode_path(clone_root->root, clone_root->ino, p);
3690 clone_root->ino, p);
3691 } 3681 }
3692 if (ret < 0) 3682 if (ret < 0)
3693 goto out; 3683 goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3704 3694
3705tlv_put_failure: 3695tlv_put_failure:
3706out: 3696out:
3707 fs_path_free(sctx, p); 3697 fs_path_free(p);
3708 return ret; 3698 return ret;
3709} 3699}
3710 3700
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
3717 int ret = 0; 3707 int ret = 0;
3718 struct fs_path *p; 3708 struct fs_path *p;
3719 3709
3720 p = fs_path_alloc(sctx); 3710 p = fs_path_alloc();
3721 if (!p) 3711 if (!p)
3722 return -ENOMEM; 3712 return -ENOMEM;
3723 3713
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
3737 3727
3738tlv_put_failure: 3728tlv_put_failure:
3739out: 3729out:
3740 fs_path_free(sctx, p); 3730 fs_path_free(p);
3741 return ret; 3731 return ret;
3742} 3732}
3743 3733
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4579 send_root = BTRFS_I(file_inode(mnt_file))->root; 4569 send_root = BTRFS_I(file_inode(mnt_file))->root;
4580 fs_info = send_root->fs_info; 4570 fs_info = send_root->fs_info;
4581 4571
4572 /*
 4573 * This is done when we look up the root; it should already be complete
4574 * by the time we get here.
4575 */
4576 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4577
4578 /*
 4579 * If we just created this root, we need to make sure that the orphan
 4580 * cleanup has been done and committed, since we search the commit root.
 4581 * So compare its commit root transid with our otransid; if they match,
 4582 * commit the transaction to make sure everything is updated.
4583 */
4584 down_read(&send_root->fs_info->extent_commit_sem);
4585 if (btrfs_header_generation(send_root->commit_root) ==
4586 btrfs_root_otransid(&send_root->root_item)) {
4587 struct btrfs_trans_handle *trans;
4588
4589 up_read(&send_root->fs_info->extent_commit_sem);
4590
4591 trans = btrfs_attach_transaction_barrier(send_root);
4592 if (IS_ERR(trans)) {
4593 if (PTR_ERR(trans) != -ENOENT) {
4594 ret = PTR_ERR(trans);
4595 goto out;
4596 }
 4597 /* ENOENT means there's no transaction */
4598 } else {
4599 ret = btrfs_commit_transaction(trans, send_root);
4600 if (ret)
4601 goto out;
4602 }
4603 } else {
4604 up_read(&send_root->fs_info->extent_commit_sem);
4605 }
4606
4582 arg = memdup_user(arg_, sizeof(*arg)); 4607 arg = memdup_user(arg_, sizeof(*arg));
4583 if (IS_ERR(arg)) { 4608 if (IS_ERR(arg)) {
4584 ret = PTR_ERR(arg); 4609 ret = PTR_ERR(arg);
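
btrfs_attach_transaction_barrier only attaches to a transaction that is already running, so -ENOENT here is the "nothing in flight, nothing to commit" case rather than an error. A standalone sketch of handling such a pointer-or-errno return, with a minimal ERR_PTR/IS_ERR emulation (the kernel's real helpers live in err.h):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct txn { int id; };

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        return (uintptr_t)p >= (uintptr_t)-4095;
}

static struct txn *attach_transaction(int running)
{
        static struct txn t = { 42 };

        return running ? &t : ERR_PTR(-ENOENT);
}

static int commit_if_running(int running)
{
        struct txn *t = attach_transaction(running);

        if (IS_ERR(t)) {
                if (PTR_ERR(t) != -ENOENT)
                        return PTR_ERR(t);      /* a real error */
                return 0;                       /* no transaction: done */
        }
        printf("committing txn %d\n", t->id);
        return 0;
}

int main(void)
{
        return commit_if_running(1) || commit_if_running(0);
}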
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4663 key.type = BTRFS_ROOT_ITEM_KEY; 4688 key.type = BTRFS_ROOT_ITEM_KEY;
4664 key.offset = (u64)-1; 4689 key.offset = (u64)-1;
4665 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 4690 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4666 if (!clone_root) {
4667 ret = -EINVAL;
4668 goto out;
4669 }
4670 if (IS_ERR(clone_root)) { 4691 if (IS_ERR(clone_root)) {
4671 ret = PTR_ERR(clone_root); 4692 ret = PTR_ERR(clone_root);
4672 goto out; 4693 goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4682 key.type = BTRFS_ROOT_ITEM_KEY; 4703 key.type = BTRFS_ROOT_ITEM_KEY;
4683 key.offset = (u64)-1; 4704 key.offset = (u64)-1;
4684 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 4705 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4685 if (!sctx->parent_root) { 4706 if (IS_ERR(sctx->parent_root)) {
4686 ret = -EINVAL; 4707 ret = PTR_ERR(sctx->parent_root);
4687 goto out; 4708 goto out;
4688 } 4709 }
4689 } 4710 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
54#include "version.h"
55#include "export.h" 54#include "export.h"
56#include "compression.h" 55#include "compression.h"
57#include "rcu-string.h" 56#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
266 return; 265 return;
267 } 266 }
268 ACCESS_ONCE(trans->transaction->aborted) = errno; 267 ACCESS_ONCE(trans->transaction->aborted) = errno;
268 /* Wake up anybody who may be waiting on this transaction */
269 wake_up(&root->fs_info->transaction_wait);
270 wake_up(&root->fs_info->transaction_blocked_wait);
269 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
270} 272}
271/* 273/*
@@ -776,9 +778,6 @@ find_root:
776 if (IS_ERR(new_root)) 778 if (IS_ERR(new_root))
777 return ERR_CAST(new_root); 779 return ERR_CAST(new_root);
778 780
779 if (btrfs_root_refs(&new_root->root_item) == 0)
780 return ERR_PTR(-ENOENT);
781
782 dir_id = btrfs_root_dirid(&new_root->root_item); 781 dir_id = btrfs_root_dirid(&new_root->root_item);
783setup_root: 782setup_root:
784 location.objectid = dir_id; 783 location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
866 return 0; 865 return 0;
867 } 866 }
868 867
869 btrfs_wait_ordered_extents(root, 1); 868 btrfs_wait_all_ordered_extents(fs_info, 1);
870 869
871 trans = btrfs_attach_transaction_barrier(root); 870 trans = btrfs_attach_transaction_barrier(root);
872 if (IS_ERR(trans)) { 871 if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
1685 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1684 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1686} 1685}
1687 1686
1687static void btrfs_print_info(void)
1688{
1689 printk(KERN_INFO "Btrfs loaded"
1690#ifdef CONFIG_BTRFS_DEBUG
1691 ", debug=on"
1692#endif
1693#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1694 ", integrity-checker=on"
1695#endif
1696 "\n");
1697}
1698
1688static int __init init_btrfs_fs(void) 1699static int __init init_btrfs_fs(void)
1689{ 1700{
1690 int err; 1701 int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
1733 1744
1734 btrfs_init_lockdep(); 1745 btrfs_init_lockdep();
1735 1746
1736#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1747 btrfs_print_info();
1737 btrfs_test_free_space_cache(); 1748 btrfs_test_free_space_cache();
1738#endif
1739 1749
1740 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
1741 return 0; 1750 return 0;
1742 1751
1743unregister_ioctl: 1752unregister_ioctl:
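
The new wake_up() calls in __btrfs_abort_transaction() pair with the reworked wait conditions in transaction.c below: every waiter now also checks ->aborted. A sketch of the waiter side, using only fields from this patch; without the aborted clause, the wakeup from the abort path would put the task straight back to sleep on a transaction that will never unblock:

        wait_event(root->fs_info->transaction_wait,
                   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
                   cur_trans->aborted);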
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
34 34
35#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
36 36
37static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
38 [TRANS_STATE_RUNNING] = 0U,
39 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
40 __TRANS_START),
41 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
42 __TRANS_START |
43 __TRANS_ATTACH),
44 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
45 __TRANS_START |
46 __TRANS_ATTACH |
47 __TRANS_JOIN),
48 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
49 __TRANS_START |
50 __TRANS_ATTACH |
51 __TRANS_JOIN |
52 __TRANS_JOIN_NOLOCK),
53 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
54 __TRANS_START |
55 __TRANS_ATTACH |
56 __TRANS_JOIN |
57 __TRANS_JOIN_NOLOCK),
58};
59
37static void put_transaction(struct btrfs_transaction *transaction) 60static void put_transaction(struct btrfs_transaction *transaction)
38{ 61{
39 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
40 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(transaction->delayed_refs.root.rb_node);
66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em;
68
69 em = list_first_entry(&transaction->pending_chunks,
70 struct extent_map, list);
71 list_del_init(&em->list);
72 free_extent_map(em);
73 }
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 74 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 75 }
45} 76}
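
The btrfs_blocked_trans_types[] table replaces can_join_transaction(): for each transaction state it lists the handle types that may no longer join, so the join check (see join_transaction() below) becomes a single mask test. A minimal sketch; the helper name is hypothetical:

static inline bool trans_type_blocked(struct btrfs_transaction *cur_trans,
                                      unsigned int type)
{
        /* non-zero when this handle type must not join the transaction
         * in its current state */
        return btrfs_blocked_trans_types[cur_trans->state] & type;
}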
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50 root->commit_root = btrfs_root_node(root); 81 root->commit_root = btrfs_root_node(root);
51} 82}
52 83
53static inline int can_join_transaction(struct btrfs_transaction *trans, 84static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
54 int type) 85 unsigned int type)
86{
87 if (type & TRANS_EXTWRITERS)
88 atomic_inc(&trans->num_extwriters);
89}
90
91static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
92 unsigned int type)
93{
94 if (type & TRANS_EXTWRITERS)
95 atomic_dec(&trans->num_extwriters);
96}
97
98static inline void extwriter_counter_init(struct btrfs_transaction *trans,
99 unsigned int type)
100{
101 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
102}
103
104static inline int extwriter_counter_read(struct btrfs_transaction *trans)
55{ 105{
56 return !(trans->in_commit && 106 return atomic_read(&trans->num_extwriters);
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59} 107}
60 108
61/* 109/*
62 * either allocate a new transaction or hop into the existing one 110 * either allocate a new transaction or hop into the existing one
63 */ 111 */
64static noinline int join_transaction(struct btrfs_root *root, int type) 112static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
65{ 113{
66 struct btrfs_transaction *cur_trans; 114 struct btrfs_transaction *cur_trans;
67 struct btrfs_fs_info *fs_info = root->fs_info; 115 struct btrfs_fs_info *fs_info = root->fs_info;
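
num_extwriters counts only the TRANS_EXTWRITERS handle types (USERSPACE/START/ATTACH); JOIN and JOIN_NOLOCK are deliberately excluded so the commit can drain external writers while internal joiners keep working. The drain itself, as the commit path later in this patch performs it:

        wait_event(cur_trans->writer_wait,
                   extwriter_counter_read(cur_trans) == 0);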
@@ -74,32 +122,19 @@ loop:
74 return -EROFS; 122 return -EROFS;
75 } 123 }
76 124
77 if (fs_info->trans_no_join) {
78 /*
79 * If we are JOIN_NOLOCK we're already committing a current
80 * transaction, we just need a handle to deal with something
81 * when committing the transaction, such as inode cache and
82 * space cache. It is a special case.
83 */
84 if (type != TRANS_JOIN_NOLOCK) {
85 spin_unlock(&fs_info->trans_lock);
86 return -EBUSY;
87 }
88 }
89
90 cur_trans = fs_info->running_transaction; 125 cur_trans = fs_info->running_transaction;
91 if (cur_trans) { 126 if (cur_trans) {
92 if (cur_trans->aborted) { 127 if (cur_trans->aborted) {
93 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
94 return cur_trans->aborted; 129 return cur_trans->aborted;
95 } 130 }
96 if (!can_join_transaction(cur_trans, type)) { 131 if (btrfs_blocked_trans_types[cur_trans->state] & type) {
97 spin_unlock(&fs_info->trans_lock); 132 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY; 133 return -EBUSY;
99 } 134 }
100 atomic_inc(&cur_trans->use_count); 135 atomic_inc(&cur_trans->use_count);
101 atomic_inc(&cur_trans->num_writers); 136 atomic_inc(&cur_trans->num_writers);
102 cur_trans->num_joined++; 137 extwriter_counter_inc(cur_trans, type);
103 spin_unlock(&fs_info->trans_lock); 138 spin_unlock(&fs_info->trans_lock);
104 return 0; 139 return 0;
105 } 140 }
@@ -112,6 +147,12 @@ loop:
112 if (type == TRANS_ATTACH) 147 if (type == TRANS_ATTACH)
113 return -ENOENT; 148 return -ENOENT;
114 149
150 /*
151 * JOIN_NOLOCK only happens during the transaction commit, so
152 * ->running_transaction cannot be NULL here
153 */
154 BUG_ON(type == TRANS_JOIN_NOLOCK);
155
115 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 156 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
116 if (!cur_trans) 157 if (!cur_trans)
117 return -ENOMEM; 158 return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
120 if (fs_info->running_transaction) { 161 if (fs_info->running_transaction) {
121 /* 162 /*
122 * someone started a transaction after we unlocked. Make sure 163 * someone started a transaction after we unlocked. Make sure
123 * to redo the trans_no_join checks above 164 * to redo the checks above
124 */ 165 */
125 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 166 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
126 goto loop; 167 goto loop;
@@ -131,17 +172,15 @@ loop:
131 } 172 }
132 173
133 atomic_set(&cur_trans->num_writers, 1); 174 atomic_set(&cur_trans->num_writers, 1);
134 cur_trans->num_joined = 0; 175 extwriter_counter_init(cur_trans, type);
135 init_waitqueue_head(&cur_trans->writer_wait); 176 init_waitqueue_head(&cur_trans->writer_wait);
136 init_waitqueue_head(&cur_trans->commit_wait); 177 init_waitqueue_head(&cur_trans->commit_wait);
137 cur_trans->in_commit = 0; 178 cur_trans->state = TRANS_STATE_RUNNING;
138 cur_trans->blocked = 0;
139 /* 179 /*
140 * One for this trans handle, one so it will live on until we 180 * One for this trans handle, one so it will live on until we
141 * commit the transaction. 181 * commit the transaction.
142 */ 182 */
143 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
144 cur_trans->commit_done = 0;
145 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
146 185
147 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
164 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
165 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
166 205
167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); 207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0); 208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
172 210
173 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 211 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations); 212 INIT_LIST_HEAD(&cur_trans->ordered_operations);
213 INIT_LIST_HEAD(&cur_trans->pending_chunks);
175 list_add_tail(&cur_trans->list, &fs_info->trans_list); 214 list_add_tail(&cur_trans->list, &fs_info->trans_list);
176 extent_io_tree_init(&cur_trans->dirty_pages, 215 extent_io_tree_init(&cur_trans->dirty_pages,
177 fs_info->btree_inode->i_mapping); 216 fs_info->btree_inode->i_mapping);
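
pending_chunks, initialized here, connects two other pieces of this patch: __btrfs_alloc_chunk() in volumes.c links each freshly allocated chunk's extent_map into the list (taking an extra reference), and put_transaction() above frees whatever is still linked when the transaction goes away. Pieced together as a lifecycle sketch:

        /* at chunk allocation (volumes.c, this patch) */
        list_add_tail(&em->list, &trans->transaction->pending_chunks);
        atomic_inc(&em->refs);

        /* at transaction teardown (put_transaction(), this patch) */
        while (!list_empty(&transaction->pending_chunks)) {
                em = list_first_entry(&transaction->pending_chunks,
                                      struct extent_map, list);
                list_del_init(&em->list);
                free_extent_map(em);
        }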
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
269 return 0; 308 return 0;
270} 309}
271 310
311static inline int is_transaction_blocked(struct btrfs_transaction *trans)
312{
313 return (trans->state >= TRANS_STATE_BLOCKED &&
314 trans->state < TRANS_STATE_UNBLOCKED &&
315 !trans->aborted);
316}
317
272/* wait for commit against the current transaction to become unblocked 318/* wait for commit against the current transaction to become unblocked
273 * when this is done, it is safe to start a new transaction, but the current 319 * when this is done, it is safe to start a new transaction, but the current
274 * transaction might not be fully on disk. 320 * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
279 325
280 spin_lock(&root->fs_info->trans_lock); 326 spin_lock(&root->fs_info->trans_lock);
281 cur_trans = root->fs_info->running_transaction; 327 cur_trans = root->fs_info->running_transaction;
282 if (cur_trans && cur_trans->blocked) { 328 if (cur_trans && is_transaction_blocked(cur_trans)) {
283 atomic_inc(&cur_trans->use_count); 329 atomic_inc(&cur_trans->use_count);
284 spin_unlock(&root->fs_info->trans_lock); 330 spin_unlock(&root->fs_info->trans_lock);
285 331
286 wait_event(root->fs_info->transaction_wait, 332 wait_event(root->fs_info->transaction_wait,
287 !cur_trans->blocked); 333 cur_trans->state >= TRANS_STATE_UNBLOCKED ||
334 cur_trans->aborted);
288 put_transaction(cur_trans); 335 put_transaction(cur_trans);
289 } else { 336 } else {
290 spin_unlock(&root->fs_info->trans_lock); 337 spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
307} 354}
308 355
309static struct btrfs_trans_handle * 356static struct btrfs_trans_handle *
310start_transaction(struct btrfs_root *root, u64 num_items, int type, 357start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
311 enum btrfs_reserve_flush_enum flush) 358 enum btrfs_reserve_flush_enum flush)
312{ 359{
313 struct btrfs_trans_handle *h; 360 struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
320 return ERR_PTR(-EROFS); 367 return ERR_PTR(-EROFS);
321 368
322 if (current->journal_info) { 369 if (current->journal_info) {
323 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 370 WARN_ON(type & TRANS_EXTWRITERS);
324 h = current->journal_info; 371 h = current->journal_info;
325 h->use_count++; 372 h->use_count++;
326 WARN_ON(h->use_count > 2); 373 WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
366 * If we are ATTACH, it means we just want to catch the current 413 * If we are ATTACH, it means we just want to catch the current
367 * transaction and commit it, so we needn't do sb_start_intwrite(). 414 * transaction and commit it, so we needn't do sb_start_intwrite().
368 */ 415 */
369 if (type < TRANS_JOIN_NOLOCK) 416 if (type & __TRANS_FREEZABLE)
370 sb_start_intwrite(root->fs_info->sb); 417 sb_start_intwrite(root->fs_info->sb);
371 418
372 if (may_wait_transaction(root, type)) 419 if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
408 INIT_LIST_HEAD(&h->new_bgs); 455 INIT_LIST_HEAD(&h->new_bgs);
409 456
410 smp_mb(); 457 smp_mb();
411 if (cur_trans->blocked && may_wait_transaction(root, type)) { 458 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
459 may_wait_transaction(root, type)) {
412 btrfs_commit_transaction(h, root); 460 btrfs_commit_transaction(h, root);
413 goto again; 461 goto again;
414 } 462 }
@@ -429,7 +477,7 @@ got_it:
429 return h; 477 return h;
430 478
431join_fail: 479join_fail:
432 if (type < TRANS_JOIN_NOLOCK) 480 if (type & __TRANS_FREEZABLE)
433 sb_end_intwrite(root->fs_info->sb); 481 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h); 482 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail: 483alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
490} 538}
491 539
492/* 540/*
493 * btrfs_attach_transaction() - catch the running transaction 541 * btrfs_attach_transaction_barrier() - catch the running transaction
494 * 542 *
495 * It is similar to the above function; the difference is that this one 543
496 * will wait for all the inactive transactions until they fully 544 * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
512static noinline void wait_for_commit(struct btrfs_root *root, 560static noinline void wait_for_commit(struct btrfs_root *root,
513 struct btrfs_transaction *commit) 561 struct btrfs_transaction *commit)
514{ 562{
515 wait_event(commit->commit_wait, commit->commit_done); 563 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
516} 564}
517 565
518int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 566int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
548 spin_lock(&root->fs_info->trans_lock); 596 spin_lock(&root->fs_info->trans_lock);
549 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 597 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
550 list) { 598 list) {
551 if (t->in_commit) { 599 if (t->state >= TRANS_STATE_COMMIT_START) {
552 if (t->commit_done) 600 if (t->state == TRANS_STATE_COMPLETED)
553 break; 601 break;
554 cur_trans = t; 602 cur_trans = t;
555 atomic_inc(&cur_trans->use_count); 603 atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
576static int should_end_transaction(struct btrfs_trans_handle *trans, 624static int should_end_transaction(struct btrfs_trans_handle *trans,
577 struct btrfs_root *root) 625 struct btrfs_root *root)
578{ 626{
579 int ret; 627 if (root->fs_info->global_block_rsv.space_info->full &&
628 btrfs_should_throttle_delayed_refs(trans, root))
629 return 1;
580 630
581 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 631 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
582 return ret ? 1 : 0;
583} 632}
584 633
585int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 634int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
590 int err; 639 int err;
591 640
592 smp_mb(); 641 smp_mb();
593 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 642 if (cur_trans->state >= TRANS_STATE_BLOCKED ||
643 cur_trans->delayed_refs.flushing)
594 return 1; 644 return 1;
595 645
596 updates = trans->delayed_ref_updates; 646 updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
609{ 659{
610 struct btrfs_transaction *cur_trans = trans->transaction; 660 struct btrfs_transaction *cur_trans = trans->transaction;
611 struct btrfs_fs_info *info = root->fs_info; 661 struct btrfs_fs_info *info = root->fs_info;
612 int count = 0; 662 unsigned long cur = trans->delayed_ref_updates;
613 int lock = (trans->type != TRANS_JOIN_NOLOCK); 663 int lock = (trans->type != TRANS_JOIN_NOLOCK);
614 int err = 0; 664 int err = 0;
615 665
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
638 if (!list_empty(&trans->new_bgs)) 688 if (!list_empty(&trans->new_bgs))
639 btrfs_create_pending_block_groups(trans, root); 689 btrfs_create_pending_block_groups(trans, root);
640 690
641 while (count < 1) { 691 trans->delayed_ref_updates = 0;
642 unsigned long cur = trans->delayed_ref_updates; 692 if (btrfs_should_throttle_delayed_refs(trans, root)) {
693 cur = max_t(unsigned long, cur, 1);
643 trans->delayed_ref_updates = 0; 694 trans->delayed_ref_updates = 0;
644 if (cur && 695 btrfs_run_delayed_refs(trans, root, cur);
645 trans->transaction->delayed_refs.num_heads_ready > 64) {
646 trans->delayed_ref_updates = 0;
647 btrfs_run_delayed_refs(trans, root, cur);
648 } else {
649 break;
650 }
651 count++;
652 } 696 }
653 697
654 btrfs_trans_release_metadata(trans, root); 698 btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
658 btrfs_create_pending_block_groups(trans, root); 702 btrfs_create_pending_block_groups(trans, root);
659 703
660 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 704 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
661 should_end_transaction(trans, root)) { 705 should_end_transaction(trans, root) &&
662 trans->transaction->blocked = 1; 706 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
663 smp_wmb(); 707 spin_lock(&info->trans_lock);
708 if (cur_trans->state == TRANS_STATE_RUNNING)
709 cur_trans->state = TRANS_STATE_BLOCKED;
710 spin_unlock(&info->trans_lock);
664 } 711 }
665 712
666 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 713 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
667 if (throttle) { 714 if (throttle) {
668 /* 715 /*
669 * We may race with somebody else here so end up having 716 * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
677 } 724 }
678 } 725 }
679 726
680 if (trans->type < TRANS_JOIN_NOLOCK) 727 if (trans->type & __TRANS_FREEZABLE)
681 sb_end_intwrite(root->fs_info->sb); 728 sb_end_intwrite(root->fs_info->sb);
682 729
683 WARN_ON(cur_trans != info->running_transaction); 730 WARN_ON(cur_trans != info->running_transaction);
684 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 731 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
685 atomic_dec(&cur_trans->num_writers); 732 atomic_dec(&cur_trans->num_writers);
733 extwriter_counter_dec(cur_trans, trans->type);
686 734
687 smp_mb(); 735 smp_mb();
688 if (waitqueue_active(&cur_trans->writer_wait)) 736 if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
736 struct extent_state *cached_state = NULL; 784 struct extent_state *cached_state = NULL;
737 u64 start = 0; 785 u64 start = 0;
738 u64 end; 786 u64 end;
739 struct blk_plug plug;
740 787
741 blk_start_plug(&plug);
742 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 788 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
743 mark, &cached_state)) { 789 mark, &cached_state)) {
744 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 790 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
752 } 798 }
753 if (err) 799 if (err)
754 werr = err; 800 werr = err;
755 blk_finish_plug(&plug);
756 return werr; 801 return werr;
757} 802}
758 803
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
797{ 842{
798 int ret; 843 int ret;
799 int ret2; 844 int ret2;
845 struct blk_plug plug;
800 846
847 blk_start_plug(&plug);
801 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 848 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
849 blk_finish_plug(&plug);
802 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 850 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
803 851
804 if (ret) 852 if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
1318 1366
1319int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1367int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
1320{ 1368{
1369 struct btrfs_transaction *trans;
1321 int ret = 0; 1370 int ret = 0;
1371
1322 spin_lock(&info->trans_lock); 1372 spin_lock(&info->trans_lock);
1323 if (info->running_transaction) 1373 trans = info->running_transaction;
1324 ret = info->running_transaction->in_commit; 1374 if (trans)
1375 ret = (trans->state >= TRANS_STATE_COMMIT_START);
1325 spin_unlock(&info->trans_lock); 1376 spin_unlock(&info->trans_lock);
1326 return ret; 1377 return ret;
1327} 1378}
1328 1379
1329int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1380int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1330{ 1381{
1382 struct btrfs_transaction *trans;
1331 int ret = 0; 1383 int ret = 0;
1384
1332 spin_lock(&info->trans_lock); 1385 spin_lock(&info->trans_lock);
1333 if (info->running_transaction) 1386 trans = info->running_transaction;
1334 ret = info->running_transaction->blocked; 1387 if (trans)
1388 ret = is_transaction_blocked(trans);
1335 spin_unlock(&info->trans_lock); 1389 spin_unlock(&info->trans_lock);
1336 return ret; 1390 return ret;
1337} 1391}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1343static void wait_current_trans_commit_start(struct btrfs_root *root, 1397static void wait_current_trans_commit_start(struct btrfs_root *root,
1344 struct btrfs_transaction *trans) 1398 struct btrfs_transaction *trans)
1345{ 1399{
1346 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); 1400 wait_event(root->fs_info->transaction_blocked_wait,
1401 trans->state >= TRANS_STATE_COMMIT_START ||
1402 trans->aborted);
1347} 1403}
1348 1404
1349/* 1405/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1354 struct btrfs_transaction *trans) 1410 struct btrfs_transaction *trans)
1355{ 1411{
1356 wait_event(root->fs_info->transaction_wait, 1412 wait_event(root->fs_info->transaction_wait,
1357 trans->commit_done || (trans->in_commit && !trans->blocked)); 1413 trans->state >= TRANS_STATE_UNBLOCKED ||
1414 trans->aborted);
1358} 1415}
1359 1416
1360/* 1417/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1450 1507
1451 spin_lock(&root->fs_info->trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1452 1509
1453 if (list_empty(&cur_trans->list)) { 1510 /*
1454 spin_unlock(&root->fs_info->trans_lock); 1511 * If the transaction is removed from the list, it means this
1455 btrfs_end_transaction(trans, root); 1512 * transaction has been committed successfully, so the cleanup
1456 return; 1513 * function must never be called for it.
1457 } 1514 */
1515 BUG_ON(list_empty(&cur_trans->list));
1458 1516
1459 list_del_init(&cur_trans->list); 1517 list_del_init(&cur_trans->list);
1460 if (cur_trans == root->fs_info->running_transaction) { 1518 if (cur_trans == root->fs_info->running_transaction) {
1461 root->fs_info->trans_no_join = 1; 1519 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1462 spin_unlock(&root->fs_info->trans_lock); 1520 spin_unlock(&root->fs_info->trans_lock);
1463 wait_event(cur_trans->writer_wait, 1521 wait_event(cur_trans->writer_wait,
1464 atomic_read(&cur_trans->num_writers) == 1); 1522 atomic_read(&cur_trans->num_writers) == 1);
1465 1523
1466 spin_lock(&root->fs_info->trans_lock); 1524 spin_lock(&root->fs_info->trans_lock);
1467 root->fs_info->running_transaction = NULL;
1468 } 1525 }
1469 spin_unlock(&root->fs_info->trans_lock); 1526 spin_unlock(&root->fs_info->trans_lock);
1470 1527
1471 btrfs_cleanup_one_transaction(trans->transaction, root); 1528 btrfs_cleanup_one_transaction(trans->transaction, root);
1472 1529
1530 spin_lock(&root->fs_info->trans_lock);
1531 if (cur_trans == root->fs_info->running_transaction)
1532 root->fs_info->running_transaction = NULL;
1533 spin_unlock(&root->fs_info->trans_lock);
1534
1473 put_transaction(cur_trans); 1535 put_transaction(cur_trans);
1474 put_transaction(cur_trans); 1536 put_transaction(cur_trans);
1475 1537
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1481 current->journal_info = NULL; 1543 current->journal_info = NULL;
1482 1544
1483 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1545 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1484
1485 spin_lock(&root->fs_info->trans_lock);
1486 root->fs_info->trans_no_join = 0;
1487 spin_unlock(&root->fs_info->trans_lock);
1488} 1546}
1489 1547
1490static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1548static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root) 1549 struct btrfs_root *root)
1492{ 1550{
1493 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1494 int snap_pending = 0;
1495 int ret; 1551 int ret;
1496 1552
1497 if (!flush_on_commit) {
1498 spin_lock(&root->fs_info->trans_lock);
1499 if (!list_empty(&trans->transaction->pending_snapshots))
1500 snap_pending = 1;
1501 spin_unlock(&root->fs_info->trans_lock);
1502 }
1503
1504 if (flush_on_commit || snap_pending) {
1505 ret = btrfs_start_delalloc_inodes(root, 1);
1506 if (ret)
1507 return ret;
1508 btrfs_wait_ordered_extents(root, 1);
1509 }
1510
1511 ret = btrfs_run_delayed_items(trans, root); 1553 ret = btrfs_run_delayed_items(trans, root);
1512 if (ret) 1554 if (ret)
1513 return ret; 1555 return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1531 return ret; 1573 return ret;
1532} 1574}
1533 1575
1534/* 1576static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1535 * btrfs_transaction state sequence: 1577{
1536 * in_commit = 0, blocked = 0 (initial) 1578 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1537 * in_commit = 1, blocked = 1 1579 return btrfs_start_all_delalloc_inodes(fs_info, 1);
1538 * blocked = 0 1580 return 0;
1539 * commit_done = 1 1581}
1540 */ 1582
1583static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1584{
1585 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1586 btrfs_wait_all_ordered_extents(fs_info, 1);
1587}
1588
1541int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1589int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root) 1590 struct btrfs_root *root)
1543{ 1591{
1544 unsigned long joined = 0;
1545 struct btrfs_transaction *cur_trans = trans->transaction; 1592 struct btrfs_transaction *cur_trans = trans->transaction;
1546 struct btrfs_transaction *prev_trans = NULL; 1593 struct btrfs_transaction *prev_trans = NULL;
1547 DEFINE_WAIT(wait);
1548 int ret; 1594 int ret;
1549 int should_grow = 0;
1550 unsigned long now = get_seconds();
1551 1595
1552 ret = btrfs_run_ordered_operations(trans, root, 0); 1596 ret = btrfs_run_ordered_operations(trans, root, 0);
1553 if (ret) { 1597 if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1586 * start sending their work down. 1630 * start sending their work down.
1587 */ 1631 */
1588 cur_trans->delayed_refs.flushing = 1; 1632 cur_trans->delayed_refs.flushing = 1;
1633 smp_wmb();
1589 1634
1590 if (!list_empty(&trans->new_bgs)) 1635 if (!list_empty(&trans->new_bgs))
1591 btrfs_create_pending_block_groups(trans, root); 1636 btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1596 return ret; 1641 return ret;
1597 } 1642 }
1598 1643
1599 spin_lock(&cur_trans->commit_lock); 1644 spin_lock(&root->fs_info->trans_lock);
1600 if (cur_trans->in_commit) { 1645 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1601 spin_unlock(&cur_trans->commit_lock); 1646 spin_unlock(&root->fs_info->trans_lock);
1602 atomic_inc(&cur_trans->use_count); 1647 atomic_inc(&cur_trans->use_count);
1603 ret = btrfs_end_transaction(trans, root); 1648 ret = btrfs_end_transaction(trans, root);
1604 1649
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1609 return ret; 1654 return ret;
1610 } 1655 }
1611 1656
1612 trans->transaction->in_commit = 1; 1657 cur_trans->state = TRANS_STATE_COMMIT_START;
1613 trans->transaction->blocked = 1;
1614 spin_unlock(&cur_trans->commit_lock);
1615 wake_up(&root->fs_info->transaction_blocked_wait); 1658 wake_up(&root->fs_info->transaction_blocked_wait);
1616 1659
1617 spin_lock(&root->fs_info->trans_lock);
1618 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1660 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1619 prev_trans = list_entry(cur_trans->list.prev, 1661 prev_trans = list_entry(cur_trans->list.prev,
1620 struct btrfs_transaction, list); 1662 struct btrfs_transaction, list);
1621 if (!prev_trans->commit_done) { 1663 if (prev_trans->state != TRANS_STATE_COMPLETED) {
1622 atomic_inc(&prev_trans->use_count); 1664 atomic_inc(&prev_trans->use_count);
1623 spin_unlock(&root->fs_info->trans_lock); 1665 spin_unlock(&root->fs_info->trans_lock);
1624 1666
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1632 spin_unlock(&root->fs_info->trans_lock); 1674 spin_unlock(&root->fs_info->trans_lock);
1633 } 1675 }
1634 1676
1635 if (!btrfs_test_opt(root, SSD) && 1677 extwriter_counter_dec(cur_trans, trans->type);
1636 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1637 should_grow = 1;
1638
1639 do {
1640 joined = cur_trans->num_joined;
1641
1642 WARN_ON(cur_trans != trans->transaction);
1643
1644 ret = btrfs_flush_all_pending_stuffs(trans, root);
1645 if (ret)
1646 goto cleanup_transaction;
1647 1678
1648 prepare_to_wait(&cur_trans->writer_wait, &wait, 1679 ret = btrfs_start_delalloc_flush(root->fs_info);
1649 TASK_UNINTERRUPTIBLE); 1680 if (ret)
1681 goto cleanup_transaction;
1650 1682
1651 if (atomic_read(&cur_trans->num_writers) > 1) 1683 ret = btrfs_flush_all_pending_stuffs(trans, root);
1652 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1684 if (ret)
1653 else if (should_grow) 1685 goto cleanup_transaction;
1654 schedule_timeout(1);
1655 1686
1656 finish_wait(&cur_trans->writer_wait, &wait); 1687 wait_event(cur_trans->writer_wait,
1657 } while (atomic_read(&cur_trans->num_writers) > 1 || 1688 extwriter_counter_read(cur_trans) == 0);
1658 (should_grow && cur_trans->num_joined != joined));
1659 1689
1690 /* some pending stuff might have been added after the previous flush. */
1660 ret = btrfs_flush_all_pending_stuffs(trans, root); 1691 ret = btrfs_flush_all_pending_stuffs(trans, root);
1661 if (ret) 1692 if (ret)
1662 goto cleanup_transaction; 1693 goto cleanup_transaction;
1663 1694
1695 btrfs_wait_delalloc_flush(root->fs_info);
1664 /* 1696 /*
1665 * Ok now we need to make sure to block out any other joins while we 1697 * Ok now we need to make sure to block out any other joins while we
1666 * commit the transaction. We could have started a join before setting 1698 * commit the transaction. We could have started a join before setting
1667 * no_join so make sure to wait for num_writers to == 1 again. 1699 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
1668 */ 1700 */
1669 spin_lock(&root->fs_info->trans_lock); 1701 spin_lock(&root->fs_info->trans_lock);
1670 root->fs_info->trans_no_join = 1; 1702 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1671 spin_unlock(&root->fs_info->trans_lock); 1703 spin_unlock(&root->fs_info->trans_lock);
1672 wait_event(cur_trans->writer_wait, 1704 wait_event(cur_trans->writer_wait,
1673 atomic_read(&cur_trans->num_writers) == 1); 1705 atomic_read(&cur_trans->num_writers) == 1);
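
Taken together, these hunks replace the old num_joined polling loop with a deterministic commit prologue. Condensed, using only names from this patch:

        ret = btrfs_start_delalloc_flush(root->fs_info);    /* 1: kick delalloc if FLUSHONCOMMIT */
        if (!ret)
                ret = btrfs_flush_all_pending_stuffs(trans, root);  /* 2: delayed items/refs */
        if (!ret) {
                wait_event(cur_trans->writer_wait,          /* 3: drain external writers */
                           extwriter_counter_read(cur_trans) == 0);
                ret = btrfs_flush_all_pending_stuffs(trans, root);  /* 4: catch late work */
        }
        if (!ret)
                btrfs_wait_delalloc_flush(root->fs_info);   /* 5: wait for the IO from 1 */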
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1794 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1826 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1795 sizeof(*root->fs_info->super_copy)); 1827 sizeof(*root->fs_info->super_copy));
1796 1828
1797 trans->transaction->blocked = 0;
1798 spin_lock(&root->fs_info->trans_lock); 1829 spin_lock(&root->fs_info->trans_lock);
1830 cur_trans->state = TRANS_STATE_UNBLOCKED;
1799 root->fs_info->running_transaction = NULL; 1831 root->fs_info->running_transaction = NULL;
1800 root->fs_info->trans_no_join = 0;
1801 spin_unlock(&root->fs_info->trans_lock); 1832 spin_unlock(&root->fs_info->trans_lock);
1802 mutex_unlock(&root->fs_info->reloc_mutex); 1833 mutex_unlock(&root->fs_info->reloc_mutex);
1803 1834
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1825 1856
1826 btrfs_finish_extent_commit(trans, root); 1857 btrfs_finish_extent_commit(trans, root);
1827 1858
1828 cur_trans->commit_done = 1;
1829
1830 root->fs_info->last_trans_committed = cur_trans->transid; 1859 root->fs_info->last_trans_committed = cur_trans->transid;
1831 1860 /*
1861 * We needn't acquire the lock here because no other task can
1862 * change the state at this point.
1863 */
1864 cur_trans->state = TRANS_STATE_COMPLETED;
1832 wake_up(&cur_trans->commit_wait); 1865 wake_up(&cur_trans->commit_wait);
1833 1866
1834 spin_lock(&root->fs_info->trans_lock); 1867 spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1838 put_transaction(cur_trans); 1871 put_transaction(cur_trans);
1839 put_transaction(cur_trans); 1872 put_transaction(cur_trans);
1840 1873
1841 if (trans->type < TRANS_JOIN_NOLOCK) 1874 if (trans->type & __TRANS_FREEZABLE)
1842 sb_end_intwrite(root->fs_info->sb); 1875 sb_end_intwrite(root->fs_info->sb);
1843 1876
1844 trace_btrfs_transaction_commit(root); 1877 trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1885 int ret; 1918 int ret;
1886 struct btrfs_fs_info *fs_info = root->fs_info; 1919 struct btrfs_fs_info *fs_info = root->fs_info;
1887 1920
1888 if (fs_info->sb->s_flags & MS_RDONLY) {
1889 pr_debug("btrfs: cleaner called for RO fs!\n");
1890 return 0;
1891 }
1892
1893 spin_lock(&fs_info->trans_lock); 1921 spin_lock(&fs_info->trans_lock);
1894 if (list_empty(&fs_info->dead_roots)) { 1922 if (list_empty(&fs_info->dead_roots)) {
1895 spin_unlock(&fs_info->trans_lock); 1923 spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h" 23#include "ctree.h"
24 24
25enum btrfs_trans_state {
26 TRANS_STATE_RUNNING = 0,
27 TRANS_STATE_BLOCKED = 1,
28 TRANS_STATE_COMMIT_START = 2,
29 TRANS_STATE_COMMIT_DOING = 3,
30 TRANS_STATE_UNBLOCKED = 4,
31 TRANS_STATE_COMPLETED = 5,
32 TRANS_STATE_MAX = 6,
33};
34
25struct btrfs_transaction { 35struct btrfs_transaction {
26 u64 transid; 36 u64 transid;
27 /* 37 /*
38 * total external writers(USERSPACE/START/ATTACH) in this
39 * transaction, it must be zero before the transaction is
40 * being committed
41 */
42 atomic_t num_extwriters;
43 /*
28 * total writers in this transaction, it must be zero before the 44 * total writers in this transaction, it must be zero before the
29 * transaction can end 45 * transaction can end
30 */ 46 */
31 atomic_t num_writers; 47 atomic_t num_writers;
32 atomic_t use_count; 48 atomic_t use_count;
33 49
34 unsigned long num_joined; 50 /* Protected by fs_info->trans_lock; take the lock to change it. */
35 51 enum btrfs_trans_state state;
36 spinlock_t commit_lock;
37 int in_commit;
38 int commit_done;
39 int blocked;
40 struct list_head list; 52 struct list_head list;
41 struct extent_io_tree dirty_pages; 53 struct extent_io_tree dirty_pages;
42 unsigned long start_time; 54 unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
44 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
46 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks;
47 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
48 int aborted; 61 int aborted;
49}; 62};
50 63
51enum btrfs_trans_type { 64#define __TRANS_FREEZABLE (1U << 0)
52 TRANS_START, 65
53 TRANS_JOIN, 66#define __TRANS_USERSPACE (1U << 8)
54 TRANS_USERSPACE, 67#define __TRANS_START (1U << 9)
55 TRANS_JOIN_NOLOCK, 68#define __TRANS_ATTACH (1U << 10)
56 TRANS_ATTACH, 69#define __TRANS_JOIN (1U << 11)
57}; 70#define __TRANS_JOIN_NOLOCK (1U << 12)
71
72#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
73#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
74#define TRANS_ATTACH (__TRANS_ATTACH)
75#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
76#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
77
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH)
58 80
59struct btrfs_trans_handle { 81struct btrfs_trans_handle {
60 u64 transid; 82 u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
70 short aborted; 92 short aborted;
71 short adding_csums; 93 short adding_csums;
72 bool allocating_chunk; 94 bool allocating_chunk;
73 enum btrfs_trans_type type; 95 unsigned int type;
74 /* 96 /*
75 * this root is only needed to validate that the root passed to 97 * this root is only needed to validate that the root passed to
76 * start_transaction is the same as the one passed to end_transaction. 98 * start_transaction is the same as the one passed to end_transaction.
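
With ->type now a bitmask, handle properties are tested with masks instead of the old ordered comparisons such as "type < TRANS_JOIN_NOLOCK". A minimal illustration assuming the definitions above; the helper names are hypothetical:

static inline bool trans_needs_intwrite(unsigned int type)
{
        /* replaces the old "type < TRANS_JOIN_NOLOCK" ordering trick */
        return type & __TRANS_FREEZABLE;
}

static inline bool trans_is_extwriter(unsigned int type)
{
        /* handle types counted in num_extwriters */
        return type & TRANS_EXTWRITERS;
}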
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h>
21#include <linux/list_sort.h> 22#include <linux/list_sort.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
279{ 280{
280 int ret = 0; 281 int ret = 0;
281 282
283 /*
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
286 */
287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
288 ret = btrfs_read_buffer(eb, gen);
289 if (ret)
290 return ret;
291 }
292
282 if (wc->pin) 293 if (wc->pin)
283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
284 eb->start, eb->len); 295 eb->start, eb->len);
285 296
286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
298 if (wc->pin && btrfs_header_level(eb) == 0)
299 ret = btrfs_exclude_logged_extents(log, eb);
287 if (wc->write) 300 if (wc->write)
288 btrfs_write_tree_block(eb); 301 btrfs_write_tree_block(eb);
289 if (wc->wait) 302 if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2016 eb, i, &key); 2029 eb, i, &key);
2017 if (ret) 2030 if (ret)
2018 break; 2031 break;
2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2032 } else if (key.type == BTRFS_INODE_REF_KEY ||
2020 ret = add_inode_ref(wc->trans, root, log, path, 2033 key.type == BTRFS_INODE_EXTREF_KEY) {
2021 eb, i, &key);
2022 if (ret && ret != -ENOENT)
2023 break;
2024 ret = 0;
2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
2026 ret = add_inode_ref(wc->trans, root, log, path, 2034 ret = add_inode_ref(wc->trans, root, log, path,
2027 eb, i, &key); 2035 eb, i, &key);
2028 if (ret && ret != -ENOENT) 2036 if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2358 struct btrfs_root *log = root->log_root; 2366 struct btrfs_root *log = root->log_root;
2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2367 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2360 unsigned long log_transid = 0; 2368 unsigned long log_transid = 0;
2369 struct blk_plug plug;
2361 2370
2362 mutex_lock(&root->log_mutex); 2371 mutex_lock(&root->log_mutex);
2363 log_transid = root->log_transid; 2372 log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2401 /* we start IO on all the marked extents here, but we don't actually 2410 /* we start IO on all the marked extents here, but we don't actually
2402 * wait for them until later. 2411 * wait for them until later.
2403 */ 2412 */
2413 blk_start_plug(&plug);
2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2414 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2405 if (ret) { 2415 if (ret) {
2416 blk_finish_plug(&plug);
2406 btrfs_abort_transaction(trans, root, ret); 2417 btrfs_abort_transaction(trans, root, ret);
2407 btrfs_free_logged_extents(log, log_transid); 2418 btrfs_free_logged_extents(log, log_transid);
2408 mutex_unlock(&root->log_mutex); 2419 mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2437 } 2448 }
2438 2449
2439 if (ret) { 2450 if (ret) {
2451 blk_finish_plug(&plug);
2440 if (ret != -ENOSPC) { 2452 if (ret != -ENOSPC) {
2441 btrfs_abort_transaction(trans, root, ret); 2453 btrfs_abort_transaction(trans, root, ret);
2442 mutex_unlock(&log_root_tree->log_mutex); 2454 mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2452 2464
2453 index2 = log_root_tree->log_transid % 2; 2465 index2 = log_root_tree->log_transid % 2;
2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2466 if (atomic_read(&log_root_tree->log_commit[index2])) {
2467 blk_finish_plug(&plug);
2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2468 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2456 wait_log_commit(trans, log_root_tree, 2469 wait_log_commit(trans, log_root_tree,
2457 log_root_tree->log_transid); 2470 log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2474 * check the full commit flag again 2487 * check the full commit flag again
2475 */ 2488 */
2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2489 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490 blk_finish_plug(&plug);
2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2491 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2478 btrfs_free_logged_extents(log, log_transid); 2492 btrfs_free_logged_extents(log, log_transid);
2479 mutex_unlock(&log_root_tree->log_mutex); 2493 mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2481 goto out_wake_log_root; 2495 goto out_wake_log_root;
2482 } 2496 }
2483 2497
2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2498 ret = btrfs_write_marked_extents(log_root_tree,
2485 &log_root_tree->dirty_log_pages, 2499 &log_root_tree->dirty_log_pages,
2486 EXTENT_DIRTY | EXTENT_NEW); 2500 EXTENT_DIRTY | EXTENT_NEW);
2501 blk_finish_plug(&plug);
2487 if (ret) { 2502 if (ret) {
2488 btrfs_abort_transaction(trans, root, ret); 2503 btrfs_abort_transaction(trans, root, ret);
2489 btrfs_free_logged_extents(log, log_transid); 2504 btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2491 goto out_wake_log_root; 2506 goto out_wake_log_root;
2492 } 2507 }
2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2508 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509 btrfs_wait_marked_extents(log_root_tree,
2510 &log_root_tree->dirty_log_pages,
2511 EXTENT_NEW | EXTENT_DIRTY);
2494 btrfs_wait_logged_extents(log, log_transid); 2512 btrfs_wait_logged_extents(log, log_transid);
2495 2513
2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2514 btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4034 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4017 break; 4035 break;
4018 4036
4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4037 log = btrfs_read_fs_root(log_root_tree, &found_key);
4020 &found_key);
4021 if (IS_ERR(log)) { 4038 if (IS_ERR(log)) {
4022 ret = PTR_ERR(log); 4039 ret = PTR_ERR(log);
4023 btrfs_error(fs_info, ret, 4040 btrfs_error(fs_info, ret,
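
The tree-log changes hoist the blk_plug out of btrfs_write_marked_extents() into the caller, so a single plug spans the writes of both the per-root log and the log root tree, and every early-exit path finishes the plug before sleeping. The resulting shape, condensed from the hunks above:

        struct blk_plug plug;

        blk_start_plug(&plug);
        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
        /* error paths call blk_finish_plug() before unlocking or waiting */
        ret = btrfs_write_marked_extents(log_root_tree,
                                         &log_root_tree->dirty_log_pages,
                                         EXTENT_DIRTY | EXTENT_NEW);
        blk_finish_plug(&plug);

        /* waiting starts only after the plug is released */
        btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
        btrfs_wait_marked_extents(log_root_tree,
                                  &log_root_tree->dirty_log_pages,
                                  EXTENT_NEW | EXTENT_DIRTY);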
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
205 u64 new_alloced = ulist->nodes_alloced + 128; 205 u64 new_alloced = ulist->nodes_alloced + 128;
206 struct ulist_node *new_nodes; 206 struct ulist_node *new_nodes;
207 void *old = NULL; 207 void *old = NULL;
208 int i;
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
208 212
209 /* 213 /*
210 * if nodes_alloced == ULIST_SIZE no memory has been allocated 214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
224 228
225 ulist->nodes = new_nodes; 229 ulist->nodes = new_nodes;
226 ulist->nodes_alloced = new_alloced; 230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc moves the array with memcpy, so every rb_node changes
234 * address; we erased the nodes before the move and must reinsert
235 * them here, or the tree would hold stale pointers and crash us.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
227 } 242 }
228 ulist->nodes[ulist->nnodes].val = val; 243 ulist->nodes[ulist->nnodes].val = val;
229 ulist->nodes[ulist->nnodes].aux = aux; 244 ulist->nodes[ulist->nnodes].aux = aux;
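
The ulist fix is a classic intrusive-structure pitfall: the rb_nodes live inside the array being krealloc()'d, so a reallocation moves every node out from under the tree. The pattern, condensed from the two hunks above (the krealloc() call itself sits in lines elided from this diff): erase before the move, reinsert afterwards:

        for (i = 0; i < ulist->nnodes; i++)
                rb_erase(&ulist->nodes[i].rb_node, &ulist->root);

        new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, GFP_NOFS);
        if (!new_nodes)
                return -ENOMEM;
        ulist->nodes = new_nodes;

        for (i = 0; i < ulist->nnodes; i++) {
                ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
                if (ret < 0)
                        return ret;
        }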
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
982 return ret; 982 return ret;
983} 983}
984 984
985static int contains_pending_extent(struct btrfs_trans_handle *trans,
986 struct btrfs_device *device,
987 u64 *start, u64 len)
988{
989 struct extent_map *em;
990 int ret = 0;
991
992 list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
993 struct map_lookup *map;
994 int i;
995
996 map = (struct map_lookup *)em->bdev;
997 for (i = 0; i < map->num_stripes; i++) {
998 if (map->stripes[i].dev != device)
999 continue;
1000 if (map->stripes[i].physical >= *start + len ||
1001 map->stripes[i].physical + em->orig_block_len <=
1002 *start)
1003 continue;
1004 *start = map->stripes[i].physical +
1005 em->orig_block_len;
1006 ret = 1;
1007 }
1008 }
1009
1010 return ret;
1011}
1012
1013
985/* 1014/*
986 * find_free_dev_extent - find free space in the specified device 1015 * find_free_dev_extent - find free space in the specified device
987 * @device: the device which we search the free space in 1016 * @device: the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
1002 * But if we don't find suitable free space, it is used to store the size of 1031 * But if we don't find suitable free space, it is used to store the size of
1003 * the max free space. 1032 * the max free space.
1004 */ 1033 */
1005int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1034int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035 struct btrfs_device *device, u64 num_bytes,
1006 u64 *start, u64 *len) 1036 u64 *start, u64 *len)
1007{ 1037{
1008 struct btrfs_key key; 1038 struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1026 */ 1056 */
1027 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1057 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1028 1058
1059 path = btrfs_alloc_path();
1060 if (!path)
1061 return -ENOMEM;
1062again:
1029 max_hole_start = search_start; 1063 max_hole_start = search_start;
1030 max_hole_size = 0; 1064 max_hole_size = 0;
1031 hole_size = 0; 1065 hole_size = 0;
1032 1066
1033 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1067 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1034 ret = -ENOSPC; 1068 ret = -ENOSPC;
1035 goto error; 1069 goto out;
1036 } 1070 }
1037 1071
1038 path = btrfs_alloc_path();
1039 if (!path) {
1040 ret = -ENOMEM;
1041 goto error;
1042 }
1043 path->reada = 2; 1072 path->reada = 2;
1073 path->search_commit_root = 1;
1074 path->skip_locking = 1;
1044 1075
1045 key.objectid = device->devid; 1076 key.objectid = device->devid;
1046 key.offset = search_start; 1077 key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1081 if (key.offset > search_start) { 1112 if (key.offset > search_start) {
1082 hole_size = key.offset - search_start; 1113 hole_size = key.offset - search_start;
1083 1114
1115 /*
1116 * Have to check before we set max_hole_start, otherwise
1117 * we could end up sending back this offset anyway.
1118 */
1119 if (contains_pending_extent(trans, device,
1120 &search_start,
1121 hole_size))
1122 hole_size = 0;
1123
1084 if (hole_size > max_hole_size) { 1124 if (hole_size > max_hole_size) {
1085 max_hole_start = search_start; 1125 max_hole_start = search_start;
1086 max_hole_size = hole_size; 1126 max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
1124 max_hole_size = hole_size; 1164 max_hole_size = hole_size;
1125 } 1165 }
1126 1166
1167 if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168 btrfs_release_path(path);
1169 goto again;
1170 }
1171
1127 /* See above. */ 1172 /* See above. */
1128 if (hole_size < num_bytes) 1173 if (hole_size < num_bytes)
1129 ret = -ENOSPC; 1174 ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
1132 1177
1133out: 1178out:
1134 btrfs_free_path(path); 1179 btrfs_free_path(path);
1135error:
1136 *start = max_hole_start; 1180 *start = max_hole_start;
1137 if (len) 1181 if (len)
1138 *len = max_hole_size; 1182 *len = max_hole_size;
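
find_free_dev_extent() now searches the commit root without locks (path->search_commit_root, path->skip_locking), which is exactly why uncommitted chunks must be checked by hand: contains_pending_extent() bumps search_start past any overlap, and a hit on the final hole restarts the scan. The control flow, condensed:

again:
        max_hole_start = search_start;
        max_hole_size = 0;
        /* ... walk the dev-extent items in the commit root, dropping
         *     any candidate hole that a pending chunk overlaps ... */
        if (contains_pending_extent(trans, device, &search_start, hole_size)) {
                btrfs_release_path(path);
                goto again;     /* rescan from the bumped offset */
        }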
@@ -1244,47 +1288,22 @@ out:
1244 return ret; 1288 return ret;
1245} 1289}
1246 1290
1247static noinline int find_next_chunk(struct btrfs_root *root, 1291static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1248 u64 objectid, u64 *offset)
1249{ 1292{
1250 struct btrfs_path *path; 1293 struct extent_map_tree *em_tree;
1251 int ret; 1294 struct extent_map *em;
1252 struct btrfs_key key; 1295 struct rb_node *n;
1253 struct btrfs_chunk *chunk; 1296 u64 ret = 0;
1254 struct btrfs_key found_key;
1255
1256 path = btrfs_alloc_path();
1257 if (!path)
1258 return -ENOMEM;
1259
1260 key.objectid = objectid;
1261 key.offset = (u64)-1;
1262 key.type = BTRFS_CHUNK_ITEM_KEY;
1263
1264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1265 if (ret < 0)
1266 goto error;
1267
1268 BUG_ON(ret == 0); /* Corruption */
1269 1297
1270 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1298 em_tree = &fs_info->mapping_tree.map_tree;
1271 if (ret) { 1299 read_lock(&em_tree->lock);
1272 *offset = 0; 1300 n = rb_last(&em_tree->map);
1273 } else { 1301 if (n) {
1274 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1302 em = rb_entry(n, struct extent_map, rb_node);
1275 path->slots[0]); 1303 ret = em->start + em->len;
1276 if (found_key.objectid != objectid)
1277 *offset = 0;
1278 else {
1279 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1280 struct btrfs_chunk);
1281 *offset = found_key.offset +
1282 btrfs_chunk_length(path->nodes[0], chunk);
1283 }
1284 } 1304 }
1285 ret = 0; 1305 read_unlock(&em_tree->lock);
1286error: 1306
1287 btrfs_free_path(path);
1288 return ret; 1307 return ret;
1289} 1308}
1290 1309
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1462 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1481 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1463 1482
1464 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1483 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1465 printk(KERN_ERR "btrfs: unable to go below four devices " 1484 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1466 "on raid10\n");
1467 ret = -EINVAL;
1468 goto out; 1485 goto out;
1469 } 1486 }
1470 1487
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1488 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1472 printk(KERN_ERR "btrfs: unable to go below two " 1489 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1473 "devices on raid1\n");
1474 ret = -EINVAL;
1475 goto out; 1490 goto out;
1476 } 1491 }
1477 1492
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1493 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1479 root->fs_info->fs_devices->rw_devices <= 2) { 1494 root->fs_info->fs_devices->rw_devices <= 2) {
1480 printk(KERN_ERR "btrfs: unable to go below two " 1495 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1481 "devices on raid5\n");
1482 ret = -EINVAL;
1483 goto out; 1496 goto out;
1484 } 1497 }
1485 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1498 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1486 root->fs_info->fs_devices->rw_devices <= 3) { 1499 root->fs_info->fs_devices->rw_devices <= 3) {
1487 printk(KERN_ERR "btrfs: unable to go below three " 1500 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1488 "devices on raid6\n");
1489 ret = -EINVAL;
1490 goto out; 1501 goto out;
1491 } 1502 }
1492 1503
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1512 bh = NULL; 1523 bh = NULL;
1513 disk_super = NULL; 1524 disk_super = NULL;
1514 if (!device) { 1525 if (!device) {
1515 printk(KERN_ERR "btrfs: no missing devices found to " 1526 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1516 "remove\n");
1517 goto out; 1527 goto out;
1518 } 1528 }
1519 } else { 1529 } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1535 } 1545 }
1536 1546
1537 if (device->is_tgtdev_for_dev_replace) { 1547 if (device->is_tgtdev_for_dev_replace) {
1538 pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1548 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1539 ret = -EINVAL;
1540 goto error_brelse; 1549 goto error_brelse;
1541 } 1550 }
1542 1551
1543 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1552 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1544 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1553 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1545 "device\n");
1546 ret = -EINVAL;
1547 goto error_brelse; 1554 goto error_brelse;
1548 } 1555 }
1549 1556
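
The btrfs_rm_device() hunks trade printk()+-EINVAL for dedicated BTRFS_ERROR_DEV_* codes, so the reason a removal was refused can reach userspace rather than only the kernel log. A hypothetical userspace-side decode (the ioctl plumbing that carries the code is outside this diff):

        switch (ret) {
        case BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET:
                fprintf(stderr, "cannot go below four devices on raid10\n");
                break;
        case BTRFS_ERROR_DEV_ONLY_WRITABLE:
                fprintf(stderr, "cannot remove the only writeable device\n");
                break;
        default:
                /* the remaining BTRFS_ERROR_DEV_* codes decode alike */
                break;
        }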
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3295 } 3302 }
3296 3303
3297 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3304 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3298 if (IS_ERR(tsk)) 3305 return PTR_RET(tsk);
3299 return PTR_ERR(tsk);
3300
3301 return 0;
3302} 3306}
3303 3307
3304int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3308int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3681} 3685}
3682 3686
3683static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3687static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3684 struct btrfs_root *extent_root, 3688 struct btrfs_root *extent_root, u64 start,
3685 struct map_lookup **map_ret, 3689 u64 type)
3686 u64 *num_bytes_out, u64 *stripe_size_out,
3687 u64 start, u64 type)
3688{ 3690{
3689 struct btrfs_fs_info *info = extent_root->fs_info; 3691 struct btrfs_fs_info *info = extent_root->fs_info;
3690 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3692 struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3791 if (total_avail == 0) 3793 if (total_avail == 0)
3792 continue; 3794 continue;
3793 3795
3794 ret = find_free_dev_extent(device, 3796 ret = find_free_dev_extent(trans, device,
3795 max_stripe_size * dev_stripes, 3797 max_stripe_size * dev_stripes,
3796 &dev_offset, &max_avail); 3798 &dev_offset, &max_avail);
3797 if (ret && ret != -ENOSPC) 3799 if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3903 map->type = type; 3905 map->type = type;
3904 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3905 3907
3906 *map_ret = map;
3907 num_bytes = stripe_size * data_stripes; 3908 num_bytes = stripe_size * data_stripes;
3908 3909
3909 *stripe_size_out = stripe_size;
3910 *num_bytes_out = num_bytes;
3911
3912 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3910 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913 3911
3914 em = alloc_extent_map(); 3912 em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3921 em->len = num_bytes; 3919 em->len = num_bytes;
3922 em->block_start = 0; 3920 em->block_start = 0;
3923 em->block_len = em->len; 3921 em->block_len = em->len;
3922 em->orig_block_len = stripe_size;
3924 3923
3925 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3924 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3926 write_lock(&em_tree->lock); 3925 write_lock(&em_tree->lock);
3927 ret = add_extent_mapping(em_tree, em, 0); 3926 ret = add_extent_mapping(em_tree, em, 0);
3927 if (!ret) {
3928 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3929 atomic_inc(&em->refs);
3930 }
3928 write_unlock(&em_tree->lock); 3931 write_unlock(&em_tree->lock);
3929 if (ret) { 3932 if (ret) {
3930 free_extent_map(em); 3933 free_extent_map(em);
3931 goto error; 3934 goto error;
3932 } 3935 }
3933 3936
3934 for (i = 0; i < map->num_stripes; ++i) {
3935 struct btrfs_device *device;
3936 u64 dev_offset;
3937
3938 device = map->stripes[i].dev;
3939 dev_offset = map->stripes[i].physical;
3940
3941 ret = btrfs_alloc_dev_extent(trans, device,
3942 info->chunk_root->root_key.objectid,
3943 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3944 start, dev_offset, stripe_size);
3945 if (ret)
3946 goto error_dev_extent;
3947 }
3948
3949 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3937 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3950 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3938 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3951 start, num_bytes); 3939 start, num_bytes);
3952 if (ret) { 3940 if (ret)
3953 i = map->num_stripes - 1; 3941 goto error_del_extent;
3954 goto error_dev_extent;
3955 }
3956 3942
3957 free_extent_map(em); 3943 free_extent_map(em);
3958 check_raid56_incompat_flag(extent_root->fs_info, type); 3944 check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3960 kfree(devices_info); 3946 kfree(devices_info);
3961 return 0; 3947 return 0;
3962 3948
3963error_dev_extent: 3949error_del_extent:
3964 for (; i >= 0; i--) {
3965 struct btrfs_device *device;
3966 int err;
3967
3968 device = map->stripes[i].dev;
3969 err = btrfs_free_dev_extent(trans, device, start);
3970 if (err) {
3971 btrfs_abort_transaction(trans, extent_root, err);
3972 break;
3973 }
3974 }
3975 write_lock(&em_tree->lock); 3950 write_lock(&em_tree->lock);
3976 remove_extent_mapping(em_tree, em); 3951 remove_extent_mapping(em_tree, em);
3977 write_unlock(&em_tree->lock); 3952 write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
3986 return ret; 3961 return ret;
3987} 3962}
3988 3963
3989static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3964int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, 3965 struct btrfs_root *extent_root,
3991 struct map_lookup *map, u64 chunk_offset, 3966 u64 chunk_offset, u64 chunk_size)
3992 u64 chunk_size, u64 stripe_size)
3993{ 3967{
3994 u64 dev_offset;
3995 struct btrfs_key key; 3968 struct btrfs_key key;
3996 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3969 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3997 struct btrfs_device *device; 3970 struct btrfs_device *device;
3998 struct btrfs_chunk *chunk; 3971 struct btrfs_chunk *chunk;
3999 struct btrfs_stripe *stripe; 3972 struct btrfs_stripe *stripe;
4000 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3973 struct extent_map_tree *em_tree;
4001 int index = 0; 3974 struct extent_map *em;
3975 struct map_lookup *map;
3976 size_t item_size;
3977 u64 dev_offset;
3978 u64 stripe_size;
3979 int i = 0;
4002 int ret; 3980 int ret;
4003 3981
3982 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3983 read_lock(&em_tree->lock);
3984 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3985 read_unlock(&em_tree->lock);
3986
3987 if (!em) {
3988 btrfs_crit(extent_root->fs_info, "unable to find logical "
3989 "%Lu len %Lu", chunk_offset, chunk_size);
3990 return -EINVAL;
3991 }
3992
3993 if (em->start != chunk_offset || em->len != chunk_size) {
3994 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3995 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3996 chunk_size, em->start, em->len);
3997 free_extent_map(em);
3998 return -EINVAL;
3999 }
4000
4001 map = (struct map_lookup *)em->bdev;
4002 item_size = btrfs_chunk_item_size(map->num_stripes);
4003 stripe_size = em->orig_block_len;
4004
4004 chunk = kzalloc(item_size, GFP_NOFS); 4005 chunk = kzalloc(item_size, GFP_NOFS);
4005 if (!chunk) 4006 if (!chunk) {
4006 return -ENOMEM; 4007 ret = -ENOMEM;
4008 goto out;
4009 }
4010
4011 for (i = 0; i < map->num_stripes; i++) {
4012 device = map->stripes[i].dev;
4013 dev_offset = map->stripes[i].physical;
4007 4014
4008 index = 0;
4009 while (index < map->num_stripes) {
4010 device = map->stripes[index].dev;
4011 device->bytes_used += stripe_size; 4015 device->bytes_used += stripe_size;
4012 ret = btrfs_update_device(trans, device); 4016 ret = btrfs_update_device(trans, device);
4013 if (ret) 4017 if (ret)
4014 goto out_free; 4018 goto out;
4015 index++; 4019 ret = btrfs_alloc_dev_extent(trans, device,
4020 chunk_root->root_key.objectid,
4021 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4022 chunk_offset, dev_offset,
4023 stripe_size);
4024 if (ret)
4025 goto out;
4016 } 4026 }
4017 4027
4018 spin_lock(&extent_root->fs_info->free_chunk_lock); 4028 spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4020 map->num_stripes); 4030 map->num_stripes);
4021 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4031 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4022 4032
4023 index = 0;
4024 stripe = &chunk->stripe; 4033 stripe = &chunk->stripe;
4025 while (index < map->num_stripes) { 4034 for (i = 0; i < map->num_stripes; i++) {
4026 device = map->stripes[index].dev; 4035 device = map->stripes[i].dev;
4027 dev_offset = map->stripes[index].physical; 4036 dev_offset = map->stripes[i].physical;
4028 4037
4029 btrfs_set_stack_stripe_devid(stripe, device->devid); 4038 btrfs_set_stack_stripe_devid(stripe, device->devid);
4030 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4039 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4031 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4040 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4032 stripe++; 4041 stripe++;
4033 index++;
4034 } 4042 }
4035 4043
4036 btrfs_set_stack_chunk_length(chunk, chunk_size); 4044 btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4048 key.offset = chunk_offset; 4056 key.offset = chunk_offset;
4049 4057
4050 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4058 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4051
4052 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4059 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4053 /* 4060 /*
4054 * TODO: Cleanup of inserted chunk root in case of 4061 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4058 item_size); 4065 item_size);
4059 } 4066 }
4060 4067
4061out_free: 4068out:
4062 kfree(chunk); 4069 kfree(chunk);
4070 free_extent_map(em);
4063 return ret; 4071 return ret;
4064} 4072}
4065 4073
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4074 struct btrfs_root *extent_root, u64 type) 4082 struct btrfs_root *extent_root, u64 type)
4075{ 4083{
4076 u64 chunk_offset; 4084 u64 chunk_offset;
4077 u64 chunk_size;
4078 u64 stripe_size;
4079 struct map_lookup *map;
4080 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4081 int ret;
4082
4083 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4084 &chunk_offset);
4085 if (ret)
4086 return ret;
4087 4085
4088 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4086 chunk_offset = find_next_chunk(extent_root->fs_info);
4089 &stripe_size, chunk_offset, type); 4087 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 if (ret)
4091 return ret;
4092
4093 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4094 chunk_size, stripe_size);
4095 if (ret)
4096 return ret;
4097 return 0;
4098} 4088}
4099 4089
4100static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4090static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4103{ 4093{
4104 u64 chunk_offset; 4094 u64 chunk_offset;
4105 u64 sys_chunk_offset; 4095 u64 sys_chunk_offset;
4106 u64 chunk_size;
4107 u64 sys_chunk_size;
4108 u64 stripe_size;
4109 u64 sys_stripe_size;
4110 u64 alloc_profile; 4096 u64 alloc_profile;
4111 struct map_lookup *map;
4112 struct map_lookup *sys_map;
4113 struct btrfs_fs_info *fs_info = root->fs_info; 4097 struct btrfs_fs_info *fs_info = root->fs_info;
4114 struct btrfs_root *extent_root = fs_info->extent_root; 4098 struct btrfs_root *extent_root = fs_info->extent_root;
4115 int ret; 4099 int ret;
4116 4100
4117 ret = find_next_chunk(fs_info->chunk_root, 4101 chunk_offset = find_next_chunk(fs_info);
4118 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
4119 if (ret)
4120 return ret;
4121
4122 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4102 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4123 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4103 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4124 &stripe_size, chunk_offset, alloc_profile); 4104 alloc_profile);
4125 if (ret) 4105 if (ret)
4126 return ret; 4106 return ret;
4127 4107
4128 sys_chunk_offset = chunk_offset + chunk_size; 4108 sys_chunk_offset = find_next_chunk(root->fs_info);
4129
4130 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4109 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4131 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4110 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4132 &sys_chunk_size, &sys_stripe_size, 4111 alloc_profile);
4133 sys_chunk_offset, alloc_profile);
4134 if (ret) { 4112 if (ret) {
4135 btrfs_abort_transaction(trans, root, ret); 4113 btrfs_abort_transaction(trans, root, ret);
4136 goto out; 4114 goto out;
4137 } 4115 }
4138 4116
4139 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4117 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4140 if (ret) {
4141 btrfs_abort_transaction(trans, root, ret);
4142 goto out;
4143 }
4144
4145 /*
4146 * Modifying chunk tree needs allocating new blocks from both
4147 * system block group and metadata block group. So we only can
4148 * do operations require modifying the chunk tree after both
4149 * block groups were created.
4150 */
4151 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4152 chunk_size, stripe_size);
4153 if (ret) {
4154 btrfs_abort_transaction(trans, root, ret);
4155 goto out;
4156 }
4157
4158 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4159 sys_chunk_offset, sys_chunk_size,
4160 sys_stripe_size);
4161 if (ret) 4118 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4119 btrfs_abort_transaction(trans, root, ret);
4163
4164out: 4120out:
4165
4166 return ret; 4121 return ret;
4167} 4122}
4168 4123
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4435 map = (struct map_lookup *)em->bdev; 4390 map = (struct map_lookup *)em->bdev;
4436 offset = logical - em->start; 4391 offset = logical - em->start;
4437 4392
4438 if (mirror_num > map->num_stripes)
4439 mirror_num = 0;
4440
4441 stripe_len = map->stripe_len; 4393 stripe_len = map->stripe_len;
4442 stripe_nr = offset; 4394 stripe_nr = offset;
4443 /* 4395 /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5367 return NULL; 5319 return NULL;
5368 list_add(&device->dev_list, 5320 list_add(&device->dev_list,
5369 &fs_devices->devices); 5321 &fs_devices->devices);
5370 device->dev_root = root->fs_info->dev_root;
5371 device->devid = devid; 5322 device->devid = devid;
5372 device->work.func = pending_bios_fn; 5323 device->work.func = pending_bios_fn;
5373 device->fs_devices = fs_devices; 5324 device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
5593 } 5544 }
5594 5545
5595 fill_device_from_item(leaf, dev_item, device); 5546 fill_device_from_item(leaf, dev_item, device);
5596 device->dev_root = root->fs_info->dev_root;
5597 device->in_fs_metadata = 1; 5547 device->in_fs_metadata = 1;
5598 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5548 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5599 device->fs_devices->total_rw_bytes += device->total_bytes; 5549 device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
5751 return ret; 5701 return ret;
5752} 5702}
5753 5703
5704void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5705{
5706 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5707 struct btrfs_device *device;
5708
5709 mutex_lock(&fs_devices->device_list_mutex);
5710 list_for_each_entry(device, &fs_devices->devices, dev_list)
5711 device->dev_root = fs_info->dev_root;
5712 mutex_unlock(&fs_devices->device_list_mutex);
5713}
5714
5754static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 5715static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5755{ 5716{
5756 int i; 5717 int i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
316int btrfs_pause_balance(struct btrfs_fs_info *fs_info); 316int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); 317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
319int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 319int find_free_dev_extent(struct btrfs_trans_handle *trans,
320 struct btrfs_device *device, u64 num_bytes,
320 u64 *start, u64 *max_avail); 321 u64 *start, u64 *max_avail);
321void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 322void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
322int btrfs_get_dev_stats(struct btrfs_root *root, 323int btrfs_get_dev_stats(struct btrfs_root *root,
323 struct btrfs_ioctl_get_dev_stats *stats); 324 struct btrfs_ioctl_get_dev_stats *stats);
325void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
324int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 326int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
325int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 327int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
326 struct btrfs_fs_info *fs_info); 328 struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
336unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 338unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
337 struct btrfs_mapping_tree *map_tree, 339 struct btrfs_mapping_tree *map_tree,
338 u64 logical); 340 u64 logical);
341int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
342 struct btrfs_root *extent_root,
343 u64 chunk_offset, u64 chunk_size);
339static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 344static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
340 int index) 345 int index)
341{ 346{