author    Linus Torvalds <torvalds@linux-foundation.org>  2014-06-11 12:22:21 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-06-11 12:22:21 -0400
commit    859862ddd2b6b8dee00498c015ab37f02474b442 (patch)
tree      b5597dd52b2e596401522bab802ca7993c1c20be /fs
parent    412dd3a6daf0cadce1b2d6a34fa3713f40255579 (diff)
parent    c7548af69d9ef71512eb52d8009521eba3e768fd (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "The biggest change here is Josef's rework of the btrfs quota
  accounting, which improves the in-memory tracking of delayed extent
  operations.

  I had been working on Btrfs stack usage for a while, mostly because
  it had become impossible to do long stress runs with slab, lockdep
  and pagealloc debugging turned on without blowing the stack.  Even
  though you upgraded us to a nice king sized stack, I kept most of the
  patches.

  We also have some very hard to find corruption fixes, an awesome
  sysfs use after free, and the usual assortment of optimizations,
  cleanups and other fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (80 commits)
  Btrfs: convert smp_mb__{before,after}_clear_bit
  Btrfs: fix scrub_print_warning to handle skinny metadata extents
  Btrfs: make fsync work after cloning into a file
  Btrfs: use right type to get real comparison
  Btrfs: don't check nodes for extent items
  Btrfs: don't release invalid page in btrfs_page_exists_in_range()
  Btrfs: make sure we retry if page is a retriable exception
  Btrfs: make sure we retry if we couldn't get the page
  btrfs: replace EINVAL with EOPNOTSUPP for dev_replace raid56
  trivial: fs/btrfs/ioctl.c: fix typo s/substract/subtract/
  Btrfs: fix leaf corruption after __btrfs_drop_extents
  Btrfs: ensure btrfs_prev_leaf doesn't miss 1 item
  Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled
  btrfs: free delayed node outside of root->inode_lock
  btrfs: replace EINVAL with ERANGE for resize when ULLONG_MAX
  Btrfs: fix transaction leak during fsync call
  btrfs: Avoid trucating page or punching hole in a already existed hole.
  Btrfs: update commit root on snapshot creation after orphan cleanup
  Btrfs: ioctl, don't re-lock extent range when not necessary
  Btrfs: avoid visiting all extent items when cloning a range
  ...
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Makefile             |    2
-rw-r--r--  fs/btrfs/acl.c                |    7
-rw-r--r--  fs/btrfs/backref.c            |   41
-rw-r--r--  fs/btrfs/backref.h            |    8
-rw-r--r--  fs/btrfs/btrfs_inode.h        |    2
-rw-r--r--  fs/btrfs/check-integrity.c    |    5
-rw-r--r--  fs/btrfs/compression.c        |    6
-rw-r--r--  fs/btrfs/ctree.c              |  104
-rw-r--r--  fs/btrfs/ctree.h              |  143
-rw-r--r--  fs/btrfs/delayed-inode.c      |    7
-rw-r--r--  fs/btrfs/delayed-ref.c        |   39
-rw-r--r--  fs/btrfs/delayed-ref.h        |   24
-rw-r--r--  fs/btrfs/dev-replace.c        |    2
-rw-r--r--  fs/btrfs/disk-io.c            |  113
-rw-r--r--  fs/btrfs/disk-io.h            |    1
-rw-r--r--  fs/btrfs/extent-tree.c        |  470
-rw-r--r--  fs/btrfs/extent_io.c          |  401
-rw-r--r--  fs/btrfs/extent_io.h          |    2
-rw-r--r--  fs/btrfs/file-item.c          |   80
-rw-r--r--  fs/btrfs/file.c               |  152
-rw-r--r--  fs/btrfs/free-space-cache.c   |  312
-rw-r--r--  fs/btrfs/inode-map.c          |    2
-rw-r--r--  fs/btrfs/inode.c              |  293
-rw-r--r--  fs/btrfs/ioctl.c              |  396
-rw-r--r--  fs/btrfs/lzo.c                |   14
-rw-r--r--  fs/btrfs/ordered-data.c       |    2
-rw-r--r--  fs/btrfs/qgroup.c             |  937
-rw-r--r--  fs/btrfs/qgroup.h             |  107
-rw-r--r--  fs/btrfs/relocation.c         |   21
-rw-r--r--  fs/btrfs/root-tree.c          |    2
-rw-r--r--  fs/btrfs/scrub.c              |    9
-rw-r--r--  fs/btrfs/send.c               |  290
-rw-r--r--  fs/btrfs/super.c              |   13
-rw-r--r--  fs/btrfs/sysfs.c              |   50
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c  |   97
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h  |    9
-rw-r--r--  fs/btrfs/tests/inode-tests.c  |   35
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c |  468
-rw-r--r--  fs/btrfs/transaction.c        |  113
-rw-r--r--  fs/btrfs/transaction.h        |    1
-rw-r--r--  fs/btrfs/tree-defrag.c        |    2
-rw-r--r--  fs/btrfs/tree-log.c           |   49
-rw-r--r--  fs/btrfs/tree-log.h           |   16
-rw-r--r--  fs/btrfs/volumes.c            |  122
-rw-r--r--  fs/btrfs/volumes.h            |    1
-rw-r--r--  fs/btrfs/zlib.c               |   26
46 files changed, 3693 insertions(+), 1303 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f341a98031d2..6d1d0b93b1aa 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
-	tests/extent-io-tests.o tests/inode-tests.o
+	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index ff9b3995d453..9a0124a95851 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 	const char *name;
 	char *value = NULL;
 
-	if (acl) {
-		ret = posix_acl_valid(acl);
-		if (ret < 0)
-			return ret;
-		ret = 0;
-	}
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 10db21fa0926..e25564bfcb46 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -900,7 +900,11 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
 	if (trans) {
+#endif
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
@@ -984,11 +988,12 @@ again:
 			goto out;
 		}
 		if (ref->count && ref->parent) {
-			if (extent_item_pos && !ref->inode_list) {
+			if (extent_item_pos && !ref->inode_list &&
+			    ref->level == 0) {
 				u32 bsz;
 				struct extent_buffer *eb;
 				bsz = btrfs_level_size(fs_info->extent_root,
-						       info_level);
+						       ref->level);
 				eb = read_tree_block(fs_info->extent_root,
 						     ref->parent, bsz, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
@@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
  * returns <0 on error
  */
 static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
-				   struct btrfs_extent_item *ei, u32 item_size,
-				   struct btrfs_extent_inline_ref **out_eiref,
-				   int *out_type)
+				   struct btrfs_key *key,
+				   struct btrfs_extent_item *ei, u32 item_size,
+				   struct btrfs_extent_inline_ref **out_eiref,
+				   int *out_type)
 {
 	unsigned long end;
 	u64 flags;
@@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
 		/* first call */
 		flags = btrfs_extent_flags(eb, ei);
 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-			info = (struct btrfs_tree_block_info *)(ei + 1);
-			*out_eiref =
-				(struct btrfs_extent_inline_ref *)(info + 1);
+			if (key->type == BTRFS_METADATA_ITEM_KEY) {
+				/* a skinny metadata extent */
+				*out_eiref =
+				    (struct btrfs_extent_inline_ref *)(ei + 1);
+			} else {
+				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+				info = (struct btrfs_tree_block_info *)(ei + 1);
+				*out_eiref =
+				    (struct btrfs_extent_inline_ref *)(info + 1);
+			}
 		} else {
 			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
 		}
 		*ptr = (unsigned long)*out_eiref;
-		if ((void *)*ptr >= (void *)ei + item_size)
+		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
 			return -ENOENT;
 	}
 
 	end = (unsigned long)ei + item_size;
-	*out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
 	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
 
 	*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
  * <0 on error.
  */
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
-			    struct btrfs_extent_item *ei, u32 item_size,
-			    u64 *out_root, u8 *out_level)
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
+			    u32 item_size, u64 *out_root, u8 *out_level)
 {
 	int ret;
 	int type;
@@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 		return 1;
 
 	while (1) {
-		ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
+		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
 					      &eiref, &type);
 		if (ret < 0)
 			return ret;
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index a910b27a8ad9..86fc20fec282 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
-			    struct btrfs_extent_item *ei, u32 item_size,
-			    u64 *out_root, u8 *out_level);
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
+			    u32 item_size, u64 *out_root, u8 *out_level);
 
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			  u64 extent_item_objectid,
@@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
-			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 time_seq, struct ulist **roots);
+			struct btrfs_fs_info *fs_info, u64 bytenr,
+			u64 time_seq, struct ulist **roots);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2256e9cceec5..4794923c410c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -284,4 +284,6 @@ static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
 				  &BTRFS_I(inode)->runtime_flags);
 }
 
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0e8388e72d8d..ce92ae30250f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error:
 				next_stack =
 				    btrfsic_stack_frame_alloc();
 				if (NULL == next_stack) {
+					sf->error = -1;
 					btrfsic_release_block_ctx(
 					    &sf->
 					    next_block_ctx);
@@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame:
 		    sf->next_block_ctx.datav[0];
 
 		next_stack = btrfsic_stack_frame_alloc();
-		if (NULL == next_stack)
+		if (NULL == next_stack) {
+			sf->error = -1;
 			goto one_stack_frame_backwards;
+		}
 
 		next_stack->i = -1;
 		next_stack->block = sf->next_block;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d43c544d3b68..92371c414228 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
 
 	workspace = find_workspace(type);
 	if (IS_ERR(workspace))
-		return -1;
+		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 							start, len, pages,
@@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
 
 	workspace = find_workspace(type);
 	if (IS_ERR(workspace))
-		return -ENOMEM;
+		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
 							   disk_start,
@@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 
 	workspace = find_workspace(type);
 	if (IS_ERR(workspace))
-		return -ENOMEM;
+		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
 						    dest_page, start_byte,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1bcfcdb23cf4..aeab453b8e24 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
 	spin_lock(&root->fs_info->trans_lock);
-	if (root->track_dirty && list_empty(&root->dirty_list)) {
+	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
+	    list_empty(&root->dirty_list)) {
 		list_add(&root->dirty_list,
 			 &root->fs_info->dirty_cowonly_roots);
 	}
@@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	int level;
 	struct btrfs_disk_key disk_key;
 
-	WARN_ON(root->ref_cows && trans->transid !=
-		root->fs_info->running_transaction->transid);
-	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->fs_info->running_transaction->transid);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
 	if (level == 0)
@@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
- */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
-	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
-	seq &= 0xffffffff00000000ull;
-	seq += 1ull << 32;
-	atomic64_set(&fs_info->tree_mod_seq, seq);
-	return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
+ * Pull a new tree mod seq number for our operation.
  */
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 {
 	return atomic64_inc_return(&fs_info->tree_mod_seq);
 }
 
 /*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
-	return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
-/*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
  * to record tree modifications, it should ensure to set elem->seq to zero
@@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem)
 {
-	u64 seq;
-
 	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	if (!elem->seq) {
-		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 	}
-	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 	tree_mod_log_write_unlock(fs_info);
 
-	return seq;
+	return elem->seq;
 }
 
420void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 389void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 
 	BUG_ON(!tm);
 
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 
 	tm_root = &fs_info->tree_mod_log;
 	new = &tm_root->rb_node;
@@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 	 * snapshot and the block was not allocated by tree relocation,
 	 * we know the block is not shared.
 	 */
-	if (root->ref_cows &&
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 	    buf != root->node && buf != root->commit_root &&
 	    (btrfs_header_generation(buf) <=
 	     btrfs_root_last_snapshot(&root->root_item) ||
 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 		return 1;
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-	if (root->ref_cows &&
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 		return 1;
 #endif
@@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	btrfs_assert_tree_locked(buf);
 
-	WARN_ON(root->ref_cows && trans->transid !=
-		root->fs_info->running_transaction->transid);
-	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->fs_info->running_transaction->transid);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
 
@@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			return ret;
 	}
 
-	if (root->ref_cows) {
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
 		if (ret)
 			return ret;
@@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root,
 				    struct extent_buffer *buf)
 {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+		return 0;
+#endif
 	/* ensure we can see the force_cow */
 	smp_rmb();
 
@@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
-	    !root->force_cow)
+	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
 		return 0;
 	return 1;
 }
@@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		return ret;
 	btrfs_item_key(path->nodes[0], &found_key, 0);
 	ret = comp_keys(&found_key, &key);
-	if (ret < 0)
+	/*
+	 * We might have had an item with the previous key in the tree right
+	 * before we released our path. And after we released our path, that
+	 * item might have been pushed to the first slot (0) of the leaf we
+	 * were holding due to a tree balance. Alternatively, an item with the
+	 * previous key can exist as the only element of a leaf (big fat item).
+	 * Therefore account for these 2 cases, so that our callers (like
+	 * btrfs_previous_item) don't miss an existing item with a key matching
+	 * the previous key we computed above.
+	 */
+	if (ret <= 0)
 		return 0;
 	return 1;
 }
@@ -5736,6 +5718,24 @@ again:
 		ret = 0;
 		goto done;
 	}
+	/*
+	 * So the above check misses one case:
+	 * - after releasing the path above, someone has removed the item that
+	 *   used to be at the very end of the block, and balance between leafs
+	 *   gets another one with bigger key.offset to replace it.
+	 *
+	 * This one should be returned as well, or we can get leaf corruption
+	 * later(esp. in __btrfs_drop_extents()).
+	 *
+	 * And a bit more explanation about this check,
+	 * with ret > 0, the key isn't found, the path points to the slot
+	 * where it should be inserted, so the path->slots[0] item must be the
+	 * bigger one.
+	 */
+	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+		ret = 0;
+		goto done;
+	}
 
 	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level]) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ba6b88528dc7..b7e2c1c1ef36 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
 #include <linux/btrfs.h>
+#include <linux/workqueue.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -756,6 +757,12 @@ struct btrfs_dir_item {
 
 #define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
 
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD		(1ULL << 48)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -840,7 +847,10 @@ struct btrfs_disk_balance_args {
 	/* BTRFS_BALANCE_ARGS_* */
 	__le64 flags;
 
-	__le64 unused[8];
+	/* BTRFS_BALANCE_ARGS_LIMIT value */
+	__le64 limit;
+
+	__le64 unused[7];
 } __attribute__ ((__packed__));
 
846/* 856/*
@@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item {
 	__le64 rsv_excl;
 } __attribute__ ((__packed__));
 
+/* For raid type sysfs entries */
+struct raid_kobject {
+	int raid_type;
+	struct kobject kobj;
+};
+
 struct btrfs_space_info {
 	spinlock_t lock;
 
@@ -1163,7 +1179,7 @@ struct btrfs_space_info {
 	wait_queue_head_t wait;
 
 	struct kobject kobj;
-	struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES];
+	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
 };
 
 #define BTRFS_BLOCK_RSV_GLOBAL		1
@@ -1313,6 +1329,8 @@ struct btrfs_stripe_hash_table {
 
 #define BTRFS_STRIPE_HASH_TABLE_BITS 11
 
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1534,6 +1552,9 @@ struct btrfs_fs_info {
 	 */
 	struct btrfs_workqueue *fixup_workers;
 	struct btrfs_workqueue *delayed_workers;
+
+	/* the extent workers do delayed refs on the extent allocation tree */
+	struct btrfs_workqueue *extent_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
@@ -1636,7 +1657,10 @@ struct btrfs_fs_info {
 
 	/* holds configuration and tracking. Protected by qgroup_lock */
 	struct rb_root qgroup_tree;
+	struct rb_root qgroup_op_tree;
 	spinlock_t qgroup_lock;
+	spinlock_t qgroup_op_lock;
+	atomic_t qgroup_op_seq;
 
 	/*
 	 * used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1688,6 +1712,9 @@ struct btrfs_fs_info {
 
 	struct semaphore uuid_tree_rescan_sem;
 	unsigned int update_uuid_tree_gen:1;
+
+	/* Used to reclaim the metadata space in the background. */
+	struct work_struct async_reclaim_work;
 };
1692 1719
1693struct btrfs_subvolume_writers { 1720struct btrfs_subvolume_writers {
@@ -1696,6 +1723,26 @@ struct btrfs_subvolume_writers {
 };
 
 /*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race is very small, and only the first time the root
+ * is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP	0
+#define BTRFS_ROOT_REF_COWS		1
+#define BTRFS_ROOT_TRACK_DIRTY		2
+#define BTRFS_ROOT_IN_RADIX		3
+#define BTRFS_ROOT_DUMMY_ROOT		4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED	5
+#define BTRFS_ROOT_DEFRAG_RUNNING	6
+#define BTRFS_ROOT_FORCE_COW		7
+#define BTRFS_ROOT_MULTI_LOG_TASKS	8
+
+/*
  * in ram representation of the tree. extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
@@ -1706,6 +1753,7 @@ struct btrfs_root {
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;
 
+	unsigned long state;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -1740,7 +1788,6 @@ struct btrfs_root {
 	/* Just be updated when the commit succeeds. */
 	int last_log_commit;
 	pid_t log_start_pid;
-	bool log_multiple_pids;
 
 	u64 objectid;
 	u64 last_trans;
@@ -1760,23 +1807,13 @@ struct btrfs_root {
 
 	u64 highest_objectid;
 
-	/* btrfs_record_root_in_trans is a multi-step process,
-	 * and it can race with the balancing code. But the
-	 * race is very small, and only the first time the root
-	 * is added to each transaction. So in_trans_setup
-	 * is used to tell us when more checks are required
-	 */
-	unsigned long in_trans_setup;
-	int ref_cows;
-	int track_dirty;
-	int in_radix;
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	int dummy_root;
+	u64 alloc_bytenr;
 #endif
+
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
-	int defrag_running;
 	char *name;
 
 	/* the dirty list is only used by non-reference counted roots */
@@ -1790,7 +1827,6 @@ struct btrfs_root {
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
-	int orphan_item_inserted;
 	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
@@ -1808,8 +1844,6 @@ struct btrfs_root {
 	 */
 	dev_t anon_dev;
 
-	int force_cow;
-
 	spinlock_t root_item_lock;
 	atomic_t refs;
 
@@ -2788,6 +2822,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
 }
 
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
 /* struct btrfs_root_backup */
 BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
 		   tree_root, 64);
@@ -2897,6 +2936,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
 	cpu->vend = le64_to_cpu(disk->vend);
 	cpu->target = le64_to_cpu(disk->target);
 	cpu->flags = le64_to_cpu(disk->flags);
+	cpu->limit = le64_to_cpu(disk->limit);
 }
 
 static inline void
@@ -2914,6 +2954,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
 	disk->vend = cpu_to_le64(cpu->vend);
 	disk->target = cpu_to_le64(cpu->target);
 	disk->flags = cpu_to_le64(cpu->flags);
+	disk->limit = cpu_to_le64(cpu->limit);
 }
 
 /* struct btrfs_super_block */
@@ -3236,6 +3277,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+				 unsigned long count, int wait);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
@@ -3275,9 +3318,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
 			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
 			 struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int for_cow);
+		  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int for_cow);
+		  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
@@ -3285,7 +3328,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int for_cow);
+		      u64 owner, u64 offset, int no_quota);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -3297,7 +3340,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset, int for_cow);
+			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
@@ -3385,7 +3428,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-
 int btrfs_start_nocow_write(struct btrfs_root *root);
 void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
@@ -3561,7 +3603,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem);
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			    struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
@@ -3708,6 +3749,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio, u64 file_start, int contig);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+				     const struct btrfs_path *path,
+				     struct btrfs_file_extent_item *fi,
+				     const bool new_inline,
+				     struct extent_map *em);
+
 /* inode.c */
 struct btrfs_delalloc_work {
 	struct inode *inode;
@@ -4069,52 +4116,6 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 			 u64 start, int err);
 
-/* qgroup.c */
-struct qgroup_update {
-	struct list_head list;
-	struct btrfs_delayed_ref_node *node;
-	struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid,
-			char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info, u64 qgroupid,
-		       struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_delayed_ref_node *node,
-			    struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info,
-			     struct btrfs_delayed_ref_node *node,
-			     struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
-		      struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
-			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
-			 struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -4131,6 +4132,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 /* Sanity test specific functions */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_destroy_inode(struct inode *inode);
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+			       u64 rfer, u64 excl);
 #endif
 
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 33e561a84013..da775bfdebc9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -149,8 +149,8 @@ again:
 	spin_lock(&root->inode_lock);
 	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
 	if (ret == -EEXIST) {
-		kmem_cache_free(delayed_node_cache, node);
 		spin_unlock(&root->inode_lock);
+		kmem_cache_free(delayed_node_cache, node);
 		radix_tree_preload_end();
 		goto again;
 	}
@@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node(
 	mutex_unlock(&delayed_node->mutex);
 
 	if (atomic_dec_and_test(&delayed_node->refs)) {
+		bool free = false;
 		struct btrfs_root *root = delayed_node->root;
 		spin_lock(&root->inode_lock);
 		if (atomic_read(&delayed_node->refs) == 0) {
 			radix_tree_delete(&root->delayed_nodes_tree,
 					  delayed_node->inode_id);
-			kmem_cache_free(delayed_node_cache, delayed_node);
+			free = true;
 		}
 		spin_unlock(&root->inode_lock);
+		if (free)
+			kmem_cache_free(delayed_node_cache, delayed_node);
 	}
 }
280 283
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 31299646024d..6d16bea94e1c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 		return -1;
 	if (ref1->type > ref2->type)
 		return 1;
+	if (ref1->no_quota > ref2->no_quota)
+		return 1;
+	if (ref1->no_quota < ref2->no_quota)
+		return -1;
 	/* merging of sequenced refs is not allowed */
 	if (compare_seq) {
 		if (ref1->seq < ref2->seq)
@@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, int level,
-		     int action, int for_cow)
+		     int action, int no_quota)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
@@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
 
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
 	delayed_refs = &trans->transaction->delayed_refs;
 
 	/* first set the basic ref node struct up */
@@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-
-	if (need_ref_seq(for_cow, ref_root))
-		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-		     u64 offset, int action, int for_cow)
+		     u64 offset, int action, int no_quota)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
@@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
+
 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
@@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-
-	if (need_ref_seq(for_cow, ref_root))
-		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow)
+			       int no_quota)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, level, action,
-			     for_cow);
+			     no_quota);
 	spin_unlock(&delayed_refs->lock);
-	if (need_ref_seq(for_cow, ref_root))
-		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
@@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow)
+			       int no_quota)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
 	BUG_ON(extent_op && !extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, owner, offset,
-			     action, for_cow);
+			     action, no_quota);
 	spin_unlock(&delayed_refs->lock);
-	if (need_ref_seq(for_cow, ref_root))
-		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 4ba9b93022ff..a764e2340d48 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {
 
 	unsigned int action:8;
 	unsigned int type:8;
+	unsigned int no_quota:1;
 	/* is this node still in the rbtree? */
 	unsigned int is_head:1;
 	unsigned int in_tree:1;
@@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow);
+			       int no_quota);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow);
+			       int no_quota);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
@@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			     u64 seq);
 
 /*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-	if (for_cow)
-		return 0;
-
-	if (rootid == BTRFS_FS_TREE_OBJECTID)
-		return 1;
-
-	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-		return 1;
-
-	return 0;
-}
-
-/*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
  */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9f2290509aca..2af6e66fe788 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,7 +313,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 
 	if (btrfs_fs_incompat(fs_info, RAID56)) {
 		btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	switch (args->start.cont_reading_from_srcdev_mode) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 983314932af3..8bb4aa19898f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -49,6 +49,7 @@
 #include "dev-replace.h"
 #include "raid56.h"
 #include "sysfs.h"
+#include "qgroup.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -1109,6 +1110,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize)
 {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+		return alloc_test_extent_buffer(root->fs_info, bytenr,
+						blocksize);
+#endif
 	return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
 }
 
@@ -1201,10 +1207,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
 	root->stripesize = stripesize;
-	root->ref_cows = 0;
-	root->track_dirty = 0;
-	root->in_radix = 0;
-	root->orphan_item_inserted = 0;
+	root->state = 0;
 	root->orphan_cleanup_state = 0;
 
 	root->objectid = objectid;
@@ -1265,7 +1268,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1265 else 1268 else
1266 root->defrag_trans_start = 0; 1269 root->defrag_trans_start = 0;
1267 init_completion(&root->kobj_unregister); 1270 init_completion(&root->kobj_unregister);
1268 root->defrag_running = 0;
1269 root->root_key.objectid = objectid; 1271 root->root_key.objectid = objectid;
1270 root->anon_dev = 0; 1272 root->anon_dev = 0;
1271 1273
@@ -1290,7 +1292,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
1290 if (!root) 1292 if (!root)
1291 return ERR_PTR(-ENOMEM); 1293 return ERR_PTR(-ENOMEM);
1292 __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); 1294 __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
1293 root->dummy_root = 1; 1295 set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
1296 root->alloc_bytenr = 0;
1294 1297
1295 return root; 1298 return root;
1296} 1299}
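
The single root->state word replaces the old one-int-per-flag fields and is manipulated with the kernel's atomic set_bit()/test_bit() bitops. A standalone sketch of the same pattern (bit numbers and the non-atomic helpers here are illustrative only; the real definitions live in ctree.h):

#include <stdio.h>

/* illustrative bit indices for the flags converted in this patch */
enum {
	ROOT_IN_RADIX = 0,
	ROOT_DUMMY_ROOT,
	ROOT_REF_COWS,
	ROOT_TRACK_DIRTY,
};

struct root {
	unsigned long state;	/* replaces four separate int fields */
};

/* the kernel's set_bit()/test_bit() are atomic; these are not */
static void set_state(int nr, unsigned long *addr)
{
	*addr |= 1UL << nr;
}

static int test_state(int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1UL;
}

int main(void)
{
	struct root root = { .state = 0 };

	set_state(ROOT_TRACK_DIRTY, &root.state);
	if (test_state(ROOT_TRACK_DIRTY, &root.state))
		printf("track_dirty set\n");
	if (!test_state(ROOT_REF_COWS, &root.state))
		printf("ref_cows clear\n");
	return 0;
}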
@@ -1341,8 +1344,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1341 btrfs_mark_buffer_dirty(leaf); 1344 btrfs_mark_buffer_dirty(leaf);
1342 1345
1343 root->commit_root = btrfs_root_node(root); 1346 root->commit_root = btrfs_root_node(root);
1344 root->track_dirty = 1; 1347 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1345
1346 1348
1347 root->root_item.flags = 0; 1349 root->root_item.flags = 0;
1348 root->root_item.byte_limit = 0; 1350 root->root_item.byte_limit = 0;
@@ -1371,6 +1373,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1371fail: 1373fail:
1372 if (leaf) { 1374 if (leaf) {
1373 btrfs_tree_unlock(leaf); 1375 btrfs_tree_unlock(leaf);
1376 free_extent_buffer(root->commit_root);
1374 free_extent_buffer(leaf); 1377 free_extent_buffer(leaf);
1375 } 1378 }
1376 kfree(root); 1379 kfree(root);
@@ -1396,13 +1399,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1396 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1399 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1397 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1400 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1398 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1401 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1402
1399 /* 1403 /*
1404 * DON'T set REF_COWS for log trees
1405 *
1400 * log trees do not get reference counted because they go away 1406 * log trees do not get reference counted because they go away
1401 * before a real commit is actually done. They do store pointers 1407 * before a real commit is actually done. They do store pointers
1402 * to file data extents, and those reference counts still get 1408 * to file data extents, and those reference counts still get
1403 * updated (along with back refs to the log tree). 1409 * updated (along with back refs to the log tree).
1404 */ 1410 */
1405 root->ref_cows = 0;
1406 1411
1407 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1412 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1408 BTRFS_TREE_LOG_OBJECTID, NULL, 1413 BTRFS_TREE_LOG_OBJECTID, NULL,
@@ -1536,7 +1541,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1536 return root; 1541 return root;
1537 1542
1538 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 1543 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1539 root->ref_cows = 1; 1544 set_bit(BTRFS_ROOT_REF_COWS, &root->state);
1540 btrfs_check_and_init_root_item(&root->root_item); 1545 btrfs_check_and_init_root_item(&root->root_item);
1541 } 1546 }
1542 1547
@@ -1606,7 +1611,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1606 (unsigned long)root->root_key.objectid, 1611 (unsigned long)root->root_key.objectid,
1607 root); 1612 root);
1608 if (ret == 0) 1613 if (ret == 0)
1609 root->in_radix = 1; 1614 set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1610 spin_unlock(&fs_info->fs_roots_radix_lock); 1615 spin_unlock(&fs_info->fs_roots_radix_lock);
1611 radix_tree_preload_end(); 1616 radix_tree_preload_end();
1612 1617
@@ -1662,7 +1667,7 @@ again:
1662 if (ret < 0) 1667 if (ret < 0)
1663 goto fail; 1668 goto fail;
1664 if (ret == 0) 1669 if (ret == 0)
1665 root->orphan_item_inserted = 1; 1670 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1666 1671
1667 ret = btrfs_insert_fs_root(fs_info, root); 1672 ret = btrfs_insert_fs_root(fs_info, root);
1668 if (ret) { 1673 if (ret) {
@@ -2064,6 +2069,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2064 btrfs_destroy_workqueue(fs_info->readahead_workers); 2069 btrfs_destroy_workqueue(fs_info->readahead_workers);
2065 btrfs_destroy_workqueue(fs_info->flush_workers); 2070 btrfs_destroy_workqueue(fs_info->flush_workers);
2066 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); 2071 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2072 btrfs_destroy_workqueue(fs_info->extent_workers);
2067} 2073}
2068 2074
2069static void free_root_extent_buffers(struct btrfs_root *root) 2075static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2090,7 +2096,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
2090 free_root_extent_buffers(info->chunk_root); 2096 free_root_extent_buffers(info->chunk_root);
2091} 2097}
2092 2098
2093static void del_fs_roots(struct btrfs_fs_info *fs_info) 2099void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2094{ 2100{
2095 int ret; 2101 int ret;
2096 struct btrfs_root *gang[8]; 2102 struct btrfs_root *gang[8];
@@ -2101,7 +2107,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2101 struct btrfs_root, root_list); 2107 struct btrfs_root, root_list);
2102 list_del(&gang[0]->root_list); 2108 list_del(&gang[0]->root_list);
2103 2109
2104 if (gang[0]->in_radix) { 2110 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
2105 btrfs_drop_and_free_fs_root(fs_info, gang[0]); 2111 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2106 } else { 2112 } else {
2107 free_extent_buffer(gang[0]->node); 2113 free_extent_buffer(gang[0]->node);
@@ -2221,6 +2227,7 @@ int open_ctree(struct super_block *sb,
2221 spin_lock_init(&fs_info->free_chunk_lock); 2227 spin_lock_init(&fs_info->free_chunk_lock);
2222 spin_lock_init(&fs_info->tree_mod_seq_lock); 2228 spin_lock_init(&fs_info->tree_mod_seq_lock);
2223 spin_lock_init(&fs_info->super_lock); 2229 spin_lock_init(&fs_info->super_lock);
2230 spin_lock_init(&fs_info->qgroup_op_lock);
2224 spin_lock_init(&fs_info->buffer_lock); 2231 spin_lock_init(&fs_info->buffer_lock);
2225 rwlock_init(&fs_info->tree_mod_log_lock); 2232 rwlock_init(&fs_info->tree_mod_log_lock);
2226 mutex_init(&fs_info->reloc_mutex); 2233 mutex_init(&fs_info->reloc_mutex);
@@ -2246,6 +2253,7 @@ int open_ctree(struct super_block *sb,
2246 atomic_set(&fs_info->async_submit_draining, 0); 2253 atomic_set(&fs_info->async_submit_draining, 0);
2247 atomic_set(&fs_info->nr_async_bios, 0); 2254 atomic_set(&fs_info->nr_async_bios, 0);
2248 atomic_set(&fs_info->defrag_running, 0); 2255 atomic_set(&fs_info->defrag_running, 0);
2256 atomic_set(&fs_info->qgroup_op_seq, 0);
2249 atomic64_set(&fs_info->tree_mod_seq, 0); 2257 atomic64_set(&fs_info->tree_mod_seq, 0);
2250 fs_info->sb = sb; 2258 fs_info->sb = sb;
2251 fs_info->max_inline = 8192 * 1024; 2259 fs_info->max_inline = 8192 * 1024;
@@ -2291,6 +2299,7 @@ int open_ctree(struct super_block *sb,
2291 atomic_set(&fs_info->balance_cancel_req, 0); 2299 atomic_set(&fs_info->balance_cancel_req, 0);
2292 fs_info->balance_ctl = NULL; 2300 fs_info->balance_ctl = NULL;
2293 init_waitqueue_head(&fs_info->balance_wait_q); 2301 init_waitqueue_head(&fs_info->balance_wait_q);
2302 btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2294 2303
2295 sb->s_blocksize = 4096; 2304 sb->s_blocksize = 4096;
2296 sb->s_blocksize_bits = blksize_bits(4096); 2305 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2354,6 +2363,7 @@ int open_ctree(struct super_block *sb,
2354 spin_lock_init(&fs_info->qgroup_lock); 2363 spin_lock_init(&fs_info->qgroup_lock);
2355 mutex_init(&fs_info->qgroup_ioctl_lock); 2364 mutex_init(&fs_info->qgroup_ioctl_lock);
2356 fs_info->qgroup_tree = RB_ROOT; 2365 fs_info->qgroup_tree = RB_ROOT;
2366 fs_info->qgroup_op_tree = RB_ROOT;
2357 INIT_LIST_HEAD(&fs_info->dirty_qgroups); 2367 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2358 fs_info->qgroup_seq = 1; 2368 fs_info->qgroup_seq = 1;
2359 fs_info->quota_enabled = 0; 2369 fs_info->quota_enabled = 0;
@@ -2577,6 +2587,10 @@ int open_ctree(struct super_block *sb,
2577 btrfs_alloc_workqueue("readahead", flags, max_active, 2); 2587 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2578 fs_info->qgroup_rescan_workers = 2588 fs_info->qgroup_rescan_workers =
2579 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); 2589 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2590 fs_info->extent_workers =
2591 btrfs_alloc_workqueue("extent-refs", flags,
2592 min_t(u64, fs_devices->num_devices,
2593 max_active), 8);
2580 2594
2581 if (!(fs_info->workers && fs_info->delalloc_workers && 2595 if (!(fs_info->workers && fs_info->delalloc_workers &&
2582 fs_info->submit_workers && fs_info->flush_workers && 2596 fs_info->submit_workers && fs_info->flush_workers &&
@@ -2586,6 +2600,7 @@ int open_ctree(struct super_block *sb,
2586 fs_info->endio_freespace_worker && fs_info->rmw_workers && 2600 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2587 fs_info->caching_workers && fs_info->readahead_workers && 2601 fs_info->caching_workers && fs_info->readahead_workers &&
2588 fs_info->fixup_workers && fs_info->delayed_workers && 2602 fs_info->fixup_workers && fs_info->delayed_workers &&
2603 fs_info->fixup_workers && fs_info->extent_workers &&
2589 fs_info->qgroup_rescan_workers)) { 2604 fs_info->qgroup_rescan_workers)) {
2590 err = -ENOMEM; 2605 err = -ENOMEM;
2591 goto fail_sb_buffer; 2606 goto fail_sb_buffer;
@@ -2693,7 +2708,7 @@ retry_root_backup:
2693 ret = PTR_ERR(extent_root); 2708 ret = PTR_ERR(extent_root);
2694 goto recovery_tree_root; 2709 goto recovery_tree_root;
2695 } 2710 }
2696 extent_root->track_dirty = 1; 2711 set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
2697 fs_info->extent_root = extent_root; 2712 fs_info->extent_root = extent_root;
2698 2713
2699 location.objectid = BTRFS_DEV_TREE_OBJECTID; 2714 location.objectid = BTRFS_DEV_TREE_OBJECTID;
@@ -2702,7 +2717,7 @@ retry_root_backup:
2702 ret = PTR_ERR(dev_root); 2717 ret = PTR_ERR(dev_root);
2703 goto recovery_tree_root; 2718 goto recovery_tree_root;
2704 } 2719 }
2705 dev_root->track_dirty = 1; 2720 set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
2706 fs_info->dev_root = dev_root; 2721 fs_info->dev_root = dev_root;
2707 btrfs_init_devices_late(fs_info); 2722 btrfs_init_devices_late(fs_info);
2708 2723
@@ -2712,13 +2727,13 @@ retry_root_backup:
2712 ret = PTR_ERR(csum_root); 2727 ret = PTR_ERR(csum_root);
2713 goto recovery_tree_root; 2728 goto recovery_tree_root;
2714 } 2729 }
2715 csum_root->track_dirty = 1; 2730 set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
2716 fs_info->csum_root = csum_root; 2731 fs_info->csum_root = csum_root;
2717 2732
2718 location.objectid = BTRFS_QUOTA_TREE_OBJECTID; 2733 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2719 quota_root = btrfs_read_tree_root(tree_root, &location); 2734 quota_root = btrfs_read_tree_root(tree_root, &location);
2720 if (!IS_ERR(quota_root)) { 2735 if (!IS_ERR(quota_root)) {
2721 quota_root->track_dirty = 1; 2736 set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
2722 fs_info->quota_enabled = 1; 2737 fs_info->quota_enabled = 1;
2723 fs_info->pending_quota_state = 1; 2738 fs_info->pending_quota_state = 1;
2724 fs_info->quota_root = quota_root; 2739 fs_info->quota_root = quota_root;
@@ -2733,7 +2748,7 @@ retry_root_backup:
2733 create_uuid_tree = true; 2748 create_uuid_tree = true;
2734 check_uuid_tree = false; 2749 check_uuid_tree = false;
2735 } else { 2750 } else {
2736 uuid_root->track_dirty = 1; 2751 set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
2737 fs_info->uuid_root = uuid_root; 2752 fs_info->uuid_root = uuid_root;
2738 create_uuid_tree = false; 2753 create_uuid_tree = false;
2739 check_uuid_tree = 2754 check_uuid_tree =
@@ -2966,7 +2981,7 @@ fail_qgroup:
2966fail_trans_kthread: 2981fail_trans_kthread:
2967 kthread_stop(fs_info->transaction_kthread); 2982 kthread_stop(fs_info->transaction_kthread);
2968 btrfs_cleanup_transaction(fs_info->tree_root); 2983 btrfs_cleanup_transaction(fs_info->tree_root);
2969 del_fs_roots(fs_info); 2984 btrfs_free_fs_roots(fs_info);
2970fail_cleaner: 2985fail_cleaner:
2971 kthread_stop(fs_info->cleaner_kthread); 2986 kthread_stop(fs_info->cleaner_kthread);
2972 2987
@@ -3501,8 +3516,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3501 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 3516 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3502 btrfs_free_log(NULL, root); 3517 btrfs_free_log(NULL, root);
3503 3518
3504 __btrfs_remove_free_space_cache(root->free_ino_pinned); 3519 if (root->free_ino_pinned)
3505 __btrfs_remove_free_space_cache(root->free_ino_ctl); 3520 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3521 if (root->free_ino_ctl)
3522 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3506 free_fs_root(root); 3523 free_fs_root(root);
3507} 3524}
3508 3525
@@ -3533,28 +3550,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3533{ 3550{
3534 u64 root_objectid = 0; 3551 u64 root_objectid = 0;
3535 struct btrfs_root *gang[8]; 3552 struct btrfs_root *gang[8];
3536 int i; 3553 int i = 0;
3537 int ret; 3554 int err = 0;
3555 unsigned int ret = 0;
3556 int index;
3538 3557
3539 while (1) { 3558 while (1) {
3559 index = srcu_read_lock(&fs_info->subvol_srcu);
3540 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 3560 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3541 (void **)gang, root_objectid, 3561 (void **)gang, root_objectid,
3542 ARRAY_SIZE(gang)); 3562 ARRAY_SIZE(gang));
3543 if (!ret) 3563 if (!ret) {
3564 srcu_read_unlock(&fs_info->subvol_srcu, index);
3544 break; 3565 break;
3545 3566 }
3546 root_objectid = gang[ret - 1]->root_key.objectid + 1; 3567 root_objectid = gang[ret - 1]->root_key.objectid + 1;
3568
3547 for (i = 0; i < ret; i++) { 3569 for (i = 0; i < ret; i++) {
3548 int err; 3570 /* Avoid grabbing roots in dead_roots */
3571 if (btrfs_root_refs(&gang[i]->root_item) == 0) {
3572 gang[i] = NULL;
3573 continue;
3574 }
3575 /* grab all the search results for later use */
3576 gang[i] = btrfs_grab_fs_root(gang[i]);
3577 }
3578 srcu_read_unlock(&fs_info->subvol_srcu, index);
3549 3579
3580 for (i = 0; i < ret; i++) {
3581 if (!gang[i])
3582 continue;
3550 root_objectid = gang[i]->root_key.objectid; 3583 root_objectid = gang[i]->root_key.objectid;
3551 err = btrfs_orphan_cleanup(gang[i]); 3584 err = btrfs_orphan_cleanup(gang[i]);
3552 if (err) 3585 if (err)
3553 return err; 3586 break;
3587 btrfs_put_fs_root(gang[i]);
3554 } 3588 }
3555 root_objectid++; 3589 root_objectid++;
3556 } 3590 }
3557 return 0; 3591
3592 /* release the uncleaned roots on error */
3593 for (; i < ret; i++) {
3594 if (gang[i])
3595 btrfs_put_fs_root(gang[i]);
3596 }
3597 return err;
3558} 3598}
3559 3599
3560int btrfs_commit_super(struct btrfs_root *root) 3600int btrfs_commit_super(struct btrfs_root *root)
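
The rewritten btrfs_cleanup_fs_roots() closes a lifetime hole: it now pins each hit with btrfs_grab_fs_root() under the subvol SRCU read lock, skips roots already moved to dead_roots, runs the slow orphan cleanup outside the lock, and drops any still-held references on the error path. A userspace model of that collect-then-process shape (an rwlock stands in for SRCU, plain counters for the root refcount):

#include <stdio.h>
#include <pthread.h>

struct root { int refs; int dead; int id; };

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static struct root *grab(struct root *r) { r->refs++; return r; }
static void put(struct root *r) { r->refs--; }

static int cleanup_one(struct root *r)
{
	printf("cleaning root %d\n", r->id);
	return 0;	/* pretend orphan cleanup succeeded */
}

int main(void)
{
	struct root pool[3] = { {1, 0, 256}, {1, 1, 257}, {1, 0, 258} };
	struct root *gang[3];
	int i, err = 0;

	/* phase 1: collect and pin hits while the "reader lock" is
	 * held, skipping roots that are already dead */
	pthread_rwlock_rdlock(&lock);
	for (i = 0; i < 3; i++)
		gang[i] = pool[i].dead ? NULL : grab(&pool[i]);
	pthread_rwlock_unlock(&lock);

	/* phase 2: do the slow work outside the lock */
	for (i = 0; i < 3; i++) {
		if (!gang[i])
			continue;
		err = cleanup_one(gang[i]);
		if (err)
			break;
		put(gang[i]);
	}
	/* error path: drop the references we still hold */
	for (; i < 3; i++)
		if (gang[i])
			put(gang[i]);
	return err;
}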
@@ -3603,6 +3643,8 @@ int close_ctree(struct btrfs_root *root)
3603 /* clear out the rbtree of defraggable inodes */ 3643 /* clear out the rbtree of defraggable inodes */
3604 btrfs_cleanup_defrag_inodes(fs_info); 3644 btrfs_cleanup_defrag_inodes(fs_info);
3605 3645
3646 cancel_work_sync(&fs_info->async_reclaim_work);
3647
3606 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3648 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3607 ret = btrfs_commit_super(root); 3649 ret = btrfs_commit_super(root);
3608 if (ret) 3650 if (ret)
@@ -3627,12 +3669,17 @@ int close_ctree(struct btrfs_root *root)
3627 3669
3628 btrfs_sysfs_remove_one(fs_info); 3670 btrfs_sysfs_remove_one(fs_info);
3629 3671
3630 del_fs_roots(fs_info); 3672 btrfs_free_fs_roots(fs_info);
3631 3673
3632 btrfs_put_block_group_cache(fs_info); 3674 btrfs_put_block_group_cache(fs_info);
3633 3675
3634 btrfs_free_block_groups(fs_info); 3676 btrfs_free_block_groups(fs_info);
3635 3677
3678 /*
3679 * we must make sure there are no read requests left to
3680 * submit after we stop all workers.
3681 */
3682 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3636 btrfs_stop_all_workers(fs_info); 3683 btrfs_stop_all_workers(fs_info);
3637 3684
3638 free_root_pointers(fs_info, 1); 3685 free_root_pointers(fs_info, 1);
@@ -3709,6 +3756,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3709 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, 3756 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3710 buf->len, 3757 buf->len,
3711 root->fs_info->dirty_metadata_batch); 3758 root->fs_info->dirty_metadata_batch);
3759#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3760 if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
3761 btrfs_print_leaf(root, buf);
3762 ASSERT(0);
3763 }
3764#endif
3712} 3765}
3713 3766
3714static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3767static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 53059df350f8..23ce3ceba0a9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
68int btrfs_init_fs_root(struct btrfs_root *root); 68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, 69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root); 70 struct btrfs_root *root);
71void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
71 72
72struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, 73struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
73 struct btrfs_key *key, 74 struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5590af92094b..fafb3e53ecde 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,16 +26,16 @@
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
28#include "hash.h" 28#include "hash.h"
29#include "ctree.h" 29#include "tree-log.h"
30#include "disk-io.h" 30#include "disk-io.h"
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h"
33#include "volumes.h" 32#include "volumes.h"
34#include "raid56.h" 33#include "raid56.h"
35#include "locking.h" 34#include "locking.h"
36#include "free-space-cache.h" 35#include "free-space-cache.h"
37#include "math.h" 36#include "math.h"
38#include "sysfs.h" 37#include "sysfs.h"
38#include "qgroup.h"
39 39
40#undef SCRAMBLE_DELAYED_REFS 40#undef SCRAMBLE_DELAYED_REFS
41 41
@@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
81 u64 bytenr, u64 num_bytes, u64 parent, 81 u64 bytenr, u64 num_bytes, u64 parent,
82 u64 root_objectid, u64 owner_objectid, 82 u64 root_objectid, u64 owner_objectid,
83 u64 owner_offset, int refs_to_drop, 83 u64 owner_offset, int refs_to_drop,
84 struct btrfs_delayed_extent_op *extra_op); 84 struct btrfs_delayed_extent_op *extra_op,
85 int no_quota);
85static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 86static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
86 struct extent_buffer *leaf, 87 struct extent_buffer *leaf,
87 struct btrfs_extent_item *ei); 88 struct btrfs_extent_item *ei);
@@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root, 95 struct btrfs_root *root,
95 u64 parent, u64 root_objectid, 96 u64 parent, u64 root_objectid,
96 u64 flags, struct btrfs_disk_key *key, 97 u64 flags, struct btrfs_disk_key *key,
97 int level, struct btrfs_key *ins); 98 int level, struct btrfs_key *ins,
99 int no_quota);
98static int do_chunk_alloc(struct btrfs_trans_handle *trans, 100static int do_chunk_alloc(struct btrfs_trans_handle *trans,
99 struct btrfs_root *extent_root, u64 flags, 101 struct btrfs_root *extent_root, u64 flags,
100 int force); 102 int force);
@@ -1271,7 +1273,7 @@ fail:
1271static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1273static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1272 struct btrfs_root *root, 1274 struct btrfs_root *root,
1273 struct btrfs_path *path, 1275 struct btrfs_path *path,
1274 int refs_to_drop) 1276 int refs_to_drop, int *last_ref)
1275{ 1277{
1276 struct btrfs_key key; 1278 struct btrfs_key key;
1277 struct btrfs_extent_data_ref *ref1 = NULL; 1279 struct btrfs_extent_data_ref *ref1 = NULL;
@@ -1307,6 +1309,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1307 1309
1308 if (num_refs == 0) { 1310 if (num_refs == 0) {
1309 ret = btrfs_del_item(trans, root, path); 1311 ret = btrfs_del_item(trans, root, path);
1312 *last_ref = 1;
1310 } else { 1313 } else {
1311 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1314 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1312 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1315 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1764,7 +1767,8 @@ void update_inline_extent_backref(struct btrfs_root *root,
1764 struct btrfs_path *path, 1767 struct btrfs_path *path,
1765 struct btrfs_extent_inline_ref *iref, 1768 struct btrfs_extent_inline_ref *iref,
1766 int refs_to_mod, 1769 int refs_to_mod,
1767 struct btrfs_delayed_extent_op *extent_op) 1770 struct btrfs_delayed_extent_op *extent_op,
1771 int *last_ref)
1768{ 1772{
1769 struct extent_buffer *leaf; 1773 struct extent_buffer *leaf;
1770 struct btrfs_extent_item *ei; 1774 struct btrfs_extent_item *ei;
@@ -1808,6 +1812,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
1808 else 1812 else
1809 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1813 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1810 } else { 1814 } else {
1815 *last_ref = 1;
1811 size = btrfs_extent_inline_ref_size(type); 1816 size = btrfs_extent_inline_ref_size(type);
1812 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1817 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1813 ptr = (unsigned long)iref; 1818 ptr = (unsigned long)iref;
@@ -1839,7 +1844,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1839 if (ret == 0) { 1844 if (ret == 0) {
1840 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1845 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1841 update_inline_extent_backref(root, path, iref, 1846 update_inline_extent_backref(root, path, iref,
1842 refs_to_add, extent_op); 1847 refs_to_add, extent_op, NULL);
1843 } else if (ret == -ENOENT) { 1848 } else if (ret == -ENOENT) {
1844 setup_inline_extent_backref(root, path, iref, parent, 1849 setup_inline_extent_backref(root, path, iref, parent,
1845 root_objectid, owner, offset, 1850 root_objectid, owner, offset,
@@ -1872,17 +1877,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1872 struct btrfs_root *root, 1877 struct btrfs_root *root,
1873 struct btrfs_path *path, 1878 struct btrfs_path *path,
1874 struct btrfs_extent_inline_ref *iref, 1879 struct btrfs_extent_inline_ref *iref,
1875 int refs_to_drop, int is_data) 1880 int refs_to_drop, int is_data, int *last_ref)
1876{ 1881{
1877 int ret = 0; 1882 int ret = 0;
1878 1883
1879 BUG_ON(!is_data && refs_to_drop != 1); 1884 BUG_ON(!is_data && refs_to_drop != 1);
1880 if (iref) { 1885 if (iref) {
1881 update_inline_extent_backref(root, path, iref, 1886 update_inline_extent_backref(root, path, iref,
1882 -refs_to_drop, NULL); 1887 -refs_to_drop, NULL, last_ref);
1883 } else if (is_data) { 1888 } else if (is_data) {
1884 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1889 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1890 last_ref);
1885 } else { 1891 } else {
1892 *last_ref = 1;
1886 ret = btrfs_del_item(trans, root, path); 1893 ret = btrfs_del_item(trans, root, path);
1887 } 1894 }
1888 return ret; 1895 return ret;
@@ -1946,7 +1953,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1946int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1953int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1947 struct btrfs_root *root, 1954 struct btrfs_root *root,
1948 u64 bytenr, u64 num_bytes, u64 parent, 1955 u64 bytenr, u64 num_bytes, u64 parent,
1949 u64 root_objectid, u64 owner, u64 offset, int for_cow) 1956 u64 root_objectid, u64 owner, u64 offset,
1957 int no_quota)
1950{ 1958{
1951 int ret; 1959 int ret;
1952 struct btrfs_fs_info *fs_info = root->fs_info; 1960 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1958,12 +1966,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1958 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1966 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1959 num_bytes, 1967 num_bytes,
1960 parent, root_objectid, (int)owner, 1968 parent, root_objectid, (int)owner,
1961 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1969 BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1962 } else { 1970 } else {
1963 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1971 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1964 num_bytes, 1972 num_bytes,
1965 parent, root_objectid, owner, offset, 1973 parent, root_objectid, owner, offset,
1966 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1974 BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1967 } 1975 }
1968 return ret; 1976 return ret;
1969} 1977}
@@ -1973,31 +1981,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1973 u64 bytenr, u64 num_bytes, 1981 u64 bytenr, u64 num_bytes,
1974 u64 parent, u64 root_objectid, 1982 u64 parent, u64 root_objectid,
1975 u64 owner, u64 offset, int refs_to_add, 1983 u64 owner, u64 offset, int refs_to_add,
1984 int no_quota,
1976 struct btrfs_delayed_extent_op *extent_op) 1985 struct btrfs_delayed_extent_op *extent_op)
1977{ 1986{
1987 struct btrfs_fs_info *fs_info = root->fs_info;
1978 struct btrfs_path *path; 1988 struct btrfs_path *path;
1979 struct extent_buffer *leaf; 1989 struct extent_buffer *leaf;
1980 struct btrfs_extent_item *item; 1990 struct btrfs_extent_item *item;
1991 struct btrfs_key key;
1981 u64 refs; 1992 u64 refs;
1982 int ret; 1993 int ret;
1994 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
1983 1995
1984 path = btrfs_alloc_path(); 1996 path = btrfs_alloc_path();
1985 if (!path) 1997 if (!path)
1986 return -ENOMEM; 1998 return -ENOMEM;
1987 1999
2000 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
2001 no_quota = 1;
2002
1988 path->reada = 1; 2003 path->reada = 1;
1989 path->leave_spinning = 1; 2004 path->leave_spinning = 1;
1990 /* this will setup the path even if it fails to insert the back ref */ 2005 /* this will setup the path even if it fails to insert the back ref */
1991 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 2006 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
1992 path, bytenr, num_bytes, parent, 2007 bytenr, num_bytes, parent,
1993 root_objectid, owner, offset, 2008 root_objectid, owner, offset,
1994 refs_to_add, extent_op); 2009 refs_to_add, extent_op);
1995 if (ret != -EAGAIN) 2010 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
1996 goto out; 2011 goto out;
2012 /*
2013 * Ok we were able to insert an inline extent and it appears to be a new
2014 * reference, deal with the qgroup accounting.
2015 */
2016 if (!ret && !no_quota) {
2017 ASSERT(root->fs_info->quota_enabled);
2018 leaf = path->nodes[0];
2019 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2020 item = btrfs_item_ptr(leaf, path->slots[0],
2021 struct btrfs_extent_item);
2022 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
2023 type = BTRFS_QGROUP_OPER_ADD_SHARED;
2024 btrfs_release_path(path);
1997 2025
2026 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2027 bytenr, num_bytes, type, 0);
2028 goto out;
2029 }
2030
2031 /*
2032 * Ok we had -EAGAIN which means we didn't have space to insert an
2033 * inline extent ref, so just update the reference count and add a
2034 * normal backref.
2035 */
1998 leaf = path->nodes[0]; 2036 leaf = path->nodes[0];
2037 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1999 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2038 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2000 refs = btrfs_extent_refs(leaf, item); 2039 refs = btrfs_extent_refs(leaf, item);
2040 if (refs)
2041 type = BTRFS_QGROUP_OPER_ADD_SHARED;
2001 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2042 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2002 if (extent_op) 2043 if (extent_op)
2003 __run_delayed_extent_op(extent_op, leaf, item); 2044 __run_delayed_extent_op(extent_op, leaf, item);
@@ -2005,9 +2046,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2005 btrfs_mark_buffer_dirty(leaf); 2046 btrfs_mark_buffer_dirty(leaf);
2006 btrfs_release_path(path); 2047 btrfs_release_path(path);
2007 2048
2049 if (!no_quota) {
2050 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2051 bytenr, num_bytes, type, 0);
2052 if (ret)
2053 goto out;
2054 }
2055
2008 path->reada = 1; 2056 path->reada = 1;
2009 path->leave_spinning = 1; 2057 path->leave_spinning = 1;
2010
2011 /* now insert the actual backref */ 2058 /* now insert the actual backref */
2012 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2059 ret = insert_extent_backref(trans, root->fs_info->extent_root,
2013 path, bytenr, parent, root_objectid, 2060 path, bytenr, parent, root_objectid,
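
Both branches above pick the qgroup operation type the same way: if the extent item counts more references than the ones just added, another root shares the extent and the operation is recorded as SHARED rather than EXCL. The decision in isolation:

#include <stdio.h>

enum qgroup_oper { OPER_ADD_EXCL, OPER_ADD_SHARED };

/* refs_after is what the extent item shows once refs_to_add are in
 * place; anything beyond refs_to_add must belong to another root */
static enum qgroup_oper add_oper_type(unsigned long long refs_after,
				      unsigned long long refs_to_add)
{
	return refs_after > refs_to_add ? OPER_ADD_SHARED : OPER_ADD_EXCL;
}

int main(void)
{
	printf("%d\n", add_oper_type(1, 1)); /* 0: first ref, exclusive */
	printf("%d\n", add_oper_type(3, 1)); /* 1: shared with other roots */
	return 0;
}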
@@ -2041,8 +2088,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2041 2088
2042 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2089 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2043 parent = ref->parent; 2090 parent = ref->parent;
2044 else 2091 ref_root = ref->root;
2045 ref_root = ref->root;
2046 2092
2047 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2093 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2048 if (extent_op) 2094 if (extent_op)
@@ -2056,13 +2102,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2056 node->num_bytes, parent, 2102 node->num_bytes, parent,
2057 ref_root, ref->objectid, 2103 ref_root, ref->objectid,
2058 ref->offset, node->ref_mod, 2104 ref->offset, node->ref_mod,
2059 extent_op); 2105 node->no_quota, extent_op);
2060 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2106 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2061 ret = __btrfs_free_extent(trans, root, node->bytenr, 2107 ret = __btrfs_free_extent(trans, root, node->bytenr,
2062 node->num_bytes, parent, 2108 node->num_bytes, parent,
2063 ref_root, ref->objectid, 2109 ref_root, ref->objectid,
2064 ref->offset, node->ref_mod, 2110 ref->offset, node->ref_mod,
2065 extent_op); 2111 extent_op, node->no_quota);
2066 } else { 2112 } else {
2067 BUG(); 2113 BUG();
2068 } 2114 }
@@ -2199,8 +2245,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2199 2245
2200 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2246 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2201 parent = ref->parent; 2247 parent = ref->parent;
2202 else 2248 ref_root = ref->root;
2203 ref_root = ref->root;
2204 2249
2205 ins.objectid = node->bytenr; 2250 ins.objectid = node->bytenr;
2206 if (skinny_metadata) { 2251 if (skinny_metadata) {
@@ -2218,15 +2263,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2218 parent, ref_root, 2263 parent, ref_root,
2219 extent_op->flags_to_set, 2264 extent_op->flags_to_set,
2220 &extent_op->key, 2265 &extent_op->key,
2221 ref->level, &ins); 2266 ref->level, &ins,
2267 node->no_quota);
2222 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2268 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2223 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2269 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2224 node->num_bytes, parent, ref_root, 2270 node->num_bytes, parent, ref_root,
2225 ref->level, 0, 1, extent_op); 2271 ref->level, 0, 1, node->no_quota,
2272 extent_op);
2226 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2273 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2227 ret = __btrfs_free_extent(trans, root, node->bytenr, 2274 ret = __btrfs_free_extent(trans, root, node->bytenr,
2228 node->num_bytes, parent, ref_root, 2275 node->num_bytes, parent, ref_root,
2229 ref->level, 0, 1, extent_op); 2276 ref->level, 0, 1, extent_op,
2277 node->no_quota);
2230 } else { 2278 } else {
2231 BUG(); 2279 BUG();
2232 } 2280 }
@@ -2574,42 +2622,6 @@ static u64 find_middle(struct rb_root *root)
2574} 2622}
2575#endif 2623#endif
2576 2624
2577int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2578 struct btrfs_fs_info *fs_info)
2579{
2580 struct qgroup_update *qgroup_update;
2581 int ret = 0;
2582
2583 if (list_empty(&trans->qgroup_ref_list) !=
2584 !trans->delayed_ref_elem.seq) {
2585 /* list without seq or seq without list */
2586 btrfs_err(fs_info,
2587 "qgroup accounting update error, list is%s empty, seq is %#x.%x",
2588 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2589 (u32)(trans->delayed_ref_elem.seq >> 32),
2590 (u32)trans->delayed_ref_elem.seq);
2591 BUG();
2592 }
2593
2594 if (!trans->delayed_ref_elem.seq)
2595 return 0;
2596
2597 while (!list_empty(&trans->qgroup_ref_list)) {
2598 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2599 struct qgroup_update, list);
2600 list_del(&qgroup_update->list);
2601 if (!ret)
2602 ret = btrfs_qgroup_account_ref(
2603 trans, fs_info, qgroup_update->node,
2604 qgroup_update->extent_op);
2605 kfree(qgroup_update);
2606 }
2607
2608 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2609
2610 return ret;
2611}
2612
2613static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2625static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2614{ 2626{
2615 u64 num_bytes; 2627 u64 num_bytes;
@@ -2662,15 +2674,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2662 u64 num_entries = 2674 u64 num_entries =
2663 atomic_read(&trans->transaction->delayed_refs.num_entries); 2675 atomic_read(&trans->transaction->delayed_refs.num_entries);
2664 u64 avg_runtime; 2676 u64 avg_runtime;
2677 u64 val;
2665 2678
2666 smp_mb(); 2679 smp_mb();
2667 avg_runtime = fs_info->avg_delayed_ref_runtime; 2680 avg_runtime = fs_info->avg_delayed_ref_runtime;
2681 val = num_entries * avg_runtime;
2668 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2682 if (num_entries * avg_runtime >= NSEC_PER_SEC)
2669 return 1; 2683 return 1;
2684 if (val >= NSEC_PER_SEC / 2)
2685 return 2;
2670 2686
2671 return btrfs_check_space_for_delayed_refs(trans, root); 2687 return btrfs_check_space_for_delayed_refs(trans, root);
2672} 2688}
2673 2689
2690struct async_delayed_refs {
2691 struct btrfs_root *root;
2692 int count;
2693 int error;
2694 int sync;
2695 struct completion wait;
2696 struct btrfs_work work;
2697};
2698
2699static void delayed_ref_async_start(struct btrfs_work *work)
2700{
2701 struct async_delayed_refs *async;
2702 struct btrfs_trans_handle *trans;
2703 int ret;
2704
2705 async = container_of(work, struct async_delayed_refs, work);
2706
2707 trans = btrfs_join_transaction(async->root);
2708 if (IS_ERR(trans)) {
2709 async->error = PTR_ERR(trans);
2710 goto done;
2711 }
2712
2713 /*
2714 * trans->sync means that when we call end_transaction, we won't
2715 * wait on delayed refs
2716 */
2717 trans->sync = true;
2718 ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2719 if (ret)
2720 async->error = ret;
2721
2722 ret = btrfs_end_transaction(trans, async->root);
2723 if (ret && !async->error)
2724 async->error = ret;
2725done:
2726 if (async->sync)
2727 complete(&async->wait);
2728 else
2729 kfree(async);
2730}
2731
2732int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2733 unsigned long count, int wait)
2734{
2735 struct async_delayed_refs *async;
2736 int ret;
2737
2738 async = kmalloc(sizeof(*async), GFP_NOFS);
2739 if (!async)
2740 return -ENOMEM;
2741
2742 async->root = root->fs_info->tree_root;
2743 async->count = count;
2744 async->error = 0;
2745 if (wait)
2746 async->sync = 1;
2747 else
2748 async->sync = 0;
2749 init_completion(&async->wait);
2750
2751 btrfs_init_work(&async->work, delayed_ref_async_start,
2752 NULL, NULL);
2753
2754 btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2755
2756 if (wait) {
2757 wait_for_completion(&async->wait);
2758 ret = async->error;
2759 kfree(async);
2760 return ret;
2761 }
2762 return 0;
2763}
2764
2674/* 2765/*
2675 * this starts processing the delayed reference count updates and 2766 * this starts processing the delayed reference count updates and
2676 * extent insertions we have queued up so far. count can be 2767 * extent insertions we have queued up so far. count can be
@@ -2698,8 +2789,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2698 if (root == root->fs_info->extent_root) 2789 if (root == root->fs_info->extent_root)
2699 root = root->fs_info->tree_root; 2790 root = root->fs_info->tree_root;
2700 2791
2701 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2702
2703 delayed_refs = &trans->transaction->delayed_refs; 2792 delayed_refs = &trans->transaction->delayed_refs;
2704 if (count == 0) { 2793 if (count == 0) {
2705 count = atomic_read(&delayed_refs->num_entries) * 2; 2794 count = atomic_read(&delayed_refs->num_entries) * 2;
@@ -2758,6 +2847,9 @@ again:
2758 goto again; 2847 goto again;
2759 } 2848 }
2760out: 2849out:
2850 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
2851 if (ret)
2852 return ret;
2761 assert_qgroups_uptodate(trans); 2853 assert_qgroups_uptodate(trans);
2762 return 0; 2854 return 0;
2763} 2855}
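
The btrfs_async_run_delayed_refs() helper added above pushes the ref run onto the new extent_workers queue; with wait set the caller blocks on the completion and collects the worker's error, otherwise the worker frees its own context. A userspace model of that sync/fire-and-forget split (pthreads stand in for the btrfs_work machinery):

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

/* a completion plus an error slot; "sync" decides who frees it */
struct async_refs {
	int count;
	int error;
	int sync;
	int done;
	pthread_mutex_t mu;
	pthread_cond_t done_cv;
};

static void *worker(void *arg)
{
	struct async_refs *a = arg;

	a->error = 0;	/* pretend we ran a->count delayed refs */
	if (a->sync) {
		pthread_mutex_lock(&a->mu);
		a->done = 1;
		pthread_cond_signal(&a->done_cv);
		pthread_mutex_unlock(&a->mu);
	} else {
		free(a);	/* nobody is waiting; clean up ourselves */
	}
	return NULL;
}

static int run_refs_async(int count, int wait)
{
	struct async_refs *a = calloc(1, sizeof(*a));
	pthread_t t;
	int err;

	if (!a)
		return -1;
	a->count = count;
	a->sync = wait;
	pthread_mutex_init(&a->mu, NULL);
	pthread_cond_init(&a->done_cv, NULL);
	pthread_create(&t, NULL, worker, a);
	pthread_detach(t);

	if (!wait)
		return 0;	/* fire and forget */
	pthread_mutex_lock(&a->mu);
	while (!a->done)
		pthread_cond_wait(&a->done_cv, &a->mu);
	pthread_mutex_unlock(&a->mu);
	err = a->error;
	free(a);
	return err;
}

int main(void)
{
	printf("sync run: %d\n", run_refs_async(32, 1));
	run_refs_async(32, 0);
	return 0;
}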
@@ -2964,7 +3056,7 @@ out:
2964static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3056static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2965 struct btrfs_root *root, 3057 struct btrfs_root *root,
2966 struct extent_buffer *buf, 3058 struct extent_buffer *buf,
2967 int full_backref, int inc, int for_cow) 3059 int full_backref, int inc, int no_quota)
2968{ 3060{
2969 u64 bytenr; 3061 u64 bytenr;
2970 u64 num_bytes; 3062 u64 num_bytes;
@@ -2979,11 +3071,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2979 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3071 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2980 u64, u64, u64, u64, u64, u64, int); 3072 u64, u64, u64, u64, u64, u64, int);
2981 3073
3074#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3075 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
3076 return 0;
3077#endif
2982 ref_root = btrfs_header_owner(buf); 3078 ref_root = btrfs_header_owner(buf);
2983 nritems = btrfs_header_nritems(buf); 3079 nritems = btrfs_header_nritems(buf);
2984 level = btrfs_header_level(buf); 3080 level = btrfs_header_level(buf);
2985 3081
2986 if (!root->ref_cows && level == 0) 3082 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2987 return 0; 3083 return 0;
2988 3084
2989 if (inc) 3085 if (inc)
@@ -3014,7 +3110,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3014 key.offset -= btrfs_file_extent_offset(buf, fi); 3110 key.offset -= btrfs_file_extent_offset(buf, fi);
3015 ret = process_func(trans, root, bytenr, num_bytes, 3111 ret = process_func(trans, root, bytenr, num_bytes,
3016 parent, ref_root, key.objectid, 3112 parent, ref_root, key.objectid,
3017 key.offset, for_cow); 3113 key.offset, no_quota);
3018 if (ret) 3114 if (ret)
3019 goto fail; 3115 goto fail;
3020 } else { 3116 } else {
@@ -3022,7 +3118,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3022 num_bytes = btrfs_level_size(root, level - 1); 3118 num_bytes = btrfs_level_size(root, level - 1);
3023 ret = process_func(trans, root, bytenr, num_bytes, 3119 ret = process_func(trans, root, bytenr, num_bytes,
3024 parent, ref_root, level - 1, 0, 3120 parent, ref_root, level - 1, 0,
3025 for_cow); 3121 no_quota);
3026 if (ret) 3122 if (ret)
3027 goto fail; 3123 goto fail;
3028 } 3124 }
@@ -3033,15 +3129,15 @@ fail:
3033} 3129}
3034 3130
3035int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3131int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3036 struct extent_buffer *buf, int full_backref, int for_cow) 3132 struct extent_buffer *buf, int full_backref, int no_quota)
3037{ 3133{
3038 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); 3134 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
3039} 3135}
3040 3136
3041int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3137int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3042 struct extent_buffer *buf, int full_backref, int for_cow) 3138 struct extent_buffer *buf, int full_backref, int no_quota)
3043{ 3139{
3044 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); 3140 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
3045} 3141}
3046 3142
3047static int write_one_cache_group(struct btrfs_trans_handle *trans, 3143static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3401,10 +3497,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3401 return ret; 3497 return ret;
3402 } 3498 }
3403 3499
3404 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 3500 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3405 INIT_LIST_HEAD(&found->block_groups[i]); 3501 INIT_LIST_HEAD(&found->block_groups[i]);
3406 kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
3407 }
3408 init_rwsem(&found->groups_sem); 3502 init_rwsem(&found->groups_sem);
3409 spin_lock_init(&found->lock); 3503 spin_lock_init(&found->lock);
3410 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3504 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -4204,6 +4298,104 @@ static int flush_space(struct btrfs_root *root,
4204 4298
4205 return ret; 4299 return ret;
4206} 4300}
4301
4302static inline u64
4303btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4304 struct btrfs_space_info *space_info)
4305{
4306 u64 used;
4307 u64 expected;
4308 u64 to_reclaim;
4309
4310 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4311 16 * 1024 * 1024);
4312 spin_lock(&space_info->lock);
4313 if (can_overcommit(root, space_info, to_reclaim,
4314 BTRFS_RESERVE_FLUSH_ALL)) {
4315 to_reclaim = 0;
4316 goto out;
4317 }
4318
4319 used = space_info->bytes_used + space_info->bytes_reserved +
4320 space_info->bytes_pinned + space_info->bytes_readonly +
4321 space_info->bytes_may_use;
4322 if (can_overcommit(root, space_info, 1024 * 1024,
4323 BTRFS_RESERVE_FLUSH_ALL))
4324 expected = div_factor_fine(space_info->total_bytes, 95);
4325 else
4326 expected = div_factor_fine(space_info->total_bytes, 90);
4327
4328 if (used > expected)
4329 to_reclaim = used - expected;
4330 else
4331 to_reclaim = 0;
4332 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4333 space_info->bytes_reserved);
4334out:
4335 spin_unlock(&space_info->lock);
4336
4337 return to_reclaim;
4338}
4339
4340static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4341 struct btrfs_fs_info *fs_info, u64 used)
4342{
4343 return (used >= div_factor_fine(space_info->total_bytes, 98) &&
4344 !btrfs_fs_closing(fs_info) &&
4345 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4346}
4347
4348static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4349 struct btrfs_fs_info *fs_info)
4350{
4351 u64 used;
4352
4353 spin_lock(&space_info->lock);
4354 used = space_info->bytes_used + space_info->bytes_reserved +
4355 space_info->bytes_pinned + space_info->bytes_readonly +
4356 space_info->bytes_may_use;
4357 if (need_do_async_reclaim(space_info, fs_info, used)) {
4358 spin_unlock(&space_info->lock);
4359 return 1;
4360 }
4361 spin_unlock(&space_info->lock);
4362
4363 return 0;
4364}
4365
4366static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4367{
4368 struct btrfs_fs_info *fs_info;
4369 struct btrfs_space_info *space_info;
4370 u64 to_reclaim;
4371 int flush_state;
4372
4373 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4374 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4375
4376 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4377 space_info);
4378 if (!to_reclaim)
4379 return;
4380
4381 flush_state = FLUSH_DELAYED_ITEMS_NR;
4382 do {
4383 flush_space(fs_info->fs_root, space_info, to_reclaim,
4384 to_reclaim, flush_state);
4385 flush_state++;
4386 if (!btrfs_need_do_async_reclaim(space_info, fs_info))
4387 return;
4388 } while (flush_state <= COMMIT_TRANS);
4389
4390 if (btrfs_need_do_async_reclaim(space_info, fs_info))
4391 queue_work(system_unbound_wq, work);
4392}
4393
4394void btrfs_init_async_reclaim_work(struct work_struct *work)
4395{
4396 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4397}
4398
4207/** 4399/**
4208 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4400 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4209 * @root - the root we're allocating for 4401 * @root - the root we're allocating for
@@ -4311,8 +4503,13 @@ again:
4311 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4503 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4312 flushing = true; 4504 flushing = true;
4313 space_info->flush = 1; 4505 space_info->flush = 1;
4506 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4507 used += orig_bytes;
4508 if (need_do_async_reclaim(space_info, root->fs_info, used) &&
4509 !work_busy(&root->fs_info->async_reclaim_work))
4510 queue_work(system_unbound_wq,
4511 &root->fs_info->async_reclaim_work);
4314 } 4512 }
4315
4316 spin_unlock(&space_info->lock); 4513 spin_unlock(&space_info->lock);
4317 4514
4318 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4515 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
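
The reclaim thresholds above are simple percentages of the space_info: the worker is kicked once usage crosses 98% of total_bytes, and btrfs_calc_reclaim_metadata_size() aims back below 90% (95% when overcommit would still succeed), capped at min(num_online_cpus() * 1M, 16M) per pass. Worked through with concrete numbers (the cap is omitted here for brevity):

#include <stdio.h>

static unsigned long long div_factor_fine(unsigned long long num, int factor)
{
	return num * factor / 100;	/* same rounding as the kernel helper */
}

int main(void)
{
	unsigned long long total = 10ULL << 30;	/* 10 GiB metadata space */
	unsigned long long used  = div_factor_fine(total, 99);

	/* need_do_async_reclaim(): kick the worker at 98% usage */
	if (used >= div_factor_fine(total, 98))
		printf("async reclaim triggered\n");

	/* target is 90% here (95% if overcommit would still succeed) */
	printf("want to reclaim %llu bytes\n",
	       used - div_factor_fine(total, 90));
	return 0;
}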
@@ -4369,7 +4566,7 @@ static struct btrfs_block_rsv *get_block_rsv(
4369{ 4566{
4370 struct btrfs_block_rsv *block_rsv = NULL; 4567 struct btrfs_block_rsv *block_rsv = NULL;
4371 4568
4372 if (root->ref_cows) 4569 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4373 block_rsv = trans->block_rsv; 4570 block_rsv = trans->block_rsv;
4374 4571
4375 if (root == root->fs_info->csum_root && trans->adding_csums) 4572 if (root == root->fs_info->csum_root && trans->adding_csums)
@@ -5621,7 +5818,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5621 u64 bytenr, u64 num_bytes, u64 parent, 5818 u64 bytenr, u64 num_bytes, u64 parent,
5622 u64 root_objectid, u64 owner_objectid, 5819 u64 root_objectid, u64 owner_objectid,
5623 u64 owner_offset, int refs_to_drop, 5820 u64 owner_offset, int refs_to_drop,
5624 struct btrfs_delayed_extent_op *extent_op) 5821 struct btrfs_delayed_extent_op *extent_op,
5822 int no_quota)
5625{ 5823{
5626 struct btrfs_key key; 5824 struct btrfs_key key;
5627 struct btrfs_path *path; 5825 struct btrfs_path *path;
@@ -5637,9 +5835,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5637 int num_to_del = 1; 5835 int num_to_del = 1;
5638 u32 item_size; 5836 u32 item_size;
5639 u64 refs; 5837 u64 refs;
5838 int last_ref = 0;
5839 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
5640 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5840 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5641 SKINNY_METADATA); 5841 SKINNY_METADATA);
5642 5842
5843 if (!info->quota_enabled || !is_fstree(root_objectid))
5844 no_quota = 1;
5845
5643 path = btrfs_alloc_path(); 5846 path = btrfs_alloc_path();
5644 if (!path) 5847 if (!path)
5645 return -ENOMEM; 5848 return -ENOMEM;
@@ -5687,7 +5890,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5687 BUG_ON(iref); 5890 BUG_ON(iref);
5688 ret = remove_extent_backref(trans, extent_root, path, 5891 ret = remove_extent_backref(trans, extent_root, path,
5689 NULL, refs_to_drop, 5892 NULL, refs_to_drop,
5690 is_data); 5893 is_data, &last_ref);
5691 if (ret) { 5894 if (ret) {
5692 btrfs_abort_transaction(trans, extent_root, ret); 5895 btrfs_abort_transaction(trans, extent_root, ret);
5693 goto out; 5896 goto out;
@@ -5806,7 +6009,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5806 refs = btrfs_extent_refs(leaf, ei); 6009 refs = btrfs_extent_refs(leaf, ei);
5807 if (refs < refs_to_drop) { 6010 if (refs < refs_to_drop) {
5808 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6011 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
5809 "for bytenr %Lu\n", refs_to_drop, refs, bytenr); 6012 "for bytenr %Lu", refs_to_drop, refs, bytenr);
5810 ret = -EINVAL; 6013 ret = -EINVAL;
5811 btrfs_abort_transaction(trans, extent_root, ret); 6014 btrfs_abort_transaction(trans, extent_root, ret);
5812 goto out; 6015 goto out;
@@ -5814,6 +6017,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5814 refs -= refs_to_drop; 6017 refs -= refs_to_drop;
5815 6018
5816 if (refs > 0) { 6019 if (refs > 0) {
6020 type = BTRFS_QGROUP_OPER_SUB_SHARED;
5817 if (extent_op) 6021 if (extent_op)
5818 __run_delayed_extent_op(extent_op, leaf, ei); 6022 __run_delayed_extent_op(extent_op, leaf, ei);
5819 /* 6023 /*
@@ -5829,7 +6033,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5829 if (found_extent) { 6033 if (found_extent) {
5830 ret = remove_extent_backref(trans, extent_root, path, 6034 ret = remove_extent_backref(trans, extent_root, path,
5831 iref, refs_to_drop, 6035 iref, refs_to_drop,
5832 is_data); 6036 is_data, &last_ref);
5833 if (ret) { 6037 if (ret) {
5834 btrfs_abort_transaction(trans, extent_root, ret); 6038 btrfs_abort_transaction(trans, extent_root, ret);
5835 goto out; 6039 goto out;
@@ -5850,6 +6054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5850 } 6054 }
5851 } 6055 }
5852 6056
6057 last_ref = 1;
5853 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6058 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5854 num_to_del); 6059 num_to_del);
5855 if (ret) { 6060 if (ret) {
@@ -5872,6 +6077,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5872 goto out; 6077 goto out;
5873 } 6078 }
5874 } 6079 }
6080 btrfs_release_path(path);
6081
6082 /* Deal with the quota accounting */
6083 if (!ret && last_ref && !no_quota) {
6084 int mod_seq = 0;
6085
6086 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
6087 type == BTRFS_QGROUP_OPER_SUB_SHARED)
6088 mod_seq = 1;
6089
6090 ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
6091 bytenr, num_bytes, type,
6092 mod_seq);
6093 }
5875out: 6094out:
5876 btrfs_free_path(path); 6095 btrfs_free_path(path);
5877 return ret; 6096 return ret;
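
The last_ref plumbing threaded through update_inline_extent_backref() and remove_extent_backref() lets __btrfs_free_extent() learn whether it just removed the final reference, which is what decides if a qgroup SUB operation must be recorded. The out-parameter pattern, reduced to its core:

#include <stdio.h>

/* drop refs_to_drop from a counter; set *last_ref when the backref
 * item itself would be deleted (count reached zero) */
static int drop_ref(unsigned long long *num_refs, int refs_to_drop,
		    int *last_ref)
{
	if (*num_refs < (unsigned long long)refs_to_drop)
		return -1;	/* corruption: more drops than refs */
	*num_refs -= refs_to_drop;
	if (*num_refs == 0)
		*last_ref = 1;
	return 0;
}

int main(void)
{
	unsigned long long refs = 2;
	int last_ref = 0;

	drop_ref(&refs, 1, &last_ref);	/* still shared */
	drop_ref(&refs, 1, &last_ref);	/* now last_ref == 1 */
	printf("refs=%llu last_ref=%d\n", refs, last_ref);
	return 0;
}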
@@ -6008,11 +6227,15 @@ out:
6008/* Can return -ENOMEM */ 6227/* Can return -ENOMEM */
6009int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6228int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6010 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6229 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6011 u64 owner, u64 offset, int for_cow) 6230 u64 owner, u64 offset, int no_quota)
6012{ 6231{
6013 int ret; 6232 int ret;
6014 struct btrfs_fs_info *fs_info = root->fs_info; 6233 struct btrfs_fs_info *fs_info = root->fs_info;
6015 6234
6235#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6236 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
6237 return 0;
6238#endif
6016 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6239 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6017 6240
6018 /* 6241 /*
@@ -6028,13 +6251,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6028 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6251 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6029 num_bytes, 6252 num_bytes,
6030 parent, root_objectid, (int)owner, 6253 parent, root_objectid, (int)owner,
6031 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 6254 BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6032 } else { 6255 } else {
6033 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6256 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6034 num_bytes, 6257 num_bytes,
6035 parent, root_objectid, owner, 6258 parent, root_objectid, owner,
6036 offset, BTRFS_DROP_DELAYED_REF, 6259 offset, BTRFS_DROP_DELAYED_REF,
6037 NULL, for_cow); 6260 NULL, no_quota);
6038 } 6261 }
6039 return ret; 6262 return ret;
6040} 6263}
@@ -6514,8 +6737,14 @@ loop:
6514 loop++; 6737 loop++;
6515 if (loop == LOOP_ALLOC_CHUNK) { 6738 if (loop == LOOP_ALLOC_CHUNK) {
6516 struct btrfs_trans_handle *trans; 6739 struct btrfs_trans_handle *trans;
6740 int exist = 0;
6741
6742 trans = current->journal_info;
6743 if (trans)
6744 exist = 1;
6745 else
6746 trans = btrfs_join_transaction(root);
6517 6747
6518 trans = btrfs_join_transaction(root);
6519 if (IS_ERR(trans)) { 6748 if (IS_ERR(trans)) {
6520 ret = PTR_ERR(trans); 6749 ret = PTR_ERR(trans);
6521 goto out; 6750 goto out;
@@ -6532,7 +6761,8 @@ loop:
6532 root, ret); 6761 root, ret);
6533 else 6762 else
6534 ret = 0; 6763 ret = 0;
6535 btrfs_end_transaction(trans, root); 6764 if (!exist)
6765 btrfs_end_transaction(trans, root);
6536 if (ret) 6766 if (ret)
6537 goto out; 6767 goto out;
6538 } 6768 }
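
The LOOP_ALLOC_CHUNK fix reuses a transaction already attached to the task (current->journal_info) instead of joining a second one, and only ends the handle it opened itself; joining again from inside a running transaction could deadlock. The ownership rule in a toy form (names mirror the kernel's, the bodies are stubs):

#include <stdio.h>

struct trans { int id; };

/* stands in for current->journal_info */
static struct trans *journal_info;

static struct trans *join_transaction(void)
{
	static struct trans t = { 1 };
	return &t;
}

static void end_transaction(struct trans *t)
{
	printf("ended trans %d\n", t->id);
}

static void alloc_chunk(void)
{
	struct trans *trans = journal_info;	/* reuse if attached */
	int started_here = 0;

	if (!trans) {
		trans = join_transaction();
		started_here = 1;
	}

	/* ... do_chunk_alloc(trans, ...) ... */

	if (started_here)	/* only end what we started */
		end_transaction(trans);
}

int main(void)
{
	alloc_chunk();			/* no outer trans: join + end */
	journal_info = join_transaction();
	alloc_chunk();			/* outer trans: reused, not ended */
	return 0;
}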
@@ -6733,6 +6963,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6733 btrfs_mark_buffer_dirty(path->nodes[0]); 6963 btrfs_mark_buffer_dirty(path->nodes[0]);
6734 btrfs_free_path(path); 6964 btrfs_free_path(path);
6735 6965
6966 /* Always set parent to 0 here since it's exclusive anyway. */
6967 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
6968 ins->objectid, ins->offset,
6969 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
6970 if (ret)
6971 return ret;
6972
6736 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6973 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6737 if (ret) { /* -ENOENT, logic error */ 6974 if (ret) { /* -ENOENT, logic error */
6738 btrfs_err(fs_info, "update block group failed for %llu %llu", 6975 btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6747,7 +6984,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6747 struct btrfs_root *root, 6984 struct btrfs_root *root,
6748 u64 parent, u64 root_objectid, 6985 u64 parent, u64 root_objectid,
6749 u64 flags, struct btrfs_disk_key *key, 6986 u64 flags, struct btrfs_disk_key *key,
6750 int level, struct btrfs_key *ins) 6987 int level, struct btrfs_key *ins,
6988 int no_quota)
6751{ 6989{
6752 int ret; 6990 int ret;
6753 struct btrfs_fs_info *fs_info = root->fs_info; 6991 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6757,6 +6995,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6757 struct btrfs_path *path; 6995 struct btrfs_path *path;
6758 struct extent_buffer *leaf; 6996 struct extent_buffer *leaf;
6759 u32 size = sizeof(*extent_item) + sizeof(*iref); 6997 u32 size = sizeof(*extent_item) + sizeof(*iref);
6998 u64 num_bytes = ins->offset;
6760 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6999 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6761 SKINNY_METADATA); 7000 SKINNY_METADATA);
6762 7001
@@ -6790,6 +7029,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6790 7029
6791 if (skinny_metadata) { 7030 if (skinny_metadata) {
6792 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7031 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7032 num_bytes = root->leafsize;
6793 } else { 7033 } else {
6794 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7034 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6795 btrfs_set_tree_block_key(leaf, block_info, key); 7035 btrfs_set_tree_block_key(leaf, block_info, key);
@@ -6811,6 +7051,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6811 btrfs_mark_buffer_dirty(leaf); 7051 btrfs_mark_buffer_dirty(leaf);
6812 btrfs_free_path(path); 7052 btrfs_free_path(path);
6813 7053
7054 if (!no_quota) {
7055 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
7056 ins->objectid, num_bytes,
7057 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
7058 if (ret)
7059 return ret;
7060 }
7061
6814 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7062 ret = update_block_group(root, ins->objectid, root->leafsize, 1);
6815 if (ret) { /* -ENOENT, logic error */ 7063 if (ret) { /* -ENOENT, logic error */
6816 btrfs_err(fs_info, "update block group failed for %llu %llu", 7064 btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6994,6 +7242,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6994 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7242 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6995 SKINNY_METADATA); 7243 SKINNY_METADATA);
6996 7244
7245#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7246 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
7247 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7248 blocksize, level);
7249 if (!IS_ERR(buf))
7250 root->alloc_bytenr += blocksize;
7251 return buf;
7252 }
7253#endif
6997 block_rsv = use_block_rsv(trans, root, blocksize); 7254 block_rsv = use_block_rsv(trans, root, blocksize);
6998 if (IS_ERR(block_rsv)) 7255 if (IS_ERR(block_rsv))
6999 return ERR_CAST(block_rsv); 7256 return ERR_CAST(block_rsv);
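Editor's note: under CONFIG_BTRFS_FS_RUN_SANITY_TESTS the dummy root bypasses the block reservation machinery entirely and hands out buffers from a simple bump pointer; root->alloc_bytenr advances by one blocksize per allocation and is never reclaimed. A hedged user-space model of that test-only allocator (names are illustrative):

/* Editor's sketch: a bump allocator like the dummy-root path above. */
#include <stdio.h>
#include <stdint.h>

struct dummy_root { uint64_t alloc_bytenr; uint32_t blocksize; };

static uint64_t alloc_test_block(struct dummy_root *root)
{
	uint64_t bytenr = root->alloc_bytenr;

	root->alloc_bytenr += root->blocksize; /* never freed, never reused */
	return bytenr;
}

int main(void)
{
	struct dummy_root root = { .alloc_bytenr = 0, .blocksize = 4096 };

	printf("%llu %llu %llu\n",
	       (unsigned long long)alloc_test_block(&root),
	       (unsigned long long)alloc_test_block(&root),
	       (unsigned long long)alloc_test_block(&root)); /* 0 4096 8192 */
	return 0;
}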
@@ -7735,7 +7992,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7735 } 7992 }
7736 } 7993 }
7737 7994
7738 if (root->in_radix) { 7995 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
7739 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 7996 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7740 } else { 7997 } else {
7741 free_extent_buffer(root->node); 7998 free_extent_buffer(root->node);
@@ -8327,8 +8584,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8327 list_del(&space_info->list); 8584 list_del(&space_info->list);
8328 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8585 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8329 struct kobject *kobj; 8586 struct kobject *kobj;
8330 kobj = &space_info->block_group_kobjs[i]; 8587 kobj = space_info->block_group_kobjs[i];
8331 if (kobj->parent) { 8588 space_info->block_group_kobjs[i] = NULL;
8589 if (kobj) {
8332 kobject_del(kobj); 8590 kobject_del(kobj);
8333 kobject_put(kobj); 8591 kobject_put(kobj);
8334 } 8592 }
@@ -8352,17 +8610,26 @@ static void __link_block_group(struct btrfs_space_info *space_info,
8352 up_write(&space_info->groups_sem); 8610 up_write(&space_info->groups_sem);
8353 8611
8354 if (first) { 8612 if (first) {
8355 struct kobject *kobj = &space_info->block_group_kobjs[index]; 8613 struct raid_kobject *rkobj;
8356 int ret; 8614 int ret;
8357 8615
8358 kobject_get(&space_info->kobj); /* put in release */ 8616 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
8359 ret = kobject_add(kobj, &space_info->kobj, "%s", 8617 if (!rkobj)
8360 get_raid_name(index)); 8618 goto out_err;
8619 rkobj->raid_type = index;
8620 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
8621 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
8622 "%s", get_raid_name(index));
8361 if (ret) { 8623 if (ret) {
8362 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); 8624 kobject_put(&rkobj->kobj);
8363 kobject_put(&space_info->kobj); 8625 goto out_err;
8364 } 8626 }
8627 space_info->block_group_kobjs[index] = &rkobj->kobj;
8365 } 8628 }
8629
8630 return;
8631out_err:
8632 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8366} 8633}
8367 8634
8368static struct btrfs_block_group_cache * 8635static struct btrfs_block_group_cache *
@@ -8611,7 +8878,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8611 8878
8612 extent_root = root->fs_info->extent_root; 8879 extent_root = root->fs_info->extent_root;
8613 8880
8614 root->fs_info->last_trans_log_full_commit = trans->transid; 8881 btrfs_set_log_full_commit(root->fs_info, trans);
8615 8882
8616 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 8883 cache = btrfs_create_block_group_cache(root, chunk_offset, size);
8617 if (!cache) 8884 if (!cache)
@@ -8697,6 +8964,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8697 struct btrfs_root *tree_root = root->fs_info->tree_root; 8964 struct btrfs_root *tree_root = root->fs_info->tree_root;
8698 struct btrfs_key key; 8965 struct btrfs_key key;
8699 struct inode *inode; 8966 struct inode *inode;
8967 struct kobject *kobj = NULL;
8700 int ret; 8968 int ret;
8701 int index; 8969 int index;
8702 int factor; 8970 int factor;
@@ -8796,11 +9064,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8796 */ 9064 */
8797 list_del_init(&block_group->list); 9065 list_del_init(&block_group->list);
8798 if (list_empty(&block_group->space_info->block_groups[index])) { 9066 if (list_empty(&block_group->space_info->block_groups[index])) {
8799 kobject_del(&block_group->space_info->block_group_kobjs[index]); 9067 kobj = block_group->space_info->block_group_kobjs[index];
8800 kobject_put(&block_group->space_info->block_group_kobjs[index]); 9068 block_group->space_info->block_group_kobjs[index] = NULL;
8801 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9069 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8802 } 9070 }
8803 up_write(&block_group->space_info->groups_sem); 9071 up_write(&block_group->space_info->groups_sem);
9072 if (kobj) {
9073 kobject_del(kobj);
9074 kobject_put(kobj);
9075 }
8804 9076
8805 if (block_group->cached == BTRFS_CACHE_STARTED) 9077 if (block_group->cached == BTRFS_CACHE_STARTED)
8806 wait_block_group_cache_done(block_group); 9078 wait_block_group_cache_done(block_group);
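Editor's note: the kobject hunks in this file all follow one pattern, which is part of the sysfs use-after-free fix: detach the pointer while holding groups_sem, then call kobject_del()/kobject_put() only after the semaphore is dropped, since the final put can run a release callback. A generic sketch of that detach-then-release idiom, with a pthread mutex standing in for the rwsem and free() for kobject_put():

/* Editor's sketch of detach-under-lock, release-outside-lock. */
#include <pthread.h>
#include <stdlib.h>

struct space_info {
	pthread_mutex_t lock;
	void *kobjs[8];
};

static void remove_kobj(struct space_info *si, int index)
{
	void *kobj;

	pthread_mutex_lock(&si->lock);
	kobj = si->kobjs[index];   /* detach while protected */
	si->kobjs[index] = NULL;
	pthread_mutex_unlock(&si->lock);

	free(kobj);                /* release runs with the lock dropped */
}

int main(void)
{
	struct space_info si = { .lock = PTHREAD_MUTEX_INITIALIZER };

	si.kobjs[0] = malloc(32);
	remove_kobj(&si, 0);
	return 0;
}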
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4cd0ac983f91..f25a9092b946 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1693,6 +1693,7 @@ again:
1693 * shortening the size of the delalloc range we're searching 1693 * shortening the size of the delalloc range we're searching
1694 */ 1694 */
1695 free_extent_state(cached_state); 1695 free_extent_state(cached_state);
1696 cached_state = NULL;
1696 if (!loops) { 1697 if (!loops) {
1697 max_bytes = PAGE_CACHE_SIZE; 1698 max_bytes = PAGE_CACHE_SIZE;
1698 loops = 1; 1699 loops = 1;
@@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2367 if (!uptodate) { 2368 if (!uptodate) {
2368 ClearPageUptodate(page); 2369 ClearPageUptodate(page);
2369 SetPageError(page); 2370 SetPageError(page);
2371 ret = ret < 0 ? ret : -EIO;
2372 mapping_set_error(page->mapping, ret);
2370 } 2373 }
2371 return 0; 2374 return 0;
2372} 2375}
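Editor's note: the mapping_set_error() addition above latches a writeback failure on the address space so a later fsync() reports it; the `ret = ret < 0 ? ret : -EIO` guard first normalizes any non-negative status to -EIO. A tiny sketch of that normalization rule:

/* Editor's sketch: only negative errnos may be recorded. */
#include <assert.h>
#include <errno.h>

static int normalize_wb_error(int ret)
{
	return ret < 0 ? ret : -EIO; /* 0 and positive both become -EIO */
}

int main(void)
{
	assert(normalize_wb_error(-ENOSPC) == -ENOSPC);
	assert(normalize_wb_error(0) == -EIO);
	assert(normalize_wb_error(1) == -EIO);
	return 0;
}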
@@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,
3098} 3101}
3099 3102
3100/* 3103/*
3101 * the writepage semantics are similar to regular writepage. extent 3104 * helper for __extent_writepage, doing all of the delayed allocation setup.
3102 * records are inserted to lock ranges in the tree, and as dirty areas 3105 *
3103 * are found, they are marked writeback. Then the lock bits are removed 3106 * This returns 1 if our fill_delalloc function did all the work required
3104 * and the end_io handler clears the writeback ranges 3107 * to write the page (copy into inline extent). In this case the IO has
3108 * been started and the page is already unlocked.
3109 *
3110 * This returns 0 if all went well (page still locked)
3111 * This returns < 0 if there were errors (page still locked)
3105 */ 3112 */
3106static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3113static noinline_for_stack int writepage_delalloc(struct inode *inode,
3107 void *data) 3114 struct page *page, struct writeback_control *wbc,
3115 struct extent_page_data *epd,
3116 u64 delalloc_start,
3117 unsigned long *nr_written)
3118{
3119 struct extent_io_tree *tree = epd->tree;
3120 u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
3121 u64 nr_delalloc;
3122 u64 delalloc_to_write = 0;
3123 u64 delalloc_end = 0;
3124 int ret;
3125 int page_started = 0;
3126
3127 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
3128 return 0;
3129
3130 while (delalloc_end < page_end) {
3131 nr_delalloc = find_lock_delalloc_range(inode, tree,
3132 page,
3133 &delalloc_start,
3134 &delalloc_end,
3135 128 * 1024 * 1024);
3136 if (nr_delalloc == 0) {
3137 delalloc_start = delalloc_end + 1;
3138 continue;
3139 }
3140 ret = tree->ops->fill_delalloc(inode, page,
3141 delalloc_start,
3142 delalloc_end,
3143 &page_started,
3144 nr_written);
3145 /* File system has been set read-only */
3146 if (ret) {
3147 SetPageError(page);
 3148 /* fill_delalloc should return < 0 on error, but
 3149 * just in case it didn't, normalize here: > 0 means
 3150 * the IO was started, so we don't want to return > 0
 3151 * unless things are going well.
 3152 */
3153 ret = ret < 0 ? ret : -EIO;
3154 goto done;
3155 }
3156 /*
3157 * delalloc_end is already one less than the total
3158 * length, so we don't subtract one from
3159 * PAGE_CACHE_SIZE
3160 */
3161 delalloc_to_write += (delalloc_end - delalloc_start +
3162 PAGE_CACHE_SIZE) >>
3163 PAGE_CACHE_SHIFT;
3164 delalloc_start = delalloc_end + 1;
3165 }
3166 if (wbc->nr_to_write < delalloc_to_write) {
3167 int thresh = 8192;
3168
3169 if (delalloc_to_write < thresh * 2)
3170 thresh = delalloc_to_write;
3171 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3172 thresh);
3173 }
3174
3175 /* did the fill delalloc function already unlock and start
3176 * the IO?
3177 */
3178 if (page_started) {
3179 /*
3180 * we've unlocked the page, so we can't update
3181 * the mapping's writeback index, just update
3182 * nr_to_write.
3183 */
3184 wbc->nr_to_write -= *nr_written;
3185 return 1;
3186 }
3187
3188 ret = 0;
3189
3190done:
3191 return ret;
3192}
3193
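Editor's note: the helper above turns each locked delalloc byte range into a page count with `(delalloc_end - delalloc_start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT`; because delalloc_end is inclusive, the length is end - start + 1, so adding a whole page before the shift is the usual ceil(length / page_size). A quick user-space check of that arithmetic (4K pages assumed):

/* Editor's sketch: page count for an inclusive [start, end] byte range,
 * mirroring the delalloc_to_write arithmetic above. */
#include <assert.h>
#include <stdint.h>

#define PG_SHIFT 12
#define PG_SIZE  (1UL << PG_SHIFT)

static uint64_t pages_in_range(uint64_t start, uint64_t end_inclusive)
{
	/* end - start + PG_SIZE == (length - 1) + PG_SIZE, i.e. ceil() */
	return (end_inclusive - start + PG_SIZE) >> PG_SHIFT;
}

int main(void)
{
	assert(pages_in_range(0, PG_SIZE - 1) == 1); /* exactly one page */
	assert(pages_in_range(0, PG_SIZE) == 2);     /* one byte into page 2 */
	assert(pages_in_range(0, 0) == 1);           /* a single byte */
	return 0;
}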
3194/*
3195 * helper for __extent_writepage. This calls the writepage start hooks,
3196 * and does the loop to map the page into extents and bios.
3197 *
3198 * We return 1 if the IO is started and the page is unlocked,
3199 * 0 if all went well (page still locked)
3200 * < 0 if there were errors (page still locked)
3201 */
3202static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3203 struct page *page,
3204 struct writeback_control *wbc,
3205 struct extent_page_data *epd,
3206 loff_t i_size,
3207 unsigned long nr_written,
3208 int write_flags, int *nr_ret)
3108{ 3209{
3109 struct inode *inode = page->mapping->host;
3110 struct extent_page_data *epd = data;
3111 struct extent_io_tree *tree = epd->tree; 3210 struct extent_io_tree *tree = epd->tree;
3112 u64 start = page_offset(page); 3211 u64 start = page_offset(page);
3113 u64 delalloc_start;
3114 u64 page_end = start + PAGE_CACHE_SIZE - 1; 3212 u64 page_end = start + PAGE_CACHE_SIZE - 1;
3115 u64 end; 3213 u64 end;
3116 u64 cur = start; 3214 u64 cur = start;
3117 u64 extent_offset; 3215 u64 extent_offset;
3118 u64 last_byte = i_size_read(inode);
3119 u64 block_start; 3216 u64 block_start;
3120 u64 iosize; 3217 u64 iosize;
3121 sector_t sector; 3218 sector_t sector;
3122 struct extent_state *cached_state = NULL; 3219 struct extent_state *cached_state = NULL;
3123 struct extent_map *em; 3220 struct extent_map *em;
3124 struct block_device *bdev; 3221 struct block_device *bdev;
3125 int ret;
3126 int nr = 0;
3127 size_t pg_offset = 0; 3222 size_t pg_offset = 0;
3128 size_t blocksize; 3223 size_t blocksize;
3129 loff_t i_size = i_size_read(inode); 3224 int ret = 0;
3130 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 3225 int nr = 0;
3131 u64 nr_delalloc; 3226 bool compressed;
3132 u64 delalloc_end;
3133 int page_started;
3134 int compressed;
3135 int write_flags;
3136 unsigned long nr_written = 0;
3137 bool fill_delalloc = true;
3138
3139 if (wbc->sync_mode == WB_SYNC_ALL)
3140 write_flags = WRITE_SYNC;
3141 else
3142 write_flags = WRITE;
3143
3144 trace___extent_writepage(page, inode, wbc);
3145
3146 WARN_ON(!PageLocked(page));
3147
3148 ClearPageError(page);
3149
3150 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
3151 if (page->index > end_index ||
3152 (page->index == end_index && !pg_offset)) {
3153 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
3154 unlock_page(page);
3155 return 0;
3156 }
3157
3158 if (page->index == end_index) {
3159 char *userpage;
3160
3161 userpage = kmap_atomic(page);
3162 memset(userpage + pg_offset, 0,
3163 PAGE_CACHE_SIZE - pg_offset);
3164 kunmap_atomic(userpage);
3165 flush_dcache_page(page);
3166 }
3167 pg_offset = 0;
3168
3169 set_page_extent_mapped(page);
3170
3171 if (!tree->ops || !tree->ops->fill_delalloc)
3172 fill_delalloc = false;
3173
3174 delalloc_start = start;
3175 delalloc_end = 0;
3176 page_started = 0;
3177 if (!epd->extent_locked && fill_delalloc) {
3178 u64 delalloc_to_write = 0;
3179 /*
3180 * make sure the wbc mapping index is at least updated
3181 * to this page.
3182 */
3183 update_nr_written(page, wbc, 0);
3184
3185 while (delalloc_end < page_end) {
3186 nr_delalloc = find_lock_delalloc_range(inode, tree,
3187 page,
3188 &delalloc_start,
3189 &delalloc_end,
3190 128 * 1024 * 1024);
3191 if (nr_delalloc == 0) {
3192 delalloc_start = delalloc_end + 1;
3193 continue;
3194 }
3195 ret = tree->ops->fill_delalloc(inode, page,
3196 delalloc_start,
3197 delalloc_end,
3198 &page_started,
3199 &nr_written);
3200 /* File system has been set read-only */
3201 if (ret) {
3202 SetPageError(page);
3203 goto done;
3204 }
3205 /*
3206 * delalloc_end is already one less than the total
3207 * length, so we don't subtract one from
3208 * PAGE_CACHE_SIZE
3209 */
3210 delalloc_to_write += (delalloc_end - delalloc_start +
3211 PAGE_CACHE_SIZE) >>
3212 PAGE_CACHE_SHIFT;
3213 delalloc_start = delalloc_end + 1;
3214 }
3215 if (wbc->nr_to_write < delalloc_to_write) {
3216 int thresh = 8192;
3217
3218 if (delalloc_to_write < thresh * 2)
3219 thresh = delalloc_to_write;
3220 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3221 thresh);
3222 }
3223 3227
3224 /* did the fill delalloc function already unlock and start
3225 * the IO?
3226 */
3227 if (page_started) {
3228 ret = 0;
3229 /*
3230 * we've unlocked the page, so we can't update
3231 * the mapping's writeback index, just update
3232 * nr_to_write.
3233 */
3234 wbc->nr_to_write -= nr_written;
3235 goto done_unlocked;
3236 }
3237 }
3238 if (tree->ops && tree->ops->writepage_start_hook) { 3228 if (tree->ops && tree->ops->writepage_start_hook) {
3239 ret = tree->ops->writepage_start_hook(page, start, 3229 ret = tree->ops->writepage_start_hook(page, start,
3240 page_end); 3230 page_end);
@@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3244 wbc->pages_skipped++; 3234 wbc->pages_skipped++;
3245 else 3235 else
3246 redirty_page_for_writepage(wbc, page); 3236 redirty_page_for_writepage(wbc, page);
3237
3247 update_nr_written(page, wbc, nr_written); 3238 update_nr_written(page, wbc, nr_written);
3248 unlock_page(page); 3239 unlock_page(page);
3249 ret = 0; 3240 ret = 1;
3250 goto done_unlocked; 3241 goto done_unlocked;
3251 } 3242 }
3252 } 3243 }
@@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3258 update_nr_written(page, wbc, nr_written + 1); 3249 update_nr_written(page, wbc, nr_written + 1);
3259 3250
3260 end = page_end; 3251 end = page_end;
3261 if (last_byte <= start) { 3252 if (i_size <= start) {
3262 if (tree->ops && tree->ops->writepage_end_io_hook) 3253 if (tree->ops && tree->ops->writepage_end_io_hook)
3263 tree->ops->writepage_end_io_hook(page, start, 3254 tree->ops->writepage_end_io_hook(page, start,
3264 page_end, NULL, 1); 3255 page_end, NULL, 1);
@@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3268 blocksize = inode->i_sb->s_blocksize; 3259 blocksize = inode->i_sb->s_blocksize;
3269 3260
3270 while (cur <= end) { 3261 while (cur <= end) {
3271 if (cur >= last_byte) { 3262 u64 em_end;
3263 if (cur >= i_size) {
3272 if (tree->ops && tree->ops->writepage_end_io_hook) 3264 if (tree->ops && tree->ops->writepage_end_io_hook)
3273 tree->ops->writepage_end_io_hook(page, cur, 3265 tree->ops->writepage_end_io_hook(page, cur,
3274 page_end, NULL, 1); 3266 page_end, NULL, 1);
@@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3278 end - cur + 1, 1); 3270 end - cur + 1, 1);
3279 if (IS_ERR_OR_NULL(em)) { 3271 if (IS_ERR_OR_NULL(em)) {
3280 SetPageError(page); 3272 SetPageError(page);
3273 ret = PTR_ERR_OR_ZERO(em);
3281 break; 3274 break;
3282 } 3275 }
3283 3276
3284 extent_offset = cur - em->start; 3277 extent_offset = cur - em->start;
3285 BUG_ON(extent_map_end(em) <= cur); 3278 em_end = extent_map_end(em);
3279 BUG_ON(em_end <= cur);
3286 BUG_ON(end < cur); 3280 BUG_ON(end < cur);
3287 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3281 iosize = min(em_end - cur, end - cur + 1);
3288 iosize = ALIGN(iosize, blocksize); 3282 iosize = ALIGN(iosize, blocksize);
3289 sector = (em->block_start + extent_offset) >> 9; 3283 sector = (em->block_start + extent_offset) >> 9;
3290 bdev = em->bdev; 3284 bdev = em->bdev;
@@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3320 pg_offset += iosize; 3314 pg_offset += iosize;
3321 continue; 3315 continue;
3322 } 3316 }
3323 /* leave this out until we have a page_mkwrite call */
3324 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
3325 EXTENT_DIRTY, 0, NULL)) {
3326 cur = cur + iosize;
3327 pg_offset += iosize;
3328 continue;
3329 }
3330 3317
3331 if (tree->ops && tree->ops->writepage_io_hook) { 3318 if (tree->ops && tree->ops->writepage_io_hook) {
3332 ret = tree->ops->writepage_io_hook(page, cur, 3319 ret = tree->ops->writepage_io_hook(page, cur,
@@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3337 if (ret) { 3324 if (ret) {
3338 SetPageError(page); 3325 SetPageError(page);
3339 } else { 3326 } else {
3340 unsigned long max_nr = end_index + 1; 3327 unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
3341 3328
3342 set_range_writeback(tree, cur, cur + iosize - 1); 3329 set_range_writeback(tree, cur, cur + iosize - 1);
3343 if (!PageWriteback(page)) { 3330 if (!PageWriteback(page)) {
@@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3359 nr++; 3346 nr++;
3360 } 3347 }
3361done: 3348done:
3349 *nr_ret = nr;
3350
3351done_unlocked:
3352
3353 /* drop our reference on any cached states */
3354 free_extent_state(cached_state);
3355 return ret;
3356}
3357
3358/*
3359 * the writepage semantics are similar to regular writepage. extent
3360 * records are inserted to lock ranges in the tree, and as dirty areas
3361 * are found, they are marked writeback. Then the lock bits are removed
3362 * and the end_io handler clears the writeback ranges
3363 */
3364static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3365 void *data)
3366{
3367 struct inode *inode = page->mapping->host;
3368 struct extent_page_data *epd = data;
3369 u64 start = page_offset(page);
3370 u64 page_end = start + PAGE_CACHE_SIZE - 1;
3371 int ret;
3372 int nr = 0;
3373 size_t pg_offset = 0;
3374 loff_t i_size = i_size_read(inode);
3375 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
3376 int write_flags;
3377 unsigned long nr_written = 0;
3378
3379 if (wbc->sync_mode == WB_SYNC_ALL)
3380 write_flags = WRITE_SYNC;
3381 else
3382 write_flags = WRITE;
3383
3384 trace___extent_writepage(page, inode, wbc);
3385
3386 WARN_ON(!PageLocked(page));
3387
3388 ClearPageError(page);
3389
3390 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
3391 if (page->index > end_index ||
3392 (page->index == end_index && !pg_offset)) {
3393 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
3394 unlock_page(page);
3395 return 0;
3396 }
3397
3398 if (page->index == end_index) {
3399 char *userpage;
3400
3401 userpage = kmap_atomic(page);
3402 memset(userpage + pg_offset, 0,
3403 PAGE_CACHE_SIZE - pg_offset);
3404 kunmap_atomic(userpage);
3405 flush_dcache_page(page);
3406 }
3407
3408 pg_offset = 0;
3409
3410 set_page_extent_mapped(page);
3411
3412 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
3413 if (ret == 1)
3414 goto done_unlocked;
3415 if (ret)
3416 goto done;
3417
3418 ret = __extent_writepage_io(inode, page, wbc, epd,
3419 i_size, nr_written, write_flags, &nr);
3420 if (ret == 1)
3421 goto done_unlocked;
3422
3423done:
3362 if (nr == 0) { 3424 if (nr == 0) {
3363 /* make sure the mapping tag for page dirty gets cleared */ 3425 /* make sure the mapping tag for page dirty gets cleared */
3364 set_page_writeback(page); 3426 set_page_writeback(page);
3365 end_page_writeback(page); 3427 end_page_writeback(page);
3366 } 3428 }
3429 if (PageError(page)) {
3430 ret = ret < 0 ? ret : -EIO;
3431 end_extent_writepage(page, ret, start, page_end);
3432 }
3367 unlock_page(page); 3433 unlock_page(page);
3434 return ret;
3368 3435
3369done_unlocked: 3436done_unlocked:
3370
3371 /* drop our reference on any cached states */
3372 free_extent_state(cached_state);
3373 return 0; 3437 return 0;
3374} 3438}
3375 3439
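Editor's note: after this split, __extent_writepage coordinates two helpers that share one return convention: 1 means the IO was started and the page is already unlocked, 0 means success with the page still locked, and a negative value is an error with the page still locked. A hedged sketch of the dispatch shape, with stub helpers standing in for the real ones:

/* Editor's sketch of the 1 / 0 / <0 return convention above;
 * the stubs are placeholders, not btrfs code. */
#include <stdio.h>

static int do_delalloc_setup(void) { return 0; } /* 1, 0 or -errno */
static int do_page_io(void)        { return 0; } /* 1, 0 or -errno */

static int writepage(void)
{
	int ret;

	ret = do_delalloc_setup();
	if (ret == 1)
		return 0;   /* helper unlocked the page and started IO */
	if (ret < 0)
		goto done;  /* error: we still own the page lock */

	ret = do_page_io();
	if (ret == 1)
		return 0;   /* again: page already unlocked for us */
done:
	printf("unlocking page, ret=%d\n", ret);
	return ret;
}

int main(void) { return writepage(); }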
@@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3385 TASK_UNINTERRUPTIBLE); 3449 TASK_UNINTERRUPTIBLE);
3386} 3450}
3387 3451
3388static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3452static noinline_for_stack int
3389 struct btrfs_fs_info *fs_info, 3453lock_extent_buffer_for_io(struct extent_buffer *eb,
3390 struct extent_page_data *epd) 3454 struct btrfs_fs_info *fs_info,
3455 struct extent_page_data *epd)
3391{ 3456{
3392 unsigned long i, num_pages; 3457 unsigned long i, num_pages;
3393 int flush = 0; 3458 int flush = 0;
@@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3492 bio_put(bio); 3557 bio_put(bio);
3493} 3558}
3494 3559
3495static int write_one_eb(struct extent_buffer *eb, 3560static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3496 struct btrfs_fs_info *fs_info, 3561 struct btrfs_fs_info *fs_info,
3497 struct writeback_control *wbc, 3562 struct writeback_control *wbc,
3498 struct extent_page_data *epd) 3563 struct extent_page_data *epd)
@@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
3690 struct inode *inode = mapping->host; 3755 struct inode *inode = mapping->host;
3691 int ret = 0; 3756 int ret = 0;
3692 int done = 0; 3757 int done = 0;
3758 int err = 0;
3693 int nr_to_write_done = 0; 3759 int nr_to_write_done = 0;
3694 struct pagevec pvec; 3760 struct pagevec pvec;
3695 int nr_pages; 3761 int nr_pages;
@@ -3776,8 +3842,8 @@ retry:
3776 unlock_page(page); 3842 unlock_page(page);
3777 ret = 0; 3843 ret = 0;
3778 } 3844 }
3779 if (ret) 3845 if (!err && ret < 0)
3780 done = 1; 3846 err = ret;
3781 3847
3782 /* 3848 /*
3783 * the filesystem may choose to bump up nr_to_write. 3849 * the filesystem may choose to bump up nr_to_write.
@@ -3789,7 +3855,7 @@ retry:
3789 pagevec_release(&pvec); 3855 pagevec_release(&pvec);
3790 cond_resched(); 3856 cond_resched();
3791 } 3857 }
3792 if (!scanned && !done) { 3858 if (!scanned && !done && !err) {
3793 /* 3859 /*
3794 * We hit the last page and there is more work to be done: wrap 3860 * We hit the last page and there is more work to be done: wrap
3795 * back to the start of the file 3861 * back to the start of the file
@@ -3799,7 +3865,7 @@ retry:
3799 goto retry; 3865 goto retry;
3800 } 3866 }
3801 btrfs_add_delayed_iput(inode); 3867 btrfs_add_delayed_iput(inode);
3802 return ret; 3868 return err;
3803} 3869}
3804 3870
3805static void flush_epd_write_bio(struct extent_page_data *epd) 3871static void flush_epd_write_bio(struct extent_page_data *epd)
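Editor's note: with the err/ret change above, extent_write_cache_pages keeps writing after a failure instead of aborting at the first bad page; it latches the first negative status in `err` and returns that at the end, and the retry wrap-around is skipped once an error is pending. A minimal sketch of the latch-and-continue loop:

/* Editor's sketch: remember the first error but keep processing. */
#include <stdio.h>

static int write_page(int i)
{
	return (i == 2) ? -5 : 0; /* pretend page 2 fails with -EIO */
}

int main(void)
{
	int err = 0;

	for (int i = 0; i < 5; i++) {
		int ret = write_page(i);

		if (!err && ret < 0)
			err = ret; /* latch the first failure only */
		/* keep going: later pages still get their IO submitted */
	}
	printf("first error: %d\n", err);
	return 0;
}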
@@ -4543,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4543 return NULL; 4609 return NULL;
4544} 4610}
4545 4611
4612#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4613struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4614 u64 start, unsigned long len)
4615{
4616 struct extent_buffer *eb, *exists = NULL;
4617 int ret;
4618
4619 eb = find_extent_buffer(fs_info, start);
4620 if (eb)
4621 return eb;
4622 eb = alloc_dummy_extent_buffer(start, len);
4623 if (!eb)
4624 return NULL;
4625 eb->fs_info = fs_info;
4626again:
4627 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4628 if (ret)
4629 goto free_eb;
4630 spin_lock(&fs_info->buffer_lock);
4631 ret = radix_tree_insert(&fs_info->buffer_radix,
4632 start >> PAGE_CACHE_SHIFT, eb);
4633 spin_unlock(&fs_info->buffer_lock);
4634 radix_tree_preload_end();
4635 if (ret == -EEXIST) {
4636 exists = find_extent_buffer(fs_info, start);
4637 if (exists)
4638 goto free_eb;
4639 else
4640 goto again;
4641 }
4642 check_buffer_tree_ref(eb);
4643 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4644
4645 /*
 4646 * We will free dummy extent buffers if they come into
4647 * free_extent_buffer with a ref count of 2, but if we are using this we
4648 * want the buffers to stay in memory until we're done with them, so
4649 * bump the ref count again.
4650 */
4651 atomic_inc(&eb->refs);
4652 return eb;
4653free_eb:
4654 btrfs_release_extent_buffer(eb);
4655 return exists;
4656}
4657#endif
4658
4546struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4659struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4547 u64 start, unsigned long len) 4660 u64 start, unsigned long len)
4548{ 4661{
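Editor's note: alloc_test_extent_buffer above uses the standard insert-or-lookup dance: try to insert, and on -EEXIST either return the buffer that won the race or retry if it vanished in between. A user-space model of that loop over a toy one-slot table (the table and names are illustrative, not the kernel radix tree API):

/* Editor's sketch of the insert / -EEXIST / retry pattern, with a
 * one-slot "tree" standing in for fs_info->buffer_radix. */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static void *slot; /* the whole "radix tree" */

static void *tree_lookup(void) { return slot; }

static int tree_insert(void *p)
{
	if (slot)
		return -EEXIST;
	slot = p;
	return 0;
}

static void *get_or_create(void)
{
	void *eb = tree_lookup();

	if (eb)
		return eb;
	eb = malloc(64);
	if (!eb)
		return NULL;
again:
	if (tree_insert(eb) == -EEXIST) {
		void *exists = tree_lookup();

		if (exists) {  /* lost the race: free ours, reuse theirs */
			free(eb);
			return exists;
		}
		goto again;    /* winner was freed meanwhile: retry */
	}
	return eb;
}

int main(void)
{
	void *a = get_or_create();
	void *b = get_or_create();

	printf("%s\n", a == b ? "same buffer" : "bug");
	return 0;
}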
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c488b45237bf..8b63f2d46518 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -350,5 +350,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
350 struct extent_io_tree *tree, 350 struct extent_io_tree *tree,
351 struct page *locked_page, u64 *start, 351 struct page *locked_page, u64 *start,
352 u64 *end, u64 max_bytes); 352 u64 *end, u64 max_bytes);
353struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
354 u64 start, unsigned long len);
353#endif 355#endif
354#endif 356#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 127555b29f58..f46cfe45d686 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
281found: 281found:
282 csum += count * csum_size; 282 csum += count * csum_size;
283 nblocks -= count; 283 nblocks -= count;
284 bio_index += count;
284 while (count--) { 285 while (count--) {
285 disk_bytenr += bvec->bv_len; 286 disk_bytenr += bvec->bv_len;
286 offset += bvec->bv_len; 287 offset += bvec->bv_len;
287 bio_index++;
288 bvec++; 288 bvec++;
289 } 289 }
290 } 290 }
@@ -750,7 +750,7 @@ again:
750 int slot = path->slots[0] + 1; 750 int slot = path->slots[0] + 1;
751 /* we didn't find a csum item, insert one */ 751 /* we didn't find a csum item, insert one */
752 nritems = btrfs_header_nritems(path->nodes[0]); 752 nritems = btrfs_header_nritems(path->nodes[0]);
753 if (path->slots[0] >= nritems - 1) { 753 if (!nritems || (path->slots[0] >= nritems - 1)) {
754 ret = btrfs_next_leaf(root, path); 754 ret = btrfs_next_leaf(root, path);
755 if (ret == 1) 755 if (ret == 1)
756 found_next = 1; 756 found_next = 1;
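Editor's note: the `!nritems ||` guard above matters because nritems is unsigned; on an empty leaf, `nritems - 1` wraps to a huge value, `path->slots[0] >= nritems - 1` is never true, and the next-leaf step would be skipped. A two-assert demonstration of the wraparound:

/* Editor's sketch: why `slot >= nritems - 1` misbehaves at nritems == 0. */
#include <assert.h>

int main(void)
{
	unsigned int nritems = 0;
	unsigned int slot = 0;

	/* 0u - 1 wraps to UINT_MAX, so the old check is false... */
	assert(!(slot >= nritems - 1));
	/* ...while the fixed check catches the empty leaf explicitly. */
	assert(nritems == 0 || slot >= nritems - 1);
	return 0;
}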
@@ -885,3 +885,79 @@ out:
885fail_unlock: 885fail_unlock:
886 goto out; 886 goto out;
887} 887}
888
889void btrfs_extent_item_to_extent_map(struct inode *inode,
890 const struct btrfs_path *path,
891 struct btrfs_file_extent_item *fi,
892 const bool new_inline,
893 struct extent_map *em)
894{
895 struct btrfs_root *root = BTRFS_I(inode)->root;
896 struct extent_buffer *leaf = path->nodes[0];
897 const int slot = path->slots[0];
898 struct btrfs_key key;
899 u64 extent_start, extent_end;
900 u64 bytenr;
901 u8 type = btrfs_file_extent_type(leaf, fi);
902 int compress_type = btrfs_file_extent_compression(leaf, fi);
903
904 em->bdev = root->fs_info->fs_devices->latest_bdev;
905 btrfs_item_key_to_cpu(leaf, &key, slot);
906 extent_start = key.offset;
907
908 if (type == BTRFS_FILE_EXTENT_REG ||
909 type == BTRFS_FILE_EXTENT_PREALLOC) {
910 extent_end = extent_start +
911 btrfs_file_extent_num_bytes(leaf, fi);
912 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
913 size_t size;
914 size = btrfs_file_extent_inline_len(leaf, slot, fi);
915 extent_end = ALIGN(extent_start + size, root->sectorsize);
916 }
917
918 em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
919 if (type == BTRFS_FILE_EXTENT_REG ||
920 type == BTRFS_FILE_EXTENT_PREALLOC) {
921 em->start = extent_start;
922 em->len = extent_end - extent_start;
923 em->orig_start = extent_start -
924 btrfs_file_extent_offset(leaf, fi);
925 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
926 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
927 if (bytenr == 0) {
928 em->block_start = EXTENT_MAP_HOLE;
929 return;
930 }
931 if (compress_type != BTRFS_COMPRESS_NONE) {
932 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
933 em->compress_type = compress_type;
934 em->block_start = bytenr;
935 em->block_len = em->orig_block_len;
936 } else {
937 bytenr += btrfs_file_extent_offset(leaf, fi);
938 em->block_start = bytenr;
939 em->block_len = em->len;
940 if (type == BTRFS_FILE_EXTENT_PREALLOC)
941 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
942 }
943 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
944 em->block_start = EXTENT_MAP_INLINE;
945 em->start = extent_start;
946 em->len = extent_end - extent_start;
947 /*
948 * Initialize orig_start and block_len with the same values
949 * as in inode.c:btrfs_get_extent().
950 */
951 em->orig_start = EXTENT_MAP_HOLE;
952 em->block_len = (u64)-1;
953 if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
954 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
955 em->compress_type = compress_type;
956 }
957 } else {
958 btrfs_err(root->fs_info,
959 "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
960 type, btrfs_ino(inode), extent_start,
961 root->root_key.objectid);
962 }
963}
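Editor's note: the new helper centralizes a translation that inode.c previously open-coded: regular and preallocated extents map to a disk byte range (or a hole when disk_bytenr is 0), while inline extents map to EXTENT_MAP_INLINE with their end rounded up to the sector size. A heavily simplified sketch of that decision tree, using made-up flat structs instead of the leaf accessors and ignoring compression:

/* Editor's sketch of btrfs_extent_item_to_extent_map's branching. */
#include <stdio.h>
#include <stdint.h>

#define MAP_HOLE   ((uint64_t)-3)
#define MAP_INLINE ((uint64_t)-2)

enum ext_type { EXT_REG, EXT_PREALLOC, EXT_INLINE };

struct file_extent { enum ext_type type; uint64_t disk_bytenr, num_bytes; };
struct ext_map     { uint64_t block_start, len; };

static void item_to_map(const struct file_extent *fi, uint64_t sectorsize,
			struct ext_map *em)
{
	if (fi->type == EXT_REG || fi->type == EXT_PREALLOC) {
		em->len = fi->num_bytes;
		/* disk_bytenr == 0 encodes a hole, not a real location */
		em->block_start = fi->disk_bytenr ? fi->disk_bytenr : MAP_HOLE;
	} else {
		em->block_start = MAP_INLINE;
		/* inline data ends sector-aligned, like the ALIGN() above */
		em->len = (fi->num_bytes + sectorsize - 1) & ~(sectorsize - 1);
	}
}

int main(void)
{
	struct file_extent inl = { EXT_INLINE, 0, 100 };
	struct ext_map em;

	item_to_map(&inl, 4096, &em);
	printf("inline len rounds to %llu\n", (unsigned long long)em.len);
	return 0;
}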
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 74272a3f9d9b..e472441feb5d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,7 @@
40#include "tree-log.h" 40#include "tree-log.h"
41#include "locking.h" 41#include "locking.h"
42#include "volumes.h" 42#include "volumes.h"
43#include "qgroup.h"
43 44
44static struct kmem_cache *btrfs_inode_defrag_cachep; 45static struct kmem_cache *btrfs_inode_defrag_cachep;
45/* 46/*
@@ -715,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
715 int recow; 716 int recow;
716 int ret; 717 int ret;
717 int modify_tree = -1; 718 int modify_tree = -1;
718 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 719 int update_refs;
719 int found = 0; 720 int found = 0;
720 int leafs_visited = 0; 721 int leafs_visited = 0;
721 722
@@ -725,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
725 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) 726 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
726 modify_tree = 0; 727 modify_tree = 0;
727 728
729 update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
730 root == root->fs_info->tree_root);
728 while (1) { 731 while (1) {
729 recow = 0; 732 recow = 0;
730 ret = btrfs_lookup_file_extent(trans, root, path, ino, 733 ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -781,6 +784,18 @@ next_slot:
781 extent_end = search_start; 784 extent_end = search_start;
782 } 785 }
783 786
787 /*
788 * Don't skip extent items representing 0 byte lengths. They
 789 * used to be created (a bug) when punching holes hit an
 790 * -ENOSPC condition. So if we find one here, just ensure we
 791 * delete it, otherwise we would insert a new file extent item
 792 * with the same key (offset) as that zero-length file
793 * extent item in the call to setup_items_for_insert() later
794 * in this function.
795 */
796 if (extent_end == key.offset && extent_end >= search_start)
797 goto delete_extent_item;
798
784 if (extent_end <= search_start) { 799 if (extent_end <= search_start) {
785 path->slots[0]++; 800 path->slots[0]++;
786 goto next_slot; 801 goto next_slot;
@@ -836,7 +851,7 @@ next_slot:
836 disk_bytenr, num_bytes, 0, 851 disk_bytenr, num_bytes, 0,
837 root->root_key.objectid, 852 root->root_key.objectid,
838 new_key.objectid, 853 new_key.objectid,
839 start - extent_offset, 0); 854 start - extent_offset, 1);
840 BUG_ON(ret); /* -ENOMEM */ 855 BUG_ON(ret); /* -ENOMEM */
841 } 856 }
842 key.offset = start; 857 key.offset = start;
@@ -894,6 +909,7 @@ next_slot:
894 * | ------ extent ------ | 909 * | ------ extent ------ |
895 */ 910 */
896 if (start <= key.offset && end >= extent_end) { 911 if (start <= key.offset && end >= extent_end) {
912delete_extent_item:
897 if (del_nr == 0) { 913 if (del_nr == 0) {
898 del_slot = path->slots[0]; 914 del_slot = path->slots[0];
899 del_nr = 1; 915 del_nr = 1;
@@ -1192,7 +1208,7 @@ again:
1192 1208
1193 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 1209 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1194 root->root_key.objectid, 1210 root->root_key.objectid,
1195 ino, orig_offset, 0); 1211 ino, orig_offset, 1);
1196 BUG_ON(ret); /* -ENOMEM */ 1212 BUG_ON(ret); /* -ENOMEM */
1197 1213
1198 if (split == start) { 1214 if (split == start) {
@@ -2010,8 +2026,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2010 if (!full_sync) { 2026 if (!full_sync) {
2011 ret = btrfs_wait_ordered_range(inode, start, 2027 ret = btrfs_wait_ordered_range(inode, start,
2012 end - start + 1); 2028 end - start + 1);
2013 if (ret) 2029 if (ret) {
2030 btrfs_end_transaction(trans, root);
2014 goto out; 2031 goto out;
2032 }
2015 } 2033 }
2016 ret = btrfs_commit_transaction(trans, root); 2034 ret = btrfs_commit_transaction(trans, root);
2017 } else { 2035 } else {
@@ -2169,6 +2187,37 @@ out:
2169 return 0; 2187 return 0;
2170} 2188}
2171 2189
2190/*
2191 * Find a hole extent on given inode and change start/len to the end of hole
2192 * extent.(hole/vacuum extent whose em->start <= start &&
2193 * em->start + em->len > start)
2194 * When a hole extent is found, return 1 and modify start/len.
2195 */
2196static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2197{
2198 struct extent_map *em;
2199 int ret = 0;
2200
2201 em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
2202 if (IS_ERR_OR_NULL(em)) {
2203 if (!em)
2204 ret = -ENOMEM;
2205 else
2206 ret = PTR_ERR(em);
2207 return ret;
2208 }
2209
 2210 /* Hole or vacuum extent (only exists in no-holes mode) */
2211 if (em->block_start == EXTENT_MAP_HOLE) {
2212 ret = 1;
2213 *len = em->start + em->len > *start + *len ?
2214 0 : *start + *len - em->start - em->len;
2215 *start = em->start + em->len;
2216 }
2217 free_extent_map(em);
2218 return ret;
2219}
2220
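Editor's note: when find_first_non_hole lands in a hole, the update is pure interval arithmetic: the new start is the hole's end (em->start + em->len), and the remaining length is whatever part of the original [start, start + len) range extends past that end, clamped to 0. A user-space check of that math:

/* Editor's sketch of the start/len adjustment in find_first_non_hole. */
#include <assert.h>
#include <stdint.h>

static void skip_hole(uint64_t hole_start, uint64_t hole_len,
		      uint64_t *start, uint64_t *len)
{
	uint64_t hole_end = hole_start + hole_len;

	/* hole swallows the whole range -> nothing left to punch */
	*len = (hole_end > *start + *len) ? 0 : *start + *len - hole_end;
	*start = hole_end;
}

int main(void)
{
	uint64_t start = 1000, len = 500;

	skip_hole(900, 200, &start, &len);   /* hole [900,1100) covers head */
	assert(start == 1100 && len == 400); /* 400 bytes remain past it */

	skip_hole(1100, 4096, &start, &len); /* hole covers everything left */
	assert(start == 5196 && len == 0);
	return 0;
}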
2172static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 2221static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2173{ 2222{
2174 struct btrfs_root *root = BTRFS_I(inode)->root; 2223 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2176,25 +2225,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2176 struct btrfs_path *path; 2225 struct btrfs_path *path;
2177 struct btrfs_block_rsv *rsv; 2226 struct btrfs_block_rsv *rsv;
2178 struct btrfs_trans_handle *trans; 2227 struct btrfs_trans_handle *trans;
2179 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2228 u64 lockstart;
2180 u64 lockend = round_down(offset + len, 2229 u64 lockend;
2181 BTRFS_I(inode)->root->sectorsize) - 1; 2230 u64 tail_start;
2182 u64 cur_offset = lockstart; 2231 u64 tail_len;
2232 u64 orig_start = offset;
2233 u64 cur_offset;
2183 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 2234 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2184 u64 drop_end; 2235 u64 drop_end;
2185 int ret = 0; 2236 int ret = 0;
2186 int err = 0; 2237 int err = 0;
2187 int rsv_count; 2238 int rsv_count;
2188 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2239 bool same_page;
2189 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2190 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2240 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2191 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2241 u64 ino_size;
2192 2242
2193 ret = btrfs_wait_ordered_range(inode, offset, len); 2243 ret = btrfs_wait_ordered_range(inode, offset, len);
2194 if (ret) 2244 if (ret)
2195 return ret; 2245 return ret;
2196 2246
2197 mutex_lock(&inode->i_mutex); 2247 mutex_lock(&inode->i_mutex);
2248 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2249 ret = find_first_non_hole(inode, &offset, &len);
2250 if (ret < 0)
2251 goto out_only_mutex;
2252 if (ret && !len) {
2253 /* Already in a large hole */
2254 ret = 0;
2255 goto out_only_mutex;
2256 }
2257
 2258 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2259 lockend = round_down(offset + len,
2260 BTRFS_I(inode)->root->sectorsize) - 1;
2261 same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2262 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2263
2198 /* 2264 /*
2199 * We needn't truncate any page which is beyond the end of the file 2265 * We needn't truncate any page which is beyond the end of the file
2200 * because we are sure there is no data there. 2266 * because we are sure there is no data there.
@@ -2206,8 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2206 if (same_page && len < PAGE_CACHE_SIZE) { 2272 if (same_page && len < PAGE_CACHE_SIZE) {
2207 if (offset < ino_size) 2273 if (offset < ino_size)
2208 ret = btrfs_truncate_page(inode, offset, len, 0); 2274 ret = btrfs_truncate_page(inode, offset, len, 0);
2209 mutex_unlock(&inode->i_mutex); 2275 goto out_only_mutex;
2210 return ret;
2211 } 2276 }
2212 2277
2213 /* zero back part of the first page */ 2278 /* zero back part of the first page */
@@ -2219,12 +2284,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2219 } 2284 }
2220 } 2285 }
2221 2286
2222 /* zero the front end of the last page */ 2287 /* Check the aligned pages after the first unaligned page;
2223 if (offset + len < ino_size) { 2288 * if offset != orig_start, the first unaligned page and
2224 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2289 * several following pages are already inside holes,
2225 if (ret) { 2290 * so the extra check can be skipped. */
2226 mutex_unlock(&inode->i_mutex); 2291 if (offset == orig_start) {
2227 return ret; 2292 /* after truncate page, check hole again */
2293 len = offset + len - lockstart;
2294 offset = lockstart;
2295 ret = find_first_non_hole(inode, &offset, &len);
2296 if (ret < 0)
2297 goto out_only_mutex;
2298 if (ret && !len) {
2299 ret = 0;
2300 goto out_only_mutex;
2301 }
2302 lockstart = offset;
2303 }
2304
 2305 /* Check whether the unaligned tail part lies in a hole */
2306 tail_start = lockend + 1;
2307 tail_len = offset + len - tail_start;
2308 if (tail_len) {
2309 ret = find_first_non_hole(inode, &tail_start, &tail_len);
2310 if (unlikely(ret < 0))
2311 goto out_only_mutex;
2312 if (!ret) {
2313 /* zero the front end of the last page */
2314 if (tail_start + tail_len < ino_size) {
2315 ret = btrfs_truncate_page(inode,
2316 tail_start + tail_len, 0, 1);
2317 if (ret)
2318 goto out_only_mutex;
2319 }
2228 } 2320 }
2229 } 2321 }
2230 2322
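Editor's note: the lock range above is the sector-aligned interior of the punch: lockstart rounds the offset up, lockend rounds offset + len down and backs off one byte to stay inclusive, and whatever sits beyond lockend + 1 is the unaligned tail that gets the extra hole check. A quick model of that slicing (4K sectors assumed):

/* Editor's sketch of the lockstart/lockend/tail arithmetic above. */
#include <assert.h>
#include <stdint.h>

#define SECTOR 4096ULL

static uint64_t rnd_up(uint64_t x)   { return (x + SECTOR - 1) & ~(SECTOR - 1); }
static uint64_t rnd_down(uint64_t x) { return x & ~(SECTOR - 1); }

int main(void)
{
	uint64_t offset = 5000, len = 10000;             /* punch [5000, 15000) */
	uint64_t lockstart = rnd_up(offset);             /* 8192 */
	uint64_t lockend = rnd_down(offset + len) - 1;   /* 12287, inclusive */
	uint64_t tail_start = lockend + 1;               /* 12288 */
	uint64_t tail_len = offset + len - tail_start;   /* unaligned tail */

	assert(lockstart == 8192);
	assert(lockend == 12287);
	assert(tail_len == 2712);
	return 0;
}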
@@ -2250,9 +2342,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2250 if ((!ordered || 2342 if ((!ordered ||
2251 (ordered->file_offset + ordered->len <= lockstart || 2343 (ordered->file_offset + ordered->len <= lockstart ||
2252 ordered->file_offset > lockend)) && 2344 ordered->file_offset > lockend)) &&
2253 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, 2345 !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
2254 lockend, EXTENT_UPTODATE, 0,
2255 cached_state)) {
2256 if (ordered) 2346 if (ordered)
2257 btrfs_put_ordered_extent(ordered); 2347 btrfs_put_ordered_extent(ordered);
2258 break; 2348 break;
@@ -2300,6 +2390,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2300 BUG_ON(ret); 2390 BUG_ON(ret);
2301 trans->block_rsv = rsv; 2391 trans->block_rsv = rsv;
2302 2392
2393 cur_offset = lockstart;
2394 len = lockend - cur_offset;
2303 while (cur_offset < lockend) { 2395 while (cur_offset < lockend) {
2304 ret = __btrfs_drop_extents(trans, root, inode, path, 2396 ret = __btrfs_drop_extents(trans, root, inode, path,
2305 cur_offset, lockend + 1, 2397 cur_offset, lockend + 1,
@@ -2340,6 +2432,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2340 rsv, min_size); 2432 rsv, min_size);
2341 BUG_ON(ret); /* shouldn't happen */ 2433 BUG_ON(ret); /* shouldn't happen */
2342 trans->block_rsv = rsv; 2434 trans->block_rsv = rsv;
2435
2436 ret = find_first_non_hole(inode, &cur_offset, &len);
2437 if (unlikely(ret < 0))
2438 break;
2439 if (ret && !len) {
2440 ret = 0;
2441 break;
2442 }
2343 } 2443 }
2344 2444
2345 if (ret) { 2445 if (ret) {
@@ -2348,7 +2448,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2348 } 2448 }
2349 2449
2350 trans->block_rsv = &root->fs_info->trans_block_rsv; 2450 trans->block_rsv = &root->fs_info->trans_block_rsv;
2351 if (cur_offset < ino_size) { 2451 /*
2452 * Don't insert file hole extent item if it's for a range beyond eof
 2453 * (because it's useless) or if it represents a 0-byte range (when
2454 * cur_offset == drop_end).
2455 */
2456 if (cur_offset < ino_size && cur_offset < drop_end) {
2352 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2457 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2353 if (ret) { 2458 if (ret) {
2354 err = ret; 2459 err = ret;
@@ -2373,6 +2478,7 @@ out_free:
2373out: 2478out:
2374 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2479 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2375 &cached_state, GFP_NOFS); 2480 &cached_state, GFP_NOFS);
2481out_only_mutex:
2376 mutex_unlock(&inode->i_mutex); 2482 mutex_unlock(&inode->i_mutex);
2377 if (ret && !err) 2483 if (ret && !err)
2378 err = ret; 2484 err = ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 73f3de7a083c..372b05ff1943 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -831,7 +831,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
831 831
832 if (!matched) { 832 if (!matched) {
833 __btrfs_remove_free_space_cache(ctl); 833 __btrfs_remove_free_space_cache(ctl);
834 btrfs_err(fs_info, "block group %llu has wrong amount of free space", 834 btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
835 block_group->key.objectid); 835 block_group->key.objectid);
836 ret = -1; 836 ret = -1;
837 } 837 }
@@ -843,7 +843,7 @@ out:
843 spin_unlock(&block_group->lock); 843 spin_unlock(&block_group->lock);
844 ret = 0; 844 ret = 0;
845 845
846 btrfs_err(fs_info, "failed to load free space cache for block group %llu", 846 btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
847 block_group->key.objectid); 847 block_group->key.objectid);
848 } 848 }
849 849
@@ -851,90 +851,44 @@ out:
851 return ret; 851 return ret;
852} 852}
853 853
854/** 854static noinline_for_stack
855 * __btrfs_write_out_cache - write out cached info to an inode 855int write_cache_extent_entries(struct io_ctl *io_ctl,
856 * @root - the root the inode belongs to 856 struct btrfs_free_space_ctl *ctl,
857 * @ctl - the free space cache we are going to write out 857 struct btrfs_block_group_cache *block_group,
858 * @block_group - the block_group for this cache if it belongs to a block_group 858 int *entries, int *bitmaps,
859 * @trans - the trans handle 859 struct list_head *bitmap_list)
860 * @path - the path to use
861 * @offset - the offset for the key we'll insert
862 *
863 * This function writes out a free space cache struct to disk for quick recovery
 864 * on mount. This will return 0 if it was successful in writing the cache out,
865 * and -1 if it was not.
866 */
867static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
868 struct btrfs_free_space_ctl *ctl,
869 struct btrfs_block_group_cache *block_group,
870 struct btrfs_trans_handle *trans,
871 struct btrfs_path *path, u64 offset)
872{ 860{
873 struct btrfs_free_space_header *header;
874 struct extent_buffer *leaf;
875 struct rb_node *node;
876 struct list_head *pos, *n;
877 struct extent_state *cached_state = NULL;
878 struct btrfs_free_cluster *cluster = NULL;
879 struct extent_io_tree *unpin = NULL;
880 struct io_ctl io_ctl;
881 struct list_head bitmap_list;
882 struct btrfs_key key;
883 u64 start, extent_start, extent_end, len;
884 int entries = 0;
885 int bitmaps = 0;
886 int ret; 861 int ret;
887 int err = -1; 862 struct btrfs_free_cluster *cluster = NULL;
888 863 struct rb_node *node = rb_first(&ctl->free_space_offset);
889 INIT_LIST_HEAD(&bitmap_list);
890
891 if (!i_size_read(inode))
892 return -1;
893
894 ret = io_ctl_init(&io_ctl, inode, root);
895 if (ret)
896 return -1;
897 864
898 /* Get the cluster for this block_group if it exists */ 865 /* Get the cluster for this block_group if it exists */
899 if (block_group && !list_empty(&block_group->cluster_list)) 866 if (block_group && !list_empty(&block_group->cluster_list)) {
900 cluster = list_entry(block_group->cluster_list.next, 867 cluster = list_entry(block_group->cluster_list.next,
901 struct btrfs_free_cluster, 868 struct btrfs_free_cluster,
902 block_group_list); 869 block_group_list);
870 }
903 871
904 /* Lock all pages first so we can lock the extent safely. */
905 io_ctl_prepare_pages(&io_ctl, inode, 0);
906
907 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
908 0, &cached_state);
909
910 node = rb_first(&ctl->free_space_offset);
911 if (!node && cluster) { 872 if (!node && cluster) {
912 node = rb_first(&cluster->root); 873 node = rb_first(&cluster->root);
913 cluster = NULL; 874 cluster = NULL;
914 } 875 }
915 876
916 /* Make sure we can fit our crcs into the first page */
917 if (io_ctl.check_crcs &&
918 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
919 goto out_nospc;
920
921 io_ctl_set_generation(&io_ctl, trans->transid);
922
923 /* Write out the extent entries */ 877 /* Write out the extent entries */
924 while (node) { 878 while (node) {
925 struct btrfs_free_space *e; 879 struct btrfs_free_space *e;
926 880
927 e = rb_entry(node, struct btrfs_free_space, offset_index); 881 e = rb_entry(node, struct btrfs_free_space, offset_index);
928 entries++; 882 *entries += 1;
929 883
930 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, 884 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
931 e->bitmap); 885 e->bitmap);
932 if (ret) 886 if (ret)
933 goto out_nospc; 887 goto fail;
934 888
935 if (e->bitmap) { 889 if (e->bitmap) {
936 list_add_tail(&e->list, &bitmap_list); 890 list_add_tail(&e->list, bitmap_list);
937 bitmaps++; 891 *bitmaps += 1;
938 } 892 }
939 node = rb_next(node); 893 node = rb_next(node);
940 if (!node && cluster) { 894 if (!node && cluster) {
@@ -942,13 +896,84 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
942 cluster = NULL; 896 cluster = NULL;
943 } 897 }
944 } 898 }
899 return 0;
900fail:
901 return -ENOSPC;
902}
903
904static noinline_for_stack int
905update_cache_item(struct btrfs_trans_handle *trans,
906 struct btrfs_root *root,
907 struct inode *inode,
908 struct btrfs_path *path, u64 offset,
909 int entries, int bitmaps)
910{
911 struct btrfs_key key;
912 struct btrfs_free_space_header *header;
913 struct extent_buffer *leaf;
914 int ret;
915
916 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
917 key.offset = offset;
918 key.type = 0;
919
920 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
921 if (ret < 0) {
922 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
923 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
924 GFP_NOFS);
925 goto fail;
926 }
927 leaf = path->nodes[0];
928 if (ret > 0) {
929 struct btrfs_key found_key;
930 ASSERT(path->slots[0]);
931 path->slots[0]--;
932 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
933 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
934 found_key.offset != offset) {
935 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
936 inode->i_size - 1,
937 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
938 NULL, GFP_NOFS);
939 btrfs_release_path(path);
940 goto fail;
941 }
942 }
943
944 BTRFS_I(inode)->generation = trans->transid;
945 header = btrfs_item_ptr(leaf, path->slots[0],
946 struct btrfs_free_space_header);
947 btrfs_set_free_space_entries(leaf, header, entries);
948 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
949 btrfs_set_free_space_generation(leaf, header, trans->transid);
950 btrfs_mark_buffer_dirty(leaf);
951 btrfs_release_path(path);
952
953 return 0;
954
955fail:
956 return -1;
957}
958
959static noinline_for_stack int
960add_ioctl_entries(struct btrfs_root *root,
961 struct inode *inode,
962 struct btrfs_block_group_cache *block_group,
963 struct io_ctl *io_ctl,
964 struct extent_state **cached_state,
965 struct list_head *bitmap_list,
966 int *entries)
967{
968 u64 start, extent_start, extent_end, len;
969 struct list_head *pos, *n;
970 struct extent_io_tree *unpin = NULL;
971 int ret;
945 972
946 /* 973 /*
947 * We want to add any pinned extents to our free space cache 974 * We want to add any pinned extents to our free space cache
948 * so we don't leak the space 975 * so we don't leak the space
949 */ 976 *
950
951 /*
952 * We shouldn't have switched the pinned extents yet so this is the 977 * We shouldn't have switched the pinned extents yet so this is the
953 * right one 978 * right one
954 */ 979 */
@@ -977,8 +1002,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
977 block_group->key.offset, extent_end + 1); 1002 block_group->key.offset, extent_end + 1);
978 len = extent_end - extent_start; 1003 len = extent_end - extent_start;
979 1004
980 entries++; 1005 *entries += 1;
981 ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); 1006 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
982 if (ret) 1007 if (ret)
983 goto out_nospc; 1008 goto out_nospc;
984 1009
@@ -986,74 +1011,129 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
986 } 1011 }
987 1012
988 /* Write out the bitmaps */ 1013 /* Write out the bitmaps */
989 list_for_each_safe(pos, n, &bitmap_list) { 1014 list_for_each_safe(pos, n, bitmap_list) {
990 struct btrfs_free_space *entry = 1015 struct btrfs_free_space *entry =
991 list_entry(pos, struct btrfs_free_space, list); 1016 list_entry(pos, struct btrfs_free_space, list);
992 1017
993 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); 1018 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
994 if (ret) 1019 if (ret)
995 goto out_nospc; 1020 goto out_nospc;
996 list_del_init(&entry->list); 1021 list_del_init(&entry->list);
997 } 1022 }
998 1023
999 /* Zero out the rest of the pages just to make sure */ 1024 /* Zero out the rest of the pages just to make sure */
1000 io_ctl_zero_remaining_pages(&io_ctl); 1025 io_ctl_zero_remaining_pages(io_ctl);
1001 1026
1002 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 1027 ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1003 0, i_size_read(inode), &cached_state); 1028 0, i_size_read(inode), cached_state);
1004 io_ctl_drop_pages(&io_ctl); 1029 io_ctl_drop_pages(io_ctl);
1005 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1030 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1006 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 1031 i_size_read(inode) - 1, cached_state, GFP_NOFS);
1007 1032
1008 if (ret) 1033 if (ret)
1009 goto out; 1034 goto fail;
1010 1035
1011 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1036 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1012 if (ret) { 1037 if (ret) {
1013 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1038 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1014 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 1039 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1015 GFP_NOFS); 1040 GFP_NOFS);
1016 goto out; 1041 goto fail;
1017 } 1042 }
1043 return 0;
1018 1044
1019 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1045fail:
1020 key.offset = offset; 1046 return -1;
1021 key.type = 0;
1022 1047
1023 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1048out_nospc:
1024 if (ret < 0) { 1049 return -ENOSPC;
1025 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1050}
1026 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 1051
1027 GFP_NOFS); 1052static void noinline_for_stack
1028 goto out; 1053cleanup_write_cache_enospc(struct inode *inode,
1029 } 1054 struct io_ctl *io_ctl,
1030 leaf = path->nodes[0]; 1055 struct extent_state **cached_state,
1031 if (ret > 0) { 1056 struct list_head *bitmap_list)
1032 struct btrfs_key found_key; 1057{
1033 ASSERT(path->slots[0]); 1058 struct list_head *pos, *n;
1034 path->slots[0]--; 1059 list_for_each_safe(pos, n, bitmap_list) {
1035 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1060 struct btrfs_free_space *entry =
1036 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1061 list_entry(pos, struct btrfs_free_space, list);
1037 found_key.offset != offset) { 1062 list_del_init(&entry->list);
1038 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1039 inode->i_size - 1,
1040 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
1041 NULL, GFP_NOFS);
1042 btrfs_release_path(path);
1043 goto out;
1044 }
1045 } 1063 }
1064 io_ctl_drop_pages(io_ctl);
1065 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1066 i_size_read(inode) - 1, cached_state,
1067 GFP_NOFS);
1068}
1046 1069
1047 BTRFS_I(inode)->generation = trans->transid; 1070/**
1048 header = btrfs_item_ptr(leaf, path->slots[0], 1071 * __btrfs_write_out_cache - write out cached info to an inode
1049 struct btrfs_free_space_header); 1072 * @root - the root the inode belongs to
1050 btrfs_set_free_space_entries(leaf, header, entries); 1073 * @ctl - the free space cache we are going to write out
1051 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1074 * @block_group - the block_group for this cache if it belongs to a block_group
1052 btrfs_set_free_space_generation(leaf, header, trans->transid); 1075 * @trans - the trans handle
1053 btrfs_mark_buffer_dirty(leaf); 1076 * @path - the path to use
1054 btrfs_release_path(path); 1077 * @offset - the offset for the key we'll insert
1078 *
1079 * This function writes out a free space cache struct to disk for quick recovery
 1080 * on mount. This will return 0 if it was successful in writing the cache out,
1081 * and -1 if it was not.
1082 */
1083static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1084 struct btrfs_free_space_ctl *ctl,
1085 struct btrfs_block_group_cache *block_group,
1086 struct btrfs_trans_handle *trans,
1087 struct btrfs_path *path, u64 offset)
1088{
1089 struct extent_state *cached_state = NULL;
1090 struct io_ctl io_ctl;
1091 struct list_head bitmap_list;
1092 int entries = 0;
1093 int bitmaps = 0;
1094 int ret;
1095 int err = -1;
1096
1097 INIT_LIST_HEAD(&bitmap_list);
1098
1099 if (!i_size_read(inode))
1100 return -1;
1101
1102 ret = io_ctl_init(&io_ctl, inode, root);
1103 if (ret)
1104 return -1;
1105
1106 /* Lock all pages first so we can lock the extent safely. */
1107 io_ctl_prepare_pages(&io_ctl, inode, 0);
1108
1109 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1110 0, &cached_state);
1111
1112
1113 /* Make sure we can fit our crcs into the first page */
1114 if (io_ctl.check_crcs &&
1115 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
1116 goto out_nospc;
1117
1118 io_ctl_set_generation(&io_ctl, trans->transid);
1119
1120 ret = write_cache_extent_entries(&io_ctl, ctl,
1121 block_group, &entries, &bitmaps,
1122 &bitmap_list);
1123 if (ret)
1124 goto out_nospc;
1125
1126 ret = add_ioctl_entries(root, inode, block_group, &io_ctl,
1127 &cached_state, &bitmap_list, &entries);
1128
1129 if (ret == -ENOSPC)
1130 goto out_nospc;
1131 else if (ret)
1132 goto out;
1133
1134 err = update_cache_item(trans, root, inode, path, offset,
1135 entries, bitmaps);
1055 1136
1056 err = 0;
1057out: 1137out:
1058 io_ctl_free(&io_ctl); 1138 io_ctl_free(&io_ctl);
1059 if (err) { 1139 if (err) {
@@ -1064,14 +1144,8 @@ out:
1064 return err; 1144 return err;
1065 1145
1066out_nospc: 1146out_nospc:
1067 list_for_each_safe(pos, n, &bitmap_list) { 1147
1068 struct btrfs_free_space *entry = 1148 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
1069 list_entry(pos, struct btrfs_free_space, list);
1070 list_del_init(&entry->list);
1071 }
1072 io_ctl_drop_pages(&io_ctl);
1073 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1074 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1075 goto out; 1149 goto out;
1076} 1150}
1077 1151
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 86935f5ae291..888fbe19079f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -174,7 +174,7 @@ static void start_caching(struct btrfs_root *root)
174 BTRFS_LAST_FREE_OBJECTID - objectid + 1); 174 BTRFS_LAST_FREE_OBJECTID - objectid + 1);
175 } 175 }
176 176
177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", 177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
178 root->root_key.objectid); 178 root->root_key.objectid);
179 if (IS_ERR(tsk)) { 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task"); 180 btrfs_warn(root->fs_info, "failed to start inode caching task");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5a3b8371772e..7fa5f7fd7bc7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
125 * the btree. The caller should have done a btrfs_drop_extents so that 125 * the btree. The caller should have done a btrfs_drop_extents so that
126 * no overlapping inline items exist in the btree 126 * no overlapping inline items exist in the btree
127 */ 127 */
128static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 128static int insert_inline_extent(struct btrfs_trans_handle *trans,
129 struct btrfs_path *path, int extent_inserted, 129 struct btrfs_path *path, int extent_inserted,
130 struct btrfs_root *root, struct inode *inode, 130 struct btrfs_root *root, struct inode *inode,
131 u64 start, size_t size, size_t compressed_size, 131 u64 start, size_t size, size_t compressed_size,
@@ -2678,6 +2678,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2678 trans = NULL; 2678 trans = NULL;
2679 goto out_unlock; 2679 goto out_unlock;
2680 } 2680 }
2681
2681 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2682 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2682 2683
2683 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2947,14 +2948,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2947 root->orphan_block_rsv = NULL; 2948 root->orphan_block_rsv = NULL;
2948 spin_unlock(&root->orphan_lock); 2949 spin_unlock(&root->orphan_lock);
2949 2950
2950 if (root->orphan_item_inserted && 2951 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
2951 btrfs_root_refs(&root->root_item) > 0) { 2952 btrfs_root_refs(&root->root_item) > 0) {
2952 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2953 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2953 root->root_key.objectid); 2954 root->root_key.objectid);
2954 if (ret) 2955 if (ret)
2955 btrfs_abort_transaction(trans, root, ret); 2956 btrfs_abort_transaction(trans, root, ret);
2956 else 2957 else
2957 root->orphan_item_inserted = 0; 2958 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
2959 &root->state);
2958 } 2960 }
2959 2961
2960 if (block_rsv) { 2962 if (block_rsv) {
@@ -3271,7 +3273,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3271 btrfs_block_rsv_release(root, root->orphan_block_rsv, 3273 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3272 (u64)-1); 3274 (u64)-1);
3273 3275
3274 if (root->orphan_block_rsv || root->orphan_item_inserted) { 3276 if (root->orphan_block_rsv ||
3277 test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3275 trans = btrfs_join_transaction(root); 3278 trans = btrfs_join_transaction(root);
3276 if (!IS_ERR(trans)) 3279 if (!IS_ERR(trans))
3277 btrfs_end_transaction(trans, root); 3280 btrfs_end_transaction(trans, root);
@@ -3473,7 +3476,7 @@ cache_acl:
3473 ret = btrfs_load_inode_props(inode, path); 3476 ret = btrfs_load_inode_props(inode, path);
3474 if (ret) 3477 if (ret)
3475 btrfs_err(root->fs_info, 3478 btrfs_err(root->fs_info,
3476 "error loading props for ino %llu (root %llu): %d\n", 3479 "error loading props for ino %llu (root %llu): %d",
3477 btrfs_ino(inode), 3480 btrfs_ino(inode),
3478 root->root_key.objectid, ret); 3481 root->root_key.objectid, ret);
3479 } 3482 }
@@ -3998,7 +4001,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3998 * not block aligned since we will be keeping the last block of the 4001 * not block aligned since we will be keeping the last block of the
3999 * extent just the way it is. 4002 * extent just the way it is.
4000 */ 4003 */
4001 if (root->ref_cows || root == root->fs_info->tree_root) 4004 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4005 root == root->fs_info->tree_root)
4002 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4006 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4003 root->sectorsize), (u64)-1, 0); 4007 root->sectorsize), (u64)-1, 0);
4004 4008
@@ -4091,7 +4095,9 @@ search_again:
4091 extent_num_bytes); 4095 extent_num_bytes);
4092 num_dec = (orig_num_bytes - 4096 num_dec = (orig_num_bytes -
4093 extent_num_bytes); 4097 extent_num_bytes);
4094 if (root->ref_cows && extent_start != 0) 4098 if (test_bit(BTRFS_ROOT_REF_COWS,
4099 &root->state) &&
4100 extent_start != 0)
4095 inode_sub_bytes(inode, num_dec); 4101 inode_sub_bytes(inode, num_dec);
4096 btrfs_mark_buffer_dirty(leaf); 4102 btrfs_mark_buffer_dirty(leaf);
4097 } else { 4103 } else {
@@ -4105,7 +4111,8 @@ search_again:
4105 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4111 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4106 if (extent_start != 0) { 4112 if (extent_start != 0) {
4107 found_extent = 1; 4113 found_extent = 1;
4108 if (root->ref_cows) 4114 if (test_bit(BTRFS_ROOT_REF_COWS,
4115 &root->state))
4109 inode_sub_bytes(inode, num_dec); 4116 inode_sub_bytes(inode, num_dec);
4110 } 4117 }
4111 } 4118 }
@@ -4120,10 +4127,9 @@ search_again:
4120 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4127 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4121 u32 size = new_size - found_key.offset; 4128 u32 size = new_size - found_key.offset;
4122 4129
4123 if (root->ref_cows) { 4130 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4124 inode_sub_bytes(inode, item_end + 1 - 4131 inode_sub_bytes(inode, item_end + 1 -
4125 new_size); 4132 new_size);
4126 }
4127 4133
4128 /* 4134 /*
4129 * update the ram bytes to properly reflect 4135 * update the ram bytes to properly reflect
@@ -4133,7 +4139,8 @@ search_again:
4133 size = 4139 size =
4134 btrfs_file_extent_calc_inline_size(size); 4140 btrfs_file_extent_calc_inline_size(size);
4135 btrfs_truncate_item(root, path, size, 1); 4141 btrfs_truncate_item(root, path, size, 1);
4136 } else if (root->ref_cows) { 4142 } else if (test_bit(BTRFS_ROOT_REF_COWS,
4143 &root->state)) {
4137 inode_sub_bytes(inode, item_end + 1 - 4144 inode_sub_bytes(inode, item_end + 1 -
4138 found_key.offset); 4145 found_key.offset);
4139 } 4146 }
@@ -4155,8 +4162,9 @@ delete:
4155 } else { 4162 } else {
4156 break; 4163 break;
4157 } 4164 }
4158 if (found_extent && (root->ref_cows || 4165 if (found_extent &&
4159 root == root->fs_info->tree_root)) { 4166 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4167 root == root->fs_info->tree_root)) {
4160 btrfs_set_path_blocking(path); 4168 btrfs_set_path_blocking(path);
4161 ret = btrfs_free_extent(trans, root, extent_start, 4169 ret = btrfs_free_extent(trans, root, extent_start,
4162 extent_num_bytes, 0, 4170 extent_num_bytes, 0,
@@ -5168,8 +5176,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
5168 5176
5169static void btrfs_dentry_release(struct dentry *dentry) 5177static void btrfs_dentry_release(struct dentry *dentry)
5170{ 5178{
5171 if (dentry->d_fsdata) 5179 kfree(dentry->d_fsdata);
5172 kfree(dentry->d_fsdata);
5173} 5180}
5174 5181
5175static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5182static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
@@ -5553,6 +5560,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5553 struct btrfs_inode_ref *ref; 5560 struct btrfs_inode_ref *ref;
5554 struct btrfs_key key[2]; 5561 struct btrfs_key key[2];
5555 u32 sizes[2]; 5562 u32 sizes[2];
5563 int nitems = name ? 2 : 1;
5556 unsigned long ptr; 5564 unsigned long ptr;
5557 int ret; 5565 int ret;
5558 5566
@@ -5572,7 +5580,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5572 */ 5580 */
5573 inode->i_ino = objectid; 5581 inode->i_ino = objectid;
5574 5582
5575 if (dir) { 5583 if (dir && name) {
5576 trace_btrfs_inode_request(dir); 5584 trace_btrfs_inode_request(dir);
5577 5585
5578 ret = btrfs_set_inode_index(dir, index); 5586 ret = btrfs_set_inode_index(dir, index);
@@ -5581,6 +5589,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5581 iput(inode); 5589 iput(inode);
5582 return ERR_PTR(ret); 5590 return ERR_PTR(ret);
5583 } 5591 }
5592 } else if (dir) {
5593 *index = 0;
5584 } 5594 }
5585 /* 5595 /*
5586 * index_cnt is ignored for everything but a dir, 5596 * index_cnt is ignored for everything but a dir,
@@ -5605,21 +5615,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5605 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5615 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5606 key[0].offset = 0; 5616 key[0].offset = 0;
5607 5617
5608 /*
5609 * Start new inodes with an inode_ref. This is slightly more
5610 * efficient for small numbers of hard links since they will
5611 * be packed into one item. Extended refs will kick in if we
5612 * add more hard links than can fit in the ref item.
5613 */
5614 key[1].objectid = objectid;
5615 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5616 key[1].offset = ref_objectid;
5617
5618 sizes[0] = sizeof(struct btrfs_inode_item); 5618 sizes[0] = sizeof(struct btrfs_inode_item);
5619 sizes[1] = name_len + sizeof(*ref); 5619
5620 if (name) {
5621 /*
5622 * Start new inodes with an inode_ref. This is slightly more
5623 * efficient for small numbers of hard links since they will
5624 * be packed into one item. Extended refs will kick in if we
5625 * add more hard links than can fit in the ref item.
5626 */
5627 key[1].objectid = objectid;
5628 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5629 key[1].offset = ref_objectid;
5630
5631 sizes[1] = name_len + sizeof(*ref);
5632 }
5620 5633
5621 path->leave_spinning = 1; 5634 path->leave_spinning = 1;
5622 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 5635 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
5623 if (ret != 0) 5636 if (ret != 0)
5624 goto fail; 5637 goto fail;
5625 5638
@@ -5632,12 +5645,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5632 sizeof(*inode_item)); 5645 sizeof(*inode_item));
5633 fill_inode_item(trans, path->nodes[0], inode_item, inode); 5646 fill_inode_item(trans, path->nodes[0], inode_item, inode);
5634 5647
5635 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 5648 if (name) {
5636 struct btrfs_inode_ref); 5649 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5637 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 5650 struct btrfs_inode_ref);
5638 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 5651 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5639 ptr = (unsigned long)(ref + 1); 5652 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5640 write_extent_buffer(path->nodes[0], name, ptr, name_len); 5653 ptr = (unsigned long)(ref + 1);
5654 write_extent_buffer(path->nodes[0], name, ptr, name_len);
5655 }
5641 5656
5642 btrfs_mark_buffer_dirty(path->nodes[0]); 5657 btrfs_mark_buffer_dirty(path->nodes[0]);
5643 btrfs_free_path(path); 5658 btrfs_free_path(path);
@@ -5673,7 +5688,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5673 5688
5674 return inode; 5689 return inode;
5675fail: 5690fail:
5676 if (dir) 5691 if (dir && name)
5677 BTRFS_I(dir)->index_cnt--; 5692 BTRFS_I(dir)->index_cnt--;
5678 btrfs_free_path(path); 5693 btrfs_free_path(path);
5679 iput(inode); 5694 iput(inode);
@@ -5958,6 +5973,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5958 err = btrfs_update_inode(trans, root, inode); 5973 err = btrfs_update_inode(trans, root, inode);
5959 if (err) 5974 if (err)
5960 goto fail; 5975 goto fail;
5976 if (inode->i_nlink == 1) {
5977 /*
5978 * If new hard link count is 1, it's a file created
5979 * with open(2) O_TMPFILE flag.
5980 */
5981 err = btrfs_orphan_del(trans, inode);
5982 if (err)
5983 goto fail;
5984 }
5961 d_instantiate(dentry, inode); 5985 d_instantiate(dentry, inode);
5962 btrfs_log_new_name(trans, inode, NULL, parent); 5986 btrfs_log_new_name(trans, inode, NULL, parent);
5963 } 5987 }
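
The new i_nlink == 1 branch exists because a file created with open(2) and O_TMPFILE starts with no name and is kept alive only by its orphan item; giving it its first link through linkat(2) means btrfs_link() must delete that orphan item. A small userspace program that drives this path (the mount point is an example):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char path[64];
        int fd = open("/mnt/btrfs", O_TMPFILE | O_WRONLY, 0600);

        if (fd < 0) {
                perror("open(O_TMPFILE)");
                return 1;
        }
        if (write(fd, "data", 4) != 4)
                perror("write");

        /* give the anonymous inode its first (and only) hard link */
        snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
        if (linkat(AT_FDCWD, path, AT_FDCWD, "/mnt/btrfs/file",
                   AT_SYMLINK_FOLLOW) < 0)
                perror("linkat");

        close(fd);
        return 0;
}
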
@@ -6086,16 +6110,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
6086 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 6110 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6087 ret = btrfs_decompress(compress_type, tmp, page, 6111 ret = btrfs_decompress(compress_type, tmp, page,
6088 extent_offset, inline_size, max_size); 6112 extent_offset, inline_size, max_size);
6089 if (ret) {
6090 char *kaddr = kmap_atomic(page);
6091 unsigned long copy_size = min_t(u64,
6092 PAGE_CACHE_SIZE - pg_offset,
6093 max_size - extent_offset);
6094 memset(kaddr + pg_offset, 0, copy_size);
6095 kunmap_atomic(kaddr);
6096 }
6097 kfree(tmp); 6113 kfree(tmp);
6098 return 0; 6114 return ret;
6099} 6115}
6100 6116
6101/* 6117/*
@@ -6113,7 +6129,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6113{ 6129{
6114 int ret; 6130 int ret;
6115 int err = 0; 6131 int err = 0;
6116 u64 bytenr;
6117 u64 extent_start = 0; 6132 u64 extent_start = 0;
6118 u64 extent_end = 0; 6133 u64 extent_end = 0;
6119 u64 objectid = btrfs_ino(inode); 6134 u64 objectid = btrfs_ino(inode);
@@ -6127,7 +6142,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6127 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 6142 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6128 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6143 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6129 struct btrfs_trans_handle *trans = NULL; 6144 struct btrfs_trans_handle *trans = NULL;
6130 int compress_type; 6145 const bool new_inline = !page || create;
6131 6146
6132again: 6147again:
6133 read_lock(&em_tree->lock); 6148 read_lock(&em_tree->lock);
@@ -6201,7 +6216,6 @@ again:
6201 6216
6202 found_type = btrfs_file_extent_type(leaf, item); 6217 found_type = btrfs_file_extent_type(leaf, item);
6203 extent_start = found_key.offset; 6218 extent_start = found_key.offset;
6204 compress_type = btrfs_file_extent_compression(leaf, item);
6205 if (found_type == BTRFS_FILE_EXTENT_REG || 6219 if (found_type == BTRFS_FILE_EXTENT_REG ||
6206 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6220 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6207 extent_end = extent_start + 6221 extent_end = extent_start +
@@ -6236,32 +6250,10 @@ next:
6236 goto not_found_em; 6250 goto not_found_em;
6237 } 6251 }
6238 6252
6239 em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 6253 btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6254
6240 if (found_type == BTRFS_FILE_EXTENT_REG || 6255 if (found_type == BTRFS_FILE_EXTENT_REG ||
6241 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6256 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6242 em->start = extent_start;
6243 em->len = extent_end - extent_start;
6244 em->orig_start = extent_start -
6245 btrfs_file_extent_offset(leaf, item);
6246 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
6247 item);
6248 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
6249 if (bytenr == 0) {
6250 em->block_start = EXTENT_MAP_HOLE;
6251 goto insert;
6252 }
6253 if (compress_type != BTRFS_COMPRESS_NONE) {
6254 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6255 em->compress_type = compress_type;
6256 em->block_start = bytenr;
6257 em->block_len = em->orig_block_len;
6258 } else {
6259 bytenr += btrfs_file_extent_offset(leaf, item);
6260 em->block_start = bytenr;
6261 em->block_len = em->len;
6262 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
6263 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6264 }
6265 goto insert; 6257 goto insert;
6266 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6258 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6267 unsigned long ptr; 6259 unsigned long ptr;
@@ -6270,12 +6262,8 @@ next:
6270 size_t extent_offset; 6262 size_t extent_offset;
6271 size_t copy_size; 6263 size_t copy_size;
6272 6264
6273 em->block_start = EXTENT_MAP_INLINE; 6265 if (new_inline)
6274 if (!page || create) {
6275 em->start = extent_start;
6276 em->len = extent_end - extent_start;
6277 goto out; 6266 goto out;
6278 }
6279 6267
6280 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); 6268 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6281 extent_offset = page_offset(page) + pg_offset - extent_start; 6269 extent_offset = page_offset(page) + pg_offset - extent_start;
@@ -6285,10 +6273,6 @@ next:
6285 em->len = ALIGN(copy_size, root->sectorsize); 6273 em->len = ALIGN(copy_size, root->sectorsize);
6286 em->orig_block_len = em->len; 6274 em->orig_block_len = em->len;
6287 em->orig_start = em->start; 6275 em->orig_start = em->start;
6288 if (compress_type) {
6289 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6290 em->compress_type = compress_type;
6291 }
6292 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6276 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6293 if (create == 0 && !PageUptodate(page)) { 6277 if (create == 0 && !PageUptodate(page)) {
6294 if (btrfs_file_extent_compression(leaf, item) != 6278 if (btrfs_file_extent_compression(leaf, item) !=
@@ -6296,7 +6280,10 @@ next:
6296 ret = uncompress_inline(path, inode, page, 6280 ret = uncompress_inline(path, inode, page,
6297 pg_offset, 6281 pg_offset,
6298 extent_offset, item); 6282 extent_offset, item);
6299 BUG_ON(ret); /* -ENOMEM */ 6283 if (ret) {
6284 err = ret;
6285 goto out;
6286 }
6300 } else { 6287 } else {
6301 map = kmap(page); 6288 map = kmap(page);
6302 read_extent_buffer(leaf, map + pg_offset, ptr, 6289 read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6332,8 +6319,6 @@ next:
6332 set_extent_uptodate(io_tree, em->start, 6319 set_extent_uptodate(io_tree, em->start,
6333 extent_map_end(em) - 1, NULL, GFP_NOFS); 6320 extent_map_end(em) - 1, NULL, GFP_NOFS);
6334 goto insert; 6321 goto insert;
6335 } else {
6336 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
6337 } 6322 }
6338not_found: 6323not_found:
6339 em->start = start; 6324 em->start = start;
@@ -6717,6 +6702,76 @@ out:
6717 return ret; 6702 return ret;
6718} 6703}
6719 6704
6705bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
6706{
6707 struct radix_tree_root *root = &inode->i_mapping->page_tree;
6708 int found = false;
6709 void **pagep = NULL;
6710 struct page *page = NULL;
6711 int start_idx;
6712 int end_idx;
6713
6714 start_idx = start >> PAGE_CACHE_SHIFT;
6715
6716 /*
6717 * end is the last byte in the last page. end == start is legal
6718 */
6719 end_idx = end >> PAGE_CACHE_SHIFT;
6720
6721 rcu_read_lock();
6722
6723 /* Most of the code in this while loop is lifted from
6724 * find_get_page. It's been modified to begin searching from a
6725 * page and return just the first page found in that range. If the
6726 * found idx is less than or equal to the end idx then we know that
6727 * a page exists. If no pages are found or if those pages are
6728 * outside of the range then we're fine (yay!) */
6729 while (page == NULL &&
6730 radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
6731 page = radix_tree_deref_slot(pagep);
6732 if (unlikely(!page))
6733 break;
6734
6735 if (radix_tree_exception(page)) {
6736 if (radix_tree_deref_retry(page)) {
6737 page = NULL;
6738 continue;
6739 }
6740 /*
6741 * Otherwise, shmem/tmpfs must be storing a swap entry
6742 * here as an exceptional entry: so return it without
6743 * attempting to raise page count.
6744 */
6745 page = NULL;
6746 break; /* TODO: Is this relevant for this use case? */
6747 }
6748
6749 if (!page_cache_get_speculative(page)) {
6750 page = NULL;
6751 continue;
6752 }
6753
6754 /*
6755 * Has the page moved?
6756 * This is part of the lockless pagecache protocol. See
6757 * include/linux/pagemap.h for details.
6758 */
6759 if (unlikely(page != *pagep)) {
6760 page_cache_release(page);
6761 page = NULL;
6762 }
6763 }
6764
6765 if (page) {
6766 if (page->index <= end_idx)
6767 found = true;
6768 page_cache_release(page);
6769 }
6770
6771 rcu_read_unlock();
6772 return found;
6773}
6774
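
As the comment in btrfs_page_exists_in_range() notes, the loop follows the lockless pagecache protocol: dereference the slot, pin what was found, then re-read the slot to confirm it still points at the same page. A userspace analogue of that recheck using C11 atomics; it models only the retry shape, since the real page_cache_get_speculative() can additionally fail when the refcount has already dropped to zero, and exceptional radix-tree entries need their own handling:

#include <stdatomic.h>
#include <stdio.h>

struct object {
        atomic_int refcount;
};

static struct object obj_a = { 1 }, obj_b = { 1 };
static _Atomic(struct object *) slot = &obj_a;

static struct object *lookup_pinned(void)
{
        struct object *o;

        for (;;) {
                o = atomic_load(&slot);
                if (!o)
                        return NULL;
                atomic_fetch_add(&o->refcount, 1);      /* speculative pin */
                if (o == atomic_load(&slot))            /* has the slot moved? */
                        return o;
                atomic_fetch_sub(&o->refcount, 1);      /* raced: unpin and retry */
        }
}

int main(void)
{
        struct object *o = lookup_pinned();

        printf("pinned %s\n", o == &obj_a ? "obj_a" : "obj_b");
        atomic_fetch_sub(&o->refcount, 1);
        (void)obj_b;
        return 0;
}
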
6720static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 6775static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6721 struct extent_state **cached_state, int writing) 6776 struct extent_state **cached_state, int writing)
6722{ 6777{
@@ -6741,10 +6796,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6741 * invalidate needs to happen so that reads after a write do not 6796 * invalidate needs to happen so that reads after a write do not
6742 * get stale data. 6797 * get stale data.
6743 */ 6798 */
6744 if (!ordered && (!writing || 6799 if (!ordered &&
6745 !test_range_bit(&BTRFS_I(inode)->io_tree, 6800 (!writing ||
6746 lockstart, lockend, EXTENT_UPTODATE, 0, 6801 !btrfs_page_exists_in_range(inode, lockstart, lockend)))
6747 *cached_state)))
6748 break; 6802 break;
6749 6803
6750 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6804 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -7992,7 +8046,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7992 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8046 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
7993 if (err) 8047 if (err)
7994 btrfs_err(new_root->fs_info, 8048 btrfs_err(new_root->fs_info,
7995 "error inheriting subvolume %llu properties: %d\n", 8049 "error inheriting subvolume %llu properties: %d",
7996 new_root->root_key.objectid, err); 8050 new_root->root_key.objectid, err);
7997 8051
7998 err = btrfs_update_inode(trans, new_root, inode); 8052 err = btrfs_update_inode(trans, new_root, inode);
@@ -8311,7 +8365,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8311 BTRFS_I(old_inode)->dir_index = 0ULL; 8365 BTRFS_I(old_inode)->dir_index = 0ULL;
8312 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8366 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8313 /* force full log commit if subvolume involved. */ 8367 /* force full log commit if subvolume involved. */
8314 root->fs_info->last_trans_log_full_commit = trans->transid; 8368 btrfs_set_log_full_commit(root->fs_info, trans);
8315 } else { 8369 } else {
8316 ret = btrfs_insert_inode_ref(trans, dest, 8370 ret = btrfs_insert_inode_ref(trans, dest,
8317 new_dentry->d_name.name, 8371 new_dentry->d_name.name,
@@ -8889,6 +8943,66 @@ static int btrfs_permission(struct inode *inode, int mask)
8889 return generic_permission(inode, mask); 8943 return generic_permission(inode, mask);
8890} 8944}
8891 8945
8946static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8947{
8948 struct btrfs_trans_handle *trans;
8949 struct btrfs_root *root = BTRFS_I(dir)->root;
8950 struct inode *inode = NULL;
8951 u64 objectid;
8952 u64 index;
8953 int ret = 0;
8954
8955 /*
8956 * 5 units required for adding orphan entry
8957 */
8958 trans = btrfs_start_transaction(root, 5);
8959 if (IS_ERR(trans))
8960 return PTR_ERR(trans);
8961
8962 ret = btrfs_find_free_ino(root, &objectid);
8963 if (ret)
8964 goto out;
8965
8966 inode = btrfs_new_inode(trans, root, dir, NULL, 0,
8967 btrfs_ino(dir), objectid, mode, &index);
8968 if (IS_ERR(inode)) {
8969 ret = PTR_ERR(inode);
8970 inode = NULL;
8971 goto out;
8972 }
8973
8974 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8975 if (ret)
8976 goto out;
8977
8978 ret = btrfs_update_inode(trans, root, inode);
8979 if (ret)
8980 goto out;
8981
8982 inode->i_fop = &btrfs_file_operations;
8983 inode->i_op = &btrfs_file_inode_operations;
8984
8985 inode->i_mapping->a_ops = &btrfs_aops;
8986 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8987 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8988
8989 ret = btrfs_orphan_add(trans, inode);
8990 if (ret)
8991 goto out;
8992
8993 d_tmpfile(dentry, inode);
8994 mark_inode_dirty(inode);
8995
8996out:
8997 btrfs_end_transaction(trans, root);
8998 if (ret)
8999 iput(inode);
9000 btrfs_balance_delayed_items(root);
9001 btrfs_btree_balance_dirty(root);
9002
9003 return ret;
9004}
9005
8892static const struct inode_operations btrfs_dir_inode_operations = { 9006static const struct inode_operations btrfs_dir_inode_operations = {
8893 .getattr = btrfs_getattr, 9007 .getattr = btrfs_getattr,
8894 .lookup = btrfs_lookup, 9008 .lookup = btrfs_lookup,
@@ -8909,6 +9023,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
8909 .get_acl = btrfs_get_acl, 9023 .get_acl = btrfs_get_acl,
8910 .set_acl = btrfs_set_acl, 9024 .set_acl = btrfs_set_acl,
8911 .update_time = btrfs_update_time, 9025 .update_time = btrfs_update_time,
9026 .tmpfile = btrfs_tmpfile,
8912}; 9027};
8913static const struct inode_operations btrfs_dir_ro_inode_operations = { 9028static const struct inode_operations btrfs_dir_ro_inode_operations = {
8914 .lookup = btrfs_lookup, 9029 .lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3f52bb7a58d2..82c18ba12e3f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -58,6 +58,7 @@
58#include "dev-replace.h" 58#include "dev-replace.h"
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61#include "qgroup.h"
61 62
62#ifdef CONFIG_64BIT 63#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 64/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -638,7 +639,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
638 struct btrfs_trans_handle *trans; 639 struct btrfs_trans_handle *trans;
639 int ret; 640 int ret;
640 641
641 if (!root->ref_cows) 642 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
642 return -EINVAL; 643 return -EINVAL;
643 644
644 atomic_inc(&root->will_be_snapshoted); 645 atomic_inc(&root->will_be_snapshoted);
@@ -711,6 +712,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
711 if (ret) 712 if (ret)
712 goto fail; 713 goto fail;
713 714
715 /*
716 * If orphan cleanup did remove any orphans, it means the tree was
717 * modified and therefore the commit root is not the same as the
718 * current root anymore. This is a problem, because send uses the
719 * commit root and therefore can see inode items that don't exist
720 * in the current root anymore, and for example make calls to
721 * btrfs_iget, which will do tree lookups based on the current root
722 * and not on the commit root. Those lookups will fail, returning a
723 * -ESTALE error, and making send fail with that error. So make sure
724 * a send does not see any orphans we have just removed, and that it
725 * will see the same inodes regardless of whether a transaction
726 * commit happened before it started (meaning that the commit root
727 * will be the same as the current root) or not.
728 */
729 if (readonly && pending_snapshot->snap->node !=
730 pending_snapshot->snap->commit_root) {
731 trans = btrfs_join_transaction(pending_snapshot->snap);
732 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
733 ret = PTR_ERR(trans);
734 goto fail;
735 }
736 if (!IS_ERR(trans)) {
737 ret = btrfs_commit_transaction(trans,
738 pending_snapshot->snap);
739 if (ret)
740 goto fail;
741 }
742 }
743
714 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 744 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
715 if (IS_ERR(inode)) { 745 if (IS_ERR(inode)) {
716 ret = PTR_ERR(inode); 746 ret = PTR_ERR(inode);
@@ -1502,11 +1532,12 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1502 sizestr = vol_args->name; 1532 sizestr = vol_args->name;
1503 devstr = strchr(sizestr, ':'); 1533 devstr = strchr(sizestr, ':');
1504 if (devstr) { 1534 if (devstr) {
1505 char *end;
1506 sizestr = devstr + 1; 1535 sizestr = devstr + 1;
1507 *devstr = '\0'; 1536 *devstr = '\0';
1508 devstr = vol_args->name; 1537 devstr = vol_args->name;
1509 devid = simple_strtoull(devstr, &end, 10); 1538 ret = kstrtoull(devstr, 10, &devid);
1539 if (ret)
1540 goto out_free;
1510 if (!devid) { 1541 if (!devid) {
1511 ret = -EINVAL; 1542 ret = -EINVAL;
1512 goto out_free; 1543 goto out_free;
@@ -1562,7 +1593,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1562 new_size = old_size - new_size; 1593 new_size = old_size - new_size;
1563 } else if (mod > 0) { 1594 } else if (mod > 0) {
1564 if (new_size > ULLONG_MAX - old_size) { 1595 if (new_size > ULLONG_MAX - old_size) {
1565 ret = -EINVAL; 1596 ret = -ERANGE;
1566 goto out_free; 1597 goto out_free;
1567 } 1598 }
1568 new_size = old_size + new_size; 1599 new_size = old_size + new_size;
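
The switch from simple_strtoull() to kstrtoull() is not cosmetic: the former silently stops at the first non-digit, while the latter rejects the whole string, which is why the new code can fail the ioctl on ret. A userspace analogue built on strtoull(3) with an end-pointer check:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_devid(const char *s, unsigned long long *devid)
{
        char *end;

        errno = 0;
        *devid = strtoull(s, &end, 10);
        if (errno || end == s || *end != '\0')
                return -EINVAL;         /* trailing junk or overflow: reject */
        return 0;
}

int main(void)
{
        unsigned long long devid;

        printf("\"2\"    -> %d\n", parse_devid("2", &devid));    /* 0 */
        printf("\"3abc\" -> %d\n", parse_devid("3abc", &devid)); /* -EINVAL */
        return 0;
}
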
@@ -2219,6 +2250,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2219 struct btrfs_ioctl_vol_args *vol_args; 2250 struct btrfs_ioctl_vol_args *vol_args;
2220 struct btrfs_trans_handle *trans; 2251 struct btrfs_trans_handle *trans;
2221 struct btrfs_block_rsv block_rsv; 2252 struct btrfs_block_rsv block_rsv;
2253 u64 root_flags;
2222 u64 qgroup_reserved; 2254 u64 qgroup_reserved;
2223 int namelen; 2255 int namelen;
2224 int ret; 2256 int ret;
@@ -2240,6 +2272,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2240 if (err) 2272 if (err)
2241 goto out; 2273 goto out;
2242 2274
2275
2243 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); 2276 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
2244 if (err == -EINTR) 2277 if (err == -EINTR)
2245 goto out_drop_write; 2278 goto out_drop_write;
@@ -2301,6 +2334,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2301 } 2334 }
2302 2335
2303 mutex_lock(&inode->i_mutex); 2336 mutex_lock(&inode->i_mutex);
2337
2338 /*
 2339 * Don't allow deleting a subvolume while a send is in progress. This is
2340 * inside the i_mutex so the error handling that has to drop the bit
2341 * again is not run concurrently.
2342 */
2343 spin_lock(&dest->root_item_lock);
2344 root_flags = btrfs_root_flags(&dest->root_item);
2345 if (dest->send_in_progress == 0) {
2346 btrfs_set_root_flags(&dest->root_item,
2347 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
2348 spin_unlock(&dest->root_item_lock);
2349 } else {
2350 spin_unlock(&dest->root_item_lock);
2351 btrfs_warn(root->fs_info,
2352 "Attempt to delete subvolume %llu during send",
2353 dest->root_key.objectid);
2354 err = -EPERM;
2355 goto out_dput;
2356 }
2357
2304 err = d_invalidate(dentry); 2358 err = d_invalidate(dentry);
2305 if (err) 2359 if (err)
2306 goto out_unlock; 2360 goto out_unlock;
@@ -2346,7 +2400,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2346 dest->root_item.drop_level = 0; 2400 dest->root_item.drop_level = 0;
2347 btrfs_set_root_refs(&dest->root_item, 0); 2401 btrfs_set_root_refs(&dest->root_item, 0);
2348 2402
2349 if (!xchg(&dest->orphan_item_inserted, 1)) { 2403 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
2350 ret = btrfs_insert_orphan_item(trans, 2404 ret = btrfs_insert_orphan_item(trans,
2351 root->fs_info->tree_root, 2405 root->fs_info->tree_root,
2352 dest->root_key.objectid); 2406 dest->root_key.objectid);
@@ -2389,11 +2443,19 @@ out_release:
2389out_up_write: 2443out_up_write:
2390 up_write(&root->fs_info->subvol_sem); 2444 up_write(&root->fs_info->subvol_sem);
2391out_unlock: 2445out_unlock:
2446 if (err) {
2447 spin_lock(&dest->root_item_lock);
2448 root_flags = btrfs_root_flags(&dest->root_item);
2449 btrfs_set_root_flags(&dest->root_item,
2450 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
2451 spin_unlock(&dest->root_item_lock);
2452 }
2392 mutex_unlock(&inode->i_mutex); 2453 mutex_unlock(&inode->i_mutex);
2393 if (!err) { 2454 if (!err) {
2394 shrink_dcache_sb(root->fs_info->sb); 2455 shrink_dcache_sb(root->fs_info->sb);
2395 btrfs_invalidate_inodes(dest); 2456 btrfs_invalidate_inodes(dest);
2396 d_delete(dentry); 2457 d_delete(dentry);
2458 ASSERT(dest->send_in_progress == 0);
2397 2459
2398 /* the last ref */ 2460 /* the last ref */
2399 if (dest->cache_inode) { 2461 if (dest->cache_inode) {
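
The deletion path checks send_in_progress and sets BTRFS_ROOT_SUBVOL_DEAD under the same root_item_lock, and the error path at out_unlock clears the flag under that lock again. A compact userspace model of the exclusion, with a mutex standing in for the spinlock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define SUBVOL_DEAD 0x1

struct subvol {
        pthread_mutex_t lock;
        int send_in_progress;
        unsigned int flags;
};

static int subvol_delete_begin(struct subvol *s)
{
        int err = 0;

        pthread_mutex_lock(&s->lock);
        if (s->send_in_progress == 0)
                s->flags |= SUBVOL_DEAD;        /* later sends will see this */
        else
                err = -EPERM;                   /* matches the btrfs_warn() path */
        pthread_mutex_unlock(&s->lock);
        return err;
}

static void subvol_delete_abort(struct subvol *s)
{
        /* the error path must drop the flag under the same lock */
        pthread_mutex_lock(&s->lock);
        s->flags &= ~SUBVOL_DEAD;
        pthread_mutex_unlock(&s->lock);
}

int main(void)
{
        struct subvol s = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        if (subvol_delete_begin(&s) == 0)
                subvol_delete_abort(&s);
        printf("flags after abort: %#x\n", s.flags);
        return 0;
}
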
@@ -2557,9 +2619,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2557 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2619 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2558 int ret = 0; 2620 int ret = 0;
2559 2621
2560 if (!capable(CAP_SYS_ADMIN))
2561 return -EPERM;
2562
2563 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); 2622 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
2564 if (!fi_args) 2623 if (!fi_args)
2565 return -ENOMEM; 2624 return -ENOMEM;
@@ -2574,6 +2633,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2574 } 2633 }
2575 mutex_unlock(&fs_devices->device_list_mutex); 2634 mutex_unlock(&fs_devices->device_list_mutex);
2576 2635
2636 fi_args->nodesize = root->fs_info->super_copy->nodesize;
2637 fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
2638 fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
2639
2577 if (copy_to_user(arg, fi_args, sizeof(*fi_args))) 2640 if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
2578 ret = -EFAULT; 2641 ret = -EFAULT;
2579 2642
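
With the CAP_SYS_ADMIN check gone and nodesize, sectorsize and clone_alignment now filled in, an unprivileged process can query basic filesystem geometry. A sketch of such a caller; it assumes a linux/btrfs.h from a kernel that carries this change (where struct btrfs_ioctl_fs_info_args has the new fields), and the mount point is an example:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
        struct btrfs_ioctl_fs_info_args args = { 0 };
        int fd = open("/mnt/btrfs", O_RDONLY);

        if (fd < 0 || ioctl(fd, BTRFS_IOC_FS_INFO, &args) < 0) {
                perror("BTRFS_IOC_FS_INFO");
                return 1;
        }
        printf("devices=%llu nodesize=%u sectorsize=%u clone_alignment=%u\n",
               (unsigned long long)args.num_devices,
               args.nodesize, args.sectorsize, args.clone_alignment);
        return 0;
}
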
@@ -2589,9 +2652,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2589 int ret = 0; 2652 int ret = 0;
2590 char *s_uuid = NULL; 2653 char *s_uuid = NULL;
2591 2654
2592 if (!capable(CAP_SYS_ADMIN))
2593 return -EPERM;
2594
2595 di_args = memdup_user(arg, sizeof(*di_args)); 2655 di_args = memdup_user(arg, sizeof(*di_args));
2596 if (IS_ERR(di_args)) 2656 if (IS_ERR(di_args))
2597 return PTR_ERR(di_args); 2657 return PTR_ERR(di_args);
@@ -2669,10 +2729,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
2669 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2729 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
2670 ordered = btrfs_lookup_first_ordered_extent(inode, 2730 ordered = btrfs_lookup_first_ordered_extent(inode,
2671 off + len - 1); 2731 off + len - 1);
2672 if (!ordered && 2732 if ((!ordered ||
2733 ordered->file_offset + ordered->len <= off ||
2734 ordered->file_offset >= off + len) &&
2673 !test_range_bit(&BTRFS_I(inode)->io_tree, off, 2735 !test_range_bit(&BTRFS_I(inode)->io_tree, off,
2674 off + len - 1, EXTENT_DELALLOC, 0, NULL)) 2736 off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
2737 if (ordered)
2738 btrfs_put_ordered_extent(ordered);
2675 break; 2739 break;
2740 }
2676 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2741 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
2677 if (ordered) 2742 if (ordered)
2678 btrfs_put_ordered_extent(ordered); 2743 btrfs_put_ordered_extent(ordered);
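
The fix narrows the retry condition: btrfs_lookup_first_ordered_extent() can return an ordered extent that lies entirely outside [off, off + len), and only a real intersection should force the loop to drop the lock and wait. The overlap predicate behind the two new checks, stated on its own:

#include <stdbool.h>
#include <stdio.h>

/*
 * Two half-open ranges intersect iff each starts before the other ends;
 * the kernel code tests the negation (ends at or before off, or starts
 * at or past off + len) to decide the ordered extent can be ignored.
 */
static bool ranges_overlap(unsigned long long a_start, unsigned long long a_len,
                           unsigned long long b_start, unsigned long long b_len)
{
        return a_start < b_start + b_len && b_start < a_start + a_len;
}

int main(void)
{
        /* ordered extent [0, 4096) vs locked range [8192, 12288): disjoint */
        printf("%d\n", ranges_overlap(0, 4096, 8192, 4096)); /* 0 */
        printf("%d\n", ranges_overlap(0, 4096, 2048, 4096)); /* 1 */
        return 0;
}
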
@@ -2912,6 +2977,126 @@ out:
2912 return ret; 2977 return ret;
2913} 2978}
2914 2979
2980/* Helper to check and see if this root currently has a ref on the given disk
2981 * bytenr. If it does then we need to update the quota for this root. This
2982 * doesn't do anything if quotas aren't enabled.
2983 */
2984static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2985 u64 disko)
2986{
2987 struct seq_list tree_mod_seq_elem = {};
2988 struct ulist *roots;
2989 struct ulist_iterator uiter;
2990 struct ulist_node *root_node = NULL;
2991 int ret;
2992
2993 if (!root->fs_info->quota_enabled)
2994 return 1;
2995
2996 btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
2997 ret = btrfs_find_all_roots(trans, root->fs_info, disko,
2998 tree_mod_seq_elem.seq, &roots);
2999 if (ret < 0)
3000 goto out;
3001 ret = 0;
3002 ULIST_ITER_INIT(&uiter);
3003 while ((root_node = ulist_next(roots, &uiter))) {
3004 if (root_node->val == root->objectid) {
3005 ret = 1;
3006 break;
3007 }
3008 }
3009 ulist_free(roots);
3010out:
3011 btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
3012 return ret;
3013}
3014
3015static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3016 struct inode *inode,
3017 u64 endoff,
3018 const u64 destoff,
3019 const u64 olen)
3020{
3021 struct btrfs_root *root = BTRFS_I(inode)->root;
3022 int ret;
3023
3024 inode_inc_iversion(inode);
3025 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3026 /*
3027 * We round up to the block size at eof when determining which
3028 * extents to clone above, but shouldn't round up the file size.
3029 */
3030 if (endoff > destoff + olen)
3031 endoff = destoff + olen;
3032 if (endoff > inode->i_size)
3033 btrfs_i_size_write(inode, endoff);
3034
3035 ret = btrfs_update_inode(trans, root, inode);
3036 if (ret) {
3037 btrfs_abort_transaction(trans, root, ret);
3038 btrfs_end_transaction(trans, root);
3039 goto out;
3040 }
3041 ret = btrfs_end_transaction(trans, root);
3042out:
3043 return ret;
3044}
3045
3046static void clone_update_extent_map(struct inode *inode,
3047 const struct btrfs_trans_handle *trans,
3048 const struct btrfs_path *path,
3049 struct btrfs_file_extent_item *fi,
3050 const u64 hole_offset,
3051 const u64 hole_len)
3052{
3053 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3054 struct extent_map *em;
3055 int ret;
3056
3057 em = alloc_extent_map();
3058 if (!em) {
3059 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3060 &BTRFS_I(inode)->runtime_flags);
3061 return;
3062 }
3063
3064 if (fi) {
3065 btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
3066 em->generation = -1;
3067 if (btrfs_file_extent_type(path->nodes[0], fi) ==
3068 BTRFS_FILE_EXTENT_INLINE)
3069 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3070 &BTRFS_I(inode)->runtime_flags);
3071 } else {
3072 em->start = hole_offset;
3073 em->len = hole_len;
3074 em->ram_bytes = em->len;
3075 em->orig_start = hole_offset;
3076 em->block_start = EXTENT_MAP_HOLE;
3077 em->block_len = 0;
3078 em->orig_block_len = 0;
3079 em->compress_type = BTRFS_COMPRESS_NONE;
3080 em->generation = trans->transid;
3081 }
3082
3083 while (1) {
3084 write_lock(&em_tree->lock);
3085 ret = add_extent_mapping(em_tree, em, 1);
3086 write_unlock(&em_tree->lock);
3087 if (ret != -EEXIST) {
3088 free_extent_map(em);
3089 break;
3090 }
3091 btrfs_drop_extent_cache(inode, em->start,
3092 em->start + em->len - 1, 0);
3093 }
3094
3095 if (unlikely(ret))
3096 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3097 &BTRFS_I(inode)->runtime_flags);
3098}
3099
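
clone_update_extent_map() loops because add_extent_mapping() returns -EEXIST whenever the new mapping collides with something already cached, so the conflicting range is dropped and the insert retried until it lands. A toy single-slot model of that loop (the structures are stand-ins, not btrfs types):

#include <errno.h>
#include <stdio.h>

struct em_cache {
        int occupied;                   /* stands in for an overlapping extent_map */
};

static int add_mapping(struct em_cache *c)
{
        if (c->occupied)
                return -EEXIST;
        c->occupied = 1;
        return 0;
}

static void drop_cached_range(struct em_cache *c)
{
        c->occupied = 0;                /* models btrfs_drop_extent_cache() */
}

int main(void)
{
        struct em_cache cache = { 1 };  /* start with a stale mapping cached */
        int ret;

        while (1) {
                ret = add_mapping(&cache);
                if (ret != -EEXIST)
                        break;
                drop_cached_range(&cache);
        }
        printf("insert returned %d\n", ret);
        return 0;
}
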
2915/** 3100/**
2916 * btrfs_clone() - clone a range from inode file to another 3101 * btrfs_clone() - clone a range from inode file to another
2917 * 3102 *
@@ -2924,7 +3109,8 @@ out:
2924 * @destoff: Offset within @inode to start clone 3109 * @destoff: Offset within @inode to start clone
2925 */ 3110 */
2926static int btrfs_clone(struct inode *src, struct inode *inode, 3111static int btrfs_clone(struct inode *src, struct inode *inode,
2927 u64 off, u64 olen, u64 olen_aligned, u64 destoff) 3112 const u64 off, const u64 olen, const u64 olen_aligned,
3113 const u64 destoff)
2928{ 3114{
2929 struct btrfs_root *root = BTRFS_I(inode)->root; 3115 struct btrfs_root *root = BTRFS_I(inode)->root;
2930 struct btrfs_path *path = NULL; 3116 struct btrfs_path *path = NULL;
@@ -2935,7 +3121,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2935 u32 nritems; 3121 u32 nritems;
2936 int slot; 3122 int slot;
2937 int ret; 3123 int ret;
2938 u64 len = olen_aligned; 3124 int no_quota;
3125 const u64 len = olen_aligned;
3126 u64 last_disko = 0;
3127 u64 last_dest_end = destoff;
2939 3128
2940 ret = -ENOMEM; 3129 ret = -ENOMEM;
2941 buf = vmalloc(btrfs_level_size(root, 0)); 3130 buf = vmalloc(btrfs_level_size(root, 0));
@@ -2952,7 +3141,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2952 /* clone data */ 3141 /* clone data */
2953 key.objectid = btrfs_ino(src); 3142 key.objectid = btrfs_ino(src);
2954 key.type = BTRFS_EXTENT_DATA_KEY; 3143 key.type = BTRFS_EXTENT_DATA_KEY;
2955 key.offset = 0; 3144 key.offset = off;
2956 3145
2957 while (1) { 3146 while (1) {
2958 /* 3147 /*
@@ -2964,9 +3153,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2964 0, 0); 3153 0, 0);
2965 if (ret < 0) 3154 if (ret < 0)
2966 goto out; 3155 goto out;
3156 /*
3157 * First search, if no extent item that starts at offset off was
3158 * found but the previous item is an extent item, it's possible
3159 * it might overlap our target range, therefore process it.
3160 */
3161 if (key.offset == off && ret > 0 && path->slots[0] > 0) {
3162 btrfs_item_key_to_cpu(path->nodes[0], &key,
3163 path->slots[0] - 1);
3164 if (key.type == BTRFS_EXTENT_DATA_KEY)
3165 path->slots[0]--;
3166 }
2967 3167
2968 nritems = btrfs_header_nritems(path->nodes[0]); 3168 nritems = btrfs_header_nritems(path->nodes[0]);
2969process_slot: 3169process_slot:
3170 no_quota = 1;
2970 if (path->slots[0] >= nritems) { 3171 if (path->slots[0] >= nritems) {
2971 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 3172 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2972 if (ret < 0) 3173 if (ret < 0)
@@ -2991,7 +3192,7 @@ process_slot:
2991 u64 disko = 0, diskl = 0; 3192 u64 disko = 0, diskl = 0;
2992 u64 datao = 0, datal = 0; 3193 u64 datao = 0, datal = 0;
2993 u8 comp; 3194 u8 comp;
2994 u64 endoff; 3195 u64 drop_start;
2995 3196
2996 extent = btrfs_item_ptr(leaf, slot, 3197 extent = btrfs_item_ptr(leaf, slot,
2997 struct btrfs_file_extent_item); 3198 struct btrfs_file_extent_item);
@@ -3012,10 +3213,16 @@ process_slot:
3012 extent); 3213 extent);
3013 } 3214 }
3014 3215
3015 if (key.offset + datal <= off || 3216 /*
3016 key.offset >= off + len - 1) { 3217 * The first search might have left us at an extent
3218 * item that ends before our target range's start, can
3219 * happen if we have holes and NO_HOLES feature enabled.
3220 */
3221 if (key.offset + datal <= off) {
3017 path->slots[0]++; 3222 path->slots[0]++;
3018 goto process_slot; 3223 goto process_slot;
3224 } else if (key.offset >= off + len) {
3225 break;
3019 } 3226 }
3020 3227
3021 size = btrfs_item_size_nr(leaf, slot); 3228 size = btrfs_item_size_nr(leaf, slot);
@@ -3034,6 +3241,18 @@ process_slot:
3034 new_key.offset = destoff; 3241 new_key.offset = destoff;
3035 3242
3036 /* 3243 /*
3244 * Deal with a hole that doesn't have an extent item
3245 * that represents it (NO_HOLES feature enabled).
3246 * This hole is either in the middle of the cloning
3247 * range or at the beginning (fully overlaps it or
3248 * partially overlaps it).
3249 */
3250 if (new_key.offset != last_dest_end)
3251 drop_start = last_dest_end;
3252 else
3253 drop_start = new_key.offset;
3254
3255 /*
3037 * 1 - adjusting old extent (we may have to split it) 3256 * 1 - adjusting old extent (we may have to split it)
3038 * 1 - add new extent 3257 * 1 - add new extent
3039 * 1 - inode update 3258 * 1 - inode update
@@ -3051,18 +3270,18 @@ process_slot:
3051 * | ------------- extent ------------- | 3270 * | ------------- extent ------------- |
3052 */ 3271 */
3053 3272
3054 /* substract range b */ 3273 /* subtract range b */
3055 if (key.offset + datal > off + len) 3274 if (key.offset + datal > off + len)
3056 datal = off + len - key.offset; 3275 datal = off + len - key.offset;
3057 3276
3058 /* substract range a */ 3277 /* subtract range a */
3059 if (off > key.offset) { 3278 if (off > key.offset) {
3060 datao += off - key.offset; 3279 datao += off - key.offset;
3061 datal -= off - key.offset; 3280 datal -= off - key.offset;
3062 } 3281 }
3063 3282
3064 ret = btrfs_drop_extents(trans, root, inode, 3283 ret = btrfs_drop_extents(trans, root, inode,
3065 new_key.offset, 3284 drop_start,
3066 new_key.offset + datal, 3285 new_key.offset + datal,
3067 1); 3286 1);
3068 if (ret) { 3287 if (ret) {
@@ -3099,6 +3318,28 @@ process_slot:
3099 datao); 3318 datao);
3100 btrfs_set_file_extent_num_bytes(leaf, extent, 3319 btrfs_set_file_extent_num_bytes(leaf, extent,
3101 datal); 3320 datal);
3321
3322 /*
3323 * We need to look up the roots that point at
3324 * this bytenr and see if the new root does. If
3325 * it does not we need to make sure we update
3326 * quotas appropriately.
3327 */
3328 if (disko && root != BTRFS_I(src)->root &&
3329 disko != last_disko) {
3330 no_quota = check_ref(trans, root,
3331 disko);
3332 if (no_quota < 0) {
3333 btrfs_abort_transaction(trans,
3334 root,
3335 ret);
3336 btrfs_end_transaction(trans,
3337 root);
3338 ret = no_quota;
3339 goto out;
3340 }
3341 }
3342
3102 if (disko) { 3343 if (disko) {
3103 inode_add_bytes(inode, datal); 3344 inode_add_bytes(inode, datal);
3104 ret = btrfs_inc_extent_ref(trans, root, 3345 ret = btrfs_inc_extent_ref(trans, root,
@@ -3106,7 +3347,7 @@ process_slot:
3106 root->root_key.objectid, 3347 root->root_key.objectid,
3107 btrfs_ino(inode), 3348 btrfs_ino(inode),
3108 new_key.offset - datao, 3349 new_key.offset - datao,
3109 0); 3350 no_quota);
3110 if (ret) { 3351 if (ret) {
3111 btrfs_abort_transaction(trans, 3352 btrfs_abort_transaction(trans,
3112 root, 3353 root,
@@ -3141,7 +3382,7 @@ process_slot:
3141 aligned_end = ALIGN(new_key.offset + datal, 3382 aligned_end = ALIGN(new_key.offset + datal,
3142 root->sectorsize); 3383 root->sectorsize);
3143 ret = btrfs_drop_extents(trans, root, inode, 3384 ret = btrfs_drop_extents(trans, root, inode,
3144 new_key.offset, 3385 drop_start,
3145 aligned_end, 3386 aligned_end,
3146 1); 3387 1);
3147 if (ret) { 3388 if (ret) {
@@ -3174,40 +3415,69 @@ process_slot:
3174 btrfs_item_ptr_offset(leaf, slot), 3415 btrfs_item_ptr_offset(leaf, slot),
3175 size); 3416 size);
3176 inode_add_bytes(inode, datal); 3417 inode_add_bytes(inode, datal);
3418 extent = btrfs_item_ptr(leaf, slot,
3419 struct btrfs_file_extent_item);
3177 } 3420 }
3178 3421
3422 /* If we have an implicit hole (NO_HOLES feature). */
3423 if (drop_start < new_key.offset)
3424 clone_update_extent_map(inode, trans,
3425 path, NULL, drop_start,
3426 new_key.offset - drop_start);
3427
3428 clone_update_extent_map(inode, trans, path,
3429 extent, 0, 0);
3430
3179 btrfs_mark_buffer_dirty(leaf); 3431 btrfs_mark_buffer_dirty(leaf);
3180 btrfs_release_path(path); 3432 btrfs_release_path(path);
3181 3433
3182 inode_inc_iversion(inode); 3434 last_dest_end = new_key.offset + datal;
3183 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3435 ret = clone_finish_inode_update(trans, inode,
3184 3436 last_dest_end,
3185 /* 3437 destoff, olen);
3186 * we round up to the block size at eof when 3438 if (ret)
3187 * determining which extents to clone above,
3188 * but shouldn't round up the file size
3189 */
3190 endoff = new_key.offset + datal;
3191 if (endoff > destoff+olen)
3192 endoff = destoff+olen;
3193 if (endoff > inode->i_size)
3194 btrfs_i_size_write(inode, endoff);
3195
3196 ret = btrfs_update_inode(trans, root, inode);
3197 if (ret) {
3198 btrfs_abort_transaction(trans, root, ret);
3199 btrfs_end_transaction(trans, root);
3200 goto out; 3439 goto out;
3201 } 3440 if (new_key.offset + datal >= destoff + len)
3202 ret = btrfs_end_transaction(trans, root); 3441 break;
3203 } 3442 }
3204 btrfs_release_path(path); 3443 btrfs_release_path(path);
3205 key.offset++; 3444 key.offset++;
3206 } 3445 }
3207 ret = 0; 3446 ret = 0;
3208 3447
3448 if (last_dest_end < destoff + len) {
3449 /*
3450 * We have an implicit hole (NO_HOLES feature is enabled) that
3451 * fully or partially overlaps our cloning range at its end.
3452 */
3453 btrfs_release_path(path);
3454
3455 /*
3456 * 1 - remove extent(s)
3457 * 1 - inode update
3458 */
3459 trans = btrfs_start_transaction(root, 2);
3460 if (IS_ERR(trans)) {
3461 ret = PTR_ERR(trans);
3462 goto out;
3463 }
3464 ret = btrfs_drop_extents(trans, root, inode,
3465 last_dest_end, destoff + len, 1);
3466 if (ret) {
3467 if (ret != -EOPNOTSUPP)
3468 btrfs_abort_transaction(trans, root, ret);
3469 btrfs_end_transaction(trans, root);
3470 goto out;
3471 }
3472 ret = clone_finish_inode_update(trans, inode, destoff + len,
3473 destoff, olen);
3474 if (ret)
3475 goto out;
3476 clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
3477 destoff + len - last_dest_end);
3478 }
3479
3209out: 3480out:
3210 btrfs_release_path(path);
3211 btrfs_free_path(path); 3481 btrfs_free_path(path);
3212 vfree(buf); 3482 vfree(buf);
3213 return ret; 3483 return ret;
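
Throughout the loop, last_dest_end records where the previous copied extent ended in the destination; whenever the next extent (or the end of the clone range) begins past it, the gap is an implicit NO_HOLES hole that still has to be dropped in the destination, including a possible trailing hole handled after the loop. A toy walk over a gapped extent list showing the bookkeeping:

#include <stdio.h>

struct ext { unsigned long long off, len; };

int main(void)
{
        /* source extents inside the clone range [0, 100); a gap at [30, 60) */
        struct ext src[] = { { 0, 30 }, { 60, 20 } };
        unsigned long long destoff = 1000, len = 100;
        unsigned long long last_dest_end = destoff;
        unsigned int i;

        for (i = 0; i < sizeof(src) / sizeof(src[0]); i++) {
                unsigned long long dest = destoff + src[i].off;

                if (dest > last_dest_end)       /* implicit hole in the middle */
                        printf("hole  [%llu, %llu)\n", last_dest_end, dest);
                printf("clone [%llu, %llu)\n", dest, dest + src[i].len);
                last_dest_end = dest + src[i].len;
        }
        if (last_dest_end < destoff + len)      /* trailing implicit hole */
                printf("hole  [%llu, %llu)\n", last_dest_end, destoff + len);
        return 0;
}
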
@@ -3319,15 +3589,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3319 goto out_unlock; 3589 goto out_unlock;
3320 } 3590 }
3321 3591
3322 /* truncate page cache pages from target inode range */ 3592 /*
3323 truncate_inode_pages_range(&inode->i_data, destoff, 3593 * Lock the target range too. Right after we replace the file extent
3324 PAGE_CACHE_ALIGN(destoff + len) - 1); 3594 * items in the fs tree (which now point to the cloned data), we might
3595 * have a worker replace them with extent items relative to a write
3596 * operation that was issued before this clone operation (i.e. confront
3597 * with inode.c:btrfs_finish_ordered_io).
3598 */
3599 if (same_inode) {
3600 u64 lock_start = min_t(u64, off, destoff);
3601 u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
3325 3602
3326 lock_extent_range(src, off, len); 3603 lock_extent_range(src, lock_start, lock_len);
3604 } else {
3605 lock_extent_range(src, off, len);
3606 lock_extent_range(inode, destoff, len);
3607 }
3327 3608
3328 ret = btrfs_clone(src, inode, off, olen, len, destoff); 3609 ret = btrfs_clone(src, inode, off, olen, len, destoff);
3329 3610
3330 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3611 if (same_inode) {
3612 u64 lock_start = min_t(u64, off, destoff);
3613 u64 lock_end = max_t(u64, off, destoff) + len - 1;
3614
3615 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
3616 } else {
3617 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
3618 unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
3619 destoff + len - 1);
3620 }
3621 /*
3622 * Truncate page cache pages so that future reads will see the cloned
3623 * data immediately and not the previous data.
3624 */
3625 truncate_inode_pages_range(&inode->i_data, destoff,
3626 PAGE_CACHE_ALIGN(destoff + len) - 1);
3331out_unlock: 3627out_unlock:
3332 if (!same_inode) { 3628 if (!same_inode) {
3333 if (inode < src) { 3629 if (inode < src) {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b47f669aca75..dfad8514f0da 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws,
143 if (ret != LZO_E_OK) { 143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 144 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
145 ret); 145 ret);
146 ret = -1; 146 ret = -EIO;
147 goto out; 147 goto out;
148 } 148 }
149 149
@@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,
189 kunmap(out_page); 189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) { 190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL; 191 out_page = NULL;
192 ret = -1; 192 ret = -E2BIG;
193 goto out; 193 goto out;
194 } 194 }
195 195
@@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws,
208 208
209 /* we're making it bigger, give up */ 209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out) { 210 if (tot_in > 8192 && tot_in < tot_out) {
211 ret = -1; 211 ret = -E2BIG;
212 goto out; 212 goto out;
213 } 213 }
214 214
@@ -335,7 +335,7 @@ cont:
335 break; 335 break;
336 336
337 if (page_in_index + 1 >= total_pages_in) { 337 if (page_in_index + 1 >= total_pages_in) {
338 ret = -1; 338 ret = -EIO;
339 goto done; 339 goto done;
340 } 340 }
341 341
@@ -358,7 +358,7 @@ cont:
358 kunmap(pages_in[page_in_index - 1]); 358 kunmap(pages_in[page_in_index - 1]);
359 if (ret != LZO_E_OK) { 359 if (ret != LZO_E_OK) {
360 printk(KERN_WARNING "BTRFS: decompress failed\n"); 360 printk(KERN_WARNING "BTRFS: decompress failed\n");
361 ret = -1; 361 ret = -EIO;
362 break; 362 break;
363 } 363 }
364 364
@@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); 402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
403 if (ret != LZO_E_OK) { 403 if (ret != LZO_E_OK) {
404 printk(KERN_WARNING "BTRFS: decompress failed!\n"); 404 printk(KERN_WARNING "BTRFS: decompress failed!\n");
405 ret = -1; 405 ret = -EIO;
406 goto out; 406 goto out;
407 } 407 }
408 408
409 if (out_len < start_byte) { 409 if (out_len < start_byte) {
410 ret = -1; 410 ret = -EIO;
411 goto out; 411 goto out;
412 } 412 }
413 413
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a94b05f72869..e12441c7cf1d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,
67{ 67{
68 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 68 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
69 btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " 69 btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
70 "%llu\n", offset); 70 "%llu", offset);
71} 71}
72 72
73/* 73/*
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2cf905877aaf..cf5aead95a7f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -32,6 +32,7 @@
32#include "ulist.h" 32#include "ulist.h"
33#include "backref.h" 33#include "backref.h"
34#include "extent_io.h" 34#include "extent_io.h"
35#include "qgroup.h"
35 36
36/* TODO XXX FIXME 37/* TODO XXX FIXME
37 * - subvol delete -> delete when ref goes to 0? delete limits also? 38 * - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -84,8 +85,8 @@ struct btrfs_qgroup {
84 /* 85 /*
85 * temp variables for accounting operations 86 * temp variables for accounting operations
86 */ 87 */
87 u64 tag; 88 u64 old_refcnt;
88 u64 refcnt; 89 u64 new_refcnt;
89}; 90};
90 91
91/* 92/*
@@ -98,6 +99,9 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 99 struct btrfs_qgroup *member;
99}; 100};
100 101
102#define ptr_to_u64(x) ((u64)(uintptr_t)x)
103#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
104
101static int 105static int
102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 106qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 int init_flags); 107 int init_flags);
@@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
242 return -ENOENT; 246 return -ENOENT;
243} 247}
244 248
249#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
250int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
251 u64 rfer, u64 excl)
252{
253 struct btrfs_qgroup *qgroup;
254
255 qgroup = find_qgroup_rb(fs_info, qgroupid);
256 if (!qgroup)
257 return -EINVAL;
258 if (qgroup->rfer != rfer || qgroup->excl != excl)
259 return -EINVAL;
260 return 0;
261}
262#endif
263
245/* 264/*
246 * The full config is read in one go, only called from open_ctree() 265 * The full config is read in one go, only called from open_ctree()
247 * It doesn't use any locking, as at this point we're still single-threaded 266 * It doesn't use any locking, as at this point we're still single-threaded
@@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
520 struct extent_buffer *leaf; 539 struct extent_buffer *leaf;
521 struct btrfs_key key; 540 struct btrfs_key key;
522 541
542#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
543 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
544 return 0;
545#endif
523 path = btrfs_alloc_path(); 546 path = btrfs_alloc_path();
524 if (!path) 547 if (!path)
525 return -ENOMEM; 548 return -ENOMEM;
@@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
669 int ret; 692 int ret;
670 int slot; 693 int slot;
671 694
695#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
696 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
697 return 0;
698#endif
672 key.objectid = 0; 699 key.objectid = 0;
673 key.type = BTRFS_QGROUP_INFO_KEY; 700 key.type = BTRFS_QGROUP_INFO_KEY;
674 key.offset = qgroup->qgroupid; 701 key.offset = qgroup->qgroupid;
@@ -1174,33 +1201,198 @@ out:
1174 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1201 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1175 return ret; 1202 return ret;
1176} 1203}
1204static int comp_oper(struct btrfs_qgroup_operation *oper1,
1205 struct btrfs_qgroup_operation *oper2)
1206{
1207 if (oper1->bytenr < oper2->bytenr)
1208 return -1;
1209 if (oper1->bytenr > oper2->bytenr)
1210 return 1;
1211 if (oper1->seq < oper2->seq)
1212 return -1;
1213 if (oper1->seq > oper2->seq)
 1214 return 1;
1215 if (oper1->ref_root < oper2->ref_root)
1216 return -1;
1217 if (oper1->ref_root > oper2->ref_root)
1218 return 1;
1219 if (oper1->type < oper2->type)
1220 return -1;
1221 if (oper1->type > oper2->type)
1222 return 1;
1223 return 0;
1224}
1225
1226static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
1227 struct btrfs_qgroup_operation *oper)
1228{
1229 struct rb_node **p;
1230 struct rb_node *parent = NULL;
1231 struct btrfs_qgroup_operation *cur;
1232 int cmp;
1233
1234 spin_lock(&fs_info->qgroup_op_lock);
1235 p = &fs_info->qgroup_op_tree.rb_node;
1236 while (*p) {
1237 parent = *p;
1238 cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
1239 cmp = comp_oper(cur, oper);
1240 if (cmp < 0) {
1241 p = &(*p)->rb_right;
1242 } else if (cmp) {
1243 p = &(*p)->rb_left;
1244 } else {
1245 spin_unlock(&fs_info->qgroup_op_lock);
1246 return -EEXIST;
1247 }
1248 }
1249 rb_link_node(&oper->n, parent, p);
1250 rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
1251 spin_unlock(&fs_info->qgroup_op_lock);
1252 return 0;
1253}
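The comparator orders operations by (bytenr, seq, ref_root, type), so all operations on one extent sit together in the tree and are processed in sequence order. A standalone sketch of the same ordering (qsort stands in for the kernel rb-tree; the struct is trimmed to the compared keys and the sample values are made up):

        #include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct oper { uint64_t bytenr, seq, ref_root; int type; };

        static int comp_oper_stub(const void *a, const void *b)
        {
                const struct oper *o1 = a, *o2 = b;

                if (o1->bytenr != o2->bytenr)
                        return o1->bytenr < o2->bytenr ? -1 : 1;
                if (o1->seq != o2->seq)
                        return o1->seq < o2->seq ? -1 : 1;
                if (o1->ref_root != o2->ref_root)
                        return o1->ref_root < o2->ref_root ? -1 : 1;
                if (o1->type != o2->type)
                        return o1->type < o2->type ? -1 : 1;
                return 0;
        }

        int main(void)
        {
                struct oper ops[] = {
                        { 8192, 2, 5, 0 }, { 4096, 1, 5, 0 }, { 8192, 1, 5, 0 },
                };

                /* Sorts to 4096/1, 8192/1, 8192/2: same-extent ops stay
                 * adjacent, in the order they were recorded. */
                qsort(ops, 3, sizeof(ops[0]), comp_oper_stub);
                for (int i = 0; i < 3; i++)
                        printf("bytenr=%llu seq=%llu\n",
                               (unsigned long long)ops[i].bytenr,
                               (unsigned long long)ops[i].seq);
                return 0;
        }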
1177 1254
1178/* 1255/*
1179 * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts 1256 * Record a quota operation for processing later on.
1180 * the modification into a list that's later used by btrfs_end_transaction to 1257 * @trans: the transaction we are adding the delayed op to.
1181 * pass the recorded modifications on to btrfs_qgroup_account_ref. 1258 * @fs_info: the fs_info for this fs.
1259 * @ref_root: the root of the reference we are acting on.
1260 * @bytenr: the bytenr we are acting on.
1261 * @num_bytes: the number of bytes in the reference.
1262 * @type: the type of operation this is.
1263 * @mod_seq: do we need to get a sequence number for looking up roots.
1264 *
1265 * We just add it to our trans qgroup_ref_list and carry on and process these
1266 * operations in order at some later point. If the reference root isn't a fs
1267 * root then we don't bother with doing anything.
1268 *
1269 * MUST BE HOLDING THE REF LOCK.
1182 */ 1270 */
1183int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, 1271int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1184 struct btrfs_delayed_ref_node *node, 1272 struct btrfs_fs_info *fs_info, u64 ref_root,
1185 struct btrfs_delayed_extent_op *extent_op) 1273 u64 bytenr, u64 num_bytes,
1274 enum btrfs_qgroup_operation_type type, int mod_seq)
1186{ 1275{
1187 struct qgroup_update *u; 1276 struct btrfs_qgroup_operation *oper;
1277 int ret;
1278
1279 if (!is_fstree(ref_root) || !fs_info->quota_enabled)
1280 return 0;
1188 1281
1189 BUG_ON(!trans->delayed_ref_elem.seq); 1282 oper = kmalloc(sizeof(*oper), GFP_NOFS);
1190 u = kmalloc(sizeof(*u), GFP_NOFS); 1283 if (!oper)
1191 if (!u)
1192 return -ENOMEM; 1284 return -ENOMEM;
1193 1285
1194 u->node = node; 1286 oper->ref_root = ref_root;
1195 u->extent_op = extent_op; 1287 oper->bytenr = bytenr;
1196 list_add_tail(&u->list, &trans->qgroup_ref_list); 1288 oper->num_bytes = num_bytes;
1289 oper->type = type;
1290 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1291 INIT_LIST_HEAD(&oper->elem.list);
1292 oper->elem.seq = 0;
1293 ret = insert_qgroup_oper(fs_info, oper);
1294 if (ret) {
1295 /* Shouldn't happen so have an assert for developers */
1296 ASSERT(0);
1297 kfree(oper);
1298 return ret;
1299 }
1300 list_add_tail(&oper->list, &trans->qgroup_ref_list);
1301
1302 if (mod_seq)
1303 btrfs_get_tree_mod_seq(fs_info, &oper->elem);
1197 1304
1198 return 0; 1305 return 0;
1199} 1306}
1200 1307
1201static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, 1308/*
1202 struct ulist *roots, struct ulist *tmp, 1309 * The easy accounting: if we are adding/removing the only ref for an extent,
1203 u64 seq) 1310 * then this qgroup and all of the parent qgroups get their reference and
1311 * exclusive counts adjusted.
1312 */
1313static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1314 struct btrfs_qgroup_operation *oper)
1315{
1316 struct btrfs_qgroup *qgroup;
1317 struct ulist *tmp;
1318 struct btrfs_qgroup_list *glist;
1319 struct ulist_node *unode;
1320 struct ulist_iterator uiter;
1321 int sign = 0;
1322 int ret = 0;
1323
1324 tmp = ulist_alloc(GFP_NOFS);
1325 if (!tmp)
1326 return -ENOMEM;
1327
1328 spin_lock(&fs_info->qgroup_lock);
1329 if (!fs_info->quota_root)
1330 goto out;
1331 qgroup = find_qgroup_rb(fs_info, oper->ref_root);
1332 if (!qgroup)
1333 goto out;
1334 switch (oper->type) {
1335 case BTRFS_QGROUP_OPER_ADD_EXCL:
1336 sign = 1;
1337 break;
1338 case BTRFS_QGROUP_OPER_SUB_EXCL:
1339 sign = -1;
1340 break;
1341 default:
1342 ASSERT(0);
1343 }
1344 qgroup->rfer += sign * oper->num_bytes;
1345 qgroup->rfer_cmpr += sign * oper->num_bytes;
1346
1347 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1348 qgroup->excl += sign * oper->num_bytes;
1349 qgroup->excl_cmpr += sign * oper->num_bytes;
1350
1351 qgroup_dirty(fs_info, qgroup);
1352
1353 /* Get all of the parent groups that contain this qgroup */
1354 list_for_each_entry(glist, &qgroup->groups, next_group) {
1355 ret = ulist_add(tmp, glist->group->qgroupid,
1356 ptr_to_u64(glist->group), GFP_ATOMIC);
1357 if (ret < 0)
1358 goto out;
1359 }
1360
1361 /* Iterate all of the parents and adjust their reference counts */
1362 ULIST_ITER_INIT(&uiter);
1363 while ((unode = ulist_next(tmp, &uiter))) {
1364 qgroup = u64_to_ptr(unode->aux);
1365 qgroup->rfer += sign * oper->num_bytes;
1366 qgroup->rfer_cmpr += sign * oper->num_bytes;
1367 qgroup->excl += sign * oper->num_bytes;
1368 if (sign < 0)
1369 WARN_ON(qgroup->excl < oper->num_bytes);
1370 qgroup->excl_cmpr += sign * oper->num_bytes;
1371 qgroup_dirty(fs_info, qgroup);
1372
1373 /* Add any parents of the parents */
1374 list_for_each_entry(glist, &qgroup->groups, next_group) {
1375 ret = ulist_add(tmp, glist->group->qgroupid,
1376 ptr_to_u64(glist->group), GFP_ATOMIC);
1377 if (ret < 0)
1378 goto out;
1379 }
1380 }
1381 ret = 0;
1382out:
1383 spin_unlock(&fs_info->qgroup_lock);
1384 ulist_free(tmp);
1385 return ret;
1386}
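A standalone sketch of this easy case (the two-level chain and byte counts are made up): with a single referencing subvolume, rfer and excl move in lockstep for the owning qgroup and every ancestor:

        #include <stdint.h>
        #include <stdio.h>

        struct qg { const char *name; int64_t rfer, excl; };

        static void excl_account(struct qg *chain[], int n, int sign,
                                 uint64_t bytes)
        {
                for (int i = 0; i < n; i++) {
                        chain[i]->rfer += sign * (int64_t)bytes;
                        chain[i]->excl += sign * (int64_t)bytes;
                }
        }

        int main(void)
        {
                struct qg leaf = { "0/5", 0, 0 }, parent = { "1/0", 0, 0 };
                struct qg *chain[] = { &leaf, &parent };

                excl_account(chain, 2, +1, 16384);      /* OPER_ADD_EXCL */
                excl_account(chain, 2, -1, 4096);       /* OPER_SUB_EXCL */
                printf("%s rfer=%lld excl=%lld\n", leaf.name,
                       (long long)leaf.rfer, (long long)leaf.excl);
                return 0;
        }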
1387
1388/*
1389 * Walk all of the roots that pointed to our bytenr and adjust their refcnts
1390 * properly.
1391 */
1392static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
1393 u64 root_to_skip, struct ulist *tmp,
1394 struct ulist *roots, struct ulist *qgroups,
1395 u64 seq, int *old_roots, int rescan)
1204{ 1396{
1205 struct ulist_node *unode; 1397 struct ulist_node *unode;
1206 struct ulist_iterator uiter; 1398 struct ulist_iterator uiter;
@@ -1211,256 +1403,549 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
1211 1403
1212 ULIST_ITER_INIT(&uiter); 1404 ULIST_ITER_INIT(&uiter);
1213 while ((unode = ulist_next(roots, &uiter))) { 1405 while ((unode = ulist_next(roots, &uiter))) {
1406 /* We don't count our current root here */
1407 if (unode->val == root_to_skip)
1408 continue;
1214 qg = find_qgroup_rb(fs_info, unode->val); 1409 qg = find_qgroup_rb(fs_info, unode->val);
1215 if (!qg) 1410 if (!qg)
1216 continue; 1411 continue;
1412 /*
1413 * We could have a pending removal of this same ref so we may
1414 * not have actually found our ref root when doing
1415 * btrfs_find_all_roots, so we need to keep track of how many
1416 * old roots we find in case we removed ours and added a
1417 * different one at the same time. I don't think this could
1418 * happen in practice but that sort of thinking leads to pain
1419 * and suffering and to the dark side.
1420 */
1421 (*old_roots)++;
1217 1422
1218 ulist_reinit(tmp); 1423 ulist_reinit(tmp);
1219 /* XXX id not needed */ 1424 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1220 ret = ulist_add(tmp, qg->qgroupid, 1425 GFP_ATOMIC);
1221 (u64)(uintptr_t)qg, GFP_ATOMIC); 1426 if (ret < 0)
1427 return ret;
1428 ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
1222 if (ret < 0) 1429 if (ret < 0)
1223 return ret; 1430 return ret;
1224 ULIST_ITER_INIT(&tmp_uiter); 1431 ULIST_ITER_INIT(&tmp_uiter);
1225 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1432 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1226 struct btrfs_qgroup_list *glist; 1433 struct btrfs_qgroup_list *glist;
1227 1434
1228 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; 1435 qg = u64_to_ptr(tmp_unode->aux);
1229 if (qg->refcnt < seq) 1436 /*
1230 qg->refcnt = seq + 1; 1437 * We use this sequence number to keep from having to
1438 * run the whole list and 0 out the refcnt every time.
1439 * We basically use sequence as the known 0 count and
1440 * then add 1 every time we see a qgroup. This is how we
1441 * get how many of the roots actually point up to the
1442 * upper level qgroups in order to determine exclusive
1443 * counts.
1444 *
1445 * For rescan we want to set old_refcnt to seq so our
1446 * exclusive calculations end up correct.
1447 */
1448 if (rescan)
1449 qg->old_refcnt = seq;
1450 else if (qg->old_refcnt < seq)
1451 qg->old_refcnt = seq + 1;
1231 else 1452 else
1232 ++qg->refcnt; 1453 qg->old_refcnt++;
1233 1454
1455 if (qg->new_refcnt < seq)
1456 qg->new_refcnt = seq + 1;
1457 else
1458 qg->new_refcnt++;
1234 list_for_each_entry(glist, &qg->groups, next_group) { 1459 list_for_each_entry(glist, &qg->groups, next_group) {
1460 ret = ulist_add(qgroups, glist->group->qgroupid,
1461 ptr_to_u64(glist->group),
1462 GFP_ATOMIC);
1463 if (ret < 0)
1464 return ret;
1235 ret = ulist_add(tmp, glist->group->qgroupid, 1465 ret = ulist_add(tmp, glist->group->qgroupid,
1236 (u64)(uintptr_t)glist->group, 1466 ptr_to_u64(glist->group),
1237 GFP_ATOMIC); 1467 GFP_ATOMIC);
1238 if (ret < 0) 1468 if (ret < 0)
1239 return ret; 1469 return ret;
1240 } 1470 }
1241 } 1471 }
1242 } 1472 }
1473 return 0;
1474}
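A standalone sketch of the sequence trick described in the comment above (values are made up): any refcnt at or below seq reads as zero, so bumping seq between rounds replaces zeroing every qgroup:

        #include <stdint.h>
        #include <stdio.h>

        static uint64_t effective(uint64_t refcnt, uint64_t seq)
        {
                return refcnt > seq ? refcnt - seq : 0;
        }

        static void bump(uint64_t *refcnt, uint64_t seq)
        {
                if (*refcnt < seq)
                        *refcnt = seq + 1;      /* first hit this round */
                else
                        (*refcnt)++;            /* already seen this round */
        }

        int main(void)
        {
                uint64_t seq = 100, refcnt = 42;        /* stale from an old round */

                printf("before: %llu\n",
                       (unsigned long long)effective(refcnt, seq));     /* 0 */
                bump(&refcnt, seq);
                bump(&refcnt, seq);
                printf("after two hits: %llu\n",
                       (unsigned long long)effective(refcnt, seq));     /* 2 */
                return 0;
        }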
1475
1476/*
1477 * We need to walk forward in our operation tree and account for any roots that
1478 * were deleted after we made this operation.
1479 */
1480static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
1481 struct btrfs_qgroup_operation *oper,
1482 struct ulist *tmp,
1483 struct ulist *qgroups, u64 seq,
1484 int *old_roots)
1485{
1486 struct ulist_node *unode;
1487 struct ulist_iterator uiter;
1488 struct btrfs_qgroup *qg;
1489 struct btrfs_qgroup_operation *tmp_oper;
1490 struct rb_node *n;
1491 int ret;
1492
1493 ulist_reinit(tmp);
1243 1494
1495 /*
1496 * We only walk forward in the tree since we're only interested in
1497 * removals that happened _after_ our operation.
1498 */
1499 spin_lock(&fs_info->qgroup_op_lock);
1500 n = rb_next(&oper->n);
1501 spin_unlock(&fs_info->qgroup_op_lock);
1502 if (!n)
1503 return 0;
1504 tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1505 while (tmp_oper->bytenr == oper->bytenr) {
1506 /*
1507 * If it's not a removal we don't care, additions work out
1508 * properly with our refcnt tracking.
1509 */
1510 if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
1511 tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
1512 goto next;
1513 qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
1514 if (!qg)
1515 goto next;
1516 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1517 GFP_ATOMIC);
1518 if (ret) {
1519 if (ret < 0)
1520 return ret;
1521 /*
1522 * We only want to increase old_roots if this qgroup is
1523 * not already in the list of qgroups. If it is already
1524 * there then that means it must have been re-added or
1525 * the delete will be discarded because we had an
1526 * existing ref that we haven't looked up yet. In this
1527 * case we don't want to increase old_roots. So if ret
1528 * == 1 then we know that this is the first time we've
1529 * seen this qgroup and we can bump the old_roots.
1530 */
1531 (*old_roots)++;
1532 ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
1533 GFP_ATOMIC);
1534 if (ret < 0)
1535 return ret;
1536 }
1537next:
1538 spin_lock(&fs_info->qgroup_op_lock);
1539 n = rb_next(&tmp_oper->n);
1540 spin_unlock(&fs_info->qgroup_op_lock);
1541 if (!n)
1542 break;
1543 tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1544 }
1545
1546 /* Ok now process the qgroups we found */
1547 ULIST_ITER_INIT(&uiter);
1548 while ((unode = ulist_next(tmp, &uiter))) {
1549 struct btrfs_qgroup_list *glist;
1550
1551 qg = u64_to_ptr(unode->aux);
1552 if (qg->old_refcnt < seq)
1553 qg->old_refcnt = seq + 1;
1554 else
1555 qg->old_refcnt++;
1556 if (qg->new_refcnt < seq)
1557 qg->new_refcnt = seq + 1;
1558 else
1559 qg->new_refcnt++;
1560 list_for_each_entry(glist, &qg->groups, next_group) {
1561 ret = ulist_add(qgroups, glist->group->qgroupid,
1562 ptr_to_u64(glist->group), GFP_ATOMIC);
1563 if (ret < 0)
1564 return ret;
1565 ret = ulist_add(tmp, glist->group->qgroupid,
1566 ptr_to_u64(glist->group), GFP_ATOMIC);
1567 if (ret < 0)
1568 return ret;
1569 }
1570 }
1244 return 0; 1571 return 0;
1245} 1572}
1246 1573
1247static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, 1574/* Add refcnt for the newly added reference. */
1248 struct ulist *roots, struct ulist *tmp, 1575static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
1249 u64 seq, int sgn, u64 num_bytes, 1576 struct btrfs_qgroup_operation *oper,
1250 struct btrfs_qgroup *qgroup) 1577 struct btrfs_qgroup *qgroup,
1578 struct ulist *tmp, struct ulist *qgroups,
1579 u64 seq)
1251{ 1580{
1252 struct ulist_node *unode; 1581 struct ulist_node *unode;
1253 struct ulist_iterator uiter; 1582 struct ulist_iterator uiter;
1254 struct btrfs_qgroup *qg; 1583 struct btrfs_qgroup *qg;
1255 struct btrfs_qgroup_list *glist;
1256 int ret; 1584 int ret;
1257 1585
1258 ulist_reinit(tmp); 1586 ulist_reinit(tmp);
1259 ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); 1587 ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
1588 GFP_ATOMIC);
1589 if (ret < 0)
1590 return ret;
1591 ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
1592 GFP_ATOMIC);
1260 if (ret < 0) 1593 if (ret < 0)
1261 return ret; 1594 return ret;
1262
1263 ULIST_ITER_INIT(&uiter); 1595 ULIST_ITER_INIT(&uiter);
1264 while ((unode = ulist_next(tmp, &uiter))) { 1596 while ((unode = ulist_next(tmp, &uiter))) {
1265 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1597 struct btrfs_qgroup_list *glist;
1266 if (qg->refcnt < seq) {
1267 /* not visited by step 1 */
1268 qg->rfer += sgn * num_bytes;
1269 qg->rfer_cmpr += sgn * num_bytes;
1270 if (roots->nnodes == 0) {
1271 qg->excl += sgn * num_bytes;
1272 qg->excl_cmpr += sgn * num_bytes;
1273 }
1274 qgroup_dirty(fs_info, qg);
1275 }
1276 WARN_ON(qg->tag >= seq);
1277 qg->tag = seq;
1278 1598
1599 qg = u64_to_ptr(unode->aux);
1600 if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
1601 if (qg->new_refcnt < seq)
1602 qg->new_refcnt = seq + 1;
1603 else
1604 qg->new_refcnt++;
1605 } else {
1606 if (qg->old_refcnt < seq)
1607 qg->old_refcnt = seq + 1;
1608 else
1609 qg->old_refcnt++;
1610 }
1279 list_for_each_entry(glist, &qg->groups, next_group) { 1611 list_for_each_entry(glist, &qg->groups, next_group) {
1280 ret = ulist_add(tmp, glist->group->qgroupid, 1612 ret = ulist_add(tmp, glist->group->qgroupid,
1281 (uintptr_t)glist->group, GFP_ATOMIC); 1613 ptr_to_u64(glist->group), GFP_ATOMIC);
1614 if (ret < 0)
1615 return ret;
1616 ret = ulist_add(qgroups, glist->group->qgroupid,
1617 ptr_to_u64(glist->group), GFP_ATOMIC);
1282 if (ret < 0) 1618 if (ret < 0)
1283 return ret; 1619 return ret;
1284 } 1620 }
1285 } 1621 }
1286
1287 return 0; 1622 return 0;
1288} 1623}
1289 1624
1290static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, 1625/*
1291 struct ulist *roots, struct ulist *tmp, 1626 * This adjusts the counters for all referenced qgroups if need be.
1292 u64 seq, int sgn, u64 num_bytes) 1627 */
1628static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
1629 u64 root_to_skip, u64 num_bytes,
1630 struct ulist *qgroups, u64 seq,
1631 int old_roots, int new_roots, int rescan)
1293{ 1632{
1294 struct ulist_node *unode; 1633 struct ulist_node *unode;
1295 struct ulist_iterator uiter; 1634 struct ulist_iterator uiter;
1296 struct btrfs_qgroup *qg; 1635 struct btrfs_qgroup *qg;
1297 struct ulist_node *tmp_unode; 1636 u64 cur_new_count, cur_old_count;
1298 struct ulist_iterator tmp_uiter;
1299 int ret;
1300 1637
1301 ULIST_ITER_INIT(&uiter); 1638 ULIST_ITER_INIT(&uiter);
1302 while ((unode = ulist_next(roots, &uiter))) { 1639 while ((unode = ulist_next(qgroups, &uiter))) {
1303 qg = find_qgroup_rb(fs_info, unode->val); 1640 bool dirty = false;
1304 if (!qg)
1305 continue;
1306 1641
1307 ulist_reinit(tmp); 1642 qg = u64_to_ptr(unode->aux);
1308 ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); 1643 /*
1309 if (ret < 0) 1644 * Wasn't referenced before but is now, add to the reference
1310 return ret; 1645 * counters.
1646 */
1647 if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
1648 qg->rfer += num_bytes;
1649 qg->rfer_cmpr += num_bytes;
1650 dirty = true;
1651 }
1311 1652
1312 ULIST_ITER_INIT(&tmp_uiter); 1653 /*
1313 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1654 * Was referenced before but isn't now, subtract from the
1314 struct btrfs_qgroup_list *glist; 1655 * reference counters.
1656 */
1657 if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
1658 qg->rfer -= num_bytes;
1659 qg->rfer_cmpr -= num_bytes;
1660 dirty = true;
1661 }
1315 1662
1316 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; 1663 if (qg->old_refcnt < seq)
1317 if (qg->tag == seq) 1664 cur_old_count = 0;
1318 continue; 1665 else
1666 cur_old_count = qg->old_refcnt - seq;
1667 if (qg->new_refcnt < seq)
1668 cur_new_count = 0;
1669 else
1670 cur_new_count = qg->new_refcnt - seq;
1319 1671
1320 if (qg->refcnt - seq == roots->nnodes) { 1672 /*
1321 qg->excl -= sgn * num_bytes; 1673 * If our refcount was the same as the roots previously but our
1322 qg->excl_cmpr -= sgn * num_bytes; 1674 * new count isn't the same as the number of roots now then we
1323 qgroup_dirty(fs_info, qg); 1675 * went from having a exclusive reference on this range to not.
1324 } 1676 */
1677 if (old_roots && cur_old_count == old_roots &&
1678 (cur_new_count != new_roots || new_roots == 0)) {
1679 WARN_ON(cur_new_count != new_roots && new_roots == 0);
1680 qg->excl -= num_bytes;
1681 qg->excl_cmpr -= num_bytes;
1682 dirty = true;
1683 }
1325 1684
1326 list_for_each_entry(glist, &qg->groups, next_group) { 1685 /*
1327 ret = ulist_add(tmp, glist->group->qgroupid, 1686 * If we didn't reference all the roots before but now we do we
1328 (uintptr_t)glist->group, 1687 * have an exclusive reference to this range.
1329 GFP_ATOMIC); 1688 */
1330 if (ret < 0) 1689 if ((!old_roots || (old_roots && cur_old_count != old_roots))
1331 return ret; 1690 && cur_new_count == new_roots) {
1332 } 1691 qg->excl += num_bytes;
1692 qg->excl_cmpr += num_bytes;
1693 dirty = true;
1333 } 1694 }
1334 }
1335 1695
1696 if (dirty)
1697 qgroup_dirty(fs_info, qg);
1698 }
1336 return 0; 1699 return 0;
1337} 1700}
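A standalone sketch of the two exclusivity transitions above (counts are effective, i.e. already relative to seq; the sample numbers are made up):

        #include <stdio.h>

        static int excl_delta(int cur_old, int cur_new,
                              int old_roots, int new_roots)
        {
                /* Was exclusive (saw every old root), isn't anymore. */
                if (old_roots && cur_old == old_roots &&
                    (cur_new != new_roots || new_roots == 0))
                        return -1;
                /* Wasn't exclusive before, now sees every root. */
                if ((!old_roots || cur_old != old_roots) &&
                    cur_new == new_roots)
                        return 1;
                return 0;
        }

        int main(void)
        {
                /* Was the sole root (1 of 1), now one of two. */
                printf("%+d\n", excl_delta(1, 1, 1, 2));        /* -1: lost excl */
                /* Saw one of two roots before; now it's the only root left. */
                printf("%+d\n", excl_delta(1, 1, 2, 1));        /* +1: gained excl */
                return 0;
        }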
1338 1701
1339/* 1702/*
1340 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 1703 * If we removed a data extent and there were other references for that bytenr
1341 * from the fs. First, all roots referencing the extent are searched, and 1704 * then we need to lookup all referenced roots to make sure we still don't
1342 * then the space is accounted accordingly to the different roots. The 1705 * reference this bytenr. If we do then we can just discard this operation.
1343 * accounting algorithm works in 3 steps documented inline.
1344 */ 1706 */
1345int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, 1707static int check_existing_refs(struct btrfs_trans_handle *trans,
1346 struct btrfs_fs_info *fs_info, 1708 struct btrfs_fs_info *fs_info,
1347 struct btrfs_delayed_ref_node *node, 1709 struct btrfs_qgroup_operation *oper)
1348 struct btrfs_delayed_extent_op *extent_op)
1349{ 1710{
1350 struct btrfs_root *quota_root;
1351 u64 ref_root;
1352 struct btrfs_qgroup *qgroup;
1353 struct ulist *roots = NULL; 1711 struct ulist *roots = NULL;
1354 u64 seq; 1712 struct ulist_node *unode;
1713 struct ulist_iterator uiter;
1355 int ret = 0; 1714 int ret = 0;
1356 int sgn;
1357 1715
1358 if (!fs_info->quota_enabled) 1716 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1359 return 0; 1717 oper->elem.seq, &roots);
1360 1718 if (ret < 0)
1361 BUG_ON(!fs_info->quota_root); 1719 return ret;
1720 ret = 0;
1362 1721
1363 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1722 ULIST_ITER_INIT(&uiter);
1364 node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 1723 while ((unode = ulist_next(roots, &uiter))) {
1365 struct btrfs_delayed_tree_ref *ref; 1724 if (unode->val == oper->ref_root) {
1366 ref = btrfs_delayed_node_to_tree_ref(node); 1725 ret = 1;
1367 ref_root = ref->root; 1726 break;
1368 } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1727 }
1369 node->type == BTRFS_SHARED_DATA_REF_KEY) {
1370 struct btrfs_delayed_data_ref *ref;
1371 ref = btrfs_delayed_node_to_data_ref(node);
1372 ref_root = ref->root;
1373 } else {
1374 BUG();
1375 } 1728 }
1729 ulist_free(roots);
1730 btrfs_put_tree_mod_seq(fs_info, &oper->elem);
1376 1731
1377 if (!is_fstree(ref_root)) { 1732 return ret;
1378 /* 1733}
1379 * non-fs-trees are not being accounted
1380 */
1381 return 0;
1382 }
1383 1734
1384 switch (node->action) { 1735/*
1385 case BTRFS_ADD_DELAYED_REF: 1736 * If we share a reference across multiple roots then we may need to adjust
1386 case BTRFS_ADD_DELAYED_EXTENT: 1737 * various qgroups referenced and exclusive counters. The basic premise is this
1387 sgn = 1; 1738 *
1388 seq = btrfs_tree_mod_seq_prev(node->seq); 1739 * 1) We have seq to represent a 0 count. Instead of looping through all of the
1389 break; 1740 * qgroups and resetting their refcount to 0 we just constantly bump this
1390 case BTRFS_DROP_DELAYED_REF: 1741 * sequence number to act as the base reference count. This means that if
1391 sgn = -1; 1742 * anybody is equal to or below this sequence they were never referenced. We
1392 seq = node->seq; 1743 * jack this sequence up by the number of roots we found each time in order to
1393 break; 1744 * make sure we don't have any overlap.
1394 case BTRFS_UPDATE_DELAYED_HEAD: 1745 *
1395 return 0; 1746 * 2) We first search all the roots that reference the area _except_ the root
1396 default: 1747 * we're acting on currently. This makes up the old_refcnt of all the qgroups
1397 BUG(); 1748 * before.
1398 } 1749 *
1750 * 3) We walk all of the qgroups referenced by the root we are currently acting
1751 * on, and will either adjust old_refcnt in the case of a removal or the
1752 * new_refcnt in the case of an addition.
1753 *
1754 * 4) Finally we walk all the qgroups that are referenced by this range
1755 * including the root we are acting on currently. We will adjust the counters
1756 * based on the number of roots we had and will have after this operation.
1757 *
1758 * Take this example as an illustration
1759 *
1760 * [qgroup 1/0]
1761 * / | \
1762 * [qg 0/0] [qg 0/1] [qg 0/2]
1763 * \ | /
1764 * [ extent ]
1765 *
1766 * Say we are adding a reference that is covered by qg 0/0. The first step
1767 * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
1768 * old_roots being 2. Because it is adding, new_roots will be 3. We then go
1769 * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
1770 * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
1771 * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
1772 * reference and thus must add the size to the referenced bytes. Everything
1773 * else is the same so nothing else changes.
1774 */
1775static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
1776 struct btrfs_fs_info *fs_info,
1777 struct btrfs_qgroup_operation *oper)
1778{
1779 struct ulist *roots = NULL;
1780 struct ulist *qgroups, *tmp;
1781 struct btrfs_qgroup *qgroup;
1782 struct seq_list elem = {};
1783 u64 seq;
1784 int old_roots = 0;
1785 int new_roots = 0;
1786 int ret = 0;
1399 1787
1400 mutex_lock(&fs_info->qgroup_rescan_lock); 1788 if (oper->elem.seq) {
1401 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 1789 ret = check_existing_refs(trans, fs_info, oper);
1402 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { 1790 if (ret < 0)
1403 mutex_unlock(&fs_info->qgroup_rescan_lock); 1791 return ret;
1792 if (ret)
1404 return 0; 1793 return 0;
1405 }
1406 } 1794 }
1407 mutex_unlock(&fs_info->qgroup_rescan_lock);
1408 1795
1409 /* 1796 qgroups = ulist_alloc(GFP_NOFS);
1410 * the delayed ref sequence number we pass depends on the direction of 1797 if (!qgroups)
1411 * the operation. for add operations, we pass 1798 return -ENOMEM;
1412 * tree_mod_log_prev_seq(node->seq) to skip
1413 * the delayed ref's current sequence number, because we need the state
1414 * of the tree before the add operation. for delete operations, we pass
1415 * (node->seq) to include the delayed ref's current sequence number,
1416 * because we need the state of the tree after the delete operation.
1417 */
1418 ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
1419 if (ret < 0)
1420 return ret;
1421
1422 spin_lock(&fs_info->qgroup_lock);
1423 1799
1424 quota_root = fs_info->quota_root; 1800 tmp = ulist_alloc(GFP_NOFS);
1425 if (!quota_root) 1801 if (!tmp)
1426 goto unlock; 1802 return -ENOMEM;
1427 1803
1428 qgroup = find_qgroup_rb(fs_info, ref_root); 1804 btrfs_get_tree_mod_seq(fs_info, &elem);
1805 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
1806 &roots);
1807 btrfs_put_tree_mod_seq(fs_info, &elem);
1808 if (ret < 0) {
1809 ulist_free(qgroups);
1810 ulist_free(tmp);
1811 return ret;
1812 }
1813 spin_lock(&fs_info->qgroup_lock);
1814 qgroup = find_qgroup_rb(fs_info, oper->ref_root);
1429 if (!qgroup) 1815 if (!qgroup)
1430 goto unlock; 1816 goto out;
1817 seq = fs_info->qgroup_seq;
1431 1818
1432 /* 1819 /*
1433 * step 1: for each old ref, visit all nodes once and inc refcnt 1820 * So roots is the list of all the roots currently pointing at the
1821 * bytenr, including the ref we are adding if we are adding, or not if
1822 * we are removing a ref. So we pass in the ref_root to skip that root
1823 * in our calculations. We set old_refcnt and new_refcnt because who the
1824 * hell knows what everything looked like before, and it doesn't matter
1825 * except...
1434 */ 1826 */
1435 ulist_reinit(fs_info->qgroup_ulist); 1827 ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
1436 seq = fs_info->qgroup_seq; 1828 seq, &old_roots, 0);
1437 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1829 if (ret < 0)
1830 goto out;
1438 1831
1439 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, 1832 /*
1440 seq); 1833 * Now adjust the refcounts of the qgroups that care about this
1441 if (ret) 1834 * reference, either the old_count in the case of removal or new_count
1442 goto unlock; 1835 * in the case of an addition.
1836 */
1837 ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
1838 seq);
1839 if (ret < 0)
1840 goto out;
1443 1841
1444 /* 1842 /*
1445 * step 2: walk from the new root 1843 * ...in the case of removals. If we had a removal before we got around
1844 * to processing this operation then we need to find that guy and count
1845 * his references as if they really existed so we don't end up screwing
1846 * up the exclusive counts. Then whenever we go to process the delete
1847 * everything will be grand and we can account for whatever exclusive
1848 * changes need to be made there. We also have to pass in old_roots so
1849 * we have an accurate count of the roots as it pertains to this
1850 * operations view of the world.
1446 */ 1851 */
1447 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, 1852 ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
1448 seq, sgn, node->num_bytes, qgroup); 1853 &old_roots);
1449 if (ret) 1854 if (ret < 0)
1450 goto unlock; 1855 goto out;
1451 1856
1452 /* 1857 /*
1453 * step 3: walk again from old refs 1858 * We are adding our root, need to adjust up the number of roots,
1859 * otherwise old_roots is the number of roots we want.
1454 */ 1860 */
1455 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, 1861 if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
1456 seq, sgn, node->num_bytes); 1862 new_roots = old_roots + 1;
1457 if (ret) 1863 } else {
1458 goto unlock; 1864 new_roots = old_roots;
1865 old_roots++;
1866 }
1867 fs_info->qgroup_seq += old_roots + 1;
1459 1868
1460unlock: 1869
1870 /*
1871 * And now the magic happens, bless Arne for having a pretty elegant
1872 * solution for this.
1873 */
1874 qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
1875 qgroups, seq, old_roots, new_roots, 0);
1876out:
1461 spin_unlock(&fs_info->qgroup_lock); 1877 spin_unlock(&fs_info->qgroup_lock);
1878 ulist_free(qgroups);
1462 ulist_free(roots); 1879 ulist_free(roots);
1880 ulist_free(tmp);
1881 return ret;
1882}
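A standalone sketch replaying the comment's 1/0 example with made-up byte counts (effective refcounts, steps 3 and 4 only): only qg 0/0 picks up referenced bytes, everything else is unchanged:

        #include <stdio.h>

        struct qg { const char *name; int old_cnt, new_cnt; long long rfer, excl; };

        int main(void)
        {
                struct qg qgs[] = {
                        /* name  old new  rfer   excl */
                        { "0/0", 0,  0,   0,     0 },
                        { "0/1", 1,  1,   4096,  0 },
                        { "0/2", 1,  1,   4096,  0 },
                        { "1/0", 2,  2,   4096,  0 },   /* hit once per child */
                };
                int old_roots = 2;              /* 0/1 and 0/2 */
                int new_roots = old_roots + 1;  /* we are adding a shared ref */
                long long bytes = 4096;

                /* Step 3: the acting root (0/0) bumps new_cnt up its chain. */
                qgs[0].new_cnt++;               /* 0/0: 0 -> 1 */
                qgs[3].new_cnt++;               /* 1/0: 2 -> 3 */

                /* Step 4: adjust counters from the refcount transitions
                 * (conditions simplified: assumes old_roots, new_roots > 0). */
                for (int i = 0; i < 4; i++) {
                        struct qg *q = &qgs[i];

                        if (q->old_cnt == 0 && q->new_cnt > 0)
                                q->rfer += bytes;       /* newly referenced */
                        if (q->old_cnt == old_roots && q->new_cnt != new_roots)
                                q->excl -= bytes;       /* lost exclusivity */
                        else if (q->old_cnt != old_roots &&
                                 q->new_cnt == new_roots)
                                q->excl += bytes;       /* became exclusive */
                        printf("%s rfer=%lld excl=%lld\n",
                               q->name, q->rfer, q->excl);
                }
                return 0;
        }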
1883
1884/*
1885 * btrfs_qgroup_account is called for every ref that is added to or deleted
1886 * from the fs. First, all roots referencing the extent are searched, and
1887 * then the space is accounted to the different roots accordingly. The
1888 * accounting is handled by the exclusive or shared helpers above.
1889 */
1890static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1891 struct btrfs_fs_info *fs_info,
1892 struct btrfs_qgroup_operation *oper)
1893{
1894 int ret = 0;
1895
1896 if (!fs_info->quota_enabled)
1897 return 0;
1898
1899 BUG_ON(!fs_info->quota_root);
1900
1901 mutex_lock(&fs_info->qgroup_rescan_lock);
1902 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1903 if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
1904 mutex_unlock(&fs_info->qgroup_rescan_lock);
1905 return 0;
1906 }
1907 }
1908 mutex_unlock(&fs_info->qgroup_rescan_lock);
1909
1910 ASSERT(is_fstree(oper->ref_root));
1911
1912 switch (oper->type) {
1913 case BTRFS_QGROUP_OPER_ADD_EXCL:
1914 case BTRFS_QGROUP_OPER_SUB_EXCL:
1915 ret = qgroup_excl_accounting(fs_info, oper);
1916 break;
1917 case BTRFS_QGROUP_OPER_ADD_SHARED:
1918 case BTRFS_QGROUP_OPER_SUB_SHARED:
1919 ret = qgroup_shared_accounting(trans, fs_info, oper);
1920 break;
1921 default:
1922 ASSERT(0);
1923 }
1924 return ret;
1925}
1926
1927/*
1928 * Needs to be called every time we run delayed refs, even if there is an
1929 * error, in order to clean up outstanding operations.
1930 */
1931int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
1932 struct btrfs_fs_info *fs_info)
1933{
1934 struct btrfs_qgroup_operation *oper;
1935 int ret = 0;
1463 1936
1937 while (!list_empty(&trans->qgroup_ref_list)) {
1938 oper = list_first_entry(&trans->qgroup_ref_list,
1939 struct btrfs_qgroup_operation, list);
1940 list_del_init(&oper->list);
1941 if (!ret && !trans->aborted)
1942 ret = btrfs_qgroup_account(trans, fs_info, oper);
1943 spin_lock(&fs_info->qgroup_op_lock);
1944 rb_erase(&oper->n, &fs_info->qgroup_op_tree);
1945 spin_unlock(&fs_info->qgroup_op_lock);
1946 btrfs_put_tree_mod_seq(fs_info, &oper->elem);
1947 kfree(oper);
1948 }
1464 return ret; 1949 return ret;
1465} 1950}
1466 1951
@@ -1629,8 +2114,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1629 srcgroup = find_qgroup_rb(fs_info, srcid); 2114 srcgroup = find_qgroup_rb(fs_info, srcid);
1630 if (!srcgroup) 2115 if (!srcgroup)
1631 goto unlock; 2116 goto unlock;
1632 dstgroup->rfer = srcgroup->rfer - level_size; 2117
1633 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; 2118 /*
2119 * We call inherit after we clone the root in order to make sure
2120 * our counts don't go crazy, so at this point the only
2121 * difference between the two roots should be the root node.
2122 */
2123 dstgroup->rfer = srcgroup->rfer;
2124 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2125 dstgroup->excl = level_size;
2126 dstgroup->excl_cmpr = level_size;
1634 srcgroup->excl = level_size; 2127 srcgroup->excl = level_size;
1635 srcgroup->excl_cmpr = level_size; 2128 srcgroup->excl_cmpr = level_size;
1636 qgroup_dirty(fs_info, dstgroup); 2129 qgroup_dirty(fs_info, dstgroup);
@@ -1734,7 +2227,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1734 struct btrfs_qgroup *qg; 2227 struct btrfs_qgroup *qg;
1735 struct btrfs_qgroup_list *glist; 2228 struct btrfs_qgroup_list *glist;
1736 2229
1737 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 2230 qg = u64_to_ptr(unode->aux);
1738 2231
1739 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 2232 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1740 qg->reserved + (s64)qg->rfer + num_bytes > 2233 qg->reserved + (s64)qg->rfer + num_bytes >
@@ -1766,7 +2259,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1766 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 2259 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1767 struct btrfs_qgroup *qg; 2260 struct btrfs_qgroup *qg;
1768 2261
1769 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 2262 qg = u64_to_ptr(unode->aux);
1770 2263
1771 qg->reserved += num_bytes; 2264 qg->reserved += num_bytes;
1772 } 2265 }
@@ -1812,7 +2305,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1812 struct btrfs_qgroup *qg; 2305 struct btrfs_qgroup *qg;
1813 struct btrfs_qgroup_list *glist; 2306 struct btrfs_qgroup_list *glist;
1814 2307
1815 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 2308 qg = u64_to_ptr(unode->aux);
1816 2309
1817 qg->reserved -= num_bytes; 2310 qg->reserved -= num_bytes;
1818 2311
@@ -1848,15 +2341,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1848 */ 2341 */
1849static int 2342static int
1850qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2343qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1851 struct btrfs_trans_handle *trans, struct ulist *tmp, 2344 struct btrfs_trans_handle *trans, struct ulist *qgroups,
1852 struct extent_buffer *scratch_leaf) 2345 struct ulist *tmp, struct extent_buffer *scratch_leaf)
1853{ 2346{
1854 struct btrfs_key found; 2347 struct btrfs_key found;
1855 struct ulist *roots = NULL; 2348 struct ulist *roots = NULL;
1856 struct ulist_node *unode;
1857 struct ulist_iterator uiter;
1858 struct seq_list tree_mod_seq_elem = {}; 2349 struct seq_list tree_mod_seq_elem = {};
2350 u64 num_bytes;
1859 u64 seq; 2351 u64 seq;
2352 int new_roots;
1860 int slot; 2353 int slot;
1861 int ret; 2354 int ret;
1862 2355
@@ -1897,8 +2390,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1897 mutex_unlock(&fs_info->qgroup_rescan_lock); 2390 mutex_unlock(&fs_info->qgroup_rescan_lock);
1898 2391
1899 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 2392 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
1900 u64 num_bytes;
1901
1902 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 2393 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
1903 if (found.type != BTRFS_EXTENT_ITEM_KEY && 2394 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
1904 found.type != BTRFS_METADATA_ITEM_KEY) 2395 found.type != BTRFS_METADATA_ITEM_KEY)
@@ -1908,76 +2399,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1908 else 2399 else
1909 num_bytes = found.offset; 2400 num_bytes = found.offset;
1910 2401
1911 ret = btrfs_find_all_roots(trans, fs_info, found.objectid, 2402 ulist_reinit(qgroups);
1912 tree_mod_seq_elem.seq, &roots); 2403 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
2404 &roots);
1913 if (ret < 0) 2405 if (ret < 0)
1914 goto out; 2406 goto out;
1915 spin_lock(&fs_info->qgroup_lock); 2407 spin_lock(&fs_info->qgroup_lock);
1916 seq = fs_info->qgroup_seq; 2408 seq = fs_info->qgroup_seq;
1917 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 2409 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1918 2410
1919 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 2411 new_roots = 0;
1920 if (ret) { 2412 ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
2413 seq, &new_roots, 1);
2414 if (ret < 0) {
1921 spin_unlock(&fs_info->qgroup_lock); 2415 spin_unlock(&fs_info->qgroup_lock);
1922 ulist_free(roots); 2416 ulist_free(roots);
1923 goto out; 2417 goto out;
1924 } 2418 }
1925 2419
1926 /* 2420 ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
1927 * step2 of btrfs_qgroup_account_ref works from a single root, 2421 seq, 0, new_roots, 1);
1928 * we're doing all at once here. 2422 if (ret < 0) {
1929 */ 2423 spin_unlock(&fs_info->qgroup_lock);
1930 ulist_reinit(tmp); 2424 ulist_free(roots);
1931 ULIST_ITER_INIT(&uiter); 2425 goto out;
1932 while ((unode = ulist_next(roots, &uiter))) {
1933 struct btrfs_qgroup *qg;
1934
1935 qg = find_qgroup_rb(fs_info, unode->val);
1936 if (!qg)
1937 continue;
1938
1939 ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
1940 GFP_ATOMIC);
1941 if (ret < 0) {
1942 spin_unlock(&fs_info->qgroup_lock);
1943 ulist_free(roots);
1944 goto out;
1945 }
1946 }
1947
1948 /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
1949 ULIST_ITER_INIT(&uiter);
1950 while ((unode = ulist_next(tmp, &uiter))) {
1951 struct btrfs_qgroup *qg;
1952 struct btrfs_qgroup_list *glist;
1953
1954 qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
1955 qg->rfer += num_bytes;
1956 qg->rfer_cmpr += num_bytes;
1957 WARN_ON(qg->tag >= seq);
1958 if (qg->refcnt - seq == roots->nnodes) {
1959 qg->excl += num_bytes;
1960 qg->excl_cmpr += num_bytes;
1961 }
1962 qgroup_dirty(fs_info, qg);
1963
1964 list_for_each_entry(glist, &qg->groups, next_group) {
1965 ret = ulist_add(tmp, glist->group->qgroupid,
1966 (uintptr_t)glist->group,
1967 GFP_ATOMIC);
1968 if (ret < 0) {
1969 spin_unlock(&fs_info->qgroup_lock);
1970 ulist_free(roots);
1971 goto out;
1972 }
1973 }
1974 } 2426 }
1975
1976 spin_unlock(&fs_info->qgroup_lock); 2427 spin_unlock(&fs_info->qgroup_lock);
1977 ulist_free(roots); 2428 ulist_free(roots);
1978 ret = 0;
1979 } 2429 }
1980
1981out: 2430out:
1982 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 2431 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1983 2432
@@ -1990,13 +2439,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
1990 qgroup_rescan_work); 2439 qgroup_rescan_work);
1991 struct btrfs_path *path; 2440 struct btrfs_path *path;
1992 struct btrfs_trans_handle *trans = NULL; 2441 struct btrfs_trans_handle *trans = NULL;
1993 struct ulist *tmp = NULL; 2442 struct ulist *tmp = NULL, *qgroups = NULL;
1994 struct extent_buffer *scratch_leaf = NULL; 2443 struct extent_buffer *scratch_leaf = NULL;
1995 int err = -ENOMEM; 2444 int err = -ENOMEM;
1996 2445
1997 path = btrfs_alloc_path(); 2446 path = btrfs_alloc_path();
1998 if (!path) 2447 if (!path)
1999 goto out; 2448 goto out;
2449 qgroups = ulist_alloc(GFP_NOFS);
2450 if (!qgroups)
2451 goto out;
2000 tmp = ulist_alloc(GFP_NOFS); 2452 tmp = ulist_alloc(GFP_NOFS);
2001 if (!tmp) 2453 if (!tmp)
2002 goto out; 2454 goto out;
@@ -2015,7 +2467,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2015 err = -EINTR; 2467 err = -EINTR;
2016 } else { 2468 } else {
2017 err = qgroup_rescan_leaf(fs_info, path, trans, 2469 err = qgroup_rescan_leaf(fs_info, path, trans,
2018 tmp, scratch_leaf); 2470 qgroups, tmp, scratch_leaf);
2019 } 2471 }
2020 if (err > 0) 2472 if (err > 0)
2021 btrfs_commit_transaction(trans, fs_info->fs_root); 2473 btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2025,6 +2477,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2025 2477
2026out: 2478out:
2027 kfree(scratch_leaf); 2479 kfree(scratch_leaf);
2480 ulist_free(qgroups);
2028 ulist_free(tmp); 2481 ulist_free(tmp);
2029 btrfs_free_path(path); 2482 btrfs_free_path(path);
2030 2483
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 000000000000..5952ff1fbd7a
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
1/*
2 * Copyright (C) 2014 Facebook. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_QGROUP__
20#define __BTRFS_QGROUP__
21
22/*
23 * A description of the operations. All of these operations only happen when
24 * we are adding the 1st reference for that subvolume in the case of adding
25 * space, or on the last reference delete in the case of subtraction. The
26 * shared variants below are the ones that need the full accounting walk.
27 *
28 * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
29 * one pointing at the bytes we are adding. This is called on the first
30 * allocation.
31 *
32 * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
33 * shared between subvols. This is called when creating a ref on an extent
34 * that already has refs from a different subvolume, so basically reflink.
35 *
36 * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
37 * one referencing the range.
38 *
39 * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares
40 * refs with other subvolumes.
41 */
42enum btrfs_qgroup_operation_type {
43 BTRFS_QGROUP_OPER_ADD_EXCL,
44 BTRFS_QGROUP_OPER_ADD_SHARED,
45 BTRFS_QGROUP_OPER_SUB_EXCL,
46 BTRFS_QGROUP_OPER_SUB_SHARED,
47};
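A standalone sketch of how a caller would pick among these types (the classify() helper and the adding/shared flags are made up for illustration):

        #include <stdio.h>

        enum btrfs_qgroup_operation_type {
                BTRFS_QGROUP_OPER_ADD_EXCL,
                BTRFS_QGROUP_OPER_ADD_SHARED,
                BTRFS_QGROUP_OPER_SUB_EXCL,
                BTRFS_QGROUP_OPER_SUB_SHARED,
        };

        static enum btrfs_qgroup_operation_type classify(int adding, int shared)
        {
                if (adding)
                        return shared ? BTRFS_QGROUP_OPER_ADD_SHARED
                                      : BTRFS_QGROUP_OPER_ADD_EXCL;
                return shared ? BTRFS_QGROUP_OPER_SUB_SHARED
                              : BTRFS_QGROUP_OPER_SUB_EXCL;
        }

        int main(void)
        {
                /* First allocation: no other subvolume points at the extent. */
                printf("%d\n", classify(1, 0));         /* ADD_EXCL */
                /* Reflink into another subvolume: the extent is now shared. */
                printf("%d\n", classify(1, 1));         /* ADD_SHARED */
                return 0;
        }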
48
49struct btrfs_qgroup_operation {
50 u64 ref_root;
51 u64 bytenr;
52 u64 num_bytes;
53 u64 seq;
54 enum btrfs_qgroup_operation_type type;
55 struct seq_list elem;
56 struct rb_node n;
57 struct list_head list;
58};
59
60int btrfs_quota_enable(struct btrfs_trans_handle *trans,
61 struct btrfs_fs_info *fs_info);
62int btrfs_quota_disable(struct btrfs_trans_handle *trans,
63 struct btrfs_fs_info *fs_info);
64int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
65void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
66int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
67int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
68 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
69int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
70 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
71int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
72 struct btrfs_fs_info *fs_info, u64 qgroupid,
73 char *name);
74int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
75 struct btrfs_fs_info *fs_info, u64 qgroupid);
76int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
77 struct btrfs_fs_info *fs_info, u64 qgroupid,
78 struct btrfs_qgroup_limit *limit);
79int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
80void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
81struct btrfs_delayed_extent_op;
82int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info, u64 ref_root,
84 u64 bytenr, u64 num_bytes,
85 enum btrfs_qgroup_operation_type type,
86 int mod_seq);
87int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
88 struct btrfs_fs_info *fs_info);
89void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
90 struct btrfs_fs_info *fs_info,
91 struct btrfs_qgroup_operation *oper);
92int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
93 struct btrfs_fs_info *fs_info);
94int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
95 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
96 struct btrfs_qgroup_inherit *inherit);
97int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
98void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
99
100void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
101
102#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
103int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
104 u64 rfer, u64 excl);
105#endif
106
107#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7f92ab1daa87..65245a07275b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
337 if (bnode->root) 337 if (bnode->root)
338 fs_info = bnode->root->fs_info; 338 fs_info = bnode->root->fs_info;
339 btrfs_panic(fs_info, errno, "Inconsistency in backref cache " 339 btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
340 "found at offset %llu\n", bytenr); 340 "found at offset %llu", bytenr);
341} 341}
342 342
343/* 343/*
@@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root)
528{ 528{
529 struct btrfs_root *reloc_root; 529 struct btrfs_root *reloc_root;
530 530
531 if (!root->ref_cows) 531 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
532 return 0; 532 return 0;
533 533
534 reloc_root = root->reloc_root; 534 reloc_root = root->reloc_root;
@@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,
610 root = read_fs_root(rc->extent_root->fs_info, root_objectid); 610 root = read_fs_root(rc->extent_root->fs_info, root_objectid);
611 BUG_ON(IS_ERR(root)); 611 BUG_ON(IS_ERR(root));
612 612
613 if (root->ref_cows && 613 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
614 generation != btrfs_root_generation(&root->root_item)) 614 generation != btrfs_root_generation(&root->root_item))
615 return NULL; 615 return NULL;
616 616
@@ -887,7 +887,7 @@ again:
887 goto out; 887 goto out;
888 } 888 }
889 889
890 if (!root->ref_cows) 890 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
891 cur->cowonly = 1; 891 cur->cowonly = 1;
892 892
893 if (btrfs_root_level(&root->root_item) == cur->level) { 893 if (btrfs_root_level(&root->root_item) == cur->level) {
@@ -954,7 +954,8 @@ again:
954 upper->bytenr = eb->start; 954 upper->bytenr = eb->start;
955 upper->owner = btrfs_header_owner(eb); 955 upper->owner = btrfs_header_owner(eb);
956 upper->level = lower->level + 1; 956 upper->level = lower->level + 1;
957 if (!root->ref_cows) 957 if (!test_bit(BTRFS_ROOT_REF_COWS,
958 &root->state))
958 upper->cowonly = 1; 959 upper->cowonly = 1;
959 960
960 /* 961 /*
@@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
1258 if (rb_node) { 1259 if (rb_node) {
1259 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " 1260 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1260 "for start=%llu while inserting into relocation " 1261 "for start=%llu while inserting into relocation "
1261 "tree\n", node->bytenr); 1262 "tree", node->bytenr);
1262 kfree(node); 1263 kfree(node);
1263 return -EEXIST; 1264 return -EEXIST;
1264 } 1265 }
@@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2441 next = walk_up_backref(next, edges, &index); 2442 next = walk_up_backref(next, edges, &index);
2442 root = next->root; 2443 root = next->root;
2443 BUG_ON(!root); 2444 BUG_ON(!root);
2444 BUG_ON(!root->ref_cows); 2445 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2445 2446
2446 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2447 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2447 record_reloc_root_in_trans(trans, root); 2448 record_reloc_root_in_trans(trans, root);
@@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2506 BUG_ON(!root); 2507 BUG_ON(!root);
2507 2508
2508 /* no other choice for non-references counted tree */ 2509 /* no other choice for non-references counted tree */
2509 if (!root->ref_cows) 2510 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2510 return root; 2511 return root;
2511 2512
2512 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2513 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2893 goto out; 2894 goto out;
2894 } 2895 }
2895 2896
2896 if (!root || root->ref_cows) { 2897 if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2897 ret = reserve_metadata_space(trans, rc, node); 2898 ret = reserve_metadata_space(trans, rc, node);
2898 if (ret) 2899 if (ret)
2899 goto out; 2900 goto out;
2900 } 2901 }
2901 2902
2902 if (root) { 2903 if (root) {
2903 if (root->ref_cows) { 2904 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2904 BUG_ON(node->new_bytenr); 2905 BUG_ON(node->new_bytenr);
2905 BUG_ON(!list_empty(&node->list)); 2906 BUG_ON(!list_empty(&node->list));
2906 btrfs_record_root_in_trans(trans, root); 2907 btrfs_record_root_in_trans(trans, root);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 38bb47e7d6b1..360a728a639f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
306 break; 306 break;
307 } 307 }
308 308
309 root->orphan_item_inserted = 1; 309 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
310 310
311 err = btrfs_insert_fs_root(root->fs_info, root); 311 err = btrfs_insert_fs_root(root->fs_info, root);
312 if (err) { 312 if (err) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0be77993378e..ac80188eec88 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
588 588
589 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 589 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
590 do { 590 do {
591 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 591 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
592 &ref_root, &ref_level); 592 item_size, &ref_root,
593 &ref_level);
593 printk_in_rcu(KERN_WARNING 594 printk_in_rcu(KERN_WARNING
594 "BTRFS: %s at logical %llu on dev %s, " 595 "BTRFS: %s at logical %llu on dev %s, "
595 "sector %llu: metadata %s (level %d) in tree " 596 "sector %llu: metadata %s (level %d) in tree "
@@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
717out: 718out:
718 if (page) 719 if (page)
719 put_page(page); 720 put_page(page);
720 if (inode) 721
721 iput(inode); 722 iput(inode);
722 723
723 if (ret < 0) 724 if (ret < 0)
724 return ret; 725 return ret;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 484aacac2c89..6528aa662181 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -975,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
975 struct btrfs_dir_item *di; 975 struct btrfs_dir_item *di;
976 struct btrfs_key di_key; 976 struct btrfs_key di_key;
977 char *buf = NULL; 977 char *buf = NULL;
978 const int buf_len = PATH_MAX; 978 int buf_len;
979 u32 name_len; 979 u32 name_len;
980 u32 data_len; 980 u32 data_len;
981 u32 cur; 981 u32 cur;
@@ -985,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
985 int num; 985 int num;
986 u8 type; 986 u8 type;
987 987
988 if (found_key->type == BTRFS_XATTR_ITEM_KEY)
989 buf_len = BTRFS_MAX_XATTR_SIZE(root);
990 else
991 buf_len = PATH_MAX;
992
988 buf = kmalloc(buf_len, GFP_NOFS); 993 buf = kmalloc(buf_len, GFP_NOFS);
989 if (!buf) { 994 if (!buf) {
990 ret = -ENOMEM; 995 ret = -ENOMEM;
@@ -1006,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1006 type = btrfs_dir_type(eb, di); 1011 type = btrfs_dir_type(eb, di);
1007 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1012 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1008 1013
1009 /* 1014 if (type == BTRFS_FT_XATTR) {
1010 * Path too long 1015 if (name_len > XATTR_NAME_MAX) {
1011 */ 1016 ret = -ENAMETOOLONG;
1012 if (name_len + data_len > buf_len) { 1017 goto out;
1013 ret = -ENAMETOOLONG; 1018 }
1014 goto out; 1019 if (name_len + data_len > buf_len) {
1020 ret = -E2BIG;
1021 goto out;
1022 }
1023 } else {
1024 /*
1025 * Path too long
1026 */
1027 if (name_len + data_len > buf_len) {
1028 ret = -ENAMETOOLONG;
1029 goto out;
1030 }
1015 } 1031 }
1016 1032
1017 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1033 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -1349,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1349 ret = -EIO; 1365 ret = -EIO;
1350 btrfs_err(sctx->send_root->fs_info, "did not find backref in " 1366 btrfs_err(sctx->send_root->fs_info, "did not find backref in "
1351 "send_root. inode=%llu, offset=%llu, " 1367 "send_root. inode=%llu, offset=%llu, "
1352 "disk_byte=%llu found extent=%llu\n", 1368 "disk_byte=%llu found extent=%llu",
1353 ino, data_offset, disk_byte, found_key.objectid); 1369 ino, data_offset, disk_byte, found_key.objectid);
1354 goto out; 1370 goto out;
1355 } 1371 }
@@ -1628,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
1628 goto out; 1644 goto out;
1629 } 1645 }
1630 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1646 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1647 if (key.type == BTRFS_ROOT_ITEM_KEY) {
1648 ret = -ENOENT;
1649 goto out;
1650 }
1631 *found_inode = key.objectid; 1651 *found_inode = key.objectid;
1632 *found_type = btrfs_dir_type(path->nodes[0], di); 1652 *found_type = btrfs_dir_type(path->nodes[0], di);
1633 1653
@@ -1693,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
1693 goto out; 1713 goto out;
1694 btrfs_release_path(path); 1714 btrfs_release_path(path);
1695 1715
1696 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, 1716 if (dir_gen) {
1697 NULL, NULL); 1717 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
1698 if (ret < 0) 1718 NULL, NULL, NULL);
1699 goto out; 1719 if (ret < 0)
1720 goto out;
1721 }
1700 1722
1701 *dir = parent_dir; 1723 *dir = parent_dir;
1702 1724
@@ -1712,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,
1712 int ret; 1734 int ret;
1713 struct fs_path *tmp_name; 1735 struct fs_path *tmp_name;
1714 u64 tmp_dir; 1736 u64 tmp_dir;
1715 u64 tmp_dir_gen;
1716 1737
1717 tmp_name = fs_path_alloc(); 1738 tmp_name = fs_path_alloc();
1718 if (!tmp_name) 1739 if (!tmp_name)
1719 return -ENOMEM; 1740 return -ENOMEM;
1720 1741
1721 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1742 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
1722 if (ret < 0) 1743 if (ret < 0)
1723 goto out; 1744 goto out;
1724 1745
@@ -2029,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2029{ 2050{
2030 int ret; 2051 int ret;
2031 int nce_ret; 2052 int nce_ret;
2032 struct btrfs_path *path = NULL;
2033 struct name_cache_entry *nce = NULL; 2053 struct name_cache_entry *nce = NULL;
2034 2054
2035 /* 2055 /*
@@ -2055,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2055 } 2075 }
2056 } 2076 }
2057 2077
2058 path = alloc_path_for_send();
2059 if (!path)
2060 return -ENOMEM;
2061
2062 /* 2078 /*
2063 * If the inode is not existent yet, add the orphan name and return 1. 2079 * If the inode is not existent yet, add the orphan name and return 1.
2064 * This should only happen for the parent dir that we determine in 2080 * This should only happen for the parent dir that we determine in
@@ -2134,7 +2150,6 @@ out_cache:
2134 name_cache_clean_unused(sctx); 2150 name_cache_clean_unused(sctx);
2135 2151
2136out: 2152out:
2137 btrfs_free_path(path);
2138 return ret; 2153 return ret;
2139} 2154}
2140 2155
@@ -2945,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx,
2945static int add_pending_dir_move(struct send_ctx *sctx, 2960static int add_pending_dir_move(struct send_ctx *sctx,
2946 u64 ino, 2961 u64 ino,
2947 u64 ino_gen, 2962 u64 ino_gen,
2948 u64 parent_ino) 2963 u64 parent_ino,
2964 struct list_head *new_refs,
2965 struct list_head *deleted_refs)
2949{ 2966{
2950 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2967 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2951 struct rb_node *parent = NULL; 2968 struct rb_node *parent = NULL;
@@ -2977,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx,
2977 } 2994 }
2978 } 2995 }
2979 2996
2980 list_for_each_entry(cur, &sctx->deleted_refs, list) { 2997 list_for_each_entry(cur, deleted_refs, list) {
2981 ret = dup_ref(cur, &pm->update_refs); 2998 ret = dup_ref(cur, &pm->update_refs);
2982 if (ret < 0) 2999 if (ret < 0)
2983 goto out; 3000 goto out;
2984 } 3001 }
2985 list_for_each_entry(cur, &sctx->new_refs, list) { 3002 list_for_each_entry(cur, new_refs, list) {
2986 ret = dup_ref(cur, &pm->update_refs); 3003 ret = dup_ref(cur, &pm->update_refs);
2987 if (ret < 0) 3004 if (ret < 0)
2988 goto out; 3005 goto out;
@@ -3025,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3025 return NULL; 3042 return NULL;
3026} 3043}
3027 3044
3045static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3046 u64 ino, u64 gen, u64 *ancestor_ino)
3047{
3048 int ret = 0;
3049 u64 parent_inode = 0;
3050 u64 parent_gen = 0;
3051 u64 start_ino = ino;
3052
3053 *ancestor_ino = 0;
3054 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3055 fs_path_reset(name);
3056
3057 if (is_waiting_for_rm(sctx, ino))
3058 break;
3059 if (is_waiting_for_move(sctx, ino)) {
3060 if (*ancestor_ino == 0)
3061 *ancestor_ino = ino;
3062 ret = get_first_ref(sctx->parent_root, ino,
3063 &parent_inode, &parent_gen, name);
3064 } else {
3065 ret = __get_cur_name_and_parent(sctx, ino, gen,
3066 &parent_inode,
3067 &parent_gen, name);
3068 if (ret > 0) {
3069 ret = 0;
3070 break;
3071 }
3072 }
3073 if (ret < 0)
3074 break;
3075 if (parent_inode == start_ino) {
3076 ret = 1;
3077 if (*ancestor_ino == 0)
3078 *ancestor_ino = ino;
3079 break;
3080 }
3081 ino = parent_inode;
3082 gen = parent_gen;
3083 }
3084 return ret;
3085}
3086
3028static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3087static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3029{ 3088{
3030 struct fs_path *from_path = NULL; 3089 struct fs_path *from_path = NULL;
@@ -3036,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3036 struct waiting_dir_move *dm = NULL; 3095 struct waiting_dir_move *dm = NULL;
3037 u64 rmdir_ino = 0; 3096 u64 rmdir_ino = 0;
3038 int ret; 3097 int ret;
3098 u64 ancestor = 0;
3039 3099
3040 name = fs_path_alloc(); 3100 name = fs_path_alloc();
3041 from_path = fs_path_alloc(); 3101 from_path = fs_path_alloc();
@@ -3054,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3054 if (ret < 0) 3114 if (ret < 0)
3055 goto out; 3115 goto out;
3056 3116
3057 if (parent_ino == sctx->cur_ino) { 3117 ret = get_cur_path(sctx, parent_ino, parent_gen,
3058 /* child only renamed, not moved */ 3118 from_path);
3059 ASSERT(parent_gen == sctx->cur_inode_gen); 3119 if (ret < 0)
3060 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, 3120 goto out;
3061 from_path); 3121 ret = fs_path_add_path(from_path, name);
3062 if (ret < 0) 3122 if (ret < 0)
3063 goto out; 3123 goto out;
3064 ret = fs_path_add_path(from_path, name); 3124
3065 if (ret < 0) 3125 sctx->send_progress = sctx->cur_ino + 1;
3066 goto out; 3126 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3067 } else { 3127 if (ret) {
3068 /* child moved and maybe renamed too */ 3128 LIST_HEAD(deleted_refs);
3069 sctx->send_progress = pm->ino; 3129 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3070 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3130 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3131 &pm->update_refs, &deleted_refs);
3071 if (ret < 0) 3132 if (ret < 0)
3072 goto out; 3133 goto out;
3073 } 3134 if (rmdir_ino) {
3074 3135 dm = get_waiting_dir_move(sctx, pm->ino);
3075 fs_path_free(name); 3136 ASSERT(dm);
3076 name = NULL; 3137 dm->rmdir_ino = rmdir_ino;
3077 3138 }
3078 to_path = fs_path_alloc();
3079 if (!to_path) {
3080 ret = -ENOMEM;
3081 goto out; 3139 goto out;
3082 } 3140 }
3083 3141 fs_path_reset(name);
3084 sctx->send_progress = sctx->cur_ino + 1; 3142 to_path = name;
3143 name = NULL;
3085 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3144 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
3086 if (ret < 0) 3145 if (ret < 0)
3087 goto out; 3146 goto out;
@@ -3205,127 +3264,74 @@ out:
3205static int wait_for_parent_move(struct send_ctx *sctx, 3264static int wait_for_parent_move(struct send_ctx *sctx,
3206 struct recorded_ref *parent_ref) 3265 struct recorded_ref *parent_ref)
3207{ 3266{
3208 int ret; 3267 int ret = 0;
3209 u64 ino = parent_ref->dir; 3268 u64 ino = parent_ref->dir;
3210 u64 parent_ino_before, parent_ino_after; 3269 u64 parent_ino_before, parent_ino_after;
3211 u64 old_gen;
3212 struct fs_path *path_before = NULL; 3270 struct fs_path *path_before = NULL;
3213 struct fs_path *path_after = NULL; 3271 struct fs_path *path_after = NULL;
3214 int len1, len2; 3272 int len1, len2;
3215 int register_upper_dirs;
3216 u64 gen;
3217
3218 if (is_waiting_for_move(sctx, ino))
3219 return 1;
3220
3221 if (parent_ref->dir <= sctx->cur_ino)
3222 return 0;
3223
3224 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3225 NULL, NULL, NULL, NULL);
3226 if (ret == -ENOENT)
3227 return 0;
3228 else if (ret < 0)
3229 return ret;
3230
3231 if (parent_ref->dir_gen != old_gen)
3232 return 0;
3233
3234 path_before = fs_path_alloc();
3235 if (!path_before)
3236 return -ENOMEM;
3237
3238 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3239 NULL, path_before);
3240 if (ret == -ENOENT) {
3241 ret = 0;
3242 goto out;
3243 } else if (ret < 0) {
3244 goto out;
3245 }
3246 3273
3247 path_after = fs_path_alloc(); 3274 path_after = fs_path_alloc();
3248 if (!path_after) { 3275 path_before = fs_path_alloc();
3276 if (!path_after || !path_before) {
3249 ret = -ENOMEM; 3277 ret = -ENOMEM;
3250 goto out; 3278 goto out;
3251 } 3279 }
3252 3280
3253 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3254 &gen, path_after);
3255 if (ret == -ENOENT) {
3256 ret = 0;
3257 goto out;
3258 } else if (ret < 0) {
3259 goto out;
3260 }
3261
3262 len1 = fs_path_len(path_before);
3263 len2 = fs_path_len(path_after);
3264 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3265 memcmp(path_before->start, path_after->start, len1)) {
3266 ret = 1;
3267 goto out;
3268 }
3269 ret = 0;
3270
3271 /* 3281 /*
3272 * Ok, our new most direct ancestor has a higher inode number but 3282 * Our current directory inode may not yet be renamed/moved because some
3273 * wasn't moved/renamed. So maybe some of the new ancestors higher in 3283 * ancestor (immediate or not) has to be renamed/moved first. So find if
3274 * the hierarchy have an higher inode number too *and* were renamed 3284 * such ancestor exists and make sure our own rename/move happens after
3275 * or moved - in this case we need to wait for the ancestor's rename 3285 * that ancestor is processed.
3276 * or move operation before we can do the move/rename for the current
3277 * inode.
3278 */ 3286 */
3279 register_upper_dirs = 0; 3287 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3280 ino = parent_ino_after; 3288 if (is_waiting_for_move(sctx, ino)) {
3281again: 3289 ret = 1;
3282 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { 3290 break;
3283 u64 parent_gen; 3291 }
3284 3292
3285 fs_path_reset(path_before); 3293 fs_path_reset(path_before);
3286 fs_path_reset(path_after); 3294 fs_path_reset(path_after);
3287 3295
3288 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3296 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3289 &parent_gen, path_after); 3297 NULL, path_after);
3290 if (ret < 0) 3298 if (ret < 0)
3291 goto out; 3299 goto out;
3292 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, 3300 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3293 NULL, path_before); 3301 NULL, path_before);
3294 if (ret == -ENOENT) { 3302 if (ret < 0 && ret != -ENOENT) {
3295 ret = 0;
3296 break;
3297 } else if (ret < 0) {
3298 goto out; 3303 goto out;
3304 } else if (ret == -ENOENT) {
3305 ret = 1;
3306 break;
3299 } 3307 }
3300 3308
3301 len1 = fs_path_len(path_before); 3309 len1 = fs_path_len(path_before);
3302 len2 = fs_path_len(path_after); 3310 len2 = fs_path_len(path_after);
3303 if (parent_ino_before != parent_ino_after || len1 != len2 || 3311 if (ino > sctx->cur_ino &&
3304 memcmp(path_before->start, path_after->start, len1)) { 3312 (parent_ino_before != parent_ino_after || len1 != len2 ||
3313 memcmp(path_before->start, path_after->start, len1))) {
3305 ret = 1; 3314 ret = 1;
3306 if (register_upper_dirs) { 3315 break;
3307 break;
3308 } else {
3309 register_upper_dirs = 1;
3310 ino = parent_ref->dir;
3311 gen = parent_ref->dir_gen;
3312 goto again;
3313 }
3314 } else if (register_upper_dirs) {
3315 ret = add_pending_dir_move(sctx, ino, gen,
3316 parent_ino_after);
3317 if (ret < 0 && ret != -EEXIST)
3318 goto out;
3319 } 3316 }
3320
3321 ino = parent_ino_after; 3317 ino = parent_ino_after;
3322 gen = parent_gen;
3323 } 3318 }
3324 3319
3325out: 3320out:
3326 fs_path_free(path_before); 3321 fs_path_free(path_before);
3327 fs_path_free(path_after); 3322 fs_path_free(path_after);
3328 3323
3324 if (ret == 1) {
3325 ret = add_pending_dir_move(sctx,
3326 sctx->cur_ino,
3327 sctx->cur_inode_gen,
3328 ino,
3329 &sctx->new_refs,
3330 &sctx->deleted_refs);
3331 if (!ret)
3332 ret = 1;
3333 }
3334
3329 return ret; 3335 return ret;
3330} 3336}
3331 3337
@@ -3486,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3486 if (ret < 0) 3492 if (ret < 0)
3487 goto out; 3493 goto out;
3488 if (ret) { 3494 if (ret) {
3489 ret = add_pending_dir_move(sctx,
3490 sctx->cur_ino,
3491 sctx->cur_inode_gen,
3492 cur->dir);
3493 *pending_move = 1; 3495 *pending_move = 1;
3494 } else { 3496 } else {
3495 ret = send_rename(sctx, valid_path, 3497 ret = send_rename(sctx, valid_path,
@@ -5490,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
5490 */ 5492 */
5491 if (root->send_in_progress < 0) 5493 if (root->send_in_progress < 0)
5492 btrfs_err(root->fs_info, 5494 btrfs_err(root->fs_info,
5493 "send_in_progres unbalanced %d root %llu\n", 5495 "send_in_progres unbalanced %d root %llu",
5494 root->send_in_progress, root->root_key.objectid); 5496 root->send_in_progress, root->root_key.objectid);
5495 spin_unlock(&root->root_item_lock); 5497 spin_unlock(&root->root_item_lock);
5496} 5498}
@@ -5518,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5518 5520
5519 /* 5521 /*
5520 * The subvolume must remain read-only during send, protect against 5522 * The subvolume must remain read-only during send, protect against
5521 * making it RW. 5523 * making it RW. This also protects against deletion.
5522 */ 5524 */
5523 spin_lock(&send_root->root_item_lock); 5525 spin_lock(&send_root->root_item_lock);
5524 send_root->send_in_progress++; 5526 send_root->send_in_progress++;
@@ -5578,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5578 } 5580 }
5579 5581
5580 sctx->send_root = send_root; 5582 sctx->send_root = send_root;
5583 /*
5584	 * Unlikely but possible: if the subvolume is marked for deletion but
5585	 * the removal of its directory entry is slow, a send can still be started
5586 */
5587 if (btrfs_root_dead(sctx->send_root)) {
5588 ret = -EPERM;
5589 goto out;
5590 }
5591
5581 sctx->clone_roots_cnt = arg->clone_sources_count; 5592 sctx->clone_roots_cnt = arg->clone_sources_count;
5582 5593
5583 sctx->send_max_size = BTRFS_SEND_BUF_SIZE; 5594 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
@@ -5667,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5667 5678
5668 spin_lock(&sctx->parent_root->root_item_lock); 5679 spin_lock(&sctx->parent_root->root_item_lock);
5669 sctx->parent_root->send_in_progress++; 5680 sctx->parent_root->send_in_progress++;
5670 if (!btrfs_root_readonly(sctx->parent_root)) { 5681 if (!btrfs_root_readonly(sctx->parent_root) ||
5682 btrfs_root_dead(sctx->parent_root)) {
5671 spin_unlock(&sctx->parent_root->root_item_lock); 5683 spin_unlock(&sctx->parent_root->root_item_lock);
5672 srcu_read_unlock(&fs_info->subvol_srcu, index); 5684 srcu_read_unlock(&fs_info->subvol_srcu, index);
5673 ret = -EPERM; 5685 ret = -EPERM;
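
The new path_loop() helper above is the heart of this send.c change: before applying a directory move it walks up the first-ref parents of the inode being moved, and if the walk arrives back at the starting inode the move is re-queued behind the offending ancestor instead of being applied. A minimal standalone sketch of that ancestor walk, assuming hypothetical types rather than the kernel's send_ctx machinery:

#include <stddef.h>

struct node {
	unsigned long ino;
	struct node *parent;	/* NULL at the subvolume root */
};

/*
 * Return 1 if following parent links from 'start' reaches an inode with the
 * starting inode number, i.e. applying the pending move now would create a
 * path loop; return 0 when the walk reaches the root cleanly.
 */
static int move_creates_loop(const struct node *start)
{
	const struct node *cur = start->parent;

	while (cur) {
		if (cur->ino == start->ino)
			return 1;
		cur = cur->parent;
	}
	return 0;
}

In the real function each step's parent comes from get_first_ref() or __get_cur_name_and_parent(), and the walk also stops early at inodes that are themselves still waiting for a rename or an rmdir.
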
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9601d25a4607..4662d92a4b73 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -511,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
511 } else if (compress) { 511 } else if (compress) {
512 if (!btrfs_test_opt(root, COMPRESS)) 512 if (!btrfs_test_opt(root, COMPRESS))
513 btrfs_info(root->fs_info, 513 btrfs_info(root->fs_info,
514 "btrfs: use %s compression\n", 514 "btrfs: use %s compression",
515 compress_type); 515 compress_type);
516 } 516 }
517 break; 517 break;
@@ -580,8 +580,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
580 } 580 }
581 break; 581 break;
582 case Opt_acl: 582 case Opt_acl:
583#ifdef CONFIG_BTRFS_FS_POSIX_ACL
583 root->fs_info->sb->s_flags |= MS_POSIXACL; 584 root->fs_info->sb->s_flags |= MS_POSIXACL;
584 break; 585 break;
586#else
587 btrfs_err(root->fs_info,
588 "support for ACL not compiled in!");
589 ret = -EINVAL;
590 goto out;
591#endif
585 case Opt_noacl: 592 case Opt_noacl:
586 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 593 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
587 break; 594 break;
@@ -1413,6 +1420,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1413 * this also happens on 'umount -rf' or on shutdown, when 1420 * this also happens on 'umount -rf' or on shutdown, when
1414 * the filesystem is busy. 1421 * the filesystem is busy.
1415 */ 1422 */
1423 cancel_work_sync(&fs_info->async_reclaim_work);
1416 1424
1417 /* wait for the uuid_scan task to finish */ 1425 /* wait for the uuid_scan task to finish */
1418 down(&fs_info->uuid_tree_rescan_sem); 1426 down(&fs_info->uuid_tree_rescan_sem);
@@ -1894,6 +1902,9 @@ static int btrfs_run_sanity_tests(void)
1894 if (ret) 1902 if (ret)
1895 goto out; 1903 goto out;
1896 ret = btrfs_test_inodes(); 1904 ret = btrfs_test_inodes();
1905 if (ret)
1906 goto out;
1907 ret = btrfs_test_qgroups();
1897out: 1908out:
1898 btrfs_destroy_test_fs(); 1909 btrfs_destroy_test_fs();
1899 return ret; 1910 return ret;
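
The Opt_acl hunk above turns a silent misconfiguration into a hard error: when CONFIG_BTRFS_FS_POSIX_ACL is not built in, mounting with -o acl now fails instead of setting a flag nothing will honor. A small sketch of the same compile-time gating pattern, with a hypothetical Kconfig symbol and flag bit standing in for the btrfs ones:

#include <stdio.h>

/* Hypothetical stand-in for the Kconfig symbol; define it to enable ACLs. */
/* #define CONFIG_MYFS_POSIX_ACL */

#define MYFS_FLAG_POSIXACL (1UL << 0)	/* stand-in for MS_POSIXACL */

/* Return 0 on success, -1 when ACLs are not compiled into this build. */
static int apply_acl_option(unsigned long *sb_flags)
{
#ifdef CONFIG_MYFS_POSIX_ACL
	*sb_flags |= MYFS_FLAG_POSIXACL;
	return 0;
#else
	(void)sb_flags;
	fprintf(stderr, "ACL support not compiled in!\n");
	return -1;
#endif
}
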
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c5eb2143dc66..df39458f1487 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); 254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
255 255
256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
257 258
258static ssize_t raid_bytes_show(struct kobject *kobj, 259static ssize_t raid_bytes_show(struct kobject *kobj,
259 struct kobj_attribute *attr, char *buf); 260 struct kobj_attribute *attr, char *buf);
@@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
266{ 267{
267 struct btrfs_space_info *sinfo = to_space_info(kobj->parent); 268 struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
268 struct btrfs_block_group_cache *block_group; 269 struct btrfs_block_group_cache *block_group;
269 int index = kobj - sinfo->block_group_kobjs; 270 int index = to_raid_kobj(kobj)->raid_type;
270 u64 val = 0; 271 u64 val = 0;
271 272
272 down_read(&sinfo->groups_sem); 273 down_read(&sinfo->groups_sem);
@@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = {
288 289
289static void release_raid_kobj(struct kobject *kobj) 290static void release_raid_kobj(struct kobject *kobj)
290{ 291{
291 kobject_put(kobj->parent); 292 kfree(to_raid_kobj(kobj));
292} 293}
293 294
294struct kobj_type btrfs_raid_ktype = { 295struct kobj_type btrfs_raid_ktype = {
@@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_root *root = fs_info->fs_root; 375 struct btrfs_root *root = fs_info->fs_root;
375 int ret; 376 int ret;
376 377
377 if (len >= BTRFS_LABEL_SIZE) { 378 if (len >= BTRFS_LABEL_SIZE)
378 pr_err("BTRFS: unable to set label with more than %d bytes\n",
379 BTRFS_LABEL_SIZE - 1);
380 return -EINVAL; 379 return -EINVAL;
381 }
382 380
383 trans = btrfs_start_transaction(root, 0); 381 trans = btrfs_start_transaction(root, 0);
384 if (IS_ERR(trans)) 382 if (IS_ERR(trans))
@@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
396} 394}
397BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); 395BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
398 396
397static ssize_t btrfs_no_store(struct kobject *kobj,
398 struct kobj_attribute *a,
399 const char *buf, size_t len)
400{
401 return -EPERM;
402}
403
404static ssize_t btrfs_nodesize_show(struct kobject *kobj,
405 struct kobj_attribute *a, char *buf)
406{
407 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
408
409 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
410}
411
412BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
413
414static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
415 struct kobj_attribute *a, char *buf)
416{
417 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
418
419 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
420}
421
422BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
423
424static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
425 struct kobj_attribute *a, char *buf)
426{
427 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
428
429 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
430}
431
432BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
433
399static struct attribute *btrfs_attrs[] = { 434static struct attribute *btrfs_attrs[] = {
400 BTRFS_ATTR_PTR(label), 435 BTRFS_ATTR_PTR(label),
436 BTRFS_ATTR_PTR(nodesize),
437 BTRFS_ATTR_PTR(sectorsize),
438 BTRFS_ATTR_PTR(clone_alignment),
401 NULL, 439 NULL,
402}; 440};
403 441
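
Two of the sysfs fixes above share one root cause: raid kobjects used to be treated as members of a per-space-info array, so raid_bytes_show() derived the raid index from pointer arithmetic and release_raid_kobj() dropped a reference on the parent instead of freeing anything. Once each raid kobject is a separately allocated struct raid_kobject, container_of() is the way back from the embedded kobject to its container. A userspace sketch of that recovery, with simplified stand-in types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobject { int refcount; };

struct raid_kobject {
	int raid_type;
	struct kobject kobj;	/* embedded, as in the kernel pattern */
};

int main(void)
{
	struct raid_kobject rk = { .raid_type = 5 };
	struct kobject *kobj = &rk.kobj;	/* what a sysfs callback receives */

	/* Recover the raid_kobject that embeds kobj. */
	struct raid_kobject *found =
		container_of(kobj, struct raid_kobject, kobj);

	printf("raid_type = %d\n", found->raid_type);	/* prints 5 */
	return 0;
}
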
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 757ef00a75a4..a5dcacb5df9c 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include "btrfs-tests.h" 22#include "btrfs-tests.h"
23#include "../ctree.h" 23#include "../ctree.h"
24#include "../volumes.h"
25#include "../disk-io.h"
26#include "../qgroup.h"
24 27
25static struct vfsmount *test_mnt = NULL; 28static struct vfsmount *test_mnt = NULL;
26 29
@@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void)
72 kern_unmount(test_mnt); 75 kern_unmount(test_mnt);
73 unregister_filesystem(&test_type); 76 unregister_filesystem(&test_type);
74} 77}
78
79struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
80{
81 struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
82 GFP_NOFS);
83
84 if (!fs_info)
85 return fs_info;
86 fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
87 GFP_NOFS);
88 if (!fs_info->fs_devices) {
89 kfree(fs_info);
90 return NULL;
91 }
92 fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
93 GFP_NOFS);
94 if (!fs_info->super_copy) {
95 kfree(fs_info->fs_devices);
96 kfree(fs_info);
97 return NULL;
98 }
99
100 if (init_srcu_struct(&fs_info->subvol_srcu)) {
101 kfree(fs_info->fs_devices);
102 kfree(fs_info->super_copy);
103 kfree(fs_info);
104 return NULL;
105 }
106
107 spin_lock_init(&fs_info->buffer_lock);
108 spin_lock_init(&fs_info->qgroup_lock);
109 spin_lock_init(&fs_info->qgroup_op_lock);
110 spin_lock_init(&fs_info->super_lock);
111 spin_lock_init(&fs_info->fs_roots_radix_lock);
112 spin_lock_init(&fs_info->tree_mod_seq_lock);
113 mutex_init(&fs_info->qgroup_ioctl_lock);
114 mutex_init(&fs_info->qgroup_rescan_lock);
115 rwlock_init(&fs_info->tree_mod_log_lock);
116 fs_info->running_transaction = NULL;
117 fs_info->qgroup_tree = RB_ROOT;
118 fs_info->qgroup_ulist = NULL;
119 atomic64_set(&fs_info->tree_mod_seq, 0);
120 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
121 INIT_LIST_HEAD(&fs_info->dead_roots);
122 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
123 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
124 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
125 return fs_info;
126}
127
128static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
129{
130 struct radix_tree_iter iter;
131 void **slot;
132
133 spin_lock(&fs_info->buffer_lock);
134restart:
135 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
136 struct extent_buffer *eb;
137
138 eb = radix_tree_deref_slot(slot);
139 if (!eb)
140 continue;
141	 /* Shouldn't happen, but that kind of thinking creates CVEs */
142 if (radix_tree_exception(eb)) {
143 if (radix_tree_deref_retry(eb))
144 goto restart;
145 continue;
146 }
147 spin_unlock(&fs_info->buffer_lock);
148 free_extent_buffer_stale(eb);
149 spin_lock(&fs_info->buffer_lock);
150 }
151 spin_unlock(&fs_info->buffer_lock);
152
153 btrfs_free_qgroup_config(fs_info);
154 btrfs_free_fs_roots(fs_info);
155 cleanup_srcu_struct(&fs_info->subvol_srcu);
156 kfree(fs_info->super_copy);
157 kfree(fs_info->fs_devices);
158 kfree(fs_info);
159}
160
161void btrfs_free_dummy_root(struct btrfs_root *root)
162{
163 if (!root)
164 return;
165 if (root->node)
166 free_extent_buffer(root->node);
167 if (root->fs_info)
168 btrfs_free_dummy_fs_info(root->fs_info);
169 kfree(root);
170}
171
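
btrfs_free_dummy_fs_info() above cannot call free_extent_buffer_stale() while holding buffer_lock, so it drops the lock around each free and restarts the radix-tree walk, since the tree may have changed in the window. The same drop-and-retake shape in a self-contained userspace form, with a plain linked list standing in for the radix tree:

#include <pthread.h>
#include <stdlib.h>

struct item { struct item *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

static void free_all(void)
{
	pthread_mutex_lock(&lock);
	while (head) {
		struct item *victim = head;

		head = victim->next;	/* unlink while the lock is held */
		pthread_mutex_unlock(&lock);
		free(victim);		/* the "free" may sleep; lock dropped */
		pthread_mutex_lock(&lock);	/* retake and re-read head */
	}
	pthread_mutex_unlock(&lock);
}
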
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 312560a9123d..fd3954224480 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,13 +23,18 @@
23 23
24#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) 24#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
25 25
26struct btrfs_root;
27
26int btrfs_test_free_space_cache(void); 28int btrfs_test_free_space_cache(void);
27int btrfs_test_extent_buffer_operations(void); 29int btrfs_test_extent_buffer_operations(void);
28int btrfs_test_extent_io(void); 30int btrfs_test_extent_io(void);
29int btrfs_test_inodes(void); 31int btrfs_test_inodes(void);
32int btrfs_test_qgroups(void);
30int btrfs_init_test_fs(void); 33int btrfs_init_test_fs(void);
31void btrfs_destroy_test_fs(void); 34void btrfs_destroy_test_fs(void);
32struct inode *btrfs_new_test_inode(void); 35struct inode *btrfs_new_test_inode(void);
36struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
37void btrfs_free_dummy_root(struct btrfs_root *root);
33#else 38#else
34static inline int btrfs_test_free_space_cache(void) 39static inline int btrfs_test_free_space_cache(void)
35{ 40{
@@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void)
54{ 59{
55 return 0; 60 return 0;
56} 61}
62static inline int btrfs_test_qgroups(void)
63{
64 return 0;
65}
57#endif 66#endif
58 67
59#endif 68#endif
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 397d1f99a8eb..3ae0f5b8bb80 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -23,33 +23,6 @@
23#include "../extent_io.h" 23#include "../extent_io.h"
24#include "../volumes.h" 24#include "../volumes.h"
25 25
26static struct btrfs_fs_info *alloc_dummy_fs_info(void)
27{
28 struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
29 GFP_NOFS);
30 if (!fs_info)
31 return fs_info;
32 fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
33 GFP_NOFS);
34 if (!fs_info->fs_devices) {
35 kfree(fs_info);
36 return NULL;
37 }
38 return fs_info;
39}
40static void free_dummy_root(struct btrfs_root *root)
41{
42 if (!root)
43 return;
44 if (root->fs_info) {
45 kfree(root->fs_info->fs_devices);
46 kfree(root->fs_info);
47 }
48 if (root->node)
49 free_extent_buffer(root->node);
50 kfree(root);
51}
52
53static void insert_extent(struct btrfs_root *root, u64 start, u64 len, 26static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
54 u64 ram_bytes, u64 offset, u64 disk_bytenr, 27 u64 ram_bytes, u64 offset, u64 disk_bytenr,
55 u64 disk_len, u32 type, u8 compression, int slot) 28 u64 disk_len, u32 type, u8 compression, int slot)
@@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void)
276 * We do this since btrfs_get_extent wants to assign em->bdev to 249 * We do this since btrfs_get_extent wants to assign em->bdev to
277 * root->fs_info->fs_devices->latest_bdev. 250 * root->fs_info->fs_devices->latest_bdev.
278 */ 251 */
279 root->fs_info = alloc_dummy_fs_info(); 252 root->fs_info = btrfs_alloc_dummy_fs_info();
280 if (!root->fs_info) { 253 if (!root->fs_info) {
281 test_msg("Couldn't allocate dummy fs info\n"); 254 test_msg("Couldn't allocate dummy fs info\n");
282 goto out; 255 goto out;
@@ -837,7 +810,7 @@ out:
837 if (!IS_ERR(em)) 810 if (!IS_ERR(em))
838 free_extent_map(em); 811 free_extent_map(em);
839 iput(inode); 812 iput(inode);
840 free_dummy_root(root); 813 btrfs_free_dummy_root(root);
841 return ret; 814 return ret;
842} 815}
843 816
@@ -864,7 +837,7 @@ static int test_hole_first(void)
864 goto out; 837 goto out;
865 } 838 }
866 839
867 root->fs_info = alloc_dummy_fs_info(); 840 root->fs_info = btrfs_alloc_dummy_fs_info();
868 if (!root->fs_info) { 841 if (!root->fs_info) {
869 test_msg("Couldn't allocate dummy fs info\n"); 842 test_msg("Couldn't allocate dummy fs info\n");
870 goto out; 843 goto out;
@@ -934,7 +907,7 @@ out:
934 if (!IS_ERR(em)) 907 if (!IS_ERR(em))
935 free_extent_map(em); 908 free_extent_map(em);
936 iput(inode); 909 iput(inode);
937 free_dummy_root(root); 910 btrfs_free_dummy_root(root);
938 return ret; 911 return ret;
939} 912}
940 913
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 000000000000..fa691b754aaf
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (C) 2013 Facebook. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "btrfs-tests.h"
20#include "../ctree.h"
21#include "../transaction.h"
22#include "../disk-io.h"
23#include "../qgroup.h"
24
25static void init_dummy_trans(struct btrfs_trans_handle *trans)
26{
27 memset(trans, 0, sizeof(*trans));
28 trans->transid = 1;
29 INIT_LIST_HEAD(&trans->qgroup_ref_list);
30 trans->type = __TRANS_DUMMY;
31}
32
33static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
34 u64 num_bytes, u64 parent, u64 root_objectid)
35{
36 struct btrfs_trans_handle trans;
37 struct btrfs_extent_item *item;
38 struct btrfs_extent_inline_ref *iref;
39 struct btrfs_tree_block_info *block_info;
40 struct btrfs_path *path;
41 struct extent_buffer *leaf;
42 struct btrfs_key ins;
43 u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
44 int ret;
45
46 init_dummy_trans(&trans);
47
48 ins.objectid = bytenr;
49 ins.type = BTRFS_EXTENT_ITEM_KEY;
50 ins.offset = num_bytes;
51
52 path = btrfs_alloc_path();
53 if (!path) {
54 test_msg("Couldn't allocate path\n");
55 return -ENOMEM;
56 }
57
58 path->leave_spinning = 1;
59 ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
60 if (ret) {
61 test_msg("Couldn't insert ref %d\n", ret);
62 btrfs_free_path(path);
63 return ret;
64 }
65
66 leaf = path->nodes[0];
67 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
68 btrfs_set_extent_refs(leaf, item, 1);
69 btrfs_set_extent_generation(leaf, item, 1);
70 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
71 block_info = (struct btrfs_tree_block_info *)(item + 1);
72 btrfs_set_tree_block_level(leaf, block_info, 1);
73 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
74 if (parent > 0) {
75 btrfs_set_extent_inline_ref_type(leaf, iref,
76 BTRFS_SHARED_BLOCK_REF_KEY);
77 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
78 } else {
79 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
80 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
81 }
82 btrfs_free_path(path);
83 return 0;
84}
85
86static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
87 u64 parent, u64 root_objectid)
88{
89 struct btrfs_trans_handle trans;
90 struct btrfs_extent_item *item;
91 struct btrfs_path *path;
92 struct btrfs_key key;
93 u64 refs;
94 int ret;
95
96 init_dummy_trans(&trans);
97
98 key.objectid = bytenr;
99 key.type = BTRFS_EXTENT_ITEM_KEY;
100 key.offset = num_bytes;
101
102 path = btrfs_alloc_path();
103 if (!path) {
104 test_msg("Couldn't allocate path\n");
105 return -ENOMEM;
106 }
107
108 path->leave_spinning = 1;
109 ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
110 if (ret) {
111 test_msg("Couldn't find extent ref\n");
112 btrfs_free_path(path);
113 return ret;
114 }
115
116 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
117 struct btrfs_extent_item);
118 refs = btrfs_extent_refs(path->nodes[0], item);
119 btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
120 btrfs_release_path(path);
121
122 key.objectid = bytenr;
123 if (parent) {
124 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
125 key.offset = parent;
126 } else {
127 key.type = BTRFS_TREE_BLOCK_REF_KEY;
128 key.offset = root_objectid;
129 }
130
131 ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
132 if (ret)
133 test_msg("Failed to insert backref\n");
134 btrfs_free_path(path);
135 return ret;
136}
137
138static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
139 u64 num_bytes)
140{
141 struct btrfs_trans_handle trans;
142 struct btrfs_key key;
143 struct btrfs_path *path;
144 int ret;
145
146 init_dummy_trans(&trans);
147
148 key.objectid = bytenr;
149 key.type = BTRFS_EXTENT_ITEM_KEY;
150 key.offset = num_bytes;
151
152 path = btrfs_alloc_path();
153 if (!path) {
154 test_msg("Couldn't allocate path\n");
155 return -ENOMEM;
156 }
157 path->leave_spinning = 1;
158
159 ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
160 if (ret) {
161 test_msg("Didn't find our key %d\n", ret);
162 btrfs_free_path(path);
163 return ret;
164 }
165 btrfs_del_item(&trans, root, path);
166 btrfs_free_path(path);
167 return 0;
168}
169
170static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
171 u64 num_bytes, u64 parent, u64 root_objectid)
172{
173 struct btrfs_trans_handle trans;
174 struct btrfs_extent_item *item;
175 struct btrfs_path *path;
176 struct btrfs_key key;
177 u64 refs;
178 int ret;
179
180 init_dummy_trans(&trans);
181
182 key.objectid = bytenr;
183 key.type = BTRFS_EXTENT_ITEM_KEY;
184 key.offset = num_bytes;
185
186 path = btrfs_alloc_path();
187 if (!path) {
188 test_msg("Couldn't allocate path\n");
189 return -ENOMEM;
190 }
191
192 path->leave_spinning = 1;
193 ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
194 if (ret) {
195 test_msg("Couldn't find extent ref\n");
196 btrfs_free_path(path);
197 return ret;
198 }
199
200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
201 struct btrfs_extent_item);
202 refs = btrfs_extent_refs(path->nodes[0], item);
203 btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
204 btrfs_release_path(path);
205
206 key.objectid = bytenr;
207 if (parent) {
208 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
209 key.offset = parent;
210 } else {
211 key.type = BTRFS_TREE_BLOCK_REF_KEY;
212 key.offset = root_objectid;
213 }
214
215 ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
216 if (ret) {
217 test_msg("Couldn't find backref %d\n", ret);
218 btrfs_free_path(path);
219 return ret;
220 }
221 btrfs_del_item(&trans, root, path);
222 btrfs_free_path(path);
223 return ret;
224}
225
226static int test_no_shared_qgroup(struct btrfs_root *root)
227{
228 struct btrfs_trans_handle trans;
229 struct btrfs_fs_info *fs_info = root->fs_info;
230 int ret;
231
232 init_dummy_trans(&trans);
233
234 test_msg("Qgroup basic add\n");
235 ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
236 if (ret) {
237 test_msg("Couldn't create a qgroup %d\n", ret);
238 return ret;
239 }
240
241 ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
242 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
243 if (ret) {
244 test_msg("Couldn't add space to a qgroup %d\n", ret);
245 return ret;
246 }
247
248 ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
249 if (ret)
250 return ret;
251
252 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
253 if (ret) {
254 test_msg("Delayed qgroup accounting failed %d\n", ret);
255 return ret;
256 }
257
258 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
259 test_msg("Qgroup counts didn't match expected values\n");
260 return -EINVAL;
261 }
262
263 ret = remove_extent_item(root, 4096, 4096);
264 if (ret)
265 return -EINVAL;
266
267 ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
268 BTRFS_QGROUP_OPER_SUB_EXCL, 0);
269 if (ret) {
270 test_msg("Couldn't remove space from the qgroup %d\n", ret);
271 return -EINVAL;
272 }
273
274 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
275 if (ret) {
276 test_msg("Qgroup accounting failed %d\n", ret);
277 return -EINVAL;
278 }
279
280 if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
281 test_msg("Qgroup counts didn't match expected values\n");
282 return -EINVAL;
283 }
284
285 return 0;
286}
287
288/*
289 * Add a ref for two different roots to make sure the shared value comes out
290	 * right; also remove one of the roots and make sure the exclusive count is
291 * adjusted properly.
292 */
293static int test_multiple_refs(struct btrfs_root *root)
294{
295 struct btrfs_trans_handle trans;
296 struct btrfs_fs_info *fs_info = root->fs_info;
297 int ret;
298
299 init_dummy_trans(&trans);
300
301 test_msg("Qgroup multiple refs test\n");
302
303 /* We have 5 created already from the previous test */
304 ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
305 if (ret) {
306 test_msg("Couldn't create a qgroup %d\n", ret);
307 return ret;
308 }
309
310 ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
311 if (ret)
312 return ret;
313
314 ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
315 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
316 if (ret) {
317 test_msg("Couldn't add space to a qgroup %d\n", ret);
318 return ret;
319 }
320
321 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
322 if (ret) {
323 test_msg("Delayed qgroup accounting failed %d\n", ret);
324 return ret;
325 }
326
327 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
328 test_msg("Qgroup counts didn't match expected values\n");
329 return -EINVAL;
330 }
331
332 ret = add_tree_ref(root, 4096, 4096, 0, 256);
333 if (ret)
334 return ret;
335
336 ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
337 BTRFS_QGROUP_OPER_ADD_SHARED, 0);
338 if (ret) {
339 test_msg("Qgroup record ref failed %d\n", ret);
340 return ret;
341 }
342
343 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
344 if (ret) {
345 test_msg("Qgroup accounting failed %d\n", ret);
346 return ret;
347 }
348
349 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
350 test_msg("Qgroup counts didn't match expected values\n");
351 return -EINVAL;
352 }
353
354 if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
355 test_msg("Qgroup counts didn't match expected values\n");
356 return -EINVAL;
357 }
358
359 ret = remove_extent_ref(root, 4096, 4096, 0, 256);
360 if (ret)
361 return ret;
362
363 ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
364 BTRFS_QGROUP_OPER_SUB_SHARED, 0);
365 if (ret) {
366 test_msg("Qgroup record ref failed %d\n", ret);
367 return ret;
368 }
369
370 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
371 if (ret) {
372 test_msg("Qgroup accounting failed %d\n", ret);
373 return ret;
374 }
375
376 if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
377 test_msg("Qgroup counts didn't match expected values\n");
378 return -EINVAL;
379 }
380
381 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
382 test_msg("Qgroup counts didn't match expected values\n");
383 return -EINVAL;
384 }
385
386 return 0;
387}
388
389int btrfs_test_qgroups(void)
390{
391 struct btrfs_root *root;
392 struct btrfs_root *tmp_root;
393 int ret = 0;
394
395 root = btrfs_alloc_dummy_root();
396 if (IS_ERR(root)) {
397 test_msg("Couldn't allocate root\n");
398 return PTR_ERR(root);
399 }
400
401 root->fs_info = btrfs_alloc_dummy_fs_info();
402 if (!root->fs_info) {
403 test_msg("Couldn't allocate dummy fs info\n");
404 ret = -ENOMEM;
405 goto out;
406 }
407
408 /*
409 * Can't use bytenr 0, some things freak out
410 * *cough*backref walking code*cough*
411 */
412 root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
413 if (!root->node) {
414 test_msg("Couldn't allocate dummy buffer\n");
415 ret = -ENOMEM;
416 goto out;
417 }
418 root->alloc_bytenr += 8192;
419
420 tmp_root = btrfs_alloc_dummy_root();
421 if (IS_ERR(tmp_root)) {
422 test_msg("Couldn't allocate a fs root\n");
423 ret = PTR_ERR(tmp_root);
424 goto out;
425 }
426
427 tmp_root->root_key.objectid = 5;
428 root->fs_info->fs_root = tmp_root;
429 ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
430 if (ret) {
431 test_msg("Couldn't insert fs root %d\n", ret);
432 goto out;
433 }
434
435 tmp_root = btrfs_alloc_dummy_root();
436 if (IS_ERR(tmp_root)) {
437 test_msg("Couldn't allocate a fs root\n");
438 ret = PTR_ERR(tmp_root);
439 goto out;
440 }
441
442 tmp_root->root_key.objectid = 256;
443 ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
444 if (ret) {
445 test_msg("Couldn't insert fs root %d\n", ret);
446 goto out;
447 }
448
449 /* We are using this root as our extent root */
450 root->fs_info->extent_root = root;
451
452 /*
453 * Some of the paths we test assume we have a filled out fs_info, so we
454	 * just need to add the root in there so we don't panic.
455 */
456 root->fs_info->tree_root = root;
457 root->fs_info->quota_root = root;
458 root->fs_info->quota_enabled = 1;
459
460 test_msg("Running qgroup tests\n");
461 ret = test_no_shared_qgroup(root);
462 if (ret)
463 goto out;
464 ret = test_multiple_refs(root);
465out:
466 btrfs_free_dummy_root(root);
467 return ret;
468}
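
The expected numbers in test_multiple_refs() follow from the two counters a qgroup tracks: "referenced" counts an extent for every group that can reach it, while "exclusive" counts it only for a group that is the sole referrer. For the 4096-byte extent used here, one ref from root 5 gives 4096/4096; adding a ref from root 256 makes both groups 4096 referenced and 0 exclusive; dropping 256's ref returns 5 to 4096/4096 and 256 to 0/0. A toy model (not kernel code) of just that arithmetic:

#include <stdio.h>

#define EXTENT_SIZE 4096

static void account(int refs_qg5, int refs_qg256)
{
	int total = refs_qg5 + refs_qg256;
	/* referenced: counted whenever the group holds at least one ref */
	int rfer5 = refs_qg5 ? EXTENT_SIZE : 0;
	int rfer256 = refs_qg256 ? EXTENT_SIZE : 0;
	/* exclusive: counted only when no other group shares the extent */
	int excl5 = (refs_qg5 && total == refs_qg5) ? EXTENT_SIZE : 0;
	int excl256 = (refs_qg256 && total == refs_qg256) ? EXTENT_SIZE : 0;

	printf("qg5 %d/%d  qg256 %d/%d\n", rfer5, excl5, rfer256, excl256);
}

int main(void)
{
	account(1, 0);	/* qg5 4096/4096          - first test check */
	account(1, 1);	/* both 4096/0            - shared extent    */
	account(1, 0);	/* qg5 back to 4096/4096  - after the unref  */
	return 0;
}
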
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7579f6d0b854..9630f10f8e1e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,6 +31,7 @@
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h" 33#include "dev-replace.h"
34#include "qgroup.h"
34 35
35#define BTRFS_ROOT_TRANS_TAG 0 36#define BTRFS_ROOT_TRANS_TAG 0
36 37
@@ -241,18 +242,19 @@ loop:
241static int record_root_in_trans(struct btrfs_trans_handle *trans, 242static int record_root_in_trans(struct btrfs_trans_handle *trans,
242 struct btrfs_root *root) 243 struct btrfs_root *root)
243{ 244{
244 if (root->ref_cows && root->last_trans < trans->transid) { 245 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
246 root->last_trans < trans->transid) {
245 WARN_ON(root == root->fs_info->extent_root); 247 WARN_ON(root == root->fs_info->extent_root);
246 WARN_ON(root->commit_root != root->node); 248 WARN_ON(root->commit_root != root->node);
247 249
248 /* 250 /*
249 * see below for in_trans_setup usage rules 251 * see below for IN_TRANS_SETUP usage rules
250 * we have the reloc mutex held now, so there 252 * we have the reloc mutex held now, so there
251 * is only one writer in this function 253 * is only one writer in this function
252 */ 254 */
253 root->in_trans_setup = 1; 255 set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
254 256
255 /* make sure readers find in_trans_setup before 257 /* make sure readers find IN_TRANS_SETUP before
256 * they find our root->last_trans update 258 * they find our root->last_trans update
257 */ 259 */
258 smp_wmb(); 260 smp_wmb();
@@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
279 * But, we have to set root->last_trans before we 281 * But, we have to set root->last_trans before we
280 * init the relocation root, otherwise, we trip over warnings 282 * init the relocation root, otherwise, we trip over warnings
281 * in ctree.c. The solution used here is to flag ourselves 283 * in ctree.c. The solution used here is to flag ourselves
282 * with root->in_trans_setup. When this is 1, we're still 284 * with root IN_TRANS_SETUP. When this is 1, we're still
283 * fixing up the reloc trees and everyone must wait. 285 * fixing up the reloc trees and everyone must wait.
284 * 286 *
285 * When this is zero, they can trust root->last_trans and fly 287 * When this is zero, they can trust root->last_trans and fly
@@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
288 * done before we pop in the zero below 290 * done before we pop in the zero below
289 */ 291 */
290 btrfs_init_reloc_root(trans, root); 292 btrfs_init_reloc_root(trans, root);
291 smp_wmb(); 293 smp_mb__before_atomic();
292 root->in_trans_setup = 0; 294 clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
293 } 295 }
294 return 0; 296 return 0;
295} 297}
@@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
298int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 300int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
299 struct btrfs_root *root) 301 struct btrfs_root *root)
300{ 302{
301 if (!root->ref_cows) 303 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
302 return 0; 304 return 0;
303 305
304 /* 306 /*
305 * see record_root_in_trans for comments about in_trans_setup usage 307 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
306 * and barriers 308 * and barriers
307 */ 309 */
308 smp_rmb(); 310 smp_rmb();
309 if (root->last_trans == trans->transid && 311 if (root->last_trans == trans->transid &&
310 !root->in_trans_setup) 312 !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
311 return 0; 313 return 0;
312 314
313 mutex_lock(&root->fs_info->reloc_mutex); 315 mutex_lock(&root->fs_info->reloc_mutex);
@@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
365static inline bool need_reserve_reloc_root(struct btrfs_root *root) 367static inline bool need_reserve_reloc_root(struct btrfs_root *root)
366{ 368{
367 if (!root->fs_info->reloc_ctl || 369 if (!root->fs_info->reloc_ctl ||
368 !root->ref_cows || 370 !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
369 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 371 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
370 root->reloc_root) 372 root->reloc_root)
371 return false; 373 return false;
@@ -695,6 +697,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
695 unsigned long cur = trans->delayed_ref_updates; 697 unsigned long cur = trans->delayed_ref_updates;
696 int lock = (trans->type != TRANS_JOIN_NOLOCK); 698 int lock = (trans->type != TRANS_JOIN_NOLOCK);
697 int err = 0; 699 int err = 0;
700 int must_run_delayed_refs = 0;
698 701
699 if (trans->use_count > 1) { 702 if (trans->use_count > 1) {
700 trans->use_count--; 703 trans->use_count--;
@@ -702,14 +705,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
702 return 0; 705 return 0;
703 } 706 }
704 707
705 /*
706 * do the qgroup accounting as early as possible
707 */
708 err = btrfs_delayed_refs_qgroup_accounting(trans, info);
709
710 btrfs_trans_release_metadata(trans, root); 708 btrfs_trans_release_metadata(trans, root);
711 trans->block_rsv = NULL; 709 trans->block_rsv = NULL;
712 710
711 if (!list_empty(&trans->new_bgs))
712 btrfs_create_pending_block_groups(trans, root);
713
714 trans->delayed_ref_updates = 0;
715 if (!trans->sync) {
716 must_run_delayed_refs =
717 btrfs_should_throttle_delayed_refs(trans, root);
718 cur = max_t(unsigned long, cur, 32);
719
720 /*
721 * don't make the caller wait if they are from a NOLOCK
722 * or ATTACH transaction, it will deadlock with commit
723 */
724 if (must_run_delayed_refs == 1 &&
725 (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
726 must_run_delayed_refs = 2;
727 }
728
713 if (trans->qgroup_reserved) { 729 if (trans->qgroup_reserved) {
714 /* 730 /*
715 * the same root has to be passed here between start_transaction 731 * the same root has to be passed here between start_transaction
@@ -719,16 +735,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
719 trans->qgroup_reserved = 0; 735 trans->qgroup_reserved = 0;
720 } 736 }
721 737
722 if (!list_empty(&trans->new_bgs))
723 btrfs_create_pending_block_groups(trans, root);
724
725 trans->delayed_ref_updates = 0;
726 if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
727 cur = max_t(unsigned long, cur, 32);
728 trans->delayed_ref_updates = 0;
729 btrfs_run_delayed_refs(trans, root, cur);
730 }
731
732 btrfs_trans_release_metadata(trans, root); 738 btrfs_trans_release_metadata(trans, root);
733 trans->block_rsv = NULL; 739 trans->block_rsv = NULL;
734 740
@@ -778,6 +784,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
778 assert_qgroups_uptodate(trans); 784 assert_qgroups_uptodate(trans);
779 785
780 kmem_cache_free(btrfs_trans_handle_cachep, trans); 786 kmem_cache_free(btrfs_trans_handle_cachep, trans);
787 if (must_run_delayed_refs) {
788 btrfs_async_run_delayed_refs(root, cur,
789 must_run_delayed_refs == 1);
790 }
781 return err; 791 return err;
782} 792}
783 793
@@ -1049,8 +1059,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
1049 btrfs_save_ino_cache(root, trans); 1059 btrfs_save_ino_cache(root, trans);
1050 1060
1051 /* see comments in should_cow_block() */ 1061 /* see comments in should_cow_block() */
1052 root->force_cow = 0; 1062 clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
1053 smp_wmb(); 1063 smp_mb__after_atomic();
1054 1064
1055 if (root->commit_root != root->node) { 1065 if (root->commit_root != root->node) {
1056 list_add_tail(&root->dirty_list, 1066 list_add_tail(&root->dirty_list,
@@ -1081,7 +1091,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
1081 struct btrfs_trans_handle *trans; 1091 struct btrfs_trans_handle *trans;
1082 int ret; 1092 int ret;
1083 1093
1084 if (xchg(&root->defrag_running, 1)) 1094 if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
1085 return 0; 1095 return 0;
1086 1096
1087 while (1) { 1097 while (1) {
@@ -1104,7 +1114,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
1104 break; 1114 break;
1105 } 1115 }
1106 } 1116 }
1107 root->defrag_running = 0; 1117 clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
1108 return ret; 1118 return ret;
1109} 1119}
1110 1120
@@ -1168,12 +1178,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1168 goto no_free_objectid; 1178 goto no_free_objectid;
1169 } 1179 }
1170 1180
1171 pending->error = btrfs_qgroup_inherit(trans, fs_info,
1172 root->root_key.objectid,
1173 objectid, pending->inherit);
1174 if (pending->error)
1175 goto no_free_objectid;
1176
1177 key.objectid = objectid; 1181 key.objectid = objectid;
1178 key.offset = (u64)-1; 1182 key.offset = (u64)-1;
1179 key.type = BTRFS_ROOT_ITEM_KEY; 1183 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1270,8 +1274,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1270 goto fail; 1274 goto fail;
1271 } 1275 }
1272 1276
1277 /*
1278 * We need to flush delayed refs in order to make sure all of our quota
1279 * operations have been done before we call btrfs_qgroup_inherit.
1280 */
1281 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1282 if (ret) {
1283 btrfs_abort_transaction(trans, root, ret);
1284 goto fail;
1285 }
1286
1287 pending->error = btrfs_qgroup_inherit(trans, fs_info,
1288 root->root_key.objectid,
1289 objectid, pending->inherit);
1290 if (pending->error)
1291 goto no_free_objectid;
1292
1273 /* see comments in should_cow_block() */ 1293 /* see comments in should_cow_block() */
1274 root->force_cow = 1; 1294 set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
1275 smp_wmb(); 1295 smp_wmb();
1276 1296
1277 btrfs_set_root_node(new_root_item, tmp); 1297 btrfs_set_root_node(new_root_item, tmp);
@@ -1598,12 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1598 * them now so that they hinder processing of more delayed refs 1618 * them now so that they hinder processing of more delayed refs
1599 * as little as possible. 1619 * as little as possible.
1600 */ 1620 */
1601 if (ret) {
1602 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1603 return ret;
1604 }
1605
1606 ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1607 if (ret) 1621 if (ret)
1608 return ret; 1622 return ret;
1609 1623
@@ -1984,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1984 } 1998 }
1985 root = list_first_entry(&fs_info->dead_roots, 1999 root = list_first_entry(&fs_info->dead_roots,
1986 struct btrfs_root, root_list); 2000 struct btrfs_root, root_list);
1987 /*
1988 * Make sure root is not involved in send,
1989 * if we fail with first root, we return
1990 * directly rather than continue.
1991 */
1992 spin_lock(&root->root_item_lock);
1993 if (root->send_in_progress) {
1994 spin_unlock(&fs_info->trans_lock);
1995 spin_unlock(&root->root_item_lock);
1996 return 0;
1997 }
1998 spin_unlock(&root->root_item_lock);
1999
2000 list_del_init(&root->root_list); 2001 list_del_init(&root->root_list);
2001 spin_unlock(&fs_info->trans_lock); 2002 spin_unlock(&fs_info->trans_lock);
2002 2003
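
The transaction.c conversions above replace ad-hoc integer flags (in_trans_setup, force_cow, defrag_running) with test_bit/set_bit on root->state, and the barriers move with them: a clear_bit() that publishes completed work must be preceded by smp_mb__before_atomic() so the setup stores are visible before the bit is seen clear. The equivalent release/acquire shape in portable C11, assuming a single hypothetical "setup" bit and one published value:

#include <stdatomic.h>
#include <stdbool.h>

struct root {
	atomic_ulong state;		/* bit 0 plays the IN_TRANS_SETUP role */
	atomic_ulong last_trans;	/* the value being published */
};

static void record_root(struct root *r, unsigned long transid)
{
	atomic_fetch_or_explicit(&r->state, 1UL, memory_order_relaxed);
	atomic_store_explicit(&r->last_trans, transid, memory_order_relaxed);
	/* ... reloc-root setup would happen here ... */
	/* release: all setup stores visible before the bit reads as clear */
	atomic_fetch_and_explicit(&r->state, ~1UL, memory_order_release);
}

static bool root_ready(struct root *r, unsigned long transid)
{
	/* acquire pairs with the writer's release above */
	if (atomic_load_explicit(&r->state, memory_order_acquire) & 1UL)
		return false;	/* still in setup; caller takes the slow path */
	return atomic_load_explicit(&r->last_trans,
				    memory_order_relaxed) == transid;
}
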
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b57b924e8e03..7dd558ed0716 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -69,6 +69,7 @@ struct btrfs_transaction {
69#define __TRANS_ATTACH (1U << 10) 69#define __TRANS_ATTACH (1U << 10)
70#define __TRANS_JOIN (1U << 11) 70#define __TRANS_JOIN (1U << 11)
71#define __TRANS_JOIN_NOLOCK (1U << 12) 71#define __TRANS_JOIN_NOLOCK (1U << 12)
72#define __TRANS_DUMMY (1U << 13)
72 73
73#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) 74#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
74#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) 75#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 76928ca97741..a63719cc9578 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
49 goto out; 49 goto out;
50 } 50 }
51 51
52 if (root->ref_cows == 0) 52 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
53 goto out; 53 goto out;
54 54
55 if (btrfs_test_opt(root, SSD)) 55 if (btrfs_test_opt(root, SSD))
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e2f45fc02610..9e1f2cd5e67a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,13 +20,11 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/list_sort.h> 22#include <linux/list_sort.h>
23#include "ctree.h" 23#include "tree-log.h"
24#include "transaction.h"
25#include "disk-io.h" 24#include "disk-io.h"
26#include "locking.h" 25#include "locking.h"
27#include "print-tree.h" 26#include "print-tree.h"
28#include "backref.h" 27#include "backref.h"
29#include "tree-log.h"
30#include "hash.h" 28#include "hash.h"
31 29
32/* magic values for the inode_only field in btrfs_log_inode: 30/* magic values for the inode_only field in btrfs_log_inode:
@@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
144 142
145 mutex_lock(&root->log_mutex); 143 mutex_lock(&root->log_mutex);
146 if (root->log_root) { 144 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == 145 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
148 trans->transid) {
149 ret = -EAGAIN; 146 ret = -EAGAIN;
150 goto out; 147 goto out;
151 } 148 }
152
153 if (!root->log_start_pid) { 149 if (!root->log_start_pid) {
154 root->log_start_pid = current->pid; 150 root->log_start_pid = current->pid;
155 root->log_multiple_pids = false; 151 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
156 } else if (root->log_start_pid != current->pid) { 152 } else if (root->log_start_pid != current->pid) {
157 root->log_multiple_pids = true; 153 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
158 } 154 }
159 155
160 atomic_inc(&root->log_batch); 156 atomic_inc(&root->log_batch);
@@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
181 if (ret) 177 if (ret)
182 goto out; 178 goto out;
183 } 179 }
184 root->log_multiple_pids = false; 180 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
185 root->log_start_pid = current->pid; 181 root->log_start_pid = current->pid;
186 atomic_inc(&root->log_batch); 182 atomic_inc(&root->log_batch);
187 atomic_inc(&root->log_writers); 183 atomic_inc(&root->log_writers);
@@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2500 while (1) { 2496 while (1) {
2501 int batch = atomic_read(&root->log_batch); 2497 int batch = atomic_read(&root->log_batch);
2502 /* when we're on an ssd, just kick the log commit out */ 2498 /* when we're on an ssd, just kick the log commit out */
2503 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2499 if (!btrfs_test_opt(root, SSD) &&
2500 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2504 mutex_unlock(&root->log_mutex); 2501 mutex_unlock(&root->log_mutex);
2505 schedule_timeout_uninterruptible(1); 2502 schedule_timeout_uninterruptible(1);
2506 mutex_lock(&root->log_mutex); 2503 mutex_lock(&root->log_mutex);
@@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2511 } 2508 }
2512 2509
2513 /* bail out if we need to do a full commit */ 2510 /* bail out if we need to do a full commit */
2514 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == 2511 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2515 trans->transid) {
2516 ret = -EAGAIN; 2512 ret = -EAGAIN;
2517 btrfs_free_logged_extents(log, log_transid); 2513 btrfs_free_logged_extents(log, log_transid);
2518 mutex_unlock(&root->log_mutex); 2514 mutex_unlock(&root->log_mutex);
@@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2533 blk_finish_plug(&plug); 2529 blk_finish_plug(&plug);
2534 btrfs_abort_transaction(trans, root, ret); 2530 btrfs_abort_transaction(trans, root, ret);
2535 btrfs_free_logged_extents(log, log_transid); 2531 btrfs_free_logged_extents(log, log_transid);
2536 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2532 btrfs_set_log_full_commit(root->fs_info, trans);
2537 trans->transid;
2538 mutex_unlock(&root->log_mutex); 2533 mutex_unlock(&root->log_mutex);
2539 goto out; 2534 goto out;
2540 } 2535 }
@@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2577 list_del_init(&root_log_ctx.list); 2572 list_del_init(&root_log_ctx.list);
2578 2573
2579 blk_finish_plug(&plug); 2574 blk_finish_plug(&plug);
2580 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2575 btrfs_set_log_full_commit(root->fs_info, trans);
2581 trans->transid; 2576
2582 if (ret != -ENOSPC) { 2577 if (ret != -ENOSPC) {
2583 btrfs_abort_transaction(trans, root, ret); 2578 btrfs_abort_transaction(trans, root, ret);
2584 mutex_unlock(&log_root_tree->log_mutex); 2579 mutex_unlock(&log_root_tree->log_mutex);
@@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2622 * now that we've moved on to the tree of log tree roots, 2617 * now that we've moved on to the tree of log tree roots,
2623 * check the full commit flag again 2618 * check the full commit flag again
2624 */ 2619 */
2625 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == 2620 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2626 trans->transid) {
2627 blk_finish_plug(&plug); 2621 blk_finish_plug(&plug);
2628 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2622 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2629 btrfs_free_logged_extents(log, log_transid); 2623 btrfs_free_logged_extents(log, log_transid);
@@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2637 EXTENT_DIRTY | EXTENT_NEW); 2631 EXTENT_DIRTY | EXTENT_NEW);
2638 blk_finish_plug(&plug); 2632 blk_finish_plug(&plug);
2639 if (ret) { 2633 if (ret) {
2640 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2634 btrfs_set_log_full_commit(root->fs_info, trans);
2641 trans->transid;
2642 btrfs_abort_transaction(trans, root, ret); 2635 btrfs_abort_transaction(trans, root, ret);
2643 btrfs_free_logged_extents(log, log_transid); 2636 btrfs_free_logged_extents(log, log_transid);
2644 mutex_unlock(&log_root_tree->log_mutex); 2637 mutex_unlock(&log_root_tree->log_mutex);
@@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	if (ret) {
-		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
-			trans->transid;
+		btrfs_set_log_full_commit(root->fs_info, trans);
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_wake_log_root;
 	}
@@ -2886,7 +2878,7 @@ fail:
 out_unlock:
 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
 	if (ret == -ENOSPC) {
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		btrfs_set_log_full_commit(root->fs_info, trans);
 		ret = 0;
 	} else if (ret < 0)
 		btrfs_abort_transaction(trans, root, ret);
@@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 				  dirid, &index);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 	if (ret == -ENOSPC) {
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		btrfs_set_log_full_commit(root->fs_info, trans);
 		ret = 0;
 	} else if (ret < 0 && ret != -ENOENT)
 		btrfs_abort_transaction(trans, root, ret);
@@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 			 * make sure any commits to the log are forced
 			 * to be full commits
 			 */
-			root->fs_info->last_trans_log_full_commit =
-				trans->transid;
+			btrfs_set_log_full_commit(root->fs_info, trans);
 			ret = 1;
 			break;
 		}
@@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
+	/*
+	 * The previous transaction commit hasn't finished, so we have to
+	 * do the full commit ourselves.
+	 */
 	if (root->fs_info->last_trans_log_full_commit >
 	    root->fs_info->last_trans_committed) {
 		ret = 1;
@@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		btrfs_set_log_full_commit(root->fs_info, trans);
 		ret = 1;
 	}
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 91b145fce333..7f5b41bd5373 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
 #ifndef __TREE_LOG_
 #define __TREE_LOG_
 
+#include "ctree.h"
+#include "transaction.h"
+
 /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
 #define BTRFS_NO_LOG_SYNC 256
 
@@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 	INIT_LIST_HEAD(&ctx->list);
 }
 
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+					     struct btrfs_trans_handle *trans)
+{
+	ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+					     struct btrfs_trans_handle *trans)
+{
+	return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
+		trans->transid;
+}
+
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
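
The two inline helpers above wrap the open-coded ACCESS_ONCE reads and writes that the tree-log.c hunks delete. A minimal stand-alone model of the pair, compilable in user space (the struct names are stand-ins for the kernel types, and ACCESS_ONCE is reduced to a volatile cast):

#include <stdio.h>

struct fs_info { unsigned long long last_trans_log_full_commit; };
struct trans_handle { unsigned long long transid; };

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

/* Writer: mark this transaction as needing a full commit. */
static void set_log_full_commit(struct fs_info *fs, struct trans_handle *t)
{
	ACCESS_ONCE(fs->last_trans_log_full_commit) = t->transid;
}

/* Reader: should the log sync bail out to a full commit? */
static int need_log_full_commit(struct fs_info *fs, struct trans_handle *t)
{
	return ACCESS_ONCE(fs->last_trans_log_full_commit) == t->transid;
}

int main(void)
{
	struct fs_info fs = { 0 };
	struct trans_handle t = { 42 };

	printf("%d\n", need_log_full_commit(&fs, &t));	/* 0 */
	set_log_full_commit(&fs, &t);
	printf("%d\n", need_log_full_commit(&fs, &t));	/* 1 */
	return 0;
}

Centralizing the pair means every set and test of last_trans_log_full_commit now goes through a single ACCESS_ONCE, instead of each call site remembering to add it.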
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 49d7fab73360..ffeed6d6326f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1452,6 +1452,22 @@ out:
 	return ret;
 }
 
+/*
+ * Update ctime/mtime for a given device path; mainly used by
+ * ctime/mtime-based probes such as libblkid.
+ */
+static void update_dev_time(char *path_name)
+{
+	struct file *filp;
+
+	filp = filp_open(path_name, O_RDWR, 0);
+	if (IS_ERR(filp))
+		return;
+	file_update_time(filp);
+	filp_close(filp, NULL);
+	return;
+}
+
 static int btrfs_rm_dev_item(struct btrfs_root *root,
 			     struct btrfs_device *device)
 {
@@ -1674,11 +1690,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		struct btrfs_fs_devices *fs_devices;
 		fs_devices = root->fs_info->fs_devices;
 		while (fs_devices) {
-			if (fs_devices->seed == cur_devices)
+			if (fs_devices->seed == cur_devices) {
+				fs_devices->seed = cur_devices->seed;
 				break;
+			}
 			fs_devices = fs_devices->seed;
 		}
-		fs_devices->seed = cur_devices->seed;
 		cur_devices->seed = NULL;
 		lock_chunks(root);
 		__btrfs_close_devices(cur_devices);
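
The rewritten loop moves the unlink of cur_devices into the match arm. A stand-alone sketch of why: with the assignment after the loop, a walk that falls off the end of the seed chain leaves the cursor NULL, and the old code then dereferenced it.

#include <stddef.h>

struct devs { struct devs *seed; };

/* Splice "victim" out of the chain hanging off "head"; mirrors the
 * fixed kernel loop, which never touches the cursor after the walk.
 */
static void unlink_seed(struct devs *head, struct devs *victim)
{
	struct devs *cur = head;

	while (cur) {
		if (cur->seed == victim) {
			cur->seed = victim->seed;
			break;
		}
		cur = cur->seed;
	}
	/* cur may be NULL here if victim was not found; that is now safe */
}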
@@ -1694,20 +1711,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	 * remove it from the devices list and zero out the old super
 	 */
 	if (clear_super && disk_super) {
+		u64 bytenr;
+		int i;
+
 		/* make sure this device isn't detected as part of
 		 * the FS anymore
 		 */
 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
+
+		/* clear the mirror copies of the super block on the
+		 * disk being removed; the 0th copy was taken care of
+		 * above and the loop below handles the rest
+		 */
+		for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+			bytenr = btrfs_sb_offset(i);
+			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+					i_size_read(bdev->bd_inode))
+				break;
+
+			brelse(bh);
+			bh = __bread(bdev, bytenr / 4096,
+					BTRFS_SUPER_INFO_SIZE);
+			if (!bh)
+				continue;
+
+			disk_super = (struct btrfs_super_block *)bh->b_data;
+
+			if (btrfs_super_bytenr(disk_super) != bytenr ||
+				btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+				continue;
+			}
+			memset(&disk_super->magic, 0,
+				sizeof(disk_super->magic));
+			set_buffer_dirty(bh);
+			sync_dirty_buffer(bh);
+		}
 	}
 
 	ret = 0;
 
-	/* Notify udev that device has changed */
-	if (bdev)
-		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+	if (bdev) {
+		/* Notify udev that device has changed */
+		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
 
+		/* Update ctime/mtime for device path for libblkid */
+		update_dev_time(device_path);
+	}
+
 error_brelse:
 	brelse(bh);
 	if (bdev)
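
The mirror-clearing loop starts at copy 1 and breaks once an offset would fall past the end of the device. A stand-alone sketch of the offsets it visits, assuming the usual btrfs constants (primary super at 64 KiB, a mirror shift of 12, three copies at most):

#include <stdio.h>

#define SUPER_INFO_OFFSET	(64 * 1024ULL)
#define SUPER_MIRROR_SHIFT	12
#define SUPER_MIRROR_MAX	3

/* Mirrors the shape of the kernel's btrfs_sb_offset(). */
static unsigned long long sb_offset(int mirror)
{
	unsigned long long start = 16 * 1024ULL;

	if (mirror)
		return start << (SUPER_MIRROR_SHIFT * mirror);
	return SUPER_INFO_OFFSET;
}

int main(void)
{
	for (int i = 0; i < SUPER_MIRROR_MAX; i++)
		printf("copy %d at %llu\n", i, sb_offset(i));
	/* 65536 (64 KiB), 67108864 (64 MiB), 274877906944 (256 GiB) */
	return 0;
}

Small devices therefore lose only the copies that physically fit, which is exactly what the i_size_read() check in the loop enforces.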
@@ -1883,7 +1935,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
 	fs_devices->seeding = 0;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
-	fs_devices->total_devices = 0;
 	fs_devices->seed = seed_devices;
 
 	generate_random_uuid(fs_devices->fsid);
@@ -2146,6 +2197,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		ret = btrfs_commit_transaction(trans, root);
 	}
 
+	/* Update ctime/mtime for libblkid */
+	update_dev_time(device_path);
 	return ret;
 
 error_trans:
@@ -2922,6 +2975,16 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/*
+	 * limited by count, must be the last filter
+	 */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+		if (bargs->limit == 0)
+			return 0;
+		else
+			bargs->limit--;
+	}
+
 	return 1;
 }
 
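The limit filter is deliberately stateful: every chunk that survives the earlier filters decrements bargs->limit, so it must run after all of them or it would burn budget on chunks that another filter rejects anyway. A stand-alone model of that behavior (the names are stand-ins for the kernel structures):

#include <stdio.h>

#define BALANCE_ARGS_LIMIT (1ULL << 5)

struct balance_args { unsigned long long flags, limit; };

/* Returns 1 if the chunk should be relocated; consumes budget. */
static int should_balance(struct balance_args *b)
{
	if (b->flags & BALANCE_ARGS_LIMIT) {
		if (b->limit == 0)
			return 0;
		b->limit--;
	}
	return 1;
}

int main(void)
{
	struct balance_args b = { BALANCE_ARGS_LIMIT, 2 };

	for (int i = 0; i < 4; i++)
		printf("chunk %d: %d\n", i, should_balance(&b));
	/* only the first two chunks are accepted */
	return 0;
}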
@@ -2944,6 +3007,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	int ret;
 	int enospc_errors = 0;
 	bool counting = true;
+	u64 limit_data = bctl->data.limit;
+	u64 limit_meta = bctl->meta.limit;
+	u64 limit_sys = bctl->sys.limit;
 
 	/* step one make some room on all the devices */
 	devices = &fs_info->fs_devices->devices;
@@ -2982,6 +3048,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	memset(&bctl->stat, 0, sizeof(bctl->stat));
 	spin_unlock(&fs_info->balance_lock);
 again:
+	if (!counting) {
+		bctl->data.limit = limit_data;
+		bctl->meta.limit = limit_meta;
+		bctl->sys.limit = limit_sys;
+	}
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
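
__btrfs_balance() walks the chunk tree twice: a counting pass that fills bctl->stat, then the relocation pass that restarts at the again: label. Because should_balance_chunk() already drained the limit counters during the counting pass, they are refilled from the copies saved above before the second pass. A compressed, compilable sketch of that two-pass shape (walk_all_chunks() stands in for the real chunk walk):

#include <stdio.h>

static unsigned long long limit;

static void walk_all_chunks(void)
{
	while (limit) {
		limit--;	/* each accepted chunk consumes budget */
		printf("%llu left\n", limit);
	}
}

int main(void)
{
	const unsigned long long saved = 2;	/* bctl->data.limit etc. */
	int counting = 1;

	limit = saved;
again:
	if (!counting)
		limit = saved;	/* refill before the relocation pass */
	walk_all_chunks();
	if (counting) {
		counting = 0;
		goto again;
	}
	return 0;
}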
@@ -3881,7 +3952,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
 	u8 *ptr;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
-	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+	if (array_size + item_size + sizeof(disk_key)
+			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
 		return -EFBIG;
 
 	ptr = super_copy->sys_chunk_array + array_size;
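
Each entry in sys_chunk_array is a btrfs_disk_key immediately followed by the chunk item, but the old bound only counted the item, so an entry that barely fit could still overrun the array by up to the key size. A stand-alone version of the corrected check, with the usual sizes assumed (2048-byte array, 17-byte packed disk key):

#include <stdio.h>

#define SYSTEM_CHUNK_ARRAY_SIZE	2048
#define DISK_KEY_SIZE		17	/* objectid 8 + type 1 + offset 8 */

static int room_for_entry(unsigned int array_size, unsigned int item_size)
{
	return array_size + item_size + DISK_KEY_SIZE
		<= SYSTEM_CHUNK_ARRAY_SIZE;
}

int main(void)
{
	/* the old item-only check would have accepted this entry */
	printf("%d\n", room_for_entry(SYSTEM_CHUNK_ARRAY_SIZE - 100, 100));
	printf("%d\n", room_for_entry(SYSTEM_CHUNK_ARRAY_SIZE - 200, 100));
	return 0;
}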
@@ -3986,6 +4058,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 		btrfs_set_fs_incompat(info, RAID56);
 }
 
+#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)		\
+			- sizeof(struct btrfs_item)		\
+			- sizeof(struct btrfs_chunk))		\
+			/ sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
+			- 2 * sizeof(struct btrfs_disk_key)		\
+			- 2 * sizeof(struct btrfs_chunk))		\
+			/ sizeof(struct btrfs_stripe) + 1)
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *extent_root, u64 start,
 			       u64 type)
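
Both macros turn "space available for one chunk item" into a stripe count; the trailing + 1 exists because struct btrfs_chunk already embeds its first btrfs_stripe. A stand-alone evaluation for the system-chunk case, using illustrative constants rather than values pulled from the headers (a 2048-byte array, 17-byte disk key, 80-byte chunk item containing one 32-byte stripe):

#include <stdio.h>

#define SYS_CHUNK_ARRAY_SIZE	2048
#define DISK_KEY_SIZE		17
#define CHUNK_ITEM_SIZE		80	/* 48-byte header + one 32-byte stripe */
#define STRIPE_SIZE		32

int main(void)
{
	/* room is reserved for two key + chunk pairs; the remainder
	 * holds additional stripes
	 */
	unsigned int max = (SYS_CHUNK_ARRAY_SIZE - 2 * DISK_KEY_SIZE
			    - 2 * CHUNK_ITEM_SIZE) / STRIPE_SIZE + 1;

	printf("max devices for a system chunk: %u\n", max);	/* 58 */
	return 0;
}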
@@ -4035,6 +4117,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_stripe_size = 1024 * 1024 * 1024;
 		max_chunk_size = 10 * max_stripe_size;
+		if (!devs_max)
+			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 		/* for larger filesystems, use larger metadata chunks */
 		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
@@ -4042,11 +4126,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		else
 			max_stripe_size = 256 * 1024 * 1024;
 		max_chunk_size = max_stripe_size;
+		if (!devs_max)
+			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		max_stripe_size = 32 * 1024 * 1024;
 		max_chunk_size = 2 * max_stripe_size;
+		if (!devs_max)
+			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
 	} else {
-		btrfs_err(info, "invalid chunk type 0x%llx requested\n",
+		btrfs_err(info, "invalid chunk type 0x%llx requested",
 			  type);
 		BUG_ON(1);
 	}
@@ -4294,7 +4382,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 
 	if (em->start != chunk_offset || em->len != chunk_size) {
 		btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
-			    " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+			    " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
 			    chunk_size, em->start, em->len);
 		free_extent_map(em);
 		return -EINVAL;
@@ -4496,14 +4584,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	 * and exit, so return 1 so the callers don't try to use other copies.
 	 */
 	if (!em) {
-		btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
+		btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
 			    logical+len);
 		return 1;
 	}
 
 	if (em->start > logical || em->start + em->len < logical) {
 		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
-			    "%Lu-%Lu\n", logical, logical+len, em->start,
+			    "%Lu-%Lu", logical, logical+len, em->start,
 			    em->start + em->len);
 		free_extent_map(em);
 		return 1;
@@ -4684,7 +4772,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 
 	if (em->start > logical || em->start + em->len < logical) {
 		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
-			   "found %Lu-%Lu\n", logical, em->start,
+			   "found %Lu-%Lu", logical, em->start,
 			   em->start + em->len);
 		free_extent_map(em);
 		return -EINVAL;
@@ -6058,10 +6146,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
 
-	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry(device, &fs_devices->devices, dev_list)
-		device->dev_root = fs_info->dev_root;
-	mutex_unlock(&fs_devices->device_list_mutex);
+	while (fs_devices) {
+		mutex_lock(&fs_devices->device_list_mutex);
+		list_for_each_entry(device, &fs_devices->devices, dev_list)
+			device->dev_root = fs_info->dev_root;
+		mutex_unlock(&fs_devices->device_list_mutex);
+
+		fs_devices = fs_devices->seed;
+	}
 }
 
 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 80754f9dd3df..1a15bbeb65e2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -255,6 +255,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)
 #define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
 
 /*
  * Profile changing flags. When SOFT is set we won't relocate chunk if
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8e57191950cb..4f196314c0c1 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws,
 
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
 		printk(KERN_WARNING "BTRFS: deflateInit failed\n");
-		ret = -1;
+		ret = -EIO;
 		goto out;
 	}
 
@@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,
 
 	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (out_page == NULL) {
-		ret = -1;
+		ret = -ENOMEM;
 		goto out;
 	}
 	cpage_out = kmap(out_page);
@@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws,
 			printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
 			       ret);
 			zlib_deflateEnd(&workspace->def_strm);
-			ret = -1;
+			ret = -EIO;
 			goto out;
 		}
 
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
 		if (workspace->def_strm.total_in > 8192 &&
 		    workspace->def_strm.total_in <
 		    workspace->def_strm.total_out) {
-			ret = -1;
+			ret = -EIO;
 			goto out;
 		}
 		/* we need another page for writing out. Test this
@@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,
 			kunmap(out_page);
 			if (nr_pages == nr_dest_pages) {
 				out_page = NULL;
-				ret = -1;
+				ret = -E2BIG;
 				goto out;
 			}
 			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 			if (out_page == NULL) {
-				ret = -1;
+				ret = -ENOMEM;
 				goto out;
 			}
 			cpage_out = kmap(out_page);
@@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,
 	zlib_deflateEnd(&workspace->def_strm);
 
 	if (ret != Z_STREAM_END) {
-		ret = -1;
+		ret = -EIO;
 		goto out;
 	}
 
 	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
-		ret = -1;
+		ret = -E2BIG;
 		goto out;
 	}
 
@@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
 		printk(KERN_WARNING "BTRFS: inflateInit failed\n");
-		return -1;
+		return -EIO;
 	}
 	while (workspace->inf_strm.total_in < srclen) {
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 		}
 	}
 	if (ret != Z_STREAM_END)
-		ret = -1;
+		ret = -EIO;
 	else
 		ret = 0;
 done:
@@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
 		printk(KERN_WARNING "BTRFS: inflateInit failed\n");
-		return -1;
+		return -EIO;
 	}
 
 	while (bytes_left > 0) {
@@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 		total_out = workspace->inf_strm.total_out;
 
 		if (total_out == buf_start) {
-			ret = -1;
+			ret = -EIO;
 			break;
 		}
 
@@ -382,7 +382,7 @@ next:
 	}
 
 	if (ret != Z_STREAM_END && bytes_left != 0)
-		ret = -1;
+		ret = -EIO;
 	else
 		ret = 0;
 
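
Taken together, the zlib.c hunks replace the bare -1 (which, read as an errno, was -EPERM) with a code per failure class. A stand-alone condensation of the mapping (the enum is hypothetical; the kernel code sets ret inline at each site):

#include <errno.h>
#include <stdio.h>

enum zlib_fail { ZF_STREAM, ZF_ALLOC, ZF_GREW };

static int zlib_errno(enum zlib_fail why)
{
	switch (why) {
	case ZF_STREAM:	return -EIO;	/* stream error, no Z_STREAM_END */
	case ZF_ALLOC:	return -ENOMEM;	/* alloc_page() failed */
	case ZF_GREW:	return -E2BIG;	/* output would not shrink or fit */
	}
	return 0;
}

int main(void)
{
	printf("%d %d %d\n", zlib_errno(ZF_STREAM),
	       zlib_errno(ZF_ALLOC), zlib_errno(ZF_GREW));
	return 0;
}

Callers can then propagate a meaningful error instead of the old catch-all -1.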