aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2014-08-16 11:06:55 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-08-16 11:06:55 -0400
commit e64df3ebe8262c8203d1fe4f541e0241c3112c01 (patch)
tree 86511b00e8626a08089bb169fe5a64c542925762
parent 53b95d6341c142a02538e41bdf1405ef8888bf8b (diff)
parent 8d875f95da43c6a8f18f77869f2ef26e9594fecc (diff)
Merge branch 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "These are all fixes I'd like to get out to a broader audience. The biggest of the bunch is Mark's quota fix, which is also in the SUSE kernel, and makes our subvolume quotas dramatically more accurate. I've been running xfstests with these against your current git overnight, but I'm queueing up longer tests as well"

* 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  btrfs: disable strict file flushes for renames and truncates
  Btrfs: fix csum tree corruption, duplicate and outdated checksums
  Btrfs: Fix memory corruption by ulist_add_merge() on 32bit arch
  Btrfs: fix compressed write corruption on enospc
  btrfs: correctly handle return from ulist_add
  btrfs: qgroup: account shared subtrees during snapshot delete
  Btrfs: read lock extent buffer while walking backrefs
  Btrfs: __btrfs_mod_ref should always use no_quota
  btrfs: adjust statfs calculations according to raid profiles
-rw-r--r-- fs/btrfs/backref.c 14
-rw-r--r-- fs/btrfs/btrfs_inode.h 6
-rw-r--r-- fs/btrfs/ctree.c 20
-rw-r--r-- fs/btrfs/ctree.h 4
-rw-r--r-- fs/btrfs/disk-io.c 32
-rw-r--r-- fs/btrfs/extent-tree.c 285
-rw-r--r-- fs/btrfs/file-item.c 2
-rw-r--r-- fs/btrfs/file.c 26
-rw-r--r-- fs/btrfs/inode.c 59
-rw-r--r-- fs/btrfs/ordered-data.c 123
-rw-r--r-- fs/btrfs/ordered-data.h 5
-rw-r--r-- fs/btrfs/qgroup.c 169
-rw-r--r-- fs/btrfs/qgroup.h 1
-rw-r--r-- fs/btrfs/super.c 51
-rw-r--r-- fs/btrfs/transaction.c 33
-rw-r--r-- fs/btrfs/transaction.h 1
-rw-r--r-- fs/btrfs/ulist.h 15
17 files changed, 541 insertions, 305 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..54a201dac7f9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
276 } 276 }
277 if (ret > 0) 277 if (ret > 0)
278 goto next; 278 goto next;
279 ret = ulist_add_merge(parents, eb->start, 279 ret = ulist_add_merge_ptr(parents, eb->start,
280 (uintptr_t)eie, 280 eie, (void **)&old, GFP_NOFS);
281 (u64 *)&old, GFP_NOFS);
282 if (ret < 0) 281 if (ret < 0)
283 break; 282 break;
284 if (!ret && extent_item_pos) { 283 if (!ret && extent_item_pos) {
@@ -1001,16 +1000,19 @@ again:
1001 ret = -EIO; 1000 ret = -EIO;
1002 goto out; 1001 goto out;
1003 } 1002 }
1003 btrfs_tree_read_lock(eb);
1004 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1004 ret = find_extent_in_eb(eb, bytenr, 1005 ret = find_extent_in_eb(eb, bytenr,
1005 *extent_item_pos, &eie); 1006 *extent_item_pos, &eie);
1007 btrfs_tree_read_unlock_blocking(eb);
1006 free_extent_buffer(eb); 1008 free_extent_buffer(eb);
1007 if (ret < 0) 1009 if (ret < 0)
1008 goto out; 1010 goto out;
1009 ref->inode_list = eie; 1011 ref->inode_list = eie;
1010 } 1012 }
1011 ret = ulist_add_merge(refs, ref->parent, 1013 ret = ulist_add_merge_ptr(refs, ref->parent,
1012 (uintptr_t)ref->inode_list, 1014 ref->inode_list,
1013 (u64 *)&eie, GFP_NOFS); 1015 (void **)&eie, GFP_NOFS);
1014 if (ret < 0) 1016 if (ret < 0)
1015 goto out; 1017 goto out;
1016 if (!ret && extent_item_pos) { 1018 if (!ret && extent_item_pos) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..43527fd78825 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -84,12 +84,6 @@ struct btrfs_inode {
84 */ 84 */
85 struct list_head delalloc_inodes; 85 struct list_head delalloc_inodes;
86 86
87 /*
88 * list for tracking inodes that must be sent to disk before a
89 * rename or truncate commit
90 */
91 struct list_head ordered_operations;
92
93 /* node for the red-black tree that links inodes in subvolume root */ 87 /* node for the red-black tree that links inodes in subvolume root */
94 struct rb_node rb_node; 88 struct rb_node rb_node;
95 89
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..44ee5d2e52a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
280 280
281 WARN_ON(btrfs_header_generation(buf) > trans->transid); 281 WARN_ON(btrfs_header_generation(buf) > trans->transid);
282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
283 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 283 ret = btrfs_inc_ref(trans, root, cow, 1);
284 else 284 else
285 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 285 ret = btrfs_inc_ref(trans, root, cow, 0);
286 286
287 if (ret) 287 if (ret)
288 return ret; 288 return ret;
@@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1035 if ((owner == root->root_key.objectid || 1035 if ((owner == root->root_key.objectid ||
1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1038 ret = btrfs_inc_ref(trans, root, buf, 1, 1); 1038 ret = btrfs_inc_ref(trans, root, buf, 1);
1039 BUG_ON(ret); /* -ENOMEM */ 1039 BUG_ON(ret); /* -ENOMEM */
1040 1040
1041 if (root->root_key.objectid == 1041 if (root->root_key.objectid ==
1042 BTRFS_TREE_RELOC_OBJECTID) { 1042 BTRFS_TREE_RELOC_OBJECTID) {
1043 ret = btrfs_dec_ref(trans, root, buf, 0, 1); 1043 ret = btrfs_dec_ref(trans, root, buf, 0);
1044 BUG_ON(ret); /* -ENOMEM */ 1044 BUG_ON(ret); /* -ENOMEM */
1045 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1045 ret = btrfs_inc_ref(trans, root, cow, 1);
1046 BUG_ON(ret); /* -ENOMEM */ 1046 BUG_ON(ret); /* -ENOMEM */
1047 } 1047 }
1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1050 1050
1051 if (root->root_key.objectid == 1051 if (root->root_key.objectid ==
1052 BTRFS_TREE_RELOC_OBJECTID) 1052 BTRFS_TREE_RELOC_OBJECTID)
1053 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1053 ret = btrfs_inc_ref(trans, root, cow, 1);
1054 else 1054 else
1055 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1055 ret = btrfs_inc_ref(trans, root, cow, 0);
1056 BUG_ON(ret); /* -ENOMEM */ 1056 BUG_ON(ret); /* -ENOMEM */
1057 } 1057 }
1058 if (new_flags != 0) { 1058 if (new_flags != 0) {
@@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1070 if (root->root_key.objectid == 1070 if (root->root_key.objectid ==
1071 BTRFS_TREE_RELOC_OBJECTID) 1071 BTRFS_TREE_RELOC_OBJECTID)
1072 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1072 ret = btrfs_inc_ref(trans, root, cow, 1);
1073 else 1073 else
1074 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1074 ret = btrfs_inc_ref(trans, root, cow, 0);
1075 BUG_ON(ret); /* -ENOMEM */ 1075 BUG_ON(ret); /* -ENOMEM */
1076 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 1076 ret = btrfs_dec_ref(trans, root, buf, 1);
1077 BUG_ON(ret); /* -ENOMEM */ 1077 BUG_ON(ret); /* -ENOMEM */
1078 } 1078 }
1079 clean_tree_block(trans, root, buf); 1079 clean_tree_block(trans, root, buf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be91397f4e92..8e29b614fe93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte, 3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
3327 struct btrfs_key *ins, int is_data, int delalloc); 3327 struct btrfs_key *ins, int is_data, int delalloc);
3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3329 struct extent_buffer *buf, int full_backref, int no_quota); 3329 struct extent_buffer *buf, int full_backref);
3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3331 struct extent_buffer *buf, int full_backref, int no_quota); 3331 struct extent_buffer *buf, int full_backref);
3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3333 struct btrfs_root *root, 3333 struct btrfs_root *root,
3334 u64 bytenr, u64 num_bytes, u64 flags, 3334 u64 bytenr, u64 num_bytes, u64 flags,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 08e65e9cf2aa..d0ed9e664f7d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
60static void free_fs_root(struct btrfs_root *root); 60static void free_fs_root(struct btrfs_root *root);
61static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 61static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
62 int read_only); 62 int read_only);
63static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
64 struct btrfs_root *root);
65static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 63static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
66static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 64static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
67 struct btrfs_root *root); 65 struct btrfs_root *root);
@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
3829 btrfs_cleanup_transaction(root); 3827 btrfs_cleanup_transaction(root);
3830} 3828}
3831 3829
3832static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3833 struct btrfs_root *root)
3834{
3835 struct btrfs_inode *btrfs_inode;
3836 struct list_head splice;
3837
3838 INIT_LIST_HEAD(&splice);
3839
3840 mutex_lock(&root->fs_info->ordered_operations_mutex);
3841 spin_lock(&root->fs_info->ordered_root_lock);
3842
3843 list_splice_init(&t->ordered_operations, &splice);
3844 while (!list_empty(&splice)) {
3845 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3846 ordered_operations);
3847
3848 list_del_init(&btrfs_inode->ordered_operations);
3849 spin_unlock(&root->fs_info->ordered_root_lock);
3850
3851 btrfs_invalidate_inodes(btrfs_inode->root);
3852
3853 spin_lock(&root->fs_info->ordered_root_lock);
3854 }
3855
3856 spin_unlock(&root->fs_info->ordered_root_lock);
3857 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3858}
3859
3860static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 3830static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3861{ 3831{
3862 struct btrfs_ordered_extent *ordered; 3832 struct btrfs_ordered_extent *ordered;
@@ -4093,8 +4063,6 @@ again:
4093void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4063void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4094 struct btrfs_root *root) 4064 struct btrfs_root *root)
4095{ 4065{
4096 btrfs_destroy_ordered_operations(cur_trans, root);
4097
4098 btrfs_destroy_delayed_refs(cur_trans, root); 4066 btrfs_destroy_delayed_refs(cur_trans, root);
4099 4067
4100 cur_trans->state = TRANS_STATE_COMMIT_START; 4068 cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813537f362f9..102ed3143976 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3057,7 +3057,7 @@ out:
3057static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3057static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3058 struct btrfs_root *root, 3058 struct btrfs_root *root,
3059 struct extent_buffer *buf, 3059 struct extent_buffer *buf,
3060 int full_backref, int inc, int no_quota) 3060 int full_backref, int inc)
3061{ 3061{
3062 u64 bytenr; 3062 u64 bytenr;
3063 u64 num_bytes; 3063 u64 num_bytes;
@@ -3111,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3111 key.offset -= btrfs_file_extent_offset(buf, fi); 3111 key.offset -= btrfs_file_extent_offset(buf, fi);
3112 ret = process_func(trans, root, bytenr, num_bytes, 3112 ret = process_func(trans, root, bytenr, num_bytes,
3113 parent, ref_root, key.objectid, 3113 parent, ref_root, key.objectid,
3114 key.offset, no_quota); 3114 key.offset, 1);
3115 if (ret) 3115 if (ret)
3116 goto fail; 3116 goto fail;
3117 } else { 3117 } else {
@@ -3119,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3119 num_bytes = btrfs_level_size(root, level - 1); 3119 num_bytes = btrfs_level_size(root, level - 1);
3120 ret = process_func(trans, root, bytenr, num_bytes, 3120 ret = process_func(trans, root, bytenr, num_bytes,
3121 parent, ref_root, level - 1, 0, 3121 parent, ref_root, level - 1, 0,
3122 no_quota); 3122 1);
3123 if (ret) 3123 if (ret)
3124 goto fail; 3124 goto fail;
3125 } 3125 }
@@ -3130,15 +3130,15 @@ fail:
3130} 3130}
3131 3131
3132int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3132int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3133 struct extent_buffer *buf, int full_backref, int no_quota) 3133 struct extent_buffer *buf, int full_backref)
3134{ 3134{
3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); 3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3136} 3136}
3137 3137
3138int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3138int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3139 struct extent_buffer *buf, int full_backref, int no_quota) 3139 struct extent_buffer *buf, int full_backref)
3140{ 3140{
3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); 3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3142} 3142}
3143 3143
3144static int write_one_cache_group(struct btrfs_trans_handle *trans, 3144static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -7478,6 +7478,220 @@ reada:
7478 wc->reada_slot = slot; 7478 wc->reada_slot = slot;
7479} 7479}
7480 7480
7481static int account_leaf_items(struct btrfs_trans_handle *trans,
7482 struct btrfs_root *root,
7483 struct extent_buffer *eb)
7484{
7485 int nr = btrfs_header_nritems(eb);
7486 int i, extent_type, ret;
7487 struct btrfs_key key;
7488 struct btrfs_file_extent_item *fi;
7489 u64 bytenr, num_bytes;
7490
7491 for (i = 0; i < nr; i++) {
7492 btrfs_item_key_to_cpu(eb, &key, i);
7493
7494 if (key.type != BTRFS_EXTENT_DATA_KEY)
7495 continue;
7496
7497 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7498 /* filter out non qgroup-accountable extents */
7499 extent_type = btrfs_file_extent_type(eb, fi);
7500
7501 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7502 continue;
7503
7504 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7505 if (!bytenr)
7506 continue;
7507
7508 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7509
7510 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7511 root->objectid,
7512 bytenr, num_bytes,
7513 BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
7514 if (ret)
7515 return ret;
7516 }
7517 return 0;
7518}
7519
7520/*
7521 * Walk up the tree from the bottom, freeing leaves and any interior
7522 * nodes which have had all slots visited. If a node (leaf or
7523 * interior) is freed, the node above it will have it's slot
7524 * incremented. The root node will never be freed.
7525 *
7526 * At the end of this function, we should have a path which has all
7527 * slots incremented to the next position for a search. If we need to
7528 * read a new node it will be NULL and the node above it will have the
7529 * correct slot selected for a later read.
7530 *
7531 * If we increment the root nodes slot counter past the number of
7532 * elements, 1 is returned to signal completion of the search.
7533 */
7534static int adjust_slots_upwards(struct btrfs_root *root,
7535 struct btrfs_path *path, int root_level)
7536{
7537 int level = 0;
7538 int nr, slot;
7539 struct extent_buffer *eb;
7540
7541 if (root_level == 0)
7542 return 1;
7543
7544 while (level <= root_level) {
7545 eb = path->nodes[level];
7546 nr = btrfs_header_nritems(eb);
7547 path->slots[level]++;
7548 slot = path->slots[level];
7549 if (slot >= nr || level == 0) {
7550 /*
7551 * Don't free the root - we will detect this
7552 * condition after our loop and return a
7553 * positive value for caller to stop walking the tree.
7554 */
7555 if (level != root_level) {
7556 btrfs_tree_unlock_rw(eb, path->locks[level]);
7557 path->locks[level] = 0;
7558
7559 free_extent_buffer(eb);
7560 path->nodes[level] = NULL;
7561 path->slots[level] = 0;
7562 }
7563 } else {
7564 /*
7565 * We have a valid slot to walk back down
7566 * from. Stop here so caller can process these
7567 * new nodes.
7568 */
7569 break;
7570 }
7571
7572 level++;
7573 }
7574
7575 eb = path->nodes[root_level];
7576 if (path->slots[root_level] >= btrfs_header_nritems(eb))
7577 return 1;
7578
7579 return 0;
7580}
7581
7582/*
7583 * root_eb is the subtree root and is locked before this function is called.
7584 */
7585static int account_shared_subtree(struct btrfs_trans_handle *trans,
7586 struct btrfs_root *root,
7587 struct extent_buffer *root_eb,
7588 u64 root_gen,
7589 int root_level)
7590{
7591 int ret = 0;
7592 int level;
7593 struct extent_buffer *eb = root_eb;
7594 struct btrfs_path *path = NULL;
7595
7596 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7597 BUG_ON(root_eb == NULL);
7598
7599 if (!root->fs_info->quota_enabled)
7600 return 0;
7601
7602 if (!extent_buffer_uptodate(root_eb)) {
7603 ret = btrfs_read_buffer(root_eb, root_gen);
7604 if (ret)
7605 goto out;
7606 }
7607
7608 if (root_level == 0) {
7609 ret = account_leaf_items(trans, root, root_eb);
7610 goto out;
7611 }
7612
7613 path = btrfs_alloc_path();
7614 if (!path)
7615 return -ENOMEM;
7616
7617 /*
7618 * Walk down the tree. Missing extent blocks are filled in as
7619 * we go. Metadata is accounted every time we read a new
7620 * extent block.
7621 *
7622 * When we reach a leaf, we account for file extent items in it,
7623 * walk back up the tree (adjusting slot pointers as we go)
7624 * and restart the search process.
7625 */
7626 extent_buffer_get(root_eb); /* For path */
7627 path->nodes[root_level] = root_eb;
7628 path->slots[root_level] = 0;
7629 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
7630walk_down:
7631 level = root_level;
7632 while (level >= 0) {
7633 if (path->nodes[level] == NULL) {
7634 int child_bsize = root->nodesize;
7635 int parent_slot;
7636 u64 child_gen;
7637 u64 child_bytenr;
7638
7639 /* We need to get child blockptr/gen from
7640 * parent before we can read it. */
7641 eb = path->nodes[level + 1];
7642 parent_slot = path->slots[level + 1];
7643 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7644 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7645
7646 eb = read_tree_block(root, child_bytenr, child_bsize,
7647 child_gen);
7648 if (!eb || !extent_buffer_uptodate(eb)) {
7649 ret = -EIO;
7650 goto out;
7651 }
7652
7653 path->nodes[level] = eb;
7654 path->slots[level] = 0;
7655
7656 btrfs_tree_read_lock(eb);
7657 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
7658 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
7659
7660 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7661 root->objectid,
7662 child_bytenr,
7663 child_bsize,
7664 BTRFS_QGROUP_OPER_SUB_SUBTREE,
7665 0);
7666 if (ret)
7667 goto out;
7668
7669 }
7670
7671 if (level == 0) {
7672 ret = account_leaf_items(trans, root, path->nodes[level]);
7673 if (ret)
7674 goto out;
7675
7676 /* Nonzero return here means we completed our search */
7677 ret = adjust_slots_upwards(root, path, root_level);
7678 if (ret)
7679 break;
7680
7681 /* Restart search with new slots */
7682 goto walk_down;
7683 }
7684
7685 level--;
7686 }
7687
7688 ret = 0;
7689out:
7690 btrfs_free_path(path);
7691
7692 return ret;
7693}
7694
7481/* 7695/*
7482 * helper to process tree block while walking down the tree. 7696 * helper to process tree block while walking down the tree.
7483 * 7697 *
@@ -7532,9 +7746,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7532 /* wc->stage == UPDATE_BACKREF */ 7746 /* wc->stage == UPDATE_BACKREF */
7533 if (!(wc->flags[level] & flag)) { 7747 if (!(wc->flags[level] & flag)) {
7534 BUG_ON(!path->locks[level]); 7748 BUG_ON(!path->locks[level]);
7535 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 7749 ret = btrfs_inc_ref(trans, root, eb, 1);
7536 BUG_ON(ret); /* -ENOMEM */ 7750 BUG_ON(ret); /* -ENOMEM */
7537 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 7751 ret = btrfs_dec_ref(trans, root, eb, 0);
7538 BUG_ON(ret); /* -ENOMEM */ 7752 BUG_ON(ret); /* -ENOMEM */
7539 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7753 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7540 eb->len, flag, 7754 eb->len, flag,
@@ -7581,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7581 int level = wc->level; 7795 int level = wc->level;
7582 int reada = 0; 7796 int reada = 0;
7583 int ret = 0; 7797 int ret = 0;
7798 bool need_account = false;
7584 7799
7585 generation = btrfs_node_ptr_generation(path->nodes[level], 7800 generation = btrfs_node_ptr_generation(path->nodes[level],
7586 path->slots[level]); 7801 path->slots[level]);
@@ -7626,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7626 7841
7627 if (wc->stage == DROP_REFERENCE) { 7842 if (wc->stage == DROP_REFERENCE) {
7628 if (wc->refs[level - 1] > 1) { 7843 if (wc->refs[level - 1] > 1) {
7844 need_account = true;
7629 if (level == 1 && 7845 if (level == 1 &&
7630 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7846 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7631 goto skip; 7847 goto skip;
@@ -7689,6 +7905,16 @@ skip:
7689 parent = 0; 7905 parent = 0;
7690 } 7906 }
7691 7907
7908 if (need_account) {
7909 ret = account_shared_subtree(trans, root, next,
7910 generation, level - 1);
7911 if (ret) {
7912 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
7913 "%d accounting shared subtree. Quota "
7914 "is out of sync, rescan required.\n",
7915 root->fs_info->sb->s_id, ret);
7916 }
7917 }
7692 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7918 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7693 root->root_key.objectid, level - 1, 0, 0); 7919 root->root_key.objectid, level - 1, 0, 0);
7694 BUG_ON(ret); /* -ENOMEM */ 7920 BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +7995,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7769 if (wc->refs[level] == 1) { 7995 if (wc->refs[level] == 1) {
7770 if (level == 0) { 7996 if (level == 0) {
7771 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7997 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7772 ret = btrfs_dec_ref(trans, root, eb, 1, 7998 ret = btrfs_dec_ref(trans, root, eb, 1);
7773 wc->for_reloc);
7774 else 7999 else
7775 ret = btrfs_dec_ref(trans, root, eb, 0, 8000 ret = btrfs_dec_ref(trans, root, eb, 0);
7776 wc->for_reloc);
7777 BUG_ON(ret); /* -ENOMEM */ 8001 BUG_ON(ret); /* -ENOMEM */
8002 ret = account_leaf_items(trans, root, eb);
8003 if (ret) {
8004 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8005 "%d accounting leaf items. Quota "
8006 "is out of sync, rescan required.\n",
8007 root->fs_info->sb->s_id, ret);
8008 }
7778 } 8009 }
7779 /* make block locked assertion in clean_tree_block happy */ 8010 /* make block locked assertion in clean_tree_block happy */
7780 if (!path->locks[level] && 8011 if (!path->locks[level] &&
@@ -7900,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7900 int level; 8131 int level;
7901 bool root_dropped = false; 8132 bool root_dropped = false;
7902 8133
8134 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8135
7903 path = btrfs_alloc_path(); 8136 path = btrfs_alloc_path();
7904 if (!path) { 8137 if (!path) {
7905 err = -ENOMEM; 8138 err = -ENOMEM;
@@ -8025,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
8025 goto out_end_trans; 8258 goto out_end_trans;
8026 } 8259 }
8027 8260
8261 /*
8262 * Qgroup update accounting is run from
8263 * delayed ref handling. This usually works
8264 * out because delayed refs are normally the
8265 * only way qgroup updates are added. However,
8266 * we may have added updates during our tree
8267 * walk so run qgroups here to make sure we
8268 * don't lose any updates.
8269 */
8270 ret = btrfs_delayed_qgroup_accounting(trans,
8271 root->fs_info);
8272 if (ret)
8273 printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8274 "running qgroup updates "
8275 "during snapshot delete. "
8276 "Quota is out of sync, "
8277 "rescan required.\n", ret);
8278
8028 btrfs_end_transaction_throttle(trans, tree_root); 8279 btrfs_end_transaction_throttle(trans, tree_root);
8029 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8280 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8030 pr_debug("BTRFS: drop snapshot early exit\n"); 8281 pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
8078 } 8329 }
8079 root_dropped = true; 8330 root_dropped = true;
8080out_end_trans: 8331out_end_trans:
8332 ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
8333 if (ret)
8334 printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8335 "running qgroup updates "
8336 "during snapshot delete. "
8337 "Quota is out of sync, "
8338 "rescan required.\n", ret);
8339
8081 btrfs_end_transaction_throttle(trans, tree_root); 8340 btrfs_end_transaction_throttle(trans, tree_root);
8082out_free: 8341out_free:
8083 kfree(wc); 8342 kfree(wc);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..54c84daec9b5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -756,7 +756,7 @@ again:
756 found_next = 1; 756 found_next = 1;
757 if (ret != 0) 757 if (ret != 0)
758 goto insert; 758 goto insert;
759 slot = 0; 759 slot = path->slots[0];
760 } 760 }
761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); 761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..d3afac292d67 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1838,33 +1838,9 @@ out:
1838 1838
1839int btrfs_release_file(struct inode *inode, struct file *filp) 1839int btrfs_release_file(struct inode *inode, struct file *filp)
1840{ 1840{
1841 /*
1842 * ordered_data_close is set by settattr when we are about to truncate
1843 * a file from a non-zero size to a zero size. This tries to
1844 * flush down new bytes that may have been written if the
1845 * application were using truncate to replace a file in place.
1846 */
1847 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1848 &BTRFS_I(inode)->runtime_flags)) {
1849 struct btrfs_trans_handle *trans;
1850 struct btrfs_root *root = BTRFS_I(inode)->root;
1851
1852 /*
1853 * We need to block on a committing transaction to keep us from
1854 * throwing a ordered operation on to the list and causing
1855 * something like sync to deadlock trying to flush out this
1856 * inode.
1857 */
1858 trans = btrfs_start_transaction(root, 0);
1859 if (IS_ERR(trans))
1860 return PTR_ERR(trans);
1861 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1862 btrfs_end_transaction(trans, root);
1863 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1864 filemap_flush(inode->i_mapping);
1865 }
1866 if (filp->private_data) 1841 if (filp->private_data)
1867 btrfs_ioctl_trans_end(filp); 1842 btrfs_ioctl_trans_end(filp);
1843 filemap_flush(inode->i_mapping);
1868 return 0; 1844 return 0;
1869} 1845}
1870 1846
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3183742d6f0d..03708ef3deef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -709,6 +709,18 @@ retry:
709 unlock_extent(io_tree, async_extent->start, 709 unlock_extent(io_tree, async_extent->start,
710 async_extent->start + 710 async_extent->start +
711 async_extent->ram_size - 1); 711 async_extent->ram_size - 1);
712
713 /*
714 * we need to redirty the pages if we decide to
715 * fallback to uncompressed IO, otherwise we
716 * will not submit these pages down to lower
717 * layers.
718 */
719 extent_range_redirty_for_io(inode,
720 async_extent->start,
721 async_extent->start +
722 async_extent->ram_size - 1);
723
712 goto retry; 724 goto retry;
713 } 725 }
714 goto out_free; 726 goto out_free;
@@ -7939,27 +7951,6 @@ static int btrfs_truncate(struct inode *inode)
7939 BUG_ON(ret); 7951 BUG_ON(ret);
7940 7952
7941 /* 7953 /*
7942 * setattr is responsible for setting the ordered_data_close flag,
7943 * but that is only tested during the last file release. That
7944 * could happen well after the next commit, leaving a great big
7945 * window where new writes may get lost if someone chooses to write
7946 * to this file after truncating to zero
7947 *
7948 * The inode doesn't have any dirty data here, and so if we commit
7949 * this is a noop. If someone immediately starts writing to the inode
7950 * it is very likely we'll catch some of their writes in this
7951 * transaction, and the commit will find this file on the ordered
7952 * data list with good things to send down.
7953 *
7954 * This is a best effort solution, there is still a window where
7955 * using truncate to replace the contents of the file will
7956 * end up with a zero length file after a crash.
7957 */
7958 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7959 &BTRFS_I(inode)->runtime_flags))
7960 btrfs_add_ordered_operation(trans, root, inode);
7961
7962 /*
7963 * So if we truncate and then write and fsync we normally would just 7954 * So if we truncate and then write and fsync we normally would just
7964 * write the extents that changed, which is a problem if we need to 7955 * write the extents that changed, which is a problem if we need to
7965 * first truncate that entire inode. So set this flag so we write out 7956 * first truncate that entire inode. So set this flag so we write out
@@ -8106,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8106 mutex_init(&ei->delalloc_mutex); 8097 mutex_init(&ei->delalloc_mutex);
8107 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8098 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8108 INIT_LIST_HEAD(&ei->delalloc_inodes); 8099 INIT_LIST_HEAD(&ei->delalloc_inodes);
8109 INIT_LIST_HEAD(&ei->ordered_operations);
8110 RB_CLEAR_NODE(&ei->rb_node); 8100 RB_CLEAR_NODE(&ei->rb_node);
8111 8101
8112 return inode; 8102 return inode;
@@ -8146,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
8146 if (!root) 8136 if (!root)
8147 goto free; 8137 goto free;
8148 8138
8149 /*
8150 * Make sure we're properly removed from the ordered operation
8151 * lists.
8152 */
8153 smp_mb();
8154 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8155 spin_lock(&root->fs_info->ordered_root_lock);
8156 list_del_init(&BTRFS_I(inode)->ordered_operations);
8157 spin_unlock(&root->fs_info->ordered_root_lock);
8158 }
8159
8160 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 8139 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8161 &BTRFS_I(inode)->runtime_flags)) { 8140 &BTRFS_I(inode)->runtime_flags)) {
8162 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 8141 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8338 ret = 0; 8317 ret = 0;
8339 8318
8340 /* 8319 /*
8341 * we're using rename to replace one file with another. 8320 * we're using rename to replace one file with another. Start IO on it
8342 * and the replacement file is large. Start IO on it now so 8321 * now so we don't add too much work to the end of the transaction
8343 * we don't add too much work to the end of the transaction
8344 */ 8322 */
8345 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 8323 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8346 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8347 filemap_flush(old_inode->i_mapping); 8324 filemap_flush(old_inode->i_mapping);
8348 8325
8349 /* close the racy window with snapshot create/destroy ioctl */ 8326 /* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8391 */ 8368 */
8392 btrfs_pin_log_trans(root); 8369 btrfs_pin_log_trans(root);
8393 } 8370 }
8394 /*
8395 * make sure the inode gets flushed if it is replacing
8396 * something.
8397 */
8398 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8399 btrfs_add_ordered_operation(trans, root, old_inode);
8400 8371
8401 inode_inc_iversion(old_dir); 8372 inode_inc_iversion(old_dir);
8402 inode_inc_iversion(new_dir); 8373 inode_inc_iversion(new_dir);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..963895c1f801 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
571 571
572 trace_btrfs_ordered_extent_remove(inode, entry); 572 trace_btrfs_ordered_extent_remove(inode, entry);
573 573
574 /*
575 * we have no more ordered extents for this inode and
576 * no dirty pages. We can safely remove it from the
577 * list of ordered extents
578 */
579 if (RB_EMPTY_ROOT(&tree->tree) &&
580 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
581 spin_lock(&root->fs_info->ordered_root_lock);
582 list_del_init(&BTRFS_I(inode)->ordered_operations);
583 spin_unlock(&root->fs_info->ordered_root_lock);
584 }
585
586 if (!root->nr_ordered_extents) { 574 if (!root->nr_ordered_extents) {
587 spin_lock(&root->fs_info->ordered_root_lock); 575 spin_lock(&root->fs_info->ordered_root_lock);
588 BUG_ON(list_empty(&root->ordered_root)); 576 BUG_ON(list_empty(&root->ordered_root));
@@ -687,81 +675,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
687} 675}
688 676
689/* 677/*
690 * this is used during transaction commit to write all the inodes
691 * added to the ordered operation list. These files must be fully on
692 * disk before the transaction commits.
693 *
694 * we have two modes here, one is to just start the IO via filemap_flush
695 * and the other is to wait for all the io. When we wait, we have an
696 * extra check to make sure the ordered operation list really is empty
697 * before we return
698 */
699int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
700 struct btrfs_root *root, int wait)
701{
702 struct btrfs_inode *btrfs_inode;
703 struct inode *inode;
704 struct btrfs_transaction *cur_trans = trans->transaction;
705 struct list_head splice;
706 struct list_head works;
707 struct btrfs_delalloc_work *work, *next;
708 int ret = 0;
709
710 INIT_LIST_HEAD(&splice);
711 INIT_LIST_HEAD(&works);
712
713 mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
714 spin_lock(&root->fs_info->ordered_root_lock);
715 list_splice_init(&cur_trans->ordered_operations, &splice);
716 while (!list_empty(&splice)) {
717 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
718 ordered_operations);
719 inode = &btrfs_inode->vfs_inode;
720
721 list_del_init(&btrfs_inode->ordered_operations);
722
723 /*
724 * the inode may be getting freed (in sys_unlink path).
725 */
726 inode = igrab(inode);
727 if (!inode)
728 continue;
729
730 if (!wait)
731 list_add_tail(&BTRFS_I(inode)->ordered_operations,
732 &cur_trans->ordered_operations);
733 spin_unlock(&root->fs_info->ordered_root_lock);
734
735 work = btrfs_alloc_delalloc_work(inode, wait, 1);
736 if (!work) {
737 spin_lock(&root->fs_info->ordered_root_lock);
738 if (list_empty(&BTRFS_I(inode)->ordered_operations))
739 list_add_tail(&btrfs_inode->ordered_operations,
740 &splice);
741 list_splice_tail(&splice,
742 &cur_trans->ordered_operations);
743 spin_unlock(&root->fs_info->ordered_root_lock);
744 ret = -ENOMEM;
745 goto out;
746 }
747 list_add_tail(&work->list, &works);
748 btrfs_queue_work(root->fs_info->flush_workers,
749 &work->work);
750
751 cond_resched();
752 spin_lock(&root->fs_info->ordered_root_lock);
753 }
754 spin_unlock(&root->fs_info->ordered_root_lock);
755out:
756 list_for_each_entry_safe(work, next, &works, list) {
757 list_del_init(&work->list);
758 btrfs_wait_and_free_delalloc_work(work);
759 }
760 mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
761 return ret;
762}
763
764/*
765 * Used to start IO or wait for a given ordered extent to finish. 678 * Used to start IO or wait for a given ordered extent to finish.
766 * 679 *
767 * If wait is one, this effectively waits on page writeback for all the pages 680 * If wait is one, this effectively waits on page writeback for all the pages
@@ -1120,42 +1033,6 @@ out:
1120 return index; 1033 return index;
1121} 1034}
1122 1035
1123
1124/*
1125 * add a given inode to the list of inodes that must be fully on
1126 * disk before a transaction commit finishes.
1127 *
1128 * This basically gives us the ext3 style data=ordered mode, and it is mostly
1129 * used to make sure renamed files are fully on disk.
1130 *
1131 * It is a noop if the inode is already fully on disk.
1132 *
1133 * If trans is not null, we'll do a friendly check for a transaction that
1134 * is already flushing things and force the IO down ourselves.
1135 */
1136void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1137 struct btrfs_root *root, struct inode *inode)
1138{
1139 struct btrfs_transaction *cur_trans = trans->transaction;
1140 u64 last_mod;
1141
1142 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
1143
1144 /*
1145 * if this file hasn't been changed since the last transaction
1146 * commit, we can safely return without doing anything
1147 */
1148 if (last_mod <= root->fs_info->last_trans_committed)
1149 return;
1150
1151 spin_lock(&root->fs_info->ordered_root_lock);
1152 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1153 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1154 &cur_trans->ordered_operations);
1155 }
1156 spin_unlock(&root->fs_info->ordered_root_lock);
1157}
1158
1159int __init ordered_data_init(void) 1036int __init ordered_data_init(void)
1160{ 1037{
1161 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", 1038 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
192 u32 *sum, int len); 192 u32 *sum, int len);
193int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, int wait);
195void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
196 struct btrfs_root *root,
197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct inode *inode, 195void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..b497498484be 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1201,6 +1201,50 @@ out:
1201 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1201 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1202 return ret; 1202 return ret;
1203} 1203}
1204
1205static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
1206 struct btrfs_qgroup_operation *oper2)
1207{
1208 /*
1209 * Ignore seq and type here, we're looking for any operation
1210 * at all related to this extent on that root.
1211 */
1212 if (oper1->bytenr < oper2->bytenr)
1213 return -1;
1214 if (oper1->bytenr > oper2->bytenr)
1215 return 1;
1216 if (oper1->ref_root < oper2->ref_root)
1217 return -1;
1218 if (oper1->ref_root > oper2->ref_root)
1219 return 1;
1220 return 0;
1221}
1222
1223static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
1224 struct btrfs_qgroup_operation *oper)
1225{
1226 struct rb_node *n;
1227 struct btrfs_qgroup_operation *cur;
1228 int cmp;
1229
1230 spin_lock(&fs_info->qgroup_op_lock);
1231 n = fs_info->qgroup_op_tree.rb_node;
1232 while (n) {
1233 cur = rb_entry(n, struct btrfs_qgroup_operation, n);
1234 cmp = comp_oper_exist(cur, oper);
1235 if (cmp < 0) {
1236 n = n->rb_right;
1237 } else if (cmp) {
1238 n = n->rb_left;
1239 } else {
1240 spin_unlock(&fs_info->qgroup_op_lock);
1241 return -EEXIST;
1242 }
1243 }
1244 spin_unlock(&fs_info->qgroup_op_lock);
1245 return 0;
1246}
1247
1204static int comp_oper(struct btrfs_qgroup_operation *oper1, 1248static int comp_oper(struct btrfs_qgroup_operation *oper1,
1205 struct btrfs_qgroup_operation *oper2) 1249 struct btrfs_qgroup_operation *oper2)
1206{ 1250{
@@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1290 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); 1334 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1291 INIT_LIST_HEAD(&oper->elem.list); 1335 INIT_LIST_HEAD(&oper->elem.list);
1292 oper->elem.seq = 0; 1336 oper->elem.seq = 0;
1337
1338 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1339 /*
1340 * If any operation for this bytenr/ref_root combo
1341 * exists, then we know it's not exclusively owned and
1342 * shouldn't be queued up.
1343 *
1344 * This also catches the case where we have a cloned
1345 * extent that gets queued up multiple times during
1346 * drop snapshot.
1347 */
1348 if (qgroup_oper_exists(fs_info, oper)) {
1349 kfree(oper);
1350 return 0;
1351 }
1352 }
1353
1293 ret = insert_qgroup_oper(fs_info, oper); 1354 ret = insert_qgroup_oper(fs_info, oper);
1294 if (ret) { 1355 if (ret) {
1295 /* Shouldn't happen so have an assert for developers */ 1356 /* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1945,111 @@ out:
1884} 1945}
1885 1946
1886/* 1947/*
1948 * Process a reference to a shared subtree. This type of operation is
1949 * queued during snapshot removal when we encounter extents which are
1950 * shared between more than one root.
1951 */
1952static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1953 struct btrfs_fs_info *fs_info,
1954 struct btrfs_qgroup_operation *oper)
1955{
1956 struct ulist *roots = NULL;
1957 struct ulist_node *unode;
1958 struct ulist_iterator uiter;
1959 struct btrfs_qgroup_list *glist;
1960 struct ulist *parents;
1961 int ret = 0;
1962 int err;
1963 struct btrfs_qgroup *qg;
1964 u64 root_obj = 0;
1965 struct seq_list elem = {};
1966
1967 parents = ulist_alloc(GFP_NOFS);
1968 if (!parents)
1969 return -ENOMEM;
1970
1971 btrfs_get_tree_mod_seq(fs_info, &elem);
1972 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1973 elem.seq, &roots);
1974 btrfs_put_tree_mod_seq(fs_info, &elem);
1975 if (ret < 0)
1976 return ret;
1977
1978 if (roots->nnodes != 1)
1979 goto out;
1980
1981 ULIST_ITER_INIT(&uiter);
1982 unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
1983 /*
1984 * If we find our ref root then that means all refs
1985 * this extent has to the root have not yet been
1986 * deleted. In that case, we do nothing and let the
1987 * last ref for this bytenr drive our update.
1988 *
1989 * This can happen for example if an extent is
1990 * referenced multiple times in a snapshot (clone,
1991 * etc). If we are in the middle of snapshot removal,
1992 * queued updates for such an extent will find the
1993 * root if we have not yet finished removing the
1994 * snapshot.
1995 */
1996 if (unode->val == oper->ref_root)
1997 goto out;
1998
1999 root_obj = unode->val;
2000 BUG_ON(!root_obj);
2001
2002 spin_lock(&fs_info->qgroup_lock);
2003 qg = find_qgroup_rb(fs_info, root_obj);
2004 if (!qg)
2005 goto out_unlock;
2006
2007 qg->excl += oper->num_bytes;
2008 qg->excl_cmpr += oper->num_bytes;
2009 qgroup_dirty(fs_info, qg);
2010
2011 /*
2012 * Adjust counts for parent groups. First we find all
2013 * parents, then in the 2nd loop we do the adjustment
2014 * while adding parents of the parents to our ulist.
2015 */
2016 list_for_each_entry(glist, &qg->groups, next_group) {
2017 err = ulist_add(parents, glist->group->qgroupid,
2018 ptr_to_u64(glist->group), GFP_ATOMIC);
2019 if (err < 0) {
2020 ret = err;
2021 goto out_unlock;
2022 }
2023 }
2024
2025 ULIST_ITER_INIT(&uiter);
2026 while ((unode = ulist_next(parents, &uiter))) {
2027 qg = u64_to_ptr(unode->aux);
2028 qg->excl += oper->num_bytes;
2029 qg->excl_cmpr += oper->num_bytes;
2030 qgroup_dirty(fs_info, qg);
2031
2032 /* Add any parents of the parents */
2033 list_for_each_entry(glist, &qg->groups, next_group) {
2034 err = ulist_add(parents, glist->group->qgroupid,
2035 ptr_to_u64(glist->group), GFP_ATOMIC);
2036 if (err < 0) {
2037 ret = err;
2038 goto out_unlock;
2039 }
2040 }
2041 }
2042
2043out_unlock:
2044 spin_unlock(&fs_info->qgroup_lock);
2045
2046out:
2047 ulist_free(roots);
2048 ulist_free(parents);
2049 return ret;
2050}
2051
2052/*
1887 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 2053 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1888 * from the fs. First, all roots referencing the extent are searched, and 2054 * from the fs. First, all roots referencing the extent are searched, and
1889 * then the space is accounted accordingly to the different roots. The 2055 * then the space is accounted accordingly to the different roots. The
@@ -1920,6 +2086,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1920 case BTRFS_QGROUP_OPER_SUB_SHARED: 2086 case BTRFS_QGROUP_OPER_SUB_SHARED:
1921 ret = qgroup_shared_accounting(trans, fs_info, oper); 2087 ret = qgroup_shared_accounting(trans, fs_info, oper);
1922 break; 2088 break;
2089 case BTRFS_QGROUP_OPER_SUB_SUBTREE:
2090 ret = qgroup_subtree_accounting(trans, fs_info, oper);
2091 break;
1923 default: 2092 default:
1924 ASSERT(0); 2093 ASSERT(0);
1925 } 2094 }
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
44 BTRFS_QGROUP_OPER_ADD_SHARED, 44 BTRFS_QGROUP_OPER_ADD_SHARED,
45 BTRFS_QGROUP_OPER_SUB_EXCL, 45 BTRFS_QGROUP_OPER_SUB_EXCL,
46 BTRFS_QGROUP_OPER_SUB_SHARED, 46 BTRFS_QGROUP_OPER_SUB_SHARED,
47 BTRFS_QGROUP_OPER_SUB_SUBTREE,
47}; 48};
48 49
49struct btrfs_qgroup_operation { 50struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67b48b9a03e0..c4124de4435b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1665,6 +1665,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1665 return 0; 1665 return 0;
1666} 1666}
1667 1667
1668/*
1669 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
1670 *
1671 * If there's a redundant raid level at DATA block groups, use the respective
1672 * multiplier to scale the sizes.
1673 *
1674 * Unused device space usage is based on simulating the chunk allocator
1675 * algorithm that respects the device sizes, order of allocations and the
1676 * 'alloc_start' value, this is a close approximation of the actual use but
1677 * there are other factors that may change the result (like a new metadata
1678 * chunk).
1679 *
1680 * FIXME: not accurate for mixed block groups, total and free/used are ok,
1681 * available appears slightly larger.
1682 */
1668static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1683static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1669{ 1684{
1670 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 1685 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1675,6 +1690,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1675 u64 total_free_data = 0; 1690 u64 total_free_data = 0;
1676 int bits = dentry->d_sb->s_blocksize_bits; 1691 int bits = dentry->d_sb->s_blocksize_bits;
1677 __be32 *fsid = (__be32 *)fs_info->fsid; 1692 __be32 *fsid = (__be32 *)fs_info->fsid;
1693 unsigned factor = 1;
1694 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1678 int ret; 1695 int ret;
1679 1696
1680 /* holding chunk_muext to avoid allocating new chunks */ 1697 /* holding chunk_muext to avoid allocating new chunks */
@@ -1682,30 +1699,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1682 rcu_read_lock(); 1699 rcu_read_lock();
1683 list_for_each_entry_rcu(found, head, list) { 1700 list_for_each_entry_rcu(found, head, list) {
1684 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1701 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1702 int i;
1703
1685 total_free_data += found->disk_total - found->disk_used; 1704 total_free_data += found->disk_total - found->disk_used;
1686 total_free_data -= 1705 total_free_data -=
1687 btrfs_account_ro_block_groups_free_space(found); 1706 btrfs_account_ro_block_groups_free_space(found);
1707
1708 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1709 if (!list_empty(&found->block_groups[i])) {
1710 switch (i) {
1711 case BTRFS_RAID_DUP:
1712 case BTRFS_RAID_RAID1:
1713 case BTRFS_RAID_RAID10:
1714 factor = 2;
1715 }
1716 }
1717 }
1688 } 1718 }
1689 1719
1690 total_used += found->disk_used; 1720 total_used += found->disk_used;
1691 } 1721 }
1722
1692 rcu_read_unlock(); 1723 rcu_read_unlock();
1693 1724
1694 buf->f_namelen = BTRFS_NAME_LEN; 1725 buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
1695 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1726 buf->f_blocks >>= bits;
1696 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1727 buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
1697 buf->f_bsize = dentry->d_sb->s_blocksize; 1728
1698 buf->f_type = BTRFS_SUPER_MAGIC; 1729 /* Account global block reserve as used, it's in logical size already */
1730 spin_lock(&block_rsv->lock);
1731 buf->f_bfree -= block_rsv->size >> bits;
1732 spin_unlock(&block_rsv->lock);
1733
1699 buf->f_bavail = total_free_data; 1734 buf->f_bavail = total_free_data;
1700 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1735 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1701 if (ret) { 1736 if (ret) {
1702 mutex_unlock(&fs_info->chunk_mutex); 1737 mutex_unlock(&fs_info->chunk_mutex);
1703 return ret; 1738 return ret;
1704 } 1739 }
1705 buf->f_bavail += total_free_data; 1740 buf->f_bavail += div_u64(total_free_data, factor);
1706 buf->f_bavail = buf->f_bavail >> bits; 1741 buf->f_bavail = buf->f_bavail >> bits;
1707 mutex_unlock(&fs_info->chunk_mutex); 1742 mutex_unlock(&fs_info->chunk_mutex);
1708 1743
1744 buf->f_type = BTRFS_SUPER_MAGIC;
1745 buf->f_bsize = dentry->d_sb->s_blocksize;
1746 buf->f_namelen = BTRFS_NAME_LEN;
1747
1709 /* We treat it as constant endianness (it doesn't matter _which_) 1748 /* We treat it as constant endianness (it doesn't matter _which_)
1710 because we want the fsid to come out the same whether mounted 1749 because we want the fsid to come out the same whether mounted
1711 on a big-endian or little-endian host */ 1750 on a big-endian or little-endian host */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5f379affdf23..d89c6d3542ca 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
218 spin_lock_init(&cur_trans->delayed_refs.lock); 218 spin_lock_init(&cur_trans->delayed_refs.lock);
219 219
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 220 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->ordered_operations);
222 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
223 INIT_LIST_HEAD(&cur_trans->switch_commits); 222 INIT_LIST_HEAD(&cur_trans->switch_commits);
224 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1612 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1611 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1613} 1612}
1614 1613
1615static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1616 struct btrfs_root *root)
1617{
1618 int ret;
1619
1620 ret = btrfs_run_delayed_items(trans, root);
1621 if (ret)
1622 return ret;
1623
1624 /*
1625 * rename don't use btrfs_join_transaction, so, once we
1626 * set the transaction to blocked above, we aren't going
1627 * to get any new ordered operations. We can safely run
1628 * it here and no for sure that nothing new will be added
1629 * to the list
1630 */
1631 ret = btrfs_run_ordered_operations(trans, root, 1);
1632
1633 return ret;
1634}
1635
1636static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1614static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1637{ 1615{
1638 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1653 struct btrfs_transaction *prev_trans = NULL; 1631 struct btrfs_transaction *prev_trans = NULL;
1654 int ret; 1632 int ret;
1655 1633
1656 ret = btrfs_run_ordered_operations(trans, root, 0);
1657 if (ret) {
1658 btrfs_abort_transaction(trans, root, ret);
1659 btrfs_end_transaction(trans, root);
1660 return ret;
1661 }
1662
1663 /* Stop the commit early if ->aborted is set */ 1634 /* Stop the commit early if ->aborted is set */
1664 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1635 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1665 ret = cur_trans->aborted; 1636 ret = cur_trans->aborted;
@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1740 if (ret) 1711 if (ret)
1741 goto cleanup_transaction; 1712 goto cleanup_transaction;
1742 1713
1743 ret = btrfs_flush_all_pending_stuffs(trans, root); 1714 ret = btrfs_run_delayed_items(trans, root);
1744 if (ret) 1715 if (ret)
1745 goto cleanup_transaction; 1716 goto cleanup_transaction;
1746 1717
@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1748 extwriter_counter_read(cur_trans) == 0); 1719 extwriter_counter_read(cur_trans) == 0);
1749 1720
1750 /* some pending stuffs might be added after the previous flush. */ 1721 /* some pending stuffs might be added after the previous flush. */
1751 ret = btrfs_flush_all_pending_stuffs(trans, root); 1722 ret = btrfs_run_delayed_items(trans, root);
1752 if (ret) 1723 if (ret)
1753 goto cleanup_transaction; 1724 goto cleanup_transaction;
1754 1725
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..579be51b27e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
55 wait_queue_head_t writer_wait; 55 wait_queue_head_t writer_wait;
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 58 struct list_head pending_chunks;
60 struct list_head switch_commits; 59 struct list_head switch_commits;
61 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); 57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
59 u64 *old_aux, gfp_t gfp_mask); 59 u64 *old_aux, gfp_t gfp_mask);
60
61/* just like ulist_add_merge() but take a pointer for the aux data */
62static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
63 void **old_aux, gfp_t gfp_mask)
64{
65#if BITS_PER_LONG == 32
66 u64 old64 = (uintptr_t)*old_aux;
67 int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
68 *old_aux = (void *)((uintptr_t)old64);
69 return ret;
70#else
71 return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
72#endif
73}
74
60struct ulist_node *ulist_next(struct ulist *ulist, 75struct ulist_node *ulist_next(struct ulist *ulist,
61 struct ulist_iterator *uiter); 76 struct ulist_iterator *uiter);
62 77