author		Linus Torvalds <torvalds@linux-foundation.org>	2012-07-26 17:48:55 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-26 17:48:55 -0400
commit		e2aed8dfa50bb061747eeb14e6af099554a03b76 (patch)
tree		900c96a2dfe7195e56ec3c1f027418029d0a8444 /fs
parent		476525004ac7e2f990b6956efcd44d0780c2ab4c (diff)
parent		b24baf6917a376420d535548e1f88744028bcf24 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull large btrfs update from Chris Mason:
 "This pull request is very large, and the two main features in here
  have been under testing/devel for quite a while.

  We have subvolume quotas from the strato developers. This enables full
  tracking of how many blocks are allocated to each subvolume (and all
  snapshots) and you can set limits on a per-subvolume basis. You can
  also create quota groups and toss multiple subvolumes into a big
  group. It's everything you need to be a web hosting company and give
  each user their own subvolume.

  The userland side of the quotas is being refreshed, they'll send out
  details on where to grab it soon.

  Next is the kernel side of btrfs send/receive from Alexander Block.
  This leverages the same infrastructure as the quota code to figure out
  relationships between blocks and their owners. It can then compute the
  difference between two snapshots and sends the diffs in a neutral
  format into userland.

  The basic model:

	create a snapshot
	send that snapshot as the initial backup
	make changes
	create a second snapshot
	send the incremental as a backup
	delete the first snapshot
	(use the second snapshot for the next incremental)

  The receive portion is all in userland, and in the 'next' branch of my
  btrfs-progs repo.

  There's still some work to do in terms of optimizing the send side
  from kernel to userland. The really important part is figuring out how
  two snapshots are different, and this is where we are concentrating
  right now. The initial send of a dataset is a little slower than tar,
  but the incremental sends are dramatically faster than what rsync can
  do.

  On top of all of that, we have a nice queue of fixes, cleanups and
  optimizations."

Fix up trivial modify/del conflict in fs/btrfs/ioctl.c

Also fix up semantic conflict in fs/btrfs/send.c: the interface to
dentry_open() changed in commit 765927b2d508 ("switch dentry_open() to
struct path, make it grab references itself"), and since it now grabs
whatever references it needs, we should no longer do the mntget() on
the mnt (and we need to dput() the dentry reference we took).

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (65 commits)
  Btrfs: uninit variable fixes in send/receive
  Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive
  Btrfs: add btrfs_compare_trees function
  Btrfs: introduce subvol uuids and times
  Btrfs: make iref_to_path non static
  Btrfs: add a barrier before a waitqueue_active check
  Btrfs: call the ordered free operation without any locks held
  Btrfs: Check INCOMPAT flags on remount and add helper function
  Btrfs: add helper for tree enumeration
  btrfs: allow cross-subvolume file clone
  Btrfs: improve multi-thread buffer read
  Btrfs: make btrfs's allocation smoothly with preallocation
  Btrfs: lock the transition from dirty to writeback for an eb
  Btrfs: fix potential race in extent buffer freeing
  Btrfs: don't return true in releasepage unless we actually freed the eb
  Btrfs: suppress printk() if all device I/O stats are zero
  Btrfs: remove unwanted printk() for btrfs device I/O stats
  Btrfs: rewrite BTRFS_SETGET_FUNCS
  Btrfs: zero unused bytes in inode item
  Btrfs: kill free_space pointer from inode structure
  ...

Conflicts:
	fs/btrfs/ioctl.c
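The send side described above is exposed to userland through the new BTRFS_IOC_SEND ioctl (see the send.c/send.h additions in the diffstat below). As orientation only, here is a hedged, minimal userland sketch of driving it: the struct layout mirrors what this series adds to fs/btrfs/ioctl.h, but treat the exact ioctl number and fields as assumptions and use the btrfs-progs send/receive tooling for real backups.

/* hedged sketch: stream a read-only snapshot to stdout via BTRFS_IOC_SEND */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_send_args {
	__s64 send_fd;			/* in: fd the stream is written to */
	__u64 clone_sources_count;	/* in */
	__u64 *clone_sources;		/* in */
	__u64 parent_root;		/* in: 0 = full send, else parent snapshot's root objectid */
	__u64 flags;			/* in */
	__u64 reserved[4];		/* in */
};
/* assumed ioctl number; check your kernel's fs/btrfs/ioctl.h */
#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)

int main(int argc, char **argv)
{
	struct btrfs_ioctl_send_args args = {0};
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <snapshot-dir> > backup.stream\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);	/* root directory of the snapshot */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	args.send_fd = STDOUT_FILENO;	/* neutral-format stream goes to stdout */
	if (ioctl(fd, BTRFS_IOC_SEND, &args) < 0) {
		perror("BTRFS_IOC_SEND");
		return 1;
	}
	close(fd);
	return 0;
}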
Diffstat (limited to 'fs')
-rw-r--r--	fs/btrfs/Makefile	2
-rw-r--r--	fs/btrfs/async-thread.c	9
-rw-r--r--	fs/btrfs/backref.c	40
-rw-r--r--	fs/btrfs/backref.h	7
-rw-r--r--	fs/btrfs/btrfs_inode.h	14
-rw-r--r--	fs/btrfs/check-integrity.c	7
-rw-r--r--	fs/btrfs/ctree.c	775
-rw-r--r--	fs/btrfs/ctree.h	368
-rw-r--r--	fs/btrfs/delayed-inode.c	23
-rw-r--r--	fs/btrfs/delayed-inode.h	2
-rw-r--r--	fs/btrfs/delayed-ref.c	56
-rw-r--r--	fs/btrfs/delayed-ref.h	62
-rw-r--r--	fs/btrfs/disk-io.c	150
-rw-r--r--	fs/btrfs/disk-io.h	6
-rw-r--r--	fs/btrfs/extent-tree.c	358
-rw-r--r--	fs/btrfs/extent_io.c	58
-rw-r--r--	fs/btrfs/file-item.c	4
-rw-r--r--	fs/btrfs/free-space-cache.c	2
-rw-r--r--	fs/btrfs/inode.c	42
-rw-r--r--	fs/btrfs/ioctl.c	467
-rw-r--r--	fs/btrfs/ioctl.h	97
-rw-r--r--	fs/btrfs/locking.c	14
-rw-r--r--	fs/btrfs/qgroup.c	1571
-rw-r--r--	fs/btrfs/relocation.c	3
-rw-r--r--	fs/btrfs/root-tree.c	107
-rw-r--r--	fs/btrfs/send.c	4571
-rw-r--r--	fs/btrfs/send.h	133
-rw-r--r--	fs/btrfs/struct-funcs.c	196
-rw-r--r--	fs/btrfs/super.c	28
-rw-r--r--	fs/btrfs/transaction.c	101
-rw-r--r--	fs/btrfs/transaction.h	12
-rw-r--r--	fs/btrfs/tree-log.c	4
-rw-r--r--	fs/btrfs/volumes.c	25
-rw-r--r--	fs/btrfs/volumes.h	4
-rw-r--r--	fs/inode.c	2
35 files changed, 8689 insertions, 631 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2befae..d7fcdba141a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o
+	   reada.o backref.o ulist.o qgroup.o send.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 42704149b72..58b7d14b08e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers,
 
 		work->ordered_func(work);
 
-		/* now take the lock again and call the freeing code */
+		/* now take the lock again and drop our item from the list */
 		spin_lock(&workers->order_lock);
 		list_del(&work->order_list);
+		spin_unlock(&workers->order_lock);
+
+		/*
+		 * we don't want to call the ordered free functions
+		 * with the lock held though
+		 */
 		work->ordered_free(work);
+		spin_lock(&workers->order_lock);
 	}
 
 	spin_unlock(&workers->order_lock);
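The async-thread hunk above is a classic lock-ordering fix: ordered_free() may free the work item or block, so it must not run under order_lock. A hedged userspace sketch of the same pattern, with hypothetical names (not kernel code):

#include <pthread.h>
#include <stdlib.h>

struct work {
	struct work *next;
	void (*free_fn)(struct work *w);
};

static pthread_mutex_t order_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *head;

/* walk the list; never hold order_lock across free_fn, which may free w */
static void run_ordered_completions(void)
{
	pthread_mutex_lock(&order_lock);
	while (head) {
		struct work *w = head;

		head = w->next;				/* unlink while locked */
		pthread_mutex_unlock(&order_lock);
		w->free_fn(w);				/* may free w or sleep */
		pthread_mutex_lock(&order_lock);	/* re-take for the next item */
	}
	pthread_mutex_unlock(&order_lock);
}

static void free_work(struct work *w)
{
	free(w);
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		struct work *w = malloc(sizeof(*w));

		if (!w)
			return 1;
		w->free_fn = free_work;
		w->next = head;
		head = w;
	}
	run_ordered_completions();
	return 0;
}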
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a383c18e74e..a256f3b2a84 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
-			     u64 delayed_ref_seq, u64 time_seq,
-			     struct ulist *refs, struct ulist *roots,
-			     const u64 *extent_item_pos)
+			     u64 time_seq, struct ulist *refs,
+			     struct ulist *roots, const u64 *extent_item_pos)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -837,7 +836,7 @@ again:
 			btrfs_put_delayed_ref(&head->node);
 			goto again;
 		}
-		ret = __add_delayed_refs(head, delayed_ref_seq,
+		ret = __add_delayed_refs(head, time_seq,
 					 &prefs_delayed);
 		mutex_unlock(&head->mutex);
 		if (ret) {
@@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks)
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 delayed_ref_seq, u64 time_seq,
-				struct ulist **leafs,
+				u64 time_seq, struct ulist **leafs,
 				const u64 *extent_item_pos)
 {
 	struct ulist *tmp;
@@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+	ret = find_parent_nodes(trans, fs_info, bytenr,
 				time_seq, *leafs, tmp, extent_item_pos);
 	ulist_free(tmp);
 
@@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 delayed_ref_seq, u64 time_seq,
-			 struct ulist **roots)
+			 u64 time_seq, struct ulist **roots)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
@@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
-		ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+		ret = find_parent_nodes(trans, fs_info, bytenr,
 					time_seq, tmp, *roots, NULL);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
@@ -1125,10 +1122,10 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
  * required for the path to fit into the buffer. in that case, the returned
  * value will be smaller than dest. callers must check this!
  */
-static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			  struct btrfs_inode_ref *iref,
-			  struct extent_buffer *eb_in, u64 parent,
-			  char *dest, u32 size)
+char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
 {
 	u32 len;
 	int slot;
@@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list seq_elem = {};
 	struct seq_list tree_mod_seq_elem = {};
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;
-	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
 	pr_debug("resolving all inodes for extent %llu\n",
 			extent_item_objectid);
@@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-
-		delayed_refs = &trans->transaction->delayed_refs;
-		spin_lock(&delayed_refs->lock);
-		btrfs_get_delayed_seq(delayed_refs, &seq_elem);
-		spin_unlock(&delayed_refs->lock);
 		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 	}
 
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-				   seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+				   tree_mod_seq_elem.seq, &refs,
 				   &extent_item_pos);
 	if (ret)
 		goto out;
@@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	ULIST_ITER_INIT(&ref_uiter);
 	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
 		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
-					   seq_elem.seq,
-					   tree_mod_seq_elem.seq, &roots);
+					   tree_mod_seq_elem.seq, &roots);
 		if (ret)
 			break;
 		ULIST_ITER_INIT(&root_uiter);
@@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 out:
 	if (!search_commit_root) {
 		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-		btrfs_put_delayed_seq(delayed_refs, &seq_elem);
 		btrfs_end_transaction(trans, fs_info->extent_root);
 	}
 
@@ -1543,7 +1531,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 			ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
+	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
 			      inum, fspath_min, bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac7b79..032f4dc7eab 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -21,6 +21,7 @@
 
 #include "ioctl.h"
 #include "ulist.h"
+#include "extent_io.h"
 
 #define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
 
@@ -58,8 +59,10 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 delayed_ref_seq, u64 time_seq,
-			 struct ulist **roots);
+			 u64 time_seq, struct ulist **roots);
+char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref, struct extent_buffer *eb,
+			 u64 parent, char *dest, u32 size);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 12394a90d60..5b2ad6bc4fe 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -87,9 +87,6 @@ struct btrfs_inode {
 	/* node for the red-black tree that links inodes in subvolume root */
 	struct rb_node rb_node;
 
-	/* the space_info for where this inode's data allocations are done */
-	struct btrfs_space_info *space_info;
-
 	unsigned long runtime_flags;
 
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -191,11 +188,14 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
-static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
-					     struct inode *inode)
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
 {
-	if (root == root->fs_info->tree_root ||
-	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (root == root->fs_info->tree_root &&
+	    btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+		return true;
+	if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
 		return true;
 	return false;
 }
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index da6e9364a5e..9197e2e3340 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1032,6 +1032,7 @@ continue_with_current_leaf_stack_frame:
 		struct btrfs_disk_key *disk_key;
 		u8 type;
 		u32 item_offset;
+		u32 item_size;
 
 		if (disk_item_offset + sizeof(struct btrfs_item) >
 		    sf->block_ctx->len) {
@@ -1047,6 +1048,7 @@ leaf_item_out_of_bounce_error:
 					     disk_item_offset,
 					     sizeof(struct btrfs_item));
 		item_offset = le32_to_cpu(disk_item.offset);
+		item_size = le32_to_cpu(disk_item.size);
 		disk_key = &disk_item.key;
 		type = disk_key->type;
 
@@ -1057,14 +1059,13 @@ leaf_item_out_of_bounce_error:
 
 			root_item_offset = item_offset +
 					offsetof(struct btrfs_leaf, items);
-			if (root_item_offset +
-			    sizeof(struct btrfs_root_item) >
+			if (root_item_offset + item_size >
 			    sf->block_ctx->len)
 				goto leaf_item_out_of_bounce_error;
 			btrfsic_read_from_block_data(
 				sf->block_ctx, &root_item,
 				root_item_offset,
-				sizeof(struct btrfs_root_item));
+				item_size);
 			next_bytenr = le64_to_cpu(root_item.bytenr);
 
 			sf->error =
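This check-integrity fix matters because the root item grows in this series (see the btrfs_root_item changes in the ctree.h diff below): older filesystems carry shorter root items, so bounds checks and copies must use the size recorded in the item header rather than sizeof() the current struct. A hedged userspace sketch of that pattern, with hypothetical types:

/* hedged sketch of the bounds-check pattern above; not kernel code */
#include <stdint.h>
#include <string.h>

struct item_header {
	uint32_t offset;	/* byte offset of the item payload in the block */
	uint32_t size;		/* on-disk payload size, may differ from sizeof(struct) */
};

/* returns 0 on success, -1 if the item would run past the block */
static int read_item(const uint8_t *block, size_t block_len,
		     const struct item_header *hdr,
		     void *dst, size_t dst_len)
{
	if ((size_t)hdr->offset + (size_t)hdr->size > block_len)
		return -1;			/* out of bounds */
	/* copy at most what is on disk and what the caller can hold */
	memcpy(dst, block + hdr->offset,
	       hdr->size < dst_len ? hdr->size : dst_len);
	return 0;
}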
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b390058..9d7621f271f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -321,7 +321,7 @@ struct tree_mod_root {
 struct tree_mod_elem {
 	struct rb_node node;
 	u64 index;		/* shifted logical */
-	struct seq_list elem;
+	u64 seq;
 	enum mod_log_op op;
 
 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +341,50 @@ struct tree_mod_elem {
 	struct tree_mod_root old_root;
 };
 
-static inline void
-__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
 {
-	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
-	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+	read_lock(&fs_info->tree_mod_log_lock);
 }
 
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
-			    struct seq_list *elem)
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+	read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+	write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+	write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem)
 {
-	elem->flags = 1;
+	u64 seq;
+
+	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
-	__get_tree_mod_seq(fs_info, elem);
+	if (!elem->seq) {
+		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+	}
+	seq = btrfs_inc_tree_mod_seq(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tree_mod_log_write_unlock(fs_info);
+
+	return seq;
 }
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	if (!seq_putting)
 		return;
 
-	BUG_ON(!(elem->flags & 1));
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	list_del(&elem->list);
+	elem->seq = 0;
 
 	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
-		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+		if (cur_elem->seq < min_seq) {
 			if (seq_putting > cur_elem->seq) {
 				/*
 				 * blocker with lower sequence number exists, we
 				 * cannot remove anything from the log
 				 */
-				goto out;
+				spin_unlock(&fs_info->tree_mod_seq_lock);
+				return;
 			}
 			min_seq = cur_elem->seq;
 		}
 	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	/*
+	 * we removed the lowest blocker from the blocker list, so there may be
+	 * more processible delayed refs.
+	 */
+	wake_up(&fs_info->tree_mod_seq_wait);
 
 	/*
 	 * anything that's lower than the lowest existing (read: blocked)
 	 * sequence number can be removed from the tree.
 	 */
-	write_lock(&fs_info->tree_mod_log_lock);
+	tree_mod_log_write_lock(fs_info);
 	tm_root = &fs_info->tree_mod_log;
 	for (node = rb_first(tm_root); node; node = next) {
 		next = rb_next(node);
 		tm = container_of(node, struct tree_mod_elem, node);
-		if (tm->elem.seq > min_seq)
+		if (tm->seq > min_seq)
 			continue;
 		rb_erase(node, tm_root);
-		list_del(&tm->elem.list);
 		kfree(tm);
 	}
-	write_unlock(&fs_info->tree_mod_log_lock);
-out:
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tree_mod_log_write_unlock(fs_info);
 }
 
 /*
@@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 	struct rb_node **new;
 	struct rb_node *parent = NULL;
 	struct tree_mod_elem *cur;
-	int ret = 0;
 
-	BUG_ON(!tm || !tm->elem.seq);
+	BUG_ON(!tm || !tm->seq);
 
-	write_lock(&fs_info->tree_mod_log_lock);
 	tm_root = &fs_info->tree_mod_log;
 	new = &tm_root->rb_node;
 	while (*new) {
@@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 			new = &((*new)->rb_left);
 		else if (cur->index > tm->index)
 			new = &((*new)->rb_right);
-		else if (cur->elem.seq < tm->elem.seq)
+		else if (cur->seq < tm->seq)
 			new = &((*new)->rb_left);
-		else if (cur->elem.seq > tm->elem.seq)
+		else if (cur->seq > tm->seq)
 			new = &((*new)->rb_right);
 		else {
 			kfree(tm);
-			ret = -EEXIST;
-			goto unlock;
+			return -EEXIST;
 		}
 	}
 
 	rb_link_node(&tm->node, parent, new);
 	rb_insert_color(&tm->node, tm_root);
-unlock:
-	write_unlock(&fs_info->tree_mod_log_lock);
-	return ret;
+	return 0;
 }
 
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 				    struct extent_buffer *eb) {
 	smp_mb();
 	if (list_empty(&(fs_info)->tree_mod_seq_list))
 		return 1;
-	if (!eb)
-		return 0;
-	if (btrfs_header_level(eb) == 0)
+	if (eb && btrfs_header_level(eb) == 0)
+		return 1;
+
+	tree_mod_log_write_lock(fs_info);
+	if (list_empty(&fs_info->tree_mod_seq_list)) {
+		/*
+		 * someone emptied the list while we were waiting for the lock.
+		 * we must not add to the list when no blocker exists.
+		 */
+		tree_mod_log_write_unlock(fs_info);
 		return 1;
+	}
+
 	return 0;
 }
 
 /*
- * This allocates memory and gets a tree modification sequence number when
- * needed.
+ * This allocates memory and gets a tree modification sequence number.
  *
- * Returns 0 when no sequence number is needed, < 0 on error.
- * Returns 1 when a sequence number was added. In this case,
- * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
- * after inserting into the rb tree.
+ * Returns <0 on error.
+ * Returns >0 (the added sequence number) on success.
  */
 static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
 				 struct tree_mod_elem **tm_ret)
 {
 	struct tree_mod_elem *tm;
-	int seq;
 
-	if (tree_mod_dont_log(fs_info, NULL))
-		return 0;
-
-	tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+	/*
+	 * once we switch from spin locks to something different, we should
+	 * honor the flags parameter here.
+	 */
+	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
 	if (!tm)
 		return -ENOMEM;
 
-	tm->elem.flags = 0;
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	if (list_empty(&fs_info->tree_mod_seq_list)) {
-		/*
-		 * someone emptied the list while we were waiting for the lock.
-		 * we must not add to the list, because no blocker exists. items
-		 * are removed from the list only when the existing blocker is
-		 * removed from the list.
-		 */
-		kfree(tm);
-		seq = 0;
-		spin_unlock(&fs_info->tree_mod_seq_lock);
-	} else {
-		__get_tree_mod_seq(fs_info, &tm->elem);
-		seq = tm->elem.seq;
-	}
-
-	return seq;
+	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+	return tm->seq;
 }
 
-static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+static inline int
+__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 			     struct extent_buffer *eb, int slot,
 			     enum mod_log_op op, gfp_t flags)
 {
-	struct tree_mod_elem *tm;
 	int ret;
+	struct tree_mod_elem *tm;
 
 	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret <= 0)
+	if (ret < 0)
 		return ret;
 
 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
 	tm->slot = slot;
 	tm->generation = btrfs_node_ptr_generation(eb, slot);
 
-	ret = __tree_mod_log_insert(fs_info, tm);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+			     struct extent_buffer *eb, int slot,
+			     enum mod_log_op op, gfp_t flags)
+{
+	int ret;
+
+	if (tree_mod_dont_log(fs_info, eb))
+		return 0;
+
+	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+
+	tree_mod_log_write_unlock(fs_info);
 	return ret;
 }
 
@@ -543,6 +583,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 }
 
 static noinline int
+tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
+			       struct extent_buffer *eb, int slot,
+			       enum mod_log_op op)
+{
+	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 			 struct extent_buffer *eb, int dst_slot, int src_slot,
 			 int nr_items, gfp_t flags)
@@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 		return 0;
 
 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
-		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
 				MOD_LOG_KEY_REMOVE_WHILE_MOVING);
 		BUG_ON(ret < 0);
 	}
 
 	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret <= 0)
-		return ret;
+	if (ret < 0)
+		goto out;
 
 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
 	tm->slot = src_slot;
@@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	tm->op = MOD_LOG_MOVE_KEYS;
 
 	ret = __tree_mod_log_insert(fs_info, tm);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+	tree_mod_log_write_unlock(fs_info);
 	return ret;
 }
 
+static inline void
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+	int i;
+	u32 nritems;
+	int ret;
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = nritems - 1; i >= 0; i--) {
+		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
+				MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+		BUG_ON(ret < 0);
+	}
+}
+
 static noinline int
 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 			 struct extent_buffer *old_root,
@@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	struct tree_mod_elem *tm;
 	int ret;
 
+	if (tree_mod_dont_log(fs_info, NULL))
+		return 0;
+
+	__tree_mod_log_free_eb(fs_info, old_root);
+
 	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret <= 0)
-		return ret;
+	if (ret < 0)
+		goto out;
 
 	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
 	tm->old_root.logical = old_root->start;
@@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	tm->op = MOD_LOG_ROOT_REPLACE;
 
 	ret = __tree_mod_log_insert(fs_info, tm);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+	tree_mod_log_write_unlock(fs_info);
 	return ret;
 }
 
@@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 	struct tree_mod_elem *found = NULL;
 	u64 index = start >> PAGE_CACHE_SHIFT;
 
-	read_lock(&fs_info->tree_mod_log_lock);
+	tree_mod_log_read_lock(fs_info);
 	tm_root = &fs_info->tree_mod_log;
 	node = tm_root->rb_node;
 	while (node) {
@@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 			node = node->rb_left;
 		} else if (cur->index > index) {
 			node = node->rb_right;
-		} else if (cur->elem.seq < min_seq) {
+		} else if (cur->seq < min_seq) {
 			node = node->rb_left;
 		} else if (!smallest) {
 			/* we want the node with the highest seq */
 			if (found)
-				BUG_ON(found->elem.seq > cur->elem.seq);
+				BUG_ON(found->seq > cur->seq);
 			found = cur;
 			node = node->rb_left;
-		} else if (cur->elem.seq > min_seq) {
+		} else if (cur->seq > min_seq) {
 			/* we want the node with the smallest seq */
 			if (found)
-				BUG_ON(found->elem.seq < cur->elem.seq);
+				BUG_ON(found->seq < cur->seq);
 			found = cur;
 			node = node->rb_right;
 		} else {
@@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 			break;
 		}
 	}
-	read_unlock(&fs_info->tree_mod_log_lock);
+	tree_mod_log_read_unlock(fs_info);
 
 	return found;
 }
@@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
 }
 
-static inline void
+static noinline void
 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 		     struct extent_buffer *src, unsigned long dst_offset,
 		     unsigned long src_offset, int nr_items)
@@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return;
 
-	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
+		tree_mod_log_write_unlock(fs_info);
 		return;
+	}
 
-	/* speed this up by single seq for all operations? */
 	for (i = 0; i < nr_items; i++) {
-		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
-					      MOD_LOG_KEY_REMOVE);
+		ret = tree_mod_log_insert_key_locked(fs_info, src,
+						     i + src_offset,
+						     MOD_LOG_KEY_REMOVE);
 		BUG_ON(ret < 0);
-		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
-					      MOD_LOG_KEY_ADD);
+		ret = tree_mod_log_insert_key_locked(fs_info, dst,
+						     i + dst_offset,
+						     MOD_LOG_KEY_ADD);
 		BUG_ON(ret < 0);
 	}
+
+	tree_mod_log_write_unlock(fs_info);
 }
 
 static inline void
@@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	BUG_ON(ret < 0);
 }
 
-static inline void
+static noinline void
 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 			  struct extent_buffer *eb,
 			  struct btrfs_disk_key *disk_key, int slot, int atomic)
@@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 	BUG_ON(ret < 0);
 }
 
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
-				 struct extent_buffer *eb)
+static noinline void
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 {
-	int i;
-	int ret;
-	u32 nritems;
-
 	if (tree_mod_dont_log(fs_info, eb))
 		return;
 
-	nritems = btrfs_header_nritems(eb);
-	for (i = nritems - 1; i >= 0; i--) {
-		ret = tree_mod_log_insert_key(fs_info, eb, i,
-					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
-		BUG_ON(ret < 0);
-	}
+	__tree_mod_log_free_eb(fs_info, eb);
+
+	tree_mod_log_write_unlock(fs_info);
 }
 
-static inline void
+static noinline void
 tree_mod_log_set_root_pointer(struct btrfs_root *root,
 			      struct extent_buffer *new_root_node)
 {
 	int ret;
-	tree_mod_log_free_eb(root->fs_info, root->node);
 	ret = tree_mod_log_insert_root(root->fs_info, root->node,
 				       new_root_node, GFP_NOFS);
 	BUG_ON(ret < 0);
@@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
 	n = btrfs_header_nritems(eb);
-	while (tm && tm->elem.seq >= time_seq) {
+	while (tm && tm->seq >= time_seq) {
 		/*
 		 * all the operations are recorded with the operator used for
 		 * the modification. as we're going backwards, we do the
@@ -2722,6 +2789,80 @@ done:
 }
 
 /*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any)
+{
+	int ret;
+	struct extent_buffer *leaf;
+
+again:
+	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+	if (ret <= 0)
+		return ret;
+	/*
+	 * a return value of 1 means the path is at the position where the
+	 * item should be inserted. Normally this is the next bigger item,
+	 * but in case the previous item is the last in a leaf, path points
+	 * to the first free slot in the previous leaf, i.e. at an invalid
+	 * item.
+	 */
+	leaf = p->nodes[0];
+
+	if (find_higher) {
+		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, p);
+			if (ret <= 0)
+				return ret;
+			if (!return_any)
+				return 1;
+			/*
+			 * no higher item found, return the next
+			 * lower instead
+			 */
+			return_any = 0;
+			find_higher = 0;
+			btrfs_release_path(p);
+			goto again;
+		}
+	} else {
+		if (p->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, p);
+			if (ret < 0)
+				return ret;
+			if (!ret) {
+				p->slots[0] = btrfs_header_nritems(leaf) - 1;
+				return 0;
+			}
+			if (!return_any)
+				return 1;
+			/*
+			 * no lower item found, return the next
+			 * higher instead
+			 */
+			return_any = 0;
+			find_higher = 1;
+			btrfs_release_path(p);
+			goto again;
+		} else {
+			--p->slots[0];
+		}
+	}
+	return 0;
+}
+
+/*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
  * This is used after shifting pointers to the left, so it stops
@@ -4931,6 +5072,431 @@ out:
 	return ret;
 }
 
+static void tree_move_down(struct btrfs_root *root,
+			   struct btrfs_path *path,
+			   int *level, int root_level)
+{
+	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+					path->slots[*level]);
+	path->slots[*level - 1] = 0;
+	(*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    int *level, int root_level)
+{
+	int ret = 0;
+	int nritems;
+	nritems = btrfs_header_nritems(path->nodes[*level]);
+
+	path->slots[*level]++;
+
+	while (path->slots[*level] == nritems) {
+		if (*level == root_level)
+			return -1;
+
+		/* move upnext */
+		path->slots[*level] = 0;
+		free_extent_buffer(path->nodes[*level]);
+		path->nodes[*level] = NULL;
+		(*level)++;
+		path->slots[*level]++;
+
+		nritems = btrfs_header_nritems(path->nodes[*level]);
+		ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_root *root,
+			struct btrfs_path *path,
+			int *level, int root_level,
+			int allow_down,
+			struct btrfs_key *key)
+{
+	int ret;
+
+	if (*level == 0 || !allow_down) {
+		ret = tree_move_next_or_upnext(root, path, level, root_level);
+	} else {
+		tree_move_down(root, path, level, root_level);
+		ret = 0;
+	}
+	if (ret >= 0) {
+		if (*level == 0)
+			btrfs_item_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+		else
+			btrfs_node_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+	}
+	return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+			     struct btrfs_path *left_path,
+			     struct btrfs_path *right_path,
+			     char *tmp_buf)
+{
+	int cmp;
+	int len1, len2;
+	unsigned long off1, off2;
+
+	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+	if (len1 != len2)
+		return 1;
+
+	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+				right_path->slots[0]);
+
+	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+	if (cmp)
+		return 1;
+	return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+			struct btrfs_root *right_root,
+			btrfs_changed_cb_t changed_cb, void *ctx)
+{
+	int ret;
+	int cmp;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_path *left_path = NULL;
+	struct btrfs_path *right_path = NULL;
+	struct btrfs_key left_key;
+	struct btrfs_key right_key;
+	char *tmp_buf = NULL;
+	int left_root_level;
+	int right_root_level;
+	int left_level;
+	int right_level;
+	int left_end_reached;
+	int right_end_reached;
+	int advance_left;
+	int advance_right;
+	u64 left_blockptr;
+	u64 right_blockptr;
+	u64 left_start_ctransid;
+	u64 right_start_ctransid;
+	u64 ctransid;
+
+	left_path = btrfs_alloc_path();
+	if (!left_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	right_path = btrfs_alloc_path();
+	if (!right_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+	if (!tmp_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	left_path->search_commit_root = 1;
+	left_path->skip_locking = 1;
+	right_path->search_commit_root = 1;
+	right_path->skip_locking = 1;
+
+	spin_lock(&left_root->root_times_lock);
+	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
+	spin_unlock(&left_root->root_times_lock);
+
+	spin_lock(&right_root->root_times_lock);
+	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
+	spin_unlock(&right_root->root_times_lock);
+
+	trans = btrfs_join_transaction(left_root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
+
+	/*
+	 * Strategy: Go to the first items of both trees. Then do
+	 *
+	 * If both trees are at level 0
+	 *   Compare keys of current items
+	 *     If left < right treat left item as new, advance left tree
+	 *       and repeat
+	 *     If left > right treat right item as deleted, advance right tree
+	 *       and repeat
+	 *     If left == right do deep compare of items, treat as changed if
+	 *       needed, advance both trees and repeat
+	 * If both trees are at the same level but not at level 0
+	 *   Compare keys of current nodes/leafs
+	 *     If left < right advance left tree and repeat
+	 *     If left > right advance right tree and repeat
+	 *     If left == right compare blockptrs of the next nodes/leafs
+	 *       If they match advance both trees but stay at the same level
+	 *         and repeat
+	 *       If they don't match advance both trees while allowing to go
+	 *         deeper and repeat
+	 * If tree levels are different
+	 *   Advance the tree that needs it and repeat
+	 *
+	 * Advancing a tree means:
+	 *   If we are at level 0, try to go to the next slot. If that's not
+	 *   possible, go one level up and repeat. Stop when we found a level
+	 *   where we could go to the next slot. We may at this point be on a
+	 *   node or a leaf.
+	 *
+	 *   If we are not at level 0 and not on shared tree blocks, go one
+	 *   level deeper.
+	 *
+	 *   If we are not at level 0 and on shared tree blocks, go one slot to
+	 *   the right if possible or go up and right.
+	 */
+
+	left_level = btrfs_header_level(left_root->commit_root);
+	left_root_level = left_level;
+	left_path->nodes[left_level] = left_root->commit_root;
+	extent_buffer_get(left_path->nodes[left_level]);
+
+	right_level = btrfs_header_level(right_root->commit_root);
+	right_root_level = right_level;
+	right_path->nodes[right_level] = right_root->commit_root;
+	extent_buffer_get(right_path->nodes[right_level]);
+
+	if (left_level == 0)
+		btrfs_item_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	else
+		btrfs_node_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	if (right_level == 0)
+		btrfs_item_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+	else
+		btrfs_node_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+
+	left_end_reached = right_end_reached = 0;
+	advance_left = advance_right = 0;
+
+	while (1) {
+		/*
+		 * We need to make sure the transaction does not get committed
+		 * while we do anything on commit roots. This means, we need to
+		 * join and leave transactions for every item that we process.
+		 */
+		if (trans && btrfs_should_end_transaction(trans, left_root)) {
+			btrfs_release_path(left_path);
+			btrfs_release_path(right_path);
+
+			ret = btrfs_end_transaction(trans, left_root);
+			trans = NULL;
+			if (ret < 0)
+				goto out;
+		}
+		/* now rejoin the transaction */
+		if (!trans) {
+			trans = btrfs_join_transaction(left_root);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				trans = NULL;
+				goto out;
+			}
+
+			spin_lock(&left_root->root_times_lock);
+			ctransid = btrfs_root_ctransid(&left_root->root_item);
+			spin_unlock(&left_root->root_times_lock);
+			if (ctransid != left_start_ctransid)
+				left_start_ctransid = 0;
+
+			spin_lock(&right_root->root_times_lock);
+			ctransid = btrfs_root_ctransid(&right_root->root_item);
+			spin_unlock(&right_root->root_times_lock);
+			if (ctransid != right_start_ctransid)
+				right_start_ctransid = 0;
+
+			if (!left_start_ctransid || !right_start_ctransid) {
+				WARN(1, KERN_WARNING
+					"btrfs: btrfs_compare_tree detected "
+					"a change in one of the trees while "
+					"iterating. This is probably a "
+					"bug.\n");
+				ret = -EIO;
+				goto out;
+			}
+
+			/*
+			 * the commit root may have changed, so start again
+			 * where we stopped
+			 */
+			left_path->lowest_level = left_level;
+			right_path->lowest_level = right_level;
+			ret = btrfs_search_slot(NULL, left_root,
+					&left_key, left_path, 0, 0);
+			if (ret < 0)
+				goto out;
+			ret = btrfs_search_slot(NULL, right_root,
+					&right_key, right_path, 0, 0);
+			if (ret < 0)
+				goto out;
+		}
+
+		if (advance_left && !left_end_reached) {
+			ret = tree_advance(left_root, left_path, &left_level,
+					left_root_level,
+					advance_left != ADVANCE_ONLY_NEXT,
+					&left_key);
+			if (ret < 0)
+				left_end_reached = ADVANCE;
+			advance_left = 0;
+		}
+		if (advance_right && !right_end_reached) {
+			ret = tree_advance(right_root, right_path, &right_level,
+					right_root_level,
+					advance_right != ADVANCE_ONLY_NEXT,
+					&right_key);
+			if (ret < 0)
+				right_end_reached = ADVANCE;
+			advance_right = 0;
+		}
+
+		if (left_end_reached && right_end_reached) {
+			ret = 0;
+			goto out;
+		} else if (left_end_reached) {
+			if (right_level == 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_right = ADVANCE;
+			continue;
+		} else if (right_end_reached) {
+			if (left_level == 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_left = ADVANCE;
+			continue;
+		}
+
+		if (left_level == 0 && right_level == 0) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_right = ADVANCE;
+			} else {
+				ret = tree_compare_item(left_root, left_path,
+						right_path, tmp_buf);
+				if (ret) {
+					ret = changed_cb(left_root, right_root,
+							left_path, right_path,
+							&left_key,
+							BTRFS_COMPARE_TREE_CHANGED,
+							ctx);
+					if (ret < 0)
+						goto out;
+				}
+				advance_left = ADVANCE;
+				advance_right = ADVANCE;
+			}
+		} else if (left_level == right_level) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				advance_right = ADVANCE;
+			} else {
+				left_blockptr = btrfs_node_blockptr(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_blockptr = btrfs_node_blockptr(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				if (left_blockptr == right_blockptr) {
+					/*
+					 * As we're on a shared block, don't
+					 * allow to go deeper.
+					 */
+					advance_left = ADVANCE_ONLY_NEXT;
+					advance_right = ADVANCE_ONLY_NEXT;
+				} else {
+					advance_left = ADVANCE;
+					advance_right = ADVANCE;
+				}
+			}
+		} else if (left_level < right_level) {
+			advance_right = ADVANCE;
+		} else {
+			advance_left = ADVANCE;
+		}
+	}
+
+out:
+	btrfs_free_path(left_path);
+	btrfs_free_path(right_path);
+	kfree(tmp_buf);
+
+	if (trans) {
+		if (!ret)
+			ret = btrfs_end_transaction(trans, left_root);
+		else
+			btrfs_end_transaction(trans, left_root);
+	}
+
+	return ret;
+}
+
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path. It looks for and returns the next key in the
@@ -5127,6 +5693,7 @@ again:
 			 * locked. To solve this situation, we give up
 			 * on our lock and cycle.
 			 */
+			free_extent_buffer(next);
 			btrfs_release_path(path);
 			cond_resched();
 			goto again;
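btrfs_compare_trees() reports everything through the changed_cb argument, one call per new, deleted, or changed key. As orientation, here is a hedged sketch of a consumer, modeled loosely on the real one (changed_cb() in fs/btrfs/send.c): the callback signature follows the btrfs_changed_cb_t typedef this series adds, but the counting logic is invented and the snippet only compiles inside the btrfs tree.

/* hypothetical stats collector for a tree compare; not from this series */
struct diff_stats {
	u64 new_items;		/* keys only in the left (new) tree */
	u64 deleted_items;	/* keys only in the right (old) tree */
	u64 changed_items;	/* same key, different item contents */
};

static int count_changed_cb(struct btrfs_root *left_root,
			    struct btrfs_root *right_root,
			    struct btrfs_path *left_path,
			    struct btrfs_path *right_path,
			    struct btrfs_key *key,
			    enum btrfs_compare_tree_result result,
			    void *ctx)
{
	struct diff_stats *stats = ctx;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		stats->new_items++;
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		stats->deleted_items++;
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		stats->changed_items++;
		break;
	}
	return 0;	/* a negative return aborts the walk */
}

/* usage sketch:
 *	struct diff_stats stats = {};
 *	ret = btrfs_compare_trees(new_root, old_root, count_changed_cb, &stats);
 * per the hunk above, NEW means a key present only in the left tree and
 * DELETED one present only in the right tree.
 */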
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b3907..adb1cd7ceb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -709,6 +712,36 @@ struct btrfs_root_item {
709 struct btrfs_disk_key drop_progress; 712 struct btrfs_disk_key drop_progress;
710 u8 drop_level; 713 u8 drop_level;
711 u8 level; 714 u8 level;
715
716 /*
717 * The following fields appear after subvol_uuids+subvol_times
718 * were introduced.
719 */
720
721 /*
722 * This generation number is used to test if the new fields are valid
723 * and up to date while reading the root item. Every time the root item
724 * is written out, the "generation" field is copied into this field. If
725 * anyone ever mounted the fs with an older kernel, we will have
726 * mismatching generation values here and thus must invalidate the
727 * new fields. See btrfs_update_root and btrfs_find_last_root for
728 * details.
729 * The offset of generation_v2 is also used as the start for the memset
730 * when invalidating the fields.
731 */
732 __le64 generation_v2;
733 u8 uuid[BTRFS_UUID_SIZE];
734 u8 parent_uuid[BTRFS_UUID_SIZE];
735 u8 received_uuid[BTRFS_UUID_SIZE];
736 __le64 ctransid; /* updated when an inode changes */
737 __le64 otransid; /* trans when created */
738 __le64 stransid; /* trans when sent. non-zero for received subvol */
739 __le64 rtransid; /* trans when received. non-zero for received subvol */
740 struct btrfs_timespec ctime;
741 struct btrfs_timespec otime;
742 struct btrfs_timespec stime;
743 struct btrfs_timespec rtime;
744 __le64 reserved[8]; /* for future */
712} __attribute__ ((__packed__)); 745} __attribute__ ((__packed__));
713 746
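The generation_v2 scheme above boils down to one memset: if generation_v2 no longer matches generation, an older kernel wrote the item last and every field from generation_v2's offset to the end of the item is wiped. A minimal user-space sketch of that rule (abbreviated struct, hypothetical helper name; the comment points at btrfs_update_root and btrfs_find_last_root for the real logic):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Abbreviated stand-in for struct btrfs_root_item; only enough fields
 * to illustrate the rule. */
struct demo_root_item {
	uint64_t generation;	/* updated on every write-out */
	/* ... pre-v2 fields elided ... */
	uint64_t generation_v2;	/* copied from generation by aware kernels */
	uint8_t  uuid[16];
	uint64_t ctransid;
	/* ... remaining v2 fields elided ... */
};

/* If an older kernel wrote the item last, generation_v2 lags behind and
 * everything from its offset onwards must be treated as stale. */
static void demo_invalidate_v2_fields(struct demo_root_item *item)
{
	if (item->generation_v2 != item->generation) {
		size_t off = offsetof(struct demo_root_item, generation_v2);

		memset((char *)item + off, 0, sizeof(*item) - off);
	}
}

int main(void)
{
	struct demo_root_item item = {
		.generation = 10,
		.generation_v2 = 7,	/* an older kernel wrote last */
		.ctransid = 99,
	};

	demo_invalidate_v2_fields(&item);
	return item.ctransid == 0 ? 0 : 1;	/* stale value was wiped */
}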
714/* 747/*
@@ -883,6 +916,72 @@ struct btrfs_block_group_item {
883 __le64 flags; 916 __le64 flags;
884} __attribute__ ((__packed__)); 917} __attribute__ ((__packed__));
885 918
919/*
920 * is subvolume quota turned on?
921 */
922#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
923/*
924 * SCANNING is set during the initialization phase
925 */
926#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1)
927/*
928 * Some qgroup entries are known to be out of date,
929 * either because the configuration has changed in a way that
930 * makes a rescan necessary, or because the fs has been mounted
931 * with a non-qgroup-aware version.
932 * Turning quota off and on again makes it inconsistent, too.
933 */
934#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
935
936#define BTRFS_QGROUP_STATUS_VERSION 1
937
938struct btrfs_qgroup_status_item {
939 __le64 version;
940 /*
941 * the generation is updated during every commit. As older
942 * versions of btrfs are not aware of qgroups, it will be
943 * possible to detect inconsistencies by checking the
944 * generation at mount time
945 */
946 __le64 generation;
947
948 /* flag definitions see above */
949 __le64 flags;
950
951 /*
952 * only used during scanning to record the progress
953 * of the scan. It contains a logical address.
954 */
955 __le64 scan;
956} __attribute__ ((__packed__));
957
958struct btrfs_qgroup_info_item {
959 __le64 generation;
960 __le64 rfer;
961 __le64 rfer_cmpr;
962 __le64 excl;
963 __le64 excl_cmpr;
964} __attribute__ ((__packed__));
965
966/* flags definition for qgroup limits */
967#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
968#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
969#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
970#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
971#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
972#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
973
974struct btrfs_qgroup_limit_item {
975 /*
976 * only updated when any of the other values change
977 */
978 __le64 flags;
979 __le64 max_rfer;
980 __le64 max_excl;
981 __le64 rsv_rfer;
982 __le64 rsv_excl;
983} __attribute__ ((__packed__));
984
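Taken together, the info item tracks current usage (referenced vs. exclusive, each also in compressed form) and the limit item holds the configured caps, where a cap only applies while its flag bit is set. A minimal sketch of an enforcement check combining the two (plain-C stand-ins, hypothetical helper name, not code from the patch):

#include <stdint.h>

#define DEMO_LIMIT_MAX_RFER (1ULL << 0)
#define DEMO_LIMIT_MAX_EXCL (1ULL << 1)

/* Combines the usage side of the info item with the caps from the
 * limit item. */
struct demo_qgroup {
	uint64_t rfer, excl;		/* current usage */
	uint64_t flags;			/* which caps are active */
	uint64_t max_rfer, max_excl;	/* configured caps */
};

/* Returns nonzero if reserving num_bytes would exceed an active cap. */
static int demo_would_overrun(const struct demo_qgroup *qg,
			      uint64_t num_bytes)
{
	if ((qg->flags & DEMO_LIMIT_MAX_RFER) &&
	    qg->rfer + num_bytes > qg->max_rfer)
		return 1;
	if ((qg->flags & DEMO_LIMIT_MAX_EXCL) &&
	    qg->excl + num_bytes > qg->max_excl)
		return 1;
	return 0;
}

int main(void)
{
	struct demo_qgroup qg = {
		.rfer = 900, .excl = 100,
		.flags = DEMO_LIMIT_MAX_RFER,
		.max_rfer = 1000,
	};

	return demo_would_overrun(&qg, 200) ? 0 : 1;	/* 1100 > 1000 */
}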
886struct btrfs_space_info { 985struct btrfs_space_info {
887 u64 flags; 986 u64 flags;
888 987
@@ -1030,6 +1129,13 @@ struct btrfs_block_group_cache {
1030 struct list_head cluster_list; 1129 struct list_head cluster_list;
1031}; 1130};
1032 1131
1132/* delayed seq elem */
1133struct seq_list {
1134 struct list_head list;
1135 u64 seq;
1136};
1137
1138/* fs_info */
1033struct reloc_control; 1139struct reloc_control;
1034struct btrfs_device; 1140struct btrfs_device;
1035struct btrfs_fs_devices; 1141struct btrfs_fs_devices;
@@ -1044,6 +1150,7 @@ struct btrfs_fs_info {
1044 struct btrfs_root *dev_root; 1150 struct btrfs_root *dev_root;
1045 struct btrfs_root *fs_root; 1151 struct btrfs_root *fs_root;
1046 struct btrfs_root *csum_root; 1152 struct btrfs_root *csum_root;
1153 struct btrfs_root *quota_root;
1047 1154
1048 /* the log root tree is a directory of all the other log roots */ 1155 /* the log root tree is a directory of all the other log roots */
1049 struct btrfs_root *log_root_tree; 1156 struct btrfs_root *log_root_tree;
@@ -1144,6 +1251,8 @@ struct btrfs_fs_info {
1144 spinlock_t tree_mod_seq_lock; 1251 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq; 1252 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list; 1253 struct list_head tree_mod_seq_list;
1254 struct seq_list tree_mod_seq_elem;
1255 wait_queue_head_t tree_mod_seq_wait;
1147 1256
1148 /* this protects tree_mod_log */ 1257 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock; 1258 rwlock_t tree_mod_log_lock;
@@ -1240,6 +1349,8 @@ struct btrfs_fs_info {
1240 */ 1349 */
1241 struct list_head space_info; 1350 struct list_head space_info;
1242 1351
1352 struct btrfs_space_info *data_sinfo;
1353
1243 struct reloc_control *reloc_ctl; 1354 struct reloc_control *reloc_ctl;
1244 1355
1245 spinlock_t delalloc_lock; 1356 spinlock_t delalloc_lock;
@@ -1296,6 +1407,29 @@ struct btrfs_fs_info {
1296#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1407#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1297 u32 check_integrity_print_mask; 1408 u32 check_integrity_print_mask;
1298#endif 1409#endif
1410 /*
1411 * quota information
1412 */
1413 unsigned int quota_enabled:1;
1414
1415 /*
1416 * quota_enabled only changes state after a commit. This holds the
1417 * next state.
1418 */
1419 unsigned int pending_quota_state:1;
1420
1421 /* is qgroup tracking in a consistent state? */
1422 u64 qgroup_flags;
1423
1424 /* holds configuration and tracking. Protected by qgroup_lock */
1425 struct rb_root qgroup_tree;
1426 spinlock_t qgroup_lock;
1427
1428 /* list of dirty qgroups to be written at next commit */
1429 struct list_head dirty_qgroups;
1430
1431 /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
1432 u64 qgroup_seq;
1299 1433
1300 /* filesystem state */ 1434 /* filesystem state */
1301 u64 fs_state; 1435 u64 fs_state;
@@ -1416,6 +1550,8 @@ struct btrfs_root {
1416 dev_t anon_dev; 1550 dev_t anon_dev;
1417 1551
1418 int force_cow; 1552 int force_cow;
1553
1554 spinlock_t root_times_lock;
1419}; 1555};
1420 1556
1421struct btrfs_ioctl_defrag_range_args { 1557struct btrfs_ioctl_defrag_range_args {
@@ -1525,6 +1661,30 @@ struct btrfs_ioctl_defrag_range_args {
1525#define BTRFS_DEV_ITEM_KEY 216 1661#define BTRFS_DEV_ITEM_KEY 216
1526#define BTRFS_CHUNK_ITEM_KEY 228 1662#define BTRFS_CHUNK_ITEM_KEY 228
1527 1663
1664/*
1665 * Records the overall state of the qgroups.
1666 * There's only one instance of this key present,
1667 * (0, BTRFS_QGROUP_STATUS_KEY, 0)
1668 */
1669#define BTRFS_QGROUP_STATUS_KEY 240
1670/*
1671 * Records the currently used space of the qgroup.
1672 * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
1673 */
1674#define BTRFS_QGROUP_INFO_KEY 242
1675/*
1676 * Contains the user configured limits for the qgroup.
1677 * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
1678 */
1679#define BTRFS_QGROUP_LIMIT_KEY 244
1680/*
1681 * Records the child-parent relationship of qgroups. For
1682 * each relation, 2 keys are present:
1683 * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
1684 * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
1685 */
1686#define BTRFS_QGROUP_RELATION_KEY 246
1687
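Storing each relation under both objectids means one key-range scan under a qgroup's id enumerates its parents, and the mirrored scan under the parent's id enumerates its children. A sketch of how the two keys for one link are formed (the struct mirrors the objectid/type/offset layout of btrfs_key; names and ids are illustrative):

#include <stdint.h>
#include <stdio.h>

#define DEMO_QGROUP_RELATION_KEY 246

/* Mirrors the objectid/type/offset layout of struct btrfs_key. */
struct demo_key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

/* One parent-child link is indexed from both ends. */
static void demo_relation_keys(uint64_t child, uint64_t parent,
			       struct demo_key out[2])
{
	out[0] = (struct demo_key){ child,  DEMO_QGROUP_RELATION_KEY, parent };
	out[1] = (struct demo_key){ parent, DEMO_QGROUP_RELATION_KEY, child };
}

int main(void)
{
	struct demo_key k[2];

	demo_relation_keys(257, 5, k);
	printf("(%llu %u %llu) and (%llu %u %llu)\n",
	       (unsigned long long)k[0].objectid, (unsigned)k[0].type,
	       (unsigned long long)k[0].offset,
	       (unsigned long long)k[1].objectid, (unsigned)k[1].type,
	       (unsigned long long)k[1].offset);
	return 0;
}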
1528#define BTRFS_BALANCE_ITEM_KEY 248 1688#define BTRFS_BALANCE_ITEM_KEY 248
1529 1689
1530/* 1690/*
@@ -1621,13 +1781,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1621 offsetof(type, member), \ 1781 offsetof(type, member), \
1622 sizeof(((type *)0)->member))) 1782 sizeof(((type *)0)->member)))
1623 1783
1624#ifndef BTRFS_SETGET_FUNCS 1784#define DECLARE_BTRFS_SETGET_BITS(bits) \
1785u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
1786 unsigned long off, \
1787 struct btrfs_map_token *token); \
1788void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
1789 unsigned long off, u##bits val, \
1790 struct btrfs_map_token *token); \
1791static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
1792 unsigned long off) \
1793{ \
1794 return btrfs_get_token_##bits(eb, ptr, off, NULL); \
1795} \
1796static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
1797 unsigned long off, u##bits val) \
1798{ \
1799 btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
1800}
1801
1802DECLARE_BTRFS_SETGET_BITS(8)
1803DECLARE_BTRFS_SETGET_BITS(16)
1804DECLARE_BTRFS_SETGET_BITS(32)
1805DECLARE_BTRFS_SETGET_BITS(64)
1806
1625#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 1807#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
1626u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ 1808static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
1627u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ 1809{ \
1628void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ 1810 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1629void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); 1811 return btrfs_get_##bits(eb, s, offsetof(type, member)); \
1630#endif 1812} \
1813static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
1814 u##bits val) \
1815{ \
1816 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1817 btrfs_set_##bits(eb, s, offsetof(type, member), val); \
1818} \
1819static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
1820 struct btrfs_map_token *token) \
1821{ \
1822 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1823 return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
1824} \
1825static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
1826 type *s, u##bits val, \
1827 struct btrfs_map_token *token) \
1828{ \
1829 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1830 btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
1831}
1631 1832
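The rewrite above replaces per-field out-of-line accessors with inline wrappers around four shared get/set primitives, one per integer width, with offsetof() supplying the byte position of the field. A self-contained user-space sketch of the same generation pattern (the real primitives additionally map extent_buffer pages, convert from little-endian and cache the mapping in a btrfs_map_token):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified 64-bit primitives shared by all generated accessors. */
static uint64_t get_64(const void *base, unsigned long off)
{
	uint64_t v;

	memcpy(&v, (const char *)base + off, sizeof(v));
	return v;
}

static void set_64(void *base, unsigned long off, uint64_t val)
{
	memcpy((char *)base + off, &val, sizeof(val));
}

struct demo_info_item {
	uint64_t generation;
	uint64_t rfer;
};

/* Mirrors the shape of BTRFS_SETGET_FUNCS(name, type, member, 64):
 * one macro invocation emits a matching get/set pair for a field. */
#define DEMO_SETGET_FUNCS(name, type, member)			\
static uint64_t demo_##name(const void *s)			\
{								\
	return get_64(s, offsetof(type, member));		\
}								\
static void demo_set_##name(void *s, uint64_t val)		\
{								\
	set_64(s, offsetof(type, member), val);			\
}

DEMO_SETGET_FUNCS(info_rfer, struct demo_info_item, rfer)

int main(void)
{
	struct demo_info_item item = { 0, 0 };

	demo_set_info_rfer(&item, 4096);
	printf("rfer=%llu\n", (unsigned long long)demo_info_rfer(&item));
	return 0;
}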
1632#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ 1833#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1633static inline u##bits btrfs_##name(struct extent_buffer *eb) \ 1834static inline u##bits btrfs_##name(struct extent_buffer *eb) \
@@ -2189,6 +2390,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
2189BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); 2390BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
2190BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 2391BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
2191 last_snapshot, 64); 2392 last_snapshot, 64);
2393BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
2394 generation_v2, 64);
2395BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
2396 ctransid, 64);
2397BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
2398 otransid, 64);
2399BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
2400 stransid, 64);
2401BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
2402 rtransid, 64);
2192 2403
2193static inline bool btrfs_root_readonly(struct btrfs_root *root) 2404static inline bool btrfs_root_readonly(struct btrfs_root *root)
2194{ 2405{
@@ -2465,6 +2676,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
2465 sizeof(val)); 2676 sizeof(val));
2466} 2677}
2467 2678
2679/* btrfs_qgroup_status_item */
2680BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
2681 generation, 64);
2682BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
2683 version, 64);
2684BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
2685 flags, 64);
2686BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
2687 scan, 64);
2688
2689/* btrfs_qgroup_info_item */
2690BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
2691 generation, 64);
2692BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
2693BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
2694 rfer_cmpr, 64);
2695BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
2696BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
2697 excl_cmpr, 64);
2698
2699BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
2700 struct btrfs_qgroup_info_item, generation, 64);
2701BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
2702 rfer, 64);
2703BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
2704 struct btrfs_qgroup_info_item, rfer_cmpr, 64);
2705BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
2706 excl, 64);
2707BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
2708 struct btrfs_qgroup_info_item, excl_cmpr, 64);
2709
2710/* btrfs_qgroup_limit_item */
2711BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
2712 flags, 64);
2713BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
2714 max_rfer, 64);
2715BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
2716 max_excl, 64);
2717BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2718 rsv_rfer, 64);
2719BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2720 rsv_excl, 64);
2721
2468static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2722static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2469{ 2723{
2470 return sb->s_fs_info; 2724 return sb->s_fs_info;
@@ -2607,7 +2861,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2607 struct btrfs_root *root, u64 group_start); 2861 struct btrfs_root *root, u64 group_start);
2608u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2862u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2609u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2863u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2610void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
2611void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2864void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2612int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2865int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2613void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2866void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
@@ -2661,6 +2914,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2661int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); 2914int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2662 2915
2663int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 2916int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2917int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2918 struct btrfs_fs_info *fs_info);
2664/* ctree.c */ 2919/* ctree.c */
2665int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2920int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2666 int level, int *slot); 2921 int level, int *slot);
@@ -2680,6 +2935,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
2680 struct btrfs_key *max_key, 2935 struct btrfs_key *max_key,
2681 struct btrfs_path *path, int cache_only, 2936 struct btrfs_path *path, int cache_only,
2682 u64 min_trans); 2937 u64 min_trans);
2938enum btrfs_compare_tree_result {
2939 BTRFS_COMPARE_TREE_NEW,
2940 BTRFS_COMPARE_TREE_DELETED,
2941 BTRFS_COMPARE_TREE_CHANGED,
2942};
2943typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
2944 struct btrfs_root *right_root,
2945 struct btrfs_path *left_path,
2946 struct btrfs_path *right_path,
2947 struct btrfs_key *key,
2948 enum btrfs_compare_tree_result result,
2949 void *ctx);
2950int btrfs_compare_trees(struct btrfs_root *left_root,
2951 struct btrfs_root *right_root,
2952 btrfs_changed_cb_t cb, void *ctx);
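A minimal sketch of how a caller plugs into this interface: the callback signature follows the typedef above, while the stats structure and function names are hypothetical:

/* hypothetical: count how two snapshots differ */
struct demo_tree_diff_stats {
	u64 nr_new;
	u64 nr_deleted;
	u64 nr_changed;
};

static int demo_changed_cb(struct btrfs_root *left_root,
			   struct btrfs_root *right_root,
			   struct btrfs_path *left_path,
			   struct btrfs_path *right_path,
			   struct btrfs_key *key,
			   enum btrfs_compare_tree_result result,
			   void *ctx)
{
	struct demo_tree_diff_stats *stats = ctx;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		stats->nr_new++;
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		stats->nr_deleted++;
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		stats->nr_changed++;
		break;
	}
	return 0;	/* any negative return aborts the walk */
}

/* at the call site, e.g. with two snapshot roots in hand:
 *
 *	struct demo_tree_diff_stats stats = {0};
 *	ret = btrfs_compare_trees(snap_root, parent_root,
 *				  demo_changed_cb, &stats);
 */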
2683int btrfs_cow_block(struct btrfs_trans_handle *trans, 2953int btrfs_cow_block(struct btrfs_trans_handle *trans,
2684 struct btrfs_root *root, struct extent_buffer *buf, 2954 struct btrfs_root *root, struct extent_buffer *buf,
2685 struct extent_buffer *parent, int parent_slot, 2955 struct extent_buffer *parent, int parent_slot,
@@ -2711,6 +2981,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2711 ins_len, int cow); 2981 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 2982int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq); 2983 struct btrfs_path *p, u64 time_seq);
2984int btrfs_search_slot_for_read(struct btrfs_root *root,
2985 struct btrfs_key *key, struct btrfs_path *p,
2986 int find_higher, int return_any);
2714int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2987int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2715 struct btrfs_root *root, struct extent_buffer *parent, 2988 struct btrfs_root *root, struct extent_buffer *parent,
2716 int start_slot, int cache_only, u64 *last_ret, 2989 int start_slot, int cache_only, u64 *last_ret,
@@ -2793,11 +3066,22 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2793 kfree(fs_info->chunk_root); 3066 kfree(fs_info->chunk_root);
2794 kfree(fs_info->dev_root); 3067 kfree(fs_info->dev_root);
2795 kfree(fs_info->csum_root); 3068 kfree(fs_info->csum_root);
3069 kfree(fs_info->quota_root);
2796 kfree(fs_info->super_copy); 3070 kfree(fs_info->super_copy);
2797 kfree(fs_info->super_for_commit); 3071 kfree(fs_info->super_for_commit);
2798 kfree(fs_info); 3072 kfree(fs_info);
2799} 3073}
2800 3074
3075/* tree mod log functions from ctree.c */
3076u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3077 struct seq_list *elem);
3078void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3079 struct seq_list *elem);
3080static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
3081{
3082 return atomic_inc_return(&fs_info->tree_mod_seq);
3083}
3084
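These two functions bracket a stable-view window for backref walking. A sketch of the intended usage, assuming fs_info is in scope and with the actual walk elided:

struct seq_list elem;
u64 seq;

/* register a blocker; delayed refs that get a newer sequence number
 * are held back from processing (see btrfs_check_delayed_seq) */
seq = btrfs_get_tree_mod_seq(fs_info, &elem);

/* ... run the backref walk against this stable view, passing seq ... */

/* unregister, allowing held-back refs to be processed again */
btrfs_put_tree_mod_seq(fs_info, &elem);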
2801/* root-item.c */ 3085/* root-item.c */
2802int btrfs_find_root_ref(struct btrfs_root *tree_root, 3086int btrfs_find_root_ref(struct btrfs_root *tree_root,
2803 struct btrfs_path *path, 3087 struct btrfs_path *path,
@@ -2819,6 +3103,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
2819 struct btrfs_root *root, 3103 struct btrfs_root *root,
2820 struct btrfs_key *key, 3104 struct btrfs_key *key,
2821 struct btrfs_root_item *item); 3105 struct btrfs_root_item *item);
3106void btrfs_read_root_item(struct btrfs_root *root,
3107 struct extent_buffer *eb, int slot,
3108 struct btrfs_root_item *item);
2822int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 3109int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2823 btrfs_root_item *item, struct btrfs_key *key); 3110 btrfs_root_item *item, struct btrfs_key *key);
2824int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 3111int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2826,6 +3113,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2826void btrfs_set_root_node(struct btrfs_root_item *item, 3113void btrfs_set_root_node(struct btrfs_root_item *item,
2827 struct extent_buffer *node); 3114 struct extent_buffer *node);
2828void btrfs_check_and_init_root_item(struct btrfs_root_item *item); 3115void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
3116void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3117 struct btrfs_root *root);
2829 3118
2830/* dir-item.c */ 3119/* dir-item.c */
2831int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3120int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -3061,6 +3350,23 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
3061 struct btrfs_root *root, const char *function, 3350 struct btrfs_root *root, const char *function,
3062 unsigned int line, int errno); 3351 unsigned int line, int errno);
3063 3352
3353#define btrfs_set_fs_incompat(__fs_info, opt) \
3354 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
3355
3356static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3357 u64 flag)
3358{
3359 struct btrfs_super_block *disk_super;
3360 u64 features;
3361
3362 disk_super = fs_info->super_copy;
3363 features = btrfs_super_incompat_flags(disk_super);
3364 if (!(features & flag)) {
3365 features |= flag;
3366 btrfs_set_super_incompat_flags(disk_super, features);
3367 }
3368}
3369
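Since the helper only ORs the flag in when it is missing, call sites can set a feature bit unconditionally before writing structures that older kernels cannot parse. A hypothetical call site (the flag name exists in this era; the placement is made up):

/* expands to
 * __btrfs_set_fs_incompat(fs_info, BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) */
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);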
3064#define btrfs_abort_transaction(trans, root, errno) \ 3370#define btrfs_abort_transaction(trans, root, errno) \
3065do { \ 3371do { \
3066 __btrfs_abort_transaction(trans, root, __func__, \ 3372 __btrfs_abort_transaction(trans, root, __func__, \
@@ -3156,17 +3462,49 @@ void btrfs_reada_detach(void *handle);
3156int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3462int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3157 u64 start, int err); 3463 u64 start, int err);
3158 3464
3159/* delayed seq elem */ 3465/* qgroup.c */
3160struct seq_list { 3466struct qgroup_update {
3161 struct list_head list; 3467 struct list_head list;
3162 u64 seq; 3468 struct btrfs_delayed_ref_node *node;
3163 u32 flags; 3469 struct btrfs_delayed_extent_op *extent_op;
3164}; 3470};
3165 3471
3166void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 3472int btrfs_quota_enable(struct btrfs_trans_handle *trans,
3167 struct seq_list *elem); 3473 struct btrfs_fs_info *fs_info);
3168void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 3474int btrfs_quota_disable(struct btrfs_trans_handle *trans,
3169 struct seq_list *elem); 3475 struct btrfs_fs_info *fs_info);
3476int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
3477int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
3478 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3479int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
3480 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3481int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
3482 struct btrfs_fs_info *fs_info, u64 qgroupid,
3483 char *name);
3484int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
3485 struct btrfs_fs_info *fs_info, u64 qgroupid);
3486int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
3487 struct btrfs_fs_info *fs_info, u64 qgroupid,
3488 struct btrfs_qgroup_limit *limit);
3489int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
3490void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
3491struct btrfs_delayed_extent_op;
3492int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
3493 struct btrfs_delayed_ref_node *node,
3494 struct btrfs_delayed_extent_op *extent_op);
3495int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
3496 struct btrfs_fs_info *fs_info,
3497 struct btrfs_delayed_ref_node *node,
3498 struct btrfs_delayed_extent_op *extent_op);
3499int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
3500 struct btrfs_fs_info *fs_info);
3501int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
3502 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
3503 struct btrfs_qgroup_inherit *inherit);
3504int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
3505void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
3506
3507void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
3170 3508
3171static inline int is_fstree(u64 rootid) 3509static inline int is_fstree(u64 rootid)
3172{ 3510{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2399f408691..335605c8cea 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node(
62 INIT_LIST_HEAD(&delayed_node->n_list); 62 INIT_LIST_HEAD(&delayed_node->n_list);
63 INIT_LIST_HEAD(&delayed_node->p_list); 63 INIT_LIST_HEAD(&delayed_node->p_list);
64 delayed_node->bytes_reserved = 0; 64 delayed_node->bytes_reserved = 0;
65 memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
65} 66}
66 67
67static inline int btrfs_is_continuous_delayed_item( 68static inline int btrfs_is_continuous_delayed_item(
@@ -1113,8 +1114,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1113 * Returns < 0 on error and returns with an aborted transaction with any 1114 * Returns < 0 on error and returns with an aborted transaction with any
1114 * outstanding delayed items cleaned up. 1115 * outstanding delayed items cleaned up.
1115 */ 1116 */
1116int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 1117static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1117 struct btrfs_root *root) 1118 struct btrfs_root *root, int nr)
1118{ 1119{
1119 struct btrfs_root *curr_root = root; 1120 struct btrfs_root *curr_root = root;
1120 struct btrfs_delayed_root *delayed_root; 1121 struct btrfs_delayed_root *delayed_root;
@@ -1122,6 +1123,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1122 struct btrfs_path *path; 1123 struct btrfs_path *path;
1123 struct btrfs_block_rsv *block_rsv; 1124 struct btrfs_block_rsv *block_rsv;
1124 int ret = 0; 1125 int ret = 0;
1126 bool count = (nr > 0);
1125 1127
1126 if (trans->aborted) 1128 if (trans->aborted)
1127 return -EIO; 1129 return -EIO;
@@ -1137,7 +1139,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1137 delayed_root = btrfs_get_delayed_root(root); 1139 delayed_root = btrfs_get_delayed_root(root);
1138 1140
1139 curr_node = btrfs_first_delayed_node(delayed_root); 1141 curr_node = btrfs_first_delayed_node(delayed_root);
1140 while (curr_node) { 1142 while (curr_node && (!count || (count && nr--))) {
1141 curr_root = curr_node->root; 1143 curr_root = curr_node->root;
1142 ret = btrfs_insert_delayed_items(trans, path, curr_root, 1144 ret = btrfs_insert_delayed_items(trans, path, curr_root,
1143 curr_node); 1145 curr_node);
@@ -1149,6 +1151,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1149 path, curr_node); 1151 path, curr_node);
1150 if (ret) { 1152 if (ret) {
1151 btrfs_release_delayed_node(curr_node); 1153 btrfs_release_delayed_node(curr_node);
1154 curr_node = NULL;
1152 btrfs_abort_transaction(trans, root, ret); 1155 btrfs_abort_transaction(trans, root, ret);
1153 break; 1156 break;
1154 } 1157 }
@@ -1158,12 +1161,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1158 btrfs_release_delayed_node(prev_node); 1161 btrfs_release_delayed_node(prev_node);
1159 } 1162 }
1160 1163
1164 if (curr_node)
1165 btrfs_release_delayed_node(curr_node);
1161 btrfs_free_path(path); 1166 btrfs_free_path(path);
1162 trans->block_rsv = block_rsv; 1167 trans->block_rsv = block_rsv;
1163 1168
1164 return ret; 1169 return ret;
1165} 1170}
1166 1171
1172int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1173 struct btrfs_root *root)
1174{
1175 return __btrfs_run_delayed_items(trans, root, -1);
1176}
1177
1178int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
1179 struct btrfs_root *root, int nr)
1180{
1181 return __btrfs_run_delayed_items(trans, root, nr);
1182}
1183
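The new _nr variant bounds how many delayed-inode nodes one flush processes. A hypothetical throttled call:

int ret;

/* flush at most 16 delayed-inode nodes on this pass; the plain
 * btrfs_run_delayed_items() entry point passes -1, meaning "all" */
ret = btrfs_run_delayed_items_nr(trans, root, 16);
/* a nonzero return means the helper already aborted the transaction */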
1167static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1184static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1168 struct btrfs_delayed_node *node) 1185 struct btrfs_delayed_node *node)
1169{ 1186{
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f5aa4023d3e..4f808e1baee 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode);
107 107
108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root); 109 struct btrfs_root *root);
110int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root, int nr);
110 112
111void btrfs_balance_delayed_items(struct btrfs_root *root); 113void btrfs_balance_delayed_items(struct btrfs_root *root);
112 114
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b04790..da7419ed01b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
233 return 0; 233 return 0;
234} 234}
235 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 236int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
237 struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq) 238 u64 seq)
238{ 239{
239 struct seq_list *elem; 240 struct seq_list *elem;
240 241 int ret = 0;
241 assert_spin_locked(&delayed_refs->lock); 242
242 if (list_empty(&delayed_refs->seq_head)) 243 spin_lock(&fs_info->tree_mod_seq_lock);
243 return 0; 244 if (!list_empty(&fs_info->tree_mod_seq_list)) {
244 245 elem = list_first_entry(&fs_info->tree_mod_seq_list,
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); 246 struct seq_list, list);
246 if (seq >= elem->seq) { 247 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", 248 pr_debug("holding back delayed_ref %llu, lowest is "
248 seq, elem->seq, delayed_refs); 249 "%llu (%p)\n", seq, elem->seq, delayed_refs);
249 return 1; 250 ret = 1;
251 }
250 } 252 }
251 return 0; 253
254 spin_unlock(&fs_info->tree_mod_seq_lock);
255 return ret;
252} 256}
253 257
254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 258int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -525,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 529 ref->is_head = 0;
526 ref->in_tree = 1; 530 ref->in_tree = 1;
527 531
528 if (is_fstree(ref_root)) 532 if (need_ref_seq(for_cow, ref_root))
529 seq = inc_delayed_seq(delayed_refs); 533 seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
530 ref->seq = seq; 534 ref->seq = seq;
531 535
532 full_ref = btrfs_delayed_node_to_tree_ref(ref); 536 full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -584,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 588 ref->is_head = 0;
585 ref->in_tree = 1; 589 ref->in_tree = 1;
586 590
587 if (is_fstree(ref_root)) 591 if (need_ref_seq(for_cow, ref_root))
588 seq = inc_delayed_seq(delayed_refs); 592 seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
589 ref->seq = seq; 593 ref->seq = seq;
590 594
591 full_ref = btrfs_delayed_node_to_data_ref(ref); 595 full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -658,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 662 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 663 num_bytes, parent, ref_root, level, action,
660 for_cow); 664 for_cow);
661 if (!is_fstree(ref_root) && 665 if (!need_ref_seq(for_cow, ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 666 waitqueue_active(&fs_info->tree_mod_seq_wait))
663 wake_up(&delayed_refs->seq_wait); 667 wake_up(&fs_info->tree_mod_seq_wait);
664 spin_unlock(&delayed_refs->lock); 668 spin_unlock(&delayed_refs->lock);
669 if (need_ref_seq(for_cow, ref_root))
670 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
665 671
666 return 0; 672 return 0;
667} 673}
@@ -707,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 713 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
708 num_bytes, parent, ref_root, owner, offset, 714 num_bytes, parent, ref_root, owner, offset,
709 action, for_cow); 715 action, for_cow);
710 if (!is_fstree(ref_root) && 716 if (!need_ref_seq(for_cow, ref_root) &&
711 waitqueue_active(&delayed_refs->seq_wait)) 717 waitqueue_active(&fs_info->tree_mod_seq_wait))
712 wake_up(&delayed_refs->seq_wait); 718 wake_up(&fs_info->tree_mod_seq_wait);
713 spin_unlock(&delayed_refs->lock); 719 spin_unlock(&delayed_refs->lock);
720 if (need_ref_seq(for_cow, ref_root))
721 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
714 722
715 return 0; 723 return 0;
716} 724}
@@ -736,8 +744,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
736 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
737 extent_op->is_data); 745 extent_op->is_data);
738 746
739 if (waitqueue_active(&delayed_refs->seq_wait)) 747 if (waitqueue_active(&fs_info->tree_mod_seq_wait))
740 wake_up(&delayed_refs->seq_wait); 748 wake_up(&fs_info->tree_mod_seq_wait);
741 spin_unlock(&delayed_refs->lock); 749 spin_unlock(&delayed_refs->lock);
742 return 0; 750 return 0;
743} 751}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927fb995..0d7c90c366b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
139 int flushing; 139 int flushing;
140 140
141 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
162}; 142};
163 143
164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 144static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -195,34 +175,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 175int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 176 struct list_head *cluster, u64 search_start);
197 177
198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 178int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
199{ 179 struct btrfs_delayed_ref_root *delayed_refs,
200 assert_spin_locked(&delayed_refs->lock); 180 u64 seq);
201 ++delayed_refs->seq;
202 return delayed_refs->seq;
203}
204 181
205static inline void 182/*
206btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 183 * delayed refs with a ref_seq > 0 must be held back during backref walking.
207 struct seq_list *elem) 184 * this only applies to items in one of the fs-trees. for_cow items never need
185 * to be held back, so they won't get a ref_seq number.
186 */
187static inline int need_ref_seq(int for_cow, u64 rootid)
208{ 188{
209 assert_spin_locked(&delayed_refs->lock); 189 if (for_cow)
210 elem->seq = delayed_refs->seq; 190 return 0;
211 list_add_tail(&elem->list, &delayed_refs->seq_head);
212}
213 191
214static inline void 192 if (rootid == BTRFS_FS_TREE_OBJECTID)
215btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 193 return 1;
216 struct seq_list *elem)
217{
218 spin_lock(&delayed_refs->lock);
219 list_del(&elem->list);
220 wake_up(&delayed_refs->seq_wait);
221 spin_unlock(&delayed_refs->lock);
222}
223 194
224int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 195 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
225 u64 seq); 196 return 1;
197
198 return 0;
199}
226 200
227/* 201/*
228 * a node might live in a head or a regular ref, this lets you 202 * a node might live in a head or a regular ref, this lets you
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2936ca49b3b..502b20c56e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
407 break; 407 break;
408 } 408 }
409 409
410 if (failed && !ret) 410 if (failed && !ret && failed_mirror)
411 repair_eb_io_failure(root, eb, failed_mirror); 411 repair_eb_io_failure(root, eb, failed_mirror);
412 412
413 return ret; 413 return ret;
@@ -1182,6 +1182,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1182 root->defrag_running = 0; 1182 root->defrag_running = 0;
1183 root->root_key.objectid = objectid; 1183 root->root_key.objectid = objectid;
1184 root->anon_dev = 0; 1184 root->anon_dev = 0;
1185
1186 spin_lock_init(&root->root_times_lock);
1185} 1187}
1186 1188
1187static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1189static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1225,6 +1227,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1225 return root; 1227 return root;
1226} 1228}
1227 1229
1230struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1231 struct btrfs_fs_info *fs_info,
1232 u64 objectid)
1233{
1234 struct extent_buffer *leaf;
1235 struct btrfs_root *tree_root = fs_info->tree_root;
1236 struct btrfs_root *root;
1237 struct btrfs_key key;
1238 int ret = 0;
1239 u64 bytenr;
1240
1241 root = btrfs_alloc_root(fs_info);
1242 if (!root)
1243 return ERR_PTR(-ENOMEM);
1244
1245 __setup_root(tree_root->nodesize, tree_root->leafsize,
1246 tree_root->sectorsize, tree_root->stripesize,
1247 root, fs_info, objectid);
1248 root->root_key.objectid = objectid;
1249 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1250 root->root_key.offset = 0;
1251
1252 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
1253 0, objectid, NULL, 0, 0, 0);
1254 if (IS_ERR(leaf)) {
1255 ret = PTR_ERR(leaf);
1256 goto fail;
1257 }
1258
1259 bytenr = leaf->start;
1260 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1261 btrfs_set_header_bytenr(leaf, leaf->start);
1262 btrfs_set_header_generation(leaf, trans->transid);
1263 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1264 btrfs_set_header_owner(leaf, objectid);
1265 root->node = leaf;
1266
1267 write_extent_buffer(leaf, fs_info->fsid,
1268 (unsigned long)btrfs_header_fsid(leaf),
1269 BTRFS_FSID_SIZE);
1270 write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
1271 (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
1272 BTRFS_UUID_SIZE);
1273 btrfs_mark_buffer_dirty(leaf);
1274
1275 root->commit_root = btrfs_root_node(root);
1276 root->track_dirty = 1;
1277
1278
1279 root->root_item.flags = 0;
1280 root->root_item.byte_limit = 0;
1281 btrfs_set_root_bytenr(&root->root_item, leaf->start);
1282 btrfs_set_root_generation(&root->root_item, trans->transid);
1283 btrfs_set_root_level(&root->root_item, 0);
1284 btrfs_set_root_refs(&root->root_item, 1);
1285 btrfs_set_root_used(&root->root_item, leaf->len);
1286 btrfs_set_root_last_snapshot(&root->root_item, 0);
1287 btrfs_set_root_dirid(&root->root_item, 0);
1288 root->root_item.drop_level = 0;
1289
1290 key.objectid = objectid;
1291 key.type = BTRFS_ROOT_ITEM_KEY;
1292 key.offset = 0;
1293 ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1294 if (ret)
1295 goto fail;
1296
1297 btrfs_tree_unlock(leaf);
1298
1299fail:
1300 if (ret)
1301 return ERR_PTR(ret);
1302
1303 return root;
1304}
1305
1228static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1306static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1229 struct btrfs_fs_info *fs_info) 1307 struct btrfs_fs_info *fs_info)
1230{ 1308{
@@ -1326,6 +1404,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1326 u64 generation; 1404 u64 generation;
1327 u32 blocksize; 1405 u32 blocksize;
1328 int ret = 0; 1406 int ret = 0;
1407 int slot;
1329 1408
1330 root = btrfs_alloc_root(fs_info); 1409 root = btrfs_alloc_root(fs_info);
1331 if (!root) 1410 if (!root)
@@ -1352,9 +1431,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1352 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1431 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1353 if (ret == 0) { 1432 if (ret == 0) {
1354 l = path->nodes[0]; 1433 l = path->nodes[0];
1355 read_extent_buffer(l, &root->root_item, 1434 slot = path->slots[0];
1356 btrfs_item_ptr_offset(l, path->slots[0]), 1435 btrfs_read_root_item(tree_root, l, slot, &root->root_item);
1357 sizeof(root->root_item));
1358 memcpy(&root->root_key, location, sizeof(*location)); 1436 memcpy(&root->root_key, location, sizeof(*location));
1359 } 1437 }
1360 btrfs_free_path(path); 1438 btrfs_free_path(path);
@@ -1396,6 +1474,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1396 return fs_info->dev_root; 1474 return fs_info->dev_root;
1397 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) 1475 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1398 return fs_info->csum_root; 1476 return fs_info->csum_root;
1477 if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1478 return fs_info->quota_root ? fs_info->quota_root :
1479 ERR_PTR(-ENOENT);
1399again: 1480again:
1400 spin_lock(&fs_info->fs_roots_radix_lock); 1481 spin_lock(&fs_info->fs_roots_radix_lock);
1401 root = radix_tree_lookup(&fs_info->fs_roots_radix, 1482 root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1823,6 +1904,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1823 free_extent_buffer(info->extent_root->commit_root); 1904 free_extent_buffer(info->extent_root->commit_root);
1824 free_extent_buffer(info->csum_root->node); 1905 free_extent_buffer(info->csum_root->node);
1825 free_extent_buffer(info->csum_root->commit_root); 1906 free_extent_buffer(info->csum_root->commit_root);
1907 if (info->quota_root) {
1908 free_extent_buffer(info->quota_root->node);
1909 free_extent_buffer(info->quota_root->commit_root);
1910 }
1826 1911
1827 info->tree_root->node = NULL; 1912 info->tree_root->node = NULL;
1828 info->tree_root->commit_root = NULL; 1913 info->tree_root->commit_root = NULL;
@@ -1832,6 +1917,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1832 info->extent_root->commit_root = NULL; 1917 info->extent_root->commit_root = NULL;
1833 info->csum_root->node = NULL; 1918 info->csum_root->node = NULL;
1834 info->csum_root->commit_root = NULL; 1919 info->csum_root->commit_root = NULL;
1920 if (info->quota_root) {
1921 info->quota_root->node = NULL;
1922 info->quota_root->commit_root = NULL;
1923 }
1835 1924
1836 if (chunk_root) { 1925 if (chunk_root) {
1837 free_extent_buffer(info->chunk_root->node); 1926 free_extent_buffer(info->chunk_root->node);
@@ -1862,6 +1951,7 @@ int open_ctree(struct super_block *sb,
1862 struct btrfs_root *csum_root; 1951 struct btrfs_root *csum_root;
1863 struct btrfs_root *chunk_root; 1952 struct btrfs_root *chunk_root;
1864 struct btrfs_root *dev_root; 1953 struct btrfs_root *dev_root;
1954 struct btrfs_root *quota_root;
1865 struct btrfs_root *log_tree_root; 1955 struct btrfs_root *log_tree_root;
1866 int ret; 1956 int ret;
1867 int err = -EINVAL; 1957 int err = -EINVAL;
@@ -1873,9 +1963,10 @@ int open_ctree(struct super_block *sb,
1873 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); 1963 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1874 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 1964 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1875 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 1965 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1966 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
1876 1967
1877 if (!tree_root || !extent_root || !csum_root || 1968 if (!tree_root || !extent_root || !csum_root ||
1878 !chunk_root || !dev_root) { 1969 !chunk_root || !dev_root || !quota_root) {
1879 err = -ENOMEM; 1970 err = -ENOMEM;
1880 goto fail; 1971 goto fail;
1881 } 1972 }
@@ -1944,6 +2035,8 @@ int open_ctree(struct super_block *sb,
1944 fs_info->free_chunk_space = 0; 2035 fs_info->free_chunk_space = 0;
1945 fs_info->tree_mod_log = RB_ROOT; 2036 fs_info->tree_mod_log = RB_ROOT;
1946 2037
2038 init_waitqueue_head(&fs_info->tree_mod_seq_wait);
2039
1947 /* readahead state */ 2040 /* readahead state */
1948 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2041 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1949 spin_lock_init(&fs_info->reada_lock); 2042 spin_lock_init(&fs_info->reada_lock);
@@ -2032,6 +2125,13 @@ int open_ctree(struct super_block *sb,
2032 init_rwsem(&fs_info->cleanup_work_sem); 2125 init_rwsem(&fs_info->cleanup_work_sem);
2033 init_rwsem(&fs_info->subvol_sem); 2126 init_rwsem(&fs_info->subvol_sem);
2034 2127
2128 spin_lock_init(&fs_info->qgroup_lock);
2129 fs_info->qgroup_tree = RB_ROOT;
2130 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2131 fs_info->qgroup_seq = 1;
2132 fs_info->quota_enabled = 0;
2133 fs_info->pending_quota_state = 0;
2134
2035 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2135 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2036 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 2136 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2037 2137
@@ -2244,7 +2344,7 @@ int open_ctree(struct super_block *sb,
2244 ret |= btrfs_start_workers(&fs_info->caching_workers); 2344 ret |= btrfs_start_workers(&fs_info->caching_workers);
2245 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2345 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2246 if (ret) { 2346 if (ret) {
2247 ret = -ENOMEM; 2347 err = -ENOMEM;
2248 goto fail_sb_buffer; 2348 goto fail_sb_buffer;
2249 } 2349 }
2250 2350
@@ -2356,6 +2456,17 @@ retry_root_backup:
2356 goto recovery_tree_root; 2456 goto recovery_tree_root;
2357 csum_root->track_dirty = 1; 2457 csum_root->track_dirty = 1;
2358 2458
2459 ret = find_and_setup_root(tree_root, fs_info,
2460 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
2461 if (ret) {
2462 kfree(quota_root);
2463 quota_root = fs_info->quota_root = NULL;
2464 } else {
2465 quota_root->track_dirty = 1;
2466 fs_info->quota_enabled = 1;
2467 fs_info->pending_quota_state = 1;
2468 }
2469
2359 fs_info->generation = generation; 2470 fs_info->generation = generation;
2360 fs_info->last_trans_committed = generation; 2471 fs_info->last_trans_committed = generation;
2361 2472
@@ -2415,6 +2526,9 @@ retry_root_backup:
2415 " integrity check module %s\n", sb->s_id); 2526 " integrity check module %s\n", sb->s_id);
2416 } 2527 }
2417#endif 2528#endif
2529 ret = btrfs_read_qgroup_config(fs_info);
2530 if (ret)
2531 goto fail_trans_kthread;
2418 2532
2419 /* do not make disk changes in broken FS */ 2533 /* do not make disk changes in broken FS */
2420 if (btrfs_super_log_root(disk_super) != 0 && 2534 if (btrfs_super_log_root(disk_super) != 0 &&
@@ -2425,7 +2539,7 @@ retry_root_backup:
2425 printk(KERN_WARNING "Btrfs log replay required " 2539 printk(KERN_WARNING "Btrfs log replay required "
2426 "on RO media\n"); 2540 "on RO media\n");
2427 err = -EIO; 2541 err = -EIO;
2428 goto fail_trans_kthread; 2542 goto fail_qgroup;
2429 } 2543 }
2430 blocksize = 2544 blocksize =
2431 btrfs_level_size(tree_root, 2545 btrfs_level_size(tree_root,
@@ -2434,7 +2548,7 @@ retry_root_backup:
2434 log_tree_root = btrfs_alloc_root(fs_info); 2548 log_tree_root = btrfs_alloc_root(fs_info);
2435 if (!log_tree_root) { 2549 if (!log_tree_root) {
2436 err = -ENOMEM; 2550 err = -ENOMEM;
2437 goto fail_trans_kthread; 2551 goto fail_qgroup;
2438 } 2552 }
2439 2553
2440 __setup_root(nodesize, leafsize, sectorsize, stripesize, 2554 __setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2466,15 +2580,15 @@ retry_root_backup:
2466 2580
2467 if (!(sb->s_flags & MS_RDONLY)) { 2581 if (!(sb->s_flags & MS_RDONLY)) {
2468 ret = btrfs_cleanup_fs_roots(fs_info); 2582 ret = btrfs_cleanup_fs_roots(fs_info);
2469 if (ret) { 2583 if (ret)
2470 } 2584 goto fail_trans_kthread;
2471 2585
2472 ret = btrfs_recover_relocation(tree_root); 2586 ret = btrfs_recover_relocation(tree_root);
2473 if (ret < 0) { 2587 if (ret < 0) {
2474 printk(KERN_WARNING 2588 printk(KERN_WARNING
2475 "btrfs: failed to recover relocation\n"); 2589 "btrfs: failed to recover relocation\n");
2476 err = -EINVAL; 2590 err = -EINVAL;
2477 goto fail_trans_kthread; 2591 goto fail_qgroup;
2478 } 2592 }
2479 } 2593 }
2480 2594
@@ -2484,10 +2598,10 @@ retry_root_backup:
2484 2598
2485 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2599 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2486 if (!fs_info->fs_root) 2600 if (!fs_info->fs_root)
2487 goto fail_trans_kthread; 2601 goto fail_qgroup;
2488 if (IS_ERR(fs_info->fs_root)) { 2602 if (IS_ERR(fs_info->fs_root)) {
2489 err = PTR_ERR(fs_info->fs_root); 2603 err = PTR_ERR(fs_info->fs_root);
2490 goto fail_trans_kthread; 2604 goto fail_qgroup;
2491 } 2605 }
2492 2606
2493 if (sb->s_flags & MS_RDONLY) 2607 if (sb->s_flags & MS_RDONLY)
@@ -2511,6 +2625,8 @@ retry_root_backup:
2511 2625
2512 return 0; 2626 return 0;
2513 2627
2628fail_qgroup:
2629 btrfs_free_qgroup_config(fs_info);
2514fail_trans_kthread: 2630fail_trans_kthread:
2515 kthread_stop(fs_info->transaction_kthread); 2631 kthread_stop(fs_info->transaction_kthread);
2516fail_cleaner: 2632fail_cleaner:
@@ -3109,6 +3225,8 @@ int close_ctree(struct btrfs_root *root)
3109 fs_info->closing = 2; 3225 fs_info->closing = 2;
3110 smp_mb(); 3226 smp_mb();
3111 3227
3228 btrfs_free_qgroup_config(root->fs_info);
3229
3112 if (fs_info->delalloc_bytes) { 3230 if (fs_info->delalloc_bytes) {
3113 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3231 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3114 (unsigned long long)fs_info->delalloc_bytes); 3232 (unsigned long long)fs_info->delalloc_bytes);
@@ -3128,6 +3246,10 @@ int close_ctree(struct btrfs_root *root)
3128 free_extent_buffer(fs_info->dev_root->commit_root); 3246 free_extent_buffer(fs_info->dev_root->commit_root);
3129 free_extent_buffer(fs_info->csum_root->node); 3247 free_extent_buffer(fs_info->csum_root->node);
3130 free_extent_buffer(fs_info->csum_root->commit_root); 3248 free_extent_buffer(fs_info->csum_root->commit_root);
3249 if (fs_info->quota_root) {
3250 free_extent_buffer(fs_info->quota_root->node);
3251 free_extent_buffer(fs_info->quota_root->commit_root);
3252 }
3131 3253
3132 btrfs_free_block_groups(fs_info); 3254 btrfs_free_block_groups(fs_info);
3133 3255
@@ -3258,7 +3380,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3258 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3380 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3259} 3381}
3260 3382
3261static int btree_lock_page_hook(struct page *page, void *data, 3383int btree_lock_page_hook(struct page *page, void *data,
3262 void (*flush_fn)(void *)) 3384 void (*flush_fn)(void *))
3263{ 3385{
3264 struct inode *inode = page->mapping->host; 3386 struct inode *inode = page->mapping->host;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b3fab39f7..95e147eea23 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
89int btrfs_cleanup_transaction(struct btrfs_root *root); 89int btrfs_cleanup_transaction(struct btrfs_root *root);
90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
91 struct btrfs_root *root); 91 struct btrfs_root *root);
92void btrfs_abort_devices(struct btrfs_root *root);
93struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
94 struct btrfs_fs_info *fs_info,
95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *));
92 98
93#ifdef CONFIG_DEBUG_LOCK_ALLOC 99#ifdef CONFIG_DEBUG_LOCK_ALLOC
94void btrfs_init_lockdep(void); 100void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff..4e1b153b7c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36 36
37#undef SCRAMBLE_DELAYED_REFS
38
37/* 39/*
38 * control flags for do_chunk_alloc's force field 40 * control flags for do_chunk_alloc's force field
39 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 41 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2217 struct btrfs_delayed_ref_node *ref; 2219 struct btrfs_delayed_ref_node *ref;
2218 struct btrfs_delayed_ref_head *locked_ref = NULL; 2220 struct btrfs_delayed_ref_head *locked_ref = NULL;
2219 struct btrfs_delayed_extent_op *extent_op; 2221 struct btrfs_delayed_extent_op *extent_op;
2222 struct btrfs_fs_info *fs_info = root->fs_info;
2220 int ret; 2223 int ret;
2221 int count = 0; 2224 int count = 0;
2222 int must_insert_reserved = 0; 2225 int must_insert_reserved = 0;
@@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2255 ref = select_delayed_ref(locked_ref); 2258 ref = select_delayed_ref(locked_ref);
2256 2259
2257 if (ref && ref->seq && 2260 if (ref && ref->seq &&
2258 btrfs_check_delayed_seq(delayed_refs, ref->seq)) { 2261 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2259 /* 2262 /*
2260 * there are still refs with lower seq numbers in the 2263 * there are still refs with lower seq numbers in the
2261 * process of being added. Don't run this ref yet. 2264 * process of being added. Don't run this ref yet.
@@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 } 2340 }
2338 2341
2339next: 2342next:
2340 do_chunk_alloc(trans, root->fs_info->extent_root, 2343 do_chunk_alloc(trans, fs_info->extent_root,
2341 2 * 1024 * 1024, 2344 2 * 1024 * 1024,
2342 btrfs_get_alloc_profile(root, 0), 2345 btrfs_get_alloc_profile(root, 0),
2343 CHUNK_ALLOC_NO_FORCE); 2346 CHUNK_ALLOC_NO_FORCE);
@@ -2347,21 +2350,99 @@ next:
2347 return count; 2350 return count;
2348} 2351}
2349 2352
2350static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, 2353static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
2354 struct btrfs_delayed_ref_root *delayed_refs,
2351 unsigned long num_refs, 2355 unsigned long num_refs,
2352 struct list_head *first_seq) 2356 struct list_head *first_seq)
2353{ 2357{
2354 spin_unlock(&delayed_refs->lock); 2358 spin_unlock(&delayed_refs->lock);
2355 pr_debug("waiting for more refs (num %ld, first %p)\n", 2359 pr_debug("waiting for more refs (num %ld, first %p)\n",
2356 num_refs, first_seq); 2360 num_refs, first_seq);
2357 wait_event(delayed_refs->seq_wait, 2361 wait_event(fs_info->tree_mod_seq_wait,
2358 num_refs != delayed_refs->num_entries || 2362 num_refs != delayed_refs->num_entries ||
2359 delayed_refs->seq_head.next != first_seq); 2363 fs_info->tree_mod_seq_list.next != first_seq);
2360 pr_debug("done waiting for more refs (num %ld, first %p)\n", 2364 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2361 delayed_refs->num_entries, delayed_refs->seq_head.next); 2365 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
2362 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2363} 2367}
2364 2368
2369#ifdef SCRAMBLE_DELAYED_REFS
2370/*
2371 * Normally delayed refs get processed in ascending bytenr order. This
2372 * in most cases matches the order in which they were added. To expose
2373 * dependencies on this order, we start processing the tree in the middle instead of at the beginning.
2374 */
2375static u64 find_middle(struct rb_root *root)
2376{
2377 struct rb_node *n = root->rb_node;
2378 struct btrfs_delayed_ref_node *entry;
2379 int alt = 1;
2380 u64 middle;
2381 u64 first = 0, last = 0;
2382
2383 n = rb_first(root);
2384 if (n) {
2385 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2386 first = entry->bytenr;
2387 }
2388 n = rb_last(root);
2389 if (n) {
2390 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2391 last = entry->bytenr;
2392 }
2393 n = root->rb_node;
2394
2395 while (n) {
2396 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2397 WARN_ON(!entry->in_tree);
2398
2399 middle = entry->bytenr;
2400
2401 if (alt)
2402 n = n->rb_left;
2403 else
2404 n = n->rb_right;
2405
2406 alt = 1 - alt;
2407 }
2408 return middle;
2409}
2410#endif
2411
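As a standalone illustration of the descent above: record the key at each node, then alternate between left and right children until a leaf is reached; the last key recorded lands roughly in the middle of the key space, which is all the debug scrambling needs. A minimal userspace sketch with an invented node struct:

    #include <stdio.h>

    struct node {
        unsigned long long bytenr;
        struct node *left, *right;
    };

    /* Walk from the root, alternating left and right children, recording
     * the key at every step; the last key seen is a rough middle. */
    static unsigned long long pseudo_middle(struct node *n)
    {
        unsigned long long middle = 0;
        int alt = 1;

        while (n) {
            middle = n->bytenr;
            n = alt ? n->left : n->right;
            alt = 1 - alt;
        }
        return middle;
    }

    int main(void)
    {
        struct node l = { 10, NULL, NULL };
        struct node r = { 30, NULL, NULL };
        struct node root = { 20, &l, &r };

        printf("middle: %llu\n", pseudo_middle(&root));
        return 0;
    }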
2412int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2413 struct btrfs_fs_info *fs_info)
2414{
2415 struct qgroup_update *qgroup_update;
2416 int ret = 0;
2417
2418 if (list_empty(&trans->qgroup_ref_list) !=
2419 !trans->delayed_ref_elem.seq) {
2420 /* list without seq or seq without list */
2421 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2422 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2423 trans->delayed_ref_elem.seq);
2424 BUG();
2425 }
2426
2427 if (!trans->delayed_ref_elem.seq)
2428 return 0;
2429
2430 while (!list_empty(&trans->qgroup_ref_list)) {
2431 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2432 struct qgroup_update, list);
2433 list_del(&qgroup_update->list);
2434 if (!ret)
2435 ret = btrfs_qgroup_account_ref(
2436 trans, fs_info, qgroup_update->node,
2437 qgroup_update->extent_op);
2438 kfree(qgroup_update);
2439 }
2440
2441 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2442
2443 return ret;
2444}
2445
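The loop above is the usual drain-a-list idiom: pop the head, process it unless an earlier entry already failed, and free it unconditionally so nothing leaks. A userspace sketch of the same shape (the update struct and process_one() are stand-ins, not btrfs interfaces):

    #include <stdlib.h>

    struct update {
        struct update *next;
        int payload;
    };

    /* Drain the whole queue even after an error: remember only the first
     * failure, but still free every element so nothing leaks. */
    static int drain_updates(struct update **head, int (*process_one)(int))
    {
        int ret = 0;

        while (*head) {
            struct update *u = *head;

            *head = u->next;
            if (!ret)
                ret = process_one(u->payload);
            free(u);
        }
        return ret;
    }

    static int do_one(int v) { return v < 0 ? -1 : 0; }

    int main(void)
    {
        struct update *head = NULL;

        for (int i = 0; i < 3; i++) {
            struct update *u = malloc(sizeof(*u));

            u->payload = i;
            u->next = head;
            head = u;
        }
        return drain_updates(&head, do_one);
    }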
2365/* 2446/*
2366 * this starts processing the delayed reference count updates and 2447 * this starts processing the delayed reference count updates and
2367 * extent insertions we have queued up so far. count can be 2448 * extent insertions we have queued up so far. count can be
@@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2398 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), 2479 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2399 CHUNK_ALLOC_NO_FORCE); 2480 CHUNK_ALLOC_NO_FORCE);
2400 2481
2482 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2483
2401 delayed_refs = &trans->transaction->delayed_refs; 2484 delayed_refs = &trans->transaction->delayed_refs;
2402 INIT_LIST_HEAD(&cluster); 2485 INIT_LIST_HEAD(&cluster);
2403again: 2486again:
2404 consider_waiting = 0; 2487 consider_waiting = 0;
2405 spin_lock(&delayed_refs->lock); 2488 spin_lock(&delayed_refs->lock);
2489
2490#ifdef SCRAMBLE_DELAYED_REFS
2491 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2492#endif
2493
2406 if (count == 0) { 2494 if (count == 0) {
2407 count = delayed_refs->num_entries * 2; 2495 count = delayed_refs->num_entries * 2;
2408 run_most = 1; 2496 run_most = 1;
@@ -2437,7 +2525,7 @@ again:
2437 num_refs = delayed_refs->num_entries; 2525 num_refs = delayed_refs->num_entries;
2438 first_seq = root->fs_info->tree_mod_seq_list.next; 2526 first_seq = root->fs_info->tree_mod_seq_list.next;
2439 } else { 2527 } else {
2440 wait_for_more_refs(delayed_refs, 2528 wait_for_more_refs(root->fs_info, delayed_refs,
2441 num_refs, first_seq); 2529 num_refs, first_seq);
2442 /* 2530 /*
2443 * after waiting, things have changed. we 2531 * after waiting, things have changed. we
@@ -2502,6 +2590,7 @@ again:
2502 } 2590 }
2503out: 2591out:
2504 spin_unlock(&delayed_refs->lock); 2592 spin_unlock(&delayed_refs->lock);
2593 assert_qgroups_uptodate(trans);
2505 return 0; 2594 return 0;
2506} 2595}
2507 2596
@@ -2581,8 +2670,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2581 2670
2582 node = rb_prev(node); 2671 node = rb_prev(node);
2583 if (node) { 2672 if (node) {
2673 int seq = ref->seq;
2674
2584 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2675 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2585 if (ref->bytenr == bytenr) 2676 if (ref->bytenr == bytenr && ref->seq == seq)
2586 goto out_unlock; 2677 goto out_unlock;
2587 } 2678 }
2588 2679
@@ -2903,8 +2994,13 @@ again:
2903 } 2994 }
2904 2995
2905 spin_lock(&block_group->lock); 2996 spin_lock(&block_group->lock);
2906 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2997 if (block_group->cached != BTRFS_CACHE_FINISHED ||
2907 /* We're not cached, don't bother trying to write stuff out */ 2998 !btrfs_test_opt(root, SPACE_CACHE)) {
2999 /*
3000 * don't bother trying to write stuff out _if_
3001 * a) we're not cached,
3002 * b) we're mounted with the nospace_cache option.
3003 */
2908 dcs = BTRFS_DC_WRITTEN; 3004 dcs = BTRFS_DC_WRITTEN;
2909 spin_unlock(&block_group->lock); 3005 spin_unlock(&block_group->lock);
2910 goto out_put; 3006 goto out_put;
@@ -3134,6 +3230,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3134 init_waitqueue_head(&found->wait); 3230 init_waitqueue_head(&found->wait);
3135 *space_info = found; 3231 *space_info = found;
3136 list_add_rcu(&found->list, &info->space_info); 3232 list_add_rcu(&found->list, &info->space_info);
3233 if (flags & BTRFS_BLOCK_GROUP_DATA)
3234 info->data_sinfo = found;
3137 return 0; 3235 return 0;
3138} 3236}
3139 3237
@@ -3263,12 +3361,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3263 return get_alloc_profile(root, flags); 3361 return get_alloc_profile(root, flags);
3264} 3362}
3265 3363
3266void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3267{
3268 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3269 BTRFS_BLOCK_GROUP_DATA);
3270}
3271
3272/* 3364/*
3273 * This will check the space that the inode allocates from to make sure we have 3365 * This will check the space that the inode allocates from to make sure we have
3274 * enough space for bytes. 3366 * enough space for bytes.
@@ -3277,6 +3369,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3277{ 3369{
3278 struct btrfs_space_info *data_sinfo; 3370 struct btrfs_space_info *data_sinfo;
3279 struct btrfs_root *root = BTRFS_I(inode)->root; 3371 struct btrfs_root *root = BTRFS_I(inode)->root;
3372 struct btrfs_fs_info *fs_info = root->fs_info;
3280 u64 used; 3373 u64 used;
3281 int ret = 0, committed = 0, alloc_chunk = 1; 3374 int ret = 0, committed = 0, alloc_chunk = 1;
3282 3375
@@ -3289,7 +3382,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3289 committed = 1; 3382 committed = 1;
3290 } 3383 }
3291 3384
3292 data_sinfo = BTRFS_I(inode)->space_info; 3385 data_sinfo = fs_info->data_sinfo;
3293 if (!data_sinfo) 3386 if (!data_sinfo)
3294 goto alloc; 3387 goto alloc;
3295 3388
@@ -3330,10 +3423,9 @@ alloc:
3330 goto commit_trans; 3423 goto commit_trans;
3331 } 3424 }
3332 3425
3333 if (!data_sinfo) { 3426 if (!data_sinfo)
3334 btrfs_set_inode_space_info(root, inode); 3427 data_sinfo = fs_info->data_sinfo;
3335 data_sinfo = BTRFS_I(inode)->space_info; 3428
3336 }
3337 goto again; 3429 goto again;
3338 } 3430 }
3339 3431
@@ -3380,7 +3472,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3380 /* make sure bytes are sectorsize aligned */ 3472 /* make sure bytes are sectorsize aligned */
3381 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3473 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3382 3474
3383 data_sinfo = BTRFS_I(inode)->space_info; 3475 data_sinfo = root->fs_info->data_sinfo;
3384 spin_lock(&data_sinfo->lock); 3476 spin_lock(&data_sinfo->lock);
3385 data_sinfo->bytes_may_use -= bytes; 3477 data_sinfo->bytes_may_use -= bytes;
3386 trace_btrfs_space_reservation(root->fs_info, "space_info", 3478 trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3586,89 +3678,58 @@ out:
3586/* 3678/*
3587 * shrink metadata reservation for delalloc 3679 * shrink metadata reservation for delalloc
3588 */ 3680 */
3589static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, 3681static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3590 bool wait_ordered) 3682 bool wait_ordered)
3591{ 3683{
3592 struct btrfs_block_rsv *block_rsv; 3684 struct btrfs_block_rsv *block_rsv;
3593 struct btrfs_space_info *space_info; 3685 struct btrfs_space_info *space_info;
3594 struct btrfs_trans_handle *trans; 3686 struct btrfs_trans_handle *trans;
3595 u64 reserved; 3687 u64 delalloc_bytes;
3596 u64 max_reclaim; 3688 u64 max_reclaim;
3597 u64 reclaimed = 0;
3598 long time_left; 3689 long time_left;
3599 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3690 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3600 int loops = 0; 3691 int loops = 0;
3601 unsigned long progress;
3602 3692
3603 trans = (struct btrfs_trans_handle *)current->journal_info; 3693 trans = (struct btrfs_trans_handle *)current->journal_info;
3604 block_rsv = &root->fs_info->delalloc_block_rsv; 3694 block_rsv = &root->fs_info->delalloc_block_rsv;
3605 space_info = block_rsv->space_info; 3695 space_info = block_rsv->space_info;
3606 3696
3607 smp_mb(); 3697 smp_mb();
3608 reserved = space_info->bytes_may_use; 3698 delalloc_bytes = root->fs_info->delalloc_bytes;
3609 progress = space_info->reservation_progress; 3699 if (delalloc_bytes == 0) {
3610
3611 if (reserved == 0)
3612 return 0;
3613
3614 smp_mb();
3615 if (root->fs_info->delalloc_bytes == 0) {
3616 if (trans) 3700 if (trans)
3617 return 0; 3701 return;
3618 btrfs_wait_ordered_extents(root, 0, 0); 3702 btrfs_wait_ordered_extents(root, 0, 0);
3619 return 0; 3703 return;
3620 } 3704 }
3621 3705
3622 max_reclaim = min(reserved, to_reclaim); 3706 while (delalloc_bytes && loops < 3) {
3623 nr_pages = max_t(unsigned long, nr_pages, 3707 max_reclaim = min(delalloc_bytes, to_reclaim);
3624 max_reclaim >> PAGE_CACHE_SHIFT); 3708 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3625 while (loops < 1024) {
3626 /* have the flusher threads jump in and do some IO */
3627 smp_mb();
3628 nr_pages = min_t(unsigned long, nr_pages,
3629 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3630 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3709 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3631 WB_REASON_FS_FREE_SPACE); 3710 WB_REASON_FS_FREE_SPACE);
3632 3711
3633 spin_lock(&space_info->lock); 3712 spin_lock(&space_info->lock);
3634 if (reserved > space_info->bytes_may_use) 3713 if (space_info->bytes_used + space_info->bytes_reserved +
3635 reclaimed += reserved - space_info->bytes_may_use; 3714 space_info->bytes_pinned + space_info->bytes_readonly +
3636 reserved = space_info->bytes_may_use; 3715 space_info->bytes_may_use + orig <=
3716 space_info->total_bytes) {
3717 spin_unlock(&space_info->lock);
3718 break;
3719 }
3637 spin_unlock(&space_info->lock); 3720 spin_unlock(&space_info->lock);
3638 3721
3639 loops++; 3722 loops++;
3640
3641 if (reserved == 0 || reclaimed >= max_reclaim)
3642 break;
3643
3644 if (trans && trans->transaction->blocked)
3645 return -EAGAIN;
3646
3647 if (wait_ordered && !trans) { 3723 if (wait_ordered && !trans) {
3648 btrfs_wait_ordered_extents(root, 0, 0); 3724 btrfs_wait_ordered_extents(root, 0, 0);
3649 } else { 3725 } else {
3650 time_left = schedule_timeout_interruptible(1); 3726 time_left = schedule_timeout_killable(1);
3651
3652 /* We were interrupted, exit */
3653 if (time_left) 3727 if (time_left)
3654 break; 3728 break;
3655 } 3729 }
3656 3730 smp_mb();
3657 /* we've kicked the IO a few times, if anything has been freed, 3731 delalloc_bytes = root->fs_info->delalloc_bytes;
3658 * exit. There is no sense in looping here for a long time
3659 * when we really need to commit the transaction, or there are
3660 * just too many writers without enough free space
3661 */
3662
3663 if (loops > 3) {
3664 smp_mb();
3665 if (progress != space_info->reservation_progress)
3666 break;
3667 }
3668
3669 } 3732 }
3670
3671 return reclaimed >= to_reclaim;
3672} 3733}
3673 3734
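The new shrink_delalloc() is a bounded loop: kick writeback, re-check whether the space_info totals now leave room for the original reservation, and give up after three passes instead of the old 1024-iteration progress dance. A toy model of that control flow (all of the accounting here is invented):

    #include <stdio.h>

    /* Toy model of the rewritten loop: each writeback pass reclaims up to
     * a 2MB chunk; stop once the target fits or after three passes. */
    static unsigned long long delalloc = 10 * 1024 * 1024;

    static int space_fits(unsigned long long need)
    {
        return delalloc <= need;    /* stand-in for the space_info math */
    }

    static void writeback_pass(void)
    {
        unsigned long long chunk = 2 * 1024 * 1024;

        delalloc -= delalloc > chunk ? chunk : delalloc;
    }

    int main(void)
    {
        unsigned long long need = 4 * 1024 * 1024;
        int loops = 0;

        while (delalloc && loops < 3 && !space_fits(need)) {
            writeback_pass();
            loops++;
        }
        printf("loops=%d delalloc=%llu\n", loops, delalloc);
        return 0;
    }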
3674/** 3735/**
@@ -3728,6 +3789,58 @@ commit:
3728 return btrfs_commit_transaction(trans, root); 3789 return btrfs_commit_transaction(trans, root);
3729} 3790}
3730 3791
3792enum flush_state {
3793 FLUSH_DELALLOC = 1,
3794 FLUSH_DELALLOC_WAIT = 2,
3795 FLUSH_DELAYED_ITEMS_NR = 3,
3796 FLUSH_DELAYED_ITEMS = 4,
3797 COMMIT_TRANS = 5,
3798};
3799
3800static int flush_space(struct btrfs_root *root,
3801 struct btrfs_space_info *space_info, u64 num_bytes,
3802 u64 orig_bytes, int state)
3803{
3804 struct btrfs_trans_handle *trans;
3805 int nr;
3806 int ret = 0;
3807
3808 switch (state) {
3809 case FLUSH_DELALLOC:
3810 case FLUSH_DELALLOC_WAIT:
3811 shrink_delalloc(root, num_bytes, orig_bytes,
3812 state == FLUSH_DELALLOC_WAIT);
3813 break;
3814 case FLUSH_DELAYED_ITEMS_NR:
3815 case FLUSH_DELAYED_ITEMS:
3816 if (state == FLUSH_DELAYED_ITEMS_NR) {
3817 u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3818
3819 nr = (int)div64_u64(num_bytes, bytes);
3820 if (!nr)
3821 nr = 1;
3822 nr *= 2;
3823 } else {
3824 nr = -1;
3825 }
3826 trans = btrfs_join_transaction(root);
3827 if (IS_ERR(trans)) {
3828 ret = PTR_ERR(trans);
3829 break;
3830 }
3831 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3832 btrfs_end_transaction(trans, root);
3833 break;
3834 case COMMIT_TRANS:
3835 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3836 break;
3837 default:
3838 ret = -ENOSPC;
3839 break;
3840 }
3841
3842 return ret;
3843}
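flush_space() turns the old ad-hoc retry logic into an ordered escalation: each failed reservation attempt advances to a heavier remedy, ending with a transaction commit before giving back ENOSPC. A runnable toy of the driver loop in reserve_metadata_bytes() (try_reserve() and run_flush() are stand-ins for the real work):

    #include <stdio.h>

    enum flush_state {
        FLUSH_DELALLOC = 1,
        FLUSH_DELALLOC_WAIT,
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        COMMIT_TRANS,
    };

    static int try_reserve(int attempt) { return attempt >= 3; }
    static void run_flush(int state) { printf("flush state %d\n", state); }

    int main(void)
    {
        int state = FLUSH_DELALLOC;
        int attempt = 0;

        while (!try_reserve(attempt++)) {
            if (state > COMMIT_TRANS) {
                puts("ENOSPC");
                return 1;
            }
            run_flush(state++);    /* escalate to the next, heavier step */
        }
        puts("reserved");
        return 0;
    }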
3731/** 3844/**
3732 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 3845 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3733 * @root - the root we're allocating for 3846 * @root - the root we're allocating for
@@ -3749,11 +3862,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3749 struct btrfs_space_info *space_info = block_rsv->space_info; 3862 struct btrfs_space_info *space_info = block_rsv->space_info;
3750 u64 used; 3863 u64 used;
3751 u64 num_bytes = orig_bytes; 3864 u64 num_bytes = orig_bytes;
3752 int retries = 0; 3865 int flush_state = FLUSH_DELALLOC;
3753 int ret = 0; 3866 int ret = 0;
3754 bool committed = false;
3755 bool flushing = false; 3867 bool flushing = false;
3756 bool wait_ordered = false; 3868 bool committed = false;
3757 3869
3758again: 3870again:
3759 ret = 0; 3871 ret = 0;
@@ -3812,9 +3924,8 @@ again:
3812 * amount plus the amount of bytes that we need for this 3924 * amount plus the amount of bytes that we need for this
3813 * reservation. 3925 * reservation.
3814 */ 3926 */
3815 wait_ordered = true;
3816 num_bytes = used - space_info->total_bytes + 3927 num_bytes = used - space_info->total_bytes +
3817 (orig_bytes * (retries + 1)); 3928 (orig_bytes * 2);
3818 } 3929 }
3819 3930
3820 if (ret) { 3931 if (ret) {
@@ -3867,8 +3978,6 @@ again:
3867 trace_btrfs_space_reservation(root->fs_info, 3978 trace_btrfs_space_reservation(root->fs_info,
3868 "space_info", space_info->flags, orig_bytes, 1); 3979 "space_info", space_info->flags, orig_bytes, 1);
3869 ret = 0; 3980 ret = 0;
3870 } else {
3871 wait_ordered = true;
3872 } 3981 }
3873 } 3982 }
3874 3983
@@ -3887,36 +3996,13 @@ again:
3887 if (!ret || !flush) 3996 if (!ret || !flush)
3888 goto out; 3997 goto out;
3889 3998
3890 /* 3999 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3891 * We do synchronous shrinking since we don't actually unreserve 4000 flush_state);
3892 * metadata until after the IO is completed. 4001 flush_state++;
3893 */ 4002 if (!ret)
3894 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3895 if (ret < 0)
3896 goto out;
3897
3898 ret = 0;
3899
3900 /*
3901 * So if we were overcommitted it's possible that somebody else flushed
3902 * out enough space and we simply didn't have enough space to reclaim,
3903 * so go back around and try again.
3904 */
3905 if (retries < 2) {
3906 wait_ordered = true;
3907 retries++;
3908 goto again; 4003 goto again;
3909 } 4004 else if (flush_state <= COMMIT_TRANS)
3910
3911 ret = -ENOSPC;
3912 if (committed)
3913 goto out;
3914
3915 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3916 if (!ret) {
3917 committed = true;
3918 goto again; 4005 goto again;
3919 }
3920 4006
3921out: 4007out:
3922 if (flushing) { 4008 if (flushing) {
@@ -3934,7 +4020,10 @@ static struct btrfs_block_rsv *get_block_rsv(
3934{ 4020{
3935 struct btrfs_block_rsv *block_rsv = NULL; 4021 struct btrfs_block_rsv *block_rsv = NULL;
3936 4022
3937 if (root->ref_cows || root == root->fs_info->csum_root) 4023 if (root->ref_cows)
4024 block_rsv = trans->block_rsv;
4025
4026 if (root == root->fs_info->csum_root && trans->adding_csums)
3938 block_rsv = trans->block_rsv; 4027 block_rsv = trans->block_rsv;
3939 4028
3940 if (!block_rsv) 4029 if (!block_rsv)
@@ -4286,6 +4375,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4286void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4375void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4287 struct btrfs_root *root) 4376 struct btrfs_root *root)
4288{ 4377{
4378 if (!trans->block_rsv)
4379 return;
4380
4289 if (!trans->bytes_reserved) 4381 if (!trans->bytes_reserved)
4290 return; 4382 return;
4291 4383
@@ -4444,7 +4536,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4444 int ret; 4536 int ret;
4445 4537
4446 /* Need to be holding the i_mutex here if we aren't free space cache */ 4538 /* Need to be holding the i_mutex here if we aren't free space cache */
4447 if (btrfs_is_free_space_inode(root, inode)) 4539 if (btrfs_is_free_space_inode(inode))
4448 flush = 0; 4540 flush = 0;
4449 4541
4450 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4542 if (flush && btrfs_transaction_in_commit(root->fs_info))
@@ -4476,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4476 csum_bytes = BTRFS_I(inode)->csum_bytes; 4568 csum_bytes = BTRFS_I(inode)->csum_bytes;
4477 spin_unlock(&BTRFS_I(inode)->lock); 4569 spin_unlock(&BTRFS_I(inode)->lock);
4478 4570
4571 if (root->fs_info->quota_enabled) {
4572 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize);
4574 if (ret)
4575 return ret;
4576 }
4577
4479 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4578 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4480 if (ret) { 4579 if (ret) {
4481 u64 to_free = 0; 4580 u64 to_free = 0;
@@ -4554,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4554 4653
4555 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4654 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4556 btrfs_ino(inode), to_free, 0); 4655 btrfs_ino(inode), to_free, 0);
4656 if (root->fs_info->quota_enabled) {
4657 btrfs_qgroup_free(root, num_bytes +
4658 dropped * root->leafsize);
4659 }
4660
4557 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4661 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4558 to_free); 4662 to_free);
4559} 4663}
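Note the symmetry between the two hunks: the reserve side charges num_bytes plus nr_extents * leafsize against the qgroup, and the release side must free with the identical formula or the counters drift. A toy balance check under that assumption:

    #include <assert.h>

    static long long reserved;

    /* Charge and release must use the same formula or the counter drifts. */
    static void qgroup_reserve(long long bytes, int extents, int leafsize)
    {
        reserved += bytes + (long long)extents * leafsize;
    }

    static void qgroup_free(long long bytes, int dropped, int leafsize)
    {
        reserved -= bytes + (long long)dropped * leafsize;
    }

    int main(void)
    {
        qgroup_reserve(4096, 2, 4096);
        qgroup_free(4096, 2, 4096);
        assert(reserved == 0);    /* symmetric formulas cancel exactly */
        return 0;
    }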
@@ -5190,8 +5294,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5190 rb_erase(&head->node.rb_node, &delayed_refs->root); 5294 rb_erase(&head->node.rb_node, &delayed_refs->root);
5191 5295
5192 delayed_refs->num_entries--; 5296 delayed_refs->num_entries--;
5193 if (waitqueue_active(&delayed_refs->seq_wait)) 5297 smp_mb();
5194 wake_up(&delayed_refs->seq_wait); 5298 if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
5299 wake_up(&root->fs_info->tree_mod_seq_wait);
5195 5300
5196 /* 5301 /*
5197 * we don't take a ref on the node because we're removing it from the 5302 * we don't take a ref on the node because we're removing it from the
@@ -5748,7 +5853,11 @@ loop:
5748 ret = do_chunk_alloc(trans, root, num_bytes + 5853 ret = do_chunk_alloc(trans, root, num_bytes +
5749 2 * 1024 * 1024, data, 5854 2 * 1024 * 1024, data,
5750 CHUNK_ALLOC_LIMITED); 5855 CHUNK_ALLOC_LIMITED);
5751 if (ret < 0) { 5856 /*
5857 * Do not bail out on ENOSPC since we
5858 * can still try other ways to reclaim space.
5859 */
5860 if (ret < 0 && ret != -ENOSPC) {
5752 btrfs_abort_transaction(trans, 5861 btrfs_abort_transaction(trans,
5753 root, ret); 5862 root, ret);
5754 goto out; 5863 goto out;
@@ -5816,13 +5925,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5816again: 5925again:
5817 list_for_each_entry(cache, &info->block_groups[index], list) { 5926 list_for_each_entry(cache, &info->block_groups[index], list) {
5818 spin_lock(&cache->lock); 5927 spin_lock(&cache->lock);
5819 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 5928 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5820 "%llu pinned %llu reserved\n",
5821 (unsigned long long)cache->key.objectid, 5929 (unsigned long long)cache->key.objectid,
5822 (unsigned long long)cache->key.offset, 5930 (unsigned long long)cache->key.offset,
5823 (unsigned long long)btrfs_block_group_used(&cache->item), 5931 (unsigned long long)btrfs_block_group_used(&cache->item),
5824 (unsigned long long)cache->pinned, 5932 (unsigned long long)cache->pinned,
5825 (unsigned long long)cache->reserved); 5933 (unsigned long long)cache->reserved,
5934 cache->ro ? "[readonly]" : "");
5826 btrfs_dump_free_space(cache, bytes); 5935 btrfs_dump_free_space(cache, bytes);
5827 spin_unlock(&cache->lock); 5936 spin_unlock(&cache->lock);
5828 } 5937 }
@@ -7610,8 +7719,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7610 INIT_LIST_HEAD(&cache->list); 7719 INIT_LIST_HEAD(&cache->list);
7611 INIT_LIST_HEAD(&cache->cluster_list); 7720 INIT_LIST_HEAD(&cache->cluster_list);
7612 7721
7613 if (need_clear) 7722 if (need_clear) {
7723 /*
7724 * When we mount with an old space cache, we need to
7725 * set BTRFS_DC_CLEAR and set the dirty flag.
7726 *
7727 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7728 * truncate the old free space cache inode and
7729 * set up a new one.
7730 * b) Setting the 'dirty' flag makes sure that we flush
7731 * the new space cache info onto disk.
7732 */
7614 cache->disk_cache_state = BTRFS_DC_CLEAR; 7733 cache->disk_cache_state = BTRFS_DC_CLEAR;
7734 if (btrfs_test_opt(root, SPACE_CACHE))
7735 cache->dirty = 1;
7736 }
7615 7737
7616 read_extent_buffer(leaf, &cache->item, 7738 read_extent_buffer(leaf, &cache->item,
7617 btrfs_item_ptr_offset(leaf, path->slots[0]), 7739 btrfs_item_ptr_offset(leaf, path->slots[0]),
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index deafe19c34b..45c81bb4ac8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1919,7 +1919,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1919 return -EIO; 1919 return -EIO;
1920 } 1920 }
1921 1921
1922 printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 1922 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
1923 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 1923 "(dev %s sector %llu)\n", page->mapping->host->i_ino,
1924 start, rcu_str_deref(dev->name), sector); 1924 start, rcu_str_deref(dev->name), sector);
1925 1925
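Switching to the ratelimited variant matters because a failing device can generate these corrected-read messages at I/O rate and flood the log. A minimal interval-based limiter, just to show the idea (the kernel's limiter is burst-based; this is only the shape):

    #include <stdio.h>
    #include <time.h>

    /* Interval limiter: allow one message per `interval` seconds. */
    static int ratelimited(time_t *last, int interval)
    {
        time_t now = time(NULL);

        if (now - *last < interval)
            return 0;    /* suppress */
        *last = now;
        return 1;
    }

    int main(void)
    {
        time_t last = 0;

        for (int i = 0; i < 5; i++)
            if (ratelimited(&last, 5))
                printf("read error corrected (msg %d)\n", i);
        return 0;
    }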
@@ -3078,8 +3078,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3078 } 3078 }
3079 } 3079 }
3080 3080
3081 /*
3082 * We need to do this to prevent races with anyone checking whether the eb is
3083 * under IO, since we can end up having no IO bits set for a short period
3084 * of time.
3085 */
3086 spin_lock(&eb->refs_lock);
3081 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3087 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3082 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3088 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3089 spin_unlock(&eb->refs_lock);
3083 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3090 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3084 spin_lock(&fs_info->delalloc_lock); 3091 spin_lock(&fs_info->delalloc_lock);
3085 if (fs_info->dirty_metadata_bytes >= eb->len) 3092 if (fs_info->dirty_metadata_bytes >= eb->len)
@@ -3088,6 +3095,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3088 WARN_ON(1); 3095 WARN_ON(1);
3089 spin_unlock(&fs_info->delalloc_lock); 3096 spin_unlock(&fs_info->delalloc_lock);
3090 ret = 1; 3097 ret = 1;
3098 } else {
3099 spin_unlock(&eb->refs_lock);
3091 } 3100 }
3092 3101
3093 btrfs_tree_unlock(eb); 3102 btrfs_tree_unlock(eb);
@@ -3558,19 +3567,38 @@ int extent_readpages(struct extent_io_tree *tree,
3558 struct bio *bio = NULL; 3567 struct bio *bio = NULL;
3559 unsigned page_idx; 3568 unsigned page_idx;
3560 unsigned long bio_flags = 0; 3569 unsigned long bio_flags = 0;
3570 struct page *pagepool[16];
3571 struct page *page;
3572 int i = 0;
3573 int nr = 0;
3561 3574
3562 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3575 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3563 struct page *page = list_entry(pages->prev, struct page, lru); 3576 page = list_entry(pages->prev, struct page, lru);
3564 3577
3565 prefetchw(&page->flags); 3578 prefetchw(&page->flags);
3566 list_del(&page->lru); 3579 list_del(&page->lru);
3567 if (!add_to_page_cache_lru(page, mapping, 3580 if (add_to_page_cache_lru(page, mapping,
3568 page->index, GFP_NOFS)) { 3581 page->index, GFP_NOFS)) {
3569 __extent_read_full_page(tree, page, get_extent, 3582 page_cache_release(page);
3570 &bio, 0, &bio_flags); 3583 continue;
3571 } 3584 }
3572 page_cache_release(page); 3585
3586 pagepool[nr++] = page;
3587 if (nr < ARRAY_SIZE(pagepool))
3588 continue;
3589 for (i = 0; i < nr; i++) {
3590 __extent_read_full_page(tree, pagepool[i], get_extent,
3591 &bio, 0, &bio_flags);
3592 page_cache_release(pagepool[i]);
3593 }
3594 nr = 0;
3595 }
3596 for (i = 0; i < nr; i++) {
3597 __extent_read_full_page(tree, pagepool[i], get_extent,
3598 &bio, 0, &bio_flags);
3599 page_cache_release(pagepool[i]);
3573 } 3600 }
3601
3574 BUG_ON(!list_empty(pages)); 3602 BUG_ON(!list_empty(pages));
3575 if (bio) 3603 if (bio)
3576 return submit_one_bio(READ, bio, 0, bio_flags); 3604 return submit_one_bio(READ, bio, 0, bio_flags);
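The rewrite batches pages into a small on-stack pool and submits them sixteen at a time, with a final pass for the partial tail. The shape, reduced to a runnable toy (process() stands in for __extent_read_full_page()):

    #include <stdio.h>

    #define POOL 16

    static void process(int page) { printf("reading page %d\n", page); }

    int main(void)
    {
        int pool[POOL];
        int nr = 0;

        for (int page = 0; page < 40; page++) {
            pool[nr++] = page;
            if (nr < POOL)
                continue;
            for (int i = 0; i < nr; i++)    /* full batch: flush it */
                process(pool[i]);
            nr = 0;
        }
        for (int i = 0; i < nr; i++)        /* drain the partial tail */
            process(pool[i]);
        return 0;
    }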
@@ -4124,11 +4152,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4124 * So bump the ref count first, then set the bit. If someone 4152 * So bump the ref count first, then set the bit. If someone
4125 * beat us to it, drop the ref we added. 4153 * beat us to it, drop the ref we added.
4126 */ 4154 */
4127 if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4155 spin_lock(&eb->refs_lock);
4156 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4128 atomic_inc(&eb->refs); 4157 atomic_inc(&eb->refs);
4129 if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4158 spin_unlock(&eb->refs_lock);
4130 atomic_dec(&eb->refs);
4131 }
4132} 4159}
4133 4160
4134static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4161static void mark_extent_buffer_accessed(struct extent_buffer *eb)
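The old check-then-increment could race: two callers both see the bit clear, both bump the count, and one has to decrement again. Taking the reference only when our own test-and-set actually set the bit closes that window. A toy single-threaded illustration with C11 atomics (the names are invented; the kernel version uses test_and_set_bit under eb->refs_lock):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag tree_ref = ATOMIC_FLAG_INIT;
    static atomic_int refs = 1;

    /* Take the tree's reference only if we are the caller whose
     * test-and-set actually set the flag; later callers are no-ops. */
    static void take_tree_ref(void)
    {
        if (!atomic_flag_test_and_set(&tree_ref))
            atomic_fetch_add(&refs, 1);
    }

    int main(void)
    {
        take_tree_ref();
        take_tree_ref();    /* second call must not bump the count */
        printf("refs=%d\n", atomic_load(&refs));
        return 0;
    }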
@@ -4240,9 +4267,7 @@ again:
4240 goto free_eb; 4267 goto free_eb;
4241 } 4268 }
4242 /* add one reference for the tree */ 4269 /* add one reference for the tree */
4243 spin_lock(&eb->refs_lock);
4244 check_buffer_tree_ref(eb); 4270 check_buffer_tree_ref(eb);
4245 spin_unlock(&eb->refs_lock);
4246 spin_unlock(&tree->buffer_lock); 4271 spin_unlock(&tree->buffer_lock);
4247 radix_tree_preload_end(); 4272 radix_tree_preload_end();
4248 4273
@@ -4301,7 +4326,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4301} 4326}
4302 4327
4303/* Expects to have eb->eb_lock already held */ 4328/* Expects to have eb->eb_lock already held */
4304static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) 4329static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4305{ 4330{
4306 WARN_ON(atomic_read(&eb->refs) == 0); 4331 WARN_ON(atomic_read(&eb->refs) == 0);
4307 if (atomic_dec_and_test(&eb->refs)) { 4332 if (atomic_dec_and_test(&eb->refs)) {
@@ -4322,9 +4347,11 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4322 btrfs_release_extent_buffer_page(eb, 0); 4347 btrfs_release_extent_buffer_page(eb, 0);
4323 4348
4324 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4349 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4325 return; 4350 return 1;
4326 } 4351 }
4327 spin_unlock(&eb->refs_lock); 4352 spin_unlock(&eb->refs_lock);
4353
4354 return 0;
4328} 4355}
4329 4356
4330void free_extent_buffer(struct extent_buffer *eb) 4357void free_extent_buffer(struct extent_buffer *eb)
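Returning whether the final reference was dropped lets try_release_extent_buffer() report releasability directly instead of guessing. A minimal refcount sketch under that assumption (plain C11 atomics rather than the kernel's atomic_t and refs_lock combination):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct buf { atomic_int refs; };

    /* Return 1 when the last reference went away and the object was
     * freed, so the caller can report whether the page is releasable. */
    static int release_buf(struct buf *b)
    {
        if (atomic_fetch_sub(&b->refs, 1) == 1) {
            free(b);
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct buf *b = malloc(sizeof(*b));

        atomic_init(&b->refs, 2);
        printf("%d\n", release_buf(b));    /* 0: still referenced */
        printf("%d\n", release_buf(b));    /* 1: last ref, freed */
        return 0;
    }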
@@ -4963,7 +4990,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
4963 spin_unlock(&eb->refs_lock); 4990 spin_unlock(&eb->refs_lock);
4964 return 0; 4991 return 0;
4965 } 4992 }
4966 release_extent_buffer(eb, mask);
4967 4993
4968 return 1; 4994 return release_extent_buffer(eb, mask);
4969} 4995}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5d158d32023..b45b9de0c21 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -183,7 +183,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
183 * read from the commit root and sidestep a nasty deadlock 183 * read from the commit root and sidestep a nasty deadlock
184 * between reading the free space cache and updating the csum tree. 184 * between reading the free space cache and updating the csum tree.
185 */ 185 */
186 if (btrfs_is_free_space_inode(root, inode)) { 186 if (btrfs_is_free_space_inode(inode)) {
187 path->search_commit_root = 1; 187 path->search_commit_root = 1;
188 path->skip_locking = 1; 188 path->skip_locking = 1;
189 } 189 }
@@ -690,6 +690,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
690 return -ENOMEM; 690 return -ENOMEM;
691 691
692 sector_sum = sums->sums; 692 sector_sum = sums->sums;
693 trans->adding_csums = 1;
693again: 694again:
694 next_offset = (u64)-1; 695 next_offset = (u64)-1;
695 found_next = 0; 696 found_next = 0;
@@ -853,6 +854,7 @@ next_sector:
853 goto again; 854 goto again;
854 } 855 }
855out: 856out:
857 trans->adding_csums = 0;
856 btrfs_free_path(path); 858 btrfs_free_path(path);
857 return ret; 859 return ret;
858 860
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6c4e2baa929..6b10acfc2f5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1968,7 +1968,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
1968 1968
1969 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 1969 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
1970 info = rb_entry(n, struct btrfs_free_space, offset_index); 1970 info = rb_entry(n, struct btrfs_free_space, offset_index);
1971 if (info->bytes >= bytes) 1971 if (info->bytes >= bytes && !block_group->ro)
1972 count++; 1972 count++;
1973 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", 1973 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
1974 (unsigned long long)info->offset, 1974 (unsigned long long)info->offset,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb8d671d00e..48bdfd2591c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -825,7 +825,7 @@ static noinline int cow_file_range(struct inode *inode,
825 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 825 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
826 int ret = 0; 826 int ret = 0;
827 827
828 BUG_ON(btrfs_is_free_space_inode(root, inode)); 828 BUG_ON(btrfs_is_free_space_inode(inode));
829 trans = btrfs_join_transaction(root); 829 trans = btrfs_join_transaction(root);
830 if (IS_ERR(trans)) { 830 if (IS_ERR(trans)) {
831 extent_clear_unlock_delalloc(inode, 831 extent_clear_unlock_delalloc(inode,
@@ -1010,7 +1010,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); 1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
1011 1011
1012 if (atomic_read(&root->fs_info->async_delalloc_pages) < 1012 if (atomic_read(&root->fs_info->async_delalloc_pages) <
1013 5 * 1042 * 1024 && 1013 5 * 1024 * 1024 &&
1014 waitqueue_active(&root->fs_info->async_submit_wait)) 1014 waitqueue_active(&root->fs_info->async_submit_wait))
1015 wake_up(&root->fs_info->async_submit_wait); 1015 wake_up(&root->fs_info->async_submit_wait);
1016 1016
@@ -1035,7 +1035,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1035 struct btrfs_root *root = BTRFS_I(inode)->root; 1035 struct btrfs_root *root = BTRFS_I(inode)->root;
1036 unsigned long nr_pages; 1036 unsigned long nr_pages;
1037 u64 cur_end; 1037 u64 cur_end;
1038 int limit = 10 * 1024 * 1042; 1038 int limit = 10 * 1024 * 1024;
1039 1039
1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1041 1, 0, NULL, GFP_NOFS); 1041 1, 0, NULL, GFP_NOFS);
@@ -1153,7 +1153,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1153 return -ENOMEM; 1153 return -ENOMEM;
1154 } 1154 }
1155 1155
1156 nolock = btrfs_is_free_space_inode(root, inode); 1156 nolock = btrfs_is_free_space_inode(inode);
1157 1157
1158 if (nolock) 1158 if (nolock)
1159 trans = btrfs_join_transaction_nolock(root); 1159 trans = btrfs_join_transaction_nolock(root);
@@ -1466,7 +1466,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
1466 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1466 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1467 struct btrfs_root *root = BTRFS_I(inode)->root; 1467 struct btrfs_root *root = BTRFS_I(inode)->root;
1468 u64 len = state->end + 1 - state->start; 1468 u64 len = state->end + 1 - state->start;
1469 bool do_list = !btrfs_is_free_space_inode(root, inode); 1469 bool do_list = !btrfs_is_free_space_inode(inode);
1470 1470
1471 if (*bits & EXTENT_FIRST_DELALLOC) { 1471 if (*bits & EXTENT_FIRST_DELALLOC) {
1472 *bits &= ~EXTENT_FIRST_DELALLOC; 1472 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1501,7 +1501,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1501 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1501 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1502 struct btrfs_root *root = BTRFS_I(inode)->root; 1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 u64 len = state->end + 1 - state->start; 1503 u64 len = state->end + 1 - state->start;
1504 bool do_list = !btrfs_is_free_space_inode(root, inode); 1504 bool do_list = !btrfs_is_free_space_inode(inode);
1505 1505
1506 if (*bits & EXTENT_FIRST_DELALLOC) { 1506 if (*bits & EXTENT_FIRST_DELALLOC) {
1507 *bits &= ~EXTENT_FIRST_DELALLOC; 1507 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1612,7 +1612,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1612 1612
1613 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1613 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1614 1614
1615 if (btrfs_is_free_space_inode(root, inode)) 1615 if (btrfs_is_free_space_inode(inode))
1616 metadata = 2; 1616 metadata = 2;
1617 1617
1618 if (!(rw & REQ_WRITE)) { 1618 if (!(rw & REQ_WRITE)) {
@@ -1869,7 +1869,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1869 int ret; 1869 int ret;
1870 bool nolock; 1870 bool nolock;
1871 1871
1872 nolock = btrfs_is_free_space_inode(root, inode); 1872 nolock = btrfs_is_free_space_inode(inode);
1873 1873
1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1875 ret = -EIO; 1875 ret = -EIO;
@@ -2007,7 +2007,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2007 ordered_extent->work.func = finish_ordered_fn; 2007 ordered_extent->work.func = finish_ordered_fn;
2008 ordered_extent->work.flags = 0; 2008 ordered_extent->work.flags = 0;
2009 2009
2010 if (btrfs_is_free_space_inode(root, inode)) 2010 if (btrfs_is_free_space_inode(inode))
2011 workers = &root->fs_info->endio_freespace_worker; 2011 workers = &root->fs_info->endio_freespace_worker;
2012 else 2012 else
2013 workers = &root->fs_info->endio_write_workers; 2013 workers = &root->fs_info->endio_write_workers;
@@ -2732,8 +2732,10 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2732 * The data relocation inode should also be directly updated 2732 * The data relocation inode should also be directly updated
2733 * without delay 2733 * without delay
2734 */ 2734 */
2735 if (!btrfs_is_free_space_inode(root, inode) 2735 if (!btrfs_is_free_space_inode(inode)
2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2737 btrfs_update_root_times(trans, root);
2738
2737 ret = btrfs_delayed_update_inode(trans, root, inode); 2739 ret = btrfs_delayed_update_inode(trans, root, inode);
2738 if (!ret) 2740 if (!ret)
2739 btrfs_set_inode_last_trans(trans, inode); 2741 btrfs_set_inode_last_trans(trans, inode);
@@ -2833,7 +2835,7 @@ err:
2833 inode_inc_iversion(inode); 2835 inode_inc_iversion(inode);
2834 inode_inc_iversion(dir); 2836 inode_inc_iversion(dir);
2835 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2837 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2836 btrfs_update_inode(trans, root, dir); 2838 ret = btrfs_update_inode(trans, root, dir);
2837out: 2839out:
2838 return ret; 2840 return ret;
2839} 2841}
@@ -3743,7 +3745,7 @@ void btrfs_evict_inode(struct inode *inode)
3743 3745
3744 truncate_inode_pages(&inode->i_data, 0); 3746 truncate_inode_pages(&inode->i_data, 0);
3745 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3747 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3746 btrfs_is_free_space_inode(root, inode))) 3748 btrfs_is_free_space_inode(inode)))
3747 goto no_delete; 3749 goto no_delete;
3748 3750
3749 if (is_bad_inode(inode)) { 3751 if (is_bad_inode(inode)) {
@@ -4082,7 +4084,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
4082 struct btrfs_iget_args *args = p; 4084 struct btrfs_iget_args *args = p;
4083 inode->i_ino = args->ino; 4085 inode->i_ino = args->ino;
4084 BTRFS_I(inode)->root = args->root; 4086 BTRFS_I(inode)->root = args->root;
4085 btrfs_set_inode_space_info(args->root, inode);
4086 return 0; 4087 return 0;
4087} 4088}
4088 4089
@@ -4457,7 +4458,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4457 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4458 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4458 return 0; 4459 return 0;
4459 4460
4460 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4461 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
4461 nolock = true; 4462 nolock = true;
4462 4463
4463 if (wbc->sync_mode == WB_SYNC_ALL) { 4464 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4518,6 +4519,11 @@ int btrfs_dirty_inode(struct inode *inode)
4518static int btrfs_update_time(struct inode *inode, struct timespec *now, 4519static int btrfs_update_time(struct inode *inode, struct timespec *now,
4519 int flags) 4520 int flags)
4520{ 4521{
4522 struct btrfs_root *root = BTRFS_I(inode)->root;
4523
4524 if (btrfs_root_readonly(root))
4525 return -EROFS;
4526
4521 if (flags & S_VERSION) 4527 if (flags & S_VERSION)
4522 inode_inc_iversion(inode); 4528 inode_inc_iversion(inode);
4523 if (flags & S_CTIME) 4529 if (flags & S_CTIME)
@@ -4662,7 +4668,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4662 BTRFS_I(inode)->root = root; 4668 BTRFS_I(inode)->root = root;
4663 BTRFS_I(inode)->generation = trans->transid; 4669 BTRFS_I(inode)->generation = trans->transid;
4664 inode->i_generation = BTRFS_I(inode)->generation; 4670 inode->i_generation = BTRFS_I(inode)->generation;
4665 btrfs_set_inode_space_info(root, inode);
4666 4671
4667 if (S_ISDIR(mode)) 4672 if (S_ISDIR(mode))
4668 owner = 0; 4673 owner = 0;
@@ -4690,6 +4695,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4690 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4695 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4691 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4696 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4692 struct btrfs_inode_item); 4697 struct btrfs_inode_item);
4698 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
4699 sizeof(*inode_item));
4693 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4700 fill_inode_item(trans, path->nodes[0], inode_item, inode);
4694 4701
4695 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4702 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
@@ -4723,6 +4730,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4723 trace_btrfs_inode_new(inode); 4730 trace_btrfs_inode_new(inode);
4724 btrfs_set_inode_last_trans(trans, inode); 4731 btrfs_set_inode_last_trans(trans, inode);
4725 4732
4733 btrfs_update_root_times(trans, root);
4734
4726 return inode; 4735 return inode;
4727fail: 4736fail:
4728 if (dir) 4737 if (dir)
@@ -6939,7 +6948,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6939 return NULL; 6948 return NULL;
6940 6949
6941 ei->root = NULL; 6950 ei->root = NULL;
6942 ei->space_info = NULL;
6943 ei->generation = 0; 6951 ei->generation = 0;
6944 ei->last_trans = 0; 6952 ei->last_trans = 0;
6945 ei->last_sub_trans = 0; 6953 ei->last_sub_trans = 0;
@@ -7046,7 +7054,7 @@ int btrfs_drop_inode(struct inode *inode)
7046 struct btrfs_root *root = BTRFS_I(inode)->root; 7054 struct btrfs_root *root = BTRFS_I(inode)->root;
7047 7055
7048 if (btrfs_root_refs(&root->root_item) == 0 && 7056 if (btrfs_root_refs(&root->root_item) == 0 &&
7049 !btrfs_is_free_space_inode(root, inode)) 7057 !btrfs_is_free_space_inode(inode))
7050 return 1; 7058 return 1;
7051 else 7059 else
7052 return generic_drop_inode(inode); 7060 return generic_drop_inode(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1e9f6c019ad..43f0012016e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -41,6 +41,7 @@
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h>
44#include "compat.h" 45#include "compat.h"
45#include "ctree.h" 46#include "ctree.h"
46#include "disk-io.h" 47#include "disk-io.h"
@@ -53,6 +54,7 @@
53#include "inode-map.h" 54#include "inode-map.h"
54#include "backref.h" 55#include "backref.h"
55#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h"
56 58
57/* Mask out flags that are inappropriate for the given type of inode. */ 59/* Mask out flags that are inappropriate for the given type of inode. */
58static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -336,7 +338,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
336static noinline int create_subvol(struct btrfs_root *root, 338static noinline int create_subvol(struct btrfs_root *root,
337 struct dentry *dentry, 339 struct dentry *dentry,
338 char *name, int namelen, 340 char *name, int namelen,
339 u64 *async_transid) 341 u64 *async_transid,
342 struct btrfs_qgroup_inherit **inherit)
340{ 343{
341 struct btrfs_trans_handle *trans; 344 struct btrfs_trans_handle *trans;
342 struct btrfs_key key; 345 struct btrfs_key key;
@@ -346,11 +349,13 @@ static noinline int create_subvol(struct btrfs_root *root,
346 struct btrfs_root *new_root; 349 struct btrfs_root *new_root;
347 struct dentry *parent = dentry->d_parent; 350 struct dentry *parent = dentry->d_parent;
348 struct inode *dir; 351 struct inode *dir;
352 struct timespec cur_time = CURRENT_TIME;
349 int ret; 353 int ret;
350 int err; 354 int err;
351 u64 objectid; 355 u64 objectid;
352 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 356 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
353 u64 index = 0; 357 u64 index = 0;
358 uuid_le new_uuid;
354 359
355 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 360 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
356 if (ret) 361 if (ret)
@@ -368,6 +373,11 @@ static noinline int create_subvol(struct btrfs_root *root,
368 if (IS_ERR(trans)) 373 if (IS_ERR(trans))
369 return PTR_ERR(trans); 374 return PTR_ERR(trans);
370 375
376 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
377 inherit ? *inherit : NULL);
378 if (ret)
379 goto fail;
380
371 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 381 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
372 0, objectid, NULL, 0, 0, 0); 382 0, objectid, NULL, 0, 0, 0);
373 if (IS_ERR(leaf)) { 383 if (IS_ERR(leaf)) {
@@ -389,8 +399,9 @@ static noinline int create_subvol(struct btrfs_root *root,
389 BTRFS_UUID_SIZE); 399 BTRFS_UUID_SIZE);
390 btrfs_mark_buffer_dirty(leaf); 400 btrfs_mark_buffer_dirty(leaf);
391 401
402 memset(&root_item, 0, sizeof(root_item));
403
392 inode_item = &root_item.inode; 404 inode_item = &root_item.inode;
393 memset(inode_item, 0, sizeof(*inode_item));
394 inode_item->generation = cpu_to_le64(1); 405 inode_item->generation = cpu_to_le64(1);
395 inode_item->size = cpu_to_le64(3); 406 inode_item->size = cpu_to_le64(3);
396 inode_item->nlink = cpu_to_le32(1); 407 inode_item->nlink = cpu_to_le32(1);
@@ -408,8 +419,15 @@ static noinline int create_subvol(struct btrfs_root *root,
408 btrfs_set_root_used(&root_item, leaf->len); 419 btrfs_set_root_used(&root_item, leaf->len);
409 btrfs_set_root_last_snapshot(&root_item, 0); 420 btrfs_set_root_last_snapshot(&root_item, 0);
410 421
411 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 422 btrfs_set_root_generation_v2(&root_item,
412 root_item.drop_level = 0; 423 btrfs_root_generation(&root_item));
424 uuid_le_gen(&new_uuid);
425 memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
426 root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
427 root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec);
428 root_item.ctime = root_item.otime;
429 btrfs_set_root_ctransid(&root_item, trans->transid);
430 btrfs_set_root_otransid(&root_item, trans->transid);
413 431
414 btrfs_tree_unlock(leaf); 432 btrfs_tree_unlock(leaf);
415 free_extent_buffer(leaf); 433 free_extent_buffer(leaf);
@@ -484,7 +502,7 @@ fail:
484 502
485static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 503static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
486 char *name, int namelen, u64 *async_transid, 504 char *name, int namelen, u64 *async_transid,
487 bool readonly) 505 bool readonly, struct btrfs_qgroup_inherit **inherit)
488{ 506{
489 struct inode *inode; 507 struct inode *inode;
490 struct btrfs_pending_snapshot *pending_snapshot; 508 struct btrfs_pending_snapshot *pending_snapshot;
@@ -502,6 +520,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
502 pending_snapshot->dentry = dentry; 520 pending_snapshot->dentry = dentry;
503 pending_snapshot->root = root; 521 pending_snapshot->root = root;
504 pending_snapshot->readonly = readonly; 522 pending_snapshot->readonly = readonly;
523 if (inherit) {
524 pending_snapshot->inherit = *inherit;
525 *inherit = NULL; /* take responsibility to free it */
526 }
505 527
506 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
507 if (IS_ERR(trans)) { 529 if (IS_ERR(trans)) {
@@ -635,7 +657,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
635static noinline int btrfs_mksubvol(struct path *parent, 657static noinline int btrfs_mksubvol(struct path *parent,
636 char *name, int namelen, 658 char *name, int namelen,
637 struct btrfs_root *snap_src, 659 struct btrfs_root *snap_src,
638 u64 *async_transid, bool readonly) 660 u64 *async_transid, bool readonly,
661 struct btrfs_qgroup_inherit **inherit)
639{ 662{
640 struct inode *dir = parent->dentry->d_inode; 663 struct inode *dir = parent->dentry->d_inode;
641 struct dentry *dentry; 664 struct dentry *dentry;
@@ -652,13 +675,9 @@ static noinline int btrfs_mksubvol(struct path *parent,
652 if (dentry->d_inode) 675 if (dentry->d_inode)
653 goto out_dput; 676 goto out_dput;
654 677
655 error = mnt_want_write(parent->mnt);
656 if (error)
657 goto out_dput;
658
659 error = btrfs_may_create(dir, dentry); 678 error = btrfs_may_create(dir, dentry);
660 if (error) 679 if (error)
661 goto out_drop_write; 680 goto out_dput;
662 681
663 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 682 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
664 683
@@ -666,18 +685,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
666 goto out_up_read; 685 goto out_up_read;
667 686
668 if (snap_src) { 687 if (snap_src) {
669 error = create_snapshot(snap_src, dentry, 688 error = create_snapshot(snap_src, dentry, name, namelen,
670 name, namelen, async_transid, readonly); 689 async_transid, readonly, inherit);
671 } else { 690 } else {
672 error = create_subvol(BTRFS_I(dir)->root, dentry, 691 error = create_subvol(BTRFS_I(dir)->root, dentry,
673 name, namelen, async_transid); 692 name, namelen, async_transid, inherit);
674 } 693 }
675 if (!error) 694 if (!error)
676 fsnotify_mkdir(dir, dentry); 695 fsnotify_mkdir(dir, dentry);
677out_up_read: 696out_up_read:
678 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 697 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
679out_drop_write:
680 mnt_drop_write(parent->mnt);
681out_dput: 698out_dput:
682 dput(dentry); 699 dput(dentry);
683out_unlock: 700out_unlock:
@@ -832,7 +849,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
832} 849}
833 850
834static int should_defrag_range(struct inode *inode, u64 start, int thresh, 851static int should_defrag_range(struct inode *inode, u64 start, int thresh,
835 u64 *last_len, u64 *skip, u64 *defrag_end) 852 u64 *last_len, u64 *skip, u64 *defrag_end,
853 int compress)
836{ 854{
837 struct extent_map *em; 855 struct extent_map *em;
838 int ret = 1; 856 int ret = 1;
@@ -863,7 +881,7 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
863 * we hit a real extent; if it is big or the next extent is not a 881
864 * real extent, don't bother defragging it 882
865 */ 883 */
866 if ((*last_len == 0 || *last_len >= thresh) && 884 if (!compress && (*last_len == 0 || *last_len >= thresh) &&
867 (em->len >= thresh || !next_mergeable)) 885 (em->len >= thresh || !next_mergeable))
868 ret = 0; 886 ret = 0;
869out: 887out:
@@ -1047,11 +1065,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1047 u64 newer_than, unsigned long max_to_defrag) 1065 u64 newer_than, unsigned long max_to_defrag)
1048{ 1066{
1049 struct btrfs_root *root = BTRFS_I(inode)->root; 1067 struct btrfs_root *root = BTRFS_I(inode)->root;
1050 struct btrfs_super_block *disk_super;
1051 struct file_ra_state *ra = NULL; 1068 struct file_ra_state *ra = NULL;
1052 unsigned long last_index; 1069 unsigned long last_index;
1053 u64 isize = i_size_read(inode); 1070 u64 isize = i_size_read(inode);
1054 u64 features;
1055 u64 last_len = 0; 1071 u64 last_len = 0;
1056 u64 skip = 0; 1072 u64 skip = 0;
1057 u64 defrag_end = 0; 1073 u64 defrag_end = 0;
@@ -1145,7 +1161,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1145 1161
1146 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1162 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1147 extent_thresh, &last_len, &skip, 1163 extent_thresh, &last_len, &skip,
1148 &defrag_end)) { 1164 &defrag_end, range->flags &
1165 BTRFS_DEFRAG_RANGE_COMPRESS)) {
1149 unsigned long next; 1166 unsigned long next;
1150 /* 1167 /*
1151 * the should_defrag function tells us how much to skip 1168 * the should_defrag function tells us how much to skip
@@ -1237,11 +1254,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1237 mutex_unlock(&inode->i_mutex); 1254 mutex_unlock(&inode->i_mutex);
1238 } 1255 }
1239 1256
1240 disk_super = root->fs_info->super_copy;
1241 features = btrfs_super_incompat_flags(disk_super);
1242 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1257 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1243 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1258 btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
1244 btrfs_set_super_incompat_flags(disk_super, features);
1245 } 1259 }
1246 1260
1247 ret = defrag_count; 1261 ret = defrag_count;
@@ -1379,41 +1393,39 @@ out:
1379} 1393}
1380 1394
1381static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1395static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1382 char *name, 1396 char *name, unsigned long fd, int subvol,
1383 unsigned long fd, 1397 u64 *transid, bool readonly,
1384 int subvol, 1398 struct btrfs_qgroup_inherit **inherit)
1385 u64 *transid,
1386 bool readonly)
1387{ 1399{
1388 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1389 struct file *src_file; 1400 struct file *src_file;
1390 int namelen; 1401 int namelen;
1391 int ret = 0; 1402 int ret = 0;
1392 1403
1393 if (root->fs_info->sb->s_flags & MS_RDONLY) 1404 ret = mnt_want_write_file(file);
1394 return -EROFS; 1405 if (ret)
1406 goto out;
1395 1407
1396 namelen = strlen(name); 1408 namelen = strlen(name);
1397 if (strchr(name, '/')) { 1409 if (strchr(name, '/')) {
1398 ret = -EINVAL; 1410 ret = -EINVAL;
1399 goto out; 1411 goto out_drop_write;
1400 } 1412 }
1401 1413
1402 if (name[0] == '.' && 1414 if (name[0] == '.' &&
1403 (namelen == 1 || (name[1] == '.' && namelen == 2))) { 1415 (namelen == 1 || (name[1] == '.' && namelen == 2))) {
1404 ret = -EEXIST; 1416 ret = -EEXIST;
1405 goto out; 1417 goto out_drop_write;
1406 } 1418 }
1407 1419
1408 if (subvol) { 1420 if (subvol) {
1409 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1421 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1410 NULL, transid, readonly); 1422 NULL, transid, readonly, inherit);
1411 } else { 1423 } else {
1412 struct inode *src_inode; 1424 struct inode *src_inode;
1413 src_file = fget(fd); 1425 src_file = fget(fd);
1414 if (!src_file) { 1426 if (!src_file) {
1415 ret = -EINVAL; 1427 ret = -EINVAL;
1416 goto out; 1428 goto out_drop_write;
1417 } 1429 }
1418 1430
1419 src_inode = src_file->f_path.dentry->d_inode; 1431 src_inode = src_file->f_path.dentry->d_inode;
@@ -1422,13 +1434,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1422 "another FS\n"); 1434 "another FS\n");
1423 ret = -EINVAL; 1435 ret = -EINVAL;
1424 fput(src_file); 1436 fput(src_file);
1425 goto out; 1437 goto out_drop_write;
1426 } 1438 }
1427 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1439 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1428 BTRFS_I(src_inode)->root, 1440 BTRFS_I(src_inode)->root,
1429 transid, readonly); 1441 transid, readonly, inherit);
1430 fput(src_file); 1442 fput(src_file);
1431 } 1443 }
1444out_drop_write:
1445 mnt_drop_write_file(file);
1432out: 1446out:
1433 return ret; 1447 return ret;
1434} 1448}
@@ -1446,7 +1460,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
1446 1460
1447 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1461 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1448 vol_args->fd, subvol, 1462 vol_args->fd, subvol,
1449 NULL, false); 1463 NULL, false, NULL);
1450 1464
1451 kfree(vol_args); 1465 kfree(vol_args);
1452 return ret; 1466 return ret;
@@ -1460,6 +1474,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1460 u64 transid = 0; 1474 u64 transid = 0;
1461 u64 *ptr = NULL; 1475 u64 *ptr = NULL;
1462 bool readonly = false; 1476 bool readonly = false;
1477 struct btrfs_qgroup_inherit *inherit = NULL;
1463 1478
1464 vol_args = memdup_user(arg, sizeof(*vol_args)); 1479 vol_args = memdup_user(arg, sizeof(*vol_args));
1465 if (IS_ERR(vol_args)) 1480 if (IS_ERR(vol_args))
@@ -1467,7 +1482,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1467 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 1482 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1468 1483
1469 if (vol_args->flags & 1484 if (vol_args->flags &
1470 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { 1485 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1486 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1471 ret = -EOPNOTSUPP; 1487 ret = -EOPNOTSUPP;
1472 goto out; 1488 goto out;
1473 } 1489 }
@@ -1476,10 +1492,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1476 ptr = &transid; 1492 ptr = &transid;
1477 if (vol_args->flags & BTRFS_SUBVOL_RDONLY) 1493 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1478 readonly = true; 1494 readonly = true;
1495 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1496 if (vol_args->size > PAGE_CACHE_SIZE) {
1497 ret = -EINVAL;
1498 goto out;
1499 }
1500 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1501 if (IS_ERR(inherit)) {
1502 ret = PTR_ERR(inherit);
1503 goto out;
1504 }
1505 }
1479 1506
1480 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1507 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1481 vol_args->fd, subvol, 1508 vol_args->fd, subvol, ptr,
1482 ptr, readonly); 1509 readonly, &inherit);
1483 1510
1484 if (ret == 0 && ptr && 1511 if (ret == 0 && ptr &&
1485 copy_to_user(arg + 1512 copy_to_user(arg +
@@ -1488,6 +1515,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1488 ret = -EFAULT; 1515 ret = -EFAULT;
1489out: 1516out:
1490 kfree(vol_args); 1517 kfree(vol_args);
1518 kfree(inherit);
1491 return ret; 1519 return ret;
1492} 1520}
1493 1521
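
(For orientation, a minimal userspace sketch of the new interface -- an editorial illustration, not part of this diff. It creates a read-only snapshot attached to qgroup 1/100 at creation time, using the btrfs_ioctl_vol_args_v2 union and btrfs_qgroup_inherit layout added to ioctl.h further down; the "ioctl.h" include path and the helper name are assumptions, btrfs-progs ships a copy of the kernel header.)

#include <string.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* copy of fs/btrfs/ioctl.h as shipped with btrfs-progs */

/* hypothetical helper: snapshot src_fd as destdir_fd/name under qgroup 1/100 */
static int snap_into_qgroup(int destdir_fd, int src_fd, const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;
	struct btrfs_qgroup_inherit *inherit;
	size_t sz = sizeof(*inherit) + sizeof(__u64);	/* room for one qgroup id */
	int ret;

	inherit = calloc(1, sz);
	if (!inherit)
		return -1;
	inherit->num_qgroups = 1;
	inherit->qgroups[0] = (1ULL << 48) | 100;	/* qgroup ids carry the level in the top 16 bits */

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;				/* the subvolume being snapshotted */
	args.flags = BTRFS_SUBVOL_RDONLY | BTRFS_SUBVOL_QGROUP_INHERIT;
	args.size = sz;					/* validated against PAGE_CACHE_SIZE above */
	args.qgroup_inherit = inherit;
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

	ret = ioctl(destdir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
	free(inherit);
	return ret;
}
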
@@ -1523,29 +1551,40 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1523 u64 flags; 1551 u64 flags;
1524 int ret = 0; 1552 int ret = 0;
1525 1553
1526 if (root->fs_info->sb->s_flags & MS_RDONLY) 1554 ret = mnt_want_write_file(file);
1527 return -EROFS; 1555 if (ret)
1556 goto out;
1528 1557
1529 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) 1558 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
1530 return -EINVAL; 1559 ret = -EINVAL;
1560 goto out_drop_write;
1561 }
1531 1562
1532 if (copy_from_user(&flags, arg, sizeof(flags))) 1563 if (copy_from_user(&flags, arg, sizeof(flags))) {
1533 return -EFAULT; 1564 ret = -EFAULT;
1565 goto out_drop_write;
1566 }
1534 1567
1535 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) 1568 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
1536 return -EINVAL; 1569 ret = -EINVAL;
1570 goto out_drop_write;
1571 }
1537 1572
1538 if (flags & ~BTRFS_SUBVOL_RDONLY) 1573 if (flags & ~BTRFS_SUBVOL_RDONLY) {
1539 return -EOPNOTSUPP; 1574 ret = -EOPNOTSUPP;
1575 goto out_drop_write;
1576 }
1540 1577
1541 if (!inode_owner_or_capable(inode)) 1578 if (!inode_owner_or_capable(inode)) {
1542 return -EACCES; 1579 ret = -EACCES;
1580 goto out_drop_write;
1581 }
1543 1582
1544 down_write(&root->fs_info->subvol_sem); 1583 down_write(&root->fs_info->subvol_sem);
1545 1584
1546 /* nothing to do */ 1585 /* nothing to do */
1547 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) 1586 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1548 goto out; 1587 goto out_drop_sem;
1549 1588
1550 root_flags = btrfs_root_flags(&root->root_item); 1589 root_flags = btrfs_root_flags(&root->root_item);
1551 if (flags & BTRFS_SUBVOL_RDONLY) 1590 if (flags & BTRFS_SUBVOL_RDONLY)
@@ -1568,8 +1607,11 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1568out_reset: 1607out_reset:
1569 if (ret) 1608 if (ret)
1570 btrfs_set_root_flags(&root->root_item, root_flags); 1609 btrfs_set_root_flags(&root->root_item, root_flags);
1571out: 1610out_drop_sem:
1572 up_write(&root->fs_info->subvol_sem); 1611 up_write(&root->fs_info->subvol_sem);
1612out_drop_write:
1613 mnt_drop_write_file(file);
1614out:
1573 return ret; 1615 return ret;
1574} 1616}
1575 1617
@@ -2340,6 +2382,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2340 goto out_drop_write; 2382 goto out_drop_write;
2341 } 2383 }
2342 2384
2385 ret = -EXDEV;
2386 if (src_file->f_path.mnt != file->f_path.mnt)
2387 goto out_fput;
2388
2343 src = src_file->f_dentry->d_inode; 2389 src = src_file->f_dentry->d_inode;
2344 2390
2345 ret = -EINVAL; 2391 ret = -EINVAL;
@@ -2360,7 +2406,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2360 goto out_fput; 2406 goto out_fput;
2361 2407
2362 ret = -EXDEV; 2408 ret = -EXDEV;
2363 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) 2409 if (src->i_sb != inode->i_sb)
2364 goto out_fput; 2410 goto out_fput;
2365 2411
2366 ret = -ENOMEM; 2412 ret = -ENOMEM;
@@ -2434,13 +2480,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2434 * note the key will change type as we walk through the 2480 * note the key will change type as we walk through the
2435 * tree. 2481 * tree.
2436 */ 2482 */
2437 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2483 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
2484 0, 0);
2438 if (ret < 0) 2485 if (ret < 0)
2439 goto out; 2486 goto out;
2440 2487
2441 nritems = btrfs_header_nritems(path->nodes[0]); 2488 nritems = btrfs_header_nritems(path->nodes[0]);
2442 if (path->slots[0] >= nritems) { 2489 if (path->slots[0] >= nritems) {
2443 ret = btrfs_next_leaf(root, path); 2490 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2444 if (ret < 0) 2491 if (ret < 0)
2445 goto out; 2492 goto out;
2446 if (ret > 0) 2493 if (ret > 0)
@@ -2749,8 +2796,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2749 struct btrfs_path *path; 2796 struct btrfs_path *path;
2750 struct btrfs_key location; 2797 struct btrfs_key location;
2751 struct btrfs_disk_key disk_key; 2798 struct btrfs_disk_key disk_key;
2752 struct btrfs_super_block *disk_super;
2753 u64 features;
2754 u64 objectid = 0; 2799 u64 objectid = 0;
2755 u64 dir_id; 2800 u64 dir_id;
2756 2801
@@ -2801,12 +2846,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2801 btrfs_mark_buffer_dirty(path->nodes[0]); 2846 btrfs_mark_buffer_dirty(path->nodes[0]);
2802 btrfs_free_path(path); 2847 btrfs_free_path(path);
2803 2848
2804 disk_super = root->fs_info->super_copy; 2849 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2805 features = btrfs_super_incompat_flags(disk_super);
2806 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2807 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
2808 btrfs_set_super_incompat_flags(disk_super, features);
2809 }
2810 btrfs_end_transaction(trans, root); 2850 btrfs_end_transaction(trans, root);
2811 2851
2812 return 0; 2852 return 0;
@@ -3063,19 +3103,21 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3063} 3103}
3064 3104
3065static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, 3105static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3066 void __user *arg, int reset_after_read) 3106 void __user *arg)
3067{ 3107{
3068 struct btrfs_ioctl_get_dev_stats *sa; 3108 struct btrfs_ioctl_get_dev_stats *sa;
3069 int ret; 3109 int ret;
3070 3110
3071 if (reset_after_read && !capable(CAP_SYS_ADMIN))
3072 return -EPERM;
3073
3074 sa = memdup_user(arg, sizeof(*sa)); 3111 sa = memdup_user(arg, sizeof(*sa));
3075 if (IS_ERR(sa)) 3112 if (IS_ERR(sa))
3076 return PTR_ERR(sa); 3113 return PTR_ERR(sa);
3077 3114
3078 ret = btrfs_get_dev_stats(root, sa, reset_after_read); 3115 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
3116 kfree(sa);
3117 return -EPERM;
3118 }
3119
3120 ret = btrfs_get_dev_stats(root, sa);
3079 3121
3080 if (copy_to_user(arg, sa, sizeof(*sa))) 3122 if (copy_to_user(arg, sa, sizeof(*sa)))
3081 ret = -EFAULT; 3123 ret = -EFAULT;
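
(Userspace side, sketched for illustration -- fs_fd stands for any open fd on the filesystem, and BTRFS_DEV_STAT_WRITE_ERRS is one of the enum btrfs_dev_stat_values entries not repeated in this hunk. A single ioctl now both reads the counters and, if BTRFS_DEV_STATS_RESET is set, clears them, replacing the separate BTRFS_IOC_GET_AND_RESET_DEV_STATS number removed from ioctl.h below.)

struct btrfs_ioctl_get_dev_stats ds;

memset(&ds, 0, sizeof(ds));
ds.devid = 1;
ds.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
ds.flags = BTRFS_DEV_STATS_RESET;	/* reset after read; needs CAP_SYS_ADMIN */

if (ioctl(fs_fd, BTRFS_IOC_GET_DEV_STATS, &ds) == 0)
	printf("write errors: %llu\n",
	       (unsigned long long)ds.values[BTRFS_DEV_STAT_WRITE_ERRS]);
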
@@ -3265,9 +3307,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3265 if (!capable(CAP_SYS_ADMIN)) 3307 if (!capable(CAP_SYS_ADMIN))
3266 return -EPERM; 3308 return -EPERM;
3267 3309
3268 if (fs_info->sb->s_flags & MS_RDONLY)
3269 return -EROFS;
3270
3271 ret = mnt_want_write_file(file); 3310 ret = mnt_want_write_file(file);
3272 if (ret) 3311 if (ret)
3273 return ret; 3312 return ret;
@@ -3390,6 +3429,264 @@ out:
3390 return ret; 3429 return ret;
3391} 3430}
3392 3431
3432static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3433{
3434 struct btrfs_ioctl_quota_ctl_args *sa;
3435 struct btrfs_trans_handle *trans = NULL;
3436 int ret;
3437 int err;
3438
3439 if (!capable(CAP_SYS_ADMIN))
3440 return -EPERM;
3441
3442 if (root->fs_info->sb->s_flags & MS_RDONLY)
3443 return -EROFS;
3444
3445 sa = memdup_user(arg, sizeof(*sa));
3446 if (IS_ERR(sa))
3447 return PTR_ERR(sa);
3448
3449 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3450 trans = btrfs_start_transaction(root, 2);
3451 if (IS_ERR(trans)) {
3452 ret = PTR_ERR(trans);
3453 goto out;
3454 }
3455 }
3456
3457 switch (sa->cmd) {
3458 case BTRFS_QUOTA_CTL_ENABLE:
3459 ret = btrfs_quota_enable(trans, root->fs_info);
3460 break;
3461 case BTRFS_QUOTA_CTL_DISABLE:
3462 ret = btrfs_quota_disable(trans, root->fs_info);
3463 break;
3464 case BTRFS_QUOTA_CTL_RESCAN:
3465 ret = btrfs_quota_rescan(root->fs_info);
3466 break;
3467 default:
3468 ret = -EINVAL;
3469 break;
3470 }
3471
3472 if (copy_to_user(arg, sa, sizeof(*sa)))
3473 ret = -EFAULT;
3474
3475 if (trans) {
3476 err = btrfs_commit_transaction(trans, root);
3477 if (err && !ret)
3478 ret = err;
3479 }
3480
3481out:
3482 kfree(sa);
3483 return ret;
3484}
3485
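
(How this is reached from userspace, as a sketch -- fs_fd is any open fd on the mounted filesystem, error handling elided:)

struct btrfs_ioctl_quota_ctl_args qa;

memset(&qa, 0, sizeof(qa));
qa.cmd = BTRFS_QUOTA_CTL_ENABLE;	/* or BTRFS_QUOTA_CTL_DISABLE / _RESCAN */

if (ioctl(fs_fd, BTRFS_IOC_QUOTA_CTL, &qa) < 0)
	perror("BTRFS_IOC_QUOTA_CTL");
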
3486static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3487{
3488 struct btrfs_ioctl_qgroup_assign_args *sa;
3489 struct btrfs_trans_handle *trans;
3490 int ret;
3491 int err;
3492
3493 if (!capable(CAP_SYS_ADMIN))
3494 return -EPERM;
3495
3496 if (root->fs_info->sb->s_flags & MS_RDONLY)
3497 return -EROFS;
3498
3499 sa = memdup_user(arg, sizeof(*sa));
3500 if (IS_ERR(sa))
3501 return PTR_ERR(sa);
3502
3503 trans = btrfs_join_transaction(root);
3504 if (IS_ERR(trans)) {
3505 ret = PTR_ERR(trans);
3506 goto out;
3507 }
3508
3509 /* FIXME: check if the IDs really exist */
3510 if (sa->assign) {
3511 ret = btrfs_add_qgroup_relation(trans, root->fs_info,
3512 sa->src, sa->dst);
3513 } else {
3514 ret = btrfs_del_qgroup_relation(trans, root->fs_info,
3515 sa->src, sa->dst);
3516 }
3517
3518 err = btrfs_end_transaction(trans, root);
3519 if (err && !ret)
3520 ret = err;
3521
3522out:
3523 kfree(sa);
3524 return ret;
3525}
3526
3527static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3528{
3529 struct btrfs_ioctl_qgroup_create_args *sa;
3530 struct btrfs_trans_handle *trans;
3531 int ret;
3532 int err;
3533
3534 if (!capable(CAP_SYS_ADMIN))
3535 return -EPERM;
3536
3537 if (root->fs_info->sb->s_flags & MS_RDONLY)
3538 return -EROFS;
3539
3540 sa = memdup_user(arg, sizeof(*sa));
3541 if (IS_ERR(sa))
3542 return PTR_ERR(sa);
3543
3544 trans = btrfs_join_transaction(root);
3545 if (IS_ERR(trans)) {
3546 ret = PTR_ERR(trans);
3547 goto out;
3548 }
3549
3550 /* FIXME: check if the IDs really exist */
3551 if (sa->create) {
3552 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
3553 NULL);
3554 } else {
3555 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
3556 }
3557
3558 err = btrfs_end_transaction(trans, root);
3559 if (err && !ret)
3560 ret = err;
3561
3562out:
3563 kfree(sa);
3564 return ret;
3565}
3566
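
(Putting the two ioctls above together, an illustrative sketch with made-up ids: create the level-1 group 1/100, then make the subvolume qgroup 0/257 a member of it. The level of a qgroup id lives in its upper 16 bits.)

struct btrfs_ioctl_qgroup_create_args ca = {
	.create   = 1,			/* 0 would remove the qgroup instead */
	.qgroupid = (1ULL << 48) | 100,	/* qgroup 1/100 */
};
struct btrfs_ioctl_qgroup_assign_args aa = {
	.assign = 1,			/* 0 would remove the relation */
	.src    = 257,			/* member: subvol qgroup 0/257 */
	.dst    = (1ULL << 48) | 100,	/* parent group */
};

if (ioctl(fs_fd, BTRFS_IOC_QGROUP_CREATE, &ca) == 0)
	ioctl(fs_fd, BTRFS_IOC_QGROUP_ASSIGN, &aa);
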
3567static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3568{
3569 struct btrfs_ioctl_qgroup_limit_args *sa;
3570 struct btrfs_trans_handle *trans;
3571 int ret;
3572 int err;
3573 u64 qgroupid;
3574
3575 if (!capable(CAP_SYS_ADMIN))
3576 return -EPERM;
3577
3578 if (root->fs_info->sb->s_flags & MS_RDONLY)
3579 return -EROFS;
3580
3581 sa = memdup_user(arg, sizeof(*sa));
3582 if (IS_ERR(sa))
3583 return PTR_ERR(sa);
3584
3585 trans = btrfs_join_transaction(root);
3586 if (IS_ERR(trans)) {
3587 ret = PTR_ERR(trans);
3588 goto out;
3589 }
3590
3591 qgroupid = sa->qgroupid;
3592 if (!qgroupid) {
3593 /* take the current subvol as qgroup */
3594 qgroupid = root->root_key.objectid;
3595 }
3596
3597 /* FIXME: check if the IDs really exist */
3598 ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
3599
3600 err = btrfs_end_transaction(trans, root);
3601 if (err && !ret)
3602 ret = err;
3603
3604out:
3605 kfree(sa);
3606 return ret;
3607}
3608
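
(Userspace sketch: qgroupid 0 makes the kernel pick the subvolume the fd belongs to, per the code above. BTRFS_QGROUP_LIMIT_MAX_RFER is defined in ctree.h, outside this hunk.)

struct btrfs_ioctl_qgroup_limit_args la;

memset(&la, 0, sizeof(la));
la.qgroupid = 0;			/* 0 == the subvolume behind subvol_fd */
la.lim.flags = BTRFS_QGROUP_LIMIT_MAX_RFER;
la.lim.max_rfer = 1ULL << 30;		/* cap referenced space at 1 GiB */

if (ioctl(subvol_fd, BTRFS_IOC_QGROUP_LIMIT, &la) < 0)
	perror("BTRFS_IOC_QGROUP_LIMIT");
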
3609static long btrfs_ioctl_set_received_subvol(struct file *file,
3610 void __user *arg)
3611{
3612 struct btrfs_ioctl_received_subvol_args *sa = NULL;
3613 struct inode *inode = fdentry(file)->d_inode;
3614 struct btrfs_root *root = BTRFS_I(inode)->root;
3615 struct btrfs_root_item *root_item = &root->root_item;
3616 struct btrfs_trans_handle *trans;
3617 struct timespec ct = CURRENT_TIME;
3618 int ret = 0;
3619
3620 ret = mnt_want_write_file(file);
3621 if (ret < 0)
3622 return ret;
3623
3624 down_write(&root->fs_info->subvol_sem);
3625
3626 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
3627 ret = -EINVAL;
3628 goto out;
3629 }
3630
3631 if (btrfs_root_readonly(root)) {
3632 ret = -EROFS;
3633 goto out;
3634 }
3635
3636 if (!inode_owner_or_capable(inode)) {
3637 ret = -EACCES;
3638 goto out;
3639 }
3640
3641 sa = memdup_user(arg, sizeof(*sa));
3642 if (IS_ERR(sa)) {
3643 ret = PTR_ERR(sa);
3644 sa = NULL;
3645 goto out;
3646 }
3647
3648 trans = btrfs_start_transaction(root, 1);
3649 if (IS_ERR(trans)) {
3650 ret = PTR_ERR(trans);
3651 trans = NULL;
3652 goto out;
3653 }
3654
3655 sa->rtransid = trans->transid;
3656 sa->rtime.sec = ct.tv_sec;
3657 sa->rtime.nsec = ct.tv_nsec;
3658
3659 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
3660 btrfs_set_root_stransid(root_item, sa->stransid);
3661 btrfs_set_root_rtransid(root_item, sa->rtransid);
3662 root_item->stime.sec = cpu_to_le64(sa->stime.sec);
3663 root_item->stime.nsec = cpu_to_le32(sa->stime.nsec);
3664 root_item->rtime.sec = cpu_to_le64(sa->rtime.sec);
3665 root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec);
3666
3667 ret = btrfs_update_root(trans, root->fs_info->tree_root,
3668 &root->root_key, &root->root_item);
3669 if (ret < 0) {
3670 btrfs_end_transaction(trans, root);
3671 trans = NULL;
3672 goto out;
3673 } else {
3674 ret = btrfs_commit_transaction(trans, root);
3675 if (ret < 0)
3676 goto out;
3677 }
3678
3679 ret = copy_to_user(arg, sa, sizeof(*sa));
3680 if (ret)
3681 ret = -EFAULT;
3682
3683out:
3684 kfree(sa);
3685 up_write(&root->fs_info->subvol_sem);
3686 mnt_drop_write_file(file);
3687 return ret;
3688}
3689
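
(This is the hook the userland "btrfs receive" uses to stamp a freshly received snapshot. A hedged sketch -- sent_uuid, sent_ctransid and sent_stime are placeholders for values parsed from the send stream:)

struct btrfs_ioctl_received_subvol_args rs;

memset(&rs, 0, sizeof(rs));
memcpy(rs.uuid, sent_uuid, BTRFS_UUID_SIZE);	/* uuid of the source snapshot */
rs.stransid = sent_ctransid;			/* generation on the send side */
rs.stime = sent_stime;				/* a struct btrfs_ioctl_timespec */

if (ioctl(subvol_fd, BTRFS_IOC_SET_RECEIVED_SUBVOL, &rs) < 0)
	perror("BTRFS_IOC_SET_RECEIVED_SUBVOL");
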
3393long btrfs_ioctl(struct file *file, unsigned int 3690long btrfs_ioctl(struct file *file, unsigned int
3394 cmd, unsigned long arg) 3691 cmd, unsigned long arg)
3395{ 3692{
@@ -3411,6 +3708,8 @@ long btrfs_ioctl(struct file *file, unsigned int
3411 return btrfs_ioctl_snap_create_v2(file, argp, 0); 3708 return btrfs_ioctl_snap_create_v2(file, argp, 0);
3412 case BTRFS_IOC_SUBVOL_CREATE: 3709 case BTRFS_IOC_SUBVOL_CREATE:
3413 return btrfs_ioctl_snap_create(file, argp, 1); 3710 return btrfs_ioctl_snap_create(file, argp, 1);
3711 case BTRFS_IOC_SUBVOL_CREATE_V2:
3712 return btrfs_ioctl_snap_create_v2(file, argp, 1);
3414 case BTRFS_IOC_SNAP_DESTROY: 3713 case BTRFS_IOC_SNAP_DESTROY:
3415 return btrfs_ioctl_snap_destroy(file, argp); 3714 return btrfs_ioctl_snap_destroy(file, argp);
3416 case BTRFS_IOC_SUBVOL_GETFLAGS: 3715 case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -3472,10 +3771,20 @@ long btrfs_ioctl(struct file *file, unsigned int
3472 return btrfs_ioctl_balance_ctl(root, arg); 3771 return btrfs_ioctl_balance_ctl(root, arg);
3473 case BTRFS_IOC_BALANCE_PROGRESS: 3772 case BTRFS_IOC_BALANCE_PROGRESS:
3474 return btrfs_ioctl_balance_progress(root, argp); 3773 return btrfs_ioctl_balance_progress(root, argp);
3774 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
3775 return btrfs_ioctl_set_received_subvol(file, argp);
3776 case BTRFS_IOC_SEND:
3777 return btrfs_ioctl_send(file, argp);
3475 case BTRFS_IOC_GET_DEV_STATS: 3778 case BTRFS_IOC_GET_DEV_STATS:
3476 return btrfs_ioctl_get_dev_stats(root, argp, 0); 3779 return btrfs_ioctl_get_dev_stats(root, argp);
3477 case BTRFS_IOC_GET_AND_RESET_DEV_STATS: 3780 case BTRFS_IOC_QUOTA_CTL:
3478 return btrfs_ioctl_get_dev_stats(root, argp, 1); 3781 return btrfs_ioctl_quota_ctl(root, argp);
3782 case BTRFS_IOC_QGROUP_ASSIGN:
3783 return btrfs_ioctl_qgroup_assign(root, argp);
3784 case BTRFS_IOC_QGROUP_CREATE:
3785 return btrfs_ioctl_qgroup_create(root, argp);
3786 case BTRFS_IOC_QGROUP_LIMIT:
3787 return btrfs_ioctl_qgroup_limit(root, argp);
3479 } 3788 }
3480 3789
3481 return -ENOTTY; 3790 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e440aa653c3..731e2875ab9 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -32,15 +32,46 @@ struct btrfs_ioctl_vol_args {
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
35#define BTRFS_FSID_SIZE 16 36#define BTRFS_FSID_SIZE 16
36#define BTRFS_UUID_SIZE 16 37#define BTRFS_UUID_SIZE 16
37 38
39#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
40
41struct btrfs_qgroup_limit {
42 __u64 flags;
43 __u64 max_rfer;
44 __u64 max_excl;
45 __u64 rsv_rfer;
46 __u64 rsv_excl;
47};
48
49struct btrfs_qgroup_inherit {
50 __u64 flags;
51 __u64 num_qgroups;
52 __u64 num_ref_copies;
53 __u64 num_excl_copies;
54 struct btrfs_qgroup_limit lim;
55 __u64 qgroups[0];
56};
57
58struct btrfs_ioctl_qgroup_limit_args {
59 __u64 qgroupid;
60 struct btrfs_qgroup_limit lim;
61};
62
38#define BTRFS_SUBVOL_NAME_MAX 4039 63#define BTRFS_SUBVOL_NAME_MAX 4039
39struct btrfs_ioctl_vol_args_v2 { 64struct btrfs_ioctl_vol_args_v2 {
40 __s64 fd; 65 __s64 fd;
41 __u64 transid; 66 __u64 transid;
42 __u64 flags; 67 __u64 flags;
43 __u64 unused[4]; 68 union {
69 struct {
70 __u64 size;
71 struct btrfs_qgroup_inherit __user *qgroup_inherit;
72 };
73 __u64 unused[4];
74 };
44 char name[BTRFS_SUBVOL_NAME_MAX + 1]; 75 char name[BTRFS_SUBVOL_NAME_MAX + 1];
45}; 76};
46 77
@@ -285,9 +316,13 @@ enum btrfs_dev_stat_values {
285 BTRFS_DEV_STAT_VALUES_MAX 316 BTRFS_DEV_STAT_VALUES_MAX
286}; 317};
287 318
319/* Reset statistics after reading; needs SYS_ADMIN capability */
320#define BTRFS_DEV_STATS_RESET (1ULL << 0)
321
288struct btrfs_ioctl_get_dev_stats { 322struct btrfs_ioctl_get_dev_stats {
289 __u64 devid; /* in */ 323 __u64 devid; /* in */
290 __u64 nr_items; /* in/out */ 324 __u64 nr_items; /* in/out */
325 __u64 flags; /* in/out */
291 326
292 /* out values: */ 327 /* out values: */
293 __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; 328 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
@@ -295,6 +330,48 @@ struct btrfs_ioctl_get_dev_stats {
295 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ 330 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
296}; 331};
297 332
333#define BTRFS_QUOTA_CTL_ENABLE 1
334#define BTRFS_QUOTA_CTL_DISABLE 2
335#define BTRFS_QUOTA_CTL_RESCAN 3
336struct btrfs_ioctl_quota_ctl_args {
337 __u64 cmd;
338 __u64 status;
339};
340
341struct btrfs_ioctl_qgroup_assign_args {
342 __u64 assign;
343 __u64 src;
344 __u64 dst;
345};
346
347struct btrfs_ioctl_qgroup_create_args {
348 __u64 create;
349 __u64 qgroupid;
350};
351struct btrfs_ioctl_timespec {
352 __u64 sec;
353 __u32 nsec;
354};
355
356struct btrfs_ioctl_received_subvol_args {
357 char uuid[BTRFS_UUID_SIZE]; /* in */
358 __u64 stransid; /* in */
359 __u64 rtransid; /* out */
360 struct btrfs_ioctl_timespec stime; /* in */
361 struct btrfs_ioctl_timespec rtime; /* out */
362 __u64 flags; /* in */
363 __u64 reserved[16]; /* in */
364};
365
366struct btrfs_ioctl_send_args {
367 __s64 send_fd; /* in */
368 __u64 clone_sources_count; /* in */
369 __u64 __user *clone_sources; /* in */
370 __u64 parent_root; /* in */
371 __u64 flags; /* in */
372 __u64 reserved[4]; /* in */
373};
374
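
(Usage sketch, illustrative only: snap_fd is an fd on a read-only snapshot, pipe_wfd e.g. the write end of a pipe(2). With no parent root and no clone sources this requests a full, non-incremental stream, which userspace then feeds to "btrfs receive".)

struct btrfs_ioctl_send_args sa;

memset(&sa, 0, sizeof(sa));
sa.send_fd = pipe_wfd;		/* the stream is written to this fd */
sa.clone_sources_count = 0;
sa.parent_root = 0;		/* a nonzero root id would request an incremental diff */

if (ioctl(snap_fd, BTRFS_IOC_SEND, &sa) < 0)
	perror("BTRFS_IOC_SEND");
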
298#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 375#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
299 struct btrfs_ioctl_vol_args) 376 struct btrfs_ioctl_vol_args)
300#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 377#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -339,6 +416,8 @@ struct btrfs_ioctl_get_dev_stats {
339#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 416#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
340#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 417#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
341 struct btrfs_ioctl_vol_args_v2) 418 struct btrfs_ioctl_vol_args_v2)
419#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
420 struct btrfs_ioctl_vol_args_v2)
342#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) 421#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
343#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) 422#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
344#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ 423#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
@@ -359,9 +438,19 @@ struct btrfs_ioctl_get_dev_stats {
359 struct btrfs_ioctl_ino_path_args) 438 struct btrfs_ioctl_ino_path_args)
360#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 439#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
361 struct btrfs_ioctl_ino_path_args) 440 struct btrfs_ioctl_ino_path_args)
441#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
442 struct btrfs_ioctl_received_subvol_args)
443#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
444#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
445 struct btrfs_ioctl_vol_args)
446#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
447 struct btrfs_ioctl_quota_ctl_args)
448#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
449 struct btrfs_ioctl_qgroup_assign_args)
450#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
451 struct btrfs_ioctl_qgroup_create_args)
452#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
453 struct btrfs_ioctl_qgroup_limit_args)
362#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
363 struct btrfs_ioctl_get_dev_stats) 455 struct btrfs_ioctl_get_dev_stats)
364#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
365 struct btrfs_ioctl_get_dev_stats)
366
367#endif 456#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 272f911203f..a44eff07480 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -78,13 +78,15 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
78 write_lock(&eb->lock); 78 write_lock(&eb->lock);
79 WARN_ON(atomic_read(&eb->spinning_writers)); 79 WARN_ON(atomic_read(&eb->spinning_writers));
80 atomic_inc(&eb->spinning_writers); 80 atomic_inc(&eb->spinning_writers);
81 if (atomic_dec_and_test(&eb->blocking_writers)) 81 if (atomic_dec_and_test(&eb->blocking_writers) &&
82 waitqueue_active(&eb->write_lock_wq))
82 wake_up(&eb->write_lock_wq); 83 wake_up(&eb->write_lock_wq);
83 } else if (rw == BTRFS_READ_LOCK_BLOCKING) { 84 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
84 BUG_ON(atomic_read(&eb->blocking_readers) == 0); 85 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
85 read_lock(&eb->lock); 86 read_lock(&eb->lock);
86 atomic_inc(&eb->spinning_readers); 87 atomic_inc(&eb->spinning_readers);
87 if (atomic_dec_and_test(&eb->blocking_readers)) 88 if (atomic_dec_and_test(&eb->blocking_readers) &&
89 waitqueue_active(&eb->read_lock_wq))
88 wake_up(&eb->read_lock_wq); 90 wake_up(&eb->read_lock_wq);
89 } 91 }
90 return; 92 return;
@@ -199,7 +201,8 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
199 } 201 }
200 btrfs_assert_tree_read_locked(eb); 202 btrfs_assert_tree_read_locked(eb);
201 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 203 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
202 if (atomic_dec_and_test(&eb->blocking_readers)) 204 if (atomic_dec_and_test(&eb->blocking_readers) &&
205 waitqueue_active(&eb->read_lock_wq))
203 wake_up(&eb->read_lock_wq); 206 wake_up(&eb->read_lock_wq);
204 atomic_dec(&eb->read_locks); 207 atomic_dec(&eb->read_locks);
205} 208}
@@ -247,8 +250,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
247 if (blockers) { 250 if (blockers) {
248 WARN_ON(atomic_read(&eb->spinning_writers)); 251 WARN_ON(atomic_read(&eb->spinning_writers));
249 atomic_dec(&eb->blocking_writers); 252 atomic_dec(&eb->blocking_writers);
250 smp_wmb(); 253 smp_mb();
251 wake_up(&eb->write_lock_wq); 254 if (waitqueue_active(&eb->write_lock_wq))
255 wake_up(&eb->write_lock_wq);
252 } else { 256 } else {
253 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 257 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
254 atomic_dec(&eb->spinning_writers); 258 atomic_dec(&eb->spinning_writers);
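
(A note on the smp_wmb() -> smp_mb() change above: waitqueue_active() is an unlocked load, and a write barrier only orders stores against stores, so it cannot keep that load from being reordered before the atomic_dec() store. The full barrier can. The sleeper side is safe because wait_event() queues the waiter before re-checking the condition; expanded, the pairing looks roughly like this sketch:)

/* sleeper -- roughly what wait_event() does internally */
DEFINE_WAIT(wait);
prepare_to_wait(&eb->write_lock_wq, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(&eb->blocking_writers))		/* re-check after queueing */
	schedule();
finish_wait(&eb->write_lock_wq, &wait);

/* waker -- as in btrfs_tree_unlock() above */
atomic_dec(&eb->blocking_writers);
smp_mb();	/* order the dec before the waitqueue_active() load */
if (waitqueue_active(&eb->write_lock_wq))
	wake_up(&eb->write_lock_wq);
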
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 00000000000..bc424ae5a81
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,1571 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26
27#include "ctree.h"
28#include "transaction.h"
29#include "disk-io.h"
30#include "locking.h"
31#include "ulist.h"
32#include "ioctl.h"
33#include "backref.h"
34
35/* TODO XXX FIXME
36 * - subvol delete -> delete when ref goes to 0? delete limits also?
37 * - reorganize keys
38 * - compressed
39 * - sync
40 * - rescan
41 * - copy also limits on subvol creation
42 * - limit
 43 * - caches for ulists
44 * - performance benchmarks
45 * - check all ioctl parameters
46 */
47
48/*
49 * one struct for each qgroup, organized in fs_info->qgroup_tree.
50 */
51struct btrfs_qgroup {
52 u64 qgroupid;
53
54 /*
55 * state
56 */
57 u64 rfer; /* referenced */
58 u64 rfer_cmpr; /* referenced compressed */
59 u64 excl; /* exclusive */
60 u64 excl_cmpr; /* exclusive compressed */
61
62 /*
63 * limits
64 */
65 u64 lim_flags; /* which limits are set */
66 u64 max_rfer;
67 u64 max_excl;
68 u64 rsv_rfer;
69 u64 rsv_excl;
70
71 /*
72 * reservation tracking
73 */
74 u64 reserved;
75
76 /*
77 * lists
78 */
79 struct list_head groups; /* groups this group is member of */
80 struct list_head members; /* groups that are members of this group */
81 struct list_head dirty; /* dirty groups */
82 struct rb_node node; /* tree of qgroups */
83
84 /*
85 * temp variables for accounting operations
86 */
87 u64 tag;
88 u64 refcnt;
89};
90
91/*
92 * glue structure to represent the relations between qgroups.
93 */
94struct btrfs_qgroup_list {
95 struct list_head next_group;
96 struct list_head next_member;
97 struct btrfs_qgroup *group;
98 struct btrfs_qgroup *member;
99};
100
101/* must be called with qgroup_lock held */
102static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
103 u64 qgroupid)
104{
105 struct rb_node *n = fs_info->qgroup_tree.rb_node;
106 struct btrfs_qgroup *qgroup;
107
108 while (n) {
109 qgroup = rb_entry(n, struct btrfs_qgroup, node);
110 if (qgroup->qgroupid < qgroupid)
111 n = n->rb_left;
112 else if (qgroup->qgroupid > qgroupid)
113 n = n->rb_right;
114 else
115 return qgroup;
116 }
117 return NULL;
118}
119
120/* must be called with qgroup_lock held */
121static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
122 u64 qgroupid)
123{
124 struct rb_node **p = &fs_info->qgroup_tree.rb_node;
125 struct rb_node *parent = NULL;
126 struct btrfs_qgroup *qgroup;
127
128 while (*p) {
129 parent = *p;
130 qgroup = rb_entry(parent, struct btrfs_qgroup, node);
131
132 if (qgroup->qgroupid < qgroupid)
133 p = &(*p)->rb_left;
134 else if (qgroup->qgroupid > qgroupid)
135 p = &(*p)->rb_right;
136 else
137 return qgroup;
138 }
139
140 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
141 if (!qgroup)
142 return ERR_PTR(-ENOMEM);
143
144 qgroup->qgroupid = qgroupid;
145 INIT_LIST_HEAD(&qgroup->groups);
146 INIT_LIST_HEAD(&qgroup->members);
147 INIT_LIST_HEAD(&qgroup->dirty);
148
149 rb_link_node(&qgroup->node, parent, p);
150 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
151
152 return qgroup;
153}
154
155/* must be called with qgroup_lock held */
156static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
157{
158 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
159 struct btrfs_qgroup_list *list;
160
161 if (!qgroup)
162 return -ENOENT;
163
164 rb_erase(&qgroup->node, &fs_info->qgroup_tree);
165 list_del(&qgroup->dirty);
166
167 while (!list_empty(&qgroup->groups)) {
168 list = list_first_entry(&qgroup->groups,
169 struct btrfs_qgroup_list, next_group);
170 list_del(&list->next_group);
171 list_del(&list->next_member);
172 kfree(list);
173 }
174
175 while (!list_empty(&qgroup->members)) {
176 list = list_first_entry(&qgroup->members,
177 struct btrfs_qgroup_list, next_member);
178 list_del(&list->next_group);
179 list_del(&list->next_member);
180 kfree(list);
181 }
182 kfree(qgroup);
183
184 return 0;
185}
186
187/* must be called with qgroup_lock held */
188static int add_relation_rb(struct btrfs_fs_info *fs_info,
189 u64 memberid, u64 parentid)
190{
191 struct btrfs_qgroup *member;
192 struct btrfs_qgroup *parent;
193 struct btrfs_qgroup_list *list;
194
195 member = find_qgroup_rb(fs_info, memberid);
196 parent = find_qgroup_rb(fs_info, parentid);
197 if (!member || !parent)
198 return -ENOENT;
199
200 list = kzalloc(sizeof(*list), GFP_ATOMIC);
201 if (!list)
202 return -ENOMEM;
203
204 list->group = parent;
205 list->member = member;
206 list_add_tail(&list->next_group, &member->groups);
207 list_add_tail(&list->next_member, &parent->members);
208
209 return 0;
210}
211
212/* must be called with qgroup_lock held */
213static int del_relation_rb(struct btrfs_fs_info *fs_info,
214 u64 memberid, u64 parentid)
215{
216 struct btrfs_qgroup *member;
217 struct btrfs_qgroup *parent;
218 struct btrfs_qgroup_list *list;
219
220 member = find_qgroup_rb(fs_info, memberid);
221 parent = find_qgroup_rb(fs_info, parentid);
222 if (!member || !parent)
223 return -ENOENT;
224
225 list_for_each_entry(list, &member->groups, next_group) {
226 if (list->group == parent) {
227 list_del(&list->next_group);
228 list_del(&list->next_member);
229 kfree(list);
230 return 0;
231 }
232 }
233 return -ENOENT;
234}
235
236/*
237 * The full config is read in one go, only called from open_ctree()
238 * It doesn't use any locking, as at this point we're still single-threaded
239 */
240int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
241{
242 struct btrfs_key key;
243 struct btrfs_key found_key;
244 struct btrfs_root *quota_root = fs_info->quota_root;
245 struct btrfs_path *path = NULL;
246 struct extent_buffer *l;
247 int slot;
248 int ret = 0;
249 u64 flags = 0;
250
251 if (!fs_info->quota_enabled)
252 return 0;
253
254 path = btrfs_alloc_path();
255 if (!path) {
256 ret = -ENOMEM;
257 goto out;
258 }
259
260 /* default this to quota off, in case no status key is found */
261 fs_info->qgroup_flags = 0;
262
263 /*
264 * pass 1: read status, all qgroup infos and limits
265 */
266 key.objectid = 0;
267 key.type = 0;
268 key.offset = 0;
269 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
270 if (ret)
271 goto out;
272
273 while (1) {
274 struct btrfs_qgroup *qgroup;
275
276 slot = path->slots[0];
277 l = path->nodes[0];
278 btrfs_item_key_to_cpu(l, &found_key, slot);
279
280 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
281 struct btrfs_qgroup_status_item *ptr;
282
283 ptr = btrfs_item_ptr(l, slot,
284 struct btrfs_qgroup_status_item);
285
286 if (btrfs_qgroup_status_version(l, ptr) !=
287 BTRFS_QGROUP_STATUS_VERSION) {
288 printk(KERN_ERR
289 "btrfs: old qgroup version, quota disabled\n");
290 goto out;
291 }
292 if (btrfs_qgroup_status_generation(l, ptr) !=
293 fs_info->generation) {
294 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
295 printk(KERN_ERR
296 "btrfs: qgroup generation mismatch, "
297 "marked as inconsistent\n");
298 }
299 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
300 ptr);
301 /* FIXME read scan element */
302 goto next1;
303 }
304
305 if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
306 found_key.type != BTRFS_QGROUP_LIMIT_KEY)
307 goto next1;
308
309 qgroup = find_qgroup_rb(fs_info, found_key.offset);
310 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
311 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
 312			printk(KERN_ERR "btrfs: inconsistent qgroup config\n");
313 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
314 }
315 if (!qgroup) {
316 qgroup = add_qgroup_rb(fs_info, found_key.offset);
317 if (IS_ERR(qgroup)) {
318 ret = PTR_ERR(qgroup);
319 goto out;
320 }
321 }
322 switch (found_key.type) {
323 case BTRFS_QGROUP_INFO_KEY: {
324 struct btrfs_qgroup_info_item *ptr;
325
326 ptr = btrfs_item_ptr(l, slot,
327 struct btrfs_qgroup_info_item);
328 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
329 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
330 qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
331 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
332 /* generation currently unused */
333 break;
334 }
335 case BTRFS_QGROUP_LIMIT_KEY: {
336 struct btrfs_qgroup_limit_item *ptr;
337
338 ptr = btrfs_item_ptr(l, slot,
339 struct btrfs_qgroup_limit_item);
340 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
341 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
342 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
343 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
344 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
345 break;
346 }
347 }
348next1:
349 ret = btrfs_next_item(quota_root, path);
350 if (ret < 0)
351 goto out;
352 if (ret)
353 break;
354 }
355 btrfs_release_path(path);
356
357 /*
358 * pass 2: read all qgroup relations
359 */
360 key.objectid = 0;
361 key.type = BTRFS_QGROUP_RELATION_KEY;
362 key.offset = 0;
363 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
364 if (ret)
365 goto out;
366 while (1) {
367 slot = path->slots[0];
368 l = path->nodes[0];
369 btrfs_item_key_to_cpu(l, &found_key, slot);
370
371 if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
372 goto next2;
373
374 if (found_key.objectid > found_key.offset) {
375 /* parent <- member, not needed to build config */
376 /* FIXME should we omit the key completely? */
377 goto next2;
378 }
379
380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset);
382 if (ret)
383 goto out;
384next2:
385 ret = btrfs_next_item(quota_root, path);
386 if (ret < 0)
387 goto out;
388 if (ret)
389 break;
390 }
391out:
392 fs_info->qgroup_flags |= flags;
393 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
394 fs_info->quota_enabled = 0;
395 fs_info->pending_quota_state = 0;
396 }
397 btrfs_free_path(path);
398
399 return ret < 0 ? ret : 0;
400}
401
402/*
403 * This is only called from close_ctree() or open_ctree(), both in single-
 404 * threaded paths. Clean up the in-memory structures. No locking needed.
405 */
406void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
407{
408 struct rb_node *n;
409 struct btrfs_qgroup *qgroup;
410 struct btrfs_qgroup_list *list;
411
412 while ((n = rb_first(&fs_info->qgroup_tree))) {
413 qgroup = rb_entry(n, struct btrfs_qgroup, node);
414 rb_erase(n, &fs_info->qgroup_tree);
415
416 WARN_ON(!list_empty(&qgroup->dirty));
417
418 while (!list_empty(&qgroup->groups)) {
419 list = list_first_entry(&qgroup->groups,
420 struct btrfs_qgroup_list,
421 next_group);
422 list_del(&list->next_group);
423 list_del(&list->next_member);
424 kfree(list);
425 }
426
427 while (!list_empty(&qgroup->members)) {
428 list = list_first_entry(&qgroup->members,
429 struct btrfs_qgroup_list,
430 next_member);
431 list_del(&list->next_group);
432 list_del(&list->next_member);
433 kfree(list);
434 }
435 kfree(qgroup);
436 }
437}
438
439static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
440 struct btrfs_root *quota_root,
441 u64 src, u64 dst)
442{
443 int ret;
444 struct btrfs_path *path;
445 struct btrfs_key key;
446
447 path = btrfs_alloc_path();
448 if (!path)
449 return -ENOMEM;
450
451 key.objectid = src;
452 key.type = BTRFS_QGROUP_RELATION_KEY;
453 key.offset = dst;
454
455 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
456
457 btrfs_mark_buffer_dirty(path->nodes[0]);
458
459 btrfs_free_path(path);
460 return ret;
461}
462
463static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
464 struct btrfs_root *quota_root,
465 u64 src, u64 dst)
466{
467 int ret;
468 struct btrfs_path *path;
469 struct btrfs_key key;
470
471 path = btrfs_alloc_path();
472 if (!path)
473 return -ENOMEM;
474
475 key.objectid = src;
476 key.type = BTRFS_QGROUP_RELATION_KEY;
477 key.offset = dst;
478
479 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
480 if (ret < 0)
481 goto out;
482
483 if (ret > 0) {
484 ret = -ENOENT;
485 goto out;
486 }
487
488 ret = btrfs_del_item(trans, quota_root, path);
489out:
490 btrfs_free_path(path);
491 return ret;
492}
493
494static int add_qgroup_item(struct btrfs_trans_handle *trans,
495 struct btrfs_root *quota_root, u64 qgroupid)
496{
497 int ret;
498 struct btrfs_path *path;
499 struct btrfs_qgroup_info_item *qgroup_info;
500 struct btrfs_qgroup_limit_item *qgroup_limit;
501 struct extent_buffer *leaf;
502 struct btrfs_key key;
503
504 path = btrfs_alloc_path();
505 if (!path)
506 return -ENOMEM;
507
508 key.objectid = 0;
509 key.type = BTRFS_QGROUP_INFO_KEY;
510 key.offset = qgroupid;
511
512 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
513 sizeof(*qgroup_info));
514 if (ret)
515 goto out;
516
517 leaf = path->nodes[0];
518 qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
519 struct btrfs_qgroup_info_item);
520 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
521 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
522 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
523 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
524 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
525
526 btrfs_mark_buffer_dirty(leaf);
527
528 btrfs_release_path(path);
529
530 key.type = BTRFS_QGROUP_LIMIT_KEY;
531 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
532 sizeof(*qgroup_limit));
533 if (ret)
534 goto out;
535
536 leaf = path->nodes[0];
537 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
538 struct btrfs_qgroup_limit_item);
539 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
540 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
541 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
542 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
543 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
544
545 btrfs_mark_buffer_dirty(leaf);
546
547 ret = 0;
548out:
549 btrfs_free_path(path);
550 return ret;
551}
552
553static int del_qgroup_item(struct btrfs_trans_handle *trans,
554 struct btrfs_root *quota_root, u64 qgroupid)
555{
556 int ret;
557 struct btrfs_path *path;
558 struct btrfs_key key;
559
560 path = btrfs_alloc_path();
561 if (!path)
562 return -ENOMEM;
563
564 key.objectid = 0;
565 key.type = BTRFS_QGROUP_INFO_KEY;
566 key.offset = qgroupid;
567 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
568 if (ret < 0)
569 goto out;
570
571 if (ret > 0) {
572 ret = -ENOENT;
573 goto out;
574 }
575
576 ret = btrfs_del_item(trans, quota_root, path);
577 if (ret)
578 goto out;
579
580 btrfs_release_path(path);
581
582 key.type = BTRFS_QGROUP_LIMIT_KEY;
583 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
584 if (ret < 0)
585 goto out;
586
587 if (ret > 0) {
588 ret = -ENOENT;
589 goto out;
590 }
591
592 ret = btrfs_del_item(trans, quota_root, path);
593
594out:
595 btrfs_free_path(path);
596 return ret;
597}
598
599static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
600 struct btrfs_root *root, u64 qgroupid,
601 u64 flags, u64 max_rfer, u64 max_excl,
602 u64 rsv_rfer, u64 rsv_excl)
603{
604 struct btrfs_path *path;
605 struct btrfs_key key;
606 struct extent_buffer *l;
607 struct btrfs_qgroup_limit_item *qgroup_limit;
608 int ret;
609 int slot;
610
611 key.objectid = 0;
612 key.type = BTRFS_QGROUP_LIMIT_KEY;
613 key.offset = qgroupid;
614
615 path = btrfs_alloc_path();
616 BUG_ON(!path);
617 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
618 if (ret > 0)
619 ret = -ENOENT;
620
621 if (ret)
622 goto out;
623
624 l = path->nodes[0];
625 slot = path->slots[0];
626 qgroup_limit = btrfs_item_ptr(l, path->slots[0],
627 struct btrfs_qgroup_limit_item);
628 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
629 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
630 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
631 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
632 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
633
634 btrfs_mark_buffer_dirty(l);
635
636out:
637 btrfs_free_path(path);
638 return ret;
639}
640
641static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
642 struct btrfs_root *root,
643 struct btrfs_qgroup *qgroup)
644{
645 struct btrfs_path *path;
646 struct btrfs_key key;
647 struct extent_buffer *l;
648 struct btrfs_qgroup_info_item *qgroup_info;
649 int ret;
650 int slot;
651
652 key.objectid = 0;
653 key.type = BTRFS_QGROUP_INFO_KEY;
654 key.offset = qgroup->qgroupid;
655
656 path = btrfs_alloc_path();
657 BUG_ON(!path);
658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
659 if (ret > 0)
660 ret = -ENOENT;
661
662 if (ret)
663 goto out;
664
665 l = path->nodes[0];
666 slot = path->slots[0];
667 qgroup_info = btrfs_item_ptr(l, path->slots[0],
668 struct btrfs_qgroup_info_item);
669 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
670 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
671 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
672 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
673 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
674
675 btrfs_mark_buffer_dirty(l);
676
677out:
678 btrfs_free_path(path);
679 return ret;
680}
681
682static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
683 struct btrfs_fs_info *fs_info,
684 struct btrfs_root *root)
685{
686 struct btrfs_path *path;
687 struct btrfs_key key;
688 struct extent_buffer *l;
689 struct btrfs_qgroup_status_item *ptr;
690 int ret;
691 int slot;
692
693 key.objectid = 0;
694 key.type = BTRFS_QGROUP_STATUS_KEY;
695 key.offset = 0;
696
697 path = btrfs_alloc_path();
698 BUG_ON(!path);
699 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
700 if (ret > 0)
701 ret = -ENOENT;
702
703 if (ret)
704 goto out;
705
706 l = path->nodes[0];
707 slot = path->slots[0];
708 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
709 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
710 btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
711 /* XXX scan */
712
713 btrfs_mark_buffer_dirty(l);
714
715out:
716 btrfs_free_path(path);
717 return ret;
718}
719
720/*
721 * called with qgroup_lock held
722 */
723static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
724 struct btrfs_root *root)
725{
726 struct btrfs_path *path;
727 struct btrfs_key key;
728 int ret;
729
730 if (!root)
731 return -EINVAL;
732
733 path = btrfs_alloc_path();
734 if (!path)
735 return -ENOMEM;
736
737 while (1) {
738 key.objectid = 0;
739 key.offset = 0;
740 key.type = 0;
741
742 path->leave_spinning = 1;
743 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
744 if (ret > 0) {
745 if (path->slots[0] == 0)
746 break;
747 path->slots[0]--;
748 } else if (ret < 0) {
749 break;
750 }
751
752 ret = btrfs_del_item(trans, root, path);
753 if (ret)
754 goto out;
755 btrfs_release_path(path);
756 }
757 ret = 0;
758out:
759 root->fs_info->pending_quota_state = 0;
760 btrfs_free_path(path);
761 return ret;
762}
763
764int btrfs_quota_enable(struct btrfs_trans_handle *trans,
765 struct btrfs_fs_info *fs_info)
766{
767 struct btrfs_root *quota_root;
768 struct btrfs_path *path = NULL;
769 struct btrfs_qgroup_status_item *ptr;
770 struct extent_buffer *leaf;
771 struct btrfs_key key;
772 int ret = 0;
773
774 spin_lock(&fs_info->qgroup_lock);
775 if (fs_info->quota_root) {
776 fs_info->pending_quota_state = 1;
777 spin_unlock(&fs_info->qgroup_lock);
778 goto out;
779 }
780 spin_unlock(&fs_info->qgroup_lock);
781
782 /*
783 * initially create the quota tree
784 */
785 quota_root = btrfs_create_tree(trans, fs_info,
786 BTRFS_QUOTA_TREE_OBJECTID);
787 if (IS_ERR(quota_root)) {
788 ret = PTR_ERR(quota_root);
789 goto out;
790 }
791
792 path = btrfs_alloc_path();
793 if (!path)
794 return -ENOMEM;
795
796 key.objectid = 0;
797 key.type = BTRFS_QGROUP_STATUS_KEY;
798 key.offset = 0;
799
800 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
801 sizeof(*ptr));
802 if (ret)
803 goto out;
804
805 leaf = path->nodes[0];
806 ptr = btrfs_item_ptr(leaf, path->slots[0],
807 struct btrfs_qgroup_status_item);
808 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
809 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
810 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
811 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
812 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
813 btrfs_set_qgroup_status_scan(leaf, ptr, 0);
814
815 btrfs_mark_buffer_dirty(leaf);
816
817 spin_lock(&fs_info->qgroup_lock);
818 fs_info->quota_root = quota_root;
819 fs_info->pending_quota_state = 1;
820 spin_unlock(&fs_info->qgroup_lock);
821out:
822 btrfs_free_path(path);
823 return ret;
824}
825
826int btrfs_quota_disable(struct btrfs_trans_handle *trans,
827 struct btrfs_fs_info *fs_info)
828{
829 struct btrfs_root *tree_root = fs_info->tree_root;
830 struct btrfs_root *quota_root;
831 int ret = 0;
832
833 spin_lock(&fs_info->qgroup_lock);
834 fs_info->quota_enabled = 0;
835 fs_info->pending_quota_state = 0;
836 quota_root = fs_info->quota_root;
837 fs_info->quota_root = NULL;
838 btrfs_free_qgroup_config(fs_info);
839 spin_unlock(&fs_info->qgroup_lock);
840
841 if (!quota_root)
842 return -EINVAL;
843
844 ret = btrfs_clean_quota_tree(trans, quota_root);
845 if (ret)
846 goto out;
847
848 ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
849 if (ret)
850 goto out;
851
852 list_del(&quota_root->dirty_list);
853
854 btrfs_tree_lock(quota_root->node);
855 clean_tree_block(trans, tree_root, quota_root->node);
856 btrfs_tree_unlock(quota_root->node);
857 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
858
859 free_extent_buffer(quota_root->node);
860 free_extent_buffer(quota_root->commit_root);
861 kfree(quota_root);
862out:
863 return ret;
864}
865
866int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
867{
868 /* FIXME */
869 return 0;
870}
871
872int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
873 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
874{
875 struct btrfs_root *quota_root;
876 int ret = 0;
877
878 quota_root = fs_info->quota_root;
879 if (!quota_root)
880 return -EINVAL;
881
882 ret = add_qgroup_relation_item(trans, quota_root, src, dst);
883 if (ret)
884 return ret;
885
886 ret = add_qgroup_relation_item(trans, quota_root, dst, src);
887 if (ret) {
888 del_qgroup_relation_item(trans, quota_root, src, dst);
889 return ret;
890 }
891
892 spin_lock(&fs_info->qgroup_lock);
893 ret = add_relation_rb(quota_root->fs_info, src, dst);
894 spin_unlock(&fs_info->qgroup_lock);
895
896 return ret;
897}
898
899int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
900 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
901{
902 struct btrfs_root *quota_root;
903 int ret = 0;
904 int err;
905
906 quota_root = fs_info->quota_root;
907 if (!quota_root)
908 return -EINVAL;
909
910 ret = del_qgroup_relation_item(trans, quota_root, src, dst);
911 err = del_qgroup_relation_item(trans, quota_root, dst, src);
912 if (err && !ret)
913 ret = err;
914
915 spin_lock(&fs_info->qgroup_lock);
916 del_relation_rb(fs_info, src, dst);
917
918 spin_unlock(&fs_info->qgroup_lock);
919
920 return ret;
921}
922
923int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
924 struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
925{
926 struct btrfs_root *quota_root;
927 struct btrfs_qgroup *qgroup;
928 int ret = 0;
929
930 quota_root = fs_info->quota_root;
931 if (!quota_root)
932 return -EINVAL;
933
934 ret = add_qgroup_item(trans, quota_root, qgroupid);
935
936 spin_lock(&fs_info->qgroup_lock);
937 qgroup = add_qgroup_rb(fs_info, qgroupid);
938 spin_unlock(&fs_info->qgroup_lock);
939
940 if (IS_ERR(qgroup))
941 ret = PTR_ERR(qgroup);
942
943 return ret;
944}
945
946int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
947 struct btrfs_fs_info *fs_info, u64 qgroupid)
948{
949 struct btrfs_root *quota_root;
950 int ret = 0;
951
952 quota_root = fs_info->quota_root;
953 if (!quota_root)
954 return -EINVAL;
955
956 ret = del_qgroup_item(trans, quota_root, qgroupid);
957
958 spin_lock(&fs_info->qgroup_lock);
959 del_qgroup_rb(quota_root->fs_info, qgroupid);
960
961 spin_unlock(&fs_info->qgroup_lock);
962
963 return ret;
964}
965
966int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
967 struct btrfs_fs_info *fs_info, u64 qgroupid,
968 struct btrfs_qgroup_limit *limit)
969{
970 struct btrfs_root *quota_root = fs_info->quota_root;
971 struct btrfs_qgroup *qgroup;
972 int ret = 0;
973
974 if (!quota_root)
975 return -EINVAL;
976
977 ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
978 limit->flags, limit->max_rfer,
979 limit->max_excl, limit->rsv_rfer,
980 limit->rsv_excl);
981 if (ret) {
982 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
983 printk(KERN_INFO "unable to update quota limit for %llu\n",
984 (unsigned long long)qgroupid);
985 }
986
987 spin_lock(&fs_info->qgroup_lock);
988
989 qgroup = find_qgroup_rb(fs_info, qgroupid);
990 if (!qgroup) {
991 ret = -ENOENT;
992 goto unlock;
993 }
994 qgroup->lim_flags = limit->flags;
995 qgroup->max_rfer = limit->max_rfer;
996 qgroup->max_excl = limit->max_excl;
997 qgroup->rsv_rfer = limit->rsv_rfer;
998 qgroup->rsv_excl = limit->rsv_excl;
999
1000unlock:
1001 spin_unlock(&fs_info->qgroup_lock);
1002
1003 return ret;
1004}
1005
1006static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1007 struct btrfs_qgroup *qgroup)
1008{
1009 if (list_empty(&qgroup->dirty))
1010 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1011}
1012
1013/*
 1014 * btrfs_qgroup_record_ref is called when the ref is added or deleted. It puts
1015 * the modification into a list that's later used by btrfs_end_transaction to
1016 * pass the recorded modifications on to btrfs_qgroup_account_ref.
1017 */
1018int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_delayed_ref_node *node,
1020 struct btrfs_delayed_extent_op *extent_op)
1021{
1022 struct qgroup_update *u;
1023
1024 BUG_ON(!trans->delayed_ref_elem.seq);
1025 u = kmalloc(sizeof(*u), GFP_NOFS);
1026 if (!u)
1027 return -ENOMEM;
1028
1029 u->node = node;
1030 u->extent_op = extent_op;
1031 list_add_tail(&u->list, &trans->qgroup_ref_list);
1032
1033 return 0;
1034}
1035
1036/*
1037 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1038 * from the fs. First, all roots referencing the extent are searched, and
1039 * then the space is accounted accordingly to the different roots. The
1040 * accounting algorithm works in 3 steps documented inline.
1041 */
1042int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1043 struct btrfs_fs_info *fs_info,
1044 struct btrfs_delayed_ref_node *node,
1045 struct btrfs_delayed_extent_op *extent_op)
1046{
1047 struct btrfs_key ins;
1048 struct btrfs_root *quota_root;
1049 u64 ref_root;
1050 struct btrfs_qgroup *qgroup;
1051 struct ulist_node *unode;
1052 struct ulist *roots = NULL;
1053 struct ulist *tmp = NULL;
1054 struct ulist_iterator uiter;
1055 u64 seq;
1056 int ret = 0;
1057 int sgn;
1058
1059 if (!fs_info->quota_enabled)
1060 return 0;
1061
1062 BUG_ON(!fs_info->quota_root);
1063
1064 ins.objectid = node->bytenr;
1065 ins.offset = node->num_bytes;
1066 ins.type = BTRFS_EXTENT_ITEM_KEY;
1067
1068 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1069 node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
1070 struct btrfs_delayed_tree_ref *ref;
1071 ref = btrfs_delayed_node_to_tree_ref(node);
1072 ref_root = ref->root;
1073 } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1074 node->type == BTRFS_SHARED_DATA_REF_KEY) {
1075 struct btrfs_delayed_data_ref *ref;
1076 ref = btrfs_delayed_node_to_data_ref(node);
1077 ref_root = ref->root;
1078 } else {
1079 BUG();
1080 }
1081
1082 if (!is_fstree(ref_root)) {
1083 /*
1084 * non-fs-trees are not being accounted
1085 */
1086 return 0;
1087 }
1088
1089 switch (node->action) {
1090 case BTRFS_ADD_DELAYED_REF:
1091 case BTRFS_ADD_DELAYED_EXTENT:
1092 sgn = 1;
1093 break;
1094 case BTRFS_DROP_DELAYED_REF:
1095 sgn = -1;
1096 break;
1097 case BTRFS_UPDATE_DELAYED_HEAD:
1098 return 0;
1099 default:
1100 BUG();
1101 }
1102
1103 /*
1104 * the delayed ref sequence number we pass depends on the direction of
1105 * the operation. for add operations, we pass (node->seq - 1) to skip
1106 * the delayed ref's current sequence number, because we need the state
1107 * of the tree before the add operation. for delete operations, we pass
1108 * (node->seq) to include the delayed ref's current sequence number,
1109 * because we need the state of the tree after the delete operation.
1110 */
1111 ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
1112 sgn > 0 ? node->seq - 1 : node->seq, &roots);
1113 if (ret < 0)
1114 goto out;
1115
1116 spin_lock(&fs_info->qgroup_lock);
1117 quota_root = fs_info->quota_root;
1118 if (!quota_root)
1119 goto unlock;
1120
1121 qgroup = find_qgroup_rb(fs_info, ref_root);
1122 if (!qgroup)
1123 goto unlock;
1124
1125 /*
1126 * step 1: for each old ref, visit all nodes once and inc refcnt
1127 */
1128 tmp = ulist_alloc(GFP_ATOMIC);
1129 if (!tmp) {
1130 ret = -ENOMEM;
1131 goto unlock;
1132 }
1133 seq = fs_info->qgroup_seq;
1134 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1135
1136 ULIST_ITER_INIT(&uiter);
1137 while ((unode = ulist_next(roots, &uiter))) {
1138 struct ulist_node *tmp_unode;
1139 struct ulist_iterator tmp_uiter;
1140 struct btrfs_qgroup *qg;
1141
1142 qg = find_qgroup_rb(fs_info, unode->val);
1143 if (!qg)
1144 continue;
1145
1146 ulist_reinit(tmp);
1147 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist;
1152
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux;
1154 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1;
1156 else
1157 ++qg->refcnt;
1158
1159 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group,
1162 GFP_ATOMIC);
1163 }
1164 }
1165 }
1166
1167 /*
1168 * step 2: walk from the new root
1169 */
1170 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist;
1176
1177 qg = (struct btrfs_qgroup *)unode->aux;
1178 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes;
1181 qg->rfer_cmpr += sgn * node->num_bytes;
1182 if (roots->nnodes == 0) {
1183 qg->excl += sgn * node->num_bytes;
1184 qg->excl_cmpr += sgn * node->num_bytes;
1185 }
1186 qgroup_dirty(fs_info, qg);
1187 }
1188 WARN_ON(qg->tag >= seq);
1189 qg->tag = seq;
1190
1191 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC);
1194 }
1195 }
1196
1197 /*
1198 * step 3: walk again from old refs
1199 */
1200 ULIST_ITER_INIT(&uiter);
1201 while ((unode = ulist_next(roots, &uiter))) {
1202 struct btrfs_qgroup *qg;
1203 struct ulist_node *tmp_unode;
1204 struct ulist_iterator tmp_uiter;
1205
1206 qg = find_qgroup_rb(fs_info, unode->val);
1207 if (!qg)
1208 continue;
1209
1210 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist;
1215
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux;
1217 if (qg->tag == seq)
1218 continue;
1219
1220 if (qg->refcnt - seq == roots->nnodes) {
1221 qg->excl -= sgn * node->num_bytes;
1222 qg->excl_cmpr -= sgn * node->num_bytes;
1223 qgroup_dirty(fs_info, qg);
1224 }
1225
1226 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group,
1229 GFP_ATOMIC);
1230 }
1231 }
1232 }
1233 ret = 0;
1234unlock:
1235 spin_unlock(&fs_info->qgroup_lock);
1236out:
1237 ulist_free(roots);
1238 ulist_free(tmp);
1239
1240 return ret;
1241}
1242
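
(A worked example of the three steps, added editorially: take a 4k extent referenced by subvolumes A and B, both members of qgroup G, and let B drop its reference, so sgn = -1 and roots = {A} after the operation. Step 1 visits A and G and raises their refcnt to seq + 1. Step 2 walks up from B's qgroup: B itself was not visited in step 1, so B->rfer shrinks by 4k; G was visited, so its rfer stays put -- it still references the extent through A -- and both get tagged with seq. Step 3 walks up from A: G carries the tag and is skipped, while A satisfies refcnt - seq == roots->nnodes, i.e. it is reachable from every remaining root, so A->excl grows by 4k: the extent has become exclusive to A. G's excl never changes, since the extent sat under G both before and after.)
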
1243/*
1244 * called from commit_transaction. Writes all changed qgroups to disk.
1245 */
1246int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1247 struct btrfs_fs_info *fs_info)
1248{
1249 struct btrfs_root *quota_root = fs_info->quota_root;
1250 int ret = 0;
1251
1252 if (!quota_root)
1253 goto out;
1254
1255 fs_info->quota_enabled = fs_info->pending_quota_state;
1256
1257 spin_lock(&fs_info->qgroup_lock);
1258 while (!list_empty(&fs_info->dirty_qgroups)) {
1259 struct btrfs_qgroup *qgroup;
1260 qgroup = list_first_entry(&fs_info->dirty_qgroups,
1261 struct btrfs_qgroup, dirty);
1262 list_del_init(&qgroup->dirty);
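		/*
		 * the tree update below may sleep, so the qgroup
		 * spinlock has to be dropped around it
		 */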
1263 spin_unlock(&fs_info->qgroup_lock);
1264 ret = update_qgroup_info_item(trans, quota_root, qgroup);
1265 if (ret)
1266 fs_info->qgroup_flags |=
1267 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1268 spin_lock(&fs_info->qgroup_lock);
1269 }
1270 if (fs_info->quota_enabled)
1271 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
1272 else
1273 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1274 spin_unlock(&fs_info->qgroup_lock);
1275
1276 ret = update_qgroup_status_item(trans, fs_info, quota_root);
1277 if (ret)
1278 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1279
1280out:
1281
1282 return ret;
1283}
1284
1285/*
1286 * copy the accounting information between qgroups. This is necessary when a
1287 * snapshot or a subvolume is created
1288 */
1289int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1290 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
1291 struct btrfs_qgroup_inherit *inherit)
1292{
1293 int ret = 0;
1294 int i;
1295 u64 *i_qgroups;
1296 struct btrfs_root *quota_root = fs_info->quota_root;
1297 struct btrfs_qgroup *srcgroup;
1298 struct btrfs_qgroup *dstgroup;
1299 u32 level_size = 0;
1300
1301 if (!fs_info->quota_enabled)
1302 return 0;
1303
1304 if (!quota_root)
1305 return -EINVAL;
1306
1307 /*
1308 * create a tracking group for the subvol itself
1309 */
1310 ret = add_qgroup_item(trans, quota_root, objectid);
1311 if (ret)
1312 goto out;
1313
1314 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
1315 ret = update_qgroup_limit_item(trans, quota_root, objectid,
1316 inherit->lim.flags,
1317 inherit->lim.max_rfer,
1318 inherit->lim.max_excl,
1319 inherit->lim.rsv_rfer,
1320 inherit->lim.rsv_excl);
1321 if (ret)
1322 goto out;
1323 }
1324
1325 if (srcid) {
1326 struct btrfs_root *srcroot;
1327 struct btrfs_key srckey;
1328 int srcroot_level;
1329
1330 srckey.objectid = srcid;
1331 srckey.type = BTRFS_ROOT_ITEM_KEY;
1332 srckey.offset = (u64)-1;
1333 srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
1334 if (IS_ERR(srcroot)) {
1335 ret = PTR_ERR(srcroot);
1336 goto out;
1337 }
1338
1339 rcu_read_lock();
1340 srcroot_level = btrfs_header_level(srcroot->node);
1341 level_size = btrfs_level_size(srcroot, srcroot_level);
1342 rcu_read_unlock();
1343 }
1344
1345 /*
1346 * add qgroup to all inherited groups
1347 */
1348 if (inherit) {
1349 i_qgroups = (u64 *)(inherit + 1);
1350 for (i = 0; i < inherit->num_qgroups; ++i) {
1351 ret = add_qgroup_relation_item(trans, quota_root,
1352 objectid, *i_qgroups);
1353 if (ret)
1354 goto out;
1355 ret = add_qgroup_relation_item(trans, quota_root,
1356 *i_qgroups, objectid);
1357 if (ret)
1358 goto out;
1359 ++i_qgroups;
1360 }
1361 }
1362
1363
1364 spin_lock(&fs_info->qgroup_lock);
1365
1366 dstgroup = add_qgroup_rb(fs_info, objectid);
1367 if (!dstgroup)
1368 goto unlock;
1369
1370 if (srcid) {
1371 srcgroup = find_qgroup_rb(fs_info, srcid);
1372 if (!srcgroup)
1373 goto unlock;
1374 dstgroup->rfer = srcgroup->rfer - level_size;
1375 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
1376 srcgroup->excl = level_size;
1377 srcgroup->excl_cmpr = level_size;
1378 qgroup_dirty(fs_info, dstgroup);
1379 qgroup_dirty(fs_info, srcgroup);
1380 }
1381
1382 if (!inherit)
1383 goto unlock;
1384
1385 i_qgroups = (u64 *)(inherit + 1);
1386 for (i = 0; i < inherit->num_qgroups; ++i) {
1387 ret = add_relation_rb(quota_root->fs_info, objectid,
1388 *i_qgroups);
1389 if (ret)
1390 goto unlock;
1391 ++i_qgroups;
1392 }
1393
1394 for (i = 0; i < inherit->num_ref_copies; ++i) {
1395 struct btrfs_qgroup *src;
1396 struct btrfs_qgroup *dst;
1397
1398 src = find_qgroup_rb(fs_info, i_qgroups[0]);
1399 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
1400
1401 if (!src || !dst) {
1402 ret = -EINVAL;
1403 goto unlock;
1404 }
1405
1406 dst->rfer = src->rfer - level_size;
1407 dst->rfer_cmpr = src->rfer_cmpr - level_size;
1408 i_qgroups += 2;
1409 }
1410 for (i = 0; i < inherit->num_excl_copies; ++i) {
1411 struct btrfs_qgroup *src;
1412 struct btrfs_qgroup *dst;
1413
1414 src = find_qgroup_rb(fs_info, i_qgroups[0]);
1415 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
1416
1417 if (!src || !dst) {
1418 ret = -EINVAL;
1419 goto unlock;
1420 }
1421
1422 dst->excl = src->excl + level_size;
1423 dst->excl_cmpr = src->excl_cmpr + level_size;
1424 i_qgroups += 2;
1425 }
1426
1427unlock:
1428 spin_unlock(&fs_info->qgroup_lock);
1429out:
1430 return ret;
1431}
1432
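/*
 * Shape of the inherit argument consumed above (the kzalloc sketch is
 * hypothetical, not part of the patch): the u64 array that follows the
 * struct holds num_qgroups single qgroup ids, then num_ref_copies
 * (src, dst) pairs, then num_excl_copies (src, dst) pairs, matching
 * the order in which i_qgroups is advanced above.
 *
 *	struct btrfs_qgroup_inherit *inh;
 *	u64 *ids;
 *
 *	inh = kzalloc(sizeof(*inh) + sizeof(u64), GFP_KERNEL);
 *	inh->num_qgroups = 1;	// one plain qgroup id follows
 *	ids = (u64 *)(inh + 1);
 *	ids[0] = 100;		// hypothetical qgroup id
 */
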
1433/*
1434 * reserve some space for a qgroup and all its parents. The reservation takes
1435 * place with start_transaction or dealloc_reserve, similar to ENOSPC
1436 * accounting. If not enough space is available, EDQUOT is returned.
1437 * We assume that the requested space is new for all qgroups.
1438 */
1439int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1440{
1441 struct btrfs_root *quota_root;
1442 struct btrfs_qgroup *qgroup;
1443 struct btrfs_fs_info *fs_info = root->fs_info;
1444 u64 ref_root = root->root_key.objectid;
1445 int ret = 0;
1446 struct ulist *ulist = NULL;
1447 struct ulist_node *unode;
1448 struct ulist_iterator uiter;
1449
1450 if (!is_fstree(ref_root))
1451 return 0;
1452
1453 if (num_bytes == 0)
1454 return 0;
1455
1456 spin_lock(&fs_info->qgroup_lock);
1457 quota_root = fs_info->quota_root;
1458 if (!quota_root)
1459 goto out;
1460
1461 qgroup = find_qgroup_rb(fs_info, ref_root);
1462 if (!qgroup)
1463 goto out;
1464
1465 /*
1466 * in a first step, we check all affected qgroups if any limits would
1467 * be exceeded
1468 */
1469 ulist = ulist_alloc(GFP_ATOMIC);
1470 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1471 ULIST_ITER_INIT(&uiter);
1472 while ((unode = ulist_next(ulist, &uiter))) {
1473 struct btrfs_qgroup *qg;
1474 struct btrfs_qgroup_list *glist;
1475
1476 qg = (struct btrfs_qgroup *)unode->aux;
1477
1478 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1479 qg->reserved + qg->rfer + num_bytes >
1480 qg->max_rfer)
1481 ret = -EDQUOT;
1482
1483 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
1484 qg->reserved + qg->excl + num_bytes >
1485 qg->max_excl)
1486 ret = -EDQUOT;
1487
1488 list_for_each_entry(glist, &qg->groups, next_group) {
1489 ulist_add(ulist, glist->group->qgroupid,
1490 (unsigned long)glist->group, GFP_ATOMIC);
1491 }
1492 }
1493 if (ret)
1494 goto out;
1495
1496 /*
1497 * no limits exceeded, now record the reservation into all qgroups
1498 */
1499 ULIST_ITER_INIT(&uiter);
1500 while ((unode = ulist_next(ulist, &uiter))) {
1501 struct btrfs_qgroup *qg;
1502
1503 qg = (struct btrfs_qgroup *)unode->aux;
1504
1505 qg->reserved += num_bytes;
1506 }
1507
1508out:
1509 spin_unlock(&fs_info->qgroup_lock);
1510 ulist_free(ulist);
1511
1512 return ret;
1513}
1514
1515void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1516{
1517 struct btrfs_root *quota_root;
1518 struct btrfs_qgroup *qgroup;
1519 struct btrfs_fs_info *fs_info = root->fs_info;
1520 struct ulist *ulist = NULL;
1521 struct ulist_node *unode;
1522 struct ulist_iterator uiter;
1523 u64 ref_root = root->root_key.objectid;
1524
1525 if (!is_fstree(ref_root))
1526 return;
1527
1528 if (num_bytes == 0)
1529 return;
1530
1531 spin_lock(&fs_info->qgroup_lock);
1532
1533 quota_root = fs_info->quota_root;
1534 if (!quota_root)
1535 goto out;
1536
1537 qgroup = find_qgroup_rb(fs_info, ref_root);
1538 if (!qgroup)
1539 goto out;
1540
1541 ulist = ulist_alloc(GFP_ATOMIC);
1542 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1543 ULIST_ITER_INIT(&uiter);
1544 while ((unode = ulist_next(ulist, &uiter))) {
1545 struct btrfs_qgroup *qg;
1546 struct btrfs_qgroup_list *glist;
1547
1548 qg = (struct btrfs_qgroup *)unode->aux;
1549
1550 qg->reserved -= num_bytes;
1551
1552 list_for_each_entry(glist, &qg->groups, next_group) {
1553 ulist_add(ulist, glist->group->qgroupid,
1554 (unsigned long)glist->group, GFP_ATOMIC);
1555 }
1556 }
1557
1558out:
1559 spin_unlock(&fs_info->qgroup_lock);
1560 ulist_free(ulist);
1561}
1562
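/*
 * Typical caller pairing (illustrative; do_something() is a
 * hypothetical consumer): a reservation that ends up unused must be
 * handed back via btrfs_qgroup_free(), otherwise qg->reserved leaks
 * and EDQUOT triggers too early.
 *
 *	ret = btrfs_qgroup_reserve(root, num_bytes);
 *	if (ret)	// -EDQUOT if a limit would be exceeded
 *		return ret;
 *	ret = do_something(root, num_bytes);
 *	if (ret)
 *		btrfs_qgroup_free(root, num_bytes);
 */
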
1563void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1564{
1565 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1566 return;
1567 printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
1568 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1569 trans->delayed_ref_elem.seq);
1570 BUG();
1571}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 646ee21bb03..c5dbd914967 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1239,10 +1239,11 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
 			      node->bytenr, &node->rb_node);
 	spin_unlock(&rc->reloc_root_tree.lock);
 	if (rb_node) {
-		kfree(node);
 		btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
 			    "for start=%llu while inserting into relocation "
 			    "tree\n");
+		kfree(node);
+		return -EEXIST;
 	}
 
 	list_add_tail(&root->root_list, &rc->reloc_roots);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 24fb8ce4e07..6bb465cca20 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,12 +16,55 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/uuid.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "print-tree.h"
 
 /*
+ * Read a root item from the tree. In case we detect a root item smaller than
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
+ */
+void btrfs_read_root_item(struct btrfs_root *root,
+			  struct extent_buffer *eb, int slot,
+			  struct btrfs_root_item *item)
+{
+	uuid_le uuid;
+	int len;
+	int need_reset = 0;
+
+	len = btrfs_item_size_nr(eb, slot);
+	read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+			   min_t(int, len, (int)sizeof(*item)));
+	if (len < sizeof(*item))
+		need_reset = 1;
+	if (!need_reset && btrfs_root_generation(item)
+	    != btrfs_root_generation_v2(item)) {
+		if (btrfs_root_generation_v2(item) != 0) {
+			printk(KERN_WARNING "btrfs: mismatching "
+					"generation and generation_v2 "
+					"found in root item. This root "
+					"was probably mounted with an "
+					"older kernel. Resetting all "
+					"new fields.\n");
+		}
+		need_reset = 1;
+	}
+	if (need_reset) {
+		memset(&item->generation_v2, 0,
+			sizeof(*item) - offsetof(struct btrfs_root_item,
+					generation_v2));
+
+		uuid_le_gen(&uuid);
+		memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+	}
+}
+
+/*
  * lookup the root with the highest offset for a given objectid. The key we do
  * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
  * on error.
@@ -61,10 +104,10 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 		goto out;
 	}
 	if (item)
-		read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
-				   sizeof(*item));
+		btrfs_read_root_item(root, l, slot, item);
 	if (key)
 		memcpy(key, &found_key, sizeof(found_key));
+
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -91,16 +134,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	int slot;
 	unsigned long ptr;
+	int old_len;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto out;
-	}
+	if (ret < 0)
+		goto out_abort;
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
@@ -113,16 +155,56 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	l = path->nodes[0];
 	slot = path->slots[0];
 	ptr = btrfs_item_ptr_offset(l, slot);
+	old_len = btrfs_item_size_nr(l, slot);
+
+	/*
+	 * If this is the first time we update the root item which originated
+	 * from an older kernel, we need to enlarge the item size to make room
+	 * for the added fields.
+	 */
+	if (old_len < sizeof(*item)) {
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(trans, root, key, path,
+				-1, 1);
+		if (ret < 0)
+			goto out_abort;
+		ret = btrfs_del_item(trans, root, path);
+		if (ret < 0)
+			goto out_abort;
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, root, path,
+				key, sizeof(*item));
+		if (ret < 0)
+			goto out_abort;
+		l = path->nodes[0];
+		slot = path->slots[0];
+		ptr = btrfs_item_ptr_offset(l, slot);
+	}
+
+	/*
+	 * Update generation_v2 so at the next mount we know the new root
+	 * fields are valid.
+	 */
+	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
 	write_extent_buffer(l, item, ptr, sizeof(*item));
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_free_path(path);
 	return ret;
+
+out_abort:
+	btrfs_abort_transaction(trans, root, ret);
+	goto out;
 }
 
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      struct btrfs_key *key, struct btrfs_root_item *item)
 {
+	/*
+	 * Make sure generation v1 and v2 match. See update_root for details.
+	 */
+	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
 	return btrfs_insert_item(trans, root, key, item, sizeof(*item));
 }
 
@@ -454,3 +536,16 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
 		root_item->byte_limit = 0;
 	}
 }
+
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root)
+{
+	struct btrfs_root_item *item = &root->root_item;
+	struct timespec ct = CURRENT_TIME;
+
+	spin_lock(&root->root_times_lock);
+	item->ctransid = trans->transid;
+	item->ctime.sec = cpu_to_le64(ct.tv_sec);
+	item->ctime.nsec = cpu_to_le64(ct.tv_nsec);
+	spin_unlock(&root->root_times_lock);
+}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 00000000000..c8ca49b1bb4
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,4571 @@
1/*
2 * Copyright (C) 2012 Alexander Block. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bsearch.h>
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/sort.h>
23#include <linux/mount.h>
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26#include <linux/radix-tree.h>
27#include <linux/crc32c.h>
28
29#include "send.h"
30#include "backref.h"
31#include "locking.h"
32#include "disk-io.h"
33#include "btrfs_inode.h"
34#include "transaction.h"
35
36static int g_verbose = 0;
37
38#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__)
39
40/*
41 * A fs_path is a helper to dynamically build path names with unknown size.
42 * It reallocates the internal buffer on demand.
43 * It allows fast adding of path elements on the right side (normal path) and
44 * fast adding to the left side (reversed path). A reversed path can also be
45 * unreversed if needed.
46 */
47struct fs_path {
48 union {
49 struct {
50 char *start;
51 char *end;
52 char *prepared;
53
54 char *buf;
55 int buf_len;
56 int reversed:1;
57 int virtual_mem:1;
58 char inline_buf[];
59 };
60 char pad[PAGE_SIZE];
61 };
62};
63#define FS_PATH_INLINE_SIZE \
64 (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
65
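/*
 * Usage sketch (illustrative, not part of the patch): a normal path
 * grows at the right end, a reversed one grows at the left end, which
 * suits building a path while walking INODE_REFs from leaf to root:
 *
 *	p = fs_path_alloc_reversed(sctx);
 *	fs_path_add(p, "file", 4);	// p->start is "file"
 *	fs_path_add(p, "dir", 3);	// p->start is "dir/file"
 *	fs_path_unreverse(p);		// moves contents to buffer start
 *	fs_path_free(sctx, p);
 */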
66
67/* reused for each extent */
68struct clone_root {
69 struct btrfs_root *root;
70 u64 ino;
71 u64 offset;
72
73 u64 found_refs;
74};
75
76#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
77#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
78
79struct send_ctx {
80 struct file *send_filp;
81 loff_t send_off;
82 char *send_buf;
83 u32 send_size;
84 u32 send_max_size;
85 u64 total_send_size;
86 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
87
88 struct vfsmount *mnt;
89
90 struct btrfs_root *send_root;
91 struct btrfs_root *parent_root;
92 struct clone_root *clone_roots;
93 int clone_roots_cnt;
94
95 /* current state of the compare_tree call */
96 struct btrfs_path *left_path;
97 struct btrfs_path *right_path;
98 struct btrfs_key *cmp_key;
99
100 /*
101 * info about the currently processed inode. In case of deleted inodes,
102 * these are the values from the deleted inode.
103 */
104 u64 cur_ino;
105 u64 cur_inode_gen;
106 int cur_inode_new;
107 int cur_inode_new_gen;
108 int cur_inode_deleted;
109 int cur_inode_first_ref_orphan;
110 u64 cur_inode_size;
111 u64 cur_inode_mode;
112
113 u64 send_progress;
114
115 struct list_head new_refs;
116 struct list_head deleted_refs;
117
118 struct radix_tree_root name_cache;
119 struct list_head name_cache_list;
120 int name_cache_size;
121
122 struct file *cur_inode_filp;
123 char *read_buf;
124};
125
126struct name_cache_entry {
127 struct list_head list;
128 struct list_head use_list;
129 u64 ino;
130 u64 gen;
131 u64 parent_ino;
132 u64 parent_gen;
133 int ret;
134 int need_later_update;
135 int name_len;
136 char name[];
137};
138
139static void fs_path_reset(struct fs_path *p)
140{
141 if (p->reversed) {
142 p->start = p->buf + p->buf_len - 1;
143 p->end = p->start;
144 *p->start = 0;
145 } else {
146 p->start = p->buf;
147 p->end = p->start;
148 *p->start = 0;
149 }
150}
151
152static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
153{
154 struct fs_path *p;
155
156 p = kmalloc(sizeof(*p), GFP_NOFS);
157 if (!p)
158 return NULL;
159 p->reversed = 0;
160 p->virtual_mem = 0;
161 p->buf = p->inline_buf;
162 p->buf_len = FS_PATH_INLINE_SIZE;
163 fs_path_reset(p);
164 return p;
165}
166
167static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
168{
169 struct fs_path *p;
170
171 p = fs_path_alloc(sctx);
172 if (!p)
173 return NULL;
174 p->reversed = 1;
175 fs_path_reset(p);
176 return p;
177}
178
179static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
180{
181 if (!p)
182 return;
183 if (p->buf != p->inline_buf) {
184 if (p->virtual_mem)
185 vfree(p->buf);
186 else
187 kfree(p->buf);
188 }
189 kfree(p);
190}
191
192static int fs_path_len(struct fs_path *p)
193{
194 return p->end - p->start;
195}
196
197static int fs_path_ensure_buf(struct fs_path *p, int len)
198{
199 char *tmp_buf;
200 int path_len;
201 int old_buf_len;
202
203 len++;
204
205 if (p->buf_len >= len)
206 return 0;
207
208 path_len = p->end - p->start;
209 old_buf_len = p->buf_len;
210 len = PAGE_ALIGN(len);
211
212 if (p->buf == p->inline_buf) {
213 tmp_buf = kmalloc(len, GFP_NOFS);
214 if (!tmp_buf) {
215 tmp_buf = vmalloc(len);
216 if (!tmp_buf)
217 return -ENOMEM;
218 p->virtual_mem = 1;
219 }
220 memcpy(tmp_buf, p->buf, p->buf_len);
221 p->buf = tmp_buf;
222 p->buf_len = len;
223 } else {
224 if (p->virtual_mem) {
225 tmp_buf = vmalloc(len);
226 if (!tmp_buf)
227 return -ENOMEM;
228 memcpy(tmp_buf, p->buf, p->buf_len);
229 vfree(p->buf);
230 } else {
231 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
232 if (!tmp_buf) {
233 tmp_buf = vmalloc(len);
234 if (!tmp_buf)
235 return -ENOMEM;
236 memcpy(tmp_buf, p->buf, p->buf_len);
237 kfree(p->buf);
238 p->virtual_mem = 1;
239 }
240 }
241 p->buf = tmp_buf;
242 p->buf_len = len;
243 }
244 if (p->reversed) {
245 tmp_buf = p->buf + old_buf_len - path_len - 1;
246 p->end = p->buf + p->buf_len - 1;
247 p->start = p->end - path_len;
248 memmove(p->start, tmp_buf, path_len + 1);
249 } else {
250 p->start = p->buf;
251 p->end = p->start + path_len;
252 }
253 return 0;
254}
255
256static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
257{
258 int ret;
259 int new_len;
260
261 new_len = p->end - p->start + name_len;
262 if (p->start != p->end)
263 new_len++;
264 ret = fs_path_ensure_buf(p, new_len);
265 if (ret < 0)
266 goto out;
267
268 if (p->reversed) {
269 if (p->start != p->end)
270 *--p->start = '/';
271 p->start -= name_len;
272 p->prepared = p->start;
273 } else {
274 if (p->start != p->end)
275 *p->end++ = '/';
276 p->prepared = p->end;
277 p->end += name_len;
278 *p->end = 0;
279 }
280
281out:
282 return ret;
283}
284
285static int fs_path_add(struct fs_path *p, const char *name, int name_len)
286{
287 int ret;
288
289 ret = fs_path_prepare_for_add(p, name_len);
290 if (ret < 0)
291 goto out;
292 memcpy(p->prepared, name, name_len);
293 p->prepared = NULL;
294
295out:
296 return ret;
297}
298
299static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
300{
301 int ret;
302
303 ret = fs_path_prepare_for_add(p, p2->end - p2->start);
304 if (ret < 0)
305 goto out;
306 memcpy(p->prepared, p2->start, p2->end - p2->start);
307 p->prepared = NULL;
308
309out:
310 return ret;
311}
312
313static int fs_path_add_from_extent_buffer(struct fs_path *p,
314 struct extent_buffer *eb,
315 unsigned long off, int len)
316{
317 int ret;
318
319 ret = fs_path_prepare_for_add(p, len);
320 if (ret < 0)
321 goto out;
322
323 read_extent_buffer(eb, p->prepared, off, len);
324 p->prepared = NULL;
325
326out:
327 return ret;
328}
329
330static void fs_path_remove(struct fs_path *p)
331{
332 BUG_ON(p->reversed);
333 while (p->start != p->end && *p->end != '/')
334 p->end--;
335 *p->end = 0;
336}
337
338static int fs_path_copy(struct fs_path *p, struct fs_path *from)
339{
340 int ret;
341
342 p->reversed = from->reversed;
343 fs_path_reset(p);
344
345 ret = fs_path_add_path(p, from);
346
347 return ret;
348}
349
350
351static void fs_path_unreverse(struct fs_path *p)
352{
353 char *tmp;
354 int len;
355
356 if (!p->reversed)
357 return;
358
359 tmp = p->start;
360 len = p->end - p->start;
361 p->start = p->buf;
362 p->end = p->start + len;
363 memmove(p->start, tmp, len + 1);
364 p->reversed = 0;
365}
366
367static struct btrfs_path *alloc_path_for_send(void)
368{
369 struct btrfs_path *path;
370
371 path = btrfs_alloc_path();
372 if (!path)
373 return NULL;
374 path->search_commit_root = 1;
375 path->skip_locking = 1;
376 return path;
377}
378
379static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
380{
381 int ret;
382 mm_segment_t old_fs;
383 u32 pos = 0;
384
385 old_fs = get_fs();
386 set_fs(KERNEL_DS);
387
388 while (pos < len) {
389 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
390 &sctx->send_off);
391 /* TODO handle that correctly */
392 /*if (ret == -ERESTARTSYS) {
393 continue;
394 }*/
395 if (ret < 0)
396 goto out;
397 if (ret == 0) {
398 ret = -EIO;
399 goto out;
400 }
401 pos += ret;
402 }
403
404 ret = 0;
405
406out:
407 set_fs(old_fs);
408 return ret;
409}
410
411static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
412{
413 struct btrfs_tlv_header *hdr;
414 int total_len = sizeof(*hdr) + len;
415 int left = sctx->send_max_size - sctx->send_size;
416
417 if (unlikely(left < total_len))
418 return -EOVERFLOW;
419
420 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
421 hdr->tlv_type = cpu_to_le16(attr);
422 hdr->tlv_len = cpu_to_le16(len);
423 memcpy(hdr + 1, data, len);
424 sctx->send_size += total_len;
425
426 return 0;
427}
428
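/*
 * Resulting layout of one attribute in send_buf (derived from the code
 * above): a btrfs_tlv_header carrying tlv_type and tlv_len, directly
 * followed by tlv_len bytes of payload; attributes are packed back to
 * back behind the command header.
 */
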
429#if 0
430static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
431{
432 return tlv_put(sctx, attr, &value, sizeof(value));
433}
434
435static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
436{
437 __le16 tmp = cpu_to_le16(value);
438 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
439}
440
441static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
442{
443 __le32 tmp = cpu_to_le32(value);
444 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
445}
446#endif
447
448static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
449{
450 __le64 tmp = cpu_to_le64(value);
451 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
452}
453
454static int tlv_put_string(struct send_ctx *sctx, u16 attr,
455 const char *str, int len)
456{
457 if (len == -1)
458 len = strlen(str);
459 return tlv_put(sctx, attr, str, len);
460}
461
462static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
463 const u8 *uuid)
464{
465 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
466}
467
468#if 0
469static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
470 struct timespec *ts)
471{
472 struct btrfs_timespec bts;
473 bts.sec = cpu_to_le64(ts->tv_sec);
474 bts.nsec = cpu_to_le32(ts->tv_nsec);
475 return tlv_put(sctx, attr, &bts, sizeof(bts));
476}
477#endif
478
479static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
480 struct extent_buffer *eb,
481 struct btrfs_timespec *ts)
482{
483 struct btrfs_timespec bts;
484 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
485 return tlv_put(sctx, attr, &bts, sizeof(bts));
486}
487
488
489#define TLV_PUT(sctx, attrtype, attrlen, data) \
490 do { \
491 ret = tlv_put(sctx, attrtype, attrlen, data); \
492 if (ret < 0) \
493 goto tlv_put_failure; \
494 } while (0)
495
496#define TLV_PUT_INT(sctx, attrtype, bits, value) \
497 do { \
498 ret = tlv_put_u##bits(sctx, attrtype, value); \
499 if (ret < 0) \
500 goto tlv_put_failure; \
501 } while (0)
502
503#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
504#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
505#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
506#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
507#define TLV_PUT_STRING(sctx, attrtype, str, len) \
508 do { \
509 ret = tlv_put_string(sctx, attrtype, str, len); \
510 if (ret < 0) \
511 goto tlv_put_failure; \
512 } while (0)
513#define TLV_PUT_PATH(sctx, attrtype, p) \
514 do { \
515 ret = tlv_put_string(sctx, attrtype, p->start, \
516 p->end - p->start); \
517 if (ret < 0) \
518 goto tlv_put_failure; \
519 } while(0)
520#define TLV_PUT_UUID(sctx, attrtype, uuid) \
521 do { \
522 ret = tlv_put_uuid(sctx, attrtype, uuid); \
523 if (ret < 0) \
524 goto tlv_put_failure; \
525 } while (0)
526#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
527 do { \
528 ret = tlv_put_timespec(sctx, attrtype, ts); \
529 if (ret < 0) \
530 goto tlv_put_failure; \
531 } while (0)
532#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
533 do { \
534 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
535 if (ret < 0) \
536 goto tlv_put_failure; \
537 } while (0)
538
539static int send_header(struct send_ctx *sctx)
540{
541 struct btrfs_stream_header hdr;
542
543 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
544 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
545
546 return write_buf(sctx, &hdr, sizeof(hdr));
547}
548
549/*
550 * For each command/item we want to send to userspace, we call this function.
551 */
552static int begin_cmd(struct send_ctx *sctx, int cmd)
553{
554 struct btrfs_cmd_header *hdr;
555
556 if (!sctx->send_buf) {
557 WARN_ON(1);
558 return -EINVAL;
559 }
560
561 BUG_ON(sctx->send_size);
562
563 sctx->send_size += sizeof(*hdr);
564 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
565 hdr->cmd = cpu_to_le16(cmd);
566
567 return 0;
568}
569
570static int send_cmd(struct send_ctx *sctx)
571{
572 int ret;
573 struct btrfs_cmd_header *hdr;
574 u32 crc;
575
576 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
577 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
578 hdr->crc = 0;
579
580 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
581 hdr->crc = cpu_to_le32(crc);
582
583 ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
584
585 sctx->total_send_size += sctx->send_size;
586 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
587 sctx->send_size = 0;
588
589 return ret;
590}
591
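/*
 * A complete command in the stream therefore looks like this (derived
 * from begin_cmd/send_cmd above):
 *
 *	btrfs_cmd_header { len, cmd, crc } | attribute TLVs ...
 *
 * len counts only the TLV bytes, and crc is the crc32c of the whole
 * command computed with the crc field itself zeroed.
 */
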
592/*
593 * Sends a move instruction to user space
594 */
595static int send_rename(struct send_ctx *sctx,
596 struct fs_path *from, struct fs_path *to)
597{
598 int ret;
599
600verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start);
601
602 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
603 if (ret < 0)
604 goto out;
605
606 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
607 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
608
609 ret = send_cmd(sctx);
610
611tlv_put_failure:
612out:
613 return ret;
614}
615
616/*
617 * Sends a link instruction to user space
618 */
619static int send_link(struct send_ctx *sctx,
620 struct fs_path *path, struct fs_path *lnk)
621{
622 int ret;
623
624verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start);
625
626 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
627 if (ret < 0)
628 goto out;
629
630 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
631 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
632
633 ret = send_cmd(sctx);
634
635tlv_put_failure:
636out:
637 return ret;
638}
639
640/*
641 * Sends an unlink instruction to user space
642 */
643static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
644{
645 int ret;
646
647verbose_printk("btrfs: send_unlink %s\n", path->start);
648
649 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
650 if (ret < 0)
651 goto out;
652
653 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
654
655 ret = send_cmd(sctx);
656
657tlv_put_failure:
658out:
659 return ret;
660}
661
662/*
663 * Sends a rmdir instruction to user space
664 */
665static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
666{
667 int ret;
668
669verbose_printk("btrfs: send_rmdir %s\n", path->start);
670
671 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
672 if (ret < 0)
673 goto out;
674
675 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
676
677 ret = send_cmd(sctx);
678
679tlv_put_failure:
680out:
681 return ret;
682}
683
684/*
685 * Helper function to retrieve some fields from an inode item.
686 */
687static int get_inode_info(struct btrfs_root *root,
688 u64 ino, u64 *size, u64 *gen,
689 u64 *mode, u64 *uid, u64 *gid)
690{
691 int ret;
692 struct btrfs_inode_item *ii;
693 struct btrfs_key key;
694 struct btrfs_path *path;
695
696 path = alloc_path_for_send();
697 if (!path)
698 return -ENOMEM;
699
700 key.objectid = ino;
701 key.type = BTRFS_INODE_ITEM_KEY;
702 key.offset = 0;
703 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
704 if (ret < 0)
705 goto out;
706 if (ret) {
707 ret = -ENOENT;
708 goto out;
709 }
710
711 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
712 struct btrfs_inode_item);
713 if (size)
714 *size = btrfs_inode_size(path->nodes[0], ii);
715 if (gen)
716 *gen = btrfs_inode_generation(path->nodes[0], ii);
717 if (mode)
718 *mode = btrfs_inode_mode(path->nodes[0], ii);
719 if (uid)
720 *uid = btrfs_inode_uid(path->nodes[0], ii);
721 if (gid)
722 *gid = btrfs_inode_gid(path->nodes[0], ii);
723
724out:
725 btrfs_free_path(path);
726 return ret;
727}
728
729typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
730 struct fs_path *p,
731 void *ctx);
732
733/*
734 * Helper function to iterate the entries in ONE btrfs_inode_ref.
735 * The iterate callback may return a non-zero value to stop iteration. This can
736 * be a negative value for error codes or 1 to simply stop it.
737 *
738 * path must point to the INODE_REF when called.
739 */
740static int iterate_inode_ref(struct send_ctx *sctx,
741 struct btrfs_root *root, struct btrfs_path *path,
742 struct btrfs_key *found_key, int resolve,
743 iterate_inode_ref_t iterate, void *ctx)
744{
745 struct extent_buffer *eb;
746 struct btrfs_item *item;
747 struct btrfs_inode_ref *iref;
748 struct btrfs_path *tmp_path;
749 struct fs_path *p;
750 u32 cur;
751 u32 len;
752 u32 total;
753 int slot;
754 u32 name_len;
755 char *start;
756 int ret = 0;
757 int num;
758 int index;
759
760 p = fs_path_alloc_reversed(sctx);
761 if (!p)
762 return -ENOMEM;
763
764 tmp_path = alloc_path_for_send();
765 if (!tmp_path) {
766 fs_path_free(sctx, p);
767 return -ENOMEM;
768 }
769
770 eb = path->nodes[0];
771 slot = path->slots[0];
772 item = btrfs_item_nr(eb, slot);
773 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
774 cur = 0;
775 len = 0;
776 total = btrfs_item_size(eb, item);
777
778 num = 0;
779 while (cur < total) {
780 fs_path_reset(p);
781
782 name_len = btrfs_inode_ref_name_len(eb, iref);
783 index = btrfs_inode_ref_index(eb, iref);
784 if (resolve) {
785 start = btrfs_iref_to_path(root, tmp_path, iref, eb,
786 found_key->offset, p->buf,
787 p->buf_len);
788 if (IS_ERR(start)) {
789 ret = PTR_ERR(start);
790 goto out;
791 }
792 if (start < p->buf) {
793 /* overflow, try again with a larger buffer */
794 ret = fs_path_ensure_buf(p,
795 p->buf_len + p->buf - start);
796 if (ret < 0)
797 goto out;
798 start = btrfs_iref_to_path(root, tmp_path, iref,
799 eb, found_key->offset, p->buf,
800 p->buf_len);
801 if (IS_ERR(start)) {
802 ret = PTR_ERR(start);
803 goto out;
804 }
805 BUG_ON(start < p->buf);
806 }
807 p->start = start;
808 } else {
809 ret = fs_path_add_from_extent_buffer(p, eb,
810 (unsigned long)(iref + 1), name_len);
811 if (ret < 0)
812 goto out;
813 }
814
815
816 len = sizeof(*iref) + name_len;
817 iref = (struct btrfs_inode_ref *)((char *)iref + len);
818 cur += len;
819
820 ret = iterate(num, found_key->offset, index, p, ctx);
821 if (ret)
822 goto out;
823
824 num++;
825 }
826
827out:
828 btrfs_free_path(tmp_path);
829 fs_path_free(sctx, p);
830 return ret;
831}
832
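/*
 * Layout walked by the loop above: one INODE_REF item packs several
 * entries back to back, each a btrfs_inode_ref header directly
 * followed by name_len bytes of name:
 *
 *	| btrfs_inode_ref | name | btrfs_inode_ref | name | ...
 */
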
833typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
834 const char *name, int name_len,
835 const char *data, int data_len,
836 u8 type, void *ctx);
837
838/*
839 * Helper function to iterate the entries in ONE btrfs_dir_item.
840 * The iterate callback may return a non-zero value to stop iteration. This can
841 * be a negative value for error codes or 1 to simply stop it.
842 *
843 * path must point to the dir item when called.
844 */
845static int iterate_dir_item(struct send_ctx *sctx,
846 struct btrfs_root *root, struct btrfs_path *path,
847 struct btrfs_key *found_key,
848 iterate_dir_item_t iterate, void *ctx)
849{
850 int ret = 0;
851 struct extent_buffer *eb;
852 struct btrfs_item *item;
853 struct btrfs_dir_item *di;
854 struct btrfs_path *tmp_path = NULL;
855 struct btrfs_key di_key;
856 char *buf = NULL;
857 char *buf2 = NULL;
858 int buf_len;
859 int buf_virtual = 0;
860 u32 name_len;
861 u32 data_len;
862 u32 cur;
863 u32 len;
864 u32 total;
865 int slot;
866 int num;
867 u8 type;
868
869 buf_len = PAGE_SIZE;
870 buf = kmalloc(buf_len, GFP_NOFS);
871 if (!buf) {
872 ret = -ENOMEM;
873 goto out;
874 }
875
876 tmp_path = alloc_path_for_send();
877 if (!tmp_path) {
878 ret = -ENOMEM;
879 goto out;
880 }
881
882 eb = path->nodes[0];
883 slot = path->slots[0];
884 item = btrfs_item_nr(eb, slot);
885 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
886 cur = 0;
887 len = 0;
888 total = btrfs_item_size(eb, item);
889
890 num = 0;
891 while (cur < total) {
892 name_len = btrfs_dir_name_len(eb, di);
893 data_len = btrfs_dir_data_len(eb, di);
894 type = btrfs_dir_type(eb, di);
895 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
896
897 if (name_len + data_len > buf_len) {
898 buf_len = PAGE_ALIGN(name_len + data_len);
899 if (buf_virtual) {
900 buf2 = vmalloc(buf_len);
901 if (!buf2) {
902 ret = -ENOMEM;
903 goto out;
904 }
905 vfree(buf);
906 } else {
907 buf2 = krealloc(buf, buf_len, GFP_NOFS);
908 if (!buf2) {
909 buf2 = vmalloc(buf_len);
910 if (!buf2) {
911 ret = -ENOMEM;
912 goto out;
913 }
914 kfree(buf);
915 buf_virtual = 1;
916 }
917 }
918
919 buf = buf2;
920 buf2 = NULL;
921 }
922
923 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
924 name_len + data_len);
925
926 len = sizeof(*di) + name_len + data_len;
927 di = (struct btrfs_dir_item *)((char *)di + len);
928 cur += len;
929
930 ret = iterate(num, &di_key, buf, name_len, buf + name_len,
931 data_len, type, ctx);
932 if (ret < 0)
933 goto out;
934 if (ret) {
935 ret = 0;
936 goto out;
937 }
938
939 num++;
940 }
941
942out:
943 btrfs_free_path(tmp_path);
944 if (buf_virtual)
945 vfree(buf);
946 else
947 kfree(buf);
948 return ret;
949}
950
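/*
 * Analogous layout for dir items: a sequence of btrfs_dir_item
 * headers, each directly followed by name_len bytes of name and
 * data_len bytes of data (data is only used by xattr items):
 *
 *	| btrfs_dir_item | name | data | btrfs_dir_item | name | data |
 */
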
951static int __copy_first_ref(int num, u64 dir, int index,
952 struct fs_path *p, void *ctx)
953{
954 int ret;
955 struct fs_path *pt = ctx;
956
957 ret = fs_path_copy(pt, p);
958 if (ret < 0)
959 return ret;
960
961 /* we want the first only */
962 return 1;
963}
964
965/*
966 * Retrieve the first path of an inode. If an inode has more than one
967 * ref/hardlink, this is ignored.
968 */
969static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
970 u64 ino, struct fs_path *path)
971{
972 int ret;
973 struct btrfs_key key, found_key;
974 struct btrfs_path *p;
975
976 p = alloc_path_for_send();
977 if (!p)
978 return -ENOMEM;
979
980 fs_path_reset(path);
981
982 key.objectid = ino;
983 key.type = BTRFS_INODE_REF_KEY;
984 key.offset = 0;
985
986 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
987 if (ret < 0)
988 goto out;
989 if (ret) {
990 ret = 1;
991 goto out;
992 }
993 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
994 if (found_key.objectid != ino ||
995 found_key.type != BTRFS_INODE_REF_KEY) {
996 ret = -ENOENT;
997 goto out;
998 }
999
1000 ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
1001 __copy_first_ref, path);
1002 if (ret < 0)
1003 goto out;
1004 ret = 0;
1005
1006out:
1007 btrfs_free_path(p);
1008 return ret;
1009}
1010
1011struct backref_ctx {
1012 struct send_ctx *sctx;
1013
1014 /* number of total found references */
1015 u64 found;
1016
1017 /*
1018 * used for clones found in send_root. Clones found beyond cur_objectid
1019 * and cur_offset are not considered allowed clones.
1020 */
1021 u64 cur_objectid;
1022 u64 cur_offset;
1023
1024 /* may be truncated in case it's the last extent in a file */
1025 u64 extent_len;
1026
1027 /* Just to check for bugs in backref resolving */
1028 int found_in_send_root;
1029};
1030
1031static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1032{
1033 u64 root = (u64)key;
1034 struct clone_root *cr = (struct clone_root *)elt;
1035
1036 if (root < cr->root->objectid)
1037 return -1;
1038 if (root > cr->root->objectid)
1039 return 1;
1040 return 0;
1041}
1042
1043static int __clone_root_cmp_sort(const void *e1, const void *e2)
1044{
1045 struct clone_root *cr1 = (struct clone_root *)e1;
1046 struct clone_root *cr2 = (struct clone_root *)e2;
1047
1048 if (cr1->root->objectid < cr2->root->objectid)
1049 return -1;
1050 if (cr1->root->objectid > cr2->root->objectid)
1051 return 1;
1052 return 0;
1053}
1054
1055/*
1056 * Called for every backref that is found for the current extent.
1057 */
1058static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1059{
1060 struct backref_ctx *bctx = ctx_;
1061 struct clone_root *found;
1062 int ret;
1063 u64 i_size;
1064
1065 /* First check if the root is in the list of accepted clone sources */
1066 found = bsearch((void *)root, bctx->sctx->clone_roots,
1067 bctx->sctx->clone_roots_cnt,
1068 sizeof(struct clone_root),
1069 __clone_root_cmp_bsearch);
1070 if (!found)
1071 return 0;
1072
1073 if (found->root == bctx->sctx->send_root &&
1074 ino == bctx->cur_objectid &&
1075 offset == bctx->cur_offset) {
1076 bctx->found_in_send_root = 1;
1077 }
1078
1079 /*
1080 * There are inodes that have extents lying beyond their i_size. Don't
1081 * accept clones from these extents.
1082 */
1083 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
1084 if (ret < 0)
1085 return ret;
1086
1087 if (offset + bctx->extent_len > i_size)
1088 return 0;
1089
1090 /*
1091 * Make sure we don't consider clones from send_root that are
1092 * beyond the current inode/offset.
1093 */
1094 if (found->root == bctx->sctx->send_root) {
1095 /*
1096 * TODO for the moment we don't accept clones from the inode
1097 * that is currently being sent. We may change this when
1098 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
1099 * file.
1100 */
1101 if (ino >= bctx->cur_objectid)
1102 return 0;
1103 /*if (ino > ctx->cur_objectid)
1104 return 0;
1105 if (offset + ctx->extent_len > ctx->cur_offset)
1106 return 0;*/
1107
1108 bctx->found++;
1109 found->found_refs++;
1110 found->ino = ino;
1111 found->offset = offset;
1112 return 0;
1113 }
1114
1115 bctx->found++;
1116 found->found_refs++;
1117 if (ino < found->ino) {
1118 found->ino = ino;
1119 found->offset = offset;
1120 } else if (found->ino == ino) {
1121 /*
1122 * same extent found more than once in the same file.
1123 */
1124 if (found->offset > offset + bctx->extent_len)
1125 found->offset = offset;
1126 }
1127
1128 return 0;
1129}
1130
1131/*
1132 * path must point to the extent item when called.
1133 */
1134static int find_extent_clone(struct send_ctx *sctx,
1135 struct btrfs_path *path,
1136 u64 ino, u64 data_offset,
1137 u64 ino_size,
1138 struct clone_root **found)
1139{
1140 int ret;
1141 int extent_type;
1142 u64 logical;
1143 u64 num_bytes;
1144 u64 extent_item_pos;
1145 struct btrfs_file_extent_item *fi;
1146 struct extent_buffer *eb = path->nodes[0];
1147 struct backref_ctx backref_ctx;
1148 struct clone_root *cur_clone_root;
1149 struct btrfs_key found_key;
1150 struct btrfs_path *tmp_path;
1151 u32 i;
1152
1153 tmp_path = alloc_path_for_send();
1154 if (!tmp_path)
1155 return -ENOMEM;
1156
1157 if (data_offset >= ino_size) {
1158 /*
1159 * There may be extents that lie beyond the file's size.
1160 * I at least had this in combination with snapshotting while
1161 * writing large files.
1162 */
1163 ret = 0;
1164 goto out;
1165 }
1166
1167 fi = btrfs_item_ptr(eb, path->slots[0],
1168 struct btrfs_file_extent_item);
1169 extent_type = btrfs_file_extent_type(eb, fi);
1170 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1176 logical = btrfs_file_extent_disk_bytenr(eb, fi);
1177 if (logical == 0) {
1178 ret = -ENOENT;
1179 goto out;
1180 }
1181 logical += btrfs_file_extent_offset(eb, fi);
1182
1183 ret = extent_from_logical(sctx->send_root->fs_info,
1184 logical, tmp_path, &found_key);
1185 btrfs_release_path(tmp_path);
1186
1187 if (ret < 0)
1188 goto out;
1189 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1190 ret = -EIO;
1191 goto out;
1192 }
1193
1194 /*
1195 * Setup the clone roots.
1196 */
1197 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1198 cur_clone_root = sctx->clone_roots + i;
1199 cur_clone_root->ino = (u64)-1;
1200 cur_clone_root->offset = 0;
1201 cur_clone_root->found_refs = 0;
1202 }
1203
1204 backref_ctx.sctx = sctx;
1205 backref_ctx.found = 0;
1206 backref_ctx.cur_objectid = ino;
1207 backref_ctx.cur_offset = data_offset;
1208 backref_ctx.found_in_send_root = 0;
1209 backref_ctx.extent_len = num_bytes;
1210
1211 /*
1212 * The last extent of a file may be too large due to page alignment.
1213 * We need to adjust extent_len in this case so that the checks in
1214 * __iterate_backrefs work.
1215 */
1216 if (data_offset + num_bytes >= ino_size)
1217 backref_ctx.extent_len = ino_size - data_offset;
1218
1219 /*
1220 * Now collect all backrefs.
1221 */
1222 extent_item_pos = logical - found_key.objectid;
1223 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1224 found_key.objectid, extent_item_pos, 1,
1225 __iterate_backrefs, &backref_ctx);
1226 if (ret < 0)
1227 goto out;
1228
1229 if (!backref_ctx.found_in_send_root) {
1230 /* found a bug in backref code? */
1231 ret = -EIO;
1232 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1233 "send_root. inode=%llu, offset=%llu, "
1234 "logical=%llu\n",
1235 ino, data_offset, logical);
1236 goto out;
1237 }
1238
1239verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1240 "ino=%llu, "
1241 "num_bytes=%llu, logical=%llu\n",
1242 data_offset, ino, num_bytes, logical);
1243
1244 if (!backref_ctx.found)
1245 verbose_printk("btrfs: no clones found\n");
1246
1247 cur_clone_root = NULL;
1248 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1249 if (sctx->clone_roots[i].found_refs) {
1250 if (!cur_clone_root)
1251 cur_clone_root = sctx->clone_roots + i;
1252 else if (sctx->clone_roots[i].root == sctx->send_root)
1253 /* prefer clones from send_root over others */
1254 cur_clone_root = sctx->clone_roots + i;
1255 break;
1256 }
1257
1258 }
1259
1260 if (cur_clone_root) {
1261 *found = cur_clone_root;
1262 ret = 0;
1263 } else {
1264 ret = -ENOENT;
1265 }
1266
1267out:
1268 btrfs_free_path(tmp_path);
1269 return ret;
1270}
1271
1272static int read_symlink(struct send_ctx *sctx,
1273 struct btrfs_root *root,
1274 u64 ino,
1275 struct fs_path *dest)
1276{
1277 int ret;
1278 struct btrfs_path *path;
1279 struct btrfs_key key;
1280 struct btrfs_file_extent_item *ei;
1281 u8 type;
1282 u8 compression;
1283 unsigned long off;
1284 int len;
1285
1286 path = alloc_path_for_send();
1287 if (!path)
1288 return -ENOMEM;
1289
1290 key.objectid = ino;
1291 key.type = BTRFS_EXTENT_DATA_KEY;
1292 key.offset = 0;
1293 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1294 if (ret < 0)
1295 goto out;
1296 BUG_ON(ret);
1297
1298 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1299 struct btrfs_file_extent_item);
1300 type = btrfs_file_extent_type(path->nodes[0], ei);
1301 compression = btrfs_file_extent_compression(path->nodes[0], ei);
1302 BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
1303 BUG_ON(compression);
1304
1305 off = btrfs_file_extent_inline_start(ei);
1306 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1307
1308 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1309 if (ret < 0)
1310 goto out;
1311
1312out:
1313 btrfs_free_path(path);
1314 return ret;
1315}
1316
1317/*
1318 * Helper function to generate a file name that is unique in the root of
1319 * send_root and parent_root. This is used to generate names for orphan inodes.
1320 */
1321static int gen_unique_name(struct send_ctx *sctx,
1322 u64 ino, u64 gen,
1323 struct fs_path *dest)
1324{
1325 int ret = 0;
1326 struct btrfs_path *path;
1327 struct btrfs_dir_item *di;
1328 char tmp[64];
1329 int len;
1330 u64 idx = 0;
1331
1332 path = alloc_path_for_send();
1333 if (!path)
1334 return -ENOMEM;
1335
1336 while (1) {
1337 len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
1338 ino, gen, idx);
1339 if (len >= sizeof(tmp)) {
1340 /* should really not happen */
1341 ret = -EOVERFLOW;
1342 goto out;
1343 }
1344
1345 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1346 path, BTRFS_FIRST_FREE_OBJECTID,
1347 tmp, strlen(tmp), 0);
1348 btrfs_release_path(path);
1349 if (IS_ERR(di)) {
1350 ret = PTR_ERR(di);
1351 goto out;
1352 }
1353 if (di) {
1354 /* not unique, try again */
1355 idx++;
1356 continue;
1357 }
1358
1359 if (!sctx->parent_root) {
1360 /* unique */
1361 ret = 0;
1362 break;
1363 }
1364
1365 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1366 path, BTRFS_FIRST_FREE_OBJECTID,
1367 tmp, strlen(tmp), 0);
1368 btrfs_release_path(path);
1369 if (IS_ERR(di)) {
1370 ret = PTR_ERR(di);
1371 goto out;
1372 }
1373 if (di) {
1374 /* not unique, try again */
1375 idx++;
1376 continue;
1377 }
1378 /* unique */
1379 break;
1380 }
1381
1382 ret = fs_path_add(dest, tmp, strlen(tmp));
1383
1384out:
1385 btrfs_free_path(path);
1386 return ret;
1387}
1388
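/*
 * Example of the names generated above (illustrative): for inode 261
 * with generation 5 this tries "o261-5-0" first and, on a collision in
 * either root, continues with "o261-5-1", "o261-5-2", ... until the
 * name is free in both the send root and the parent root.
 */
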
1389enum inode_state {
1390 inode_state_no_change,
1391 inode_state_will_create,
1392 inode_state_did_create,
1393 inode_state_will_delete,
1394 inode_state_did_delete,
1395};
1396
1397static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1398{
1399 int ret;
1400 int left_ret;
1401 int right_ret;
1402 u64 left_gen;
1403 u64 right_gen;
1404
1405 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1406 NULL);
1407 if (ret < 0 && ret != -ENOENT)
1408 goto out;
1409 left_ret = ret;
1410
1411 if (!sctx->parent_root) {
1412 right_ret = -ENOENT;
1413 } else {
1414 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1415 NULL, NULL, NULL);
1416 if (ret < 0 && ret != -ENOENT)
1417 goto out;
1418 right_ret = ret;
1419 }
1420
1421 if (!left_ret && !right_ret) {
1422 if (left_gen == gen && right_gen == gen)
1423 ret = inode_state_no_change;
1424 else if (left_gen == gen) {
1425 if (ino < sctx->send_progress)
1426 ret = inode_state_did_create;
1427 else
1428 ret = inode_state_will_create;
1429 } else if (right_gen == gen) {
1430 if (ino < sctx->send_progress)
1431 ret = inode_state_did_delete;
1432 else
1433 ret = inode_state_will_delete;
1434 } else {
1435 ret = -ENOENT;
1436 }
1437 } else if (!left_ret) {
1438 if (left_gen == gen) {
1439 if (ino < sctx->send_progress)
1440 ret = inode_state_did_create;
1441 else
1442 ret = inode_state_will_create;
1443 } else {
1444 ret = -ENOENT;
1445 }
1446 } else if (!right_ret) {
1447 if (right_gen == gen) {
1448 if (ino < sctx->send_progress)
1449 ret = inode_state_did_delete;
1450 else
1451 ret = inode_state_will_delete;
1452 } else {
1453 ret = -ENOENT;
1454 }
1455 } else {
1456 ret = -ENOENT;
1457 }
1458
1459out:
1460 return ret;
1461}
1462
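/*
 * Summary of the decision above, where "left" means the inode exists
 * in send_root with matching generation, "right" means the same for
 * parent_root and "processed" means ino < send_progress:
 *
 *	left	right	result
 *	yes	yes	inode_state_no_change
 *	yes	no	did_create if processed, else will_create
 *	no	yes	did_delete if processed, else will_delete
 *	no	no	-ENOENT
 */
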
1463static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
1464{
1465 int ret;
1466
1467 ret = get_cur_inode_state(sctx, ino, gen);
1468 if (ret < 0)
1469 goto out;
1470
1471 if (ret == inode_state_no_change ||
1472 ret == inode_state_did_create ||
1473 ret == inode_state_will_delete)
1474 ret = 1;
1475 else
1476 ret = 0;
1477
1478out:
1479 return ret;
1480}
1481
1482/*
1483 * Helper function to lookup a dir item in a dir.
1484 */
1485static int lookup_dir_item_inode(struct btrfs_root *root,
1486 u64 dir, const char *name, int name_len,
1487 u64 *found_inode,
1488 u8 *found_type)
1489{
1490 int ret = 0;
1491 struct btrfs_dir_item *di;
1492 struct btrfs_key key;
1493 struct btrfs_path *path;
1494
1495 path = alloc_path_for_send();
1496 if (!path)
1497 return -ENOMEM;
1498
1499 di = btrfs_lookup_dir_item(NULL, root, path,
1500 dir, name, name_len, 0);
1501 if (!di) {
1502 ret = -ENOENT;
1503 goto out;
1504 }
1505 if (IS_ERR(di)) {
1506 ret = PTR_ERR(di);
1507 goto out;
1508 }
1509 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1510 *found_inode = key.objectid;
1511 *found_type = btrfs_dir_type(path->nodes[0], di);
1512
1513out:
1514 btrfs_free_path(path);
1515 return ret;
1516}
1517
1518static int get_first_ref(struct send_ctx *sctx,
1519 struct btrfs_root *root, u64 ino,
1520 u64 *dir, u64 *dir_gen, struct fs_path *name)
1521{
1522 int ret;
1523 struct btrfs_key key;
1524 struct btrfs_key found_key;
1525 struct btrfs_path *path;
1526 struct btrfs_inode_ref *iref;
1527 int len;
1528
1529 path = alloc_path_for_send();
1530 if (!path)
1531 return -ENOMEM;
1532
1533 key.objectid = ino;
1534 key.type = BTRFS_INODE_REF_KEY;
1535 key.offset = 0;
1536
1537 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
1538 if (ret < 0)
1539 goto out;
1540 if (!ret)
1541 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1542 path->slots[0]);
1543 if (ret || found_key.objectid != key.objectid ||
1544 found_key.type != key.type) {
1545 ret = -ENOENT;
1546 goto out;
1547 }
1548
1549 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1550 struct btrfs_inode_ref);
1551 len = btrfs_inode_ref_name_len(path->nodes[0], iref);
1552 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1553 (unsigned long)(iref + 1), len);
1554 if (ret < 0)
1555 goto out;
1556 btrfs_release_path(path);
1557
1558 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
1559 NULL);
1560 if (ret < 0)
1561 goto out;
1562
1563 *dir = found_key.offset;
1564
1565out:
1566 btrfs_free_path(path);
1567 return ret;
1568}
1569
1570static int is_first_ref(struct send_ctx *sctx,
1571 struct btrfs_root *root,
1572 u64 ino, u64 dir,
1573 const char *name, int name_len)
1574{
1575 int ret;
1576 struct fs_path *tmp_name;
1577 u64 tmp_dir;
1578 u64 tmp_dir_gen;
1579
1580 tmp_name = fs_path_alloc(sctx);
1581 if (!tmp_name)
1582 return -ENOMEM;
1583
1584 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1585 if (ret < 0)
1586 goto out;
1587
1588 if (name_len != fs_path_len(tmp_name)) {
1589 ret = 0;
1590 goto out;
1591 }
1592
1593 ret = memcmp(tmp_name->start, name, name_len);
1594 if (ret)
1595 ret = 0;
1596 else
1597 ret = 1;
1598
1599out:
1600 fs_path_free(sctx, tmp_name);
1601 return ret;
1602}
1603
1604static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1605 const char *name, int name_len,
1606 u64 *who_ino, u64 *who_gen)
1607{
1608 int ret = 0;
1609 u64 other_inode = 0;
1610 u8 other_type = 0;
1611
1612 if (!sctx->parent_root)
1613 goto out;
1614
1615 ret = is_inode_existent(sctx, dir, dir_gen);
1616 if (ret <= 0)
1617 goto out;
1618
1619 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
1620 &other_inode, &other_type);
1621 if (ret < 0 && ret != -ENOENT)
1622 goto out;
1623 if (ret) {
1624 ret = 0;
1625 goto out;
1626 }
1627
1628 if (other_inode > sctx->send_progress) {
1629 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1630 who_gen, NULL, NULL, NULL);
1631 if (ret < 0)
1632 goto out;
1633
1634 ret = 1;
1635 *who_ino = other_inode;
1636 } else {
1637 ret = 0;
1638 }
1639
1640out:
1641 return ret;
1642}
1643
1644static int did_overwrite_ref(struct send_ctx *sctx,
1645 u64 dir, u64 dir_gen,
1646 u64 ino, u64 ino_gen,
1647 const char *name, int name_len)
1648{
1649 int ret = 0;
1650 u64 gen;
1651 u64 ow_inode;
1652 u8 other_type;
1653
1654 if (!sctx->parent_root)
1655 goto out;
1656
1657 ret = is_inode_existent(sctx, dir, dir_gen);
1658 if (ret <= 0)
1659 goto out;
1660
1661 /* check if the ref was overwritten by another ref */
1662 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
1663 &ow_inode, &other_type);
1664 if (ret < 0 && ret != -ENOENT)
1665 goto out;
1666 if (ret) {
1667 /* was never and will never be overwritten */
1668 ret = 0;
1669 goto out;
1670 }
1671
1672 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1673 NULL);
1674 if (ret < 0)
1675 goto out;
1676
1677 if (ow_inode == ino && gen == ino_gen) {
1678 ret = 0;
1679 goto out;
1680 }
1681
1682 /* we know that it is or will be overwritten. check this now */
1683 if (ow_inode < sctx->send_progress)
1684 ret = 1;
1685 else
1686 ret = 0;
1687
1688out:
1689 return ret;
1690}
1691
1692static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1693{
1694 int ret = 0;
1695 struct fs_path *name = NULL;
1696 u64 dir;
1697 u64 dir_gen;
1698
1699 if (!sctx->parent_root)
1700 goto out;
1701
1702 name = fs_path_alloc(sctx);
1703 if (!name)
1704 return -ENOMEM;
1705
1706 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
1707 if (ret < 0)
1708 goto out;
1709
1710 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
1711 name->start, fs_path_len(name));
1712 if (ret < 0)
1713 goto out;
1714
1715out:
1716 fs_path_free(sctx, name);
1717 return ret;
1718}
1719
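/*
 * The name cache maps an inode number to at most two cache entries, one
 * per generation seen for that ino (typically one from send_root and one
 * from parent_root). Trying to insert a third entry for the same ino is
 * a logic error and traps via BUG().
 */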
1720static int name_cache_insert(struct send_ctx *sctx,
1721 struct name_cache_entry *nce)
1722{
1723 int ret = 0;
1724 struct name_cache_entry **ncea;
1725
1726 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1727 if (ncea) {
1728 if (!ncea[0])
1729 ncea[0] = nce;
1730 else if (!ncea[1])
1731 ncea[1] = nce;
1732 else
1733 BUG();
1734 } else {
1735 ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
1736 if (!ncea)
1737 return -ENOMEM;
1738
1739 ncea[0] = nce;
1740 ncea[1] = NULL;
		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
		if (ret < 0) {
			/* don't leak the slot array if the insert fails */
			kfree(ncea);
			return ret;
		}
1744 }
1745 list_add_tail(&nce->list, &sctx->name_cache_list);
1746 sctx->name_cache_size++;
1747
1748 return ret;
1749}
1750
1751static void name_cache_delete(struct send_ctx *sctx,
1752 struct name_cache_entry *nce)
1753{
1754 struct name_cache_entry **ncea;
1755
1756 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1757 BUG_ON(!ncea);
1758
1759 if (ncea[0] == nce)
1760 ncea[0] = NULL;
1761 else if (ncea[1] == nce)
1762 ncea[1] = NULL;
1763 else
1764 BUG();
1765
1766 if (!ncea[0] && !ncea[1]) {
1767 radix_tree_delete(&sctx->name_cache, nce->ino);
1768 kfree(ncea);
1769 }
1770
1771 list_del(&nce->list);
1772
1773 sctx->name_cache_size--;
1774}
1775
1776static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
1777 u64 ino, u64 gen)
1778{
1779 struct name_cache_entry **ncea;
1780
1781 ncea = radix_tree_lookup(&sctx->name_cache, ino);
1782 if (!ncea)
1783 return NULL;
1784
1785 if (ncea[0] && ncea[0]->gen == gen)
1786 return ncea[0];
1787 else if (ncea[1] && ncea[1]->gen == gen)
1788 return ncea[1];
1789 return NULL;
1790}
1791
1792static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
1793{
1794 list_del(&nce->list);
1795 list_add_tail(&nce->list, &sctx->name_cache_list);
1796}
1797
1798static void name_cache_clean_unused(struct send_ctx *sctx)
1799{
1800 struct name_cache_entry *nce;
1801
1802 if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
1803 return;
1804
1805 while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
1806 nce = list_entry(sctx->name_cache_list.next,
1807 struct name_cache_entry, list);
1808 name_cache_delete(sctx, nce);
1809 kfree(nce);
1810 }
1811}
1812
1813static void name_cache_free(struct send_ctx *sctx)
1814{
1815 struct name_cache_entry *nce;
1816 struct name_cache_entry *tmp;
1817
1818 list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
1819 name_cache_delete(sctx, nce);
1820 }
1821}
1822
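/*
 * Resolve the name and parent of ino/gen, consulting the name cache first.
 * Returns 0 when a regular first ref was found and 1 when an orphan name
 * had to be generated, i.e. when the inode does not exist at this point
 * of the stream or its first ref was already overwritten.
 */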
1823static int __get_cur_name_and_parent(struct send_ctx *sctx,
1824 u64 ino, u64 gen,
1825 u64 *parent_ino,
1826 u64 *parent_gen,
1827 struct fs_path *dest)
1828{
1829 int ret;
1830 int nce_ret;
1831 struct btrfs_path *path = NULL;
1832 struct name_cache_entry *nce = NULL;
1833
1834 nce = name_cache_search(sctx, ino, gen);
1835 if (nce) {
1836 if (ino < sctx->send_progress && nce->need_later_update) {
1837 name_cache_delete(sctx, nce);
1838 kfree(nce);
1839 nce = NULL;
1840 } else {
1841 name_cache_used(sctx, nce);
1842 *parent_ino = nce->parent_ino;
1843 *parent_gen = nce->parent_gen;
1844 ret = fs_path_add(dest, nce->name, nce->name_len);
1845 if (ret < 0)
1846 goto out;
1847 ret = nce->ret;
1848 goto out;
1849 }
1850 }
1851
1852 path = alloc_path_for_send();
1853 if (!path)
1854 return -ENOMEM;
1855
1856 ret = is_inode_existent(sctx, ino, gen);
1857 if (ret < 0)
1858 goto out;
1859
1860 if (!ret) {
1861 ret = gen_unique_name(sctx, ino, gen, dest);
1862 if (ret < 0)
1863 goto out;
1864 ret = 1;
1865 goto out_cache;
1866 }
1867
1868 if (ino < sctx->send_progress)
1869 ret = get_first_ref(sctx, sctx->send_root, ino,
1870 parent_ino, parent_gen, dest);
1871 else
1872 ret = get_first_ref(sctx, sctx->parent_root, ino,
1873 parent_ino, parent_gen, dest);
1874 if (ret < 0)
1875 goto out;
1876
1877 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
1878 dest->start, dest->end - dest->start);
1879 if (ret < 0)
1880 goto out;
1881 if (ret) {
1882 fs_path_reset(dest);
1883 ret = gen_unique_name(sctx, ino, gen, dest);
1884 if (ret < 0)
1885 goto out;
1886 ret = 1;
1887 }
1888
1889out_cache:
1890 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
1891 if (!nce) {
1892 ret = -ENOMEM;
1893 goto out;
1894 }
1895
1896 nce->ino = ino;
1897 nce->gen = gen;
1898 nce->parent_ino = *parent_ino;
1899 nce->parent_gen = *parent_gen;
1900 nce->name_len = fs_path_len(dest);
1901 nce->ret = ret;
1902 strcpy(nce->name, dest->start);
1903 memset(&nce->use_list, 0, sizeof(nce->use_list));
1904
1905 if (ino < sctx->send_progress)
1906 nce->need_later_update = 0;
1907 else
1908 nce->need_later_update = 1;
1909
1910 nce_ret = name_cache_insert(sctx, nce);
1911 if (nce_ret < 0)
1912 ret = nce_ret;
1913 name_cache_clean_unused(sctx);
1914
1915out:
1916 btrfs_free_path(path);
1917 return ret;
1918}
1919
/*
 * Magic happens here. This function returns the first ref to an inode as it
 * would look while receiving the stream at this point in time.
 * We walk the path up to the root. For every inode in between, we check if it
 * was already processed/sent. If yes, we continue with the parent as found
 * in send_root. If not, we continue with the parent as found in parent_root.
 * If we encounter an inode that was deleted at this point in time, we use the
 * inode's "orphan" name instead of the real name and stop. The same applies
 * to new inodes that were not created yet and to overwritten inodes/refs.
 *
 * When do we have orphan inodes:
 * 1. When an inode is freshly created and thus no valid refs are available yet
 * 2. When a directory lost all its refs (deleted) but still has dir items
 *    inside which were not processed yet (pending for move/delete). If anyone
 *    tried to get the path to the dir items, it would get a path inside that
 *    orphan directory.
 * 3. When an inode is moved around or gets new links, it may overwrite the ref
 *    of an unprocessed inode. If in that case the first ref would be
 *    overwritten, the overwritten inode gets "orphanized". Later, when we
 *    process this overwritten inode, it is restored at a new place by moving
 *    the orphan inode.
 *
 * sctx->send_progress tells this function at which point in time receiving
 * would be.
 */
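/*
 * A small, hypothetical example of the walk: assume inode 261 is "c"
 * inside dir 259 ("b"), which is inside dir 257 ("a"), and send_progress
 * is 260. The first ref of 261 is taken from parent_root (261 was not
 * processed yet), the refs of 259 and 257 come from send_root (both were
 * already processed), and the loop stops at BTRFS_FIRST_FREE_OBJECTID.
 * After unreversing, dest reads "a/b/c".
 */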
1945static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
1946 struct fs_path *dest)
1947{
1948 int ret = 0;
1949 struct fs_path *name = NULL;
1950 u64 parent_inode = 0;
1951 u64 parent_gen = 0;
1952 int stop = 0;
1953
1954 name = fs_path_alloc(sctx);
1955 if (!name) {
1956 ret = -ENOMEM;
1957 goto out;
1958 }
1959
1960 dest->reversed = 1;
1961 fs_path_reset(dest);
1962
1963 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
1964 fs_path_reset(name);
1965
1966 ret = __get_cur_name_and_parent(sctx, ino, gen,
1967 &parent_inode, &parent_gen, name);
1968 if (ret < 0)
1969 goto out;
1970 if (ret)
1971 stop = 1;
1972
1973 ret = fs_path_add_path(dest, name);
1974 if (ret < 0)
1975 goto out;
1976
1977 ino = parent_inode;
1978 gen = parent_gen;
1979 }
1980
1981out:
1982 fs_path_free(sctx, name);
1983 if (!ret)
1984 fs_path_unreverse(dest);
1985 return ret;
1986}
1987
1988/*
1989 * Called for regular files when sending extents data. Opens a struct file
1990 * to read from the file.
1991 */
1992static int open_cur_inode_file(struct send_ctx *sctx)
1993{
1994 int ret = 0;
1995 struct btrfs_key key;
1996 struct path path;
1997 struct inode *inode;
1998 struct dentry *dentry;
1999 struct file *filp;
2000 int new = 0;
2001
2002 if (sctx->cur_inode_filp)
2003 goto out;
2004
2005 key.objectid = sctx->cur_ino;
2006 key.type = BTRFS_INODE_ITEM_KEY;
2007 key.offset = 0;
2008
2009 inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
2010 &new);
2011 if (IS_ERR(inode)) {
2012 ret = PTR_ERR(inode);
2013 goto out;
2014 }
2015
2016 dentry = d_obtain_alias(inode);
2017 inode = NULL;
2018 if (IS_ERR(dentry)) {
2019 ret = PTR_ERR(dentry);
2020 goto out;
2021 }
2022
2023 path.mnt = sctx->mnt;
2024 path.dentry = dentry;
2025 filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
2026 dput(dentry);
2027 dentry = NULL;
2028 if (IS_ERR(filp)) {
2029 ret = PTR_ERR(filp);
2030 goto out;
2031 }
2032 sctx->cur_inode_filp = filp;
2033
2034out:
	/*
	 * No iput/dput/fput is required here, as every vfs op above
	 * cleans up after itself on failure.
	 */
2039 return ret;
2040}
2041
2042/*
2043 * Closes the struct file that was created in open_cur_inode_file
2044 */
2045static int close_cur_inode_file(struct send_ctx *sctx)
2046{
2047 int ret = 0;
2048
2049 if (!sctx->cur_inode_filp)
2050 goto out;
2051
2052 ret = filp_close(sctx->cur_inode_filp, NULL);
2053 sctx->cur_inode_filp = NULL;
2054
2055out:
2056 return ret;
2057}
2058
2059/*
2060 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2061 */
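/*
 * Rough shape of the emitted command: a full send starts with
 * BTRFS_SEND_C_SUBVOL carrying A_PATH, A_UUID and A_CTRANSID, while an
 * incremental send starts with BTRFS_SEND_C_SNAPSHOT carrying the same
 * TLVs plus A_CLONE_UUID and A_CLONE_CTRANSID of the parent snapshot.
 */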
2062static int send_subvol_begin(struct send_ctx *sctx)
2063{
2064 int ret;
2065 struct btrfs_root *send_root = sctx->send_root;
2066 struct btrfs_root *parent_root = sctx->parent_root;
2067 struct btrfs_path *path;
2068 struct btrfs_key key;
2069 struct btrfs_root_ref *ref;
2070 struct extent_buffer *leaf;
2071 char *name = NULL;
2072 int namelen;
2073
2074 path = alloc_path_for_send();
2075 if (!path)
2076 return -ENOMEM;
2077
2078 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
2079 if (!name) {
2080 btrfs_free_path(path);
2081 return -ENOMEM;
2082 }
2083
2084 key.objectid = send_root->objectid;
2085 key.type = BTRFS_ROOT_BACKREF_KEY;
2086 key.offset = 0;
2087
2088 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2089 &key, path, 1, 0);
2090 if (ret < 0)
2091 goto out;
2092 if (ret) {
2093 ret = -ENOENT;
2094 goto out;
2095 }
2096
2097 leaf = path->nodes[0];
2098 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2099 if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2100 key.objectid != send_root->objectid) {
2101 ret = -ENOENT;
2102 goto out;
2103 }
2104 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2105 namelen = btrfs_root_ref_name_len(leaf, ref);
2106 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2107 btrfs_release_path(path);
2108
2112 if (parent_root) {
2113 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2114 if (ret < 0)
2115 goto out;
2116 } else {
2117 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2118 if (ret < 0)
2119 goto out;
2120 }
2121
2122 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2123 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2124 sctx->send_root->root_item.uuid);
2125 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2126 sctx->send_root->root_item.ctransid);
2127 if (parent_root) {
2128 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2129 sctx->parent_root->root_item.uuid);
2130 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2131 sctx->parent_root->root_item.ctransid);
2132 }
2133
2134 ret = send_cmd(sctx);
2135
2136tlv_put_failure:
2137out:
2138 btrfs_free_path(path);
2139 kfree(name);
2140 return ret;
2141}
2142
2143static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2144{
2145 int ret = 0;
2146 struct fs_path *p;
2147
2148verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2149
2150 p = fs_path_alloc(sctx);
2151 if (!p)
2152 return -ENOMEM;
2153
2154 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2155 if (ret < 0)
2156 goto out;
2157
2158 ret = get_cur_path(sctx, ino, gen, p);
2159 if (ret < 0)
2160 goto out;
2161 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2162 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2163
2164 ret = send_cmd(sctx);
2165
2166tlv_put_failure:
2167out:
2168 fs_path_free(sctx, p);
2169 return ret;
2170}
2171
2172static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2173{
2174 int ret = 0;
2175 struct fs_path *p;
2176
2177verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2178
2179 p = fs_path_alloc(sctx);
2180 if (!p)
2181 return -ENOMEM;
2182
2183 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2184 if (ret < 0)
2185 goto out;
2186
2187 ret = get_cur_path(sctx, ino, gen, p);
2188 if (ret < 0)
2189 goto out;
2190 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2191 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2192
2193 ret = send_cmd(sctx);
2194
2195tlv_put_failure:
2196out:
2197 fs_path_free(sctx, p);
2198 return ret;
2199}
2200
2201static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2202{
2203 int ret = 0;
2204 struct fs_path *p;
2205
2206verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2207
2208 p = fs_path_alloc(sctx);
2209 if (!p)
2210 return -ENOMEM;
2211
2212 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2213 if (ret < 0)
2214 goto out;
2215
2216 ret = get_cur_path(sctx, ino, gen, p);
2217 if (ret < 0)
2218 goto out;
2219 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2220 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2221 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2222
2223 ret = send_cmd(sctx);
2224
2225tlv_put_failure:
2226out:
2227 fs_path_free(sctx, p);
2228 return ret;
2229}
2230
2231static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2232{
2233 int ret = 0;
2234 struct fs_path *p = NULL;
2235 struct btrfs_inode_item *ii;
2236 struct btrfs_path *path = NULL;
2237 struct extent_buffer *eb;
2238 struct btrfs_key key;
2239 int slot;
2240
2241verbose_printk("btrfs: send_utimes %llu\n", ino);
2242
2243 p = fs_path_alloc(sctx);
2244 if (!p)
2245 return -ENOMEM;
2246
2247 path = alloc_path_for_send();
2248 if (!path) {
2249 ret = -ENOMEM;
2250 goto out;
2251 }
2252
2253 key.objectid = ino;
2254 key.type = BTRFS_INODE_ITEM_KEY;
2255 key.offset = 0;
2256 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;
2259
2260 eb = path->nodes[0];
2261 slot = path->slots[0];
2262 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2263
2264 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2265 if (ret < 0)
2266 goto out;
2267
2268 ret = get_cur_path(sctx, ino, gen, p);
2269 if (ret < 0)
2270 goto out;
2271 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2272 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
2273 btrfs_inode_atime(ii));
2274 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
2275 btrfs_inode_mtime(ii));
2276 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2277 btrfs_inode_ctime(ii));
2278 /* TODO otime? */
2279
2280 ret = send_cmd(sctx);
2281
2282tlv_put_failure:
2283out:
2284 fs_path_free(sctx, p);
2285 btrfs_free_path(path);
2286 return ret;
2287}
2288
/*
 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
 * a valid path yet because we did not process the refs yet. So the inode
 * is created as an orphan.
 */
2294static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
2295 struct btrfs_key *key)
2296{
2297 int ret = 0;
2298 struct extent_buffer *eb = path->nodes[0];
2299 struct btrfs_inode_item *ii;
2300 struct fs_path *p;
2301 int slot = path->slots[0];
2302 int cmd;
2303 u64 mode;
2304
2305verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
2306
2307 p = fs_path_alloc(sctx);
2308 if (!p)
2309 return -ENOMEM;
2310
2311 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2312 mode = btrfs_inode_mode(eb, ii);
2313
2314 if (S_ISREG(mode))
2315 cmd = BTRFS_SEND_C_MKFILE;
2316 else if (S_ISDIR(mode))
2317 cmd = BTRFS_SEND_C_MKDIR;
2318 else if (S_ISLNK(mode))
2319 cmd = BTRFS_SEND_C_SYMLINK;
2320 else if (S_ISCHR(mode) || S_ISBLK(mode))
2321 cmd = BTRFS_SEND_C_MKNOD;
2322 else if (S_ISFIFO(mode))
2323 cmd = BTRFS_SEND_C_MKFIFO;
2324 else if (S_ISSOCK(mode))
2325 cmd = BTRFS_SEND_C_MKSOCK;
2326 else {
		printk(KERN_WARNING "btrfs: unexpected inode type %o\n",
				(int)(mode & S_IFMT));
2329 ret = -ENOTSUPP;
2330 goto out;
2331 }
2332
2333 ret = begin_cmd(sctx, cmd);
2334 if (ret < 0)
2335 goto out;
2336
2337 ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
2338 if (ret < 0)
2339 goto out;
2340
2341 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2342 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
2343
2344 if (S_ISLNK(mode)) {
2345 fs_path_reset(p);
2346 ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
2347 if (ret < 0)
2348 goto out;
2349 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2350 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2351 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2352 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
2353 }
2354
2355 ret = send_cmd(sctx);
2356 if (ret < 0)
2357 goto out;
2358
2359
2360tlv_put_failure:
2361out:
2362 fs_path_free(sctx, p);
2363 return ret;
2364}
2365
2366struct recorded_ref {
2367 struct list_head list;
2368 char *dir_path;
2369 char *name;
2370 struct fs_path *full_path;
2371 u64 dir;
2372 u64 dir_gen;
2373 int dir_path_len;
2374 int name_len;
2375};
2376
2377/*
2378 * We need to process new refs before deleted refs, but compare_tree gives us
2379 * everything mixed. So we first record all refs and later process them.
2380 * This function is a helper to record one ref.
2381 */
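/*
 * For illustration: a full_path of "a/b/c" is split into dir_path "a/b"
 * (dir_path_len 3) and name "c" (name_len 1); a top level full_path "c"
 * keeps an empty dir_path and becomes name "c".
 */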
2382static int record_ref(struct list_head *head, u64 dir,
2383 u64 dir_gen, struct fs_path *path)
2384{
2385 struct recorded_ref *ref;
2386 char *tmp;
2387
2388 ref = kmalloc(sizeof(*ref), GFP_NOFS);
2389 if (!ref)
2390 return -ENOMEM;
2391
2392 ref->dir = dir;
2393 ref->dir_gen = dir_gen;
2394 ref->full_path = path;
2395
2396 tmp = strrchr(ref->full_path->start, '/');
2397 if (!tmp) {
2398 ref->name_len = ref->full_path->end - ref->full_path->start;
2399 ref->name = ref->full_path->start;
2400 ref->dir_path_len = 0;
2401 ref->dir_path = ref->full_path->start;
2402 } else {
2403 tmp++;
2404 ref->name_len = ref->full_path->end - tmp;
2405 ref->name = tmp;
2406 ref->dir_path = ref->full_path->start;
2407 ref->dir_path_len = ref->full_path->end -
2408 ref->full_path->start - 1 - ref->name_len;
2409 }
2410
2411 list_add_tail(&ref->list, head);
2412 return 0;
2413}
2414
2415static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2416{
2417 struct recorded_ref *cur;
2418 struct recorded_ref *tmp;
2419
2420 list_for_each_entry_safe(cur, tmp, head, list) {
2421 fs_path_free(sctx, cur->full_path);
2422 kfree(cur);
2423 }
2424 INIT_LIST_HEAD(head);
2425}
2426
2427static void free_recorded_refs(struct send_ctx *sctx)
2428{
2429 __free_recorded_refs(sctx, &sctx->new_refs);
2430 __free_recorded_refs(sctx, &sctx->deleted_refs);
2431}
2432
2433/*
 * Renames/moves a file/dir to its orphan name. Used when the first
 * ref of an unprocessed inode gets overwritten and for all non-empty
2436 * directories.
2437 */
2438static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2439 struct fs_path *path)
2440{
2441 int ret;
2442 struct fs_path *orphan;
2443
2444 orphan = fs_path_alloc(sctx);
2445 if (!orphan)
2446 return -ENOMEM;
2447
2448 ret = gen_unique_name(sctx, ino, gen, orphan);
2449 if (ret < 0)
2450 goto out;
2451
2452 ret = send_rename(sctx, path, orphan);
2453
2454out:
2455 fs_path_free(sctx, orphan);
2456 return ret;
2457}
2458
2459/*
2460 * Returns 1 if a directory can be removed at this point in time.
2461 * We check this by iterating all dir items and checking if the inode behind
2462 * the dir item was already processed.
2463 */
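/*
 * For example (hypothetical inode numbers): with send_progress at 300, a
 * dir whose remaining dir items point to inodes 257 and 259 may be
 * removed, while a dir still referencing inode 305 must wait until that
 * inode was processed, i.e. moved away or deleted.
 */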
2464static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2465{
2466 int ret = 0;
2467 struct btrfs_root *root = sctx->parent_root;
2468 struct btrfs_path *path;
2469 struct btrfs_key key;
2470 struct btrfs_key found_key;
2471 struct btrfs_key loc;
2472 struct btrfs_dir_item *di;
2473
2474 path = alloc_path_for_send();
2475 if (!path)
2476 return -ENOMEM;
2477
2478 key.objectid = dir;
2479 key.type = BTRFS_DIR_INDEX_KEY;
2480 key.offset = 0;
2481
2482 while (1) {
2483 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
2484 if (ret < 0)
2485 goto out;
2486 if (!ret) {
2487 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2488 path->slots[0]);
2489 }
2490 if (ret || found_key.objectid != key.objectid ||
2491 found_key.type != key.type) {
2492 break;
2493 }
2494
2495 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2496 struct btrfs_dir_item);
2497 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2498
2499 if (loc.objectid > send_progress) {
2500 ret = 0;
2501 goto out;
2502 }
2503
2504 btrfs_release_path(path);
2505 key.offset = found_key.offset + 1;
2506 }
2507
2508 ret = 1;
2509
2510out:
2511 btrfs_free_path(path);
2512 return ret;
2513}
2514
2515struct finish_unordered_dir_ctx {
2516 struct send_ctx *sctx;
2517 struct fs_path *cur_path;
2518 struct fs_path *dir_path;
2519 u64 dir_ino;
2520 int need_delete;
2521 int delete_pass;
2522};
2523
static int __finish_unordered_dir(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  u8 type, void *ctx)
2528{
2529 int ret = 0;
2530 struct finish_unordered_dir_ctx *fctx = ctx;
2531 struct send_ctx *sctx = fctx->sctx;
2532 u64 di_gen;
2533 u64 di_mode;
2534 int is_orphan = 0;
2535
2536 if (di_key->objectid >= fctx->dir_ino)
2537 goto out;
2538
2539 fs_path_reset(fctx->cur_path);
2540
2541 ret = get_inode_info(sctx->send_root, di_key->objectid,
2542 NULL, &di_gen, &di_mode, NULL, NULL);
2543 if (ret < 0)
2544 goto out;
2545
2546 ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
2547 fctx->dir_ino, name, name_len);
2548 if (ret < 0)
2549 goto out;
2550 if (ret) {
2551 is_orphan = 1;
2552 ret = gen_unique_name(sctx, di_key->objectid, di_gen,
2553 fctx->cur_path);
2554 } else {
2555 ret = get_cur_path(sctx, di_key->objectid, di_gen,
2556 fctx->cur_path);
2557 }
2558 if (ret < 0)
2559 goto out;
2560
2561 ret = fs_path_add(fctx->dir_path, name, name_len);
2562 if (ret < 0)
2563 goto out;
2564
2565 if (!fctx->delete_pass) {
2566 if (S_ISDIR(di_mode)) {
2567 ret = send_rename(sctx, fctx->cur_path,
2568 fctx->dir_path);
2569 } else {
2570 ret = send_link(sctx, fctx->dir_path,
2571 fctx->cur_path);
2572 if (is_orphan)
2573 fctx->need_delete = 1;
2574 }
2575 } else if (!S_ISDIR(di_mode)) {
2576 ret = send_unlink(sctx, fctx->cur_path);
2577 } else {
2578 ret = 0;
2579 }
2580
2581 fs_path_remove(fctx->dir_path);
2582
2583out:
2584 return ret;
2585}
2586
2587/*
2588 * Go through all dir items and see if we find refs which could not be created
2589 * in the past because the dir did not exist at that time.
2590 */
2591static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
2592{
2593 int ret = 0;
2594 struct btrfs_path *path = NULL;
2595 struct btrfs_key key;
2596 struct btrfs_key found_key;
2597 struct extent_buffer *eb;
2598 struct finish_unordered_dir_ctx fctx;
2599 int slot;
2600
2601 path = alloc_path_for_send();
2602 if (!path) {
2603 ret = -ENOMEM;
2604 goto out;
2605 }
2606
2607 memset(&fctx, 0, sizeof(fctx));
2608 fctx.sctx = sctx;
2609 fctx.cur_path = fs_path_alloc(sctx);
2610 fctx.dir_path = fs_path_alloc(sctx);
2611 if (!fctx.cur_path || !fctx.dir_path) {
2612 ret = -ENOMEM;
2613 goto out;
2614 }
2615 fctx.dir_ino = dir;
2616
2617 ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
2618 if (ret < 0)
2619 goto out;
2620
2621 /*
2622 * We do two passes. The first links in the new refs and the second
2623 * deletes orphans if required. Deletion of orphans is not required for
2624 * directory inodes, as we always have only one ref and use rename
2625 * instead of link for those.
2626 */
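	/*
	 * A sketch of the two passes: a file that could only be created
	 * under its orphan name gets a link under its real name in the
	 * first pass, and the second pass unlinks the stale orphan name.
	 * Directories are handled with a single rename in the first pass
	 * and never set need_delete.
	 */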
2627
2628again:
2629 key.objectid = dir;
2630 key.type = BTRFS_DIR_ITEM_KEY;
2631 key.offset = 0;
2632 while (1) {
2633 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2634 1, 0);
2635 if (ret < 0)
2636 goto out;
2637 eb = path->nodes[0];
2638 slot = path->slots[0];
2639 btrfs_item_key_to_cpu(eb, &found_key, slot);
2640
2641 if (found_key.objectid != key.objectid ||
2642 found_key.type != key.type) {
2643 btrfs_release_path(path);
2644 break;
2645 }
2646
2647 ret = iterate_dir_item(sctx, sctx->send_root, path,
2648 &found_key, __finish_unordered_dir,
2649 &fctx);
2650 if (ret < 0)
2651 goto out;
2652
2653 key.offset = found_key.offset + 1;
2654 btrfs_release_path(path);
2655 }
2656
2657 if (!fctx.delete_pass && fctx.need_delete) {
2658 fctx.delete_pass = 1;
2659 goto again;
2660 }
2661
2662out:
2663 btrfs_free_path(path);
2664 fs_path_free(sctx, fctx.cur_path);
2665 fs_path_free(sctx, fctx.dir_path);
2666 return ret;
2667}
2668
2669/*
2670 * This does all the move/link/unlink/rmdir magic.
2671 */
2672static int process_recorded_refs(struct send_ctx *sctx)
2673{
2674 int ret = 0;
2675 struct recorded_ref *cur;
2676 struct ulist *check_dirs = NULL;
2677 struct ulist_iterator uit;
2678 struct ulist_node *un;
2679 struct fs_path *valid_path = NULL;
2680 u64 ow_inode = 0;
2681 u64 ow_gen;
2682 int did_overwrite = 0;
2683 int is_orphan = 0;
2684
2685verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2686
2687 valid_path = fs_path_alloc(sctx);
2688 if (!valid_path) {
2689 ret = -ENOMEM;
2690 goto out;
2691 }
2692
2693 check_dirs = ulist_alloc(GFP_NOFS);
2694 if (!check_dirs) {
2695 ret = -ENOMEM;
2696 goto out;
2697 }
2698
	/*
	 * First, check if the first ref of the current inode was overwritten
	 * before. If yes, we know that the current inode was already orphanized
	 * and thus use the orphan name. If not, we can use get_cur_path to
	 * get the path of the first ref as it would look while receiving at
	 * this point in time.
	 * New inodes are always orphans at the beginning, so we force the use
	 * of the orphan name in this case.
	 * The first ref is stored in valid_path and will be updated if it
	 * gets moved around.
	 */
2710 if (!sctx->cur_inode_new) {
2711 ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
2712 sctx->cur_inode_gen);
2713 if (ret < 0)
2714 goto out;
2715 if (ret)
2716 did_overwrite = 1;
2717 }
2718 if (sctx->cur_inode_new || did_overwrite) {
2719 ret = gen_unique_name(sctx, sctx->cur_ino,
2720 sctx->cur_inode_gen, valid_path);
2721 if (ret < 0)
2722 goto out;
2723 is_orphan = 1;
2724 } else {
2725 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
2726 valid_path);
2727 if (ret < 0)
2728 goto out;
2729 }
2730
2731 list_for_each_entry(cur, &sctx->new_refs, list) {
2732 /*
2733 * Check if this new ref would overwrite the first ref of
2734 * another unprocessed inode. If yes, orphanize the
2735 * overwritten inode. If we find an overwritten ref that is
2736 * not the first ref, simply unlink it.
2737 */
2738 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
2739 cur->name, cur->name_len,
2740 &ow_inode, &ow_gen);
2741 if (ret < 0)
2742 goto out;
2743 if (ret) {
2744 ret = is_first_ref(sctx, sctx->parent_root,
2745 ow_inode, cur->dir, cur->name,
2746 cur->name_len);
2747 if (ret < 0)
2748 goto out;
2749 if (ret) {
2750 ret = orphanize_inode(sctx, ow_inode, ow_gen,
2751 cur->full_path);
2752 if (ret < 0)
2753 goto out;
2754 } else {
2755 ret = send_unlink(sctx, cur->full_path);
2756 if (ret < 0)
2757 goto out;
2758 }
2759 }
2760
2761 /*
2762 * link/move the ref to the new place. If we have an orphan
2763 * inode, move it and update valid_path. If not, link or move
2764 * it depending on the inode mode.
2765 */
2766 if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
2767 ret = send_rename(sctx, valid_path, cur->full_path);
2768 if (ret < 0)
2769 goto out;
2770 is_orphan = 0;
2771 ret = fs_path_copy(valid_path, cur->full_path);
2772 if (ret < 0)
2773 goto out;
2774 } else {
2775 if (S_ISDIR(sctx->cur_inode_mode)) {
2776 /*
2777 * Dirs can't be linked, so move it. For moved
2778 * dirs, we always have one new and one deleted
2779 * ref. The deleted ref is ignored later.
2780 */
2781 ret = send_rename(sctx, valid_path,
2782 cur->full_path);
2783 if (ret < 0)
2784 goto out;
2785 ret = fs_path_copy(valid_path, cur->full_path);
2786 if (ret < 0)
2787 goto out;
2788 } else {
2789 ret = send_link(sctx, cur->full_path,
2790 valid_path);
2791 if (ret < 0)
2792 goto out;
2793 }
2794 }
2795 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2796 GFP_NOFS);
2797 if (ret < 0)
2798 goto out;
2799 }
2800
2801 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
2802 /*
2803 * Check if we can already rmdir the directory. If not,
2804 * orphanize it. For every dir item inside that gets deleted
2805 * later, we do this check again and rmdir it then if possible.
2806 * See the use of check_dirs for more details.
2807 */
2808 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
2809 if (ret < 0)
2810 goto out;
2811 if (ret) {
2812 ret = send_rmdir(sctx, valid_path);
2813 if (ret < 0)
2814 goto out;
2815 } else if (!is_orphan) {
2816 ret = orphanize_inode(sctx, sctx->cur_ino,
2817 sctx->cur_inode_gen, valid_path);
2818 if (ret < 0)
2819 goto out;
2820 is_orphan = 1;
2821 }
2822
2823 list_for_each_entry(cur, &sctx->deleted_refs, list) {
2824 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2825 GFP_NOFS);
2826 if (ret < 0)
2827 goto out;
2828 }
2829 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
2830 /*
	 * We have a non-dir inode. Go through all deleted refs and
2832 * unlink them if they were not already overwritten by other
2833 * inodes.
2834 */
2835 list_for_each_entry(cur, &sctx->deleted_refs, list) {
2836 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
2837 sctx->cur_ino, sctx->cur_inode_gen,
2838 cur->name, cur->name_len);
2839 if (ret < 0)
2840 goto out;
2841 if (!ret) {
2842 /*
2843 * In case the inode was moved to a directory
2844 * that was not created yet (see
				 * __record_new_ref), we cannot unlink the ref,
				 * as it will be needed later when the parent
				 * directory is created, so that we can move
				 * the inode into the new dir.
2849 */
2850 if (!is_orphan &&
2851 sctx->cur_inode_first_ref_orphan) {
2852 ret = orphanize_inode(sctx,
2853 sctx->cur_ino,
2854 sctx->cur_inode_gen,
2855 cur->full_path);
2856 if (ret < 0)
2857 goto out;
2858 ret = gen_unique_name(sctx,
2859 sctx->cur_ino,
2860 sctx->cur_inode_gen,
2861 valid_path);
2862 if (ret < 0)
2863 goto out;
2864 is_orphan = 1;
2865
2866 } else {
2867 ret = send_unlink(sctx, cur->full_path);
2868 if (ret < 0)
2869 goto out;
2870 }
2871 }
2872 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2873 GFP_NOFS);
2874 if (ret < 0)
2875 goto out;
2876 }
2877
2878 /*
2879 * If the inode is still orphan, unlink the orphan. This may
2880 * happen when a previous inode did overwrite the first ref
2881 * of this inode and no new refs were added for the current
2882 * inode.
		 * We can however not delete the orphan in case the inode
		 * resides in a directory that was not created yet (see
		 * __record_new_ref).
2886 */
2887 if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
2888 ret = send_unlink(sctx, valid_path);
2889 if (ret < 0)
2890 goto out;
2891 }
2892 }
2893
2894 /*
	 * We collected all parent dirs where cur_inode was once located. We
2896 * now go through all these dirs and check if they are pending for
2897 * deletion and if it's finally possible to perform the rmdir now.
2898 * We also update the inode stats of the parent dirs here.
2899 */
2900 ULIST_ITER_INIT(&uit);
2901 while ((un = ulist_next(check_dirs, &uit))) {
2902 if (un->val > sctx->cur_ino)
2903 continue;
2904
2905 ret = get_cur_inode_state(sctx, un->val, un->aux);
2906 if (ret < 0)
2907 goto out;
2908
2909 if (ret == inode_state_did_create ||
2910 ret == inode_state_no_change) {
2911 /* TODO delayed utimes */
2912 ret = send_utimes(sctx, un->val, un->aux);
2913 if (ret < 0)
2914 goto out;
2915 } else if (ret == inode_state_did_delete) {
2916 ret = can_rmdir(sctx, un->val, sctx->cur_ino);
2917 if (ret < 0)
2918 goto out;
2919 if (ret) {
2920 ret = get_cur_path(sctx, un->val, un->aux,
2921 valid_path);
2922 if (ret < 0)
2923 goto out;
2924 ret = send_rmdir(sctx, valid_path);
2925 if (ret < 0)
2926 goto out;
2927 }
2928 }
2929 }
2930
	/*
	 * The current inode is now at its new position, so we must increase
	 * send_progress.
	 */
2935 sctx->send_progress = sctx->cur_ino + 1;
2936
2937 /*
2938 * We may have a directory here that has pending refs which could not
2939 * be created before (because the dir did not exist before, see
2940 * __record_new_ref). finish_outoforder_dir will link/move the pending
2941 * refs.
2942 */
2943 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
2944 ret = finish_outoforder_dir(sctx, sctx->cur_ino,
2945 sctx->cur_inode_gen);
2946 if (ret < 0)
2947 goto out;
2948 }
2949
2950 ret = 0;
2951
2952out:
2953 free_recorded_refs(sctx);
2954 ulist_free(check_dirs);
2955 fs_path_free(sctx, valid_path);
2956 return ret;
2957}
2958
2959static int __record_new_ref(int num, u64 dir, int index,
2960 struct fs_path *name,
2961 void *ctx)
2962{
2963 int ret = 0;
2964 struct send_ctx *sctx = ctx;
2965 struct fs_path *p;
2966 u64 gen;
2967
2968 p = fs_path_alloc(sctx);
2969 if (!p)
2970 return -ENOMEM;
2971
2972 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
2973 NULL);
2974 if (ret < 0)
2975 goto out;
2976
	/*
	 * The parent may be non-existent at this point in time. This happens
	 * if the ino of the parent dir is higher than the current ino. In this
	 * case, we cannot process this ref until the parent dir is finally
	 * created. If we reach the parent dir later, process_recorded_refs
	 * will go through all dir items and process the refs that could not be
	 * processed before. In case this is the first ref, we set
	 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
	 * keep an orphan of the inode so that it can later be used for
	 * link/move.
	 */
2988 ret = is_inode_existent(sctx, dir, gen);
2989 if (ret < 0)
2990 goto out;
2991 if (!ret) {
2992 ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
2993 name->start, fs_path_len(name));
2994 if (ret < 0)
2995 goto out;
2996 if (ret)
2997 sctx->cur_inode_first_ref_orphan = 1;
2998 ret = 0;
2999 goto out;
3000 }
3001
3002 ret = get_cur_path(sctx, dir, gen, p);
3003 if (ret < 0)
3004 goto out;
3005 ret = fs_path_add_path(p, name);
3006 if (ret < 0)
3007 goto out;
3008
3009 ret = record_ref(&sctx->new_refs, dir, gen, p);
3010
3011out:
3012 if (ret)
3013 fs_path_free(sctx, p);
3014 return ret;
3015}
3016
3017static int __record_deleted_ref(int num, u64 dir, int index,
3018 struct fs_path *name,
3019 void *ctx)
3020{
3021 int ret = 0;
3022 struct send_ctx *sctx = ctx;
3023 struct fs_path *p;
3024 u64 gen;
3025
3026 p = fs_path_alloc(sctx);
3027 if (!p)
3028 return -ENOMEM;
3029
3030 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3031 NULL);
3032 if (ret < 0)
3033 goto out;
3034
3035 ret = get_cur_path(sctx, dir, gen, p);
3036 if (ret < 0)
3037 goto out;
3038 ret = fs_path_add_path(p, name);
3039 if (ret < 0)
3040 goto out;
3041
3042 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3043
3044out:
3045 if (ret)
3046 fs_path_free(sctx, p);
3047 return ret;
3048}
3049
3050static int record_new_ref(struct send_ctx *sctx)
3051{
3052 int ret;
3053
3054 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
3055 sctx->cmp_key, 0, __record_new_ref, sctx);
3056 if (ret < 0)
3057 goto out;
3058 ret = 0;
3059
3060out:
3061 return ret;
3062}
3063
3064static int record_deleted_ref(struct send_ctx *sctx)
3065{
3066 int ret;
3067
3068 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
3069 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3070 if (ret < 0)
3071 goto out;
3072 ret = 0;
3073
3074out:
3075 return ret;
3076}
3077
3078struct find_ref_ctx {
3079 u64 dir;
3080 struct fs_path *name;
3081 int found_idx;
3082};
3083
3084static int __find_iref(int num, u64 dir, int index,
3085 struct fs_path *name,
3086 void *ctx_)
3087{
3088 struct find_ref_ctx *ctx = ctx_;
3089
3090 if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
3091 strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
3092 ctx->found_idx = num;
3093 return 1;
3094 }
3095 return 0;
3096}
3097
3098static int find_iref(struct send_ctx *sctx,
3099 struct btrfs_root *root,
3100 struct btrfs_path *path,
3101 struct btrfs_key *key,
3102 u64 dir, struct fs_path *name)
3103{
3104 int ret;
3105 struct find_ref_ctx ctx;
3106
3107 ctx.dir = dir;
3108 ctx.name = name;
3109 ctx.found_idx = -1;
3110
3111 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
3112 if (ret < 0)
3113 return ret;
3114
3115 if (ctx.found_idx == -1)
3116 return -ENOENT;
3117
3118 return ctx.found_idx;
3119}
3120
3121static int __record_changed_new_ref(int num, u64 dir, int index,
3122 struct fs_path *name,
3123 void *ctx)
3124{
3125 int ret;
3126 struct send_ctx *sctx = ctx;
3127
3128 ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
3129 sctx->cmp_key, dir, name);
3130 if (ret == -ENOENT)
3131 ret = __record_new_ref(num, dir, index, name, sctx);
3132 else if (ret > 0)
3133 ret = 0;
3134
3135 return ret;
3136}
3137
3138static int __record_changed_deleted_ref(int num, u64 dir, int index,
3139 struct fs_path *name,
3140 void *ctx)
3141{
3142 int ret;
3143 struct send_ctx *sctx = ctx;
3144
3145 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
3146 dir, name);
3147 if (ret == -ENOENT)
3148 ret = __record_deleted_ref(num, dir, index, name, sctx);
3149 else if (ret > 0)
3150 ret = 0;
3151
3152 return ret;
3153}
3154
3155static int record_changed_ref(struct send_ctx *sctx)
3156{
3157 int ret = 0;
3158
3159 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
3160 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3161 if (ret < 0)
3162 goto out;
3163 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
3164 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3165 if (ret < 0)
3166 goto out;
3167 ret = 0;
3168
3169out:
3170 return ret;
3171}
3172
3173/*
3174 * Record and process all refs at once. Needed when an inode changes the
3175 * generation number, which means that it was deleted and recreated.
3176 */
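/*
 * Hypothetical example: deleting a file and creating a new one between
 * the two snapshots may reuse the inode number with a new generation.
 * All refs of the old inode are then recorded as deleted and all refs of
 * the new inode as new, and both sets are processed in one go.
 */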
3177static int process_all_refs(struct send_ctx *sctx,
3178 enum btrfs_compare_tree_result cmd)
3179{
3180 int ret;
3181 struct btrfs_root *root;
3182 struct btrfs_path *path;
3183 struct btrfs_key key;
3184 struct btrfs_key found_key;
3185 struct extent_buffer *eb;
3186 int slot;
3187 iterate_inode_ref_t cb;
3188
3189 path = alloc_path_for_send();
3190 if (!path)
3191 return -ENOMEM;
3192
3193 if (cmd == BTRFS_COMPARE_TREE_NEW) {
3194 root = sctx->send_root;
3195 cb = __record_new_ref;
3196 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
3197 root = sctx->parent_root;
3198 cb = __record_deleted_ref;
3199 } else {
3200 BUG();
3201 }
3202
3203 key.objectid = sctx->cmp_key->objectid;
3204 key.type = BTRFS_INODE_REF_KEY;
3205 key.offset = 0;
3206 while (1) {
3207 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3208 if (ret < 0) {
3209 btrfs_release_path(path);
3210 goto out;
3211 }
3212 if (ret) {
3213 btrfs_release_path(path);
3214 break;
3215 }
3216
3217 eb = path->nodes[0];
3218 slot = path->slots[0];
3219 btrfs_item_key_to_cpu(eb, &found_key, slot);
3220
3221 if (found_key.objectid != key.objectid ||
3222 found_key.type != key.type) {
3223 btrfs_release_path(path);
3224 break;
3225 }
3226
		ret = iterate_inode_ref(sctx, root, path,
				&found_key, 0, cb, sctx);
3229 btrfs_release_path(path);
3230 if (ret < 0)
3231 goto out;
3232
3233 key.offset = found_key.offset + 1;
3234 }
3235
3236 ret = process_recorded_refs(sctx);
3237
3238out:
3239 btrfs_free_path(path);
3240 return ret;
3241}
3242
3243static int send_set_xattr(struct send_ctx *sctx,
3244 struct fs_path *path,
3245 const char *name, int name_len,
3246 const char *data, int data_len)
3247{
3248 int ret = 0;
3249
3250 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
3251 if (ret < 0)
3252 goto out;
3253
3254 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
3255 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
3256 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
3257
3258 ret = send_cmd(sctx);
3259
3260tlv_put_failure:
3261out:
3262 return ret;
3263}
3264
3265static int send_remove_xattr(struct send_ctx *sctx,
3266 struct fs_path *path,
3267 const char *name, int name_len)
3268{
3269 int ret = 0;
3270
3271 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
3272 if (ret < 0)
3273 goto out;
3274
3275 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
3276 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
3277
3278 ret = send_cmd(sctx);
3279
3280tlv_put_failure:
3281out:
3282 return ret;
3283}
3284
3285static int __process_new_xattr(int num, struct btrfs_key *di_key,
3286 const char *name, int name_len,
3287 const char *data, int data_len,
3288 u8 type, void *ctx)
3289{
3290 int ret;
3291 struct send_ctx *sctx = ctx;
3292 struct fs_path *p;
3293 posix_acl_xattr_header dummy_acl;
3294
3295 p = fs_path_alloc(sctx);
3296 if (!p)
3297 return -ENOMEM;
3298
3299 /*
	 * This hack is needed because empty ACLs are stored as zero-byte
	 * data in xattrs. The problem is that receiving these zero-byte
	 * ACLs will fail later. To fix this, we send a dummy ACL list that
3303 * only contains the version number and no entries.
3304 */
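	/*
	 * Illustration: the dummy payload is just the 4 byte little endian
	 * version word, i.e. exactly what an ACL with zero entries would
	 * serialize to.
	 */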
3305 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
3306 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
3307 if (data_len == 0) {
3308 dummy_acl.a_version =
3309 cpu_to_le32(POSIX_ACL_XATTR_VERSION);
3310 data = (char *)&dummy_acl;
3311 data_len = sizeof(dummy_acl);
3312 }
3313 }
3314
3315 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3316 if (ret < 0)
3317 goto out;
3318
3319 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3320
3321out:
3322 fs_path_free(sctx, p);
3323 return ret;
3324}
3325
3326static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3327 const char *name, int name_len,
3328 const char *data, int data_len,
3329 u8 type, void *ctx)
3330{
3331 int ret;
3332 struct send_ctx *sctx = ctx;
3333 struct fs_path *p;
3334
3335 p = fs_path_alloc(sctx);
3336 if (!p)
3337 return -ENOMEM;
3338
3339 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3340 if (ret < 0)
3341 goto out;
3342
3343 ret = send_remove_xattr(sctx, p, name, name_len);
3344
3345out:
3346 fs_path_free(sctx, p);
3347 return ret;
3348}
3349
3350static int process_new_xattr(struct send_ctx *sctx)
3351{
3352 int ret = 0;
3353
3354 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
3355 sctx->cmp_key, __process_new_xattr, sctx);
3356
3357 return ret;
3358}
3359
3360static int process_deleted_xattr(struct send_ctx *sctx)
3361{
3362 int ret;
3363
3364 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
3365 sctx->cmp_key, __process_deleted_xattr, sctx);
3366
3367 return ret;
3368}
3369
3370struct find_xattr_ctx {
3371 const char *name;
3372 int name_len;
3373 int found_idx;
3374 char *found_data;
3375 int found_data_len;
3376};
3377
3378static int __find_xattr(int num, struct btrfs_key *di_key,
3379 const char *name, int name_len,
3380 const char *data, int data_len,
3381 u8 type, void *vctx)
3382{
3383 struct find_xattr_ctx *ctx = vctx;
3384
3385 if (name_len == ctx->name_len &&
3386 strncmp(name, ctx->name, name_len) == 0) {
3387 ctx->found_idx = num;
3388 ctx->found_data_len = data_len;
3389 ctx->found_data = kmalloc(data_len, GFP_NOFS);
3390 if (!ctx->found_data)
3391 return -ENOMEM;
3392 memcpy(ctx->found_data, data, data_len);
3393 return 1;
3394 }
3395 return 0;
3396}
3397
3398static int find_xattr(struct send_ctx *sctx,
3399 struct btrfs_root *root,
3400 struct btrfs_path *path,
3401 struct btrfs_key *key,
3402 const char *name, int name_len,
3403 char **data, int *data_len)
3404{
3405 int ret;
3406 struct find_xattr_ctx ctx;
3407
3408 ctx.name = name;
3409 ctx.name_len = name_len;
3410 ctx.found_idx = -1;
3411 ctx.found_data = NULL;
3412 ctx.found_data_len = 0;
3413
3414 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
3415 if (ret < 0)
3416 return ret;
3417
3418 if (ctx.found_idx == -1)
3419 return -ENOENT;
3420 if (data) {
3421 *data = ctx.found_data;
3422 *data_len = ctx.found_data_len;
3423 } else {
3424 kfree(ctx.found_data);
3425 }
3426 return ctx.found_idx;
3427}
3428
3429
3430static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3431 const char *name, int name_len,
3432 const char *data, int data_len,
3433 u8 type, void *ctx)
3434{
3435 int ret;
3436 struct send_ctx *sctx = ctx;
3437 char *found_data = NULL;
3438 int found_data_len = 0;
3440
3441 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
3442 sctx->cmp_key, name, name_len, &found_data,
3443 &found_data_len);
3444 if (ret == -ENOENT) {
3445 ret = __process_new_xattr(num, di_key, name, name_len, data,
3446 data_len, type, ctx);
3447 } else if (ret >= 0) {
3448 if (data_len != found_data_len ||
3449 memcmp(data, found_data, data_len)) {
3450 ret = __process_new_xattr(num, di_key, name, name_len,
3451 data, data_len, type, ctx);
3452 } else {
3453 ret = 0;
3454 }
3455 }
3456
3457 kfree(found_data);
3459 return ret;
3460}
3461
3462static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3463 const char *name, int name_len,
3464 const char *data, int data_len,
3465 u8 type, void *ctx)
3466{
3467 int ret;
3468 struct send_ctx *sctx = ctx;
3469
3470 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
3471 name, name_len, NULL, NULL);
3472 if (ret == -ENOENT)
3473 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3474 data_len, type, ctx);
3475 else if (ret >= 0)
3476 ret = 0;
3477
3478 return ret;
3479}
3480
3481static int process_changed_xattr(struct send_ctx *sctx)
3482{
3483 int ret = 0;
3484
3485 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
3486 sctx->cmp_key, __process_changed_new_xattr, sctx);
3487 if (ret < 0)
3488 goto out;
3489 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
3490 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3491
3492out:
3493 return ret;
3494}
3495
3496static int process_all_new_xattrs(struct send_ctx *sctx)
3497{
3498 int ret;
3499 struct btrfs_root *root;
3500 struct btrfs_path *path;
3501 struct btrfs_key key;
3502 struct btrfs_key found_key;
3503 struct extent_buffer *eb;
3504 int slot;
3505
3506 path = alloc_path_for_send();
3507 if (!path)
3508 return -ENOMEM;
3509
3510 root = sctx->send_root;
3511
3512 key.objectid = sctx->cmp_key->objectid;
3513 key.type = BTRFS_XATTR_ITEM_KEY;
3514 key.offset = 0;
3515 while (1) {
3516 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3517 if (ret < 0)
3518 goto out;
3519 if (ret) {
3520 ret = 0;
3521 goto out;
3522 }
3523
3524 eb = path->nodes[0];
3525 slot = path->slots[0];
3526 btrfs_item_key_to_cpu(eb, &found_key, slot);
3527
3528 if (found_key.objectid != key.objectid ||
3529 found_key.type != key.type) {
3530 ret = 0;
3531 goto out;
3532 }
3533
3534 ret = iterate_dir_item(sctx, root, path, &found_key,
3535 __process_new_xattr, sctx);
3536 if (ret < 0)
3537 goto out;
3538
3539 btrfs_release_path(path);
3540 key.offset = found_key.offset + 1;
3541 }
3542
3543out:
3544 btrfs_free_path(path);
3545 return ret;
3546}
3547
3548/*
3549 * Read some bytes from the current inode/file and send a write command to
3550 * user space.
3551 */
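/*
 * The data is read in chunks of at most BTRFS_SEND_READ_SIZE (see
 * send_write_or_clone below); the number of bytes actually read is
 * returned, so the caller can detect EOF through a short or zero read.
 */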
3552static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3553{
3554 int ret = 0;
3555 struct fs_path *p;
3556 loff_t pos = offset;
	int num_read = 0;
3558 mm_segment_t old_fs;
3559
3560 p = fs_path_alloc(sctx);
3561 if (!p)
3562 return -ENOMEM;
3563
	/*
	 * The vfs normally only accepts user space buffers for security
	 * reasons. We only read from the file and only provide the read_buf
	 * buffer to the vfs. As this buffer does not come from a user space
	 * call, it's ok to temporarily allow kernel space buffers.
	 */
3570 old_fs = get_fs();
3571 set_fs(KERNEL_DS);
3572
3573verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3574
3575 ret = open_cur_inode_file(sctx);
3576 if (ret < 0)
3577 goto out;
3578
3579 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
3580 if (ret < 0)
3581 goto out;
	num_read = ret;
	if (!num_read)
3584 goto out;
3585
3586 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
3587 if (ret < 0)
3588 goto out;
3589
3590 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3591 if (ret < 0)
3592 goto out;
3593
3594 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3595 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
3597
3598 ret = send_cmd(sctx);
3599
3600tlv_put_failure:
3601out:
3602 fs_path_free(sctx, p);
3603 set_fs(old_fs);
3604 if (ret < 0)
3605 return ret;
	return num_read;
3607}
3608
3609/*
3610 * Send a clone command to user space.
3611 */
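/*
 * Sketch of the resulting command: A_PATH, A_FILE_OFFSET and A_CLONE_LEN
 * describe the destination range, A_CLONE_UUID and A_CLONE_CTRANSID pick
 * the source root, and A_CLONE_PATH plus A_CLONE_OFFSET the source range.
 */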
3612static int send_clone(struct send_ctx *sctx,
3613 u64 offset, u32 len,
3614 struct clone_root *clone_root)
3615{
3616 int ret = 0;
3617 struct btrfs_root *clone_root2 = clone_root->root;
3618 struct fs_path *p;
3619 u64 gen;
3620
3621verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3622 "clone_inode=%llu, clone_offset=%llu\n", offset, len,
3623 clone_root->root->objectid, clone_root->ino,
3624 clone_root->offset);
3625
3626 p = fs_path_alloc(sctx);
3627 if (!p)
3628 return -ENOMEM;
3629
3630 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
3631 if (ret < 0)
3632 goto out;
3633
3634 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3635 if (ret < 0)
3636 goto out;
3637
3638 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3639 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
3640 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3641
3642 if (clone_root2 == sctx->send_root) {
3643 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
3644 &gen, NULL, NULL, NULL);
3645 if (ret < 0)
3646 goto out;
3647 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3648 } else {
3649 ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
3650 }
3651 if (ret < 0)
3652 goto out;
3653
3654 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3655 clone_root2->root_item.uuid);
3656 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3657 clone_root2->root_item.ctransid);
3658 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3659 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3660 clone_root->offset);
3661
3662 ret = send_cmd(sctx);
3663
3664tlv_put_failure:
3665out:
3666 fs_path_free(sctx, p);
3667 return ret;
3668}
3669
3670static int send_write_or_clone(struct send_ctx *sctx,
3671 struct btrfs_path *path,
3672 struct btrfs_key *key,
3673 struct clone_root *clone_root)
3674{
3675 int ret = 0;
3676 struct btrfs_file_extent_item *ei;
3677 u64 offset = key->offset;
3678 u64 pos = 0;
3679 u64 len;
3680 u32 l;
3681 u8 type;
3682
3683 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3684 struct btrfs_file_extent_item);
3685 type = btrfs_file_extent_type(path->nodes[0], ei);
3686 if (type == BTRFS_FILE_EXTENT_INLINE)
3687 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
3688 else
3689 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3690
3691 if (offset + len > sctx->cur_inode_size)
3692 len = sctx->cur_inode_size - offset;
3693 if (len == 0) {
3694 ret = 0;
3695 goto out;
3696 }
3697
3698 if (!clone_root) {
3699 while (pos < len) {
3700 l = len - pos;
3701 if (l > BTRFS_SEND_READ_SIZE)
3702 l = BTRFS_SEND_READ_SIZE;
3703 ret = send_write(sctx, pos + offset, l);
3704 if (ret < 0)
3705 goto out;
3706 if (!ret)
3707 break;
3708 pos += ret;
3709 }
3710 ret = 0;
3711 } else {
3712 ret = send_clone(sctx, offset, len, clone_root);
3713 }
3714
3715out:
3716 return ret;
3717}
3718
3719static int is_extent_unchanged(struct send_ctx *sctx,
3720 struct btrfs_path *left_path,
3721 struct btrfs_key *ekey)
3722{
3723 int ret = 0;
3724 struct btrfs_key key;
3725 struct btrfs_path *path = NULL;
3726 struct extent_buffer *eb;
3727 int slot;
3728 struct btrfs_key found_key;
3729 struct btrfs_file_extent_item *ei;
3730 u64 left_disknr;
3731 u64 right_disknr;
3732 u64 left_offset;
3733 u64 right_offset;
3734 u64 left_offset_fixed;
3735 u64 left_len;
3736 u64 right_len;
3737 u8 left_type;
3738 u8 right_type;
3739
3740 path = alloc_path_for_send();
3741 if (!path)
3742 return -ENOMEM;
3743
3744 eb = left_path->nodes[0];
3745 slot = left_path->slots[0];
3746
3747 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3748 left_type = btrfs_file_extent_type(eb, ei);
3749 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3750 left_len = btrfs_file_extent_num_bytes(eb, ei);
3751 left_offset = btrfs_file_extent_offset(eb, ei);
3752
3753 if (left_type != BTRFS_FILE_EXTENT_REG) {
3754 ret = 0;
3755 goto out;
3756 }
3757
3758 /*
	 * The following comments refer to these diagrams. L is the left
	 * extent which we are checking at the moment. 1-8 are the right
	 * extents that we iterate over.
3762 *
3763 * |-----L-----|
3764 * |-1-|-2a-|-3-|-4-|-5-|-6-|
3765 *
3766 * |-----L-----|
3767 * |--1--|-2b-|...(same as above)
3768 *
3769 * Alternative situation. Happens on files where extents got split.
3770 * |-----L-----|
3771 * |-----------7-----------|-6-|
3772 *
3773 * Alternative situation. Happens on files which got larger.
3774 * |-----L-----|
3775 * |-8-|
3776 * Nothing follows after 8.
3777 */
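	/*
	 * A sketch of the offset fixups below: for 2a and 7 the right
	 * extent starts before L, so right_offset grows by the part in
	 * front of L; for everything behind 2a and 2b the left extent
	 * started earlier, so left_offset_fixed grows instead. Two file
	 * extents then describe the same bytes only if both the disk
	 * bytenr and the fixed offsets match.
	 */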
3778
3779 key.objectid = ekey->objectid;
3780 key.type = BTRFS_EXTENT_DATA_KEY;
3781 key.offset = ekey->offset;
3782 ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
3783 if (ret < 0)
3784 goto out;
3785 if (ret) {
3786 ret = 0;
3787 goto out;
3788 }
3789
3790 /*
3791 * Handle special case where the right side has no extents at all.
3792 */
3793 eb = path->nodes[0];
3794 slot = path->slots[0];
3795 btrfs_item_key_to_cpu(eb, &found_key, slot);
3796 if (found_key.objectid != key.objectid ||
3797 found_key.type != key.type) {
3798 ret = 0;
3799 goto out;
3800 }
3801
3802 /*
3803 * We're now on 2a, 2b or 7.
3804 */
3805 key = found_key;
3806 while (key.offset < ekey->offset + left_len) {
3807 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3808 right_type = btrfs_file_extent_type(eb, ei);
3809 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3810 right_len = btrfs_file_extent_num_bytes(eb, ei);
3811 right_offset = btrfs_file_extent_offset(eb, ei);
3812
3813 if (right_type != BTRFS_FILE_EXTENT_REG) {
3814 ret = 0;
3815 goto out;
3816 }
3817
3818 /*
3819 * Are we at extent 8? If yes, we know the extent is changed.
3820 * This may only happen on the first iteration.
3821 */
3822 if (found_key.offset + right_len < ekey->offset) {
3823 ret = 0;
3824 goto out;
3825 }
3826
3827 left_offset_fixed = left_offset;
3828 if (key.offset < ekey->offset) {
3829 /* Fix the right offset for 2a and 7. */
3830 right_offset += ekey->offset - key.offset;
3831 } else {
			/* Fix the left offset for everything behind 2a and 2b */
3833 left_offset_fixed += key.offset - ekey->offset;
3834 }
3835
		/*
		 * Check if we have the same extent. Comparing the sums would
		 * wrongly match two different extents whose disk_bytenr and
		 * offset differences cancel out, so compare both fields.
		 */
		if (left_disknr != right_disknr ||
		    left_offset_fixed != right_offset) {
			ret = 0;
			goto out;
		}
3844
3845 /*
3846 * Go to the next extent.
3847 */
3848 ret = btrfs_next_item(sctx->parent_root, path);
3849 if (ret < 0)
3850 goto out;
3851 if (!ret) {
3852 eb = path->nodes[0];
3853 slot = path->slots[0];
3854 btrfs_item_key_to_cpu(eb, &found_key, slot);
3855 }
3856 if (ret || found_key.objectid != key.objectid ||
3857 found_key.type != key.type) {
3858 key.offset += right_len;
3859 break;
3860 } else {
3861 if (found_key.offset != key.offset + right_len) {
3862 /* Should really not happen */
3863 ret = -EIO;
3864 goto out;
3865 }
3866 }
3867 key = found_key;
3868 }
3869
3870 /*
3871 * We're now behind the left extent (treat as unchanged) or at the end
3872 * of the right side (treat as changed).
3873 */
3874 if (key.offset >= ekey->offset + left_len)
3875 ret = 1;
3876 else
3877 ret = 0;
3878
3879
3880out:
3881 btrfs_free_path(path);
3882 return ret;
3883}
3884
3885static int process_extent(struct send_ctx *sctx,
3886 struct btrfs_path *path,
3887 struct btrfs_key *key)
3888{
3889 int ret = 0;
3890 struct clone_root *found_clone = NULL;
3891
3892 if (S_ISLNK(sctx->cur_inode_mode))
3893 return 0;
3894
3895 if (sctx->parent_root && !sctx->cur_inode_new) {
3896 ret = is_extent_unchanged(sctx, path, key);
3897 if (ret < 0)
3898 goto out;
3899 if (ret) {
3900 ret = 0;
3901 goto out;
3902 }
3903 }
3904
3905 ret = find_extent_clone(sctx, path, key->objectid, key->offset,
3906 sctx->cur_inode_size, &found_clone);
3907 if (ret != -ENOENT && ret < 0)
3908 goto out;
3909
3910 ret = send_write_or_clone(sctx, path, key, found_clone);
3911
3912out:
3913 return ret;
3914}
3915
3916static int process_all_extents(struct send_ctx *sctx)
3917{
3918 int ret;
3919 struct btrfs_root *root;
3920 struct btrfs_path *path;
3921 struct btrfs_key key;
3922 struct btrfs_key found_key;
3923 struct extent_buffer *eb;
3924 int slot;
3925
3926 root = sctx->send_root;
3927 path = alloc_path_for_send();
3928 if (!path)
3929 return -ENOMEM;
3930
3931 key.objectid = sctx->cmp_key->objectid;
3932 key.type = BTRFS_EXTENT_DATA_KEY;
3933 key.offset = 0;
3934 while (1) {
3935 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3936 if (ret < 0)
3937 goto out;
3938 if (ret) {
3939 ret = 0;
3940 goto out;
3941 }
3942
3943 eb = path->nodes[0];
3944 slot = path->slots[0];
3945 btrfs_item_key_to_cpu(eb, &found_key, slot);
3946
3947 if (found_key.objectid != key.objectid ||
3948 found_key.type != key.type) {
3949 ret = 0;
3950 goto out;
3951 }
3952
3953 ret = process_extent(sctx, path, &found_key);
3954 if (ret < 0)
3955 goto out;
3956
3957 btrfs_release_path(path);
3958 key.offset = found_key.offset + 1;
3959 }
3960
3961out:
3962 btrfs_free_path(path);
3963 return ret;
3964}
3965
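/*
 * Flush the recorded ref changes of the current inode, but only once
 * all of its INODE_REF items have been seen or the pass is at its end.
 */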
3966static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
3967{
3968 int ret = 0;
3969
3970 if (sctx->cur_ino == 0)
3971 goto out;
3972 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
3973 sctx->cmp_key->type <= BTRFS_INODE_REF_KEY)
3974 goto out;
3975 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
3976 goto out;
3977
3978 ret = process_recorded_refs(sctx);
3979
3980out:
3981 return ret;
3982}
3983
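/*
 * Emit the finishing commands for the current inode: a truncate for
 * regular files, chown/chmod for non-symlinks where owner or mode
 * differ from the parent snapshot (or the inode is new), and utimes
 * unconditionally.
 */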
3984static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
3985{
3986 int ret = 0;
3987 u64 left_mode;
3988 u64 left_uid;
3989 u64 left_gid;
3990 u64 right_mode;
3991 u64 right_uid;
3992 u64 right_gid;
3993 int need_chmod = 0;
3994 int need_chown = 0;
3995
3996 ret = process_recorded_refs_if_needed(sctx, at_end);
3997 if (ret < 0)
3998 goto out;
3999
4000 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
4001 goto out;
4002 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
4003 goto out;
4004
4005 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
4006 &left_mode, &left_uid, &left_gid);
4007 if (ret < 0)
4008 goto out;
4009
4010 if (!S_ISLNK(sctx->cur_inode_mode)) {
4011 if (!sctx->parent_root || sctx->cur_inode_new) {
4012 need_chmod = 1;
4013 need_chown = 1;
4014 } else {
4015 ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
4016 NULL, NULL, &right_mode, &right_uid,
4017 &right_gid);
4018 if (ret < 0)
4019 goto out;
4020
4021 if (left_uid != right_uid || left_gid != right_gid)
4022 need_chown = 1;
4023 if (left_mode != right_mode)
4024 need_chmod = 1;
4025 }
4026 }
4027
4028 if (S_ISREG(sctx->cur_inode_mode)) {
4029 ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4030 sctx->cur_inode_size);
4031 if (ret < 0)
4032 goto out;
4033 }
4034
4035 if (need_chown) {
4036 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4037 left_uid, left_gid);
4038 if (ret < 0)
4039 goto out;
4040 }
4041 if (need_chmod) {
4042 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4043 left_mode);
4044 if (ret < 0)
4045 goto out;
4046 }
4047
4048 /*
4049 * Need to send the utimes every time, no matter if they actually changed
4050 * between the two trees, as the commands above may have touched the inode.
4051 */
4052 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4053 if (ret < 0)
4054 goto out;
4055
4056out:
4057 return ret;
4058}
4059
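/*
 * Handle a new, deleted or changed INODE_ITEM. A changed inode whose
 * generation differs between the two trees was deleted and recreated
 * in between, so it is processed as a full delete followed by a full
 * (re)send of refs, extents and xattrs.
 */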
4060static int changed_inode(struct send_ctx *sctx,
4061 enum btrfs_compare_tree_result result)
4062{
4063 int ret = 0;
4064 struct btrfs_key *key = sctx->cmp_key;
4065 struct btrfs_inode_item *left_ii = NULL;
4066 struct btrfs_inode_item *right_ii = NULL;
4067 u64 left_gen = 0;
4068 u64 right_gen = 0;
4069
4070 ret = close_cur_inode_file(sctx);
4071 if (ret < 0)
4072 goto out;
4073
4074 sctx->cur_ino = key->objectid;
4075 sctx->cur_inode_new_gen = 0;
4076 sctx->cur_inode_first_ref_orphan = 0;
4077 sctx->send_progress = sctx->cur_ino;
4078
4079 if (result == BTRFS_COMPARE_TREE_NEW ||
4080 result == BTRFS_COMPARE_TREE_CHANGED) {
4081 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
4082 sctx->left_path->slots[0],
4083 struct btrfs_inode_item);
4084 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
4085 left_ii);
4086 } else {
4087 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
4088 sctx->right_path->slots[0],
4089 struct btrfs_inode_item);
4090 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4091 right_ii);
4092 }
4093 if (result == BTRFS_COMPARE_TREE_CHANGED) {
4094 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
4095 sctx->right_path->slots[0],
4096 struct btrfs_inode_item);
4097
4098 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4099 right_ii);
4100 if (left_gen != right_gen)
4101 sctx->cur_inode_new_gen = 1;
4102 }
4103
4104 if (result == BTRFS_COMPARE_TREE_NEW) {
4105 sctx->cur_inode_gen = left_gen;
4106 sctx->cur_inode_new = 1;
4107 sctx->cur_inode_deleted = 0;
4108 sctx->cur_inode_size = btrfs_inode_size(
4109 sctx->left_path->nodes[0], left_ii);
4110 sctx->cur_inode_mode = btrfs_inode_mode(
4111 sctx->left_path->nodes[0], left_ii);
4112 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4113 ret = send_create_inode(sctx, sctx->left_path,
4114 sctx->cmp_key);
4115 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4116 sctx->cur_inode_gen = right_gen;
4117 sctx->cur_inode_new = 0;
4118 sctx->cur_inode_deleted = 1;
4119 sctx->cur_inode_size = btrfs_inode_size(
4120 sctx->right_path->nodes[0], right_ii);
4121 sctx->cur_inode_mode = btrfs_inode_mode(
4122 sctx->right_path->nodes[0], right_ii);
4123 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4124 if (sctx->cur_inode_new_gen) {
4125 sctx->cur_inode_gen = right_gen;
4126 sctx->cur_inode_new = 0;
4127 sctx->cur_inode_deleted = 1;
4128 sctx->cur_inode_size = btrfs_inode_size(
4129 sctx->right_path->nodes[0], right_ii);
4130 sctx->cur_inode_mode = btrfs_inode_mode(
4131 sctx->right_path->nodes[0], right_ii);
4132 ret = process_all_refs(sctx,
4133 BTRFS_COMPARE_TREE_DELETED);
4134 if (ret < 0)
4135 goto out;
4136
4137 sctx->cur_inode_gen = left_gen;
4138 sctx->cur_inode_new = 1;
4139 sctx->cur_inode_deleted = 0;
4140 sctx->cur_inode_size = btrfs_inode_size(
4141 sctx->left_path->nodes[0], left_ii);
4142 sctx->cur_inode_mode = btrfs_inode_mode(
4143 sctx->left_path->nodes[0], left_ii);
4144 ret = send_create_inode(sctx, sctx->left_path,
4145 sctx->cmp_key);
4146 if (ret < 0)
4147 goto out;
4148
4149 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4150 if (ret < 0)
4151 goto out;
4152 ret = process_all_extents(sctx);
4153 if (ret < 0)
4154 goto out;
4155 ret = process_all_new_xattrs(sctx);
4156 if (ret < 0)
4157 goto out;
4158 } else {
4159 sctx->cur_inode_gen = left_gen;
4160 sctx->cur_inode_new = 0;
4161 sctx->cur_inode_new_gen = 0;
4162 sctx->cur_inode_deleted = 0;
4163 sctx->cur_inode_size = btrfs_inode_size(
4164 sctx->left_path->nodes[0], left_ii);
4165 sctx->cur_inode_mode = btrfs_inode_mode(
4166 sctx->left_path->nodes[0], left_ii);
4167 }
4168 }
4169
4170out:
4171 return ret;
4172}
4173
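/*
 * INODE_REF changes are only recorded here; they are processed in one
 * batch by process_recorded_refs_if_needed() once the whole inode has
 * been seen. Refs of the subvolume root itself are never sent.
 */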
4174static int changed_ref(struct send_ctx *sctx,
4175 enum btrfs_compare_tree_result result)
4176{
4177 int ret = 0;
4178
4179 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4180
4181 if (!sctx->cur_inode_new_gen &&
4182 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
4183 if (result == BTRFS_COMPARE_TREE_NEW)
4184 ret = record_new_ref(sctx);
4185 else if (result == BTRFS_COMPARE_TREE_DELETED)
4186 ret = record_deleted_ref(sctx);
4187 else if (result == BTRFS_COMPARE_TREE_CHANGED)
4188 ret = record_changed_ref(sctx);
4189 }
4190
4191 return ret;
4192}
4193
4194static int changed_xattr(struct send_ctx *sctx,
4195 enum btrfs_compare_tree_result result)
4196{
4197 int ret = 0;
4198
4199 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4200
4201 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
4202 if (result == BTRFS_COMPARE_TREE_NEW)
4203 ret = process_new_xattr(sctx);
4204 else if (result == BTRFS_COMPARE_TREE_DELETED)
4205 ret = process_deleted_xattr(sctx);
4206 else if (result == BTRFS_COMPARE_TREE_CHANGED)
4207 ret = process_changed_xattr(sctx);
4208 }
4209
4210 return ret;
4211}
4212
4213static int changed_extent(struct send_ctx *sctx,
4214 enum btrfs_compare_tree_result result)
4215{
4216 int ret = 0;
4217
4218 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4219
4220 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
4221 if (result != BTRFS_COMPARE_TREE_DELETED)
4222 ret = process_extent(sctx, sctx->left_path,
4223 sctx->cmp_key);
4224 }
4225
4226 return ret;
4227}
4228
4229
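/*
 * Central callback for btrfs_compare_trees() and full_send_tree().
 * Keys arrive in tree order, so a key belonging to a new inode implies
 * that the previous inode is complete and can be finished before the
 * new key is dispatched by type.
 */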
4230static int changed_cb(struct btrfs_root *left_root,
4231 struct btrfs_root *right_root,
4232 struct btrfs_path *left_path,
4233 struct btrfs_path *right_path,
4234 struct btrfs_key *key,
4235 enum btrfs_compare_tree_result result,
4236 void *ctx)
4237{
4238 int ret = 0;
4239 struct send_ctx *sctx = ctx;
4240
4241 sctx->left_path = left_path;
4242 sctx->right_path = right_path;
4243 sctx->cmp_key = key;
4244
4245 ret = finish_inode_if_needed(sctx, 0);
4246 if (ret < 0)
4247 goto out;
4248
4249 if (key->type == BTRFS_INODE_ITEM_KEY)
4250 ret = changed_inode(sctx, result);
4251 else if (key->type == BTRFS_INODE_REF_KEY)
4252 ret = changed_ref(sctx, result);
4253 else if (key->type == BTRFS_XATTR_ITEM_KEY)
4254 ret = changed_xattr(sctx, result);
4255 else if (key->type == BTRFS_EXTENT_DATA_KEY)
4256 ret = changed_extent(sctx, result);
4257
4258out:
4259 return ret;
4260}
4261
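/*
 * Full send, used when no parent snapshot was given: walk the whole
 * send root and report every item as BTRFS_COMPARE_TREE_NEW through
 * changed_cb(). The joined transaction keeps the commit roots stable
 * while we iterate.
 */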
4262static int full_send_tree(struct send_ctx *sctx)
4263{
4264 int ret;
4265 struct btrfs_trans_handle *trans = NULL;
4266 struct btrfs_root *send_root = sctx->send_root;
4267 struct btrfs_key key;
4268 struct btrfs_key found_key;
4269 struct btrfs_path *path;
4270 struct extent_buffer *eb;
4271 int slot;
4272 u64 start_ctransid;
4273 u64 ctransid;
4274
4275 path = alloc_path_for_send();
4276 if (!path)
4277 return -ENOMEM;
4278
4279 spin_lock(&send_root->root_times_lock);
4280 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4281 spin_unlock(&send_root->root_times_lock);
4282
4283 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4284 key.type = BTRFS_INODE_ITEM_KEY;
4285 key.offset = 0;
4286
4287join_trans:
4288 /*
4289 * We need to make sure the transaction does not get committed
4290 * while we do anything on commit roots. Join a transaction to prevent
4291 * this.
4292 */
4293 trans = btrfs_join_transaction(send_root);
4294 if (IS_ERR(trans)) {
4295 ret = PTR_ERR(trans);
4296 trans = NULL;
4297 goto out;
4298 }
4299
4300 /*
4301 * Make sure the tree has not changed
4302 */
4303 spin_lock(&send_root->root_times_lock);
4304 ctransid = btrfs_root_ctransid(&send_root->root_item);
4305 spin_unlock(&send_root->root_times_lock);
4306
4307 if (ctransid != start_ctransid) {
4308 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
4309 "send was modified in between. This is "
4310 "probably a bug.\n");
4311 ret = -EIO;
4312 goto out;
4313 }
4314
4315 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
4316 if (ret < 0)
4317 goto out;
4318 if (ret)
4319 goto out_finish;
4320
4321 while (1) {
4322 /*
4323 * When someone wants to commit while we iterate, end the
4324 * joined transaction and rejoin.
4325 */
4326 if (btrfs_should_end_transaction(trans, send_root)) {
4327 ret = btrfs_end_transaction(trans, send_root);
4328 trans = NULL;
4329 if (ret < 0)
4330 goto out;
4331 btrfs_release_path(path);
4332 goto join_trans;
4333 }
4334
4335 eb = path->nodes[0];
4336 slot = path->slots[0];
4337 btrfs_item_key_to_cpu(eb, &found_key, slot);
4338
4339 ret = changed_cb(send_root, NULL, path, NULL,
4340 &found_key, BTRFS_COMPARE_TREE_NEW, sctx);
4341 if (ret < 0)
4342 goto out;
4343
4344 key.objectid = found_key.objectid;
4345 key.type = found_key.type;
4346 key.offset = found_key.offset + 1;
4347
4348 ret = btrfs_next_item(send_root, path);
4349 if (ret < 0)
4350 goto out;
4351 if (ret) {
4352 ret = 0;
4353 break;
4354 }
4355 }
4356
4357out_finish:
4358 ret = finish_inode_if_needed(sctx, 1);
4359
4360out:
4361 btrfs_free_path(path);
4362 if (trans) {
4363 if (!ret)
4364 ret = btrfs_end_transaction(trans, send_root);
4365 else
4366 btrfs_end_transaction(trans, send_root);
4367 }
4368 return ret;
4369}
4370
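/*
 * Top level of a send operation: emit the stream header and the
 * subvol/snapshot command, then stream either the difference between
 * send_root and parent_root or a full send of the whole tree.
 */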
4371static int send_subvol(struct send_ctx *sctx)
4372{
4373 int ret;
4374
4375 ret = send_header(sctx);
4376 if (ret < 0)
4377 goto out;
4378
4379 ret = send_subvol_begin(sctx);
4380 if (ret < 0)
4381 goto out;
4382
4383 if (sctx->parent_root) {
4384 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
4385 changed_cb, sctx);
4386 if (ret < 0)
4387 goto out;
4388 ret = finish_inode_if_needed(sctx, 1);
4389 if (ret < 0)
4390 goto out;
4391 } else {
4392 ret = full_send_tree(sctx);
4393 if (ret < 0)
4394 goto out;
4395 }
4396
4397out:
4398 if (!ret)
4399 ret = close_cur_inode_file(sctx);
4400 else
4401 close_cur_inode_file(sctx);
4402
4403 free_recorded_refs(sctx);
4404 return ret;
4405}
4406
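/*
 * Entry point for the BTRFS_IOC_SEND ioctl. Copies the argument block
 * from user space, looks up all clone source roots and the optional
 * parent root, sets up the send context and its buffers, runs
 * send_subvol() and finally emits the BTRFS_SEND_C_END command.
 */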
4407long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4408{
4409 int ret = 0;
4410 struct btrfs_root *send_root;
4411 struct btrfs_root *clone_root;
4412 struct btrfs_fs_info *fs_info;
4413 struct btrfs_ioctl_send_args *arg = NULL;
4414 struct btrfs_key key;
4415 struct file *filp = NULL;
4416 struct send_ctx *sctx = NULL;
4417 u32 i;
4418 u64 *clone_sources_tmp = NULL;
4419
4420 if (!capable(CAP_SYS_ADMIN))
4421 return -EPERM;
4422
4423 send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
4424 fs_info = send_root->fs_info;
4425
4426 arg = memdup_user(arg_, sizeof(*arg));
4427 if (IS_ERR(arg)) {
4428 ret = PTR_ERR(arg);
4429 arg = NULL;
4430 goto out;
4431 }
4432
4433 if (!access_ok(VERIFY_READ, arg->clone_sources,
4434 sizeof(*arg->clone_sources) *
4435 arg->clone_sources_count)) {
4436 ret = -EFAULT;
4437 goto out;
4438 }
4439
4440 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
4441 if (!sctx) {
4442 ret = -ENOMEM;
4443 goto out;
4444 }
4445
4446 INIT_LIST_HEAD(&sctx->new_refs);
4447 INIT_LIST_HEAD(&sctx->deleted_refs);
4448 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
4449 INIT_LIST_HEAD(&sctx->name_cache_list);
4450
4451 sctx->send_filp = fget(arg->send_fd);
4452 if (!sctx->send_filp) {
4453 ret = -EBADF;
4454 goto out;
4455 }
4456
4457 sctx->mnt = mnt_file->f_path.mnt;
4458
4459 sctx->send_root = send_root;
4460 sctx->clone_roots_cnt = arg->clone_sources_count;
4461
4462 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
4463 sctx->send_buf = vmalloc(sctx->send_max_size);
4464 if (!sctx->send_buf) {
4465 ret = -ENOMEM;
4466 goto out;
4467 }
4468
4469 sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
4470 if (!sctx->read_buf) {
4471 ret = -ENOMEM;
4472 goto out;
4473 }
4474
4475 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
4476 (arg->clone_sources_count + 1));
4477 if (!sctx->clone_roots) {
4478 ret = -ENOMEM;
4479 goto out;
4480 }
4481
4482 if (arg->clone_sources_count) {
4483 clone_sources_tmp = vmalloc(arg->clone_sources_count *
4484 sizeof(*arg->clone_sources));
4485 if (!clone_sources_tmp) {
4486 ret = -ENOMEM;
4487 goto out;
4488 }
4489
4490 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
4491 arg->clone_sources_count *
4492 sizeof(*arg->clone_sources));
4493 if (ret) {
4494 ret = -EFAULT;
4495 goto out;
4496 }
4497
4498 for (i = 0; i < arg->clone_sources_count; i++) {
4499 key.objectid = clone_sources_tmp[i];
4500 key.type = BTRFS_ROOT_ITEM_KEY;
4501 key.offset = (u64)-1;
4502 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4503 if (!clone_root) {
4504 ret = -EINVAL;
4505 goto out;
4506 }
4507 if (IS_ERR(clone_root)) {
4508 ret = PTR_ERR(clone_root);
4509 goto out;
4510 }
4511 sctx->clone_roots[i].root = clone_root;
4512 }
4513 vfree(clone_sources_tmp);
4514 clone_sources_tmp = NULL;
4515 }
4516
4517 if (arg->parent_root) {
4518 key.objectid = arg->parent_root;
4519 key.type = BTRFS_ROOT_ITEM_KEY;
4520 key.offset = (u64)-1;
4521 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4522 if (IS_ERR_OR_NULL(sctx->parent_root)) {
4523 ret = -EINVAL;
4524 goto out;
4525 }
4526 }
4527
4528 /*
4529 * Clones from send_root are allowed, but only if the clone source
4530 * is behind the current send position. This is checked while searching
4531 * for possible clone sources.
4532 */
4533 sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
4534
4535 /* We do a bsearch later */
4536 sort(sctx->clone_roots, sctx->clone_roots_cnt,
4537 sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
4538 NULL);
4539
4540 ret = send_subvol(sctx);
4541 if (ret < 0)
4542 goto out;
4543
4544 ret = begin_cmd(sctx, BTRFS_SEND_C_END);
4545 if (ret < 0)
4546 goto out;
4547 ret = send_cmd(sctx);
4548 if (ret < 0)
4549 goto out;
4550
4551out:
4552 if (filp)
4553 fput(filp);
4554 kfree(arg);
4555 vfree(clone_sources_tmp);
4556
4557 if (sctx) {
4558 if (sctx->send_filp)
4559 fput(sctx->send_filp);
4560
4561 vfree(sctx->clone_roots);
4562 vfree(sctx->send_buf);
4563 vfree(sctx->read_buf);
4564
4565 name_cache_free(sctx);
4566
4567 kfree(sctx);
4568 }
4569
4570 return ret;
4571}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
new file mode 100644
index 00000000000..9934e948e57
--- /dev/null
+++ b/fs/btrfs/send.h
@@ -0,0 +1,133 @@
1/*
2 * Copyright (C) 2012 Alexander Block. All rights reserved.
3 * Copyright (C) 2012 STRATO. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#include "ctree.h"
21
22#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
23#define BTRFS_SEND_STREAM_VERSION 1
24
25#define BTRFS_SEND_BUF_SIZE (1024 * 64)
26#define BTRFS_SEND_READ_SIZE (1024 * 48)
27
28enum btrfs_tlv_type {
29 BTRFS_TLV_U8,
30 BTRFS_TLV_U16,
31 BTRFS_TLV_U32,
32 BTRFS_TLV_U64,
33 BTRFS_TLV_BINARY,
34 BTRFS_TLV_STRING,
35 BTRFS_TLV_UUID,
36 BTRFS_TLV_TIMESPEC,
37};
38
39struct btrfs_stream_header {
40 char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
41 __le32 version;
42} __attribute__ ((__packed__));
43
44struct btrfs_cmd_header {
45 /* len excluding the header */
46 __le32 len;
47 __le16 cmd;
48 /* crc including the header with zero crc field */
49 __le32 crc;
50} __attribute__ ((__packed__));
51
52struct btrfs_tlv_header {
53 __le16 tlv_type;
54 /* len excluding the header */
55 __le16 tlv_len;
56} __attribute__ ((__packed__));
57
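/*
 * Rough sketch of the resulting stream, as implied by the structs above
 * (all integers are little endian):
 *
 *   btrfs_stream_header
 *   btrfs_cmd_header, followed by 'len' bytes of TLV data
 *   btrfs_cmd_header, followed by 'len' bytes of TLV data
 *   ... until a BTRFS_SEND_C_END command terminates the stream.
 *
 * Each TLV is a btrfs_tlv_header followed by 'tlv_len' bytes of payload.
 */
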
58/* commands */
59enum btrfs_send_cmd {
60 BTRFS_SEND_C_UNSPEC,
61
62 BTRFS_SEND_C_SUBVOL,
63 BTRFS_SEND_C_SNAPSHOT,
64
65 BTRFS_SEND_C_MKFILE,
66 BTRFS_SEND_C_MKDIR,
67 BTRFS_SEND_C_MKNOD,
68 BTRFS_SEND_C_MKFIFO,
69 BTRFS_SEND_C_MKSOCK,
70 BTRFS_SEND_C_SYMLINK,
71
72 BTRFS_SEND_C_RENAME,
73 BTRFS_SEND_C_LINK,
74 BTRFS_SEND_C_UNLINK,
75 BTRFS_SEND_C_RMDIR,
76
77 BTRFS_SEND_C_SET_XATTR,
78 BTRFS_SEND_C_REMOVE_XATTR,
79
80 BTRFS_SEND_C_WRITE,
81 BTRFS_SEND_C_CLONE,
82
83 BTRFS_SEND_C_TRUNCATE,
84 BTRFS_SEND_C_CHMOD,
85 BTRFS_SEND_C_CHOWN,
86 BTRFS_SEND_C_UTIMES,
87
88 BTRFS_SEND_C_END,
89 __BTRFS_SEND_C_MAX,
90};
91#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
92
93/* attributes in send stream */
94enum {
95 BTRFS_SEND_A_UNSPEC,
96
97 BTRFS_SEND_A_UUID,
98 BTRFS_SEND_A_CTRANSID,
99
100 BTRFS_SEND_A_INO,
101 BTRFS_SEND_A_SIZE,
102 BTRFS_SEND_A_MODE,
103 BTRFS_SEND_A_UID,
104 BTRFS_SEND_A_GID,
105 BTRFS_SEND_A_RDEV,
106 BTRFS_SEND_A_CTIME,
107 BTRFS_SEND_A_MTIME,
108 BTRFS_SEND_A_ATIME,
109 BTRFS_SEND_A_OTIME,
110
111 BTRFS_SEND_A_XATTR_NAME,
112 BTRFS_SEND_A_XATTR_DATA,
113
114 BTRFS_SEND_A_PATH,
115 BTRFS_SEND_A_PATH_TO,
116 BTRFS_SEND_A_PATH_LINK,
117
118 BTRFS_SEND_A_FILE_OFFSET,
119 BTRFS_SEND_A_DATA,
120
121 BTRFS_SEND_A_CLONE_UUID,
122 BTRFS_SEND_A_CLONE_CTRANSID,
123 BTRFS_SEND_A_CLONE_PATH,
124 BTRFS_SEND_A_CLONE_OFFSET,
125 BTRFS_SEND_A_CLONE_LEN,
126
127 __BTRFS_SEND_A_MAX,
128};
129#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
130
131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133#endif
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c6ffa581241..b976597b072 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,15 +17,27 @@
17 */ 17 */
18 18
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <asm/unaligned.h>
20 21
21/* this is some deeply nasty code. ctree.h has a different 22#include "ctree.h"
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef 23
24static inline u8 get_unaligned_le8(const void *p)
25{
26 return *(u8 *)p;
27}
28
29static inline void put_unaligned_le8(u8 val, void *p)
30{
31 *(u8 *)p = val;
32}
33
34/*
35 * this is some deeply nasty code.
23 * 36 *
24 * The end result is that anyone who #includes ctree.h gets a 37 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions 38 * declaration for the btrfs_set_foo functions and btrfs_foo functions,
26 * 39 * which are wrappers of btrfs_set_token_#bits functions and
27 * This file declares the macros and then #includes ctree.h, which results 40 * btrfs_get_token_#bits functions, which are defined in this file.
28 * in cpp creating the function here based on the template below.
29 * 41 *
30 * These setget functions do all the extent_buffer related mapping 42 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent 43 * required to efficiently read and write specific fields in the extent
@@ -33,103 +45,93 @@
33 * an unsigned long offset into the extent buffer which has been 45 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking. 46 * cast to a specific type. This gives us all the gcc type checking.
35 * 47 *
36 * The extent buffer api is used to do all the kmapping and page 48 * The extent buffer api is used to do the page spanning work required to
37 * spanning work required to get extent buffers in highmem and have 49 * have a metadata blocksize different from the page size.
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */ 50 */
43 51
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 52#define DEFINE_BTRFS_SETGET_BITS(bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ 53u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ 54 unsigned long off, \
47void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ 55 struct btrfs_map_token *token) \
48u##bits btrfs_token_##name(struct extent_buffer *eb, \
49 type *s, struct btrfs_map_token *token) \
50{ \ 56{ \
51 unsigned long part_offset = (unsigned long)s; \ 57 unsigned long part_offset = (unsigned long)ptr; \
52 unsigned long offset = part_offset + offsetof(type, member); \ 58 unsigned long offset = part_offset + off; \
53 type *p; \ 59 void *p; \
54 int err; \ 60 int err; \
55 char *kaddr; \ 61 char *kaddr; \
56 unsigned long map_start; \ 62 unsigned long map_start; \
57 unsigned long map_len; \ 63 unsigned long map_len; \
58 unsigned long mem_len = sizeof(((type *)0)->member); \ 64 int size = sizeof(u##bits); \
59 u##bits res; \ 65 u##bits res; \
60 if (token && token->kaddr && token->offset <= offset && \ 66 \
61 token->eb == eb && \ 67 if (token && token->kaddr && token->offset <= offset && \
62 (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ 68 token->eb == eb && \
63 kaddr = token->kaddr; \ 69 (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
64 p = (type *)(kaddr + part_offset - token->offset); \ 70 kaddr = token->kaddr; \
65 res = le##bits##_to_cpu(p->member); \ 71 p = kaddr + part_offset - token->offset; \
66 return res; \ 72 res = get_unaligned_le##bits(p + off); \
67 } \ 73 return res; \
68 err = map_private_extent_buffer(eb, offset, \ 74 } \
69 mem_len, \ 75 err = map_private_extent_buffer(eb, offset, size, \
70 &kaddr, &map_start, &map_len); \ 76 &kaddr, &map_start, &map_len); \
71 if (err) { \ 77 if (err) { \
72 __le##bits leres; \ 78 __le##bits leres; \
73 read_eb_member(eb, s, type, member, &leres); \ 79 \
74 return le##bits##_to_cpu(leres); \ 80 read_extent_buffer(eb, &leres, offset, size); \
75 } \ 81 return le##bits##_to_cpu(leres); \
76 p = (type *)(kaddr + part_offset - map_start); \ 82 } \
77 res = le##bits##_to_cpu(p->member); \ 83 p = kaddr + part_offset - map_start; \
78 if (token) { \ 84 res = get_unaligned_le##bits(p + off); \
79 token->kaddr = kaddr; \ 85 if (token) { \
80 token->offset = map_start; \ 86 token->kaddr = kaddr; \
81 token->eb = eb; \ 87 token->offset = map_start; \
82 } \ 88 token->eb = eb; \
83 return res; \ 89 } \
90 return res; \
84} \ 91} \
85void btrfs_set_token_##name(struct extent_buffer *eb, \ 92void btrfs_set_token_##bits(struct extent_buffer *eb, \
86 type *s, u##bits val, struct btrfs_map_token *token) \ 93 void *ptr, unsigned long off, u##bits val, \
94 struct btrfs_map_token *token) \
87{ \ 95{ \
88 unsigned long part_offset = (unsigned long)s; \ 96 unsigned long part_offset = (unsigned long)ptr; \
89 unsigned long offset = part_offset + offsetof(type, member); \ 97 unsigned long offset = part_offset + off; \
90 type *p; \ 98 void *p; \
91 int err; \ 99 int err; \
92 char *kaddr; \ 100 char *kaddr; \
93 unsigned long map_start; \ 101 unsigned long map_start; \
94 unsigned long map_len; \ 102 unsigned long map_len; \
95 unsigned long mem_len = sizeof(((type *)0)->member); \ 103 int size = sizeof(u##bits); \
96 if (token && token->kaddr && token->offset <= offset && \ 104 \
97 token->eb == eb && \ 105 if (token && token->kaddr && token->offset <= offset && \
98 (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ 106 token->eb == eb && \
99 kaddr = token->kaddr; \ 107 (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
100 p = (type *)(kaddr + part_offset - token->offset); \ 108 kaddr = token->kaddr; \
101 p->member = cpu_to_le##bits(val); \ 109 p = kaddr + part_offset - token->offset; \
102 return; \ 110 put_unaligned_le##bits(val, p + off); \
103 } \ 111 return; \
104 err = map_private_extent_buffer(eb, offset, \ 112 } \
105 mem_len, \ 113 err = map_private_extent_buffer(eb, offset, size, \
106 &kaddr, &map_start, &map_len); \ 114 &kaddr, &map_start, &map_len); \
107 if (err) { \ 115 if (err) { \
108 __le##bits val2; \ 116 __le##bits val2; \
109 val2 = cpu_to_le##bits(val); \ 117 \
110 write_eb_member(eb, s, type, member, &val2); \ 118 val2 = cpu_to_le##bits(val); \
111 return; \ 119 write_extent_buffer(eb, &val2, offset, size); \
112 } \ 120 return; \
113 p = (type *)(kaddr + part_offset - map_start); \ 121 } \
114 p->member = cpu_to_le##bits(val); \ 122 p = kaddr + part_offset - map_start; \
115 if (token) { \ 123 put_unaligned_le##bits(val, p + off); \
116 token->kaddr = kaddr; \ 124 if (token) { \
117 token->offset = map_start; \ 125 token->kaddr = kaddr; \
118 token->eb = eb; \ 126 token->offset = map_start; \
119 } \ 127 token->eb = eb; \
120} \ 128 } \
121void btrfs_set_##name(struct extent_buffer *eb, \ 129}
122 type *s, u##bits val) \
123{ \
124 btrfs_set_token_##name(eb, s, val, NULL); \
125} \
126u##bits btrfs_##name(struct extent_buffer *eb, \
127 type *s) \
128{ \
129 return btrfs_token_##name(eb, s, NULL); \
130} \
131 130
132#include "ctree.h" 131DEFINE_BTRFS_SETGET_BITS(8)
132DEFINE_BTRFS_SETGET_BITS(16)
133DEFINE_BTRFS_SETGET_BITS(32)
134DEFINE_BTRFS_SETGET_BITS(64)
133 135
134void btrfs_node_key(struct extent_buffer *eb, 136void btrfs_node_key(struct extent_buffer *eb,
135 struct btrfs_disk_key *disk_key, int nr) 137 struct btrfs_disk_key *disk_key, int nr)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b19d7556772..fa61ef59cd6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -396,15 +396,23 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
396 strcmp(args[0].from, "zlib") == 0) { 396 strcmp(args[0].from, "zlib") == 0) {
397 compress_type = "zlib"; 397 compress_type = "zlib";
398 info->compress_type = BTRFS_COMPRESS_ZLIB; 398 info->compress_type = BTRFS_COMPRESS_ZLIB;
399 btrfs_set_opt(info->mount_opt, COMPRESS);
399 } else if (strcmp(args[0].from, "lzo") == 0) { 400 } else if (strcmp(args[0].from, "lzo") == 0) {
400 compress_type = "lzo"; 401 compress_type = "lzo";
401 info->compress_type = BTRFS_COMPRESS_LZO; 402 info->compress_type = BTRFS_COMPRESS_LZO;
403 btrfs_set_opt(info->mount_opt, COMPRESS);
404 btrfs_set_fs_incompat(info, COMPRESS_LZO);
405 } else if (strncmp(args[0].from, "no", 2) == 0) {
406 compress_type = "no";
407 info->compress_type = BTRFS_COMPRESS_NONE;
408 btrfs_clear_opt(info->mount_opt, COMPRESS);
409 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
410 compress_force = false;
402 } else { 411 } else {
403 ret = -EINVAL; 412 ret = -EINVAL;
404 goto out; 413 goto out;
405 } 414 }
406 415
407 btrfs_set_opt(info->mount_opt, COMPRESS);
408 if (compress_force) { 416 if (compress_force) {
409 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 417 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
410 pr_info("btrfs: force %s compression\n", 418 pr_info("btrfs: force %s compression\n",
@@ -1455,6 +1463,13 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1455 ret = btrfs_scan_one_device(vol->name, FMODE_READ, 1463 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1456 &btrfs_fs_type, &fs_devices); 1464 &btrfs_fs_type, &fs_devices);
1457 break; 1465 break;
1466 case BTRFS_IOC_DEVICES_READY:
1467 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1468 &btrfs_fs_type, &fs_devices);
1469 if (ret)
1470 break;
1471 ret = !(fs_devices->num_devices == fs_devices->total_devices);
1472 break;
1458 } 1473 }
1459 1474
1460 kfree(vol); 1475 kfree(vol);
@@ -1477,16 +1492,6 @@ static int btrfs_unfreeze(struct super_block *sb)
1477 return 0; 1492 return 0;
1478} 1493}
1479 1494
1480static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1481{
1482 int ret;
1483
1484 ret = btrfs_dirty_inode(inode);
1485 if (ret)
1486 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1487 "error %d\n", btrfs_ino(inode), ret);
1488}
1489
1490static int btrfs_show_devname(struct seq_file *m, struct dentry *root) 1495static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1491{ 1496{
1492 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 1497 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -1526,7 +1531,6 @@ static const struct super_operations btrfs_super_ops = {
1526 .show_options = btrfs_show_options, 1531 .show_options = btrfs_show_options,
1527 .show_devname = btrfs_show_devname, 1532 .show_devname = btrfs_show_devname,
1528 .write_inode = btrfs_write_inode, 1533 .write_inode = btrfs_write_inode,
1529 .dirty_inode = btrfs_fs_dirty_inode,
1530 .alloc_inode = btrfs_alloc_inode, 1534 .alloc_inode = btrfs_alloc_inode,
1531 .destroy_inode = btrfs_destroy_inode, 1535 .destroy_inode = btrfs_destroy_inode,
1532 .statfs = btrfs_statfs, 1536 .statfs = btrfs_statfs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b72b068183e..7ac7cdcc294 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -22,6 +22,7 @@
22#include <linux/writeback.h> 22#include <linux/writeback.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/uuid.h>
25#include "ctree.h" 26#include "ctree.h"
26#include "disk-io.h" 27#include "disk-io.h"
27#include "transaction.h" 28#include "transaction.h"
@@ -38,7 +39,6 @@ void put_transaction(struct btrfs_transaction *transaction)
38 if (atomic_dec_and_test(&transaction->use_count)) { 39 if (atomic_dec_and_test(&transaction->use_count)) {
39 BUG_ON(!list_empty(&transaction->list)); 40 BUG_ON(!list_empty(&transaction->list));
40 WARN_ON(transaction->delayed_refs.root.rb_node); 41 WARN_ON(transaction->delayed_refs.root.rb_node);
41 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
42 memset(transaction, 0, sizeof(*transaction)); 42 memset(transaction, 0, sizeof(*transaction));
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 43 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 44 }
@@ -100,8 +100,8 @@ loop:
100 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
101 cur_trans = fs_info->running_transaction; 101 cur_trans = fs_info->running_transaction;
102 goto loop; 102 goto loop;
103 } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 103 } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
104 spin_unlock(&root->fs_info->trans_lock); 104 spin_unlock(&fs_info->trans_lock);
105 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 105 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
106 return -EROFS; 106 return -EROFS;
107 } 107 }
@@ -126,7 +126,6 @@ loop:
126 cur_trans->delayed_refs.num_heads = 0; 126 cur_trans->delayed_refs.num_heads = 0;
127 cur_trans->delayed_refs.flushing = 0; 127 cur_trans->delayed_refs.flushing = 0;
128 cur_trans->delayed_refs.run_delayed_start = 0; 128 cur_trans->delayed_refs.run_delayed_start = 0;
129 cur_trans->delayed_refs.seq = 1;
130 129
131 /* 130 /*
132 * although the tree mod log is per file system and not per transaction, 131 * although the tree mod log is per file system and not per transaction,
@@ -145,10 +144,8 @@ loop:
145 } 144 }
146 atomic_set(&fs_info->tree_mod_seq, 0); 145 atomic_set(&fs_info->tree_mod_seq, 0);
147 146
148 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
149 spin_lock_init(&cur_trans->commit_lock); 147 spin_lock_init(&cur_trans->commit_lock);
150 spin_lock_init(&cur_trans->delayed_refs.lock); 148 spin_lock_init(&cur_trans->delayed_refs.lock);
151 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
152 149
153 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 150 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
154 list_add_tail(&cur_trans->list, &fs_info->trans_list); 151 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -299,6 +296,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
299 struct btrfs_transaction *cur_trans; 296 struct btrfs_transaction *cur_trans;
300 u64 num_bytes = 0; 297 u64 num_bytes = 0;
301 int ret; 298 int ret;
299 u64 qgroup_reserved = 0;
302 300
303 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 301 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
304 return ERR_PTR(-EROFS); 302 return ERR_PTR(-EROFS);
@@ -317,6 +315,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
317 * the appropriate flushing if need be. 315 * the appropriate flushing if need be.
318 */ 316 */
319 if (num_items > 0 && root != root->fs_info->chunk_root) { 317 if (num_items > 0 && root != root->fs_info->chunk_root) {
318 if (root->fs_info->quota_enabled &&
319 is_fstree(root->root_key.objectid)) {
320 qgroup_reserved = num_items * root->leafsize;
321 ret = btrfs_qgroup_reserve(root, qgroup_reserved);
322 if (ret)
323 return ERR_PTR(ret);
324 }
325
320 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
321 ret = btrfs_block_rsv_add(root, 327 ret = btrfs_block_rsv_add(root,
322 &root->fs_info->trans_block_rsv, 328 &root->fs_info->trans_block_rsv,
@@ -349,11 +355,16 @@ again:
349 h->transaction = cur_trans; 355 h->transaction = cur_trans;
350 h->blocks_used = 0; 356 h->blocks_used = 0;
351 h->bytes_reserved = 0; 357 h->bytes_reserved = 0;
358 h->root = root;
352 h->delayed_ref_updates = 0; 359 h->delayed_ref_updates = 0;
353 h->use_count = 1; 360 h->use_count = 1;
361 h->adding_csums = 0;
354 h->block_rsv = NULL; 362 h->block_rsv = NULL;
355 h->orig_rsv = NULL; 363 h->orig_rsv = NULL;
356 h->aborted = 0; 364 h->aborted = 0;
365 h->qgroup_reserved = qgroup_reserved;
366 h->delayed_ref_elem.seq = 0;
367 INIT_LIST_HEAD(&h->qgroup_ref_list);
357 368
358 smp_mb(); 369 smp_mb();
359 if (cur_trans->blocked && may_wait_transaction(root, type)) { 370 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -473,7 +484,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
473 struct btrfs_root *root) 484 struct btrfs_root *root)
474{ 485{
475 struct btrfs_transaction *cur_trans = trans->transaction; 486 struct btrfs_transaction *cur_trans = trans->transaction;
476 struct btrfs_block_rsv *rsv = trans->block_rsv;
477 int updates; 487 int updates;
478 int err; 488 int err;
479 489
@@ -481,12 +491,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
481 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 491 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
482 return 1; 492 return 1;
483 493
484 /*
485 * We need to do this in case we're deleting csums so the global block
486 * rsv get's used instead of the csum block rsv.
487 */
488 trans->block_rsv = NULL;
489
490 updates = trans->delayed_ref_updates; 494 updates = trans->delayed_ref_updates;
491 trans->delayed_ref_updates = 0; 495 trans->delayed_ref_updates = 0;
492 if (updates) { 496 if (updates) {
@@ -495,8 +499,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
495 return err; 499 return err;
496 } 500 }
497 501
498 trans->block_rsv = rsv;
499
500 return should_end_transaction(trans, root); 502 return should_end_transaction(trans, root);
501} 503}
502 504
@@ -513,8 +515,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
513 return 0; 515 return 0;
514 } 516 }
515 517
518 /*
519 * do the qgroup accounting as early as possible
520 */
521 err = btrfs_delayed_refs_qgroup_accounting(trans, info);
522
516 btrfs_trans_release_metadata(trans, root); 523 btrfs_trans_release_metadata(trans, root);
517 trans->block_rsv = NULL; 524 trans->block_rsv = NULL;
525 /*
526 * the same root has to be passed to start_transaction and
527 * end_transaction. Subvolume quota depends on this.
528 */
529 WARN_ON(trans->root != root);
530
531 if (trans->qgroup_reserved) {
532 btrfs_qgroup_free(root, trans->qgroup_reserved);
533 trans->qgroup_reserved = 0;
534 }
535
518 while (count < 2) { 536 while (count < 2) {
519 unsigned long cur = trans->delayed_ref_updates; 537 unsigned long cur = trans->delayed_ref_updates;
520 trans->delayed_ref_updates = 0; 538 trans->delayed_ref_updates = 0;
@@ -527,6 +545,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
527 } 545 }
528 count++; 546 count++;
529 } 547 }
548 btrfs_trans_release_metadata(trans, root);
549 trans->block_rsv = NULL;
530 550
531 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 551 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
532 should_end_transaction(trans, root)) { 552 should_end_transaction(trans, root)) {
@@ -567,6 +587,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
567 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 587 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
568 err = -EIO; 588 err = -EIO;
569 } 589 }
590 assert_qgroups_uptodate(trans);
570 591
571 memset(trans, 0, sizeof(*trans)); 592 memset(trans, 0, sizeof(*trans));
572 kmem_cache_free(btrfs_trans_handle_cachep, trans); 593 kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -785,6 +806,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
785 ret = btrfs_run_dev_stats(trans, root->fs_info); 806 ret = btrfs_run_dev_stats(trans, root->fs_info);
786 BUG_ON(ret); 807 BUG_ON(ret);
787 808
809 ret = btrfs_run_qgroups(trans, root->fs_info);
810 BUG_ON(ret);
811
812 /* run_qgroups might have added some more refs */
813 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
814 BUG_ON(ret);
815
788 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 816 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
789 next = fs_info->dirty_cowonly_roots.next; 817 next = fs_info->dirty_cowonly_roots.next;
790 list_del_init(next); 818 list_del_init(next);
@@ -926,11 +954,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
926 struct dentry *dentry; 954 struct dentry *dentry;
927 struct extent_buffer *tmp; 955 struct extent_buffer *tmp;
928 struct extent_buffer *old; 956 struct extent_buffer *old;
957 struct timespec cur_time = CURRENT_TIME;
929 int ret; 958 int ret;
930 u64 to_reserve = 0; 959 u64 to_reserve = 0;
931 u64 index = 0; 960 u64 index = 0;
932 u64 objectid; 961 u64 objectid;
933 u64 root_flags; 962 u64 root_flags;
963 uuid_le new_uuid;
934 964
935 rsv = trans->block_rsv; 965 rsv = trans->block_rsv;
936 966
@@ -957,6 +987,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
957 } 987 }
958 } 988 }
959 989
990 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
991 objectid, pending->inherit);
992 kfree(pending->inherit);
993 if (ret) {
994 pending->error = ret;
995 goto fail;
996 }
997
960 key.objectid = objectid; 998 key.objectid = objectid;
961 key.offset = (u64)-1; 999 key.offset = (u64)-1;
962 key.type = BTRFS_ROOT_ITEM_KEY; 1000 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1016,6 +1054,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1016 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; 1054 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
1017 btrfs_set_root_flags(new_root_item, root_flags); 1055 btrfs_set_root_flags(new_root_item, root_flags);
1018 1056
1057 btrfs_set_root_generation_v2(new_root_item,
1058 trans->transid);
1059 uuid_le_gen(&new_uuid);
1060 memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
1061 memcpy(new_root_item->parent_uuid, root->root_item.uuid,
1062 BTRFS_UUID_SIZE);
1063 new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
1064 new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
1065 btrfs_set_root_otransid(new_root_item, trans->transid);
1066 memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
1067 memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
1068 btrfs_set_root_stransid(new_root_item, 0);
1069 btrfs_set_root_rtransid(new_root_item, 0);
1070
1019 old = btrfs_lock_root_node(root); 1071 old = btrfs_lock_root_node(root);
1020 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); 1072 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
1021 if (ret) { 1073 if (ret) {
@@ -1269,9 +1321,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1269 1321
1270 btrfs_run_ordered_operations(root, 0); 1322 btrfs_run_ordered_operations(root, 0);
1271 1323
1272 btrfs_trans_release_metadata(trans, root);
1273 trans->block_rsv = NULL;
1274
1275 if (cur_trans->aborted) 1324 if (cur_trans->aborted)
1276 goto cleanup_transaction; 1325 goto cleanup_transaction;
1277 1326
@@ -1282,6 +1331,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1282 if (ret) 1331 if (ret)
1283 goto cleanup_transaction; 1332 goto cleanup_transaction;
1284 1333
1334 btrfs_trans_release_metadata(trans, root);
1335 trans->block_rsv = NULL;
1336
1285 cur_trans = trans->transaction; 1337 cur_trans = trans->transaction;
1286 1338
1287 /* 1339 /*
@@ -1330,7 +1382,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1330 spin_unlock(&root->fs_info->trans_lock); 1382 spin_unlock(&root->fs_info->trans_lock);
1331 } 1383 }
1332 1384
1333 if (now < cur_trans->start_time || now - cur_trans->start_time < 1) 1385 if (!btrfs_test_opt(root, SSD) &&
1386 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1334 should_grow = 1; 1387 should_grow = 1;
1335 1388
1336 do { 1389 do {
@@ -1352,6 +1405,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1352 goto cleanup_transaction; 1405 goto cleanup_transaction;
1353 1406
1354 /* 1407 /*
1408 * running the delayed items may have added new refs. account
1409 * them now so that they hinder processing of more delayed refs
1410 * as little as possible.
1411 */
1412 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1413
1414 /*
1355 * rename don't use btrfs_join_transaction, so, once we 1415 * rename don't use btrfs_join_transaction, so, once we
1356 * set the transaction to blocked above, we aren't going 1416 * set the transaction to blocked above, we aren't going
1357 * to get any new ordered operations. We can safely run 1417 * to get any new ordered operations. We can safely run
@@ -1463,6 +1523,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1463 root->fs_info->chunk_root->node); 1523 root->fs_info->chunk_root->node);
1464 switch_commit_root(root->fs_info->chunk_root); 1524 switch_commit_root(root->fs_info->chunk_root);
1465 1525
1526 assert_qgroups_uptodate(trans);
1466 update_super_roots(root); 1527 update_super_roots(root);
1467 1528
1468 if (!root->fs_info->log_root_recovering) { 1529 if (!root->fs_info->log_root_recovering) {
@@ -1532,6 +1593,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1532 return ret; 1593 return ret;
1533 1594
1534cleanup_transaction: 1595cleanup_transaction:
1596 btrfs_trans_release_metadata(trans, root);
1597 trans->block_rsv = NULL;
1535 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); 1598 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
1536// WARN_ON(1); 1599// WARN_ON(1);
1537 if (current->journal_info == trans) 1600 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fe27379e368..e8b8416c688 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -20,6 +20,7 @@
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h"
23 24
24struct btrfs_transaction { 25struct btrfs_transaction {
25 u64 transid; 26 u64 transid;
@@ -49,6 +50,7 @@ struct btrfs_transaction {
49struct btrfs_trans_handle { 50struct btrfs_trans_handle {
50 u64 transid; 51 u64 transid;
51 u64 bytes_reserved; 52 u64 bytes_reserved;
53 u64 qgroup_reserved;
52 unsigned long use_count; 54 unsigned long use_count;
53 unsigned long blocks_reserved; 55 unsigned long blocks_reserved;
54 unsigned long blocks_used; 56 unsigned long blocks_used;
@@ -57,12 +59,22 @@ struct btrfs_trans_handle {
57 struct btrfs_block_rsv *block_rsv; 59 struct btrfs_block_rsv *block_rsv;
58 struct btrfs_block_rsv *orig_rsv; 60 struct btrfs_block_rsv *orig_rsv;
59 int aborted; 61 int aborted;
62 int adding_csums;
63 /*
64 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction.
66 * Subvolume quota depends on this
67 */
68 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list;
60}; 71};
61 72
62struct btrfs_pending_snapshot { 73struct btrfs_pending_snapshot {
63 struct dentry *dentry; 74 struct dentry *dentry;
64 struct btrfs_root *root; 75 struct btrfs_root *root;
65 struct btrfs_root *snap; 76 struct btrfs_root *snap;
77 struct btrfs_qgroup_inherit *inherit;
66 /* block reservation for the operation */ 78 /* block reservation for the operation */
67 struct btrfs_block_rsv block_rsv; 79 struct btrfs_block_rsv block_rsv;
68 /* extra metadata reservation for relocation */ 80 /* extra metadata reservation for relocation */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8abeae4224f..c86670f4f28 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -637,7 +637,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
637 } 637 }
638 638
639 inode_set_bytes(inode, saved_nbytes); 639 inode_set_bytes(inode, saved_nbytes);
640 btrfs_update_inode(trans, root, inode); 640 ret = btrfs_update_inode(trans, root, inode);
641out: 641out:
642 if (inode) 642 if (inode)
643 iput(inode); 643 iput(inode);
@@ -1133,7 +1133,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1133 btrfs_release_path(path); 1133 btrfs_release_path(path);
1134 if (ret == 0) { 1134 if (ret == 0) {
1135 btrfs_inc_nlink(inode); 1135 btrfs_inc_nlink(inode);
1136 btrfs_update_inode(trans, root, inode); 1136 ret = btrfs_update_inode(trans, root, inode);
1137 } else if (ret == -EEXIST) { 1137 } else if (ret == -EEXIST) {
1138 ret = 0; 1138 ret = 0;
1139 } else { 1139 } else {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ecaad40e7ef..b8708f994e6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -429,6 +429,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
429 mutex_init(&fs_devices->device_list_mutex); 429 mutex_init(&fs_devices->device_list_mutex);
430 fs_devices->latest_devid = orig->latest_devid; 430 fs_devices->latest_devid = orig->latest_devid;
431 fs_devices->latest_trans = orig->latest_trans; 431 fs_devices->latest_trans = orig->latest_trans;
432 fs_devices->total_devices = orig->total_devices;
432 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 433 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
433 434
434 /* We have held the volume lock, it is safe to get the devices. */ 435 /* We have held the volume lock, it is safe to get the devices. */
@@ -739,6 +740,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
739 int ret; 740 int ret;
740 u64 devid; 741 u64 devid;
741 u64 transid; 742 u64 transid;
743 u64 total_devices;
742 744
743 flags |= FMODE_EXCL; 745 flags |= FMODE_EXCL;
744 bdev = blkdev_get_by_path(path, flags, holder); 746 bdev = blkdev_get_by_path(path, flags, holder);
@@ -760,6 +762,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
760 disk_super = (struct btrfs_super_block *)bh->b_data; 762 disk_super = (struct btrfs_super_block *)bh->b_data;
761 devid = btrfs_stack_device_id(&disk_super->dev_item); 763 devid = btrfs_stack_device_id(&disk_super->dev_item);
762 transid = btrfs_super_generation(disk_super); 764 transid = btrfs_super_generation(disk_super);
765 total_devices = btrfs_super_num_devices(disk_super);
763 if (disk_super->label[0]) 766 if (disk_super->label[0])
764 printk(KERN_INFO "device label %s ", disk_super->label); 767 printk(KERN_INFO "device label %s ", disk_super->label);
765 else 768 else
@@ -767,7 +770,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
767 printk(KERN_CONT "devid %llu transid %llu %s\n", 770 printk(KERN_CONT "devid %llu transid %llu %s\n",
768 (unsigned long long)devid, (unsigned long long)transid, path); 771 (unsigned long long)devid, (unsigned long long)transid, path);
769 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 772 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
770 773 if (!ret && fs_devices_ret)
774 (*fs_devices_ret)->total_devices = total_devices;
771 brelse(bh); 775 brelse(bh);
772error_close: 776error_close:
773 mutex_unlock(&uuid_mutex); 777 mutex_unlock(&uuid_mutex);
@@ -1433,6 +1437,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1433 list_del_rcu(&device->dev_list); 1437 list_del_rcu(&device->dev_list);
1434 1438
1435 device->fs_devices->num_devices--; 1439 device->fs_devices->num_devices--;
1440 device->fs_devices->total_devices--;
1436 1441
1437 if (device->missing) 1442 if (device->missing)
1438 root->fs_info->fs_devices->missing_devices--; 1443 root->fs_info->fs_devices->missing_devices--;
@@ -1550,6 +1555,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1550 fs_devices->seeding = 0; 1555 fs_devices->seeding = 0;
1551 fs_devices->num_devices = 0; 1556 fs_devices->num_devices = 0;
1552 fs_devices->open_devices = 0; 1557 fs_devices->open_devices = 0;
1558 fs_devices->total_devices = 0;
1553 fs_devices->seed = seed_devices; 1559 fs_devices->seed = seed_devices;
1554 1560
1555 generate_random_uuid(fs_devices->fsid); 1561 generate_random_uuid(fs_devices->fsid);
@@ -1749,6 +1755,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1749 root->fs_info->fs_devices->num_devices++; 1755 root->fs_info->fs_devices->num_devices++;
1750 root->fs_info->fs_devices->open_devices++; 1756 root->fs_info->fs_devices->open_devices++;
1751 root->fs_info->fs_devices->rw_devices++; 1757 root->fs_info->fs_devices->rw_devices++;
1758 root->fs_info->fs_devices->total_devices++;
1752 if (device->can_discard) 1759 if (device->can_discard)
1753 root->fs_info->fs_devices->num_can_discard++; 1760 root->fs_info->fs_devices->num_can_discard++;
1754 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1761 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
@@ -4736,9 +4743,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4736 key.offset = device->devid; 4743 key.offset = device->devid;
4737 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 4744 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4738 if (ret) { 4745 if (ret) {
4739 printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
4740 rcu_str_deref(device->name),
4741 (unsigned long long)device->devid);
4742 __btrfs_reset_dev_stats(device); 4746 __btrfs_reset_dev_stats(device);
4743 device->dev_stats_valid = 1; 4747 device->dev_stats_valid = 1;
4744 btrfs_release_path(path); 4748 btrfs_release_path(path);
@@ -4880,6 +4884,14 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4880 4884
4881static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 4885static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4882{ 4886{
4887 int i;
4888
4889 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4890 if (btrfs_dev_stat_read(dev, i) != 0)
4891 break;
4892 if (i == BTRFS_DEV_STAT_VALUES_MAX)
4893 return; /* all values == 0, suppress message */
4894
4883 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4895 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4884 rcu_str_deref(dev->name), 4896 rcu_str_deref(dev->name),
4885 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4897 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -4890,8 +4902,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4890} 4902}
4891 4903
4892int btrfs_get_dev_stats(struct btrfs_root *root, 4904int btrfs_get_dev_stats(struct btrfs_root *root,
4893 struct btrfs_ioctl_get_dev_stats *stats, 4905 struct btrfs_ioctl_get_dev_stats *stats)
4894 int reset_after_read)
4895{ 4906{
4896 struct btrfs_device *dev; 4907 struct btrfs_device *dev;
4897 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4908 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
@@ -4909,7 +4920,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4909 printk(KERN_WARNING 4920 printk(KERN_WARNING
4910 "btrfs: get dev_stats failed, not yet valid\n"); 4921 "btrfs: get dev_stats failed, not yet valid\n");
4911 return -ENODEV; 4922 return -ENODEV;
4912 } else if (reset_after_read) { 4923 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
4913 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 4924 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4914 if (stats->nr_items > i) 4925 if (stats->nr_items > i)
4915 stats->values[i] = 4926 stats->values[i] =
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 95f6637614d..5479325987b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -126,6 +126,7 @@ struct btrfs_fs_devices {
126 u64 missing_devices; 126 u64 missing_devices;
127 u64 total_rw_bytes; 127 u64 total_rw_bytes;
128 u64 num_can_discard; 128 u64 num_can_discard;
129 u64 total_devices;
129 struct block_device *latest_bdev; 130 struct block_device *latest_bdev;
130 131
131 /* all of the devices in the FS, protected by a mutex 132 /* all of the devices in the FS, protected by a mutex
@@ -293,8 +294,7 @@ struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
293void btrfs_dev_stat_print_on_error(struct btrfs_device *device); 294void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
294void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 295void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
295int btrfs_get_dev_stats(struct btrfs_root *root, 296int btrfs_get_dev_stats(struct btrfs_root *root,
296 struct btrfs_ioctl_get_dev_stats *stats, 297 struct btrfs_ioctl_get_dev_stats *stats);
297 int reset_after_read);
298int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 298int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
299int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 299int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
300 struct btrfs_fs_info *fs_info); 300 struct btrfs_fs_info *fs_info);
diff --git a/fs/inode.c b/fs/inode.c
index 775cbabd4fa..3cc50432046 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1551,6 +1551,8 @@ void touch_atime(struct path *path)
1551 * Btrfs), but since we touch atime while walking down the path we 1551 * Btrfs), but since we touch atime while walking down the path we
1552 * really don't care if we failed to update the atime of the file, 1552 * really don't care if we failed to update the atime of the file,
1553 * so just ignore the return value. 1553 * so just ignore the return value.
1554 * We may also fail on filesystems that have the ability to make parts
1555 * of the fs read only, e.g. subvolumes in Btrfs.
1554 */ 1556 */
1555 update_time(inode, &now, S_ATIME); 1557 update_time(inode, &now, S_ATIME);
1556 mnt_drop_write(mnt); 1558 mnt_drop_write(mnt);