aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/async-thread.c9
-rw-r--r--fs/btrfs/backref.c40
-rw-r--r--fs/btrfs/backref.h7
-rw-r--r--fs/btrfs/btrfs_inode.h14
-rw-r--r--fs/btrfs/check-integrity.c7
-rw-r--r--fs/btrfs/ctree.c775
-rw-r--r--fs/btrfs/ctree.h368
-rw-r--r--fs/btrfs/delayed-inode.c23
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c56
-rw-r--r--fs/btrfs/delayed-ref.h62
-rw-r--r--fs/btrfs/disk-io.c150
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c358
-rw-r--r--fs/btrfs/extent_io.c58
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c42
-rw-r--r--fs/btrfs/ioctl.c467
-rw-r--r--fs/btrfs/ioctl.h97
-rw-r--r--fs/btrfs/locking.c14
-rw-r--r--fs/btrfs/qgroup.c1571
-rw-r--r--fs/btrfs/relocation.c3
-rw-r--r--fs/btrfs/root-tree.c107
-rw-r--r--fs/btrfs/send.c4571
-rw-r--r--fs/btrfs/send.h133
-rw-r--r--fs/btrfs/struct-funcs.c196
-rw-r--r--fs/btrfs/super.c28
-rw-r--r--fs/btrfs/transaction.c101
-rw-r--r--fs/btrfs/transaction.h12
-rw-r--r--fs/btrfs/tree-log.c4
-rw-r--r--fs/btrfs/volumes.c25
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/inode.c2
35 files changed, 8689 insertions, 631 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2befae..d7fcdba141a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o 11 reada.o backref.o ulist.o qgroup.o send.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 42704149b72..58b7d14b08e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers,
206 206
207 work->ordered_func(work); 207 work->ordered_func(work);
208 208
209 /* now take the lock again and call the freeing code */ 209 /* now take the lock again and drop our item from the list */
210 spin_lock(&workers->order_lock); 210 spin_lock(&workers->order_lock);
211 list_del(&work->order_list); 211 list_del(&work->order_list);
212 spin_unlock(&workers->order_lock);
213
214 /*
215 * we don't want to call the ordered free functions
216 * with the lock held though
217 */
212 work->ordered_free(work); 218 work->ordered_free(work);
219 spin_lock(&workers->order_lock);
213 } 220 }
214 221
215 spin_unlock(&workers->order_lock); 222 spin_unlock(&workers->order_lock);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a383c18e74e..a256f3b2a84 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
773 */ 773 */
774static int find_parent_nodes(struct btrfs_trans_handle *trans, 774static int find_parent_nodes(struct btrfs_trans_handle *trans,
775 struct btrfs_fs_info *fs_info, u64 bytenr, 775 struct btrfs_fs_info *fs_info, u64 bytenr,
776 u64 delayed_ref_seq, u64 time_seq, 776 u64 time_seq, struct ulist *refs,
777 struct ulist *refs, struct ulist *roots, 777 struct ulist *roots, const u64 *extent_item_pos)
778 const u64 *extent_item_pos)
779{ 778{
780 struct btrfs_key key; 779 struct btrfs_key key;
781 struct btrfs_path *path; 780 struct btrfs_path *path;
@@ -837,7 +836,7 @@ again:
837 btrfs_put_delayed_ref(&head->node); 836 btrfs_put_delayed_ref(&head->node);
838 goto again; 837 goto again;
839 } 838 }
840 ret = __add_delayed_refs(head, delayed_ref_seq, 839 ret = __add_delayed_refs(head, time_seq,
841 &prefs_delayed); 840 &prefs_delayed);
842 mutex_unlock(&head->mutex); 841 mutex_unlock(&head->mutex);
843 if (ret) { 842 if (ret) {
@@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks)
981 */ 980 */
982static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, 981static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
983 struct btrfs_fs_info *fs_info, u64 bytenr, 982 struct btrfs_fs_info *fs_info, u64 bytenr,
984 u64 delayed_ref_seq, u64 time_seq, 983 u64 time_seq, struct ulist **leafs,
985 struct ulist **leafs,
986 const u64 *extent_item_pos) 984 const u64 *extent_item_pos)
987{ 985{
988 struct ulist *tmp; 986 struct ulist *tmp;
@@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
997 return -ENOMEM; 995 return -ENOMEM;
998 } 996 }
999 997
1000 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, 998 ret = find_parent_nodes(trans, fs_info, bytenr,
1001 time_seq, *leafs, tmp, extent_item_pos); 999 time_seq, *leafs, tmp, extent_item_pos);
1002 ulist_free(tmp); 1000 ulist_free(tmp);
1003 1001
@@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1024 */ 1022 */
1025int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1023int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1026 struct btrfs_fs_info *fs_info, u64 bytenr, 1024 struct btrfs_fs_info *fs_info, u64 bytenr,
1027 u64 delayed_ref_seq, u64 time_seq, 1025 u64 time_seq, struct ulist **roots)
1028 struct ulist **roots)
1029{ 1026{
1030 struct ulist *tmp; 1027 struct ulist *tmp;
1031 struct ulist_node *node = NULL; 1028 struct ulist_node *node = NULL;
@@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1043 1040
1044 ULIST_ITER_INIT(&uiter); 1041 ULIST_ITER_INIT(&uiter);
1045 while (1) { 1042 while (1) {
1046 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, 1043 ret = find_parent_nodes(trans, fs_info, bytenr,
1047 time_seq, tmp, *roots, NULL); 1044 time_seq, tmp, *roots, NULL);
1048 if (ret < 0 && ret != -ENOENT) { 1045 if (ret < 0 && ret != -ENOENT) {
1049 ulist_free(tmp); 1046 ulist_free(tmp);
@@ -1125,10 +1122,10 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1125 * required for the path to fit into the buffer. in that case, the returned 1122 * required for the path to fit into the buffer. in that case, the returned
1126 * value will be smaller than dest. callers must check this! 1123 * value will be smaller than dest. callers must check this!
1127 */ 1124 */
1128static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, 1125char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1129 struct btrfs_inode_ref *iref, 1126 struct btrfs_inode_ref *iref,
1130 struct extent_buffer *eb_in, u64 parent, 1127 struct extent_buffer *eb_in, u64 parent,
1131 char *dest, u32 size) 1128 char *dest, u32 size)
1132{ 1129{
1133 u32 len; 1130 u32 len;
1134 int slot; 1131 int slot;
@@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1376 struct ulist *roots = NULL; 1373 struct ulist *roots = NULL;
1377 struct ulist_node *ref_node = NULL; 1374 struct ulist_node *ref_node = NULL;
1378 struct ulist_node *root_node = NULL; 1375 struct ulist_node *root_node = NULL;
1379 struct seq_list seq_elem = {};
1380 struct seq_list tree_mod_seq_elem = {}; 1376 struct seq_list tree_mod_seq_elem = {};
1381 struct ulist_iterator ref_uiter; 1377 struct ulist_iterator ref_uiter;
1382 struct ulist_iterator root_uiter; 1378 struct ulist_iterator root_uiter;
1383 struct btrfs_delayed_ref_root *delayed_refs = NULL;
1384 1379
1385 pr_debug("resolving all inodes for extent %llu\n", 1380 pr_debug("resolving all inodes for extent %llu\n",
1386 extent_item_objectid); 1381 extent_item_objectid);
@@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1391 trans = btrfs_join_transaction(fs_info->extent_root); 1386 trans = btrfs_join_transaction(fs_info->extent_root);
1392 if (IS_ERR(trans)) 1387 if (IS_ERR(trans))
1393 return PTR_ERR(trans); 1388 return PTR_ERR(trans);
1394
1395 delayed_refs = &trans->transaction->delayed_refs;
1396 spin_lock(&delayed_refs->lock);
1397 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
1398 spin_unlock(&delayed_refs->lock);
1399 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1389 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1400 } 1390 }
1401 1391
1402 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1392 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1403 seq_elem.seq, tree_mod_seq_elem.seq, &refs, 1393 tree_mod_seq_elem.seq, &refs,
1404 &extent_item_pos); 1394 &extent_item_pos);
1405 if (ret) 1395 if (ret)
1406 goto out; 1396 goto out;
@@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1408 ULIST_ITER_INIT(&ref_uiter); 1398 ULIST_ITER_INIT(&ref_uiter);
1409 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { 1399 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
1410 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, 1400 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
1411 seq_elem.seq, 1401 tree_mod_seq_elem.seq, &roots);
1412 tree_mod_seq_elem.seq, &roots);
1413 if (ret) 1402 if (ret)
1414 break; 1403 break;
1415 ULIST_ITER_INIT(&root_uiter); 1404 ULIST_ITER_INIT(&root_uiter);
@@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1431out: 1420out:
1432 if (!search_commit_root) { 1421 if (!search_commit_root) {
1433 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1422 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1434 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
1435 btrfs_end_transaction(trans, fs_info->extent_root); 1423 btrfs_end_transaction(trans, fs_info->extent_root);
1436 } 1424 }
1437 1425
@@ -1543,7 +1531,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
1543 ipath->fspath->bytes_left - s_ptr : 0; 1531 ipath->fspath->bytes_left - s_ptr : 0;
1544 1532
1545 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; 1533 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
1546 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, 1534 fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
1547 inum, fspath_min, bytes_left); 1535 inum, fspath_min, bytes_left);
1548 if (IS_ERR(fspath)) 1536 if (IS_ERR(fspath))
1549 return PTR_ERR(fspath); 1537 return PTR_ERR(fspath);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac7b79..032f4dc7eab 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -21,6 +21,7 @@
21 21
22#include "ioctl.h" 22#include "ioctl.h"
23#include "ulist.h" 23#include "ulist.h"
24#include "extent_io.h"
24 25
25#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) 26#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
26 27
@@ -58,8 +59,10 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
58 59
59int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 60int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
60 struct btrfs_fs_info *fs_info, u64 bytenr, 61 struct btrfs_fs_info *fs_info, u64 bytenr,
61 u64 delayed_ref_seq, u64 time_seq, 62 u64 time_seq, struct ulist **roots);
62 struct ulist **roots); 63char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
64 struct btrfs_inode_ref *iref, struct extent_buffer *eb,
65 u64 parent, char *dest, u32 size);
63 66
64struct btrfs_data_container *init_data_container(u32 total_bytes); 67struct btrfs_data_container *init_data_container(u32 total_bytes);
65struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, 68struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 12394a90d60..5b2ad6bc4fe 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -87,9 +87,6 @@ struct btrfs_inode {
87 /* node for the red-black tree that links inodes in subvolume root */ 87 /* node for the red-black tree that links inodes in subvolume root */
88 struct rb_node rb_node; 88 struct rb_node rb_node;
89 89
90 /* the space_info for where this inode's data allocations are done */
91 struct btrfs_space_info *space_info;
92
93 unsigned long runtime_flags; 90 unsigned long runtime_flags;
94 91
95 /* full 64 bit generation number, struct vfs_inode doesn't have a big 92 /* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -191,11 +188,14 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
191 BTRFS_I(inode)->disk_i_size = size; 188 BTRFS_I(inode)->disk_i_size = size;
192} 189}
193 190
194static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, 191static inline bool btrfs_is_free_space_inode(struct inode *inode)
195 struct inode *inode)
196{ 192{
197 if (root == root->fs_info->tree_root || 193 struct btrfs_root *root = BTRFS_I(inode)->root;
198 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) 194
195 if (root == root->fs_info->tree_root &&
196 btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
197 return true;
198 if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
199 return true; 199 return true;
200 return false; 200 return false;
201} 201}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index da6e9364a5e..9197e2e3340 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1032,6 +1032,7 @@ continue_with_current_leaf_stack_frame:
1032 struct btrfs_disk_key *disk_key; 1032 struct btrfs_disk_key *disk_key;
1033 u8 type; 1033 u8 type;
1034 u32 item_offset; 1034 u32 item_offset;
1035 u32 item_size;
1035 1036
1036 if (disk_item_offset + sizeof(struct btrfs_item) > 1037 if (disk_item_offset + sizeof(struct btrfs_item) >
1037 sf->block_ctx->len) { 1038 sf->block_ctx->len) {
@@ -1047,6 +1048,7 @@ leaf_item_out_of_bounce_error:
1047 disk_item_offset, 1048 disk_item_offset,
1048 sizeof(struct btrfs_item)); 1049 sizeof(struct btrfs_item));
1049 item_offset = le32_to_cpu(disk_item.offset); 1050 item_offset = le32_to_cpu(disk_item.offset);
1051 item_size = le32_to_cpu(disk_item.size);
1050 disk_key = &disk_item.key; 1052 disk_key = &disk_item.key;
1051 type = disk_key->type; 1053 type = disk_key->type;
1052 1054
@@ -1057,14 +1059,13 @@ leaf_item_out_of_bounce_error:
1057 1059
1058 root_item_offset = item_offset + 1060 root_item_offset = item_offset +
1059 offsetof(struct btrfs_leaf, items); 1061 offsetof(struct btrfs_leaf, items);
1060 if (root_item_offset + 1062 if (root_item_offset + item_size >
1061 sizeof(struct btrfs_root_item) >
1062 sf->block_ctx->len) 1063 sf->block_ctx->len)
1063 goto leaf_item_out_of_bounce_error; 1064 goto leaf_item_out_of_bounce_error;
1064 btrfsic_read_from_block_data( 1065 btrfsic_read_from_block_data(
1065 sf->block_ctx, &root_item, 1066 sf->block_ctx, &root_item,
1066 root_item_offset, 1067 root_item_offset,
1067 sizeof(struct btrfs_root_item)); 1068 item_size);
1068 next_bytenr = le64_to_cpu(root_item.bytenr); 1069 next_bytenr = le64_to_cpu(root_item.bytenr);
1069 1070
1070 sf->error = 1071 sf->error =
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b390058..9d7621f271f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -321,7 +321,7 @@ struct tree_mod_root {
321struct tree_mod_elem { 321struct tree_mod_elem {
322 struct rb_node node; 322 struct rb_node node;
323 u64 index; /* shifted logical */ 323 u64 index; /* shifted logical */
324 struct seq_list elem; 324 u64 seq;
325 enum mod_log_op op; 325 enum mod_log_op op;
326 326
327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ 327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +341,50 @@ struct tree_mod_elem {
341 struct tree_mod_root old_root; 341 struct tree_mod_root old_root;
342}; 342};
343 343
344static inline void 344static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
345__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
346{ 345{
347 elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); 346 read_lock(&fs_info->tree_mod_log_lock);
348 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
349} 347}
350 348
351void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 349static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
352 struct seq_list *elem) 350{
351 read_unlock(&fs_info->tree_mod_log_lock);
352}
353
354static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
355{
356 write_lock(&fs_info->tree_mod_log_lock);
357}
358
359static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
360{
361 write_unlock(&fs_info->tree_mod_log_lock);
362}
363
364/*
365 * This adds a new blocker to the tree mod log's blocker list if the @elem
366 * passed does not already have a sequence number set. So when a caller expects
367 * to record tree modifications, it should ensure to set elem->seq to zero
368 * before calling btrfs_get_tree_mod_seq.
369 * Returns a fresh, unused tree log modification sequence number, even if no new
370 * blocker was added.
371 */
372u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
373 struct seq_list *elem)
353{ 374{
354 elem->flags = 1; 375 u64 seq;
376
377 tree_mod_log_write_lock(fs_info);
355 spin_lock(&fs_info->tree_mod_seq_lock); 378 spin_lock(&fs_info->tree_mod_seq_lock);
356 __get_tree_mod_seq(fs_info, elem); 379 if (!elem->seq) {
380 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
381 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
382 }
383 seq = btrfs_inc_tree_mod_seq(fs_info);
357 spin_unlock(&fs_info->tree_mod_seq_lock); 384 spin_unlock(&fs_info->tree_mod_seq_lock);
385 tree_mod_log_write_unlock(fs_info);
386
387 return seq;
358} 388}
359 389
360void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 390void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
371 if (!seq_putting) 401 if (!seq_putting)
372 return; 402 return;
373 403
374 BUG_ON(!(elem->flags & 1));
375 spin_lock(&fs_info->tree_mod_seq_lock); 404 spin_lock(&fs_info->tree_mod_seq_lock);
376 list_del(&elem->list); 405 list_del(&elem->list);
406 elem->seq = 0;
377 407
378 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { 408 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
379 if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { 409 if (cur_elem->seq < min_seq) {
380 if (seq_putting > cur_elem->seq) { 410 if (seq_putting > cur_elem->seq) {
381 /* 411 /*
382 * blocker with lower sequence number exists, we 412 * blocker with lower sequence number exists, we
383 * cannot remove anything from the log 413 * cannot remove anything from the log
384 */ 414 */
385 goto out; 415 spin_unlock(&fs_info->tree_mod_seq_lock);
416 return;
386 } 417 }
387 min_seq = cur_elem->seq; 418 min_seq = cur_elem->seq;
388 } 419 }
389 } 420 }
421 spin_unlock(&fs_info->tree_mod_seq_lock);
422
423 /*
424 * we removed the lowest blocker from the blocker list, so there may be
425 * more processible delayed refs.
426 */
427 wake_up(&fs_info->tree_mod_seq_wait);
390 428
391 /* 429 /*
392 * anything that's lower than the lowest existing (read: blocked) 430 * anything that's lower than the lowest existing (read: blocked)
393 * sequence number can be removed from the tree. 431 * sequence number can be removed from the tree.
394 */ 432 */
395 write_lock(&fs_info->tree_mod_log_lock); 433 tree_mod_log_write_lock(fs_info);
396 tm_root = &fs_info->tree_mod_log; 434 tm_root = &fs_info->tree_mod_log;
397 for (node = rb_first(tm_root); node; node = next) { 435 for (node = rb_first(tm_root); node; node = next) {
398 next = rb_next(node); 436 next = rb_next(node);
399 tm = container_of(node, struct tree_mod_elem, node); 437 tm = container_of(node, struct tree_mod_elem, node);
400 if (tm->elem.seq > min_seq) 438 if (tm->seq > min_seq)
401 continue; 439 continue;
402 rb_erase(node, tm_root); 440 rb_erase(node, tm_root);
403 list_del(&tm->elem.list);
404 kfree(tm); 441 kfree(tm);
405 } 442 }
406 write_unlock(&fs_info->tree_mod_log_lock); 443 tree_mod_log_write_unlock(fs_info);
407out:
408 spin_unlock(&fs_info->tree_mod_seq_lock);
409} 444}
410 445
411/* 446/*
@@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
423 struct rb_node **new; 458 struct rb_node **new;
424 struct rb_node *parent = NULL; 459 struct rb_node *parent = NULL;
425 struct tree_mod_elem *cur; 460 struct tree_mod_elem *cur;
426 int ret = 0;
427 461
428 BUG_ON(!tm || !tm->elem.seq); 462 BUG_ON(!tm || !tm->seq);
429 463
430 write_lock(&fs_info->tree_mod_log_lock);
431 tm_root = &fs_info->tree_mod_log; 464 tm_root = &fs_info->tree_mod_log;
432 new = &tm_root->rb_node; 465 new = &tm_root->rb_node;
433 while (*new) { 466 while (*new) {
@@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
437 new = &((*new)->rb_left); 470 new = &((*new)->rb_left);
438 else if (cur->index > tm->index) 471 else if (cur->index > tm->index)
439 new = &((*new)->rb_right); 472 new = &((*new)->rb_right);
440 else if (cur->elem.seq < tm->elem.seq) 473 else if (cur->seq < tm->seq)
441 new = &((*new)->rb_left); 474 new = &((*new)->rb_left);
442 else if (cur->elem.seq > tm->elem.seq) 475 else if (cur->seq > tm->seq)
443 new = &((*new)->rb_right); 476 new = &((*new)->rb_right);
444 else { 477 else {
445 kfree(tm); 478 kfree(tm);
446 ret = -EEXIST; 479 return -EEXIST;
447 goto unlock;
448 } 480 }
449 } 481 }
450 482
451 rb_link_node(&tm->node, parent, new); 483 rb_link_node(&tm->node, parent, new);
452 rb_insert_color(&tm->node, tm_root); 484 rb_insert_color(&tm->node, tm_root);
453unlock: 485 return 0;
454 write_unlock(&fs_info->tree_mod_log_lock);
455 return ret;
456} 486}
457 487
488/*
489 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
490 * returns zero with the tree_mod_log_lock acquired. The caller must hold
491 * this until all tree mod log insertions are recorded in the rb tree and then
492 * call tree_mod_log_write_unlock() to release.
493 */
458static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, 494static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 struct extent_buffer *eb) { 495 struct extent_buffer *eb) {
460 smp_mb(); 496 smp_mb();
461 if (list_empty(&(fs_info)->tree_mod_seq_list)) 497 if (list_empty(&(fs_info)->tree_mod_seq_list))
462 return 1; 498 return 1;
463 if (!eb) 499 if (eb && btrfs_header_level(eb) == 0)
464 return 0; 500 return 1;
465 if (btrfs_header_level(eb) == 0) 501
502 tree_mod_log_write_lock(fs_info);
503 if (list_empty(&fs_info->tree_mod_seq_list)) {
504 /*
505 * someone emptied the list while we were waiting for the lock.
506 * we must not add to the list when no blocker exists.
507 */
508 tree_mod_log_write_unlock(fs_info);
466 return 1; 509 return 1;
510 }
511
467 return 0; 512 return 0;
468} 513}
469 514
470/* 515/*
471 * This allocates memory and gets a tree modification sequence number when 516 * This allocates memory and gets a tree modification sequence number.
472 * needed.
473 * 517 *
474 * Returns 0 when no sequence number is needed, < 0 on error. 518 * Returns <0 on error.
475 * Returns 1 when a sequence number was added. In this case, 519 * Returns >0 (the added sequence number) on success.
476 * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
477 * after inserting into the rb tree.
478 */ 520 */
479static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, 521static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
480 struct tree_mod_elem **tm_ret) 522 struct tree_mod_elem **tm_ret)
481{ 523{
482 struct tree_mod_elem *tm; 524 struct tree_mod_elem *tm;
483 int seq;
484 525
485 if (tree_mod_dont_log(fs_info, NULL)) 526 /*
486 return 0; 527 * once we switch from spin locks to something different, we should
487 528 * honor the flags parameter here.
488 tm = *tm_ret = kzalloc(sizeof(*tm), flags); 529 */
530 tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
489 if (!tm) 531 if (!tm)
490 return -ENOMEM; 532 return -ENOMEM;
491 533
492 tm->elem.flags = 0; 534 tm->seq = btrfs_inc_tree_mod_seq(fs_info);
493 spin_lock(&fs_info->tree_mod_seq_lock); 535 return tm->seq;
494 if (list_empty(&fs_info->tree_mod_seq_list)) {
495 /*
496 * someone emptied the list while we were waiting for the lock.
497 * we must not add to the list, because no blocker exists. items
498 * are removed from the list only when the existing blocker is
499 * removed from the list.
500 */
501 kfree(tm);
502 seq = 0;
503 spin_unlock(&fs_info->tree_mod_seq_lock);
504 } else {
505 __get_tree_mod_seq(fs_info, &tm->elem);
506 seq = tm->elem.seq;
507 }
508
509 return seq;
510} 536}
511 537
512static noinline int 538static inline int
513tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, 539__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
514 struct extent_buffer *eb, int slot, 540 struct extent_buffer *eb, int slot,
515 enum mod_log_op op, gfp_t flags) 541 enum mod_log_op op, gfp_t flags)
516{ 542{
517 struct tree_mod_elem *tm;
518 int ret; 543 int ret;
544 struct tree_mod_elem *tm;
519 545
520 ret = tree_mod_alloc(fs_info, flags, &tm); 546 ret = tree_mod_alloc(fs_info, flags, &tm);
521 if (ret <= 0) 547 if (ret < 0)
522 return ret; 548 return ret;
523 549
524 tm->index = eb->start >> PAGE_CACHE_SHIFT; 550 tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
530 tm->slot = slot; 556 tm->slot = slot;
531 tm->generation = btrfs_node_ptr_generation(eb, slot); 557 tm->generation = btrfs_node_ptr_generation(eb, slot);
532 558
533 ret = __tree_mod_log_insert(fs_info, tm); 559 return __tree_mod_log_insert(fs_info, tm);
534 spin_unlock(&fs_info->tree_mod_seq_lock); 560}
561
562static noinline int
563tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
564 struct extent_buffer *eb, int slot,
565 enum mod_log_op op, gfp_t flags)
566{
567 int ret;
568
569 if (tree_mod_dont_log(fs_info, eb))
570 return 0;
571
572 ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
573
574 tree_mod_log_write_unlock(fs_info);
535 return ret; 575 return ret;
536} 576}
537 577
@@ -543,6 +583,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
543} 583}
544 584
545static noinline int 585static noinline int
586tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
587 struct extent_buffer *eb, int slot,
588 enum mod_log_op op)
589{
590 return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
591}
592
593static noinline int
546tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, 594tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
547 struct extent_buffer *eb, int dst_slot, int src_slot, 595 struct extent_buffer *eb, int dst_slot, int src_slot,
548 int nr_items, gfp_t flags) 596 int nr_items, gfp_t flags)
@@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
555 return 0; 603 return 0;
556 604
557 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 605 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
558 ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, 606 ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
559 MOD_LOG_KEY_REMOVE_WHILE_MOVING); 607 MOD_LOG_KEY_REMOVE_WHILE_MOVING);
560 BUG_ON(ret < 0); 608 BUG_ON(ret < 0);
561 } 609 }
562 610
563 ret = tree_mod_alloc(fs_info, flags, &tm); 611 ret = tree_mod_alloc(fs_info, flags, &tm);
564 if (ret <= 0) 612 if (ret < 0)
565 return ret; 613 goto out;
566 614
567 tm->index = eb->start >> PAGE_CACHE_SHIFT; 615 tm->index = eb->start >> PAGE_CACHE_SHIFT;
568 tm->slot = src_slot; 616 tm->slot = src_slot;
@@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
571 tm->op = MOD_LOG_MOVE_KEYS; 619 tm->op = MOD_LOG_MOVE_KEYS;
572 620
573 ret = __tree_mod_log_insert(fs_info, tm); 621 ret = __tree_mod_log_insert(fs_info, tm);
574 spin_unlock(&fs_info->tree_mod_seq_lock); 622out:
623 tree_mod_log_write_unlock(fs_info);
575 return ret; 624 return ret;
576} 625}
577 626
627static inline void
628__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
629{
630 int i;
631 u32 nritems;
632 int ret;
633
634 nritems = btrfs_header_nritems(eb);
635 for (i = nritems - 1; i >= 0; i--) {
636 ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
637 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
638 BUG_ON(ret < 0);
639 }
640}
641
578static noinline int 642static noinline int
579tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, 643tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
580 struct extent_buffer *old_root, 644 struct extent_buffer *old_root,
@@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
583 struct tree_mod_elem *tm; 647 struct tree_mod_elem *tm;
584 int ret; 648 int ret;
585 649
650 if (tree_mod_dont_log(fs_info, NULL))
651 return 0;
652
653 __tree_mod_log_free_eb(fs_info, old_root);
654
586 ret = tree_mod_alloc(fs_info, flags, &tm); 655 ret = tree_mod_alloc(fs_info, flags, &tm);
587 if (ret <= 0) 656 if (ret < 0)
588 return ret; 657 goto out;
589 658
590 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 659 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
591 tm->old_root.logical = old_root->start; 660 tm->old_root.logical = old_root->start;
@@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
594 tm->op = MOD_LOG_ROOT_REPLACE; 663 tm->op = MOD_LOG_ROOT_REPLACE;
595 664
596 ret = __tree_mod_log_insert(fs_info, tm); 665 ret = __tree_mod_log_insert(fs_info, tm);
597 spin_unlock(&fs_info->tree_mod_seq_lock); 666out:
667 tree_mod_log_write_unlock(fs_info);
598 return ret; 668 return ret;
599} 669}
600 670
@@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
608 struct tree_mod_elem *found = NULL; 678 struct tree_mod_elem *found = NULL;
609 u64 index = start >> PAGE_CACHE_SHIFT; 679 u64 index = start >> PAGE_CACHE_SHIFT;
610 680
611 read_lock(&fs_info->tree_mod_log_lock); 681 tree_mod_log_read_lock(fs_info);
612 tm_root = &fs_info->tree_mod_log; 682 tm_root = &fs_info->tree_mod_log;
613 node = tm_root->rb_node; 683 node = tm_root->rb_node;
614 while (node) { 684 while (node) {
@@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
617 node = node->rb_left; 687 node = node->rb_left;
618 } else if (cur->index > index) { 688 } else if (cur->index > index) {
619 node = node->rb_right; 689 node = node->rb_right;
620 } else if (cur->elem.seq < min_seq) { 690 } else if (cur->seq < min_seq) {
621 node = node->rb_left; 691 node = node->rb_left;
622 } else if (!smallest) { 692 } else if (!smallest) {
623 /* we want the node with the highest seq */ 693 /* we want the node with the highest seq */
624 if (found) 694 if (found)
625 BUG_ON(found->elem.seq > cur->elem.seq); 695 BUG_ON(found->seq > cur->seq);
626 found = cur; 696 found = cur;
627 node = node->rb_left; 697 node = node->rb_left;
628 } else if (cur->elem.seq > min_seq) { 698 } else if (cur->seq > min_seq) {
629 /* we want the node with the smallest seq */ 699 /* we want the node with the smallest seq */
630 if (found) 700 if (found)
631 BUG_ON(found->elem.seq < cur->elem.seq); 701 BUG_ON(found->seq < cur->seq);
632 found = cur; 702 found = cur;
633 node = node->rb_right; 703 node = node->rb_right;
634 } else { 704 } else {
@@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
636 break; 706 break;
637 } 707 }
638 } 708 }
639 read_unlock(&fs_info->tree_mod_log_lock); 709 tree_mod_log_read_unlock(fs_info);
640 710
641 return found; 711 return found;
642} 712}
@@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
664 return __tree_mod_log_search(fs_info, start, min_seq, 0); 734 return __tree_mod_log_search(fs_info, start, min_seq, 0);
665} 735}
666 736
667static inline void 737static noinline void
668tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, 738tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
669 struct extent_buffer *src, unsigned long dst_offset, 739 struct extent_buffer *src, unsigned long dst_offset,
670 unsigned long src_offset, int nr_items) 740 unsigned long src_offset, int nr_items)
@@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
675 if (tree_mod_dont_log(fs_info, NULL)) 745 if (tree_mod_dont_log(fs_info, NULL))
676 return; 746 return;
677 747
678 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 748 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
749 tree_mod_log_write_unlock(fs_info);
679 return; 750 return;
751 }
680 752
681 /* speed this up by single seq for all operations? */
682 for (i = 0; i < nr_items; i++) { 753 for (i = 0; i < nr_items; i++) {
683 ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, 754 ret = tree_mod_log_insert_key_locked(fs_info, src,
684 MOD_LOG_KEY_REMOVE); 755 i + src_offset,
756 MOD_LOG_KEY_REMOVE);
685 BUG_ON(ret < 0); 757 BUG_ON(ret < 0);
686 ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, 758 ret = tree_mod_log_insert_key_locked(fs_info, dst,
687 MOD_LOG_KEY_ADD); 759 i + dst_offset,
760 MOD_LOG_KEY_ADD);
688 BUG_ON(ret < 0); 761 BUG_ON(ret < 0);
689 } 762 }
763
764 tree_mod_log_write_unlock(fs_info);
690} 765}
691 766
692static inline void 767static inline void
@@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
699 BUG_ON(ret < 0); 774 BUG_ON(ret < 0);
700} 775}
701 776
702static inline void 777static noinline void
703tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
704 struct extent_buffer *eb, 779 struct extent_buffer *eb,
705 struct btrfs_disk_key *disk_key, int slot, int atomic) 780 struct btrfs_disk_key *disk_key, int slot, int atomic)
@@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
712 BUG_ON(ret < 0); 787 BUG_ON(ret < 0);
713} 788}
714 789
715static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 790static noinline void
716 struct extent_buffer *eb) 791tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
717{ 792{
718 int i;
719 int ret;
720 u32 nritems;
721
722 if (tree_mod_dont_log(fs_info, eb)) 793 if (tree_mod_dont_log(fs_info, eb))
723 return; 794 return;
724 795
725 nritems = btrfs_header_nritems(eb); 796 __tree_mod_log_free_eb(fs_info, eb);
726 for (i = nritems - 1; i >= 0; i--) { 797
727 ret = tree_mod_log_insert_key(fs_info, eb, i, 798 tree_mod_log_write_unlock(fs_info);
728 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
729 BUG_ON(ret < 0);
730 }
731} 799}
732 800
733static inline void 801static noinline void
734tree_mod_log_set_root_pointer(struct btrfs_root *root, 802tree_mod_log_set_root_pointer(struct btrfs_root *root,
735 struct extent_buffer *new_root_node) 803 struct extent_buffer *new_root_node)
736{ 804{
737 int ret; 805 int ret;
738 tree_mod_log_free_eb(root->fs_info, root->node);
739 ret = tree_mod_log_insert_root(root->fs_info, root->node, 806 ret = tree_mod_log_insert_root(root->fs_info, root->node,
740 new_root_node, GFP_NOFS); 807 new_root_node, GFP_NOFS);
741 BUG_ON(ret < 0); 808 BUG_ON(ret < 0);
@@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1069 unsigned long p_size = sizeof(struct btrfs_key_ptr); 1136 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1070 1137
1071 n = btrfs_header_nritems(eb); 1138 n = btrfs_header_nritems(eb);
1072 while (tm && tm->elem.seq >= time_seq) { 1139 while (tm && tm->seq >= time_seq) {
1073 /* 1140 /*
1074 * all the operations are recorded with the operator used for 1141 * all the operations are recorded with the operator used for
1075 * the modification. as we're going backwards, we do the 1142 * the modification. as we're going backwards, we do the
@@ -2722,6 +2789,80 @@ done:
2722} 2789}
2723 2790
2724/* 2791/*
2792 * helper to use instead of search slot if no exact match is needed but
2793 * instead the next or previous item should be returned.
2794 * When find_higher is true, the next higher item is returned, the next lower
2795 * otherwise.
2796 * When return_any and find_higher are both true, and no higher item is found,
2797 * return the next lower instead.
2798 * When return_any is true and find_higher is false, and no lower item is found,
2799 * return the next higher instead.
2800 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2801 * < 0 on error
2802 */
2803int btrfs_search_slot_for_read(struct btrfs_root *root,
2804 struct btrfs_key *key, struct btrfs_path *p,
2805 int find_higher, int return_any)
2806{
2807 int ret;
2808 struct extent_buffer *leaf;
2809
2810again:
2811 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2812 if (ret <= 0)
2813 return ret;
2814 /*
2815 * a return value of 1 means the path is at the position where the
2816 * item should be inserted. Normally this is the next bigger item,
2817 * but in case the previous item is the last in a leaf, path points
2818 * to the first free slot in the previous leaf, i.e. at an invalid
2819 * item.
2820 */
2821 leaf = p->nodes[0];
2822
2823 if (find_higher) {
2824 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2825 ret = btrfs_next_leaf(root, p);
2826 if (ret <= 0)
2827 return ret;
2828 if (!return_any)
2829 return 1;
2830 /*
2831 * no higher item found, return the next
2832 * lower instead
2833 */
2834 return_any = 0;
2835 find_higher = 0;
2836 btrfs_release_path(p);
2837 goto again;
2838 }
2839 } else {
2840 if (p->slots[0] == 0) {
2841 ret = btrfs_prev_leaf(root, p);
2842 if (ret < 0)
2843 return ret;
2844 if (!ret) {
2845 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2846 return 0;
2847 }
2848 if (!return_any)
2849 return 1;
2850 /*
2851 * no lower item found, return the next
2852 * higher instead
2853 */
2854 return_any = 0;
2855 find_higher = 1;
2856 btrfs_release_path(p);
2857 goto again;
2858 } else {
2859 --p->slots[0];
2860 }
2861 }
2862 return 0;
2863}
2864
2865/*
2725 * adjust the pointers going up the tree, starting at level 2866 * adjust the pointers going up the tree, starting at level
2726 * making sure the right key of each node is points to 'key'. 2867 * making sure the right key of each node is points to 'key'.
2727 * This is used after shifting pointers to the left, so it stops 2868 * This is used after shifting pointers to the left, so it stops
@@ -4931,6 +5072,431 @@ out:
4931 return ret; 5072 return ret;
4932} 5073}
4933 5074
5075static void tree_move_down(struct btrfs_root *root,
5076 struct btrfs_path *path,
5077 int *level, int root_level)
5078{
5079 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5080 path->slots[*level]);
5081 path->slots[*level - 1] = 0;
5082 (*level)--;
5083}
5084
5085static int tree_move_next_or_upnext(struct btrfs_root *root,
5086 struct btrfs_path *path,
5087 int *level, int root_level)
5088{
5089 int ret = 0;
5090 int nritems;
5091 nritems = btrfs_header_nritems(path->nodes[*level]);
5092
5093 path->slots[*level]++;
5094
5095 while (path->slots[*level] == nritems) {
5096 if (*level == root_level)
5097 return -1;
5098
5099 /* move upnext */
5100 path->slots[*level] = 0;
5101 free_extent_buffer(path->nodes[*level]);
5102 path->nodes[*level] = NULL;
5103 (*level)++;
5104 path->slots[*level]++;
5105
5106 nritems = btrfs_header_nritems(path->nodes[*level]);
5107 ret = 1;
5108 }
5109 return ret;
5110}
5111
5112/*
5113 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5114 * or down.
5115 */
5116static int tree_advance(struct btrfs_root *root,
5117 struct btrfs_path *path,
5118 int *level, int root_level,
5119 int allow_down,
5120 struct btrfs_key *key)
5121{
5122 int ret;
5123
5124 if (*level == 0 || !allow_down) {
5125 ret = tree_move_next_or_upnext(root, path, level, root_level);
5126 } else {
5127 tree_move_down(root, path, level, root_level);
5128 ret = 0;
5129 }
5130 if (ret >= 0) {
5131 if (*level == 0)
5132 btrfs_item_key_to_cpu(path->nodes[*level], key,
5133 path->slots[*level]);
5134 else
5135 btrfs_node_key_to_cpu(path->nodes[*level], key,
5136 path->slots[*level]);
5137 }
5138 return ret;
5139}
5140
5141static int tree_compare_item(struct btrfs_root *left_root,
5142 struct btrfs_path *left_path,
5143 struct btrfs_path *right_path,
5144 char *tmp_buf)
5145{
5146 int cmp;
5147 int len1, len2;
5148 unsigned long off1, off2;
5149
5150 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5151 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5152 if (len1 != len2)
5153 return 1;
5154
5155 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5156 off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5157 right_path->slots[0]);
5158
5159 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5160
5161 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5162 if (cmp)
5163 return 1;
5164 return 0;
5165}
5166
5167#define ADVANCE 1
5168#define ADVANCE_ONLY_NEXT -1
5169
5170/*
5171 * This function compares two trees and calls the provided callback for
5172 * every changed/new/deleted item it finds.
5173 * If shared tree blocks are encountered, whole subtrees are skipped, making
5174 * the compare pretty fast on snapshotted subvolumes.
5175 *
5176 * This currently works on commit roots only. As commit roots are read only,
5177 * we don't do any locking. The commit roots are protected with transactions.
5178 * Transactions are ended and rejoined when a commit is tried in between.
5179 *
5180 * This function checks for modifications done to the trees while comparing.
5181 * If it detects a change, it aborts immediately.
5182 */
5183int btrfs_compare_trees(struct btrfs_root *left_root,
5184 struct btrfs_root *right_root,
5185 btrfs_changed_cb_t changed_cb, void *ctx)
5186{
5187 int ret;
5188 int cmp;
5189 struct btrfs_trans_handle *trans = NULL;
5190 struct btrfs_path *left_path = NULL;
5191 struct btrfs_path *right_path = NULL;
5192 struct btrfs_key left_key;
5193 struct btrfs_key right_key;
5194 char *tmp_buf = NULL;
5195 int left_root_level;
5196 int right_root_level;
5197 int left_level;
5198 int right_level;
5199 int left_end_reached;
5200 int right_end_reached;
5201 int advance_left;
5202 int advance_right;
5203 u64 left_blockptr;
5204 u64 right_blockptr;
5205 u64 left_start_ctransid;
5206 u64 right_start_ctransid;
5207 u64 ctransid;
5208
5209 left_path = btrfs_alloc_path();
5210 if (!left_path) {
5211 ret = -ENOMEM;
5212 goto out;
5213 }
5214 right_path = btrfs_alloc_path();
5215 if (!right_path) {
5216 ret = -ENOMEM;
5217 goto out;
5218 }
5219
5220 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5221 if (!tmp_buf) {
5222 ret = -ENOMEM;
5223 goto out;
5224 }
5225
5226 left_path->search_commit_root = 1;
5227 left_path->skip_locking = 1;
5228 right_path->search_commit_root = 1;
5229 right_path->skip_locking = 1;
5230
5231 spin_lock(&left_root->root_times_lock);
5232 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5233 spin_unlock(&left_root->root_times_lock);
5234
5235 spin_lock(&right_root->root_times_lock);
5236 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5237 spin_unlock(&right_root->root_times_lock);
5238
5239 trans = btrfs_join_transaction(left_root);
5240 if (IS_ERR(trans)) {
5241 ret = PTR_ERR(trans);
5242 trans = NULL;
5243 goto out;
5244 }
5245
5246 /*
5247 * Strategy: Go to the first items of both trees. Then do
5248 *
5249 * If both trees are at level 0
5250 * Compare keys of current items
5251 * If left < right treat left item as new, advance left tree
5252 * and repeat
5253 * If left > right treat right item as deleted, advance right tree
5254 * and repeat
5255 * If left == right do deep compare of items, treat as changed if
5256 * needed, advance both trees and repeat
5257 * If both trees are at the same level but not at level 0
5258 * Compare keys of current nodes/leafs
5259 * If left < right advance left tree and repeat
5260 * If left > right advance right tree and repeat
5261 * If left == right compare blockptrs of the next nodes/leafs
5262 * If they match advance both trees but stay at the same level
5263 * and repeat
5264 * If they don't match advance both trees while allowing to go
5265 * deeper and repeat
5266 * If tree levels are different
5267 * Advance the tree that needs it and repeat
5268 *
5269 * Advancing a tree means:
5270 * If we are at level 0, try to go to the next slot. If that's not
5271 * possible, go one level up and repeat. Stop when we found a level
5272 * where we could go to the next slot. We may at this point be on a
5273 * node or a leaf.
5274 *
5275 * If we are not at level 0 and not on shared tree blocks, go one
5276 * level deeper.
5277 *
5278 * If we are not at level 0 and on shared tree blocks, go one slot to
5279 * the right if possible or go up and right.
5280 */
5281
5282 left_level = btrfs_header_level(left_root->commit_root);
5283 left_root_level = left_level;
5284 left_path->nodes[left_level] = left_root->commit_root;
5285 extent_buffer_get(left_path->nodes[left_level]);
5286
5287 right_level = btrfs_header_level(right_root->commit_root);
5288 right_root_level = right_level;
5289 right_path->nodes[right_level] = right_root->commit_root;
5290 extent_buffer_get(right_path->nodes[right_level]);
5291
5292 if (left_level == 0)
5293 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5294 &left_key, left_path->slots[left_level]);
5295 else
5296 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5297 &left_key, left_path->slots[left_level]);
5298 if (right_level == 0)
5299 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5300 &right_key, right_path->slots[right_level]);
5301 else
5302 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5303 &right_key, right_path->slots[right_level]);
5304
5305 left_end_reached = right_end_reached = 0;
5306 advance_left = advance_right = 0;
5307
5308 while (1) {
5309 /*
5310 * We need to make sure the transaction does not get committed
5311 * while we do anything on commit roots. This means, we need to
5312 * join and leave transactions for every item that we process.
5313 */
5314 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5315 btrfs_release_path(left_path);
5316 btrfs_release_path(right_path);
5317
5318 ret = btrfs_end_transaction(trans, left_root);
5319 trans = NULL;
5320 if (ret < 0)
5321 goto out;
5322 }
5323 /* now rejoin the transaction */
5324 if (!trans) {
5325 trans = btrfs_join_transaction(left_root);
5326 if (IS_ERR(trans)) {
5327 ret = PTR_ERR(trans);
5328 trans = NULL;
5329 goto out;
5330 }
5331
5332 spin_lock(&left_root->root_times_lock);
5333 ctransid = btrfs_root_ctransid(&left_root->root_item);
5334 spin_unlock(&left_root->root_times_lock);
5335 if (ctransid != left_start_ctransid)
5336 left_start_ctransid = 0;
5337
5338 spin_lock(&right_root->root_times_lock);
5339 ctransid = btrfs_root_ctransid(&right_root->root_item);
5340 spin_unlock(&right_root->root_times_lock);
5341 if (ctransid != right_start_ctransid)
5342 right_start_ctransid = 0;
5343
5344 if (!left_start_ctransid || !right_start_ctransid) {
5345 WARN(1, KERN_WARNING
5346 "btrfs: btrfs_compare_tree detected "
5347 "a change in one of the trees while "
5348 "iterating. This is probably a "
5349 "bug.\n");
5350 ret = -EIO;
5351 goto out;
5352 }
5353
5354 /*
5355 * the commit root may have changed, so start again
5356 * where we stopped
5357 */
5358 left_path->lowest_level = left_level;
5359 right_path->lowest_level = right_level;
5360 ret = btrfs_search_slot(NULL, left_root,
5361 &left_key, left_path, 0, 0);
5362 if (ret < 0)
5363 goto out;
5364 ret = btrfs_search_slot(NULL, right_root,
5365 &right_key, right_path, 0, 0);
5366 if (ret < 0)
5367 goto out;
5368 }
5369
5370 if (advance_left && !left_end_reached) {
5371 ret = tree_advance(left_root, left_path, &left_level,
5372 left_root_level,
5373 advance_left != ADVANCE_ONLY_NEXT,
5374 &left_key);
5375 if (ret < 0)
5376 left_end_reached = ADVANCE;
5377 advance_left = 0;
5378 }
5379 if (advance_right && !right_end_reached) {
5380 ret = tree_advance(right_root, right_path, &right_level,
5381 right_root_level,
5382 advance_right != ADVANCE_ONLY_NEXT,
5383 &right_key);
5384 if (ret < 0)
5385 right_end_reached = ADVANCE;
5386 advance_right = 0;
5387 }
5388
5389 if (left_end_reached && right_end_reached) {
5390 ret = 0;
5391 goto out;
5392 } else if (left_end_reached) {
5393 if (right_level == 0) {
5394 ret = changed_cb(left_root, right_root,
5395 left_path, right_path,
5396 &right_key,
5397 BTRFS_COMPARE_TREE_DELETED,
5398 ctx);
5399 if (ret < 0)
5400 goto out;
5401 }
5402 advance_right = ADVANCE;
5403 continue;
5404 } else if (right_end_reached) {
5405 if (left_level == 0) {
5406 ret = changed_cb(left_root, right_root,
5407 left_path, right_path,
5408 &left_key,
5409 BTRFS_COMPARE_TREE_NEW,
5410 ctx);
5411 if (ret < 0)
5412 goto out;
5413 }
5414 advance_left = ADVANCE;
5415 continue;
5416 }
5417
5418 if (left_level == 0 && right_level == 0) {
5419 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5420 if (cmp < 0) {
5421 ret = changed_cb(left_root, right_root,
5422 left_path, right_path,
5423 &left_key,
5424 BTRFS_COMPARE_TREE_NEW,
5425 ctx);
5426 if (ret < 0)
5427 goto out;
5428 advance_left = ADVANCE;
5429 } else if (cmp > 0) {
5430 ret = changed_cb(left_root, right_root,
5431 left_path, right_path,
5432 &right_key,
5433 BTRFS_COMPARE_TREE_DELETED,
5434 ctx);
5435 if (ret < 0)
5436 goto out;
5437 advance_right = ADVANCE;
5438 } else {
5439 ret = tree_compare_item(left_root, left_path,
5440 right_path, tmp_buf);
5441 if (ret) {
5442 ret = changed_cb(left_root, right_root,
5443 left_path, right_path,
5444 &left_key,
5445 BTRFS_COMPARE_TREE_CHANGED,
5446 ctx);
5447 if (ret < 0)
5448 goto out;
5449 }
5450 advance_left = ADVANCE;
5451 advance_right = ADVANCE;
5452 }
5453 } else if (left_level == right_level) {
5454 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5455 if (cmp < 0) {
5456 advance_left = ADVANCE;
5457 } else if (cmp > 0) {
5458 advance_right = ADVANCE;
5459 } else {
5460 left_blockptr = btrfs_node_blockptr(
5461 left_path->nodes[left_level],
5462 left_path->slots[left_level]);
5463 right_blockptr = btrfs_node_blockptr(
5464 right_path->nodes[right_level],
5465 right_path->slots[right_level]);
5466 if (left_blockptr == right_blockptr) {
5467 /*
5468 * As we're on a shared block, don't
5469 * allow to go deeper.
5470 */
5471 advance_left = ADVANCE_ONLY_NEXT;
5472 advance_right = ADVANCE_ONLY_NEXT;
5473 } else {
5474 advance_left = ADVANCE;
5475 advance_right = ADVANCE;
5476 }
5477 }
5478 } else if (left_level < right_level) {
5479 advance_right = ADVANCE;
5480 } else {
5481 advance_left = ADVANCE;
5482 }
5483 }
5484
5485out:
5486 btrfs_free_path(left_path);
5487 btrfs_free_path(right_path);
5488 kfree(tmp_buf);
5489
5490 if (trans) {
5491 if (!ret)
5492 ret = btrfs_end_transaction(trans, left_root);
5493 else
5494 btrfs_end_transaction(trans, left_root);
5495 }
5496
5497 return ret;
5498}
5499
4934/* 5500/*
4935 * this is similar to btrfs_next_leaf, but does not try to preserve 5501 * this is similar to btrfs_next_leaf, but does not try to preserve
4936 * and fixup the path. It looks for and returns the next key in the 5502 * and fixup the path. It looks for and returns the next key in the
@@ -5127,6 +5693,7 @@ again:
5127 * locked. To solve this situation, we give up 5693 * locked. To solve this situation, we give up
5128 * on our lock and cycle. 5694 * on our lock and cycle.
5129 */ 5695 */
5696 free_extent_buffer(next);
5130 btrfs_release_path(path); 5697 btrfs_release_path(path);
5131 cond_resched(); 5698 cond_resched();
5132 goto again; 5699 goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b3907..adb1cd7ceb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
91/* for storing balance parameters in the root tree */ 91/* for storing balance parameters in the root tree */
92#define BTRFS_BALANCE_OBJECTID -4ULL 92#define BTRFS_BALANCE_OBJECTID -4ULL
93 93
94/* holds quota configuration and tracking */
95#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
96
94/* orhpan objectid for tracking unlinked/truncated files */ 97/* orhpan objectid for tracking unlinked/truncated files */
95#define BTRFS_ORPHAN_OBJECTID -5ULL 98#define BTRFS_ORPHAN_OBJECTID -5ULL
96 99
@@ -709,6 +712,36 @@ struct btrfs_root_item {
709 struct btrfs_disk_key drop_progress; 712 struct btrfs_disk_key drop_progress;
710 u8 drop_level; 713 u8 drop_level;
711 u8 level; 714 u8 level;
715
716 /*
717 * The following fields appear after subvol_uuids+subvol_times
718 * were introduced.
719 */
720
721 /*
722 * This generation number is used to test if the new fields are valid
723 * and up to date while reading the root item. Everytime the root item
724 * is written out, the "generation" field is copied into this field. If
725 * anyone ever mounted the fs with an older kernel, we will have
726 * mismatching generation values here and thus must invalidate the
727 * new fields. See btrfs_update_root and btrfs_find_last_root for
728 * details.
729 * the offset of generation_v2 is also used as the start for the memset
730 * when invalidating the fields.
731 */
732 __le64 generation_v2;
733 u8 uuid[BTRFS_UUID_SIZE];
734 u8 parent_uuid[BTRFS_UUID_SIZE];
735 u8 received_uuid[BTRFS_UUID_SIZE];
736 __le64 ctransid; /* updated when an inode changes */
737 __le64 otransid; /* trans when created */
738 __le64 stransid; /* trans when sent. non-zero for received subvol */
739 __le64 rtransid; /* trans when received. non-zero for received subvol */
740 struct btrfs_timespec ctime;
741 struct btrfs_timespec otime;
742 struct btrfs_timespec stime;
743 struct btrfs_timespec rtime;
744 __le64 reserved[8]; /* for future */
712} __attribute__ ((__packed__)); 745} __attribute__ ((__packed__));
713 746
714/* 747/*
@@ -883,6 +916,72 @@ struct btrfs_block_group_item {
883 __le64 flags; 916 __le64 flags;
884} __attribute__ ((__packed__)); 917} __attribute__ ((__packed__));
885 918
919/*
920 * is subvolume quota turned on?
921 */
922#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
923/*
924 * SCANNING is set during the initialization phase
925 */
926#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1)
927/*
928 * Some qgroup entries are known to be out of date,
929 * either because the configuration has changed in a way that
930 * makes a rescan necessary, or because the fs has been mounted
931 * with a non-qgroup-aware version.
932 * Turning qouta off and on again makes it inconsistent, too.
933 */
934#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
935
936#define BTRFS_QGROUP_STATUS_VERSION 1
937
938struct btrfs_qgroup_status_item {
939 __le64 version;
940 /*
941 * the generation is updated during every commit. As older
942 * versions of btrfs are not aware of qgroups, it will be
943 * possible to detect inconsistencies by checking the
944 * generation on mount time
945 */
946 __le64 generation;
947
948 /* flag definitions see above */
949 __le64 flags;
950
951 /*
952 * only used during scanning to record the progress
953 * of the scan. It contains a logical address
954 */
955 __le64 scan;
956} __attribute__ ((__packed__));
957
958struct btrfs_qgroup_info_item {
959 __le64 generation;
960 __le64 rfer;
961 __le64 rfer_cmpr;
962 __le64 excl;
963 __le64 excl_cmpr;
964} __attribute__ ((__packed__));
965
966/* flags definition for qgroup limits */
967#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
968#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
969#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
970#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
971#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
972#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
973
974struct btrfs_qgroup_limit_item {
975 /*
976 * only updated when any of the other values change
977 */
978 __le64 flags;
979 __le64 max_rfer;
980 __le64 max_excl;
981 __le64 rsv_rfer;
982 __le64 rsv_excl;
983} __attribute__ ((__packed__));
984
886struct btrfs_space_info { 985struct btrfs_space_info {
887 u64 flags; 986 u64 flags;
888 987
@@ -1030,6 +1129,13 @@ struct btrfs_block_group_cache {
1030 struct list_head cluster_list; 1129 struct list_head cluster_list;
1031}; 1130};
1032 1131
1132/* delayed seq elem */
1133struct seq_list {
1134 struct list_head list;
1135 u64 seq;
1136};
1137
1138/* fs_info */
1033struct reloc_control; 1139struct reloc_control;
1034struct btrfs_device; 1140struct btrfs_device;
1035struct btrfs_fs_devices; 1141struct btrfs_fs_devices;
@@ -1044,6 +1150,7 @@ struct btrfs_fs_info {
1044 struct btrfs_root *dev_root; 1150 struct btrfs_root *dev_root;
1045 struct btrfs_root *fs_root; 1151 struct btrfs_root *fs_root;
1046 struct btrfs_root *csum_root; 1152 struct btrfs_root *csum_root;
1153 struct btrfs_root *quota_root;
1047 1154
1048 /* the log root tree is a directory of all the other log roots */ 1155 /* the log root tree is a directory of all the other log roots */
1049 struct btrfs_root *log_root_tree; 1156 struct btrfs_root *log_root_tree;
@@ -1144,6 +1251,8 @@ struct btrfs_fs_info {
1144 spinlock_t tree_mod_seq_lock; 1251 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq; 1252 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list; 1253 struct list_head tree_mod_seq_list;
1254 struct seq_list tree_mod_seq_elem;
1255 wait_queue_head_t tree_mod_seq_wait;
1147 1256
1148 /* this protects tree_mod_log */ 1257 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock; 1258 rwlock_t tree_mod_log_lock;
@@ -1240,6 +1349,8 @@ struct btrfs_fs_info {
1240 */ 1349 */
1241 struct list_head space_info; 1350 struct list_head space_info;
1242 1351
1352 struct btrfs_space_info *data_sinfo;
1353
1243 struct reloc_control *reloc_ctl; 1354 struct reloc_control *reloc_ctl;
1244 1355
1245 spinlock_t delalloc_lock; 1356 spinlock_t delalloc_lock;
@@ -1296,6 +1407,29 @@ struct btrfs_fs_info {
1296#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1407#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1297 u32 check_integrity_print_mask; 1408 u32 check_integrity_print_mask;
1298#endif 1409#endif
1410 /*
1411 * quota information
1412 */
1413 unsigned int quota_enabled:1;
1414
1415 /*
1416 * quota_enabled only changes state after a commit. This holds the
1417 * next state.
1418 */
1419 unsigned int pending_quota_state:1;
1420
1421 /* is qgroup tracking in a consistent state? */
1422 u64 qgroup_flags;
1423
1424 /* holds configuration and tracking. Protected by qgroup_lock */
1425 struct rb_root qgroup_tree;
1426 spinlock_t qgroup_lock;
1427
1428 /* list of dirty qgroups to be written at next commit */
1429 struct list_head dirty_qgroups;
1430
1431 /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
1432 u64 qgroup_seq;
1299 1433
1300 /* filesystem state */ 1434 /* filesystem state */
1301 u64 fs_state; 1435 u64 fs_state;
@@ -1416,6 +1550,8 @@ struct btrfs_root {
1416 dev_t anon_dev; 1550 dev_t anon_dev;
1417 1551
1418 int force_cow; 1552 int force_cow;
1553
1554 spinlock_t root_times_lock;
1419}; 1555};
1420 1556
1421struct btrfs_ioctl_defrag_range_args { 1557struct btrfs_ioctl_defrag_range_args {
@@ -1525,6 +1661,30 @@ struct btrfs_ioctl_defrag_range_args {
1525#define BTRFS_DEV_ITEM_KEY 216 1661#define BTRFS_DEV_ITEM_KEY 216
1526#define BTRFS_CHUNK_ITEM_KEY 228 1662#define BTRFS_CHUNK_ITEM_KEY 228
1527 1663
1664/*
1665 * Records the overall state of the qgroups.
1666 * There's only one instance of this key present,
1667 * (0, BTRFS_QGROUP_STATUS_KEY, 0)
1668 */
1669#define BTRFS_QGROUP_STATUS_KEY 240
1670/*
1671 * Records the currently used space of the qgroup.
1672 * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
1673 */
1674#define BTRFS_QGROUP_INFO_KEY 242
1675/*
1676 * Contains the user configured limits for the qgroup.
1677 * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
1678 */
1679#define BTRFS_QGROUP_LIMIT_KEY 244
1680/*
1681 * Records the child-parent relationship of qgroups. For
1682 * each relation, 2 keys are present:
1683 * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
1684 * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
1685 */
1686#define BTRFS_QGROUP_RELATION_KEY 246
1687
1528#define BTRFS_BALANCE_ITEM_KEY 248 1688#define BTRFS_BALANCE_ITEM_KEY 248
1529 1689
1530/* 1690/*
@@ -1621,13 +1781,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1621 offsetof(type, member), \ 1781 offsetof(type, member), \
1622 sizeof(((type *)0)->member))) 1782 sizeof(((type *)0)->member)))
1623 1783
1624#ifndef BTRFS_SETGET_FUNCS 1784#define DECLARE_BTRFS_SETGET_BITS(bits) \
1785u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
1786 unsigned long off, \
1787 struct btrfs_map_token *token); \
1788void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
1789 unsigned long off, u##bits val, \
1790 struct btrfs_map_token *token); \
1791static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
1792 unsigned long off) \
1793{ \
1794 return btrfs_get_token_##bits(eb, ptr, off, NULL); \
1795} \
1796static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
1797 unsigned long off, u##bits val) \
1798{ \
1799 btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
1800}
1801
1802DECLARE_BTRFS_SETGET_BITS(8)
1803DECLARE_BTRFS_SETGET_BITS(16)
1804DECLARE_BTRFS_SETGET_BITS(32)
1805DECLARE_BTRFS_SETGET_BITS(64)
1806
1625#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 1807#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
1626u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ 1808static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
1627u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ 1809{ \
1628void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ 1810 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1629void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); 1811 return btrfs_get_##bits(eb, s, offsetof(type, member)); \
1630#endif 1812} \
1813static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
1814 u##bits val) \
1815{ \
1816 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1817 btrfs_set_##bits(eb, s, offsetof(type, member), val); \
1818} \
1819static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
1820 struct btrfs_map_token *token) \
1821{ \
1822 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1823 return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
1824} \
1825static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
1826 type *s, u##bits val, \
1827 struct btrfs_map_token *token) \
1828{ \
1829 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1830 btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
1831}
1631 1832
1632#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ 1833#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1633static inline u##bits btrfs_##name(struct extent_buffer *eb) \ 1834static inline u##bits btrfs_##name(struct extent_buffer *eb) \
@@ -2189,6 +2390,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
2189BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); 2390BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
2190BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 2391BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
2191 last_snapshot, 64); 2392 last_snapshot, 64);
2393BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
2394 generation_v2, 64);
2395BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
2396 ctransid, 64);
2397BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
2398 otransid, 64);
2399BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
2400 stransid, 64);
2401BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
2402 rtransid, 64);
2192 2403
2193static inline bool btrfs_root_readonly(struct btrfs_root *root) 2404static inline bool btrfs_root_readonly(struct btrfs_root *root)
2194{ 2405{
@@ -2465,6 +2676,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
2465 sizeof(val)); 2676 sizeof(val));
2466} 2677}
2467 2678
2679/* btrfs_qgroup_status_item */
2680BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
2681 generation, 64);
2682BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
2683 version, 64);
2684BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
2685 flags, 64);
2686BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
2687 scan, 64);
2688
2689/* btrfs_qgroup_info_item */
2690BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
2691 generation, 64);
2692BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
2693BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
2694 rfer_cmpr, 64);
2695BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
2696BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
2697 excl_cmpr, 64);
2698
2699BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
2700 struct btrfs_qgroup_info_item, generation, 64);
2701BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
2702 rfer, 64);
2703BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
2704 struct btrfs_qgroup_info_item, rfer_cmpr, 64);
2705BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
2706 excl, 64);
2707BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
2708 struct btrfs_qgroup_info_item, excl_cmpr, 64);
2709
2710/* btrfs_qgroup_limit_item */
2711BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
2712 flags, 64);
2713BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
2714 max_rfer, 64);
2715BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
2716 max_excl, 64);
2717BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2718 rsv_rfer, 64);
2719BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2720 rsv_excl, 64);
2721
2468static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2722static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2469{ 2723{
2470 return sb->s_fs_info; 2724 return sb->s_fs_info;
@@ -2607,7 +2861,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2607 struct btrfs_root *root, u64 group_start); 2861 struct btrfs_root *root, u64 group_start);
2608u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2862u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2609u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2863u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2610void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2611void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2864void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2612int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2865int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2613void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2866void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
@@ -2661,6 +2914,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2661int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); 2914int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2662 2915
2663int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 2916int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2917int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2918 struct btrfs_fs_info *fs_info);
2664/* ctree.c */ 2919/* ctree.c */
2665int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2920int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2666 int level, int *slot); 2921 int level, int *slot);
@@ -2680,6 +2935,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
2680 struct btrfs_key *max_key, 2935 struct btrfs_key *max_key,
2681 struct btrfs_path *path, int cache_only, 2936 struct btrfs_path *path, int cache_only,
2682 u64 min_trans); 2937 u64 min_trans);
2938enum btrfs_compare_tree_result {
2939 BTRFS_COMPARE_TREE_NEW,
2940 BTRFS_COMPARE_TREE_DELETED,
2941 BTRFS_COMPARE_TREE_CHANGED,
2942};
2943typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
2944 struct btrfs_root *right_root,
2945 struct btrfs_path *left_path,
2946 struct btrfs_path *right_path,
2947 struct btrfs_key *key,
2948 enum btrfs_compare_tree_result result,
2949 void *ctx);
2950int btrfs_compare_trees(struct btrfs_root *left_root,
2951 struct btrfs_root *right_root,
2952 btrfs_changed_cb_t cb, void *ctx);
2683int btrfs_cow_block(struct btrfs_trans_handle *trans, 2953int btrfs_cow_block(struct btrfs_trans_handle *trans,
2684 struct btrfs_root *root, struct extent_buffer *buf, 2954 struct btrfs_root *root, struct extent_buffer *buf,
2685 struct extent_buffer *parent, int parent_slot, 2955 struct extent_buffer *parent, int parent_slot,
@@ -2711,6 +2981,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2711 ins_len, int cow); 2981 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 2982int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq); 2983 struct btrfs_path *p, u64 time_seq);
2984int btrfs_search_slot_for_read(struct btrfs_root *root,
2985 struct btrfs_key *key, struct btrfs_path *p,
2986 int find_higher, int return_any);
2714int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2987int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2715 struct btrfs_root *root, struct extent_buffer *parent, 2988 struct btrfs_root *root, struct extent_buffer *parent,
2716 int start_slot, int cache_only, u64 *last_ret, 2989 int start_slot, int cache_only, u64 *last_ret,
@@ -2793,11 +3066,22 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2793 kfree(fs_info->chunk_root); 3066 kfree(fs_info->chunk_root);
2794 kfree(fs_info->dev_root); 3067 kfree(fs_info->dev_root);
2795 kfree(fs_info->csum_root); 3068 kfree(fs_info->csum_root);
3069 kfree(fs_info->quota_root);
2796 kfree(fs_info->super_copy); 3070 kfree(fs_info->super_copy);
2797 kfree(fs_info->super_for_commit); 3071 kfree(fs_info->super_for_commit);
2798 kfree(fs_info); 3072 kfree(fs_info);
2799} 3073}
2800 3074
3075/* tree mod log functions from ctree.c */
3076u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3077 struct seq_list *elem);
3078void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3079 struct seq_list *elem);
3080static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
3081{
3082 return atomic_inc_return(&fs_info->tree_mod_seq);
3083}
3084
2801/* root-item.c */ 3085/* root-item.c */
2802int btrfs_find_root_ref(struct btrfs_root *tree_root, 3086int btrfs_find_root_ref(struct btrfs_root *tree_root,
2803 struct btrfs_path *path, 3087 struct btrfs_path *path,
@@ -2819,6 +3103,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
2819 struct btrfs_root *root, 3103 struct btrfs_root *root,
2820 struct btrfs_key *key, 3104 struct btrfs_key *key,
2821 struct btrfs_root_item *item); 3105 struct btrfs_root_item *item);
3106void btrfs_read_root_item(struct btrfs_root *root,
3107 struct extent_buffer *eb, int slot,
3108 struct btrfs_root_item *item);
2822int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 3109int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2823 btrfs_root_item *item, struct btrfs_key *key); 3110 btrfs_root_item *item, struct btrfs_key *key);
2824int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 3111int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2826,6 +3113,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2826void btrfs_set_root_node(struct btrfs_root_item *item, 3113void btrfs_set_root_node(struct btrfs_root_item *item,
2827 struct extent_buffer *node); 3114 struct extent_buffer *node);
2828void btrfs_check_and_init_root_item(struct btrfs_root_item *item); 3115void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
3116void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3117 struct btrfs_root *root);
2829 3118
2830/* dir-item.c */ 3119/* dir-item.c */
2831int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3120int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -3061,6 +3350,23 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
3061 struct btrfs_root *root, const char *function, 3350 struct btrfs_root *root, const char *function,
3062 unsigned int line, int errno); 3351 unsigned int line, int errno);
3063 3352
3353#define btrfs_set_fs_incompat(__fs_info, opt) \
3354 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
3355
3356static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3357 u64 flag)
3358{
3359 struct btrfs_super_block *disk_super;
3360 u64 features;
3361
3362 disk_super = fs_info->super_copy;
3363 features = btrfs_super_incompat_flags(disk_super);
3364 if (!(features & flag)) {
3365 features |= flag;
3366 btrfs_set_super_incompat_flags(disk_super, features);
3367 }
3368}
3369
3064#define btrfs_abort_transaction(trans, root, errno) \ 3370#define btrfs_abort_transaction(trans, root, errno) \
3065do { \ 3371do { \
3066 __btrfs_abort_transaction(trans, root, __func__, \ 3372 __btrfs_abort_transaction(trans, root, __func__, \
@@ -3156,17 +3462,49 @@ void btrfs_reada_detach(void *handle);
3156int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3462int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3157 u64 start, int err); 3463 u64 start, int err);
3158 3464
3159/* delayed seq elem */ 3465/* qgroup.c */
3160struct seq_list { 3466struct qgroup_update {
3161 struct list_head list; 3467 struct list_head list;
3162 u64 seq; 3468 struct btrfs_delayed_ref_node *node;
3163 u32 flags; 3469 struct btrfs_delayed_extent_op *extent_op;
3164}; 3470};
3165 3471
3166void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 3472int btrfs_quota_enable(struct btrfs_trans_handle *trans,
3167 struct seq_list *elem); 3473 struct btrfs_fs_info *fs_info);
3168void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 3474int btrfs_quota_disable(struct btrfs_trans_handle *trans,
3169 struct seq_list *elem); 3475 struct btrfs_fs_info *fs_info);
3476int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
3477int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
3478 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3479int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
3480 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3481int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
3482 struct btrfs_fs_info *fs_info, u64 qgroupid,
3483 char *name);
3484int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
3485 struct btrfs_fs_info *fs_info, u64 qgroupid);
3486int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
3487 struct btrfs_fs_info *fs_info, u64 qgroupid,
3488 struct btrfs_qgroup_limit *limit);
3489int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
3490void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
3491struct btrfs_delayed_extent_op;
3492int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
3493 struct btrfs_delayed_ref_node *node,
3494 struct btrfs_delayed_extent_op *extent_op);
3495int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
3496 struct btrfs_fs_info *fs_info,
3497 struct btrfs_delayed_ref_node *node,
3498 struct btrfs_delayed_extent_op *extent_op);
3499int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
3500 struct btrfs_fs_info *fs_info);
3501int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
3502 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
3503 struct btrfs_qgroup_inherit *inherit);
3504int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
3505void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
3506
3507void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
3170 3508
3171static inline int is_fstree(u64 rootid) 3509static inline int is_fstree(u64 rootid)
3172{ 3510{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2399f408691..335605c8cea 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node(
62 INIT_LIST_HEAD(&delayed_node->n_list); 62 INIT_LIST_HEAD(&delayed_node->n_list);
63 INIT_LIST_HEAD(&delayed_node->p_list); 63 INIT_LIST_HEAD(&delayed_node->p_list);
64 delayed_node->bytes_reserved = 0; 64 delayed_node->bytes_reserved = 0;
65 memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
65} 66}
66 67
67static inline int btrfs_is_continuous_delayed_item( 68static inline int btrfs_is_continuous_delayed_item(
@@ -1113,8 +1114,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1113 * Returns < 0 on error and returns with an aborted transaction with any 1114 * Returns < 0 on error and returns with an aborted transaction with any
1114 * outstanding delayed items cleaned up. 1115 * outstanding delayed items cleaned up.
1115 */ 1116 */
1116int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 1117static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1117 struct btrfs_root *root) 1118 struct btrfs_root *root, int nr)
1118{ 1119{
1119 struct btrfs_root *curr_root = root; 1120 struct btrfs_root *curr_root = root;
1120 struct btrfs_delayed_root *delayed_root; 1121 struct btrfs_delayed_root *delayed_root;
@@ -1122,6 +1123,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1122 struct btrfs_path *path; 1123 struct btrfs_path *path;
1123 struct btrfs_block_rsv *block_rsv; 1124 struct btrfs_block_rsv *block_rsv;
1124 int ret = 0; 1125 int ret = 0;
1126 bool count = (nr > 0);
1125 1127
1126 if (trans->aborted) 1128 if (trans->aborted)
1127 return -EIO; 1129 return -EIO;
@@ -1137,7 +1139,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1137 delayed_root = btrfs_get_delayed_root(root); 1139 delayed_root = btrfs_get_delayed_root(root);
1138 1140
1139 curr_node = btrfs_first_delayed_node(delayed_root); 1141 curr_node = btrfs_first_delayed_node(delayed_root);
1140 while (curr_node) { 1142 while (curr_node && (!count || (count && nr--))) {
1141 curr_root = curr_node->root; 1143 curr_root = curr_node->root;
1142 ret = btrfs_insert_delayed_items(trans, path, curr_root, 1144 ret = btrfs_insert_delayed_items(trans, path, curr_root,
1143 curr_node); 1145 curr_node);
@@ -1149,6 +1151,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1149 path, curr_node); 1151 path, curr_node);
1150 if (ret) { 1152 if (ret) {
1151 btrfs_release_delayed_node(curr_node); 1153 btrfs_release_delayed_node(curr_node);
1154 curr_node = NULL;
1152 btrfs_abort_transaction(trans, root, ret); 1155 btrfs_abort_transaction(trans, root, ret);
1153 break; 1156 break;
1154 } 1157 }
@@ -1158,12 +1161,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1158 btrfs_release_delayed_node(prev_node); 1161 btrfs_release_delayed_node(prev_node);
1159 } 1162 }
1160 1163
1164 if (curr_node)
1165 btrfs_release_delayed_node(curr_node);
1161 btrfs_free_path(path); 1166 btrfs_free_path(path);
1162 trans->block_rsv = block_rsv; 1167 trans->block_rsv = block_rsv;
1163 1168
1164 return ret; 1169 return ret;
1165} 1170}
1166 1171
1172int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1173 struct btrfs_root *root)
1174{
1175 return __btrfs_run_delayed_items(trans, root, -1);
1176}
1177
1178int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
1179 struct btrfs_root *root, int nr)
1180{
1181 return __btrfs_run_delayed_items(trans, root, nr);
1182}
1183
1167static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1184static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1168 struct btrfs_delayed_node *node) 1185 struct btrfs_delayed_node *node)
1169{ 1186{
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f5aa4023d3e..4f808e1baee 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode);
107 107
108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root); 109 struct btrfs_root *root);
110int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root, int nr);
110 112
111void btrfs_balance_delayed_items(struct btrfs_root *root); 113void btrfs_balance_delayed_items(struct btrfs_root *root);
112 114
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b04790..da7419ed01b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
233 return 0; 233 return 0;
234} 234}
235 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 236int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
237 struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq) 238 u64 seq)
238{ 239{
239 struct seq_list *elem; 240 struct seq_list *elem;
240 241 int ret = 0;
241 assert_spin_locked(&delayed_refs->lock); 242
242 if (list_empty(&delayed_refs->seq_head)) 243 spin_lock(&fs_info->tree_mod_seq_lock);
243 return 0; 244 if (!list_empty(&fs_info->tree_mod_seq_list)) {
244 245 elem = list_first_entry(&fs_info->tree_mod_seq_list,
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); 246 struct seq_list, list);
246 if (seq >= elem->seq) { 247 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", 248 pr_debug("holding back delayed_ref %llu, lowest is "
248 seq, elem->seq, delayed_refs); 249 "%llu (%p)\n", seq, elem->seq, delayed_refs);
249 return 1; 250 ret = 1;
251 }
250 } 252 }
251 return 0; 253
254 spin_unlock(&fs_info->tree_mod_seq_lock);
255 return ret;
252} 256}
253 257
254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 258int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -525,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 529 ref->is_head = 0;
526 ref->in_tree = 1; 530 ref->in_tree = 1;
527 531
528 if (is_fstree(ref_root)) 532 if (need_ref_seq(for_cow, ref_root))
529 seq = inc_delayed_seq(delayed_refs); 533 seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
530 ref->seq = seq; 534 ref->seq = seq;
531 535
532 full_ref = btrfs_delayed_node_to_tree_ref(ref); 536 full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -584,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 588 ref->is_head = 0;
585 ref->in_tree = 1; 589 ref->in_tree = 1;
586 590
587 if (is_fstree(ref_root)) 591 if (need_ref_seq(for_cow, ref_root))
588 seq = inc_delayed_seq(delayed_refs); 592 seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
589 ref->seq = seq; 593 ref->seq = seq;
590 594
591 full_ref = btrfs_delayed_node_to_data_ref(ref); 595 full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -658,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 662 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 663 num_bytes, parent, ref_root, level, action,
660 for_cow); 664 for_cow);
661 if (!is_fstree(ref_root) && 665 if (!need_ref_seq(for_cow, ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 666 waitqueue_active(&fs_info->tree_mod_seq_wait))
663 wake_up(&delayed_refs->seq_wait); 667 wake_up(&fs_info->tree_mod_seq_wait);
664 spin_unlock(&delayed_refs->lock); 668 spin_unlock(&delayed_refs->lock);
669 if (need_ref_seq(for_cow, ref_root))
670 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
665 671
666 return 0; 672 return 0;
667} 673}
@@ -707,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 713 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
708 num_bytes, parent, ref_root, owner, offset, 714 num_bytes, parent, ref_root, owner, offset,
709 action, for_cow); 715 action, for_cow);
710 if (!is_fstree(ref_root) && 716 if (!need_ref_seq(for_cow, ref_root) &&
711 waitqueue_active(&delayed_refs->seq_wait)) 717 waitqueue_active(&fs_info->tree_mod_seq_wait))
712 wake_up(&delayed_refs->seq_wait); 718 wake_up(&fs_info->tree_mod_seq_wait);
713 spin_unlock(&delayed_refs->lock); 719 spin_unlock(&delayed_refs->lock);
720 if (need_ref_seq(for_cow, ref_root))
721 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
714 722
715 return 0; 723 return 0;
716} 724}
@@ -736,8 +744,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
736 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
737 extent_op->is_data); 745 extent_op->is_data);
738 746
739 if (waitqueue_active(&delayed_refs->seq_wait)) 747 if (waitqueue_active(&fs_info->tree_mod_seq_wait))
740 wake_up(&delayed_refs->seq_wait); 748 wake_up(&fs_info->tree_mod_seq_wait);
741 spin_unlock(&delayed_refs->lock); 749 spin_unlock(&delayed_refs->lock);
742 return 0; 750 return 0;
743} 751}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927fb995..0d7c90c366b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
139 int flushing; 139 int flushing;
140 140
141 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
162}; 142};
163 143
164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 144static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -195,34 +175,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 175int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 176 struct list_head *cluster, u64 search_start);
197 177
198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 178int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
199{ 179 struct btrfs_delayed_ref_root *delayed_refs,
200 assert_spin_locked(&delayed_refs->lock); 180 u64 seq);
201 ++delayed_refs->seq;
202 return delayed_refs->seq;
203}
204 181
205static inline void 182/*
206btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 183 * delayed refs with a ref_seq > 0 must be held back during backref walking.
207 struct seq_list *elem) 184 * this only applies to items in one of the fs-trees. for_cow items never need
185 * to be held back, so they won't get a ref_seq number.
186 */
187static inline int need_ref_seq(int for_cow, u64 rootid)
208{ 188{
209 assert_spin_locked(&delayed_refs->lock); 189 if (for_cow)
210 elem->seq = delayed_refs->seq; 190 return 0;
211 list_add_tail(&elem->list, &delayed_refs->seq_head);
212}
213 191
214static inline void 192 if (rootid == BTRFS_FS_TREE_OBJECTID)
215btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 193 return 1;
216 struct seq_list *elem)
217{
218 spin_lock(&delayed_refs->lock);
219 list_del(&elem->list);
220 wake_up(&delayed_refs->seq_wait);
221 spin_unlock(&delayed_refs->lock);
222}
223 194
224int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 195 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
225 u64 seq); 196 return 1;
197
198 return 0;
199}
226 200
227/* 201/*
228 * a node might live in a head or a regular ref, this lets you 202 * a node might live in a head or a regular ref, this lets you
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2936ca49b3b..502b20c56e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
407 break; 407 break;
408 } 408 }
409 409
410 if (failed && !ret) 410 if (failed && !ret && failed_mirror)
411 repair_eb_io_failure(root, eb, failed_mirror); 411 repair_eb_io_failure(root, eb, failed_mirror);
412 412
413 return ret; 413 return ret;
@@ -1182,6 +1182,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1182 root->defrag_running = 0; 1182 root->defrag_running = 0;
1183 root->root_key.objectid = objectid; 1183 root->root_key.objectid = objectid;
1184 root->anon_dev = 0; 1184 root->anon_dev = 0;
1185
1186 spin_lock_init(&root->root_times_lock);
1185} 1187}
1186 1188
1187static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1189static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1225,6 +1227,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1225 return root; 1227 return root;
1226} 1228}
1227 1229
1230struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1231 struct btrfs_fs_info *fs_info,
1232 u64 objectid)
1233{
1234 struct extent_buffer *leaf;
1235 struct btrfs_root *tree_root = fs_info->tree_root;
1236 struct btrfs_root *root;
1237 struct btrfs_key key;
1238 int ret = 0;
1239 u64 bytenr;
1240
1241 root = btrfs_alloc_root(fs_info);
1242 if (!root)
1243 return ERR_PTR(-ENOMEM);
1244
1245 __setup_root(tree_root->nodesize, tree_root->leafsize,
1246 tree_root->sectorsize, tree_root->stripesize,
1247 root, fs_info, objectid);
1248 root->root_key.objectid = objectid;
1249 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1250 root->root_key.offset = 0;
1251
1252 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
1253 0, objectid, NULL, 0, 0, 0);
1254 if (IS_ERR(leaf)) {
1255 ret = PTR_ERR(leaf);
1256 goto fail;
1257 }
1258
1259 bytenr = leaf->start;
1260 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1261 btrfs_set_header_bytenr(leaf, leaf->start);
1262 btrfs_set_header_generation(leaf, trans->transid);
1263 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1264 btrfs_set_header_owner(leaf, objectid);
1265 root->node = leaf;
1266
1267 write_extent_buffer(leaf, fs_info->fsid,
1268 (unsigned long)btrfs_header_fsid(leaf),
1269 BTRFS_FSID_SIZE);
1270 write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
1271 (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
1272 BTRFS_UUID_SIZE);
1273 btrfs_mark_buffer_dirty(leaf);
1274
1275 root->commit_root = btrfs_root_node(root);
1276 root->track_dirty = 1;
1277
1278
1279 root->root_item.flags = 0;
1280 root->root_item.byte_limit = 0;
1281 btrfs_set_root_bytenr(&root->root_item, leaf->start);
1282 btrfs_set_root_generation(&root->root_item, trans->transid);
1283 btrfs_set_root_level(&root->root_item, 0);
1284 btrfs_set_root_refs(&root->root_item, 1);
1285 btrfs_set_root_used(&root->root_item, leaf->len);
1286 btrfs_set_root_last_snapshot(&root->root_item, 0);
1287 btrfs_set_root_dirid(&root->root_item, 0);
1288 root->root_item.drop_level = 0;
1289
1290 key.objectid = objectid;
1291 key.type = BTRFS_ROOT_ITEM_KEY;
1292 key.offset = 0;
1293 ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1294 if (ret)
1295 goto fail;
1296
1297 btrfs_tree_unlock(leaf);
1298
1299fail:
1300 if (ret)
1301 return ERR_PTR(ret);
1302
1303 return root;
1304}
1305
1228static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1306static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1229 struct btrfs_fs_info *fs_info) 1307 struct btrfs_fs_info *fs_info)
1230{ 1308{
@@ -1326,6 +1404,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1326 u64 generation; 1404 u64 generation;
1327 u32 blocksize; 1405 u32 blocksize;
1328 int ret = 0; 1406 int ret = 0;
1407 int slot;
1329 1408
1330 root = btrfs_alloc_root(fs_info); 1409 root = btrfs_alloc_root(fs_info);
1331 if (!root) 1410 if (!root)
@@ -1352,9 +1431,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1352 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1431 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1353 if (ret == 0) { 1432 if (ret == 0) {
1354 l = path->nodes[0]; 1433 l = path->nodes[0];
1355 read_extent_buffer(l, &root->root_item, 1434 slot = path->slots[0];
1356 btrfs_item_ptr_offset(l, path->slots[0]), 1435 btrfs_read_root_item(tree_root, l, slot, &root->root_item);
1357 sizeof(root->root_item));
1358 memcpy(&root->root_key, location, sizeof(*location)); 1436 memcpy(&root->root_key, location, sizeof(*location));
1359 } 1437 }
1360 btrfs_free_path(path); 1438 btrfs_free_path(path);
@@ -1396,6 +1474,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1396 return fs_info->dev_root; 1474 return fs_info->dev_root;
1397 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) 1475 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1398 return fs_info->csum_root; 1476 return fs_info->csum_root;
1477 if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1478 return fs_info->quota_root ? fs_info->quota_root :
1479 ERR_PTR(-ENOENT);
1399again: 1480again:
1400 spin_lock(&fs_info->fs_roots_radix_lock); 1481 spin_lock(&fs_info->fs_roots_radix_lock);
1401 root = radix_tree_lookup(&fs_info->fs_roots_radix, 1482 root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1823,6 +1904,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1823 free_extent_buffer(info->extent_root->commit_root); 1904 free_extent_buffer(info->extent_root->commit_root);
1824 free_extent_buffer(info->csum_root->node); 1905 free_extent_buffer(info->csum_root->node);
1825 free_extent_buffer(info->csum_root->commit_root); 1906 free_extent_buffer(info->csum_root->commit_root);
1907 if (info->quota_root) {
1908 free_extent_buffer(info->quota_root->node);
1909 free_extent_buffer(info->quota_root->commit_root);
1910 }
1826 1911
1827 info->tree_root->node = NULL; 1912 info->tree_root->node = NULL;
1828 info->tree_root->commit_root = NULL; 1913 info->tree_root->commit_root = NULL;
@@ -1832,6 +1917,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1832 info->extent_root->commit_root = NULL; 1917 info->extent_root->commit_root = NULL;
1833 info->csum_root->node = NULL; 1918 info->csum_root->node = NULL;
1834 info->csum_root->commit_root = NULL; 1919 info->csum_root->commit_root = NULL;
1920 if (info->quota_root) {
1921 info->quota_root->node = NULL;
1922 info->quota_root->commit_root = NULL;
1923 }
1835 1924
1836 if (chunk_root) { 1925 if (chunk_root) {
1837 free_extent_buffer(info->chunk_root->node); 1926 free_extent_buffer(info->chunk_root->node);
@@ -1862,6 +1951,7 @@ int open_ctree(struct super_block *sb,
1862 struct btrfs_root *csum_root; 1951 struct btrfs_root *csum_root;
1863 struct btrfs_root *chunk_root; 1952 struct btrfs_root *chunk_root;
1864 struct btrfs_root *dev_root; 1953 struct btrfs_root *dev_root;
1954 struct btrfs_root *quota_root;
1865 struct btrfs_root *log_tree_root; 1955 struct btrfs_root *log_tree_root;
1866 int ret; 1956 int ret;
1867 int err = -EINVAL; 1957 int err = -EINVAL;
@@ -1873,9 +1963,10 @@ int open_ctree(struct super_block *sb,
1873 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); 1963 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1874 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 1964 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1875 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 1965 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1966 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
1876 1967
1877 if (!tree_root || !extent_root || !csum_root || 1968 if (!tree_root || !extent_root || !csum_root ||
1878 !chunk_root || !dev_root) { 1969 !chunk_root || !dev_root || !quota_root) {
1879 err = -ENOMEM; 1970 err = -ENOMEM;
1880 goto fail; 1971 goto fail;
1881 } 1972 }
@@ -1944,6 +2035,8 @@ int open_ctree(struct super_block *sb,
1944 fs_info->free_chunk_space = 0; 2035 fs_info->free_chunk_space = 0;
1945 fs_info->tree_mod_log = RB_ROOT; 2036 fs_info->tree_mod_log = RB_ROOT;
1946 2037
2038 init_waitqueue_head(&fs_info->tree_mod_seq_wait);
2039
1947 /* readahead state */ 2040 /* readahead state */
1948 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2041 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1949 spin_lock_init(&fs_info->reada_lock); 2042 spin_lock_init(&fs_info->reada_lock);
@@ -2032,6 +2125,13 @@ int open_ctree(struct super_block *sb,
2032 init_rwsem(&fs_info->cleanup_work_sem); 2125 init_rwsem(&fs_info->cleanup_work_sem);
2033 init_rwsem(&fs_info->subvol_sem); 2126 init_rwsem(&fs_info->subvol_sem);
2034 2127
2128 spin_lock_init(&fs_info->qgroup_lock);
2129 fs_info->qgroup_tree = RB_ROOT;
2130 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2131 fs_info->qgroup_seq = 1;
2132 fs_info->quota_enabled = 0;
2133 fs_info->pending_quota_state = 0;
2134
2035 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2135 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2036 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 2136 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2037 2137
@@ -2244,7 +2344,7 @@ int open_ctree(struct super_block *sb,
2244 ret |= btrfs_start_workers(&fs_info->caching_workers); 2344 ret |= btrfs_start_workers(&fs_info->caching_workers);
2245 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2345 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2246 if (ret) { 2346 if (ret) {
2247 ret = -ENOMEM; 2347 err = -ENOMEM;
2248 goto fail_sb_buffer; 2348 goto fail_sb_buffer;
2249 } 2349 }
2250 2350
@@ -2356,6 +2456,17 @@ retry_root_backup:
2356 goto recovery_tree_root; 2456 goto recovery_tree_root;
2357 csum_root->track_dirty = 1; 2457 csum_root->track_dirty = 1;
2358 2458
2459 ret = find_and_setup_root(tree_root, fs_info,
2460 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
2461 if (ret) {
2462 kfree(quota_root);
2463 quota_root = fs_info->quota_root = NULL;
2464 } else {
2465 quota_root->track_dirty = 1;
2466 fs_info->quota_enabled = 1;
2467 fs_info->pending_quota_state = 1;
2468 }
2469
2359 fs_info->generation = generation; 2470 fs_info->generation = generation;
2360 fs_info->last_trans_committed = generation; 2471 fs_info->last_trans_committed = generation;
2361 2472
@@ -2415,6 +2526,9 @@ retry_root_backup:
2415 " integrity check module %s\n", sb->s_id); 2526 " integrity check module %s\n", sb->s_id);
2416 } 2527 }
2417#endif 2528#endif
2529 ret = btrfs_read_qgroup_config(fs_info);
2530 if (ret)
2531 goto fail_trans_kthread;
2418 2532
2419 /* do not make disk changes in broken FS */ 2533 /* do not make disk changes in broken FS */
2420 if (btrfs_super_log_root(disk_super) != 0 && 2534 if (btrfs_super_log_root(disk_super) != 0 &&
@@ -2425,7 +2539,7 @@ retry_root_backup:
2425 printk(KERN_WARNING "Btrfs log replay required " 2539 printk(KERN_WARNING "Btrfs log replay required "
2426 "on RO media\n"); 2540 "on RO media\n");
2427 err = -EIO; 2541 err = -EIO;
2428 goto fail_trans_kthread; 2542 goto fail_qgroup;
2429 } 2543 }
2430 blocksize = 2544 blocksize =
2431 btrfs_level_size(tree_root, 2545 btrfs_level_size(tree_root,
@@ -2434,7 +2548,7 @@ retry_root_backup:
2434 log_tree_root = btrfs_alloc_root(fs_info); 2548 log_tree_root = btrfs_alloc_root(fs_info);
2435 if (!log_tree_root) { 2549 if (!log_tree_root) {
2436 err = -ENOMEM; 2550 err = -ENOMEM;
2437 goto fail_trans_kthread; 2551 goto fail_qgroup;
2438 } 2552 }
2439 2553
2440 __setup_root(nodesize, leafsize, sectorsize, stripesize, 2554 __setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2466,15 +2580,15 @@ retry_root_backup:
2466 2580
2467 if (!(sb->s_flags & MS_RDONLY)) { 2581 if (!(sb->s_flags & MS_RDONLY)) {
2468 ret = btrfs_cleanup_fs_roots(fs_info); 2582 ret = btrfs_cleanup_fs_roots(fs_info);
2469 if (ret) { 2583 if (ret)
2470 } 2584 goto fail_trans_kthread;
2471 2585
2472 ret = btrfs_recover_relocation(tree_root); 2586 ret = btrfs_recover_relocation(tree_root);
2473 if (ret < 0) { 2587 if (ret < 0) {
2474 printk(KERN_WARNING 2588 printk(KERN_WARNING
2475 "btrfs: failed to recover relocation\n"); 2589 "btrfs: failed to recover relocation\n");
2476 err = -EINVAL; 2590 err = -EINVAL;
2477 goto fail_trans_kthread; 2591 goto fail_qgroup;
2478 } 2592 }
2479 } 2593 }
2480 2594
@@ -2484,10 +2598,10 @@ retry_root_backup:
2484 2598
2485 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2599 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2486 if (!fs_info->fs_root) 2600 if (!fs_info->fs_root)
2487 goto fail_trans_kthread; 2601 goto fail_qgroup;
2488 if (IS_ERR(fs_info->fs_root)) { 2602 if (IS_ERR(fs_info->fs_root)) {
2489 err = PTR_ERR(fs_info->fs_root); 2603 err = PTR_ERR(fs_info->fs_root);
2490 goto fail_trans_kthread; 2604 goto fail_qgroup;
2491 } 2605 }
2492 2606
2493 if (sb->s_flags & MS_RDONLY) 2607 if (sb->s_flags & MS_RDONLY)
@@ -2511,6 +2625,8 @@ retry_root_backup:
2511 2625
2512 return 0; 2626 return 0;
2513 2627
2628fail_qgroup:
2629 btrfs_free_qgroup_config(fs_info);
2514fail_trans_kthread: 2630fail_trans_kthread:
2515 kthread_stop(fs_info->transaction_kthread); 2631 kthread_stop(fs_info->transaction_kthread);
2516fail_cleaner: 2632fail_cleaner:
@@ -3109,6 +3225,8 @@ int close_ctree(struct btrfs_root *root)
3109 fs_info->closing = 2; 3225 fs_info->closing = 2;
3110 smp_mb(); 3226 smp_mb();
3111 3227
3228 btrfs_free_qgroup_config(root->fs_info);
3229
3112 if (fs_info->delalloc_bytes) { 3230 if (fs_info->delalloc_bytes) {
3113 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3231 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3114 (unsigned long long)fs_info->delalloc_bytes); 3232 (unsigned long long)fs_info->delalloc_bytes);
@@ -3128,6 +3246,10 @@ int close_ctree(struct btrfs_root *root)
3128 free_extent_buffer(fs_info->dev_root->commit_root); 3246 free_extent_buffer(fs_info->dev_root->commit_root);
3129 free_extent_buffer(fs_info->csum_root->node); 3247 free_extent_buffer(fs_info->csum_root->node);
3130 free_extent_buffer(fs_info->csum_root->commit_root); 3248 free_extent_buffer(fs_info->csum_root->commit_root);
3249 if (fs_info->quota_root) {
3250 free_extent_buffer(fs_info->quota_root->node);
3251 free_extent_buffer(fs_info->quota_root->commit_root);
3252 }
3131 3253
3132 btrfs_free_block_groups(fs_info); 3254 btrfs_free_block_groups(fs_info);
3133 3255
@@ -3258,7 +3380,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3258 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3380 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3259} 3381}
3260 3382
3261static int btree_lock_page_hook(struct page *page, void *data, 3383int btree_lock_page_hook(struct page *page, void *data,
3262 void (*flush_fn)(void *)) 3384 void (*flush_fn)(void *))
3263{ 3385{
3264 struct inode *inode = page->mapping->host; 3386 struct inode *inode = page->mapping->host;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b3fab39f7..95e147eea23 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
89int btrfs_cleanup_transaction(struct btrfs_root *root); 89int btrfs_cleanup_transaction(struct btrfs_root *root);
90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
91 struct btrfs_root *root); 91 struct btrfs_root *root);
92void btrfs_abort_devices(struct btrfs_root *root);
93struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
94 struct btrfs_fs_info *fs_info,
95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *));
92 98
93#ifdef CONFIG_DEBUG_LOCK_ALLOC 99#ifdef CONFIG_DEBUG_LOCK_ALLOC
94void btrfs_init_lockdep(void); 100void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff..4e1b153b7c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36 36
37#undef SCRAMBLE_DELAYED_REFS
38
37/* 39/*
38 * control flags for do_chunk_alloc's force field 40 * control flags for do_chunk_alloc's force field
39 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 41 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2217 struct btrfs_delayed_ref_node *ref; 2219 struct btrfs_delayed_ref_node *ref;
2218 struct btrfs_delayed_ref_head *locked_ref = NULL; 2220 struct btrfs_delayed_ref_head *locked_ref = NULL;
2219 struct btrfs_delayed_extent_op *extent_op; 2221 struct btrfs_delayed_extent_op *extent_op;
2222 struct btrfs_fs_info *fs_info = root->fs_info;
2220 int ret; 2223 int ret;
2221 int count = 0; 2224 int count = 0;
2222 int must_insert_reserved = 0; 2225 int must_insert_reserved = 0;
@@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2255 ref = select_delayed_ref(locked_ref); 2258 ref = select_delayed_ref(locked_ref);
2256 2259
2257 if (ref && ref->seq && 2260 if (ref && ref->seq &&
2258 btrfs_check_delayed_seq(delayed_refs, ref->seq)) { 2261 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2259 /* 2262 /*
2260 * there are still refs with lower seq numbers in the 2263 * there are still refs with lower seq numbers in the
2261 * process of being added. Don't run this ref yet. 2264 * process of being added. Don't run this ref yet.
@@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 } 2340 }
2338 2341
2339next: 2342next:
2340 do_chunk_alloc(trans, root->fs_info->extent_root, 2343 do_chunk_alloc(trans, fs_info->extent_root,
2341 2 * 1024 * 1024, 2344 2 * 1024 * 1024,
2342 btrfs_get_alloc_profile(root, 0), 2345 btrfs_get_alloc_profile(root, 0),
2343 CHUNK_ALLOC_NO_FORCE); 2346 CHUNK_ALLOC_NO_FORCE);
@@ -2347,21 +2350,99 @@ next:
2347 return count; 2350 return count;
2348} 2351}
2349 2352
2350static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, 2353static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
2354 struct btrfs_delayed_ref_root *delayed_refs,
2351 unsigned long num_refs, 2355 unsigned long num_refs,
2352 struct list_head *first_seq) 2356 struct list_head *first_seq)
2353{ 2357{
2354 spin_unlock(&delayed_refs->lock); 2358 spin_unlock(&delayed_refs->lock);
2355 pr_debug("waiting for more refs (num %ld, first %p)\n", 2359 pr_debug("waiting for more refs (num %ld, first %p)\n",
2356 num_refs, first_seq); 2360 num_refs, first_seq);
2357 wait_event(delayed_refs->seq_wait, 2361 wait_event(fs_info->tree_mod_seq_wait,
2358 num_refs != delayed_refs->num_entries || 2362 num_refs != delayed_refs->num_entries ||
2359 delayed_refs->seq_head.next != first_seq); 2363 fs_info->tree_mod_seq_list.next != first_seq);
2360 pr_debug("done waiting for more refs (num %ld, first %p)\n", 2364 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2361 delayed_refs->num_entries, delayed_refs->seq_head.next); 2365 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
2362 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2363} 2367}
2364 2368
2369#ifdef SCRAMBLE_DELAYED_REFS
2370/*
2371 * Normally delayed refs get processed in ascending bytenr order. This
2372 * correlates in most cases to the order added. To expose dependencies on this
2373 * order, we start to process the tree in the middle instead of the beginning
2374 */
2375static u64 find_middle(struct rb_root *root)
2376{
2377 struct rb_node *n = root->rb_node;
2378 struct btrfs_delayed_ref_node *entry;
2379 int alt = 1;
2380 u64 middle;
2381 u64 first = 0, last = 0;
2382
2383 n = rb_first(root);
2384 if (n) {
2385 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2386 first = entry->bytenr;
2387 }
2388 n = rb_last(root);
2389 if (n) {
2390 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2391 last = entry->bytenr;
2392 }
2393 n = root->rb_node;
2394
2395 while (n) {
2396 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2397 WARN_ON(!entry->in_tree);
2398
2399 middle = entry->bytenr;
2400
2401 if (alt)
2402 n = n->rb_left;
2403 else
2404 n = n->rb_right;
2405
2406 alt = 1 - alt;
2407 }
2408 return middle;
2409}
2410#endif
2411
2412int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2413 struct btrfs_fs_info *fs_info)
2414{
2415 struct qgroup_update *qgroup_update;
2416 int ret = 0;
2417
2418 if (list_empty(&trans->qgroup_ref_list) !=
2419 !trans->delayed_ref_elem.seq) {
2420 /* list without seq or seq without list */
2421 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2422 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2423 trans->delayed_ref_elem.seq);
2424 BUG();
2425 }
2426
2427 if (!trans->delayed_ref_elem.seq)
2428 return 0;
2429
2430 while (!list_empty(&trans->qgroup_ref_list)) {
2431 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2432 struct qgroup_update, list);
2433 list_del(&qgroup_update->list);
2434 if (!ret)
2435 ret = btrfs_qgroup_account_ref(
2436 trans, fs_info, qgroup_update->node,
2437 qgroup_update->extent_op);
2438 kfree(qgroup_update);
2439 }
2440
2441 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2442
2443 return ret;
2444}
2445
2365/* 2446/*
2366 * this starts processing the delayed reference count updates and 2447 * this starts processing the delayed reference count updates and
2367 * extent insertions we have queued up so far. count can be 2448 * extent insertions we have queued up so far. count can be
@@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2398 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), 2479 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2399 CHUNK_ALLOC_NO_FORCE); 2480 CHUNK_ALLOC_NO_FORCE);
2400 2481
2482 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2483
2401 delayed_refs = &trans->transaction->delayed_refs; 2484 delayed_refs = &trans->transaction->delayed_refs;
2402 INIT_LIST_HEAD(&cluster); 2485 INIT_LIST_HEAD(&cluster);
2403again: 2486again:
2404 consider_waiting = 0; 2487 consider_waiting = 0;
2405 spin_lock(&delayed_refs->lock); 2488 spin_lock(&delayed_refs->lock);
2489
2490#ifdef SCRAMBLE_DELAYED_REFS
2491 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2492#endif
2493
2406 if (count == 0) { 2494 if (count == 0) {
2407 count = delayed_refs->num_entries * 2; 2495 count = delayed_refs->num_entries * 2;
2408 run_most = 1; 2496 run_most = 1;
@@ -2437,7 +2525,7 @@ again:
2437 num_refs = delayed_refs->num_entries; 2525 num_refs = delayed_refs->num_entries;
2438 first_seq = root->fs_info->tree_mod_seq_list.next; 2526 first_seq = root->fs_info->tree_mod_seq_list.next;
2439 } else { 2527 } else {
2440 wait_for_more_refs(delayed_refs, 2528 wait_for_more_refs(root->fs_info, delayed_refs,
2441 num_refs, first_seq); 2529 num_refs, first_seq);
2442 /* 2530 /*
2443 * after waiting, things have changed. we 2531 * after waiting, things have changed. we
@@ -2502,6 +2590,7 @@ again:
2502 } 2590 }
2503out: 2591out:
2504 spin_unlock(&delayed_refs->lock); 2592 spin_unlock(&delayed_refs->lock);
2593 assert_qgroups_uptodate(trans);
2505 return 0; 2594 return 0;
2506} 2595}
2507 2596
@@ -2581,8 +2670,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2581 2670
2582 node = rb_prev(node); 2671 node = rb_prev(node);
2583 if (node) { 2672 if (node) {
2673 int seq = ref->seq;
2674
2584 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2675 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2585 if (ref->bytenr == bytenr) 2676 if (ref->bytenr == bytenr && ref->seq == seq)
2586 goto out_unlock; 2677 goto out_unlock;
2587 } 2678 }
2588 2679
@@ -2903,8 +2994,13 @@ again:
2903 } 2994 }
2904 2995
2905 spin_lock(&block_group->lock); 2996 spin_lock(&block_group->lock);
2906 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2997 if (block_group->cached != BTRFS_CACHE_FINISHED ||
2907 /* We're not cached, don't bother trying to write stuff out */ 2998 !btrfs_test_opt(root, SPACE_CACHE)) {
2999 /*
3000 * don't bother trying to write stuff out _if_
3001 * a) we're not cached,
3002 * b) we're with nospace_cache mount option.
3003 */
2908 dcs = BTRFS_DC_WRITTEN; 3004 dcs = BTRFS_DC_WRITTEN;
2909 spin_unlock(&block_group->lock); 3005 spin_unlock(&block_group->lock);
2910 goto out_put; 3006 goto out_put;
@@ -3134,6 +3230,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3134 init_waitqueue_head(&found->wait); 3230 init_waitqueue_head(&found->wait);
3135 *space_info = found; 3231 *space_info = found;
3136 list_add_rcu(&found->list, &info->space_info); 3232 list_add_rcu(&found->list, &info->space_info);
3233 if (flags & BTRFS_BLOCK_GROUP_DATA)
3234 info->data_sinfo = found;
3137 return 0; 3235 return 0;
3138} 3236}
3139 3237
@@ -3263,12 +3361,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3263 return get_alloc_profile(root, flags); 3361 return get_alloc_profile(root, flags);
3264} 3362}
3265 3363
3266void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3267{
3268 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3269 BTRFS_BLOCK_GROUP_DATA);
3270}
3271
3272/* 3364/*
3273 * This will check the space that the inode allocates from to make sure we have 3365 * This will check the space that the inode allocates from to make sure we have
3274 * enough space for bytes. 3366 * enough space for bytes.
@@ -3277,6 +3369,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3277{ 3369{
3278 struct btrfs_space_info *data_sinfo; 3370 struct btrfs_space_info *data_sinfo;
3279 struct btrfs_root *root = BTRFS_I(inode)->root; 3371 struct btrfs_root *root = BTRFS_I(inode)->root;
3372 struct btrfs_fs_info *fs_info = root->fs_info;
3280 u64 used; 3373 u64 used;
3281 int ret = 0, committed = 0, alloc_chunk = 1; 3374 int ret = 0, committed = 0, alloc_chunk = 1;
3282 3375
@@ -3289,7 +3382,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3289 committed = 1; 3382 committed = 1;
3290 } 3383 }
3291 3384
3292 data_sinfo = BTRFS_I(inode)->space_info; 3385 data_sinfo = fs_info->data_sinfo;
3293 if (!data_sinfo) 3386 if (!data_sinfo)
3294 goto alloc; 3387 goto alloc;
3295 3388
@@ -3330,10 +3423,9 @@ alloc:
3330 goto commit_trans; 3423 goto commit_trans;
3331 } 3424 }
3332 3425
3333 if (!data_sinfo) { 3426 if (!data_sinfo)
3334 btrfs_set_inode_space_info(root, inode); 3427 data_sinfo = fs_info->data_sinfo;
3335 data_sinfo = BTRFS_I(inode)->space_info; 3428
3336 }
3337 goto again; 3429 goto again;
3338 } 3430 }
3339 3431
@@ -3380,7 +3472,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3380 /* make sure bytes are sectorsize aligned */ 3472 /* make sure bytes are sectorsize aligned */
3381 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3473 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3382 3474
3383 data_sinfo = BTRFS_I(inode)->space_info; 3475 data_sinfo = root->fs_info->data_sinfo;
3384 spin_lock(&data_sinfo->lock); 3476 spin_lock(&data_sinfo->lock);
3385 data_sinfo->bytes_may_use -= bytes; 3477 data_sinfo->bytes_may_use -= bytes;
3386 trace_btrfs_space_reservation(root->fs_info, "space_info", 3478 trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3586,89 +3678,58 @@ out:
3586/* 3678/*
3587 * shrink metadata reservation for delalloc 3679 * shrink metadata reservation for delalloc
3588 */ 3680 */
3589static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, 3681static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3590 bool wait_ordered) 3682 bool wait_ordered)
3591{ 3683{
3592 struct btrfs_block_rsv *block_rsv; 3684 struct btrfs_block_rsv *block_rsv;
3593 struct btrfs_space_info *space_info; 3685 struct btrfs_space_info *space_info;
3594 struct btrfs_trans_handle *trans; 3686 struct btrfs_trans_handle *trans;
3595 u64 reserved; 3687 u64 delalloc_bytes;
3596 u64 max_reclaim; 3688 u64 max_reclaim;
3597 u64 reclaimed = 0;
3598 long time_left; 3689 long time_left;
3599 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3690 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3600 int loops = 0; 3691 int loops = 0;
3601 unsigned long progress;
3602 3692
3603 trans = (struct btrfs_trans_handle *)current->journal_info; 3693 trans = (struct btrfs_trans_handle *)current->journal_info;
3604 block_rsv = &root->fs_info->delalloc_block_rsv; 3694 block_rsv = &root->fs_info->delalloc_block_rsv;
3605 space_info = block_rsv->space_info; 3695 space_info = block_rsv->space_info;
3606 3696
3607 smp_mb(); 3697 smp_mb();
3608 reserved = space_info->bytes_may_use; 3698 delalloc_bytes = root->fs_info->delalloc_bytes;
3609 progress = space_info->reservation_progress; 3699 if (delalloc_bytes == 0) {
3610
3611 if (reserved == 0)
3612 return 0;
3613
3614 smp_mb();
3615 if (root->fs_info->delalloc_bytes == 0) {
3616 if (trans) 3700 if (trans)
3617 return 0; 3701 return;
3618 btrfs_wait_ordered_extents(root, 0, 0); 3702 btrfs_wait_ordered_extents(root, 0, 0);
3619 return 0; 3703 return;
3620 } 3704 }
3621 3705
3622 max_reclaim = min(reserved, to_reclaim); 3706 while (delalloc_bytes && loops < 3) {
3623 nr_pages = max_t(unsigned long, nr_pages, 3707 max_reclaim = min(delalloc_bytes, to_reclaim);
3624 max_reclaim >> PAGE_CACHE_SHIFT); 3708 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3625 while (loops < 1024) {
3626 /* have the flusher threads jump in and do some IO */
3627 smp_mb();
3628 nr_pages = min_t(unsigned long, nr_pages,
3629 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3630 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3709 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3631 WB_REASON_FS_FREE_SPACE); 3710 WB_REASON_FS_FREE_SPACE);
3632 3711
3633 spin_lock(&space_info->lock); 3712 spin_lock(&space_info->lock);
3634 if (reserved > space_info->bytes_may_use) 3713 if (space_info->bytes_used + space_info->bytes_reserved +
3635 reclaimed += reserved - space_info->bytes_may_use; 3714 space_info->bytes_pinned + space_info->bytes_readonly +
3636 reserved = space_info->bytes_may_use; 3715 space_info->bytes_may_use + orig <=
3716 space_info->total_bytes) {
3717 spin_unlock(&space_info->lock);
3718 break;
3719 }
3637 spin_unlock(&space_info->lock); 3720 spin_unlock(&space_info->lock);
3638 3721
3639 loops++; 3722 loops++;
3640
3641 if (reserved == 0 || reclaimed >= max_reclaim)
3642 break;
3643
3644 if (trans && trans->transaction->blocked)
3645 return -EAGAIN;
3646
3647 if (wait_ordered && !trans) { 3723 if (wait_ordered && !trans) {
3648 btrfs_wait_ordered_extents(root, 0, 0); 3724 btrfs_wait_ordered_extents(root, 0, 0);
3649 } else { 3725 } else {
3650 time_left = schedule_timeout_interruptible(1); 3726 time_left = schedule_timeout_killable(1);
3651
3652 /* We were interrupted, exit */
3653 if (time_left) 3727 if (time_left)
3654 break; 3728 break;
3655 } 3729 }
3656 3730 smp_mb();
3657 /* we've kicked the IO a few times, if anything has been freed, 3731 delalloc_bytes = root->fs_info->delalloc_bytes;
3658 * exit. There is no sense in looping here for a long time
3659 * when we really need to commit the transaction, or there are
3660 * just too many writers without enough free space
3661 */
3662
3663 if (loops > 3) {
3664 smp_mb();
3665 if (progress != space_info->reservation_progress)
3666 break;
3667 }
3668
3669 } 3732 }
3670
3671 return reclaimed >= to_reclaim;
3672} 3733}
3673 3734
3674/** 3735/**
@@ -3728,6 +3789,58 @@ commit:
3728 return btrfs_commit_transaction(trans, root); 3789 return btrfs_commit_transaction(trans, root);
3729} 3790}
3730 3791
3792enum flush_state {
3793 FLUSH_DELALLOC = 1,
3794 FLUSH_DELALLOC_WAIT = 2,
3795 FLUSH_DELAYED_ITEMS_NR = 3,
3796 FLUSH_DELAYED_ITEMS = 4,
3797 COMMIT_TRANS = 5,
3798};
3799
3800static int flush_space(struct btrfs_root *root,
3801 struct btrfs_space_info *space_info, u64 num_bytes,
3802 u64 orig_bytes, int state)
3803{
3804 struct btrfs_trans_handle *trans;
3805 int nr;
3806 int ret = 0;
3807
3808 switch (state) {
3809 case FLUSH_DELALLOC:
3810 case FLUSH_DELALLOC_WAIT:
3811 shrink_delalloc(root, num_bytes, orig_bytes,
3812 state == FLUSH_DELALLOC_WAIT);
3813 break;
3814 case FLUSH_DELAYED_ITEMS_NR:
3815 case FLUSH_DELAYED_ITEMS:
3816 if (state == FLUSH_DELAYED_ITEMS_NR) {
3817 u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3818
3819 nr = (int)div64_u64(num_bytes, bytes);
3820 if (!nr)
3821 nr = 1;
3822 nr *= 2;
3823 } else {
3824 nr = -1;
3825 }
3826 trans = btrfs_join_transaction(root);
3827 if (IS_ERR(trans)) {
3828 ret = PTR_ERR(trans);
3829 break;
3830 }
3831 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3832 btrfs_end_transaction(trans, root);
3833 break;
3834 case COMMIT_TRANS:
3835 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3836 break;
3837 default:
3838 ret = -ENOSPC;
3839 break;
3840 }
3841
3842 return ret;
3843}
3731/** 3844/**
3732 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 3845 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3733 * @root - the root we're allocating for 3846 * @root - the root we're allocating for
@@ -3749,11 +3862,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3749 struct btrfs_space_info *space_info = block_rsv->space_info; 3862 struct btrfs_space_info *space_info = block_rsv->space_info;
3750 u64 used; 3863 u64 used;
3751 u64 num_bytes = orig_bytes; 3864 u64 num_bytes = orig_bytes;
3752 int retries = 0; 3865 int flush_state = FLUSH_DELALLOC;
3753 int ret = 0; 3866 int ret = 0;
3754 bool committed = false;
3755 bool flushing = false; 3867 bool flushing = false;
3756 bool wait_ordered = false; 3868 bool committed = false;
3757 3869
3758again: 3870again:
3759 ret = 0; 3871 ret = 0;
@@ -3812,9 +3924,8 @@ again:
3812 * amount plus the amount of bytes that we need for this 3924 * amount plus the amount of bytes that we need for this
3813 * reservation. 3925 * reservation.
3814 */ 3926 */
3815 wait_ordered = true;
3816 num_bytes = used - space_info->total_bytes + 3927 num_bytes = used - space_info->total_bytes +
3817 (orig_bytes * (retries + 1)); 3928 (orig_bytes * 2);
3818 } 3929 }
3819 3930
3820 if (ret) { 3931 if (ret) {
@@ -3867,8 +3978,6 @@ again:
3867 trace_btrfs_space_reservation(root->fs_info, 3978 trace_btrfs_space_reservation(root->fs_info,
3868 "space_info", space_info->flags, orig_bytes, 1); 3979 "space_info", space_info->flags, orig_bytes, 1);
3869 ret = 0; 3980 ret = 0;
3870 } else {
3871 wait_ordered = true;
3872 } 3981 }
3873 } 3982 }
3874 3983
@@ -3887,36 +3996,13 @@ again:
3887 if (!ret || !flush) 3996 if (!ret || !flush)
3888 goto out; 3997 goto out;
3889 3998
3890 /* 3999 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3891 * We do synchronous shrinking since we don't actually unreserve 4000 flush_state);
3892 * metadata until after the IO is completed. 4001 flush_state++;
3893 */ 4002 if (!ret)
3894 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3895 if (ret < 0)
3896 goto out;
3897
3898 ret = 0;
3899
3900 /*
3901 * So if we were overcommitted it's possible that somebody else flushed
3902 * out enough space and we simply didn't have enough space to reclaim,
3903 * so go back around and try again.
3904 */
3905 if (retries < 2) {
3906 wait_ordered = true;
3907 retries++;
3908 goto again; 4003 goto again;
3909 } 4004 else if (flush_state <= COMMIT_TRANS)
3910
3911 ret = -ENOSPC;
3912 if (committed)
3913 goto out;
3914
3915 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3916 if (!ret) {
3917 committed = true;
3918 goto again; 4005 goto again;
3919 }
3920 4006
3921out: 4007out:
3922 if (flushing) { 4008 if (flushing) {
@@ -3934,7 +4020,10 @@ static struct btrfs_block_rsv *get_block_rsv(
3934{ 4020{
3935 struct btrfs_block_rsv *block_rsv = NULL; 4021 struct btrfs_block_rsv *block_rsv = NULL;
3936 4022
3937 if (root->ref_cows || root == root->fs_info->csum_root) 4023 if (root->ref_cows)
4024 block_rsv = trans->block_rsv;
4025
4026 if (root == root->fs_info->csum_root && trans->adding_csums)
3938 block_rsv = trans->block_rsv; 4027 block_rsv = trans->block_rsv;
3939 4028
3940 if (!block_rsv) 4029 if (!block_rsv)
@@ -4286,6 +4375,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4286void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4375void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4287 struct btrfs_root *root) 4376 struct btrfs_root *root)
4288{ 4377{
4378 if (!trans->block_rsv)
4379 return;
4380
4289 if (!trans->bytes_reserved) 4381 if (!trans->bytes_reserved)
4290 return; 4382 return;
4291 4383
@@ -4444,7 +4536,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4444 int ret; 4536 int ret;
4445 4537
4446 /* Need to be holding the i_mutex here if we aren't free space cache */ 4538 /* Need to be holding the i_mutex here if we aren't free space cache */
4447 if (btrfs_is_free_space_inode(root, inode)) 4539 if (btrfs_is_free_space_inode(inode))
4448 flush = 0; 4540 flush = 0;
4449 4541
4450 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4542 if (flush && btrfs_transaction_in_commit(root->fs_info))
@@ -4476,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4476 csum_bytes = BTRFS_I(inode)->csum_bytes; 4568 csum_bytes = BTRFS_I(inode)->csum_bytes;
4477 spin_unlock(&BTRFS_I(inode)->lock); 4569 spin_unlock(&BTRFS_I(inode)->lock);
4478 4570
4571 if (root->fs_info->quota_enabled) {
4572 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize);
4574 if (ret)
4575 return ret;
4576 }
4577
4479 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4578 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4480 if (ret) { 4579 if (ret) {
4481 u64 to_free = 0; 4580 u64 to_free = 0;
@@ -4554,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4554 4653
4555 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4654 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4556 btrfs_ino(inode), to_free, 0); 4655 btrfs_ino(inode), to_free, 0);
4656 if (root->fs_info->quota_enabled) {
4657 btrfs_qgroup_free(root, num_bytes +
4658 dropped * root->leafsize);
4659 }
4660
4557 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4661 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4558 to_free); 4662 to_free);
4559} 4663}
@@ -5190,8 +5294,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5190 rb_erase(&head->node.rb_node, &delayed_refs->root); 5294 rb_erase(&head->node.rb_node, &delayed_refs->root);
5191 5295
5192 delayed_refs->num_entries--; 5296 delayed_refs->num_entries--;
5193 if (waitqueue_active(&delayed_refs->seq_wait)) 5297 smp_mb();
5194 wake_up(&delayed_refs->seq_wait); 5298 if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
5299 wake_up(&root->fs_info->tree_mod_seq_wait);
5195 5300
5196 /* 5301 /*
5197 * we don't take a ref on the node because we're removing it from the 5302 * we don't take a ref on the node because we're removing it from the
@@ -5748,7 +5853,11 @@ loop:
5748 ret = do_chunk_alloc(trans, root, num_bytes + 5853 ret = do_chunk_alloc(trans, root, num_bytes +
5749 2 * 1024 * 1024, data, 5854 2 * 1024 * 1024, data,
5750 CHUNK_ALLOC_LIMITED); 5855 CHUNK_ALLOC_LIMITED);
5751 if (ret < 0) { 5856 /*
5857 * Do not bail out on ENOSPC since we
5858 * can do more things.
5859 */
5860 if (ret < 0 && ret != -ENOSPC) {
5752 btrfs_abort_transaction(trans, 5861 btrfs_abort_transaction(trans,
5753 root, ret); 5862 root, ret);
5754 goto out; 5863 goto out;
@@ -5816,13 +5925,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5816again: 5925again:
5817 list_for_each_entry(cache, &info->block_groups[index], list) { 5926 list_for_each_entry(cache, &info->block_groups[index], list) {
5818 spin_lock(&cache->lock); 5927 spin_lock(&cache->lock);
5819 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 5928 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5820 "%llu pinned %llu reserved\n",
5821 (unsigned long long)cache->key.objectid, 5929 (unsigned long long)cache->key.objectid,
5822 (unsigned long long)cache->key.offset, 5930 (unsigned long long)cache->key.offset,
5823 (unsigned long long)btrfs_block_group_used(&cache->item), 5931 (unsigned long long)btrfs_block_group_used(&cache->item),
5824 (unsigned long long)cache->pinned, 5932 (unsigned long long)cache->pinned,
5825 (unsigned long long)cache->reserved); 5933 (unsigned long long)cache->reserved,
5934 cache->ro ? "[readonly]" : "");
5826 btrfs_dump_free_space(cache, bytes); 5935 btrfs_dump_free_space(cache, bytes);
5827 spin_unlock(&cache->lock); 5936 spin_unlock(&cache->lock);
5828 } 5937 }
@@ -7610,8 +7719,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7610 INIT_LIST_HEAD(&cache->list); 7719 INIT_LIST_HEAD(&cache->list);
7611 INIT_LIST_HEAD(&cache->cluster_list); 7720 INIT_LIST_HEAD(&cache->cluster_list);
7612 7721
7613 if (need_clear) 7722 if (need_clear) {
7723 /*
7724 * When we mount with old space cache, we need to
7725 * set BTRFS_DC_CLEAR and set dirty flag.
7726 *
7727 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7728 * truncate the old free space cache inode and
7729 * setup a new one.
7730 * b) Setting 'dirty flag' makes sure that we flush
7731 * the new space cache info onto disk.
7732 */
7614 cache->disk_cache_state = BTRFS_DC_CLEAR; 7733 cache->disk_cache_state = BTRFS_DC_CLEAR;
7734 if (btrfs_test_opt(root, SPACE_CACHE))
7735 cache->dirty = 1;
7736 }
7615 7737
7616 read_extent_buffer(leaf, &cache->item, 7738 read_extent_buffer(leaf, &cache->item,
7617 btrfs_item_ptr_offset(leaf, path->slots[0]), 7739 btrfs_item_ptr_offset(leaf, path->slots[0]),
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index deafe19c34b..45c81bb4ac8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1919,7 +1919,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1919 return -EIO; 1919 return -EIO;
1920 } 1920 }
1921 1921
1922 printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 1922 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
1923 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 1923 "(dev %s sector %llu)\n", page->mapping->host->i_ino,
1924 start, rcu_str_deref(dev->name), sector); 1924 start, rcu_str_deref(dev->name), sector);
1925 1925
@@ -3078,8 +3078,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3078 } 3078 }
3079 } 3079 }
3080 3080
3081 /*
3082 * We need to do this to prevent races in people who check if the eb is
3083 * under IO since we can end up having no IO bits set for a short period
3084 * of time.
3085 */
3086 spin_lock(&eb->refs_lock);
3081 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3087 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3082 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3088 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3089 spin_unlock(&eb->refs_lock);
3083 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3090 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3084 spin_lock(&fs_info->delalloc_lock); 3091 spin_lock(&fs_info->delalloc_lock);
3085 if (fs_info->dirty_metadata_bytes >= eb->len) 3092 if (fs_info->dirty_metadata_bytes >= eb->len)
@@ -3088,6 +3095,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3088 WARN_ON(1); 3095 WARN_ON(1);
3089 spin_unlock(&fs_info->delalloc_lock); 3096 spin_unlock(&fs_info->delalloc_lock);
3090 ret = 1; 3097 ret = 1;
3098 } else {
3099 spin_unlock(&eb->refs_lock);
3091 } 3100 }
3092 3101
3093 btrfs_tree_unlock(eb); 3102 btrfs_tree_unlock(eb);
@@ -3558,19 +3567,38 @@ int extent_readpages(struct extent_io_tree *tree,
3558 struct bio *bio = NULL; 3567 struct bio *bio = NULL;
3559 unsigned page_idx; 3568 unsigned page_idx;
3560 unsigned long bio_flags = 0; 3569 unsigned long bio_flags = 0;
3570 struct page *pagepool[16];
3571 struct page *page;
3572 int i = 0;
3573 int nr = 0;
3561 3574
3562 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3575 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3563 struct page *page = list_entry(pages->prev, struct page, lru); 3576 page = list_entry(pages->prev, struct page, lru);
3564 3577
3565 prefetchw(&page->flags); 3578 prefetchw(&page->flags);
3566 list_del(&page->lru); 3579 list_del(&page->lru);
3567 if (!add_to_page_cache_lru(page, mapping, 3580 if (add_to_page_cache_lru(page, mapping,
3568 page->index, GFP_NOFS)) { 3581 page->index, GFP_NOFS)) {
3569 __extent_read_full_page(tree, page, get_extent, 3582 page_cache_release(page);
3570 &bio, 0, &bio_flags); 3583 continue;
3571 } 3584 }
3572 page_cache_release(page); 3585
3586 pagepool[nr++] = page;
3587 if (nr < ARRAY_SIZE(pagepool))
3588 continue;
3589 for (i = 0; i < nr; i++) {
3590 __extent_read_full_page(tree, pagepool[i], get_extent,
3591 &bio, 0, &bio_flags);
3592 page_cache_release(pagepool[i]);
3593 }
3594 nr = 0;
3595 }
3596 for (i = 0; i < nr; i++) {
3597 __extent_read_full_page(tree, pagepool[i], get_extent,
3598 &bio, 0, &bio_flags);
3599 page_cache_release(pagepool[i]);
3573 } 3600 }
3601
3574 BUG_ON(!list_empty(pages)); 3602 BUG_ON(!list_empty(pages));
3575 if (bio) 3603 if (bio)
3576 return submit_one_bio(READ, bio, 0, bio_flags); 3604 return submit_one_bio(READ, bio, 0, bio_flags);
@@ -4124,11 +4152,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4124 * So bump the ref count first, then set the bit. If someone 4152 * So bump the ref count first, then set the bit. If someone
4125 * beat us to it, drop the ref we added. 4153 * beat us to it, drop the ref we added.
4126 */ 4154 */
4127 if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4155 spin_lock(&eb->refs_lock);
4156 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4128 atomic_inc(&eb->refs); 4157 atomic_inc(&eb->refs);
4129 if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4158 spin_unlock(&eb->refs_lock);
4130 atomic_dec(&eb->refs);
4131 }
4132} 4159}
4133 4160
4134static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4161static void mark_extent_buffer_accessed(struct extent_buffer *eb)
@@ -4240,9 +4267,7 @@ again:
4240 goto free_eb; 4267 goto free_eb;
4241 } 4268 }
4242 /* add one reference for the tree */ 4269 /* add one reference for the tree */
4243 spin_lock(&eb->refs_lock);
4244 check_buffer_tree_ref(eb); 4270 check_buffer_tree_ref(eb);
4245 spin_unlock(&eb->refs_lock);
4246 spin_unlock(&tree->buffer_lock); 4271 spin_unlock(&tree->buffer_lock);
4247 radix_tree_preload_end(); 4272 radix_tree_preload_end();
4248 4273
@@ -4301,7 +4326,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4301} 4326}
4302 4327
4303/* Expects to have eb->eb_lock already held */ 4328/* Expects to have eb->eb_lock already held */
4304static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) 4329static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4305{ 4330{
4306 WARN_ON(atomic_read(&eb->refs) == 0); 4331 WARN_ON(atomic_read(&eb->refs) == 0);
4307 if (atomic_dec_and_test(&eb->refs)) { 4332 if (atomic_dec_and_test(&eb->refs)) {
@@ -4322,9 +4347,11 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4322 btrfs_release_extent_buffer_page(eb, 0); 4347 btrfs_release_extent_buffer_page(eb, 0);
4323 4348
4324 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4349 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4325 return; 4350 return 1;
4326 } 4351 }
4327 spin_unlock(&eb->refs_lock); 4352 spin_unlock(&eb->refs_lock);
4353
4354 return 0;
4328} 4355}
4329 4356
4330void free_extent_buffer(struct extent_buffer *eb) 4357void free_extent_buffer(struct extent_buffer *eb)
@@ -4963,7 +4990,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
4963 spin_unlock(&eb->refs_lock); 4990 spin_unlock(&eb->refs_lock);
4964 return 0; 4991 return 0;
4965 } 4992 }
4966 release_extent_buffer(eb, mask);
4967 4993
4968 return 1; 4994 return release_extent_buffer(eb, mask);
4969} 4995}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5d158d32023..b45b9de0c21 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -183,7 +183,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
183 * read from the commit root and sidestep a nasty deadlock 183 * read from the commit root and sidestep a nasty deadlock
184 * between reading the free space cache and updating the csum tree. 184 * between reading the free space cache and updating the csum tree.
185 */ 185 */
186 if (btrfs_is_free_space_inode(root, inode)) { 186 if (btrfs_is_free_space_inode(inode)) {
187 path->search_commit_root = 1; 187 path->search_commit_root = 1;
188 path->skip_locking = 1; 188 path->skip_locking = 1;
189 } 189 }
@@ -690,6 +690,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
690 return -ENOMEM; 690 return -ENOMEM;
691 691
692 sector_sum = sums->sums; 692 sector_sum = sums->sums;
693 trans->adding_csums = 1;
693again: 694again:
694 next_offset = (u64)-1; 695 next_offset = (u64)-1;
695 found_next = 0; 696 found_next = 0;
@@ -853,6 +854,7 @@ next_sector:
853 goto again; 854 goto again;
854 } 855 }
855out: 856out:
857 trans->adding_csums = 0;
856 btrfs_free_path(path); 858 btrfs_free_path(path);
857 return ret; 859 return ret;
858 860
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6c4e2baa929..6b10acfc2f5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1968,7 +1968,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
1968 1968
1969 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 1969 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
1970 info = rb_entry(n, struct btrfs_free_space, offset_index); 1970 info = rb_entry(n, struct btrfs_free_space, offset_index);
1971 if (info->bytes >= bytes) 1971 if (info->bytes >= bytes && !block_group->ro)
1972 count++; 1972 count++;
1973 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", 1973 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
1974 (unsigned long long)info->offset, 1974 (unsigned long long)info->offset,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb8d671d00e..48bdfd2591c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -825,7 +825,7 @@ static noinline int cow_file_range(struct inode *inode,
825 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 825 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
826 int ret = 0; 826 int ret = 0;
827 827
828 BUG_ON(btrfs_is_free_space_inode(root, inode)); 828 BUG_ON(btrfs_is_free_space_inode(inode));
829 trans = btrfs_join_transaction(root); 829 trans = btrfs_join_transaction(root);
830 if (IS_ERR(trans)) { 830 if (IS_ERR(trans)) {
831 extent_clear_unlock_delalloc(inode, 831 extent_clear_unlock_delalloc(inode,
@@ -1010,7 +1010,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); 1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
1011 1011
1012 if (atomic_read(&root->fs_info->async_delalloc_pages) < 1012 if (atomic_read(&root->fs_info->async_delalloc_pages) <
1013 5 * 1042 * 1024 && 1013 5 * 1024 * 1024 &&
1014 waitqueue_active(&root->fs_info->async_submit_wait)) 1014 waitqueue_active(&root->fs_info->async_submit_wait))
1015 wake_up(&root->fs_info->async_submit_wait); 1015 wake_up(&root->fs_info->async_submit_wait);
1016 1016
@@ -1035,7 +1035,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1035 struct btrfs_root *root = BTRFS_I(inode)->root; 1035 struct btrfs_root *root = BTRFS_I(inode)->root;
1036 unsigned long nr_pages; 1036 unsigned long nr_pages;
1037 u64 cur_end; 1037 u64 cur_end;
1038 int limit = 10 * 1024 * 1042; 1038 int limit = 10 * 1024 * 1024;
1039 1039
1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1041 1, 0, NULL, GFP_NOFS); 1041 1, 0, NULL, GFP_NOFS);
@@ -1153,7 +1153,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1153 return -ENOMEM; 1153 return -ENOMEM;
1154 } 1154 }
1155 1155
1156 nolock = btrfs_is_free_space_inode(root, inode); 1156 nolock = btrfs_is_free_space_inode(inode);
1157 1157
1158 if (nolock) 1158 if (nolock)
1159 trans = btrfs_join_transaction_nolock(root); 1159 trans = btrfs_join_transaction_nolock(root);
@@ -1466,7 +1466,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
1466 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1466 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1467 struct btrfs_root *root = BTRFS_I(inode)->root; 1467 struct btrfs_root *root = BTRFS_I(inode)->root;
1468 u64 len = state->end + 1 - state->start; 1468 u64 len = state->end + 1 - state->start;
1469 bool do_list = !btrfs_is_free_space_inode(root, inode); 1469 bool do_list = !btrfs_is_free_space_inode(inode);
1470 1470
1471 if (*bits & EXTENT_FIRST_DELALLOC) { 1471 if (*bits & EXTENT_FIRST_DELALLOC) {
1472 *bits &= ~EXTENT_FIRST_DELALLOC; 1472 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1501,7 +1501,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1501 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1501 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1502 struct btrfs_root *root = BTRFS_I(inode)->root; 1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 u64 len = state->end + 1 - state->start; 1503 u64 len = state->end + 1 - state->start;
1504 bool do_list = !btrfs_is_free_space_inode(root, inode); 1504 bool do_list = !btrfs_is_free_space_inode(inode);
1505 1505
1506 if (*bits & EXTENT_FIRST_DELALLOC) { 1506 if (*bits & EXTENT_FIRST_DELALLOC) {
1507 *bits &= ~EXTENT_FIRST_DELALLOC; 1507 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1612,7 +1612,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1612 1612
1613 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1613 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1614 1614
1615 if (btrfs_is_free_space_inode(root, inode)) 1615 if (btrfs_is_free_space_inode(inode))
1616 metadata = 2; 1616 metadata = 2;
1617 1617
1618 if (!(rw & REQ_WRITE)) { 1618 if (!(rw & REQ_WRITE)) {
@@ -1869,7 +1869,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1869 int ret; 1869 int ret;
1870 bool nolock; 1870 bool nolock;
1871 1871
1872 nolock = btrfs_is_free_space_inode(root, inode); 1872 nolock = btrfs_is_free_space_inode(inode);
1873 1873
1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1875 ret = -EIO; 1875 ret = -EIO;
@@ -2007,7 +2007,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2007 ordered_extent->work.func = finish_ordered_fn; 2007 ordered_extent->work.func = finish_ordered_fn;
2008 ordered_extent->work.flags = 0; 2008 ordered_extent->work.flags = 0;
2009 2009
2010 if (btrfs_is_free_space_inode(root, inode)) 2010 if (btrfs_is_free_space_inode(inode))
2011 workers = &root->fs_info->endio_freespace_worker; 2011 workers = &root->fs_info->endio_freespace_worker;
2012 else 2012 else
2013 workers = &root->fs_info->endio_write_workers; 2013 workers = &root->fs_info->endio_write_workers;
@@ -2732,8 +2732,10 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2732 * The data relocation inode should also be directly updated 2732 * The data relocation inode should also be directly updated
2733 * without delay 2733 * without delay
2734 */ 2734 */
2735 if (!btrfs_is_free_space_inode(root, inode) 2735 if (!btrfs_is_free_space_inode(inode)
2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2737 btrfs_update_root_times(trans, root);
2738
2737 ret = btrfs_delayed_update_inode(trans, root, inode); 2739 ret = btrfs_delayed_update_inode(trans, root, inode);
2738 if (!ret) 2740 if (!ret)
2739 btrfs_set_inode_last_trans(trans, inode); 2741 btrfs_set_inode_last_trans(trans, inode);
@@ -2833,7 +2835,7 @@ err:
2833 inode_inc_iversion(inode); 2835 inode_inc_iversion(inode);
2834 inode_inc_iversion(dir); 2836 inode_inc_iversion(dir);
2835 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2837 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2836 btrfs_update_inode(trans, root, dir); 2838 ret = btrfs_update_inode(trans, root, dir);
2837out: 2839out:
2838 return ret; 2840 return ret;
2839} 2841}
@@ -3743,7 +3745,7 @@ void btrfs_evict_inode(struct inode *inode)
3743 3745
3744 truncate_inode_pages(&inode->i_data, 0); 3746 truncate_inode_pages(&inode->i_data, 0);
3745 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3747 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3746 btrfs_is_free_space_inode(root, inode))) 3748 btrfs_is_free_space_inode(inode)))
3747 goto no_delete; 3749 goto no_delete;
3748 3750
3749 if (is_bad_inode(inode)) { 3751 if (is_bad_inode(inode)) {
@@ -4082,7 +4084,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
4082 struct btrfs_iget_args *args = p; 4084 struct btrfs_iget_args *args = p;
4083 inode->i_ino = args->ino; 4085 inode->i_ino = args->ino;
4084 BTRFS_I(inode)->root = args->root; 4086 BTRFS_I(inode)->root = args->root;
4085 btrfs_set_inode_space_info(args->root, inode);
4086 return 0; 4087 return 0;
4087} 4088}
4088 4089
@@ -4457,7 +4458,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4457 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4458 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4458 return 0; 4459 return 0;
4459 4460
4460 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4461 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
4461 nolock = true; 4462 nolock = true;
4462 4463
4463 if (wbc->sync_mode == WB_SYNC_ALL) { 4464 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4518,6 +4519,11 @@ int btrfs_dirty_inode(struct inode *inode)
4518static int btrfs_update_time(struct inode *inode, struct timespec *now, 4519static int btrfs_update_time(struct inode *inode, struct timespec *now,
4519 int flags) 4520 int flags)
4520{ 4521{
4522 struct btrfs_root *root = BTRFS_I(inode)->root;
4523
4524 if (btrfs_root_readonly(root))
4525 return -EROFS;
4526
4521 if (flags & S_VERSION) 4527 if (flags & S_VERSION)
4522 inode_inc_iversion(inode); 4528 inode_inc_iversion(inode);
4523 if (flags & S_CTIME) 4529 if (flags & S_CTIME)
@@ -4662,7 +4668,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4662 BTRFS_I(inode)->root = root; 4668 BTRFS_I(inode)->root = root;
4663 BTRFS_I(inode)->generation = trans->transid; 4669 BTRFS_I(inode)->generation = trans->transid;
4664 inode->i_generation = BTRFS_I(inode)->generation; 4670 inode->i_generation = BTRFS_I(inode)->generation;
4665 btrfs_set_inode_space_info(root, inode);
4666 4671
4667 if (S_ISDIR(mode)) 4672 if (S_ISDIR(mode))
4668 owner = 0; 4673 owner = 0;
@@ -4690,6 +4695,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4690 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4695 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4691 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4696 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4692 struct btrfs_inode_item); 4697 struct btrfs_inode_item);
4698 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
4699 sizeof(*inode_item));
4693 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4700 fill_inode_item(trans, path->nodes[0], inode_item, inode);
4694 4701
4695 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4702 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
@@ -4723,6 +4730,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4723 trace_btrfs_inode_new(inode); 4730 trace_btrfs_inode_new(inode);
4724 btrfs_set_inode_last_trans(trans, inode); 4731 btrfs_set_inode_last_trans(trans, inode);
4725 4732
4733 btrfs_update_root_times(trans, root);
4734
4726 return inode; 4735 return inode;
4727fail: 4736fail:
4728 if (dir) 4737 if (dir)
@@ -6939,7 +6948,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6939 return NULL; 6948 return NULL;
6940 6949
6941 ei->root = NULL; 6950 ei->root = NULL;
6942 ei->space_info = NULL;
6943 ei->generation = 0; 6951 ei->generation = 0;
6944 ei->last_trans = 0; 6952 ei->last_trans = 0;
6945 ei->last_sub_trans = 0; 6953 ei->last_sub_trans = 0;
@@ -7046,7 +7054,7 @@ int btrfs_drop_inode(struct inode *inode)
7046 struct btrfs_root *root = BTRFS_I(inode)->root; 7054 struct btrfs_root *root = BTRFS_I(inode)->root;
7047 7055
7048 if (btrfs_root_refs(&root->root_item) == 0 && 7056 if (btrfs_root_refs(&root->root_item) == 0 &&
7049 !btrfs_is_free_space_inode(root, inode)) 7057 !btrfs_is_free_space_inode(inode))
7050 return 1; 7058 return 1;
7051 else 7059 else
7052 return generic_drop_inode(inode); 7060 return generic_drop_inode(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1e9f6c019ad..43f0012016e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -41,6 +41,7 @@
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h>
44#include "compat.h" 45#include "compat.h"
45#include "ctree.h" 46#include "ctree.h"
46#include "disk-io.h" 47#include "disk-io.h"
@@ -53,6 +54,7 @@
53#include "inode-map.h" 54#include "inode-map.h"
54#include "backref.h" 55#include "backref.h"
55#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h"
56 58
57/* Mask out flags that are inappropriate for the given type of inode. */ 59/* Mask out flags that are inappropriate for the given type of inode. */
58static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -336,7 +338,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
336static noinline int create_subvol(struct btrfs_root *root, 338static noinline int create_subvol(struct btrfs_root *root,
337 struct dentry *dentry, 339 struct dentry *dentry,
338 char *name, int namelen, 340 char *name, int namelen,
339 u64 *async_transid) 341 u64 *async_transid,
342 struct btrfs_qgroup_inherit **inherit)
340{ 343{
341 struct btrfs_trans_handle *trans; 344 struct btrfs_trans_handle *trans;
342 struct btrfs_key key; 345 struct btrfs_key key;
@@ -346,11 +349,13 @@ static noinline int create_subvol(struct btrfs_root *root,
346 struct btrfs_root *new_root; 349 struct btrfs_root *new_root;
347 struct dentry *parent = dentry->d_parent; 350 struct dentry *parent = dentry->d_parent;
348 struct inode *dir; 351 struct inode *dir;
352 struct timespec cur_time = CURRENT_TIME;
349 int ret; 353 int ret;
350 int err; 354 int err;
351 u64 objectid; 355 u64 objectid;
352 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 356 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
353 u64 index = 0; 357 u64 index = 0;
358 uuid_le new_uuid;
354 359
355 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 360 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
356 if (ret) 361 if (ret)
@@ -368,6 +373,11 @@ static noinline int create_subvol(struct btrfs_root *root,
368 if (IS_ERR(trans)) 373 if (IS_ERR(trans))
369 return PTR_ERR(trans); 374 return PTR_ERR(trans);
370 375
376 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
377 inherit ? *inherit : NULL);
378 if (ret)
379 goto fail;
380
371 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 381 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
372 0, objectid, NULL, 0, 0, 0); 382 0, objectid, NULL, 0, 0, 0);
373 if (IS_ERR(leaf)) { 383 if (IS_ERR(leaf)) {
@@ -389,8 +399,9 @@ static noinline int create_subvol(struct btrfs_root *root,
389 BTRFS_UUID_SIZE); 399 BTRFS_UUID_SIZE);
390 btrfs_mark_buffer_dirty(leaf); 400 btrfs_mark_buffer_dirty(leaf);
391 401
402 memset(&root_item, 0, sizeof(root_item));
403
392 inode_item = &root_item.inode; 404 inode_item = &root_item.inode;
393 memset(inode_item, 0, sizeof(*inode_item));
394 inode_item->generation = cpu_to_le64(1); 405 inode_item->generation = cpu_to_le64(1);
395 inode_item->size = cpu_to_le64(3); 406 inode_item->size = cpu_to_le64(3);
396 inode_item->nlink = cpu_to_le32(1); 407 inode_item->nlink = cpu_to_le32(1);
@@ -408,8 +419,15 @@ static noinline int create_subvol(struct btrfs_root *root,
408 btrfs_set_root_used(&root_item, leaf->len); 419 btrfs_set_root_used(&root_item, leaf->len);
409 btrfs_set_root_last_snapshot(&root_item, 0); 420 btrfs_set_root_last_snapshot(&root_item, 0);
410 421
411 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 422 btrfs_set_root_generation_v2(&root_item,
412 root_item.drop_level = 0; 423 btrfs_root_generation(&root_item));
424 uuid_le_gen(&new_uuid);
425 memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
426 root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
427 root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec);
428 root_item.ctime = root_item.otime;
429 btrfs_set_root_ctransid(&root_item, trans->transid);
430 btrfs_set_root_otransid(&root_item, trans->transid);
413 431
414 btrfs_tree_unlock(leaf); 432 btrfs_tree_unlock(leaf);
415 free_extent_buffer(leaf); 433 free_extent_buffer(leaf);
@@ -484,7 +502,7 @@ fail:
484 502
485static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 503static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
486 char *name, int namelen, u64 *async_transid, 504 char *name, int namelen, u64 *async_transid,
487 bool readonly) 505 bool readonly, struct btrfs_qgroup_inherit **inherit)
488{ 506{
489 struct inode *inode; 507 struct inode *inode;
490 struct btrfs_pending_snapshot *pending_snapshot; 508 struct btrfs_pending_snapshot *pending_snapshot;
@@ -502,6 +520,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
502 pending_snapshot->dentry = dentry; 520 pending_snapshot->dentry = dentry;
503 pending_snapshot->root = root; 521 pending_snapshot->root = root;
504 pending_snapshot->readonly = readonly; 522 pending_snapshot->readonly = readonly;
523 if (inherit) {
524 pending_snapshot->inherit = *inherit;
525 *inherit = NULL; /* take responsibility to free it */
526 }
505 527
506 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
507 if (IS_ERR(trans)) { 529 if (IS_ERR(trans)) {
@@ -635,7 +657,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
635static noinline int btrfs_mksubvol(struct path *parent, 657static noinline int btrfs_mksubvol(struct path *parent,
636 char *name, int namelen, 658 char *name, int namelen,
637 struct btrfs_root *snap_src, 659 struct btrfs_root *snap_src,
638 u64 *async_transid, bool readonly) 660 u64 *async_transid, bool readonly,
661 struct btrfs_qgroup_inherit **inherit)
639{ 662{
640 struct inode *dir = parent->dentry->d_inode; 663 struct inode *dir = parent->dentry->d_inode;
641 struct dentry *dentry; 664 struct dentry *dentry;
@@ -652,13 +675,9 @@ static noinline int btrfs_mksubvol(struct path *parent,
652 if (dentry->d_inode) 675 if (dentry->d_inode)
653 goto out_dput; 676 goto out_dput;
654 677
655 error = mnt_want_write(parent->mnt);
656 if (error)
657 goto out_dput;
658
659 error = btrfs_may_create(dir, dentry); 678 error = btrfs_may_create(dir, dentry);
660 if (error) 679 if (error)
661 goto out_drop_write; 680 goto out_dput;
662 681
663 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 682 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
664 683
@@ -666,18 +685,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
666 goto out_up_read; 685 goto out_up_read;
667 686
668 if (snap_src) { 687 if (snap_src) {
669 error = create_snapshot(snap_src, dentry, 688 error = create_snapshot(snap_src, dentry, name, namelen,
670 name, namelen, async_transid, readonly); 689 async_transid, readonly, inherit);
671 } else { 690 } else {
672 error = create_subvol(BTRFS_I(dir)->root, dentry, 691 error = create_subvol(BTRFS_I(dir)->root, dentry,
673 name, namelen, async_transid); 692 name, namelen, async_transid, inherit);
674 } 693 }
675 if (!error) 694 if (!error)
676 fsnotify_mkdir(dir, dentry); 695 fsnotify_mkdir(dir, dentry);
677out_up_read: 696out_up_read:
678 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 697 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
679out_drop_write:
680 mnt_drop_write(parent->mnt);
681out_dput: 698out_dput:
682 dput(dentry); 699 dput(dentry);
683out_unlock: 700out_unlock:
@@ -832,7 +849,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
832} 849}
833 850
834static int should_defrag_range(struct inode *inode, u64 start, int thresh, 851static int should_defrag_range(struct inode *inode, u64 start, int thresh,
835 u64 *last_len, u64 *skip, u64 *defrag_end) 852 u64 *last_len, u64 *skip, u64 *defrag_end,
853 int compress)
836{ 854{
837 struct extent_map *em; 855 struct extent_map *em;
838 int ret = 1; 856 int ret = 1;
@@ -863,7 +881,7 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
863 * we hit a real extent, if it is big or the next extent is not a 881 * we hit a real extent, if it is big or the next extent is not a
864 * real extent, don't bother defragging it 882 * real extent, don't bother defragging it
865 */ 883 */
866 if ((*last_len == 0 || *last_len >= thresh) && 884 if (!compress && (*last_len == 0 || *last_len >= thresh) &&
867 (em->len >= thresh || !next_mergeable)) 885 (em->len >= thresh || !next_mergeable))
868 ret = 0; 886 ret = 0;
869out: 887out:
@@ -1047,11 +1065,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1047 u64 newer_than, unsigned long max_to_defrag) 1065 u64 newer_than, unsigned long max_to_defrag)
1048{ 1066{
1049 struct btrfs_root *root = BTRFS_I(inode)->root; 1067 struct btrfs_root *root = BTRFS_I(inode)->root;
1050 struct btrfs_super_block *disk_super;
1051 struct file_ra_state *ra = NULL; 1068 struct file_ra_state *ra = NULL;
1052 unsigned long last_index; 1069 unsigned long last_index;
1053 u64 isize = i_size_read(inode); 1070 u64 isize = i_size_read(inode);
1054 u64 features;
1055 u64 last_len = 0; 1071 u64 last_len = 0;
1056 u64 skip = 0; 1072 u64 skip = 0;
1057 u64 defrag_end = 0; 1073 u64 defrag_end = 0;
@@ -1145,7 +1161,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1145 1161
1146 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1162 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1147 extent_thresh, &last_len, &skip, 1163 extent_thresh, &last_len, &skip,
1148 &defrag_end)) { 1164 &defrag_end, range->flags &
1165 BTRFS_DEFRAG_RANGE_COMPRESS)) {
1149 unsigned long next; 1166 unsigned long next;
1150 /* 1167 /*
1151 * the should_defrag function tells us how much to skip 1168 * the should_defrag function tells us how much to skip
@@ -1237,11 +1254,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1237 mutex_unlock(&inode->i_mutex); 1254 mutex_unlock(&inode->i_mutex);
1238 } 1255 }
1239 1256
1240 disk_super = root->fs_info->super_copy;
1241 features = btrfs_super_incompat_flags(disk_super);
1242 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1257 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1243 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1258 btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
1244 btrfs_set_super_incompat_flags(disk_super, features);
1245 } 1259 }
1246 1260
1247 ret = defrag_count; 1261 ret = defrag_count;
@@ -1379,41 +1393,39 @@ out:
1379} 1393}
1380 1394
1381static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1395static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1382 char *name, 1396 char *name, unsigned long fd, int subvol,
1383 unsigned long fd, 1397 u64 *transid, bool readonly,
1384 int subvol, 1398 struct btrfs_qgroup_inherit **inherit)
1385 u64 *transid,
1386 bool readonly)
1387{ 1399{
1388 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1389 struct file *src_file; 1400 struct file *src_file;
1390 int namelen; 1401 int namelen;
1391 int ret = 0; 1402 int ret = 0;
1392 1403
1393 if (root->fs_info->sb->s_flags & MS_RDONLY) 1404 ret = mnt_want_write_file(file);
1394 return -EROFS; 1405 if (ret)
1406 goto out;
1395 1407
1396 namelen = strlen(name); 1408 namelen = strlen(name);
1397 if (strchr(name, '/')) { 1409 if (strchr(name, '/')) {
1398 ret = -EINVAL; 1410 ret = -EINVAL;
1399 goto out; 1411 goto out_drop_write;
1400 } 1412 }
1401 1413
1402 if (name[0] == '.' && 1414 if (name[0] == '.' &&
1403 (namelen == 1 || (name[1] == '.' && namelen == 2))) { 1415 (namelen == 1 || (name[1] == '.' && namelen == 2))) {
1404 ret = -EEXIST; 1416 ret = -EEXIST;
1405 goto out; 1417 goto out_drop_write;
1406 } 1418 }
1407 1419
1408 if (subvol) { 1420 if (subvol) {
1409 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1421 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1410 NULL, transid, readonly); 1422 NULL, transid, readonly, inherit);
1411 } else { 1423 } else {
1412 struct inode *src_inode; 1424 struct inode *src_inode;
1413 src_file = fget(fd); 1425 src_file = fget(fd);
1414 if (!src_file) { 1426 if (!src_file) {
1415 ret = -EINVAL; 1427 ret = -EINVAL;
1416 goto out; 1428 goto out_drop_write;
1417 } 1429 }
1418 1430
1419 src_inode = src_file->f_path.dentry->d_inode; 1431 src_inode = src_file->f_path.dentry->d_inode;
@@ -1422,13 +1434,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1422 "another FS\n"); 1434 "another FS\n");
1423 ret = -EINVAL; 1435 ret = -EINVAL;
1424 fput(src_file); 1436 fput(src_file);
1425 goto out; 1437 goto out_drop_write;
1426 } 1438 }
1427 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1439 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1428 BTRFS_I(src_inode)->root, 1440 BTRFS_I(src_inode)->root,
1429 transid, readonly); 1441 transid, readonly, inherit);
1430 fput(src_file); 1442 fput(src_file);
1431 } 1443 }
1444out_drop_write:
1445 mnt_drop_write_file(file);
1432out: 1446out:
1433 return ret; 1447 return ret;
1434} 1448}
@@ -1446,7 +1460,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
1446 1460
1447 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1461 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1448 vol_args->fd, subvol, 1462 vol_args->fd, subvol,
1449 NULL, false); 1463 NULL, false, NULL);
1450 1464
1451 kfree(vol_args); 1465 kfree(vol_args);
1452 return ret; 1466 return ret;
@@ -1460,6 +1474,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1460 u64 transid = 0; 1474 u64 transid = 0;
1461 u64 *ptr = NULL; 1475 u64 *ptr = NULL;
1462 bool readonly = false; 1476 bool readonly = false;
1477 struct btrfs_qgroup_inherit *inherit = NULL;
1463 1478
1464 vol_args = memdup_user(arg, sizeof(*vol_args)); 1479 vol_args = memdup_user(arg, sizeof(*vol_args));
1465 if (IS_ERR(vol_args)) 1480 if (IS_ERR(vol_args))
@@ -1467,7 +1482,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1467 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 1482 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1468 1483
1469 if (vol_args->flags & 1484 if (vol_args->flags &
1470 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { 1485 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1486 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1471 ret = -EOPNOTSUPP; 1487 ret = -EOPNOTSUPP;
1472 goto out; 1488 goto out;
1473 } 1489 }
@@ -1476,10 +1492,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1476 ptr = &transid; 1492 ptr = &transid;
1477 if (vol_args->flags & BTRFS_SUBVOL_RDONLY) 1493 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1478 readonly = true; 1494 readonly = true;
1495 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1496 if (vol_args->size > PAGE_CACHE_SIZE) {
1497 ret = -EINVAL;
1498 goto out;
1499 }
1500 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1501 if (IS_ERR(inherit)) {
1502 ret = PTR_ERR(inherit);
1503 goto out;
1504 }
1505 }
1479 1506
1480 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1507 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1481 vol_args->fd, subvol, 1508 vol_args->fd, subvol, ptr,
1482 ptr, readonly); 1509 readonly, &inherit);
1483 1510
1484 if (ret == 0 && ptr && 1511 if (ret == 0 && ptr &&
1485 copy_to_user(arg + 1512 copy_to_user(arg +
@@ -1488,6 +1515,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1488 ret = -EFAULT; 1515 ret = -EFAULT;
1489out: 1516out:
1490 kfree(vol_args); 1517 kfree(vol_args);
1518 kfree(inherit);
1491 return ret; 1519 return ret;
1492} 1520}
1493 1521
@@ -1523,29 +1551,40 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1523 u64 flags; 1551 u64 flags;
1524 int ret = 0; 1552 int ret = 0;
1525 1553
1526 if (root->fs_info->sb->s_flags & MS_RDONLY) 1554 ret = mnt_want_write_file(file);
1527 return -EROFS; 1555 if (ret)
1556 goto out;
1528 1557
1529 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) 1558 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
1530 return -EINVAL; 1559 ret = -EINVAL;
1560 goto out_drop_write;
1561 }
1531 1562
1532 if (copy_from_user(&flags, arg, sizeof(flags))) 1563 if (copy_from_user(&flags, arg, sizeof(flags))) {
1533 return -EFAULT; 1564 ret = -EFAULT;
1565 goto out_drop_write;
1566 }
1534 1567
1535 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) 1568 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
1536 return -EINVAL; 1569 ret = -EINVAL;
1570 goto out_drop_write;
1571 }
1537 1572
1538 if (flags & ~BTRFS_SUBVOL_RDONLY) 1573 if (flags & ~BTRFS_SUBVOL_RDONLY) {
1539 return -EOPNOTSUPP; 1574 ret = -EOPNOTSUPP;
1575 goto out_drop_write;
1576 }
1540 1577
1541 if (!inode_owner_or_capable(inode)) 1578 if (!inode_owner_or_capable(inode)) {
1542 return -EACCES; 1579 ret = -EACCES;
1580 goto out_drop_write;
1581 }
1543 1582
1544 down_write(&root->fs_info->subvol_sem); 1583 down_write(&root->fs_info->subvol_sem);
1545 1584
1546 /* nothing to do */ 1585 /* nothing to do */
1547 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) 1586 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1548 goto out; 1587 goto out_drop_sem;
1549 1588
1550 root_flags = btrfs_root_flags(&root->root_item); 1589 root_flags = btrfs_root_flags(&root->root_item);
1551 if (flags & BTRFS_SUBVOL_RDONLY) 1590 if (flags & BTRFS_SUBVOL_RDONLY)
@@ -1568,8 +1607,11 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1568out_reset: 1607out_reset:
1569 if (ret) 1608 if (ret)
1570 btrfs_set_root_flags(&root->root_item, root_flags); 1609 btrfs_set_root_flags(&root->root_item, root_flags);
1571out: 1610out_drop_sem:
1572 up_write(&root->fs_info->subvol_sem); 1611 up_write(&root->fs_info->subvol_sem);
1612out_drop_write:
1613 mnt_drop_write_file(file);
1614out:
1573 return ret; 1615 return ret;
1574} 1616}
1575 1617
@@ -2340,6 +2382,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2340 goto out_drop_write; 2382 goto out_drop_write;
2341 } 2383 }
2342 2384
2385 ret = -EXDEV;
2386 if (src_file->f_path.mnt != file->f_path.mnt)
2387 goto out_fput;
2388
2343 src = src_file->f_dentry->d_inode; 2389 src = src_file->f_dentry->d_inode;
2344 2390
2345 ret = -EINVAL; 2391 ret = -EINVAL;
@@ -2360,7 +2406,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2360 goto out_fput; 2406 goto out_fput;
2361 2407
2362 ret = -EXDEV; 2408 ret = -EXDEV;
2363 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) 2409 if (src->i_sb != inode->i_sb)
2364 goto out_fput; 2410 goto out_fput;
2365 2411
2366 ret = -ENOMEM; 2412 ret = -ENOMEM;
@@ -2434,13 +2480,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2434 * note the key will change type as we walk through the 2480 * note the key will change type as we walk through the
2435 * tree. 2481 * tree.
2436 */ 2482 */
2437 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2483 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
2484 0, 0);
2438 if (ret < 0) 2485 if (ret < 0)
2439 goto out; 2486 goto out;
2440 2487
2441 nritems = btrfs_header_nritems(path->nodes[0]); 2488 nritems = btrfs_header_nritems(path->nodes[0]);
2442 if (path->slots[0] >= nritems) { 2489 if (path->slots[0] >= nritems) {
2443 ret = btrfs_next_leaf(root, path); 2490 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2444 if (ret < 0) 2491 if (ret < 0)
2445 goto out; 2492 goto out;
2446 if (ret > 0) 2493 if (ret > 0)
@@ -2749,8 +2796,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2749 struct btrfs_path *path; 2796 struct btrfs_path *path;
2750 struct btrfs_key location; 2797 struct btrfs_key location;
2751 struct btrfs_disk_key disk_key; 2798 struct btrfs_disk_key disk_key;
2752 struct btrfs_super_block *disk_super;
2753 u64 features;
2754 u64 objectid = 0; 2799 u64 objectid = 0;
2755 u64 dir_id; 2800 u64 dir_id;
2756 2801
@@ -2801,12 +2846,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2801 btrfs_mark_buffer_dirty(path->nodes[0]); 2846 btrfs_mark_buffer_dirty(path->nodes[0]);
2802 btrfs_free_path(path); 2847 btrfs_free_path(path);
2803 2848
2804 disk_super = root->fs_info->super_copy; 2849 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2805 features = btrfs_super_incompat_flags(disk_super);
2806 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2807 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
2808 btrfs_set_super_incompat_flags(disk_super, features);
2809 }
2810 btrfs_end_transaction(trans, root); 2850 btrfs_end_transaction(trans, root);
2811 2851
2812 return 0; 2852 return 0;
@@ -3063,19 +3103,21 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3063} 3103}
3064 3104
3065static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, 3105static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3066 void __user *arg, int reset_after_read) 3106 void __user *arg)
3067{ 3107{
3068 struct btrfs_ioctl_get_dev_stats *sa; 3108 struct btrfs_ioctl_get_dev_stats *sa;
3069 int ret; 3109 int ret;
3070 3110
3071 if (reset_after_read && !capable(CAP_SYS_ADMIN))
3072 return -EPERM;
3073
3074 sa = memdup_user(arg, sizeof(*sa)); 3111 sa = memdup_user(arg, sizeof(*sa));
3075 if (IS_ERR(sa)) 3112 if (IS_ERR(sa))
3076 return PTR_ERR(sa); 3113 return PTR_ERR(sa);
3077 3114
3078 ret = btrfs_get_dev_stats(root, sa, reset_after_read); 3115 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
3116 kfree(sa);
3117 return -EPERM;
3118 }
3119
3120 ret = btrfs_get_dev_stats(root, sa);
3079 3121
3080 if (copy_to_user(arg, sa, sizeof(*sa))) 3122 if (copy_to_user(arg, sa, sizeof(*sa)))
3081 ret = -EFAULT; 3123 ret = -EFAULT;
@@ -3265,9 +3307,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3265 if (!capable(CAP_SYS_ADMIN)) 3307 if (!capable(CAP_SYS_ADMIN))
3266 return -EPERM; 3308 return -EPERM;
3267 3309
3268 if (fs_info->sb->s_flags & MS_RDONLY)
3269 return -EROFS;
3270
3271 ret = mnt_want_write_file(file); 3310 ret = mnt_want_write_file(file);
3272 if (ret) 3311 if (ret)
3273 return ret; 3312 return ret;
@@ -3390,6 +3429,264 @@ out:
3390 return ret; 3429 return ret;
3391} 3430}
3392 3431
3432static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3433{
3434 struct btrfs_ioctl_quota_ctl_args *sa;
3435 struct btrfs_trans_handle *trans = NULL;
3436 int ret;
3437 int err;
3438
3439 if (!capable(CAP_SYS_ADMIN))
3440 return -EPERM;
3441
3442 if (root->fs_info->sb->s_flags & MS_RDONLY)
3443 return -EROFS;
3444
3445 sa = memdup_user(arg, sizeof(*sa));
3446 if (IS_ERR(sa))
3447 return PTR_ERR(sa);
3448
3449 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3450 trans = btrfs_start_transaction(root, 2);
3451 if (IS_ERR(trans)) {
3452 ret = PTR_ERR(trans);
3453 goto out;
3454 }
3455 }
3456
3457 switch (sa->cmd) {
3458 case BTRFS_QUOTA_CTL_ENABLE:
3459 ret = btrfs_quota_enable(trans, root->fs_info);
3460 break;
3461 case BTRFS_QUOTA_CTL_DISABLE:
3462 ret = btrfs_quota_disable(trans, root->fs_info);
3463 break;
3464 case BTRFS_QUOTA_CTL_RESCAN:
3465 ret = btrfs_quota_rescan(root->fs_info);
3466 break;
3467 default:
3468 ret = -EINVAL;
3469 break;
3470 }
3471
3472 if (copy_to_user(arg, sa, sizeof(*sa)))
3473 ret = -EFAULT;
3474
3475 if (trans) {
3476 err = btrfs_commit_transaction(trans, root);
3477 if (err && !ret)
3478 ret = err;
3479 }
3480
3481out:
3482 kfree(sa);
3483 return ret;
3484}
3485
3486static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3487{
3488 struct btrfs_ioctl_qgroup_assign_args *sa;
3489 struct btrfs_trans_handle *trans;
3490 int ret;
3491 int err;
3492
3493 if (!capable(CAP_SYS_ADMIN))
3494 return -EPERM;
3495
3496 if (root->fs_info->sb->s_flags & MS_RDONLY)
3497 return -EROFS;
3498
3499 sa = memdup_user(arg, sizeof(*sa));
3500 if (IS_ERR(sa))
3501 return PTR_ERR(sa);
3502
3503 trans = btrfs_join_transaction(root);
3504 if (IS_ERR(trans)) {
3505 ret = PTR_ERR(trans);
3506 goto out;
3507 }
3508
3509 /* FIXME: check if the IDs really exist */
3510 if (sa->assign) {
3511 ret = btrfs_add_qgroup_relation(trans, root->fs_info,
3512 sa->src, sa->dst);
3513 } else {
3514 ret = btrfs_del_qgroup_relation(trans, root->fs_info,
3515 sa->src, sa->dst);
3516 }
3517
3518 err = btrfs_end_transaction(trans, root);
3519 if (err && !ret)
3520 ret = err;
3521
3522out:
3523 kfree(sa);
3524 return ret;
3525}
3526
3527static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3528{
3529 struct btrfs_ioctl_qgroup_create_args *sa;
3530 struct btrfs_trans_handle *trans;
3531 int ret;
3532 int err;
3533
3534 if (!capable(CAP_SYS_ADMIN))
3535 return -EPERM;
3536
3537 if (root->fs_info->sb->s_flags & MS_RDONLY)
3538 return -EROFS;
3539
3540 sa = memdup_user(arg, sizeof(*sa));
3541 if (IS_ERR(sa))
3542 return PTR_ERR(sa);
3543
3544 trans = btrfs_join_transaction(root);
3545 if (IS_ERR(trans)) {
3546 ret = PTR_ERR(trans);
3547 goto out;
3548 }
3549
3550 /* FIXME: check if the IDs really exist */
3551 if (sa->create) {
3552 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
3553 NULL);
3554 } else {
3555 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
3556 }
3557
3558 err = btrfs_end_transaction(trans, root);
3559 if (err && !ret)
3560 ret = err;
3561
3562out:
3563 kfree(sa);
3564 return ret;
3565}
3566
3567static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3568{
3569 struct btrfs_ioctl_qgroup_limit_args *sa;
3570 struct btrfs_trans_handle *trans;
3571 int ret;
3572 int err;
3573 u64 qgroupid;
3574
3575 if (!capable(CAP_SYS_ADMIN))
3576 return -EPERM;
3577
3578 if (root->fs_info->sb->s_flags & MS_RDONLY)
3579 return -EROFS;
3580
3581 sa = memdup_user(arg, sizeof(*sa));
3582 if (IS_ERR(sa))
3583 return PTR_ERR(sa);
3584
3585 trans = btrfs_join_transaction(root);
3586 if (IS_ERR(trans)) {
3587 ret = PTR_ERR(trans);
3588 goto out;
3589 }
3590
3591 qgroupid = sa->qgroupid;
3592 if (!qgroupid) {
3593 /* take the current subvol as qgroup */
3594 qgroupid = root->root_key.objectid;
3595 }
3596
3597 /* FIXME: check if the IDs really exist */
3598 ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
3599
3600 err = btrfs_end_transaction(trans, root);
3601 if (err && !ret)
3602 ret = err;
3603
3604out:
3605 kfree(sa);
3606 return ret;
3607}
3608
3609static long btrfs_ioctl_set_received_subvol(struct file *file,
3610 void __user *arg)
3611{
3612 struct btrfs_ioctl_received_subvol_args *sa = NULL;
3613 struct inode *inode = fdentry(file)->d_inode;
3614 struct btrfs_root *root = BTRFS_I(inode)->root;
3615 struct btrfs_root_item *root_item = &root->root_item;
3616 struct btrfs_trans_handle *trans;
3617 struct timespec ct = CURRENT_TIME;
3618 int ret = 0;
3619
3620 ret = mnt_want_write_file(file);
3621 if (ret < 0)
3622 return ret;
3623
3624 down_write(&root->fs_info->subvol_sem);
3625
3626 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
3627 ret = -EINVAL;
3628 goto out;
3629 }
3630
3631 if (btrfs_root_readonly(root)) {
3632 ret = -EROFS;
3633 goto out;
3634 }
3635
3636 if (!inode_owner_or_capable(inode)) {
3637 ret = -EACCES;
3638 goto out;
3639 }
3640
3641 sa = memdup_user(arg, sizeof(*sa));
3642 if (IS_ERR(sa)) {
3643 ret = PTR_ERR(sa);
3644 sa = NULL;
3645 goto out;
3646 }
3647
3648 trans = btrfs_start_transaction(root, 1);
3649 if (IS_ERR(trans)) {
3650 ret = PTR_ERR(trans);
3651 trans = NULL;
3652 goto out;
3653 }
3654
3655 sa->rtransid = trans->transid;
3656 sa->rtime.sec = ct.tv_sec;
3657 sa->rtime.nsec = ct.tv_nsec;
3658
3659 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
3660 btrfs_set_root_stransid(root_item, sa->stransid);
3661 btrfs_set_root_rtransid(root_item, sa->rtransid);
3662 root_item->stime.sec = cpu_to_le64(sa->stime.sec);
3663 root_item->stime.nsec = cpu_to_le32(sa->stime.nsec);
3664 root_item->rtime.sec = cpu_to_le64(sa->rtime.sec);
3665 root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec);
3666
3667 ret = btrfs_update_root(trans, root->fs_info->tree_root,
3668 &root->root_key, &root->root_item);
3669 if (ret < 0) {
3670 btrfs_end_transaction(trans, root);
3671 trans = NULL;
3672 goto out;
3673 } else {
3674 ret = btrfs_commit_transaction(trans, root);
3675 if (ret < 0)
3676 goto out;
3677 }
3678
3679 ret = copy_to_user(arg, sa, sizeof(*sa));
3680 if (ret)
3681 ret = -EFAULT;
3682
3683out:
3684 kfree(sa);
3685 up_write(&root->fs_info->subvol_sem);
3686 mnt_drop_write_file(file);
3687 return ret;
3688}
3689
3393long btrfs_ioctl(struct file *file, unsigned int 3690long btrfs_ioctl(struct file *file, unsigned int
3394 cmd, unsigned long arg) 3691 cmd, unsigned long arg)
3395{ 3692{
@@ -3411,6 +3708,8 @@ long btrfs_ioctl(struct file *file, unsigned int
3411 return btrfs_ioctl_snap_create_v2(file, argp, 0); 3708 return btrfs_ioctl_snap_create_v2(file, argp, 0);
3412 case BTRFS_IOC_SUBVOL_CREATE: 3709 case BTRFS_IOC_SUBVOL_CREATE:
3413 return btrfs_ioctl_snap_create(file, argp, 1); 3710 return btrfs_ioctl_snap_create(file, argp, 1);
3711 case BTRFS_IOC_SUBVOL_CREATE_V2:
3712 return btrfs_ioctl_snap_create_v2(file, argp, 1);
3414 case BTRFS_IOC_SNAP_DESTROY: 3713 case BTRFS_IOC_SNAP_DESTROY:
3415 return btrfs_ioctl_snap_destroy(file, argp); 3714 return btrfs_ioctl_snap_destroy(file, argp);
3416 case BTRFS_IOC_SUBVOL_GETFLAGS: 3715 case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -3472,10 +3771,20 @@ long btrfs_ioctl(struct file *file, unsigned int
3472 return btrfs_ioctl_balance_ctl(root, arg); 3771 return btrfs_ioctl_balance_ctl(root, arg);
3473 case BTRFS_IOC_BALANCE_PROGRESS: 3772 case BTRFS_IOC_BALANCE_PROGRESS:
3474 return btrfs_ioctl_balance_progress(root, argp); 3773 return btrfs_ioctl_balance_progress(root, argp);
3774 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
3775 return btrfs_ioctl_set_received_subvol(file, argp);
3776 case BTRFS_IOC_SEND:
3777 return btrfs_ioctl_send(file, argp);
3475 case BTRFS_IOC_GET_DEV_STATS: 3778 case BTRFS_IOC_GET_DEV_STATS:
3476 return btrfs_ioctl_get_dev_stats(root, argp, 0); 3779 return btrfs_ioctl_get_dev_stats(root, argp);
3477 case BTRFS_IOC_GET_AND_RESET_DEV_STATS: 3780 case BTRFS_IOC_QUOTA_CTL:
3478 return btrfs_ioctl_get_dev_stats(root, argp, 1); 3781 return btrfs_ioctl_quota_ctl(root, argp);
3782 case BTRFS_IOC_QGROUP_ASSIGN:
3783 return btrfs_ioctl_qgroup_assign(root, argp);
3784 case BTRFS_IOC_QGROUP_CREATE:
3785 return btrfs_ioctl_qgroup_create(root, argp);
3786 case BTRFS_IOC_QGROUP_LIMIT:
3787 return btrfs_ioctl_qgroup_limit(root, argp);
3479 } 3788 }
3480 3789
3481 return -ENOTTY; 3790 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e440aa653c3..731e2875ab9 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -32,15 +32,46 @@ struct btrfs_ioctl_vol_args {
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
35#define BTRFS_FSID_SIZE 16 36#define BTRFS_FSID_SIZE 16
36#define BTRFS_UUID_SIZE 16 37#define BTRFS_UUID_SIZE 16
37 38
39#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
40
41struct btrfs_qgroup_limit {
42 __u64 flags;
43 __u64 max_rfer;
44 __u64 max_excl;
45 __u64 rsv_rfer;
46 __u64 rsv_excl;
47};
48
49struct btrfs_qgroup_inherit {
50 __u64 flags;
51 __u64 num_qgroups;
52 __u64 num_ref_copies;
53 __u64 num_excl_copies;
54 struct btrfs_qgroup_limit lim;
55 __u64 qgroups[0];
56};
57
58struct btrfs_ioctl_qgroup_limit_args {
59 __u64 qgroupid;
60 struct btrfs_qgroup_limit lim;
61};
62
38#define BTRFS_SUBVOL_NAME_MAX 4039 63#define BTRFS_SUBVOL_NAME_MAX 4039
39struct btrfs_ioctl_vol_args_v2 { 64struct btrfs_ioctl_vol_args_v2 {
40 __s64 fd; 65 __s64 fd;
41 __u64 transid; 66 __u64 transid;
42 __u64 flags; 67 __u64 flags;
43 __u64 unused[4]; 68 union {
69 struct {
70 __u64 size;
71 struct btrfs_qgroup_inherit __user *qgroup_inherit;
72 };
73 __u64 unused[4];
74 };
44 char name[BTRFS_SUBVOL_NAME_MAX + 1]; 75 char name[BTRFS_SUBVOL_NAME_MAX + 1];
45}; 76};
46 77
@@ -285,9 +316,13 @@ enum btrfs_dev_stat_values {
285 BTRFS_DEV_STAT_VALUES_MAX 316 BTRFS_DEV_STAT_VALUES_MAX
286}; 317};
287 318
319/* Reset statistics after reading; needs SYS_ADMIN capability */
320#define BTRFS_DEV_STATS_RESET (1ULL << 0)
321
288struct btrfs_ioctl_get_dev_stats { 322struct btrfs_ioctl_get_dev_stats {
289 __u64 devid; /* in */ 323 __u64 devid; /* in */
290 __u64 nr_items; /* in/out */ 324 __u64 nr_items; /* in/out */
325 __u64 flags; /* in/out */
291 326
292 /* out values: */ 327 /* out values: */
293 __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; 328 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
@@ -295,6 +330,48 @@ struct btrfs_ioctl_get_dev_stats {
295 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ 330 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
296}; 331};
297 332
333#define BTRFS_QUOTA_CTL_ENABLE 1
334#define BTRFS_QUOTA_CTL_DISABLE 2
335#define BTRFS_QUOTA_CTL_RESCAN 3
336struct btrfs_ioctl_quota_ctl_args {
337 __u64 cmd;
338 __u64 status;
339};
340
341struct btrfs_ioctl_qgroup_assign_args {
342 __u64 assign;
343 __u64 src;
344 __u64 dst;
345};
346
347struct btrfs_ioctl_qgroup_create_args {
348 __u64 create;
349 __u64 qgroupid;
350};
351struct btrfs_ioctl_timespec {
352 __u64 sec;
353 __u32 nsec;
354};
355
356struct btrfs_ioctl_received_subvol_args {
357 char uuid[BTRFS_UUID_SIZE]; /* in */
358 __u64 stransid; /* in */
359 __u64 rtransid; /* out */
360 struct btrfs_ioctl_timespec stime; /* in */
361 struct btrfs_ioctl_timespec rtime; /* out */
362 __u64 flags; /* in */
363 __u64 reserved[16]; /* in */
364};
365
366struct btrfs_ioctl_send_args {
367 __s64 send_fd; /* in */
368 __u64 clone_sources_count; /* in */
369 __u64 __user *clone_sources; /* in */
370 __u64 parent_root; /* in */
371 __u64 flags; /* in */
372 __u64 reserved[4]; /* in */
373};
374
298#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 375#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
299 struct btrfs_ioctl_vol_args) 376 struct btrfs_ioctl_vol_args)
300#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 377#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -339,6 +416,8 @@ struct btrfs_ioctl_get_dev_stats {
339#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 416#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
340#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 417#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
341 struct btrfs_ioctl_vol_args_v2) 418 struct btrfs_ioctl_vol_args_v2)
419#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
420 struct btrfs_ioctl_vol_args_v2)
342#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) 421#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
343#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) 422#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
344#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ 423#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
@@ -359,9 +438,19 @@ struct btrfs_ioctl_get_dev_stats {
359 struct btrfs_ioctl_ino_path_args) 438 struct btrfs_ioctl_ino_path_args)
360#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 439#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
361 struct btrfs_ioctl_ino_path_args) 440 struct btrfs_ioctl_ino_path_args)
441#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
442 struct btrfs_ioctl_received_subvol_args)
443#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
444#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
445 struct btrfs_ioctl_vol_args)
446#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
447 struct btrfs_ioctl_quota_ctl_args)
448#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
449 struct btrfs_ioctl_qgroup_assign_args)
450#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
451 struct btrfs_ioctl_qgroup_create_args)
452#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
453 struct btrfs_ioctl_qgroup_limit_args)
362#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
363 struct btrfs_ioctl_get_dev_stats) 455 struct btrfs_ioctl_get_dev_stats)
364#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
365 struct btrfs_ioctl_get_dev_stats)
366
367#endif 456#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 272f911203f..a44eff07480 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -78,13 +78,15 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
78 write_lock(&eb->lock); 78 write_lock(&eb->lock);
79 WARN_ON(atomic_read(&eb->spinning_writers)); 79 WARN_ON(atomic_read(&eb->spinning_writers));
80 atomic_inc(&eb->spinning_writers); 80 atomic_inc(&eb->spinning_writers);
81 if (atomic_dec_and_test(&eb->blocking_writers)) 81 if (atomic_dec_and_test(&eb->blocking_writers) &&
82 waitqueue_active(&eb->write_lock_wq))
82 wake_up(&eb->write_lock_wq); 83 wake_up(&eb->write_lock_wq);
83 } else if (rw == BTRFS_READ_LOCK_BLOCKING) { 84 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
84 BUG_ON(atomic_read(&eb->blocking_readers) == 0); 85 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
85 read_lock(&eb->lock); 86 read_lock(&eb->lock);
86 atomic_inc(&eb->spinning_readers); 87 atomic_inc(&eb->spinning_readers);
87 if (atomic_dec_and_test(&eb->blocking_readers)) 88 if (atomic_dec_and_test(&eb->blocking_readers) &&
89 waitqueue_active(&eb->read_lock_wq))
88 wake_up(&eb->read_lock_wq); 90 wake_up(&eb->read_lock_wq);
89 } 91 }
90 return; 92 return;
@@ -199,7 +201,8 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
199 } 201 }
200 btrfs_assert_tree_read_locked(eb); 202 btrfs_assert_tree_read_locked(eb);
201 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 203 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
202 if (atomic_dec_and_test(&eb->blocking_readers)) 204 if (atomic_dec_and_test(&eb->blocking_readers) &&
205 waitqueue_active(&eb->read_lock_wq))
203 wake_up(&eb->read_lock_wq); 206 wake_up(&eb->read_lock_wq);
204 atomic_dec(&eb->read_locks); 207 atomic_dec(&eb->read_locks);
205} 208}
@@ -247,8 +250,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
247 if (blockers) { 250 if (blockers) {
248 WARN_ON(atomic_read(&eb->spinning_writers)); 251 WARN_ON(atomic_read(&eb->spinning_writers));
249 atomic_dec(&eb->blocking_writers); 252 atomic_dec(&eb->blocking_writers);
250 smp_wmb(); 253 smp_mb();
251 wake_up(&eb->write_lock_wq); 254 if (waitqueue_active(&eb->write_lock_wq))
255 wake_up(&eb->write_lock_wq);
252 } else { 256 } else {
253 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 257 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
254 atomic_dec(&eb->spinning_writers); 258 atomic_dec(&eb->spinning_writers);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 00000000000..bc424ae5a81
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,1571 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26
27#include "ctree.h"
28#include "transaction.h"
29#include "disk-io.h"
30#include "locking.h"
31#include "ulist.h"
32#include "ioctl.h"
33#include "backref.h"
34
35/* TODO XXX FIXME
36 * - subvol delete -> delete when ref goes to 0? delete limits also?
37 * - reorganize keys
38 * - compressed
39 * - sync
40 * - rescan
41 * - copy also limits on subvol creation
42 * - limit
43 * - caches for ulists
44 * - performance benchmarks
45 * - check all ioctl parameters
46 */
47
48/*
49 * one struct for each qgroup, organized in fs_info->qgroup_tree.
50 */
51struct btrfs_qgroup {
52 u64 qgroupid;
53
54 /*
55 * state
56 */
57 u64 rfer; /* referenced */
58 u64 rfer_cmpr; /* referenced compressed */
59 u64 excl; /* exclusive */
60 u64 excl_cmpr; /* exclusive compressed */
61
62 /*
63 * limits
64 */
65 u64 lim_flags; /* which limits are set */
66 u64 max_rfer;
67 u64 max_excl;
68 u64 rsv_rfer;
69 u64 rsv_excl;
70
71 /*
72 * reservation tracking
73 */
74 u64 reserved;
75
76 /*
77 * lists
78 */
79 struct list_head groups; /* groups this group is member of */
80 struct list_head members; /* groups that are members of this group */
81 struct list_head dirty; /* dirty groups */
82 struct rb_node node; /* tree of qgroups */
83
84 /*
85 * temp variables for accounting operations
86 */
87 u64 tag;
88 u64 refcnt;
89};
90
91/*
92 * glue structure to represent the relations between qgroups.
93 */
94struct btrfs_qgroup_list {
95 struct list_head next_group;
96 struct list_head next_member;
97 struct btrfs_qgroup *group;
98 struct btrfs_qgroup *member;
99};
100
101/* must be called with qgroup_lock held */
102static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
103 u64 qgroupid)
104{
105 struct rb_node *n = fs_info->qgroup_tree.rb_node;
106 struct btrfs_qgroup *qgroup;
107
108 while (n) {
109 qgroup = rb_entry(n, struct btrfs_qgroup, node);
110 if (qgroup->qgroupid < qgroupid)
111 n = n->rb_left;
112 else if (qgroup->qgroupid > qgroupid)
113 n = n->rb_right;
114 else
115 return qgroup;
116 }
117 return NULL;
118}
119
120/* must be called with qgroup_lock held */
121static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
122 u64 qgroupid)
123{
124 struct rb_node **p = &fs_info->qgroup_tree.rb_node;
125 struct rb_node *parent = NULL;
126 struct btrfs_qgroup *qgroup;
127
128 while (*p) {
129 parent = *p;
130 qgroup = rb_entry(parent, struct btrfs_qgroup, node);
131
132 if (qgroup->qgroupid < qgroupid)
133 p = &(*p)->rb_left;
134 else if (qgroup->qgroupid > qgroupid)
135 p = &(*p)->rb_right;
136 else
137 return qgroup;
138 }
139
140 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
141 if (!qgroup)
142 return ERR_PTR(-ENOMEM);
143
144 qgroup->qgroupid = qgroupid;
145 INIT_LIST_HEAD(&qgroup->groups);
146 INIT_LIST_HEAD(&qgroup->members);
147 INIT_LIST_HEAD(&qgroup->dirty);
148
149 rb_link_node(&qgroup->node, parent, p);
150 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
151
152 return qgroup;
153}
154
155/* must be called with qgroup_lock held */
156static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
157{
158 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
159 struct btrfs_qgroup_list *list;
160
161 if (!qgroup)
162 return -ENOENT;
163
164 rb_erase(&qgroup->node, &fs_info->qgroup_tree);
165 list_del(&qgroup->dirty);
166
167 while (!list_empty(&qgroup->groups)) {
168 list = list_first_entry(&qgroup->groups,
169 struct btrfs_qgroup_list, next_group);
170 list_del(&list->next_group);
171 list_del(&list->next_member);
172 kfree(list);
173 }
174
175 while (!list_empty(&qgroup->members)) {
176 list = list_first_entry(&qgroup->members,
177 struct btrfs_qgroup_list, next_member);
178 list_del(&list->next_group);
179 list_del(&list->next_member);
180 kfree(list);
181 }
182 kfree(qgroup);
183
184 return 0;
185}
186
187/* must be called with qgroup_lock held */
188static int add_relation_rb(struct btrfs_fs_info *fs_info,
189 u64 memberid, u64 parentid)
190{
191 struct btrfs_qgroup *member;
192 struct btrfs_qgroup *parent;
193 struct btrfs_qgroup_list *list;
194
195 member = find_qgroup_rb(fs_info, memberid);
196 parent = find_qgroup_rb(fs_info, parentid);
197 if (!member || !parent)
198 return -ENOENT;
199
200 list = kzalloc(sizeof(*list), GFP_ATOMIC);
201 if (!list)
202 return -ENOMEM;
203
204 list->group = parent;
205 list->member = member;
206 list_add_tail(&list->next_group, &member->groups);
207 list_add_tail(&list->next_member, &parent->members);
208
209 return 0;
210}
211
212/* must be called with qgroup_lock held */
213static int del_relation_rb(struct btrfs_fs_info *fs_info,
214 u64 memberid, u64 parentid)
215{
216 struct btrfs_qgroup *member;
217 struct btrfs_qgroup *parent;
218 struct btrfs_qgroup_list *list;
219
220 member = find_qgroup_rb(fs_info, memberid);
221 parent = find_qgroup_rb(fs_info, parentid);
222 if (!member || !parent)
223 return -ENOENT;
224
225 list_for_each_entry(list, &member->groups, next_group) {
226 if (list->group == parent) {
227 list_del(&list->next_group);
228 list_del(&list->next_member);
229 kfree(list);
230 return 0;
231 }
232 }
233 return -ENOENT;
234}
235
236/*
237 * The full config is read in one go, only called from open_ctree()
238 * It doesn't use any locking, as at this point we're still single-threaded
239 */
240int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
241{
242 struct btrfs_key key;
243 struct btrfs_key found_key;
244 struct btrfs_root *quota_root = fs_info->quota_root;
245 struct btrfs_path *path = NULL;
246 struct extent_buffer *l;
247 int slot;
248 int ret = 0;
249 u64 flags = 0;
250
251 if (!fs_info->quota_enabled)
252 return 0;
253
254 path = btrfs_alloc_path();
255 if (!path) {
256 ret = -ENOMEM;
257 goto out;
258 }
259
260 /* default this to quota off, in case no status key is found */
261 fs_info->qgroup_flags = 0;
262
263 /*
264 * pass 1: read status, all qgroup infos and limits
265 */
266 key.objectid = 0;
267 key.type = 0;
268 key.offset = 0;
269 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
270 if (ret)
271 goto out;
272
273 while (1) {
274 struct btrfs_qgroup *qgroup;
275
276 slot = path->slots[0];
277 l = path->nodes[0];
278 btrfs_item_key_to_cpu(l, &found_key, slot);
279
280 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
281 struct btrfs_qgroup_status_item *ptr;
282
283 ptr = btrfs_item_ptr(l, slot,
284 struct btrfs_qgroup_status_item);
285
286 if (btrfs_qgroup_status_version(l, ptr) !=
287 BTRFS_QGROUP_STATUS_VERSION) {
288 printk(KERN_ERR
289 "btrfs: old qgroup version, quota disabled\n");
290 goto out;
291 }
292 if (btrfs_qgroup_status_generation(l, ptr) !=
293 fs_info->generation) {
294 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
295 printk(KERN_ERR
296 "btrfs: qgroup generation mismatch, "
297 "marked as inconsistent\n");
298 }
299 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
300 ptr);
301 /* FIXME read scan element */
302 goto next1;
303 }
304
305 if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
306 found_key.type != BTRFS_QGROUP_LIMIT_KEY)
307 goto next1;
308
309 qgroup = find_qgroup_rb(fs_info, found_key.offset);
310 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
311 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
312 printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
313 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
314 }
315 if (!qgroup) {
316 qgroup = add_qgroup_rb(fs_info, found_key.offset);
317 if (IS_ERR(qgroup)) {
318 ret = PTR_ERR(qgroup);
319 goto out;
320 }
321 }
322 switch (found_key.type) {
323 case BTRFS_QGROUP_INFO_KEY: {
324 struct btrfs_qgroup_info_item *ptr;
325
326 ptr = btrfs_item_ptr(l, slot,
327 struct btrfs_qgroup_info_item);
328 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
329 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
330 qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
331 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
332 /* generation currently unused */
333 break;
334 }
335 case BTRFS_QGROUP_LIMIT_KEY: {
336 struct btrfs_qgroup_limit_item *ptr;
337
338 ptr = btrfs_item_ptr(l, slot,
339 struct btrfs_qgroup_limit_item);
340 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
341 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
342 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
343 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
344 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
345 break;
346 }
347 }
348next1:
349 ret = btrfs_next_item(quota_root, path);
350 if (ret < 0)
351 goto out;
352 if (ret)
353 break;
354 }
355 btrfs_release_path(path);
356
357 /*
358 * pass 2: read all qgroup relations
359 */
360 key.objectid = 0;
361 key.type = BTRFS_QGROUP_RELATION_KEY;
362 key.offset = 0;
363 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
364 if (ret)
365 goto out;
366 while (1) {
367 slot = path->slots[0];
368 l = path->nodes[0];
369 btrfs_item_key_to_cpu(l, &found_key, slot);
370
371 if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
372 goto next2;
373
374 if (found_key.objectid > found_key.offset) {
375 /* parent <- member, not needed to build config */
376 /* FIXME should we omit the key completely? */
377 goto next2;
378 }
379
380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset);
382 if (ret)
383 goto out;
384next2:
385 ret = btrfs_next_item(quota_root, path);
386 if (ret < 0)
387 goto out;
388 if (ret)
389 break;
390 }
391out:
392 fs_info->qgroup_flags |= flags;
393 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
394 fs_info->quota_enabled = 0;
395 fs_info->pending_quota_state = 0;
396 }
397 btrfs_free_path(path);
398
399 return ret < 0 ? ret : 0;
400}
401
402/*
403 * This is only called from close_ctree() or open_ctree(), both in single-
404 * treaded paths. Clean up the in-memory structures. No locking needed.
405 */
406void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
407{
408 struct rb_node *n;
409 struct btrfs_qgroup *qgroup;
410 struct btrfs_qgroup_list *list;
411
412 while ((n = rb_first(&fs_info->qgroup_tree))) {
413 qgroup = rb_entry(n, struct btrfs_qgroup, node);
414 rb_erase(n, &fs_info->qgroup_tree);
415
416 WARN_ON(!list_empty(&qgroup->dirty));
417
418 while (!list_empty(&qgroup->groups)) {
419 list = list_first_entry(&qgroup->groups,
420 struct btrfs_qgroup_list,
421 next_group);
422 list_del(&list->next_group);
423 list_del(&list->next_member);
424 kfree(list);
425 }
426
427 while (!list_empty(&qgroup->members)) {
428 list = list_first_entry(&qgroup->members,
429 struct btrfs_qgroup_list,
430 next_member);
431 list_del(&list->next_group);
432 list_del(&list->next_member);
433 kfree(list);
434 }
435 kfree(qgroup);
436 }
437}
438
439static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
440 struct btrfs_root *quota_root,
441 u64 src, u64 dst)
442{
443 int ret;
444 struct btrfs_path *path;
445 struct btrfs_key key;
446
447 path = btrfs_alloc_path();
448 if (!path)
449 return -ENOMEM;
450
451 key.objectid = src;
452 key.type = BTRFS_QGROUP_RELATION_KEY;
453 key.offset = dst;
454
455 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
456
457 btrfs_mark_buffer_dirty(path->nodes[0]);
458
459 btrfs_free_path(path);
460 return ret;
461}
462
463static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
464 struct btrfs_root *quota_root,
465 u64 src, u64 dst)
466{
467 int ret;
468 struct btrfs_path *path;
469 struct btrfs_key key;
470
471 path = btrfs_alloc_path();
472 if (!path)
473 return -ENOMEM;
474
475 key.objectid = src;
476 key.type = BTRFS_QGROUP_RELATION_KEY;
477 key.offset = dst;
478
479 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
480 if (ret < 0)
481 goto out;
482
483 if (ret > 0) {
484 ret = -ENOENT;
485 goto out;
486 }
487
488 ret = btrfs_del_item(trans, quota_root, path);
489out:
490 btrfs_free_path(path);
491 return ret;
492}
493
494static int add_qgroup_item(struct btrfs_trans_handle *trans,
495 struct btrfs_root *quota_root, u64 qgroupid)
496{
497 int ret;
498 struct btrfs_path *path;
499 struct btrfs_qgroup_info_item *qgroup_info;
500 struct btrfs_qgroup_limit_item *qgroup_limit;
501 struct extent_buffer *leaf;
502 struct btrfs_key key;
503
504 path = btrfs_alloc_path();
505 if (!path)
506 return -ENOMEM;
507
508 key.objectid = 0;
509 key.type = BTRFS_QGROUP_INFO_KEY;
510 key.offset = qgroupid;
511
512 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
513 sizeof(*qgroup_info));
514 if (ret)
515 goto out;
516
517 leaf = path->nodes[0];
518 qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
519 struct btrfs_qgroup_info_item);
520 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
521 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
522 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
523 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
524 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
525
526 btrfs_mark_buffer_dirty(leaf);
527
528 btrfs_release_path(path);
529
530 key.type = BTRFS_QGROUP_LIMIT_KEY;
531 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
532 sizeof(*qgroup_limit));
533 if (ret)
534 goto out;
535
536 leaf = path->nodes[0];
537 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
538 struct btrfs_qgroup_limit_item);
539 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
540 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
541 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
542 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
543 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
544
545 btrfs_mark_buffer_dirty(leaf);
546
547 ret = 0;
548out:
549 btrfs_free_path(path);
550 return ret;
551}
552
553static int del_qgroup_item(struct btrfs_trans_handle *trans,
554 struct btrfs_root *quota_root, u64 qgroupid)
555{
556 int ret;
557 struct btrfs_path *path;
558 struct btrfs_key key;
559
560 path = btrfs_alloc_path();
561 if (!path)
562 return -ENOMEM;
563
564 key.objectid = 0;
565 key.type = BTRFS_QGROUP_INFO_KEY;
566 key.offset = qgroupid;
567 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
568 if (ret < 0)
569 goto out;
570
571 if (ret > 0) {
572 ret = -ENOENT;
573 goto out;
574 }
575
576 ret = btrfs_del_item(trans, quota_root, path);
577 if (ret)
578 goto out;
579
580 btrfs_release_path(path);
581
582 key.type = BTRFS_QGROUP_LIMIT_KEY;
583 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
584 if (ret < 0)
585 goto out;
586
587 if (ret > 0) {
588 ret = -ENOENT;
589 goto out;
590 }
591
592 ret = btrfs_del_item(trans, quota_root, path);
593
594out:
595 btrfs_free_path(path);
596 return ret;
597}
598
599static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
600 struct btrfs_root *root, u64 qgroupid,
601 u64 flags, u64 max_rfer, u64 max_excl,
602 u64 rsv_rfer, u64 rsv_excl)
603{
604 struct btrfs_path *path;
605 struct btrfs_key key;
606 struct extent_buffer *l;
607 struct btrfs_qgroup_limit_item *qgroup_limit;
608 int ret;
609 int slot;
610
611 key.objectid = 0;
612 key.type = BTRFS_QGROUP_LIMIT_KEY;
613 key.offset = qgroupid;
614
615 path = btrfs_alloc_path();
616 BUG_ON(!path);
617 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
618 if (ret > 0)
619 ret = -ENOENT;
620
621 if (ret)
622 goto out;
623
624 l = path->nodes[0];
625 slot = path->slots[0];
626 qgroup_limit = btrfs_item_ptr(l, path->slots[0],
627 struct btrfs_qgroup_limit_item);
628 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
629 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
630 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
631 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
632 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
633
634 btrfs_mark_buffer_dirty(l);
635
636out:
637 btrfs_free_path(path);
638 return ret;
639}
640
641static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
642 struct btrfs_root *root,
643 struct btrfs_qgroup *qgroup)
644{
645 struct btrfs_path *path;
646 struct btrfs_key key;
647 struct extent_buffer *l;
648 struct btrfs_qgroup_info_item *qgroup_info;
649 int ret;
650 int slot;
651
652 key.objectid = 0;
653 key.type = BTRFS_QGROUP_INFO_KEY;
654 key.offset = qgroup->qgroupid;
655
656 path = btrfs_alloc_path();
657 BUG_ON(!path);
658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
659 if (ret > 0)
660 ret = -ENOENT;
661
662 if (ret)
663 goto out;
664
665 l = path->nodes[0];
666 slot = path->slots[0];
667 qgroup_info = btrfs_item_ptr(l, path->slots[0],
668 struct btrfs_qgroup_info_item);
669 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
670 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
671 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
672 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
673 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
674
675 btrfs_mark_buffer_dirty(l);
676
677out:
678 btrfs_free_path(path);
679 return ret;
680}
681
682static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
683 struct btrfs_fs_info *fs_info,
684 struct btrfs_root *root)
685{
686 struct btrfs_path *path;
687 struct btrfs_key key;
688 struct extent_buffer *l;
689 struct btrfs_qgroup_status_item *ptr;
690 int ret;
691 int slot;
692
693 key.objectid = 0;
694 key.type = BTRFS_QGROUP_STATUS_KEY;
695 key.offset = 0;
696
697 path = btrfs_alloc_path();
698 BUG_ON(!path);
699 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
700 if (ret > 0)
701 ret = -ENOENT;
702
703 if (ret)
704 goto out;
705
706 l = path->nodes[0];
707 slot = path->slots[0];
708 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
709 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
710 btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
711 /* XXX scan */
712
713 btrfs_mark_buffer_dirty(l);
714
715out:
716 btrfs_free_path(path);
717 return ret;
718}
719
720/*
721 * called with qgroup_lock held
722 */
723static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
724 struct btrfs_root *root)
725{
726 struct btrfs_path *path;
727 struct btrfs_key key;
728 int ret;
729
730 if (!root)
731 return -EINVAL;
732
733 path = btrfs_alloc_path();
734 if (!path)
735 return -ENOMEM;
736
737 while (1) {
738 key.objectid = 0;
739 key.offset = 0;
740 key.type = 0;
741
742 path->leave_spinning = 1;
743 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
744 if (ret > 0) {
745 if (path->slots[0] == 0)
746 break;
747 path->slots[0]--;
748 } else if (ret < 0) {
749 break;
750 }
751
752 ret = btrfs_del_item(trans, root, path);
753 if (ret)
754 goto out;
755 btrfs_release_path(path);
756 }
757 ret = 0;
758out:
759 root->fs_info->pending_quota_state = 0;
760 btrfs_free_path(path);
761 return ret;
762}
763
764int btrfs_quota_enable(struct btrfs_trans_handle *trans,
765 struct btrfs_fs_info *fs_info)
766{
767 struct btrfs_root *quota_root;
768 struct btrfs_path *path = NULL;
769 struct btrfs_qgroup_status_item *ptr;
770 struct extent_buffer *leaf;
771 struct btrfs_key key;
772 int ret = 0;
773
774 spin_lock(&fs_info->qgroup_lock);
775 if (fs_info->quota_root) {
776 fs_info->pending_quota_state = 1;
777 spin_unlock(&fs_info->qgroup_lock);
778 goto out;
779 }
780 spin_unlock(&fs_info->qgroup_lock);
781
782 /*
783 * initially create the quota tree
784 */
785 quota_root = btrfs_create_tree(trans, fs_info,
786 BTRFS_QUOTA_TREE_OBJECTID);
787 if (IS_ERR(quota_root)) {
788 ret = PTR_ERR(quota_root);
789 goto out;
790 }
791
792 path = btrfs_alloc_path();
793 if (!path)
794 return -ENOMEM;
795
796 key.objectid = 0;
797 key.type = BTRFS_QGROUP_STATUS_KEY;
798 key.offset = 0;
799
800 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
801 sizeof(*ptr));
802 if (ret)
803 goto out;
804
805 leaf = path->nodes[0];
806 ptr = btrfs_item_ptr(leaf, path->slots[0],
807 struct btrfs_qgroup_status_item);
808 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
809 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
810 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
811 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
812 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
813 btrfs_set_qgroup_status_scan(leaf, ptr, 0);
814
815 btrfs_mark_buffer_dirty(leaf);
816
817 spin_lock(&fs_info->qgroup_lock);
818 fs_info->quota_root = quota_root;
819 fs_info->pending_quota_state = 1;
820 spin_unlock(&fs_info->qgroup_lock);
821out:
822 btrfs_free_path(path);
823 return ret;
824}
825
826int btrfs_quota_disable(struct btrfs_trans_handle *trans,
827 struct btrfs_fs_info *fs_info)
828{
829 struct btrfs_root *tree_root = fs_info->tree_root;
830 struct btrfs_root *quota_root;
831 int ret = 0;
832
833 spin_lock(&fs_info->qgroup_lock);
834 fs_info->quota_enabled = 0;
835 fs_info->pending_quota_state = 0;
836 quota_root = fs_info->quota_root;
837 fs_info->quota_root = NULL;
838 btrfs_free_qgroup_config(fs_info);
839 spin_unlock(&fs_info->qgroup_lock);
840
841 if (!quota_root)
842 return -EINVAL;
843
844 ret = btrfs_clean_quota_tree(trans, quota_root);
845 if (ret)
846 goto out;
847
848 ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
849 if (ret)
850 goto out;
851
852 list_del(&quota_root->dirty_list);
853
854 btrfs_tree_lock(quota_root->node);
855 clean_tree_block(trans, tree_root, quota_root->node);
856 btrfs_tree_unlock(quota_root->node);
857 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
858
859 free_extent_buffer(quota_root->node);
860 free_extent_buffer(quota_root->commit_root);
861 kfree(quota_root);
862out:
863 return ret;
864}
865
/*
 * Placeholder for a quota rescan. Not implemented yet: does nothing and
 * always returns 0.
 */
int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
{
	int ret = 0;	/* FIXME: implement the rescan */

	return ret;
}
871
872int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
873 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
874{
875 struct btrfs_root *quota_root;
876 int ret = 0;
877
878 quota_root = fs_info->quota_root;
879 if (!quota_root)
880 return -EINVAL;
881
882 ret = add_qgroup_relation_item(trans, quota_root, src, dst);
883 if (ret)
884 return ret;
885
886 ret = add_qgroup_relation_item(trans, quota_root, dst, src);
887 if (ret) {
888 del_qgroup_relation_item(trans, quota_root, src, dst);
889 return ret;
890 }
891
892 spin_lock(&fs_info->qgroup_lock);
893 ret = add_relation_rb(quota_root->fs_info, src, dst);
894 spin_unlock(&fs_info->qgroup_lock);
895
896 return ret;
897}
898
899int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
900 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
901{
902 struct btrfs_root *quota_root;
903 int ret = 0;
904 int err;
905
906 quota_root = fs_info->quota_root;
907 if (!quota_root)
908 return -EINVAL;
909
910 ret = del_qgroup_relation_item(trans, quota_root, src, dst);
911 err = del_qgroup_relation_item(trans, quota_root, dst, src);
912 if (err && !ret)
913 ret = err;
914
915 spin_lock(&fs_info->qgroup_lock);
916 del_relation_rb(fs_info, src, dst);
917
918 spin_unlock(&fs_info->qgroup_lock);
919
920 return ret;
921}
922
923int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
924 struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
925{
926 struct btrfs_root *quota_root;
927 struct btrfs_qgroup *qgroup;
928 int ret = 0;
929
930 quota_root = fs_info->quota_root;
931 if (!quota_root)
932 return -EINVAL;
933
934 ret = add_qgroup_item(trans, quota_root, qgroupid);
935
936 spin_lock(&fs_info->qgroup_lock);
937 qgroup = add_qgroup_rb(fs_info, qgroupid);
938 spin_unlock(&fs_info->qgroup_lock);
939
940 if (IS_ERR(qgroup))
941 ret = PTR_ERR(qgroup);
942
943 return ret;
944}
945
946int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
947 struct btrfs_fs_info *fs_info, u64 qgroupid)
948{
949 struct btrfs_root *quota_root;
950 int ret = 0;
951
952 quota_root = fs_info->quota_root;
953 if (!quota_root)
954 return -EINVAL;
955
956 ret = del_qgroup_item(trans, quota_root, qgroupid);
957
958 spin_lock(&fs_info->qgroup_lock);
959 del_qgroup_rb(quota_root->fs_info, qgroupid);
960
961 spin_unlock(&fs_info->qgroup_lock);
962
963 return ret;
964}
965
966int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
967 struct btrfs_fs_info *fs_info, u64 qgroupid,
968 struct btrfs_qgroup_limit *limit)
969{
970 struct btrfs_root *quota_root = fs_info->quota_root;
971 struct btrfs_qgroup *qgroup;
972 int ret = 0;
973
974 if (!quota_root)
975 return -EINVAL;
976
977 ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
978 limit->flags, limit->max_rfer,
979 limit->max_excl, limit->rsv_rfer,
980 limit->rsv_excl);
981 if (ret) {
982 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
983 printk(KERN_INFO "unable to update quota limit for %llu\n",
984 (unsigned long long)qgroupid);
985 }
986
987 spin_lock(&fs_info->qgroup_lock);
988
989 qgroup = find_qgroup_rb(fs_info, qgroupid);
990 if (!qgroup) {
991 ret = -ENOENT;
992 goto unlock;
993 }
994 qgroup->lim_flags = limit->flags;
995 qgroup->max_rfer = limit->max_rfer;
996 qgroup->max_excl = limit->max_excl;
997 qgroup->rsv_rfer = limit->rsv_rfer;
998 qgroup->rsv_excl = limit->rsv_excl;
999
1000unlock:
1001 spin_unlock(&fs_info->qgroup_lock);
1002
1003 return ret;
1004}
1005
1006static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1007 struct btrfs_qgroup *qgroup)
1008{
1009 if (list_empty(&qgroup->dirty))
1010 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1011}
1012
1013/*
1014 * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
1015 * the modification into a list that's later used by btrfs_end_transaction to
1016 * pass the recorded modifications on to btrfs_qgroup_account_ref.
1017 */
1018int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_delayed_ref_node *node,
1020 struct btrfs_delayed_extent_op *extent_op)
1021{
1022 struct qgroup_update *u;
1023
1024 BUG_ON(!trans->delayed_ref_elem.seq);
1025 u = kmalloc(sizeof(*u), GFP_NOFS);
1026 if (!u)
1027 return -ENOMEM;
1028
1029 u->node = node;
1030 u->extent_op = extent_op;
1031 list_add_tail(&u->list, &trans->qgroup_ref_list);
1032
1033 return 0;
1034}
1035
1036/*
1037 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1038 * from the fs. First, all roots referencing the extent are searched, and
1039 * then the space is accounted accordingly to the different roots. The
1040 * accounting algorithm works in 3 steps documented inline.
1041 */
1042int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1043 struct btrfs_fs_info *fs_info,
1044 struct btrfs_delayed_ref_node *node,
1045 struct btrfs_delayed_extent_op *extent_op)
1046{
1047 struct btrfs_key ins;
1048 struct btrfs_root *quota_root;
1049 u64 ref_root;
1050 struct btrfs_qgroup *qgroup;
1051 struct ulist_node *unode;
1052 struct ulist *roots = NULL;
1053 struct ulist *tmp = NULL;
1054 struct ulist_iterator uiter;
1055 u64 seq;
1056 int ret = 0;
1057 int sgn;
1058
1059 if (!fs_info->quota_enabled)
1060 return 0;
1061
1062 BUG_ON(!fs_info->quota_root);
1063
1064 ins.objectid = node->bytenr;
1065 ins.offset = node->num_bytes;
1066 ins.type = BTRFS_EXTENT_ITEM_KEY;
1067
1068 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1069 node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
1070 struct btrfs_delayed_tree_ref *ref;
1071 ref = btrfs_delayed_node_to_tree_ref(node);
1072 ref_root = ref->root;
1073 } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1074 node->type == BTRFS_SHARED_DATA_REF_KEY) {
1075 struct btrfs_delayed_data_ref *ref;
1076 ref = btrfs_delayed_node_to_data_ref(node);
1077 ref_root = ref->root;
1078 } else {
1079 BUG();
1080 }
1081
1082 if (!is_fstree(ref_root)) {
1083 /*
1084 * non-fs-trees are not being accounted
1085 */
1086 return 0;
1087 }
1088
1089 switch (node->action) {
1090 case BTRFS_ADD_DELAYED_REF:
1091 case BTRFS_ADD_DELAYED_EXTENT:
1092 sgn = 1;
1093 break;
1094 case BTRFS_DROP_DELAYED_REF:
1095 sgn = -1;
1096 break;
1097 case BTRFS_UPDATE_DELAYED_HEAD:
1098 return 0;
1099 default:
1100 BUG();
1101 }
1102
1103 /*
1104 * the delayed ref sequence number we pass depends on the direction of
1105 * the operation. for add operations, we pass (node->seq - 1) to skip
1106 * the delayed ref's current sequence number, because we need the state
1107 * of the tree before the add operation. for delete operations, we pass
1108 * (node->seq) to include the delayed ref's current sequence number,
1109 * because we need the state of the tree after the delete operation.
1110 */
1111 ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
1112 sgn > 0 ? node->seq - 1 : node->seq, &roots);
1113 if (ret < 0)
1114 goto out;
1115
1116 spin_lock(&fs_info->qgroup_lock);
1117 quota_root = fs_info->quota_root;
1118 if (!quota_root)
1119 goto unlock;
1120
1121 qgroup = find_qgroup_rb(fs_info, ref_root);
1122 if (!qgroup)
1123 goto unlock;
1124
1125 /*
1126 * step 1: for each old ref, visit all nodes once and inc refcnt
1127 */
1128 tmp = ulist_alloc(GFP_ATOMIC);
1129 if (!tmp) {
1130 ret = -ENOMEM;
1131 goto unlock;
1132 }
1133 seq = fs_info->qgroup_seq;
1134 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1135
1136 ULIST_ITER_INIT(&uiter);
1137 while ((unode = ulist_next(roots, &uiter))) {
1138 struct ulist_node *tmp_unode;
1139 struct ulist_iterator tmp_uiter;
1140 struct btrfs_qgroup *qg;
1141
1142 qg = find_qgroup_rb(fs_info, unode->val);
1143 if (!qg)
1144 continue;
1145
1146 ulist_reinit(tmp);
1147 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist;
1152
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux;
1154 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1;
1156 else
1157 ++qg->refcnt;
1158
1159 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group,
1162 GFP_ATOMIC);
1163 }
1164 }
1165 }
1166
1167 /*
1168 * step 2: walk from the new root
1169 */
1170 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist;
1176
1177 qg = (struct btrfs_qgroup *)unode->aux;
1178 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes;
1181 qg->rfer_cmpr += sgn * node->num_bytes;
1182 if (roots->nnodes == 0) {
1183 qg->excl += sgn * node->num_bytes;
1184 qg->excl_cmpr += sgn * node->num_bytes;
1185 }
1186 qgroup_dirty(fs_info, qg);
1187 }
1188 WARN_ON(qg->tag >= seq);
1189 qg->tag = seq;
1190
1191 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC);
1194 }
1195 }
1196
1197 /*
1198 * step 3: walk again from old refs
1199 */
1200 ULIST_ITER_INIT(&uiter);
1201 while ((unode = ulist_next(roots, &uiter))) {
1202 struct btrfs_qgroup *qg;
1203 struct ulist_node *tmp_unode;
1204 struct ulist_iterator tmp_uiter;
1205
1206 qg = find_qgroup_rb(fs_info, unode->val);
1207 if (!qg)
1208 continue;
1209
1210 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist;
1215
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux;
1217 if (qg->tag == seq)
1218 continue;
1219
1220 if (qg->refcnt - seq == roots->nnodes) {
1221 qg->excl -= sgn * node->num_bytes;
1222 qg->excl_cmpr -= sgn * node->num_bytes;
1223 qgroup_dirty(fs_info, qg);
1224 }
1225
1226 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group,
1229 GFP_ATOMIC);
1230 }
1231 }
1232 }
1233 ret = 0;
1234unlock:
1235 spin_unlock(&fs_info->qgroup_lock);
1236out:
1237 ulist_free(roots);
1238 ulist_free(tmp);
1239
1240 return ret;
1241}
1242
1243/*
1244 * called from commit_transaction. Writes all changed qgroups to disk.
1245 */
1246int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1247 struct btrfs_fs_info *fs_info)
1248{
1249 struct btrfs_root *quota_root = fs_info->quota_root;
1250 int ret = 0;
1251
1252 if (!quota_root)
1253 goto out;
1254
1255 fs_info->quota_enabled = fs_info->pending_quota_state;
1256
1257 spin_lock(&fs_info->qgroup_lock);
1258 while (!list_empty(&fs_info->dirty_qgroups)) {
1259 struct btrfs_qgroup *qgroup;
1260 qgroup = list_first_entry(&fs_info->dirty_qgroups,
1261 struct btrfs_qgroup, dirty);
1262 list_del_init(&qgroup->dirty);
1263 spin_unlock(&fs_info->qgroup_lock);
1264 ret = update_qgroup_info_item(trans, quota_root, qgroup);
1265 if (ret)
1266 fs_info->qgroup_flags |=
1267 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1268 spin_lock(&fs_info->qgroup_lock);
1269 }
1270 if (fs_info->quota_enabled)
1271 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
1272 else
1273 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1274 spin_unlock(&fs_info->qgroup_lock);
1275
1276 ret = update_qgroup_status_item(trans, fs_info, quota_root);
1277 if (ret)
1278 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1279
1280out:
1281
1282 return ret;
1283}
1284
1285/*
1286 * copy the accounting information between qgroups. This is necessary when a
1287 * snapshot or a subvolume is created
 *
 * @srcid:    id of the source subvolume; 0 means there is no source and the
 *            copy from a source qgroup is skipped
 * @objectid: id of the newly created subvolume; a qgroup with this id is
 *            created
 * @inherit:  optional. When set, it is followed in memory by an array of u64
 *            qgroup ids: num_qgroups ids the new qgroup is related to, then
 *            num_ref_copies (src, dst) pairs, then num_excl_copies (src, dst)
 *            pairs.
 *
 * Returns 0 when quotas are disabled, -EINVAL when quotas are enabled but
 * there is no quota root or a copy pair names an unknown qgroup, otherwise 0
 * or the error of the failing helper.
1288 */
1289int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1290 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
1291 struct btrfs_qgroup_inherit *inherit)
1292{
1293 int ret = 0;
1294 int i;
1295 u64 *i_qgroups;
1296 struct btrfs_root *quota_root = fs_info->quota_root;
1297 struct btrfs_qgroup *srcgroup;
1298 struct btrfs_qgroup *dstgroup;
1299 u32 level_size = 0;
1300
1301 if (!fs_info->quota_enabled)
1302 return 0;
1303
1304 if (!quota_root)
1305 return -EINVAL;
1306
1307 /*
1308 * create a tracking group for the subvol itself
1309 */
1310 ret = add_qgroup_item(trans, quota_root, objectid);
1311 if (ret)
1312 goto out;
1313
 /* optionally write the limits supplied by the caller for the new qgroup */
1314 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
1315 ret = update_qgroup_limit_item(trans, quota_root, objectid,
1316 inherit->lim.flags,
1317 inherit->lim.max_rfer,
1318 inherit->lim.max_excl,
1319 inherit->lim.rsv_rfer,
1320 inherit->lim.rsv_excl);
1321 if (ret)
1322 goto out;
1323 }
1324
 /*
  * With a source subvolume, look up its root and remember the size of its
  * root node in level_size; the value is used to adjust the counters below.
  */
1325 if (srcid) {
1326 struct btrfs_root *srcroot;
1327 struct btrfs_key srckey;
1328 int srcroot_level;
1329
1330 srckey.objectid = srcid;
1331 srckey.type = BTRFS_ROOT_ITEM_KEY;
1332 srckey.offset = (u64)-1;
1333 srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
1334 if (IS_ERR(srcroot)) {
1335 ret = PTR_ERR(srcroot);
1336 goto out;
1337 }
1338
 /* srcroot->node is read under rcu_read_lock() */
1339 rcu_read_lock();
1340 srcroot_level = btrfs_header_level(srcroot->node);
1341 level_size = btrfs_level_size(srcroot, srcroot_level);
1342 rcu_read_unlock();
1343 }
1344
1345 /*
1346 * add qgroup to all inherited groups
 * (a relation item is inserted in both directions for every listed id)
1347 */
1348 if (inherit) {
1349 i_qgroups = (u64 *)(inherit + 1);
1350 for (i = 0; i < inherit->num_qgroups; ++i) {
1351 ret = add_qgroup_relation_item(trans, quota_root,
1352 objectid, *i_qgroups);
1353 if (ret)
1354 goto out;
1355 ret = add_qgroup_relation_item(trans, quota_root,
1356 *i_qgroups, objectid);
1357 if (ret)
1358 goto out;
1359 ++i_qgroups;
1360 }
1361 }
1362
1363
 /* everything below works on the in-memory qgroups under qgroup_lock */
1364 spin_lock(&fs_info->qgroup_lock);
1365
 /*
  * NOTE(review): only a NULL return is handled and ret is still 0 here, so
  * this path reports success. Confirm that add_qgroup_rb() cannot return an
  * ERR_PTR and that returning 0 is intended.
  */
1366 dstgroup = add_qgroup_rb(fs_info, objectid);
1367 if (!dstgroup)
1368 goto unlock;
1369
1370 if (srcid) {
1371 srcgroup = find_qgroup_rb(fs_info, srcid);
 /* a missing source qgroup also leaves with ret == 0 */
1372 if (!srcgroup)
1373 goto unlock;
1374 dstgroup->rfer = srcgroup->rfer - level_size;
1375 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
1376 srcgroup->excl = level_size;
1377 srcgroup->excl_cmpr = level_size;
1378 qgroup_dirty(fs_info, dstgroup);
1379 qgroup_dirty(fs_info, srcgroup);
1380 }
1381
1382 if (!inherit)
1383 goto unlock;
1384
 /* restart at the first id and add the in-memory relations */
1385 i_qgroups = (u64 *)(inherit + 1);
1386 for (i = 0; i < inherit->num_qgroups; ++i) {
1387 ret = add_relation_rb(quota_root->fs_info, objectid,
1388 *i_qgroups);
1389 if (ret)
1390 goto unlock;
1391 ++i_qgroups;
1392 }
1393
 /*
  * i_qgroups now points behind the relation ids, at the (src, dst) pairs.
  * NOTE(review): dst is modified in both copy loops without qgroup_dirty();
  * confirm the new values are written out some other way.
  */
1394 for (i = 0; i < inherit->num_ref_copies; ++i) {
1395 struct btrfs_qgroup *src;
1396 struct btrfs_qgroup *dst;
1397
1398 src = find_qgroup_rb(fs_info, i_qgroups[0]);
1399 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
1400
1401 if (!src || !dst) {
1402 ret = -EINVAL;
1403 goto unlock;
1404 }
1405
1406 dst->rfer = src->rfer - level_size;
1407 dst->rfer_cmpr = src->rfer_cmpr - level_size;
1408 i_qgroups += 2;
1409 }
1410 for (i = 0; i < inherit->num_excl_copies; ++i) {
1411 struct btrfs_qgroup *src;
1412 struct btrfs_qgroup *dst;
1413
1414 src = find_qgroup_rb(fs_info, i_qgroups[0]);
1415 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
1416
1417 if (!src || !dst) {
1418 ret = -EINVAL;
1419 goto unlock;
1420 }
1421
1422 dst->excl = src->excl + level_size;
1423 dst->excl_cmpr = src->excl_cmpr + level_size;
1424 i_qgroups += 2;
1425 }
1426
1427unlock:
1428 spin_unlock(&fs_info->qgroup_lock);
1429out:
1430 return ret;
1431}
1432
1433/*
1434 * reserve some space for a qgroup and all its parents. The reservation takes
1435 * place with start_transaction or dealloc_reserve, similar to ENOSPC
1436 * accounting. If not enough space is available, EDQUOT is returned.
1437 * We assume that the requested space is new for all qgroups.
1438 */
1439int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1440{
1441 struct btrfs_root *quota_root;
1442 struct btrfs_qgroup *qgroup;
1443 struct btrfs_fs_info *fs_info = root->fs_info;
1444 u64 ref_root = root->root_key.objectid;
1445 int ret = 0;
1446 struct ulist *ulist = NULL;
1447 struct ulist_node *unode;
1448 struct ulist_iterator uiter;
1449
1450 if (!is_fstree(ref_root))
1451 return 0;
1452
1453 if (num_bytes == 0)
1454 return 0;
1455
1456 spin_lock(&fs_info->qgroup_lock);
1457 quota_root = fs_info->quota_root;
1458 if (!quota_root)
1459 goto out;
1460
1461 qgroup = find_qgroup_rb(fs_info, ref_root);
1462 if (!qgroup)
1463 goto out;
1464
1465 /*
1466 * in a first step, we check all affected qgroups if any limits would
1467 * be exceeded
1468 */
1469 ulist = ulist_alloc(GFP_ATOMIC);
1470 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1471 ULIST_ITER_INIT(&uiter);
1472 while ((unode = ulist_next(ulist, &uiter))) {
1473 struct btrfs_qgroup *qg;
1474 struct btrfs_qgroup_list *glist;
1475
1476 qg = (struct btrfs_qgroup *)unode->aux;
1477
1478 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1479 qg->reserved + qg->rfer + num_bytes >
1480 qg->max_rfer)
1481 ret = -EDQUOT;
1482
1483 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
1484 qg->reserved + qg->excl + num_bytes >
1485 qg->max_excl)
1486 ret = -EDQUOT;
1487
1488 list_for_each_entry(glist, &qg->groups, next_group) {
1489 ulist_add(ulist, glist->group->qgroupid,
1490 (unsigned long)glist->group, GFP_ATOMIC);
1491 }
1492 }
1493 if (ret)
1494 goto out;
1495
1496 /*
1497 * no limits exceeded, now record the reservation into all qgroups
1498 */
1499 ULIST_ITER_INIT(&uiter);
1500 while ((unode = ulist_next(ulist, &uiter))) {
1501 struct btrfs_qgroup *qg;
1502
1503 qg = (struct btrfs_qgroup *)unode->aux;
1504
1505 qg->reserved += num_bytes;
1506 }
1507
1508out:
1509 spin_unlock(&fs_info->qgroup_lock);
1510 ulist_free(ulist);
1511
1512 return ret;
1513}
1514
1515void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1516{
1517 struct btrfs_root *quota_root;
1518 struct btrfs_qgroup *qgroup;
1519 struct btrfs_fs_info *fs_info = root->fs_info;
1520 struct ulist *ulist = NULL;
1521 struct ulist_node *unode;
1522 struct ulist_iterator uiter;
1523 u64 ref_root = root->root_key.objectid;
1524
1525 if (!is_fstree(ref_root))
1526 return;
1527
1528 if (num_bytes == 0)
1529 return;
1530
1531 spin_lock(&fs_info->qgroup_lock);
1532
1533 quota_root = fs_info->quota_root;
1534 if (!quota_root)
1535 goto out;
1536
1537 qgroup = find_qgroup_rb(fs_info, ref_root);
1538 if (!qgroup)
1539 goto out;
1540
1541 ulist = ulist_alloc(GFP_ATOMIC);
1542 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1543 ULIST_ITER_INIT(&uiter);
1544 while ((unode = ulist_next(ulist, &uiter))) {
1545 struct btrfs_qgroup *qg;
1546 struct btrfs_qgroup_list *glist;
1547
1548 qg = (struct btrfs_qgroup *)unode->aux;
1549
1550 qg->reserved -= num_bytes;
1551
1552 list_for_each_entry(glist, &qg->groups, next_group) {
1553 ulist_add(ulist, glist->group->qgroupid,
1554 (unsigned long)glist->group, GFP_ATOMIC);
1555 }
1556 }
1557
1558out:
1559 spin_unlock(&fs_info->qgroup_lock);
1560 ulist_free(ulist);
1561}
1562
1563void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1564{
1565 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1566 return;
1567 printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
1568 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1569 trans->delayed_ref_elem.seq);
1570 BUG();
1571}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 646ee21bb03..c5dbd914967 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1239,10 +1239,11 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
1239 node->bytenr, &node->rb_node); 1239 node->bytenr, &node->rb_node);
1240 spin_unlock(&rc->reloc_root_tree.lock); 1240 spin_unlock(&rc->reloc_root_tree.lock);
1241 if (rb_node) { 1241 if (rb_node) {
1242 kfree(node);
1243 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " 1242 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1244 "for start=%llu while inserting into relocation " 1243 "for start=%llu while inserting into relocation "
1245 "tree\n"); 1244 "tree\n");
1245 kfree(node);
1246 return -EEXIST;
1246 } 1247 }
1247 1248
1248 list_add_tail(&root->root_list, &rc->reloc_roots); 1249 list_add_tail(&root->root_list, &rc->reloc_roots);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 24fb8ce4e07..6bb465cca20 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,12 +16,55 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/uuid.h>
19#include "ctree.h" 20#include "ctree.h"
20#include "transaction.h" 21#include "transaction.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
24/* 25/*
26 * Read a root item from the tree. In case we detect a root item smaller then
27 * sizeof(root_item), we know it's an old version of the root structure and
28 * initialize all new fields to zero. The same happens if we detect mismatching
29 * generation numbers as then we know the root was once mounted with an older
30 * kernel that was not aware of the root item structure change.
31 */
32void btrfs_read_root_item(struct btrfs_root *root,
33 struct extent_buffer *eb, int slot,
34 struct btrfs_root_item *item)
35{
36 uuid_le uuid;
37 int len;
38 int need_reset = 0;
39
40 len = btrfs_item_size_nr(eb, slot);
41 read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
42 min_t(int, len, (int)sizeof(*item)));
43 if (len < sizeof(*item))
44 need_reset = 1;
45 if (!need_reset && btrfs_root_generation(item)
46 != btrfs_root_generation_v2(item)) {
47 if (btrfs_root_generation_v2(item) != 0) {
48 printk(KERN_WARNING "btrfs: mismatching "
49 "generation and generation_v2 "
50 "found in root item. This root "
51 "was probably mounted with an "
52 "older kernel. Resetting all "
53 "new fields.\n");
54 }
55 need_reset = 1;
56 }
57 if (need_reset) {
58 memset(&item->generation_v2, 0,
59 sizeof(*item) - offsetof(struct btrfs_root_item,
60 generation_v2));
61
62 uuid_le_gen(&uuid);
63 memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
64 }
65}
66
67/*
25 * lookup the root with the highest offset for a given objectid. The key we do 68 * lookup the root with the highest offset for a given objectid. The key we do
26 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 69 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
27 * on error. 70 * on error.
@@ -61,10 +104,10 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
61 goto out; 104 goto out;
62 } 105 }
63 if (item) 106 if (item)
64 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), 107 btrfs_read_root_item(root, l, slot, item);
65 sizeof(*item));
66 if (key) 108 if (key)
67 memcpy(key, &found_key, sizeof(found_key)); 109 memcpy(key, &found_key, sizeof(found_key));
110
68 ret = 0; 111 ret = 0;
69out: 112out:
70 btrfs_free_path(path); 113 btrfs_free_path(path);
@@ -91,16 +134,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
91 int ret; 134 int ret;
92 int slot; 135 int slot;
93 unsigned long ptr; 136 unsigned long ptr;
137 int old_len;
94 138
95 path = btrfs_alloc_path(); 139 path = btrfs_alloc_path();
96 if (!path) 140 if (!path)
97 return -ENOMEM; 141 return -ENOMEM;
98 142
99 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
100 if (ret < 0) { 144 if (ret < 0)
101 btrfs_abort_transaction(trans, root, ret); 145 goto out_abort;
102 goto out;
103 }
104 146
105 if (ret != 0) { 147 if (ret != 0) {
106 btrfs_print_leaf(root, path->nodes[0]); 148 btrfs_print_leaf(root, path->nodes[0]);
@@ -113,16 +155,56 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
113 l = path->nodes[0]; 155 l = path->nodes[0];
114 slot = path->slots[0]; 156 slot = path->slots[0];
115 ptr = btrfs_item_ptr_offset(l, slot); 157 ptr = btrfs_item_ptr_offset(l, slot);
158 old_len = btrfs_item_size_nr(l, slot);
159
160 /*
161 * If this is the first time we update the root item which originated
162 * from an older kernel, we need to enlarge the item size to make room
163 * for the added fields.
164 */
165 if (old_len < sizeof(*item)) {
166 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1);
169 if (ret < 0)
170 goto out_abort;
171 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0)
173 goto out_abort;
174 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item));
177 if (ret < 0)
178 goto out_abort;
179 l = path->nodes[0];
180 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot);
182 }
183
184 /*
185 * Update generation_v2 so at the next mount we know the new root
186 * fields are valid.
187 */
188 btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
189
116 write_extent_buffer(l, item, ptr, sizeof(*item)); 190 write_extent_buffer(l, item, ptr, sizeof(*item));
117 btrfs_mark_buffer_dirty(path->nodes[0]); 191 btrfs_mark_buffer_dirty(path->nodes[0]);
118out: 192out:
119 btrfs_free_path(path); 193 btrfs_free_path(path);
120 return ret; 194 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
121} 199}
122 200
123int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
124 struct btrfs_key *key, struct btrfs_root_item *item) 202 struct btrfs_key *key, struct btrfs_root_item *item)
125{ 203{
204 /*
205 * Make sure generation v1 and v2 match. See update_root for details.
206 */
207 btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
126 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 208 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
127} 209}
128 210
@@ -454,3 +536,16 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
454 root_item->byte_limit = 0; 536 root_item->byte_limit = 0;
455 } 537 }
456} 538}
539
540void btrfs_update_root_times(struct btrfs_trans_handle *trans,
541 struct btrfs_root *root)
542{
543 struct btrfs_root_item *item = &root->root_item;
544 struct timespec ct = CURRENT_TIME;
545
546 spin_lock(&root->root_times_lock);
547 item->ctransid = trans->transid;
548 item->ctime.sec = cpu_to_le64(ct.tv_sec);
549 item->ctime.nsec = cpu_to_le64(ct.tv_nsec);
550 spin_unlock(&root->root_times_lock);
551}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 00000000000..c8ca49b1bb4
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,4571 @@
1/*
2 * Copyright (C) 2012 Alexander Block. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bsearch.h>
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/sort.h>
23#include <linux/mount.h>
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26#include <linux/radix-tree.h>
27#include <linux/crc32c.h>
28
29#include "send.h"
30#include "backref.h"
31#include "locking.h"
32#include "disk-io.h"
33#include "btrfs_inode.h"
34#include "transaction.h"
35
/* set to non-zero to make verbose_printk() emit its messages */
static int g_verbose = 0;

/*
 * printk() that is only executed while g_verbose is set.  Wrapped in
 * do/while(0) so that it behaves as a single statement, e.g. in front of an
 * else branch.
 */
#define verbose_printk(...) \
	do { \
		if (g_verbose) \
			printk(__VA_ARGS__); \
	} while (0)
39
40/*
41 * A fs_path is a helper to dynamically build path names with unknown size.
42 * It reallocates the internal buffer on demand.
43 * It allows fast adding of path elements on the right side (normal path) and
44 * fast adding to the left side (reversed path). A reversed path can also be
45 * unreversed if needed.
46 */
47struct fs_path {
48 union {
49 struct {
50 char *start;
51 char *end;
52 char *prepared;
53
54 char *buf;
55 int buf_len;
56 int reversed:1;
57 int virtual_mem:1;
58 char inline_buf[];
59 };
60 char pad[PAGE_SIZE];
61 };
62};
63#define FS_PATH_INLINE_SIZE \
64 (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
65
66
67/* reused for each extent */
/*
 * NOTE(review): presumably describes one root that may serve as a clone
 * source, together with the inode/offset found in it and a reference
 * counter; confirm against the code that fills it in.
 */
68struct clone_root {
69 struct btrfs_root *root;
70 u64 ino;
71 u64 offset;
72
73 u64 found_refs;
74};
75
/*
 * Size thresholds for the name cache of a send_ctx; the clean size is twice
 * the maximum size.
 * NOTE(review): presumably counted in cache entries; confirm at the users.
 */
76#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
77#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
78
/*
 * State of one send operation.
 */
79struct send_ctx {
 /* output file and write position, used by write_buf() */
80 struct file *send_filp;
81 loff_t send_off;
 /*
  * buffer in which one command is assembled: send_size is the number of
  * bytes currently used, send_max_size the limit checked by tlv_put()
  */
82 char *send_buf;
83 u32 send_size;
84 u32 send_max_size;
 /* bytes sent in total and per command type, updated by send_cmd() */
85 u64 total_send_size;
86 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
87
88 struct vfsmount *mnt;
89
90 struct btrfs_root *send_root;
91 struct btrfs_root *parent_root;
92 struct clone_root *clone_roots;
93 int clone_roots_cnt;
94
95 /* current state of the compare_tree call */
96 struct btrfs_path *left_path;
97 struct btrfs_path *right_path;
98 struct btrfs_key *cmp_key;
99
100 /*
101 * infos of the currently processed inode. In case of deleted inodes,
102 * these are the values from the deleted inode.
103 */
104 u64 cur_ino;
105 u64 cur_inode_gen;
106 int cur_inode_new;
107 int cur_inode_new_gen;
108 int cur_inode_deleted;
109 int cur_inode_first_ref_orphan;
110 u64 cur_inode_size;
111 u64 cur_inode_mode;
112
113 u64 send_progress;
114
115 struct list_head new_refs;
116 struct list_head deleted_refs;
117
 /* NOTE(review): presumably bounded by the SEND_CTX_*_NAME_CACHE_* sizes */
118 struct radix_tree_root name_cache;
119 struct list_head name_cache_list;
120 int name_cache_size;
121
122 struct file *cur_inode_filp;
123 char *read_buf;
124};
125
/*
 * One entry of the name cache. The name is stored inline behind the fixed
 * fields (flexible array member) and name_len is stored alongside it.
 * NOTE(review): presumably keyed by (ino, gen) and caching the parent and
 * name of that inode; confirm against the lookup code.
 */
126struct name_cache_entry {
127 struct list_head list;
128 struct list_head use_list;
129 u64 ino;
130 u64 gen;
131 u64 parent_ino;
132 u64 parent_gen;
133 int ret;
134 int need_later_update;
135 int name_len;
136 char name[];
137};
138
139static void fs_path_reset(struct fs_path *p)
140{
141 if (p->reversed) {
142 p->start = p->buf + p->buf_len - 1;
143 p->end = p->start;
144 *p->start = 0;
145 } else {
146 p->start = p->buf;
147 p->end = p->start;
148 *p->start = 0;
149 }
150}
151
152static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
153{
154 struct fs_path *p;
155
156 p = kmalloc(sizeof(*p), GFP_NOFS);
157 if (!p)
158 return NULL;
159 p->reversed = 0;
160 p->virtual_mem = 0;
161 p->buf = p->inline_buf;
162 p->buf_len = FS_PATH_INLINE_SIZE;
163 fs_path_reset(p);
164 return p;
165}
166
167static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
168{
169 struct fs_path *p;
170
171 p = fs_path_alloc(sctx);
172 if (!p)
173 return NULL;
174 p->reversed = 1;
175 fs_path_reset(p);
176 return p;
177}
178
179static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
180{
181 if (!p)
182 return;
183 if (p->buf != p->inline_buf) {
184 if (p->virtual_mem)
185 vfree(p->buf);
186 else
187 kfree(p->buf);
188 }
189 kfree(p);
190}
191
192static int fs_path_len(struct fs_path *p)
193{
194 return p->end - p->start;
195}
196
/*
 * Make sure the buffer of @p can hold a path of @len bytes plus the
 * terminating NUL. If the buffer has to grow, the new size is rounded up with
 * PAGE_ALIGN() and the old buffer content is carried over.
 *
 * Returns 0 on success and -ENOMEM if no memory could be allocated; in the
 * failure case the old buffer is still in place.
 */
197static int fs_path_ensure_buf(struct fs_path *p, int len)
198{
199 char *tmp_buf;
200 int path_len;
201 int old_buf_len;
202
 /* account for the terminating NUL */
203 len++;
204
205 if (p->buf_len >= len)
206 return 0;
207
208 path_len = p->end - p->start;
209 old_buf_len = p->buf_len;
210 len = PAGE_ALIGN(len);
211
212 if (p->buf == p->inline_buf) {
 /* first growth: leave the inline buffer, try kmalloc, then vmalloc */
213 tmp_buf = kmalloc(len, GFP_NOFS);
214 if (!tmp_buf) {
215 tmp_buf = vmalloc(len);
216 if (!tmp_buf)
217 return -ENOMEM;
218 p->virtual_mem = 1;
219 }
220 memcpy(tmp_buf, p->buf, p->buf_len);
221 p->buf = tmp_buf;
222 p->buf_len = len;
223 } else {
224 if (p->virtual_mem) {
 /* vmalloc'ed buffer: allocate a new one, copy, free the old one */
225 tmp_buf = vmalloc(len);
226 if (!tmp_buf)
227 return -ENOMEM;
228 memcpy(tmp_buf, p->buf, p->buf_len);
229 vfree(p->buf);
230 } else {
 /* kmalloc'ed buffer: try krealloc, fall back to vmalloc + copy */
231 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
232 if (!tmp_buf) {
233 tmp_buf = vmalloc(len);
234 if (!tmp_buf)
235 return -ENOMEM;
236 memcpy(tmp_buf, p->buf, p->buf_len);
237 kfree(p->buf);
238 p->virtual_mem = 1;
239 }
240 }
241 p->buf = tmp_buf;
242 p->buf_len = len;
243 }
244 if (p->reversed) {
 /*
  * the path (and its NUL) sat at the end of the old buffer area; move it
  * to the end of the enlarged buffer
  */
245 tmp_buf = p->buf + old_buf_len - path_len - 1;
246 p->end = p->buf + p->buf_len - 1;
247 p->start = p->end - path_len;
248 memmove(p->start, tmp_buf, path_len + 1);
249 } else {
 /* the path stays at the front, only the pointers are rebased */
250 p->start = p->buf;
251 p->end = p->start + path_len;
252 }
253 return 0;
254}
255
256static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
257{
258 int ret;
259 int new_len;
260
261 new_len = p->end - p->start + name_len;
262 if (p->start != p->end)
263 new_len++;
264 ret = fs_path_ensure_buf(p, new_len);
265 if (ret < 0)
266 goto out;
267
268 if (p->reversed) {
269 if (p->start != p->end)
270 *--p->start = '/';
271 p->start -= name_len;
272 p->prepared = p->start;
273 } else {
274 if (p->start != p->end)
275 *p->end++ = '/';
276 p->prepared = p->end;
277 p->end += name_len;
278 *p->end = 0;
279 }
280
281out:
282 return ret;
283}
284
285static int fs_path_add(struct fs_path *p, const char *name, int name_len)
286{
287 int ret;
288
289 ret = fs_path_prepare_for_add(p, name_len);
290 if (ret < 0)
291 goto out;
292 memcpy(p->prepared, name, name_len);
293 p->prepared = NULL;
294
295out:
296 return ret;
297}
298
299static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
300{
301 int ret;
302
303 ret = fs_path_prepare_for_add(p, p2->end - p2->start);
304 if (ret < 0)
305 goto out;
306 memcpy(p->prepared, p2->start, p2->end - p2->start);
307 p->prepared = NULL;
308
309out:
310 return ret;
311}
312
313static int fs_path_add_from_extent_buffer(struct fs_path *p,
314 struct extent_buffer *eb,
315 unsigned long off, int len)
316{
317 int ret;
318
319 ret = fs_path_prepare_for_add(p, len);
320 if (ret < 0)
321 goto out;
322
323 read_extent_buffer(eb, p->prepared, off, len);
324 p->prepared = NULL;
325
326out:
327 return ret;
328}
329
330static void fs_path_remove(struct fs_path *p)
331{
332 BUG_ON(p->reversed);
333 while (p->start != p->end && *p->end != '/')
334 p->end--;
335 *p->end = 0;
336}
337
338static int fs_path_copy(struct fs_path *p, struct fs_path *from)
339{
340 int ret;
341
342 p->reversed = from->reversed;
343 fs_path_reset(p);
344
345 ret = fs_path_add_path(p, from);
346
347 return ret;
348}
349
350
351static void fs_path_unreverse(struct fs_path *p)
352{
353 char *tmp;
354 int len;
355
356 if (!p->reversed)
357 return;
358
359 tmp = p->start;
360 len = p->end - p->start;
361 p->start = p->buf;
362 p->end = p->start + len;
363 memmove(p->start, tmp, len + 1);
364 p->reversed = 0;
365}
366
367static struct btrfs_path *alloc_path_for_send(void)
368{
369 struct btrfs_path *path;
370
371 path = btrfs_alloc_path();
372 if (!path)
373 return NULL;
374 path->search_commit_root = 1;
375 path->skip_locking = 1;
376 return path;
377}
378
/*
 * Write @len bytes from @buf to sctx->send_filp at position sctx->send_off,
 * calling vfs_write() repeatedly until everything is written.
 *
 * Returns 0 on success, the negative error returned by vfs_write(), or -EIO
 * if vfs_write() makes no progress.
 */
379static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
380{
381 int ret;
382 mm_segment_t old_fs;
383 u32 pos = 0;
384
 /* @buf is a kernel buffer: switch to KERNEL_DS for the vfs_write() calls */
385 old_fs = get_fs();
386 set_fs(KERNEL_DS);
387
388 while (pos < len) {
389 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
390 &sctx->send_off);
391 /* TODO handle that correctly */
392 /*if (ret == -ERESTARTSYS) {
393 continue;
394 }*/
395 if (ret < 0)
396 goto out;
 /* a write of 0 bytes would loop forever; report it as an I/O error */
397 if (ret == 0) {
398 ret = -EIO;
399 goto out;
400 }
401 pos += ret;
402 }
403
404 ret = 0;
405
406out:
 /* restore the previous address limit on every path */
407 set_fs(old_fs);
408 return ret;
409}
410
411static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
412{
413 struct btrfs_tlv_header *hdr;
414 int total_len = sizeof(*hdr) + len;
415 int left = sctx->send_max_size - sctx->send_size;
416
417 if (unlikely(left < total_len))
418 return -EOVERFLOW;
419
420 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
421 hdr->tlv_type = cpu_to_le16(attr);
422 hdr->tlv_len = cpu_to_le16(len);
423 memcpy(hdr + 1, data, len);
424 sctx->send_size += total_len;
425
426 return 0;
427}
428
/*
 * Currently compiled out: 8, 16 and 32 bit variants of tlv_put_u64(). The 16
 * and 32 bit variants convert the value to little endian before it is put.
 */
429#if 0
430static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
431{
432 return tlv_put(sctx, attr, &value, sizeof(value));
433}
434
435static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
436{
437 __le16 tmp = cpu_to_le16(value);
438 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
439}
440
441static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
442{
443 __le32 tmp = cpu_to_le32(value);
444 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
445}
446#endif
447
448static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
449{
450 __le64 tmp = cpu_to_le64(value);
451 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
452}
453
454static int tlv_put_string(struct send_ctx *sctx, u16 attr,
455 const char *str, int len)
456{
457 if (len == -1)
458 len = strlen(str);
459 return tlv_put(sctx, attr, str, len);
460}
461
462static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
463 const u8 *uuid)
464{
465 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
466}
467
/*
 * Currently compiled out: put a struct timespec as attribute @attr after
 * converting it to a btrfs_timespec (64 bit seconds, 32 bit nanoseconds,
 * both little endian).
 */
468#if 0
469static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
470 struct timespec *ts)
471{
472 struct btrfs_timespec bts;
473 bts.sec = cpu_to_le64(ts->tv_sec);
474 bts.nsec = cpu_to_le32(ts->tv_nsec);
475 return tlv_put(sctx, attr, &bts, sizeof(bts));
476}
477#endif
478
479static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
480 struct extent_buffer *eb,
481 struct btrfs_timespec *ts)
482{
483 struct btrfs_timespec bts;
484 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
485 return tlv_put(sctx, attr, &bts, sizeof(bts));
486}
487
488
/*
 * Convenience wrappers around the tlv_put*() helpers.  Each one stores the
 * result in a local variable named 'ret' of the calling function and jumps
 * to the label 'tlv_put_failure' of the calling function if the result is
 * negative.
 */

/* the arguments are forwarded to tlv_put() in the same order: data, length */
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
/* p is a struct fs_path pointer; the path is put without its NUL byte */
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, (p)->start, \
			(p)->end - (p)->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
	do { \
		ret = tlv_put_timespec(sctx, attrtype, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
538
539static int send_header(struct send_ctx *sctx)
540{
541 struct btrfs_stream_header hdr;
542
543 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
544 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
545
546 return write_buf(sctx, &hdr, sizeof(hdr));
547}
548
549/*
550 * For each command/item we want to send to userspace, we call this function.
551 */
552static int begin_cmd(struct send_ctx *sctx, int cmd)
553{
554 struct btrfs_cmd_header *hdr;
555
556 if (!sctx->send_buf) {
557 WARN_ON(1);
558 return -EINVAL;
559 }
560
561 BUG_ON(sctx->send_size);
562
563 sctx->send_size += sizeof(*hdr);
564 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
565 hdr->cmd = cpu_to_le16(cmd);
566
567 return 0;
568}
569
570static int send_cmd(struct send_ctx *sctx)
571{
572 int ret;
573 struct btrfs_cmd_header *hdr;
574 u32 crc;
575
576 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
577 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
578 hdr->crc = 0;
579
580 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
581 hdr->crc = cpu_to_le32(crc);
582
583 ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
584
585 sctx->total_send_size += sctx->send_size;
586 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
587 sctx->send_size = 0;
588
589 return ret;
590}
591
592/*
593 * Sends a move instruction to user space
594 */
595static int send_rename(struct send_ctx *sctx,
596 struct fs_path *from, struct fs_path *to)
597{
598 int ret;
599
600verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start);
601
602 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
603 if (ret < 0)
604 goto out;
605
606 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
607 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
608
609 ret = send_cmd(sctx);
610
611tlv_put_failure:
612out:
613 return ret;
614}
615
616/*
617 * Sends a link instruction to user space
618 */
619static int send_link(struct send_ctx *sctx,
620 struct fs_path *path, struct fs_path *lnk)
621{
622 int ret;
623
624verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start);
625
626 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
627 if (ret < 0)
628 goto out;
629
630 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
631 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
632
633 ret = send_cmd(sctx);
634
635tlv_put_failure:
636out:
637 return ret;
638}
639
640/*
641 * Sends an unlink instruction to user space
642 */
643static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
644{
645 int ret;
646
647verbose_printk("btrfs: send_unlink %s\n", path->start);
648
649 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
650 if (ret < 0)
651 goto out;
652
653 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
654
655 ret = send_cmd(sctx);
656
657tlv_put_failure:
658out:
659 return ret;
660}
661
662/*
663 * Sends a rmdir instruction to user space
664 */
665static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
666{
667 int ret;
668
669verbose_printk("btrfs: send_rmdir %s\n", path->start);
670
671 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
672 if (ret < 0)
673 goto out;
674
675 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
676
677 ret = send_cmd(sctx);
678
679tlv_put_failure:
680out:
681 return ret;
682}
683
684/*
685 * Helper function to retrieve some fields from an inode item.
686 */
687static int get_inode_info(struct btrfs_root *root,
688 u64 ino, u64 *size, u64 *gen,
689 u64 *mode, u64 *uid, u64 *gid)
690{
691 int ret;
692 struct btrfs_inode_item *ii;
693 struct btrfs_key key;
694 struct btrfs_path *path;
695
696 path = alloc_path_for_send();
697 if (!path)
698 return -ENOMEM;
699
700 key.objectid = ino;
701 key.type = BTRFS_INODE_ITEM_KEY;
702 key.offset = 0;
703 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
704 if (ret < 0)
705 goto out;
706 if (ret) {
707 ret = -ENOENT;
708 goto out;
709 }
710
711 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
712 struct btrfs_inode_item);
713 if (size)
714 *size = btrfs_inode_size(path->nodes[0], ii);
715 if (gen)
716 *gen = btrfs_inode_generation(path->nodes[0], ii);
717 if (mode)
718 *mode = btrfs_inode_mode(path->nodes[0], ii);
719 if (uid)
720 *uid = btrfs_inode_uid(path->nodes[0], ii);
721 if (gid)
722 *gid = btrfs_inode_gid(path->nodes[0], ii);
723
724out:
725 btrfs_free_path(path);
726 return ret;
727}
728
729typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
730 struct fs_path *p,
731 void *ctx);
732
733/*
734 * Helper function to iterate the entries in ONE btrfs_inode_ref.
735 * The iterate callback may return a non zero value to stop iteration. This can
736 * be a negative value for error codes or 1 to simply stop it.
737 *
738 * path must point to the INODE_REF when called.
739 */
740static int iterate_inode_ref(struct send_ctx *sctx,
741 struct btrfs_root *root, struct btrfs_path *path,
742 struct btrfs_key *found_key, int resolve,
743 iterate_inode_ref_t iterate, void *ctx)
744{
745 struct extent_buffer *eb;
746 struct btrfs_item *item;
747 struct btrfs_inode_ref *iref;
748 struct btrfs_path *tmp_path;
749 struct fs_path *p;
750 u32 cur;
751 u32 len;
752 u32 total;
753 int slot;
754 u32 name_len;
755 char *start;
756 int ret = 0;
757 int num;
758 int index;
759
760 p = fs_path_alloc_reversed(sctx);
761 if (!p)
762 return -ENOMEM;
763
764 tmp_path = alloc_path_for_send();
765 if (!tmp_path) {
766 fs_path_free(sctx, p);
767 return -ENOMEM;
768 }
769
770 eb = path->nodes[0];
771 slot = path->slots[0];
772 item = btrfs_item_nr(eb, slot);
773 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
774 cur = 0;
775 len = 0;
776 total = btrfs_item_size(eb, item);
777
778 num = 0;
779 while (cur < total) {
780 fs_path_reset(p);
781
782 name_len = btrfs_inode_ref_name_len(eb, iref);
783 index = btrfs_inode_ref_index(eb, iref);
784 if (resolve) {
785 start = btrfs_iref_to_path(root, tmp_path, iref, eb,
786 found_key->offset, p->buf,
787 p->buf_len);
788 if (IS_ERR(start)) {
789 ret = PTR_ERR(start);
790 goto out;
791 }
792 if (start < p->buf) {
793 /* overflow , try again with larger buffer */
794 ret = fs_path_ensure_buf(p,
795 p->buf_len + p->buf - start);
796 if (ret < 0)
797 goto out;
798 start = btrfs_iref_to_path(root, tmp_path, iref,
799 eb, found_key->offset, p->buf,
800 p->buf_len);
801 if (IS_ERR(start)) {
802 ret = PTR_ERR(start);
803 goto out;
804 }
805 BUG_ON(start < p->buf);
806 }
807 p->start = start;
808 } else {
809 ret = fs_path_add_from_extent_buffer(p, eb,
810 (unsigned long)(iref + 1), name_len);
811 if (ret < 0)
812 goto out;
813 }
814
815
816 len = sizeof(*iref) + name_len;
817 iref = (struct btrfs_inode_ref *)((char *)iref + len);
818 cur += len;
819
820 ret = iterate(num, found_key->offset, index, p, ctx);
821 if (ret)
822 goto out;
823
824 num++;
825 }
826
827out:
828 btrfs_free_path(tmp_path);
829 fs_path_free(sctx, p);
830 return ret;
831}
832
833typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
834 const char *name, int name_len,
835 const char *data, int data_len,
836 u8 type, void *ctx);
837
838/*
839 * Helper function to iterate the entries in ONE btrfs_dir_item.
840 * The iterate callback may return a non zero value to stop iteration. This can
841 * be a negative value for error codes or 1 to simply stop it.
842 *
843 * path must point to the dir item when called.
844 */
845static int iterate_dir_item(struct send_ctx *sctx,
846 struct btrfs_root *root, struct btrfs_path *path,
847 struct btrfs_key *found_key,
848 iterate_dir_item_t iterate, void *ctx)
849{
850 int ret = 0;
851 struct extent_buffer *eb;
852 struct btrfs_item *item;
853 struct btrfs_dir_item *di;
854 struct btrfs_path *tmp_path = NULL;
855 struct btrfs_key di_key;
856 char *buf = NULL;
857 char *buf2 = NULL;
858 int buf_len;
859 int buf_virtual = 0;
860 u32 name_len;
861 u32 data_len;
862 u32 cur;
863 u32 len;
864 u32 total;
865 int slot;
866 int num;
867 u8 type;
868
869 buf_len = PAGE_SIZE;
870 buf = kmalloc(buf_len, GFP_NOFS);
871 if (!buf) {
872 ret = -ENOMEM;
873 goto out;
874 }
875
876 tmp_path = alloc_path_for_send();
877 if (!tmp_path) {
878 ret = -ENOMEM;
879 goto out;
880 }
881
882 eb = path->nodes[0];
883 slot = path->slots[0];
884 item = btrfs_item_nr(eb, slot);
885 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
886 cur = 0;
887 len = 0;
888 total = btrfs_item_size(eb, item);
889
890 num = 0;
891 while (cur < total) {
892 name_len = btrfs_dir_name_len(eb, di);
893 data_len = btrfs_dir_data_len(eb, di);
894 type = btrfs_dir_type(eb, di);
895 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
896
897 if (name_len + data_len > buf_len) {
898 buf_len = PAGE_ALIGN(name_len + data_len);
899 if (buf_virtual) {
900 buf2 = vmalloc(buf_len);
901 if (!buf2) {
902 ret = -ENOMEM;
903 goto out;
904 }
905 vfree(buf);
906 } else {
907 buf2 = krealloc(buf, buf_len, GFP_NOFS);
908 if (!buf2) {
909 buf2 = vmalloc(buf_len);
910 if (!buf2) {
911 ret = -ENOMEM;
912 goto out;
913 }
914 kfree(buf);
915 buf_virtual = 1;
916 }
917 }
918
919 buf = buf2;
920 buf2 = NULL;
921 }
922
923 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
924 name_len + data_len);
925
926 len = sizeof(*di) + name_len + data_len;
927 di = (struct btrfs_dir_item *)((char *)di + len);
928 cur += len;
929
930 ret = iterate(num, &di_key, buf, name_len, buf + name_len,
931 data_len, type, ctx);
932 if (ret < 0)
933 goto out;
934 if (ret) {
935 ret = 0;
936 goto out;
937 }
938
939 num++;
940 }
941
942out:
943 btrfs_free_path(tmp_path);
944 if (buf_virtual)
945 vfree(buf);
946 else
947 kfree(buf);
948 return ret;
949}
950
951static int __copy_first_ref(int num, u64 dir, int index,
952 struct fs_path *p, void *ctx)
953{
954 int ret;
955 struct fs_path *pt = ctx;
956
957 ret = fs_path_copy(pt, p);
958 if (ret < 0)
959 return ret;
960
961 /* we want the first only */
962 return 1;
963}
964
965/*
966 * Retrieve the first path of an inode. If an inode has more then one
967 * ref/hardlink, this is ignored.
968 */
969static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
970 u64 ino, struct fs_path *path)
971{
972 int ret;
973 struct btrfs_key key, found_key;
974 struct btrfs_path *p;
975
976 p = alloc_path_for_send();
977 if (!p)
978 return -ENOMEM;
979
980 fs_path_reset(path);
981
982 key.objectid = ino;
983 key.type = BTRFS_INODE_REF_KEY;
984 key.offset = 0;
985
986 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
987 if (ret < 0)
988 goto out;
989 if (ret) {
990 ret = 1;
991 goto out;
992 }
993 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
994 if (found_key.objectid != ino ||
995 found_key.type != BTRFS_INODE_REF_KEY) {
996 ret = -ENOENT;
997 goto out;
998 }
999
1000 ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
1001 __copy_first_ref, path);
1002 if (ret < 0)
1003 goto out;
1004 ret = 0;
1005
1006out:
1007 btrfs_free_path(p);
1008 return ret;
1009}
1010
1011struct backref_ctx {
1012 struct send_ctx *sctx;
1013
1014 /* number of total found references */
1015 u64 found;
1016
1017 /*
1018 * used for clones found in send_root. clones found behind cur_objectid
1019 * and cur_offset are not considered as allowed clones.
1020 */
1021 u64 cur_objectid;
1022 u64 cur_offset;
1023
1024 /* may be truncated in case it's the last extent in a file */
1025 u64 extent_len;
1026
1027 /* Just to check for bugs in backref resolving */
1028 int found_in_send_root;
1029};
1030
1031static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1032{
1033 u64 root = (u64)key;
1034 struct clone_root *cr = (struct clone_root *)elt;
1035
1036 if (root < cr->root->objectid)
1037 return -1;
1038 if (root > cr->root->objectid)
1039 return 1;
1040 return 0;
1041}
1042
1043static int __clone_root_cmp_sort(const void *e1, const void *e2)
1044{
1045 struct clone_root *cr1 = (struct clone_root *)e1;
1046 struct clone_root *cr2 = (struct clone_root *)e2;
1047
1048 if (cr1->root->objectid < cr2->root->objectid)
1049 return -1;
1050 if (cr1->root->objectid > cr2->root->objectid)
1051 return 1;
1052 return 0;
1053}
1054
1055/*
1056 * Called for every backref that is found for the current extent.
1057 */
1058static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1059{
1060 struct backref_ctx *bctx = ctx_;
1061 struct clone_root *found;
1062 int ret;
1063 u64 i_size;
1064
1065 /* First check if the root is in the list of accepted clone sources */
1066 found = bsearch((void *)root, bctx->sctx->clone_roots,
1067 bctx->sctx->clone_roots_cnt,
1068 sizeof(struct clone_root),
1069 __clone_root_cmp_bsearch);
1070 if (!found)
1071 return 0;
1072
1073 if (found->root == bctx->sctx->send_root &&
1074 ino == bctx->cur_objectid &&
1075 offset == bctx->cur_offset) {
1076 bctx->found_in_send_root = 1;
1077 }
1078
1079 /*
1080 * There are inodes that have extents that lie behind it's i_size. Don't
1081 * accept clones from these extents.
1082 */
1083 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
1084 if (ret < 0)
1085 return ret;
1086
1087 if (offset + bctx->extent_len > i_size)
1088 return 0;
1089
1090 /*
1091 * Make sure we don't consider clones from send_root that are
1092 * behind the current inode/offset.
1093 */
1094 if (found->root == bctx->sctx->send_root) {
1095 /*
1096 * TODO for the moment we don't accept clones from the inode
1097 * that is currently send. We may change this when
1098 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
1099 * file.
1100 */
1101 if (ino >= bctx->cur_objectid)
1102 return 0;
1103 /*if (ino > ctx->cur_objectid)
1104 return 0;
1105 if (offset + ctx->extent_len > ctx->cur_offset)
1106 return 0;*/
1107
1108 bctx->found++;
1109 found->found_refs++;
1110 found->ino = ino;
1111 found->offset = offset;
1112 return 0;
1113 }
1114
1115 bctx->found++;
1116 found->found_refs++;
1117 if (ino < found->ino) {
1118 found->ino = ino;
1119 found->offset = offset;
1120 } else if (found->ino == ino) {
1121 /*
1122 * same extent found more then once in the same file.
1123 */
1124 if (found->offset > offset + bctx->extent_len)
1125 found->offset = offset;
1126 }
1127
1128 return 0;
1129}
1130
1131/*
1132 * path must point to the extent item when called.
1133 */
1134static int find_extent_clone(struct send_ctx *sctx,
1135 struct btrfs_path *path,
1136 u64 ino, u64 data_offset,
1137 u64 ino_size,
1138 struct clone_root **found)
1139{
1140 int ret;
1141 int extent_type;
1142 u64 logical;
1143 u64 num_bytes;
1144 u64 extent_item_pos;
1145 struct btrfs_file_extent_item *fi;
1146 struct extent_buffer *eb = path->nodes[0];
1147 struct backref_ctx backref_ctx;
1148 struct clone_root *cur_clone_root;
1149 struct btrfs_key found_key;
1150 struct btrfs_path *tmp_path;
1151 u32 i;
1152
1153 tmp_path = alloc_path_for_send();
1154 if (!tmp_path)
1155 return -ENOMEM;
1156
1157 if (data_offset >= ino_size) {
1158 /*
1159 * There may be extents that lie behind the file's size.
1160 * I at least had this in combination with snapshotting while
1161 * writing large files.
1162 */
1163 ret = 0;
1164 goto out;
1165 }
1166
1167 fi = btrfs_item_ptr(eb, path->slots[0],
1168 struct btrfs_file_extent_item);
1169 extent_type = btrfs_file_extent_type(eb, fi);
1170 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1176 logical = btrfs_file_extent_disk_bytenr(eb, fi);
1177 if (logical == 0) {
1178 ret = -ENOENT;
1179 goto out;
1180 }
1181 logical += btrfs_file_extent_offset(eb, fi);
1182
1183 ret = extent_from_logical(sctx->send_root->fs_info,
1184 logical, tmp_path, &found_key);
1185 btrfs_release_path(tmp_path);
1186
1187 if (ret < 0)
1188 goto out;
1189 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1190 ret = -EIO;
1191 goto out;
1192 }
1193
1194 /*
1195 * Setup the clone roots.
1196 */
1197 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1198 cur_clone_root = sctx->clone_roots + i;
1199 cur_clone_root->ino = (u64)-1;
1200 cur_clone_root->offset = 0;
1201 cur_clone_root->found_refs = 0;
1202 }
1203
1204 backref_ctx.sctx = sctx;
1205 backref_ctx.found = 0;
1206 backref_ctx.cur_objectid = ino;
1207 backref_ctx.cur_offset = data_offset;
1208 backref_ctx.found_in_send_root = 0;
1209 backref_ctx.extent_len = num_bytes;
1210
1211 /*
1212 * The last extent of a file may be too large due to page alignment.
1213 * We need to adjust extent_len in this case so that the checks in
1214 * __iterate_backrefs work.
1215 */
1216 if (data_offset + num_bytes >= ino_size)
1217 backref_ctx.extent_len = ino_size - data_offset;
1218
1219 /*
1220 * Now collect all backrefs.
1221 */
1222 extent_item_pos = logical - found_key.objectid;
1223 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1224 found_key.objectid, extent_item_pos, 1,
1225 __iterate_backrefs, &backref_ctx);
1226 if (ret < 0)
1227 goto out;
1228
1229 if (!backref_ctx.found_in_send_root) {
1230 /* found a bug in backref code? */
1231 ret = -EIO;
1232 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1233 "send_root. inode=%llu, offset=%llu, "
1234 "logical=%llu\n",
1235 ino, data_offset, logical);
1236 goto out;
1237 }
1238
1239verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1240 "ino=%llu, "
1241 "num_bytes=%llu, logical=%llu\n",
1242 data_offset, ino, num_bytes, logical);
1243
1244 if (!backref_ctx.found)
1245 verbose_printk("btrfs: no clones found\n");
1246
1247 cur_clone_root = NULL;
1248 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1249 if (sctx->clone_roots[i].found_refs) {
1250 if (!cur_clone_root)
1251 cur_clone_root = sctx->clone_roots + i;
1252 else if (sctx->clone_roots[i].root == sctx->send_root)
1253 /* prefer clones from send_root over others */
1254 cur_clone_root = sctx->clone_roots + i;
1255 break;
1256 }
1257
1258 }
1259
1260 if (cur_clone_root) {
1261 *found = cur_clone_root;
1262 ret = 0;
1263 } else {
1264 ret = -ENOENT;
1265 }
1266
1267out:
1268 btrfs_free_path(tmp_path);
1269 return ret;
1270}
1271
1272static int read_symlink(struct send_ctx *sctx,
1273 struct btrfs_root *root,
1274 u64 ino,
1275 struct fs_path *dest)
1276{
1277 int ret;
1278 struct btrfs_path *path;
1279 struct btrfs_key key;
1280 struct btrfs_file_extent_item *ei;
1281 u8 type;
1282 u8 compression;
1283 unsigned long off;
1284 int len;
1285
1286 path = alloc_path_for_send();
1287 if (!path)
1288 return -ENOMEM;
1289
1290 key.objectid = ino;
1291 key.type = BTRFS_EXTENT_DATA_KEY;
1292 key.offset = 0;
1293 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1294 if (ret < 0)
1295 goto out;
1296 BUG_ON(ret);
1297
1298 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1299 struct btrfs_file_extent_item);
1300 type = btrfs_file_extent_type(path->nodes[0], ei);
1301 compression = btrfs_file_extent_compression(path->nodes[0], ei);
1302 BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
1303 BUG_ON(compression);
1304
1305 off = btrfs_file_extent_inline_start(ei);
1306 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1307
1308 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1309 if (ret < 0)
1310 goto out;
1311
1312out:
1313 btrfs_free_path(path);
1314 return ret;
1315}
1316
1317/*
1318 * Helper function to generate a file name that is unique in the root of
1319 * send_root and parent_root. This is used to generate names for orphan inodes.
1320 */
1321static int gen_unique_name(struct send_ctx *sctx,
1322 u64 ino, u64 gen,
1323 struct fs_path *dest)
1324{
1325 int ret = 0;
1326 struct btrfs_path *path;
1327 struct btrfs_dir_item *di;
1328 char tmp[64];
1329 int len;
1330 u64 idx = 0;
1331
1332 path = alloc_path_for_send();
1333 if (!path)
1334 return -ENOMEM;
1335
1336 while (1) {
1337 len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
1338 ino, gen, idx);
1339 if (len >= sizeof(tmp)) {
1340 /* should really not happen */
1341 ret = -EOVERFLOW;
1342 goto out;
1343 }
1344
1345 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1346 path, BTRFS_FIRST_FREE_OBJECTID,
1347 tmp, strlen(tmp), 0);
1348 btrfs_release_path(path);
1349 if (IS_ERR(di)) {
1350 ret = PTR_ERR(di);
1351 goto out;
1352 }
1353 if (di) {
1354 /* not unique, try again */
1355 idx++;
1356 continue;
1357 }
1358
1359 if (!sctx->parent_root) {
1360 /* unique */
1361 ret = 0;
1362 break;
1363 }
1364
1365 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1366 path, BTRFS_FIRST_FREE_OBJECTID,
1367 tmp, strlen(tmp), 0);
1368 btrfs_release_path(path);
1369 if (IS_ERR(di)) {
1370 ret = PTR_ERR(di);
1371 goto out;
1372 }
1373 if (di) {
1374 /* not unique, try again */
1375 idx++;
1376 continue;
1377 }
1378 /* unique */
1379 break;
1380 }
1381
1382 ret = fs_path_add(dest, tmp, strlen(tmp));
1383
1384out:
1385 btrfs_free_path(path);
1386 return ret;
1387}
1388
/*
 * State of an inode (identified by ino and generation) relative to the
 * current send progress, as computed by get_cur_inode_state().
 */
enum inode_state {
	/* exists with the same generation in send_root and parent_root */
	inode_state_no_change,
	/* exists in send_root only, ino is not below send_progress yet */
	inode_state_will_create,
	/* exists in send_root only, ino is below send_progress */
	inode_state_did_create,
	/* exists in parent_root only, ino is not below send_progress yet */
	inode_state_will_delete,
	/* exists in parent_root only, ino is below send_progress */
	inode_state_did_delete,
};
1396
1397static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1398{
1399 int ret;
1400 int left_ret;
1401 int right_ret;
1402 u64 left_gen;
1403 u64 right_gen;
1404
1405 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1406 NULL);
1407 if (ret < 0 && ret != -ENOENT)
1408 goto out;
1409 left_ret = ret;
1410
1411 if (!sctx->parent_root) {
1412 right_ret = -ENOENT;
1413 } else {
1414 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1415 NULL, NULL, NULL);
1416 if (ret < 0 && ret != -ENOENT)
1417 goto out;
1418 right_ret = ret;
1419 }
1420
1421 if (!left_ret && !right_ret) {
1422 if (left_gen == gen && right_gen == gen)
1423 ret = inode_state_no_change;
1424 else if (left_gen == gen) {
1425 if (ino < sctx->send_progress)
1426 ret = inode_state_did_create;
1427 else
1428 ret = inode_state_will_create;
1429 } else if (right_gen == gen) {
1430 if (ino < sctx->send_progress)
1431 ret = inode_state_did_delete;
1432 else
1433 ret = inode_state_will_delete;
1434 } else {
1435 ret = -ENOENT;
1436 }
1437 } else if (!left_ret) {
1438 if (left_gen == gen) {
1439 if (ino < sctx->send_progress)
1440 ret = inode_state_did_create;
1441 else
1442 ret = inode_state_will_create;
1443 } else {
1444 ret = -ENOENT;
1445 }
1446 } else if (!right_ret) {
1447 if (right_gen == gen) {
1448 if (ino < sctx->send_progress)
1449 ret = inode_state_did_delete;
1450 else
1451 ret = inode_state_will_delete;
1452 } else {
1453 ret = -ENOENT;
1454 }
1455 } else {
1456 ret = -ENOENT;
1457 }
1458
1459out:
1460 return ret;
1461}
1462
1463static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
1464{
1465 int ret;
1466
1467 ret = get_cur_inode_state(sctx, ino, gen);
1468 if (ret < 0)
1469 goto out;
1470
1471 if (ret == inode_state_no_change ||
1472 ret == inode_state_did_create ||
1473 ret == inode_state_will_delete)
1474 ret = 1;
1475 else
1476 ret = 0;
1477
1478out:
1479 return ret;
1480}
1481
1482/*
1483 * Helper function to lookup a dir item in a dir.
1484 */
1485static int lookup_dir_item_inode(struct btrfs_root *root,
1486 u64 dir, const char *name, int name_len,
1487 u64 *found_inode,
1488 u8 *found_type)
1489{
1490 int ret = 0;
1491 struct btrfs_dir_item *di;
1492 struct btrfs_key key;
1493 struct btrfs_path *path;
1494
1495 path = alloc_path_for_send();
1496 if (!path)
1497 return -ENOMEM;
1498
1499 di = btrfs_lookup_dir_item(NULL, root, path,
1500 dir, name, name_len, 0);
1501 if (!di) {
1502 ret = -ENOENT;
1503 goto out;
1504 }
1505 if (IS_ERR(di)) {
1506 ret = PTR_ERR(di);
1507 goto out;
1508 }
1509 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1510 *found_inode = key.objectid;
1511 *found_type = btrfs_dir_type(path->nodes[0], di);
1512
1513out:
1514 btrfs_free_path(path);
1515 return ret;
1516}
1517
1518static int get_first_ref(struct send_ctx *sctx,
1519 struct btrfs_root *root, u64 ino,
1520 u64 *dir, u64 *dir_gen, struct fs_path *name)
1521{
1522 int ret;
1523 struct btrfs_key key;
1524 struct btrfs_key found_key;
1525 struct btrfs_path *path;
1526 struct btrfs_inode_ref *iref;
1527 int len;
1528
1529 path = alloc_path_for_send();
1530 if (!path)
1531 return -ENOMEM;
1532
1533 key.objectid = ino;
1534 key.type = BTRFS_INODE_REF_KEY;
1535 key.offset = 0;
1536
1537 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
1538 if (ret < 0)
1539 goto out;
1540 if (!ret)
1541 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1542 path->slots[0]);
1543 if (ret || found_key.objectid != key.objectid ||
1544 found_key.type != key.type) {
1545 ret = -ENOENT;
1546 goto out;
1547 }
1548
1549 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1550 struct btrfs_inode_ref);
1551 len = btrfs_inode_ref_name_len(path->nodes[0], iref);
1552 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1553 (unsigned long)(iref + 1), len);
1554 if (ret < 0)
1555 goto out;
1556 btrfs_release_path(path);
1557
1558 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
1559 NULL);
1560 if (ret < 0)
1561 goto out;
1562
1563 *dir = found_key.offset;
1564
1565out:
1566 btrfs_free_path(path);
1567 return ret;
1568}
1569
1570static int is_first_ref(struct send_ctx *sctx,
1571 struct btrfs_root *root,
1572 u64 ino, u64 dir,
1573 const char *name, int name_len)
1574{
1575 int ret;
1576 struct fs_path *tmp_name;
1577 u64 tmp_dir;
1578 u64 tmp_dir_gen;
1579
1580 tmp_name = fs_path_alloc(sctx);
1581 if (!tmp_name)
1582 return -ENOMEM;
1583
1584 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1585 if (ret < 0)
1586 goto out;
1587
1588 if (name_len != fs_path_len(tmp_name)) {
1589 ret = 0;
1590 goto out;
1591 }
1592
1593 ret = memcmp(tmp_name->start, name, name_len);
1594 if (ret)
1595 ret = 0;
1596 else
1597 ret = 1;
1598
1599out:
1600 fs_path_free(sctx, tmp_name);
1601 return ret;
1602}
1603
1604static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1605 const char *name, int name_len,
1606 u64 *who_ino, u64 *who_gen)
1607{
1608 int ret = 0;
1609 u64 other_inode = 0;
1610 u8 other_type = 0;
1611
1612 if (!sctx->parent_root)
1613 goto out;
1614
1615 ret = is_inode_existent(sctx, dir, dir_gen);
1616 if (ret <= 0)
1617 goto out;
1618
1619 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
1620 &other_inode, &other_type);
1621 if (ret < 0 && ret != -ENOENT)
1622 goto out;
1623 if (ret) {
1624 ret = 0;
1625 goto out;
1626 }
1627
1628 if (other_inode > sctx->send_progress) {
1629 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1630 who_gen, NULL, NULL, NULL);
1631 if (ret < 0)
1632 goto out;
1633
1634 ret = 1;
1635 *who_ino = other_inode;
1636 } else {
1637 ret = 0;
1638 }
1639
1640out:
1641 return ret;
1642}
1643
1644static int did_overwrite_ref(struct send_ctx *sctx,
1645 u64 dir, u64 dir_gen,
1646 u64 ino, u64 ino_gen,
1647 const char *name, int name_len)
1648{
1649 int ret = 0;
1650 u64 gen;
1651 u64 ow_inode;
1652 u8 other_type;
1653
1654 if (!sctx->parent_root)
1655 goto out;
1656
1657 ret = is_inode_existent(sctx, dir, dir_gen);
1658 if (ret <= 0)
1659 goto out;
1660
1661 /* check if the ref was overwritten by another ref */
1662 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
1663 &ow_inode, &other_type);
1664 if (ret < 0 && ret != -ENOENT)
1665 goto out;
1666 if (ret) {
1667 /* was never and will never be overwritten */
1668 ret = 0;
1669 goto out;
1670 }
1671
1672 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1673 NULL);
1674 if (ret < 0)
1675 goto out;
1676
1677 if (ow_inode == ino && gen == ino_gen) {
1678 ret = 0;
1679 goto out;
1680 }
1681
1682 /* we know that it is or will be overwritten. check this now */
1683 if (ow_inode < sctx->send_progress)
1684 ret = 1;
1685 else
1686 ret = 0;
1687
1688out:
1689 return ret;
1690}
1691
1692static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1693{
1694 int ret = 0;
1695 struct fs_path *name = NULL;
1696 u64 dir;
1697 u64 dir_gen;
1698
1699 if (!sctx->parent_root)
1700 goto out;
1701
1702 name = fs_path_alloc(sctx);
1703 if (!name)
1704 return -ENOMEM;
1705
1706 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
1707 if (ret < 0)
1708 goto out;
1709
1710 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
1711 name->start, fs_path_len(name));
1712 if (ret < 0)
1713 goto out;
1714
1715out:
1716 fs_path_free(sctx, name);
1717 return ret;
1718}
1719
1720static int name_cache_insert(struct send_ctx *sctx,
1721 struct name_cache_entry *nce)
1722{
1723 int ret = 0;
1724 struct name_cache_entry **ncea;
1725
1726 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1727 if (ncea) {
1728 if (!ncea[0])
1729 ncea[0] = nce;
1730 else if (!ncea[1])
1731 ncea[1] = nce;
1732 else
1733 BUG();
1734 } else {
1735 ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
1736 if (!ncea)
1737 return -ENOMEM;
1738
1739 ncea[0] = nce;
1740 ncea[1] = NULL;
1741 ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
1742 if (ret < 0)
1743 return ret;
1744 }
1745 list_add_tail(&nce->list, &sctx->name_cache_list);
1746 sctx->name_cache_size++;
1747
1748 return ret;
1749}
1750
1751static void name_cache_delete(struct send_ctx *sctx,
1752 struct name_cache_entry *nce)
1753{
1754 struct name_cache_entry **ncea;
1755
1756 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1757 BUG_ON(!ncea);
1758
1759 if (ncea[0] == nce)
1760 ncea[0] = NULL;
1761 else if (ncea[1] == nce)
1762 ncea[1] = NULL;
1763 else
1764 BUG();
1765
1766 if (!ncea[0] && !ncea[1]) {
1767 radix_tree_delete(&sctx->name_cache, nce->ino);
1768 kfree(ncea);
1769 }
1770
1771 list_del(&nce->list);
1772
1773 sctx->name_cache_size--;
1774}
1775
1776static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
1777 u64 ino, u64 gen)
1778{
1779 struct name_cache_entry **ncea;
1780
1781 ncea = radix_tree_lookup(&sctx->name_cache, ino);
1782 if (!ncea)
1783 return NULL;
1784
1785 if (ncea[0] && ncea[0]->gen == gen)
1786 return ncea[0];
1787 else if (ncea[1] && ncea[1]->gen == gen)
1788 return ncea[1];
1789 return NULL;
1790}
1791
1792static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
1793{
1794 list_del(&nce->list);
1795 list_add_tail(&nce->list, &sctx->name_cache_list);
1796}
1797
1798static void name_cache_clean_unused(struct send_ctx *sctx)
1799{
1800 struct name_cache_entry *nce;
1801
1802 if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
1803 return;
1804
1805 while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
1806 nce = list_entry(sctx->name_cache_list.next,
1807 struct name_cache_entry, list);
1808 name_cache_delete(sctx, nce);
1809 kfree(nce);
1810 }
1811}
1812
1813static void name_cache_free(struct send_ctx *sctx)
1814{
1815 struct name_cache_entry *nce;
1816 struct name_cache_entry *tmp;
1817
1818 list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
1819 name_cache_delete(sctx, nce);
1820 }
1821}
1822
1823static int __get_cur_name_and_parent(struct send_ctx *sctx,
1824 u64 ino, u64 gen,
1825 u64 *parent_ino,
1826 u64 *parent_gen,
1827 struct fs_path *dest)
1828{
1829 int ret;
1830 int nce_ret;
1831 struct btrfs_path *path = NULL;
1832 struct name_cache_entry *nce = NULL;
1833
1834 nce = name_cache_search(sctx, ino, gen);
1835 if (nce) {
1836 if (ino < sctx->send_progress && nce->need_later_update) {
1837 name_cache_delete(sctx, nce);
1838 kfree(nce);
1839 nce = NULL;
1840 } else {
1841 name_cache_used(sctx, nce);
1842 *parent_ino = nce->parent_ino;
1843 *parent_gen = nce->parent_gen;
1844 ret = fs_path_add(dest, nce->name, nce->name_len);
1845 if (ret < 0)
1846 goto out;
1847 ret = nce->ret;
1848 goto out;
1849 }
1850 }
1851
1852 path = alloc_path_for_send();
1853 if (!path)
1854 return -ENOMEM;
1855
1856 ret = is_inode_existent(sctx, ino, gen);
1857 if (ret < 0)
1858 goto out;
1859
1860 if (!ret) {
1861 ret = gen_unique_name(sctx, ino, gen, dest);
1862 if (ret < 0)
1863 goto out;
1864 ret = 1;
1865 goto out_cache;
1866 }
1867
1868 if (ino < sctx->send_progress)
1869 ret = get_first_ref(sctx, sctx->send_root, ino,
1870 parent_ino, parent_gen, dest);
1871 else
1872 ret = get_first_ref(sctx, sctx->parent_root, ino,
1873 parent_ino, parent_gen, dest);
1874 if (ret < 0)
1875 goto out;
1876
1877 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
1878 dest->start, dest->end - dest->start);
1879 if (ret < 0)
1880 goto out;
1881 if (ret) {
1882 fs_path_reset(dest);
1883 ret = gen_unique_name(sctx, ino, gen, dest);
1884 if (ret < 0)
1885 goto out;
1886 ret = 1;
1887 }
1888
1889out_cache:
1890 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
1891 if (!nce) {
1892 ret = -ENOMEM;
1893 goto out;
1894 }
1895
1896 nce->ino = ino;
1897 nce->gen = gen;
1898 nce->parent_ino = *parent_ino;
1899 nce->parent_gen = *parent_gen;
1900 nce->name_len = fs_path_len(dest);
1901 nce->ret = ret;
1902 strcpy(nce->name, dest->start);
1903 memset(&nce->use_list, 0, sizeof(nce->use_list));
1904
1905 if (ino < sctx->send_progress)
1906 nce->need_later_update = 0;
1907 else
1908 nce->need_later_update = 1;
1909
1910 nce_ret = name_cache_insert(sctx, nce);
1911 if (nce_ret < 0)
1912 ret = nce_ret;
1913 name_cache_clean_unused(sctx);
1914
1915out:
1916 btrfs_free_path(path);
1917 return ret;
1918}
1919
1920/*
1921 * Magic happens here. This function returns the first ref to an inode as it
1922 * would look like while receiving the stream at this point in time.
1923 * We walk the path up to the root. For every inode in between, we check if it
1924 * was already processed/sent. If yes, we continue with the parent as found
1925 * in send_root. If not, we continue with the parent as found in parent_root.
1926 * If we encounter an inode that was deleted at this point in time, we use the
1927 * inodes "orphan" name instead of the real name and stop. Same with new inodes
1928 * that were not created yet and overwritten inodes/refs.
1929 *
1930 * When do we have have orphan inodes:
1931 * 1. When an inode is freshly created and thus no valid refs are available yet
1932 * 2. When a directory lost all it's refs (deleted) but still has dir items
1933 * inside which were not processed yet (pending for move/delete). If anyone
1934 * tried to get the path to the dir items, it would get a path inside that
1935 * orphan directory.
1936 * 3. When an inode is moved around or gets new links, it may overwrite the ref
1937 * of an unprocessed inode. If in that case the first ref would be
1938 * overwritten, the overwritten inode gets "orphanized". Later when we
1939 * process this overwritten inode, it is restored at a new place by moving
1940 * the orphan inode.
1941 *
1942 * sctx->send_progress tells this function at which point in time receiving
1943 * would be.
1944 */
1945static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
1946 struct fs_path *dest)
1947{
1948 int ret = 0;
1949 struct fs_path *name = NULL;
1950 u64 parent_inode = 0;
1951 u64 parent_gen = 0;
1952 int stop = 0;
1953
1954 name = fs_path_alloc(sctx);
1955 if (!name) {
1956 ret = -ENOMEM;
1957 goto out;
1958 }
1959
1960 dest->reversed = 1;
1961 fs_path_reset(dest);
1962
1963 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
1964 fs_path_reset(name);
1965
1966 ret = __get_cur_name_and_parent(sctx, ino, gen,
1967 &parent_inode, &parent_gen, name);
1968 if (ret < 0)
1969 goto out;
1970 if (ret)
1971 stop = 1;
1972
1973 ret = fs_path_add_path(dest, name);
1974 if (ret < 0)
1975 goto out;
1976
1977 ino = parent_inode;
1978 gen = parent_gen;
1979 }
1980
1981out:
1982 fs_path_free(sctx, name);
1983 if (!ret)
1984 fs_path_unreverse(dest);
1985 return ret;
1986}
1987
1988/*
1989 * Called for regular files when sending extents data. Opens a struct file
1990 * to read from the file.
1991 */
1992static int open_cur_inode_file(struct send_ctx *sctx)
1993{
1994 int ret = 0;
1995 struct btrfs_key key;
1996 struct path path;
1997 struct inode *inode;
1998 struct dentry *dentry;
1999 struct file *filp;
2000 int new = 0;
2001
2002 if (sctx->cur_inode_filp)
2003 goto out;
2004
2005 key.objectid = sctx->cur_ino;
2006 key.type = BTRFS_INODE_ITEM_KEY;
2007 key.offset = 0;
2008
2009 inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
2010 &new);
2011 if (IS_ERR(inode)) {
2012 ret = PTR_ERR(inode);
2013 goto out;
2014 }
2015
2016 dentry = d_obtain_alias(inode);
2017 inode = NULL;
2018 if (IS_ERR(dentry)) {
2019 ret = PTR_ERR(dentry);
2020 goto out;
2021 }
2022
2023 path.mnt = sctx->mnt;
2024 path.dentry = dentry;
2025 filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
2026 dput(dentry);
2027 dentry = NULL;
2028 if (IS_ERR(filp)) {
2029 ret = PTR_ERR(filp);
2030 goto out;
2031 }
2032 sctx->cur_inode_filp = filp;
2033
2034out:
2035 /*
2036 * no xxxput required here as every vfs op
2037 * does it by itself on failure
2038 */
2039 return ret;
2040}
2041
2042/*
2043 * Closes the struct file that was created in open_cur_inode_file
2044 */
2045static int close_cur_inode_file(struct send_ctx *sctx)
2046{
2047 int ret = 0;
2048
2049 if (!sctx->cur_inode_filp)
2050 goto out;
2051
2052 ret = filp_close(sctx->cur_inode_filp, NULL);
2053 sctx->cur_inode_filp = NULL;
2054
2055out:
2056 return ret;
2057}
2058
2059/*
2060 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2061 */
2062static int send_subvol_begin(struct send_ctx *sctx)
2063{
2064 int ret;
2065 struct btrfs_root *send_root = sctx->send_root;
2066 struct btrfs_root *parent_root = sctx->parent_root;
2067 struct btrfs_path *path;
2068 struct btrfs_key key;
2069 struct btrfs_root_ref *ref;
2070 struct extent_buffer *leaf;
2071 char *name = NULL;
2072 int namelen;
2073
2074 path = alloc_path_for_send();
2075 if (!path)
2076 return -ENOMEM;
2077
2078 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
2079 if (!name) {
2080 btrfs_free_path(path);
2081 return -ENOMEM;
2082 }
2083
2084 key.objectid = send_root->objectid;
2085 key.type = BTRFS_ROOT_BACKREF_KEY;
2086 key.offset = 0;
2087
2088 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2089 &key, path, 1, 0);
2090 if (ret < 0)
2091 goto out;
2092 if (ret) {
2093 ret = -ENOENT;
2094 goto out;
2095 }
2096
2097 leaf = path->nodes[0];
2098 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2099 if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2100 key.objectid != send_root->objectid) {
2101 ret = -ENOENT;
2102 goto out;
2103 }
2104 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2105 namelen = btrfs_root_ref_name_len(leaf, ref);
2106 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2107 btrfs_release_path(path);
2108
2109 if (ret < 0)
2110 goto out;
2111
2112 if (parent_root) {
2113 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2114 if (ret < 0)
2115 goto out;
2116 } else {
2117 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2118 if (ret < 0)
2119 goto out;
2120 }
2121
2122 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2123 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2124 sctx->send_root->root_item.uuid);
2125 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2126 sctx->send_root->root_item.ctransid);
2127 if (parent_root) {
2128 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2129 sctx->parent_root->root_item.uuid);
2130 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2131 sctx->parent_root->root_item.ctransid);
2132 }
2133
2134 ret = send_cmd(sctx);
2135
2136tlv_put_failure:
2137out:
2138 btrfs_free_path(path);
2139 kfree(name);
2140 return ret;
2141}
2142
2143static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2144{
2145 int ret = 0;
2146 struct fs_path *p;
2147
2148verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2149
2150 p = fs_path_alloc(sctx);
2151 if (!p)
2152 return -ENOMEM;
2153
2154 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2155 if (ret < 0)
2156 goto out;
2157
2158 ret = get_cur_path(sctx, ino, gen, p);
2159 if (ret < 0)
2160 goto out;
2161 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2162 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2163
2164 ret = send_cmd(sctx);
2165
2166tlv_put_failure:
2167out:
2168 fs_path_free(sctx, p);
2169 return ret;
2170}
2171
2172static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2173{
2174 int ret = 0;
2175 struct fs_path *p;
2176
2177verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2178
2179 p = fs_path_alloc(sctx);
2180 if (!p)
2181 return -ENOMEM;
2182
2183 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2184 if (ret < 0)
2185 goto out;
2186
2187 ret = get_cur_path(sctx, ino, gen, p);
2188 if (ret < 0)
2189 goto out;
2190 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2191 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2192
2193 ret = send_cmd(sctx);
2194
2195tlv_put_failure:
2196out:
2197 fs_path_free(sctx, p);
2198 return ret;
2199}
2200
2201static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2202{
2203 int ret = 0;
2204 struct fs_path *p;
2205
2206verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2207
2208 p = fs_path_alloc(sctx);
2209 if (!p)
2210 return -ENOMEM;
2211
2212 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2213 if (ret < 0)
2214 goto out;
2215
2216 ret = get_cur_path(sctx, ino, gen, p);
2217 if (ret < 0)
2218 goto out;
2219 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2220 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2221 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2222
2223 ret = send_cmd(sctx);
2224
2225tlv_put_failure:
2226out:
2227 fs_path_free(sctx, p);
2228 return ret;
2229}
2230
2231static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2232{
2233 int ret = 0;
2234 struct fs_path *p = NULL;
2235 struct btrfs_inode_item *ii;
2236 struct btrfs_path *path = NULL;
2237 struct extent_buffer *eb;
2238 struct btrfs_key key;
2239 int slot;
2240
2241verbose_printk("btrfs: send_utimes %llu\n", ino);
2242
2243 p = fs_path_alloc(sctx);
2244 if (!p)
2245 return -ENOMEM;
2246
2247 path = alloc_path_for_send();
2248 if (!path) {
2249 ret = -ENOMEM;
2250 goto out;
2251 }
2252
2253 key.objectid = ino;
2254 key.type = BTRFS_INODE_ITEM_KEY;
2255 key.offset = 0;
2256 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2257 if (ret < 0)
2258 goto out;
2259
2260 eb = path->nodes[0];
2261 slot = path->slots[0];
2262 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2263
2264 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2265 if (ret < 0)
2266 goto out;
2267
2268 ret = get_cur_path(sctx, ino, gen, p);
2269 if (ret < 0)
2270 goto out;
2271 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2272 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
2273 btrfs_inode_atime(ii));
2274 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
2275 btrfs_inode_mtime(ii));
2276 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2277 btrfs_inode_ctime(ii));
2278 /* TODO otime? */
2279
2280 ret = send_cmd(sctx);
2281
2282tlv_put_failure:
2283out:
2284 fs_path_free(sctx, p);
2285 btrfs_free_path(path);
2286 return ret;
2287}
2288
2289/*
2290 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2291 * a valid path yet because we did not process the refs yet. So, the inode
2292 * is created as orphan.
2293 */
2294static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
2295 struct btrfs_key *key)
2296{
2297 int ret = 0;
2298 struct extent_buffer *eb = path->nodes[0];
2299 struct btrfs_inode_item *ii;
2300 struct fs_path *p;
2301 int slot = path->slots[0];
2302 int cmd;
2303 u64 mode;
2304
2305verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
2306
2307 p = fs_path_alloc(sctx);
2308 if (!p)
2309 return -ENOMEM;
2310
2311 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2312 mode = btrfs_inode_mode(eb, ii);
2313
2314 if (S_ISREG(mode))
2315 cmd = BTRFS_SEND_C_MKFILE;
2316 else if (S_ISDIR(mode))
2317 cmd = BTRFS_SEND_C_MKDIR;
2318 else if (S_ISLNK(mode))
2319 cmd = BTRFS_SEND_C_SYMLINK;
2320 else if (S_ISCHR(mode) || S_ISBLK(mode))
2321 cmd = BTRFS_SEND_C_MKNOD;
2322 else if (S_ISFIFO(mode))
2323 cmd = BTRFS_SEND_C_MKFIFO;
2324 else if (S_ISSOCK(mode))
2325 cmd = BTRFS_SEND_C_MKSOCK;
2326 else {
2327 printk(KERN_WARNING "btrfs: unexpected inode type %o",
2328 (int)(mode & S_IFMT));
2329 ret = -ENOTSUPP;
2330 goto out;
2331 }
2332
2333 ret = begin_cmd(sctx, cmd);
2334 if (ret < 0)
2335 goto out;
2336
2337 ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
2338 if (ret < 0)
2339 goto out;
2340
2341 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2342 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
2343
2344 if (S_ISLNK(mode)) {
2345 fs_path_reset(p);
2346 ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
2347 if (ret < 0)
2348 goto out;
2349 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2350 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2351 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2352 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
2353 }
2354
2355 ret = send_cmd(sctx);
2356 if (ret < 0)
2357 goto out;
2358
2359
2360tlv_put_failure:
2361out:
2362 fs_path_free(sctx, p);
2363 return ret;
2364}
2365
2366struct recorded_ref {
2367 struct list_head list;
2368 char *dir_path;
2369 char *name;
2370 struct fs_path *full_path;
2371 u64 dir;
2372 u64 dir_gen;
2373 int dir_path_len;
2374 int name_len;
2375};
2376
2377/*
2378 * We need to process new refs before deleted refs, but compare_tree gives us
2379 * everything mixed. So we first record all refs and later process them.
2380 * This function is a helper to record one ref.
2381 */
2382static int record_ref(struct list_head *head, u64 dir,
2383 u64 dir_gen, struct fs_path *path)
2384{
2385 struct recorded_ref *ref;
2386 char *tmp;
2387
2388 ref = kmalloc(sizeof(*ref), GFP_NOFS);
2389 if (!ref)
2390 return -ENOMEM;
2391
2392 ref->dir = dir;
2393 ref->dir_gen = dir_gen;
2394 ref->full_path = path;
2395
2396 tmp = strrchr(ref->full_path->start, '/');
2397 if (!tmp) {
2398 ref->name_len = ref->full_path->end - ref->full_path->start;
2399 ref->name = ref->full_path->start;
2400 ref->dir_path_len = 0;
2401 ref->dir_path = ref->full_path->start;
2402 } else {
2403 tmp++;
2404 ref->name_len = ref->full_path->end - tmp;
2405 ref->name = tmp;
2406 ref->dir_path = ref->full_path->start;
2407 ref->dir_path_len = ref->full_path->end -
2408 ref->full_path->start - 1 - ref->name_len;
2409 }
2410
2411 list_add_tail(&ref->list, head);
2412 return 0;
2413}
2414
2415static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2416{
2417 struct recorded_ref *cur;
2418 struct recorded_ref *tmp;
2419
2420 list_for_each_entry_safe(cur, tmp, head, list) {
2421 fs_path_free(sctx, cur->full_path);
2422 kfree(cur);
2423 }
2424 INIT_LIST_HEAD(head);
2425}
2426
2427static void free_recorded_refs(struct send_ctx *sctx)
2428{
2429 __free_recorded_refs(sctx, &sctx->new_refs);
2430 __free_recorded_refs(sctx, &sctx->deleted_refs);
2431}
2432
2433/*
2434 * Renames/moves a file/dir to it's orphan name. Used when the first
2435 * ref of an unprocessed inode gets overwritten and for all non empty
2436 * directories.
2437 */
2438static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2439 struct fs_path *path)
2440{
2441 int ret;
2442 struct fs_path *orphan;
2443
2444 orphan = fs_path_alloc(sctx);
2445 if (!orphan)
2446 return -ENOMEM;
2447
2448 ret = gen_unique_name(sctx, ino, gen, orphan);
2449 if (ret < 0)
2450 goto out;
2451
2452 ret = send_rename(sctx, path, orphan);
2453
2454out:
2455 fs_path_free(sctx, orphan);
2456 return ret;
2457}
2458
2459/*
2460 * Returns 1 if a directory can be removed at this point in time.
2461 * We check this by iterating all dir items and checking if the inode behind
2462 * the dir item was already processed.
2463 */
2464static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2465{
2466 int ret = 0;
2467 struct btrfs_root *root = sctx->parent_root;
2468 struct btrfs_path *path;
2469 struct btrfs_key key;
2470 struct btrfs_key found_key;
2471 struct btrfs_key loc;
2472 struct btrfs_dir_item *di;
2473
2474 path = alloc_path_for_send();
2475 if (!path)
2476 return -ENOMEM;
2477
2478 key.objectid = dir;
2479 key.type = BTRFS_DIR_INDEX_KEY;
2480 key.offset = 0;
2481
2482 while (1) {
2483 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
2484 if (ret < 0)
2485 goto out;
2486 if (!ret) {
2487 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2488 path->slots[0]);
2489 }
2490 if (ret || found_key.objectid != key.objectid ||
2491 found_key.type != key.type) {
2492 break;
2493 }
2494
2495 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2496 struct btrfs_dir_item);
2497 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2498
2499 if (loc.objectid > send_progress) {
2500 ret = 0;
2501 goto out;
2502 }
2503
2504 btrfs_release_path(path);
2505 key.offset = found_key.offset + 1;
2506 }
2507
2508 ret = 1;
2509
2510out:
2511 btrfs_free_path(path);
2512 return ret;
2513}
2514
2515struct finish_unordered_dir_ctx {
2516 struct send_ctx *sctx;
2517 struct fs_path *cur_path;
2518 struct fs_path *dir_path;
2519 u64 dir_ino;
2520 int need_delete;
2521 int delete_pass;
2522};
2523
2524int __finish_unordered_dir(int num, struct btrfs_key *di_key,
2525 const char *name, int name_len,
2526 const char *data, int data_len,
2527 u8 type, void *ctx)
2528{
2529 int ret = 0;
2530 struct finish_unordered_dir_ctx *fctx = ctx;
2531 struct send_ctx *sctx = fctx->sctx;
2532 u64 di_gen;
2533 u64 di_mode;
2534 int is_orphan = 0;
2535
2536 if (di_key->objectid >= fctx->dir_ino)
2537 goto out;
2538
2539 fs_path_reset(fctx->cur_path);
2540
2541 ret = get_inode_info(sctx->send_root, di_key->objectid,
2542 NULL, &di_gen, &di_mode, NULL, NULL);
2543 if (ret < 0)
2544 goto out;
2545
2546 ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
2547 fctx->dir_ino, name, name_len);
2548 if (ret < 0)
2549 goto out;
2550 if (ret) {
2551 is_orphan = 1;
2552 ret = gen_unique_name(sctx, di_key->objectid, di_gen,
2553 fctx->cur_path);
2554 } else {
2555 ret = get_cur_path(sctx, di_key->objectid, di_gen,
2556 fctx->cur_path);
2557 }
2558 if (ret < 0)
2559 goto out;
2560
2561 ret = fs_path_add(fctx->dir_path, name, name_len);
2562 if (ret < 0)
2563 goto out;
2564
2565 if (!fctx->delete_pass) {
2566 if (S_ISDIR(di_mode)) {
2567 ret = send_rename(sctx, fctx->cur_path,
2568 fctx->dir_path);
2569 } else {
2570 ret = send_link(sctx, fctx->dir_path,
2571 fctx->cur_path);
2572 if (is_orphan)
2573 fctx->need_delete = 1;
2574 }
2575 } else if (!S_ISDIR(di_mode)) {
2576 ret = send_unlink(sctx, fctx->cur_path);
2577 } else {
2578 ret = 0;
2579 }
2580
2581 fs_path_remove(fctx->dir_path);
2582
2583out:
2584 return ret;
2585}
2586
2587/*
2588 * Go through all dir items and see if we find refs which could not be created
2589 * in the past because the dir did not exist at that time.
2590 */
2591static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
2592{
2593 int ret = 0;
2594 struct btrfs_path *path = NULL;
2595 struct btrfs_key key;
2596 struct btrfs_key found_key;
2597 struct extent_buffer *eb;
2598 struct finish_unordered_dir_ctx fctx;
2599 int slot;
2600
2601 path = alloc_path_for_send();
2602 if (!path) {
2603 ret = -ENOMEM;
2604 goto out;
2605 }
2606
2607 memset(&fctx, 0, sizeof(fctx));
2608 fctx.sctx = sctx;
2609 fctx.cur_path = fs_path_alloc(sctx);
2610 fctx.dir_path = fs_path_alloc(sctx);
2611 if (!fctx.cur_path || !fctx.dir_path) {
2612 ret = -ENOMEM;
2613 goto out;
2614 }
2615 fctx.dir_ino = dir;
2616
2617 ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
2618 if (ret < 0)
2619 goto out;
2620
2621 /*
2622 * We do two passes. The first links in the new refs and the second
2623 * deletes orphans if required. Deletion of orphans is not required for
2624 * directory inodes, as we always have only one ref and use rename
2625 * instead of link for those.
2626 */
2627
2628again:
2629 key.objectid = dir;
2630 key.type = BTRFS_DIR_ITEM_KEY;
2631 key.offset = 0;
2632 while (1) {
2633 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2634 1, 0);
2635 if (ret < 0)
2636 goto out;
2637 eb = path->nodes[0];
2638 slot = path->slots[0];
2639 btrfs_item_key_to_cpu(eb, &found_key, slot);
2640
2641 if (found_key.objectid != key.objectid ||
2642 found_key.type != key.type) {
2643 btrfs_release_path(path);
2644 break;
2645 }
2646
2647 ret = iterate_dir_item(sctx, sctx->send_root, path,
2648 &found_key, __finish_unordered_dir,
2649 &fctx);
2650 if (ret < 0)
2651 goto out;
2652
2653 key.offset = found_key.offset + 1;
2654 btrfs_release_path(path);
2655 }
2656
2657 if (!fctx.delete_pass && fctx.need_delete) {
2658 fctx.delete_pass = 1;
2659 goto again;
2660 }
2661
2662out:
2663 btrfs_free_path(path);
2664 fs_path_free(sctx, fctx.cur_path);
2665 fs_path_free(sctx, fctx.dir_path);
2666 return ret;
2667}
2668
2669/*
2670 * This does all the move/link/unlink/rmdir magic.
2671 */
2672static int process_recorded_refs(struct send_ctx *sctx)
2673{
2674 int ret = 0;
2675 struct recorded_ref *cur;
2676 struct ulist *check_dirs = NULL;
2677 struct ulist_iterator uit;
2678 struct ulist_node *un;
2679 struct fs_path *valid_path = NULL;
2680 u64 ow_inode = 0;
2681 u64 ow_gen;
2682 int did_overwrite = 0;
2683 int is_orphan = 0;
2684
2685verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2686
2687 valid_path = fs_path_alloc(sctx);
2688 if (!valid_path) {
2689 ret = -ENOMEM;
2690 goto out;
2691 }
2692
2693 check_dirs = ulist_alloc(GFP_NOFS);
2694 if (!check_dirs) {
2695 ret = -ENOMEM;
2696 goto out;
2697 }
2698
2699 /*
2700 * First, check if the first ref of the current inode was overwritten
2701 * before. If yes, we know that the current inode was already orphanized
2702 * and thus use the orphan name. If not, we can use get_cur_path to
2703 * get the path of the first ref as it would like while receiving at
2704 * this point in time.
2705 * New inodes are always orphan at the beginning, so force to use the
2706 * orphan name in this case.
2707 * The first ref is stored in valid_path and will be updated if it
2708 * gets moved around.
2709 */
2710 if (!sctx->cur_inode_new) {
2711 ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
2712 sctx->cur_inode_gen);
2713 if (ret < 0)
2714 goto out;
2715 if (ret)
2716 did_overwrite = 1;
2717 }
2718 if (sctx->cur_inode_new || did_overwrite) {
2719 ret = gen_unique_name(sctx, sctx->cur_ino,
2720 sctx->cur_inode_gen, valid_path);
2721 if (ret < 0)
2722 goto out;
2723 is_orphan = 1;
2724 } else {
2725 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
2726 valid_path);
2727 if (ret < 0)
2728 goto out;
2729 }
2730
2731 list_for_each_entry(cur, &sctx->new_refs, list) {
2732 /*
2733 * Check if this new ref would overwrite the first ref of
2734 * another unprocessed inode. If yes, orphanize the
2735 * overwritten inode. If we find an overwritten ref that is
2736 * not the first ref, simply unlink it.
2737 */
2738 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
2739 cur->name, cur->name_len,
2740 &ow_inode, &ow_gen);
2741 if (ret < 0)
2742 goto out;
2743 if (ret) {
2744 ret = is_first_ref(sctx, sctx->parent_root,
2745 ow_inode, cur->dir, cur->name,
2746 cur->name_len);
2747 if (ret < 0)
2748 goto out;
2749 if (ret) {
2750 ret = orphanize_inode(sctx, ow_inode, ow_gen,
2751 cur->full_path);
2752 if (ret < 0)
2753 goto out;
2754 } else {
2755 ret = send_unlink(sctx, cur->full_path);
2756 if (ret < 0)
2757 goto out;
2758 }
2759 }
2760
2761 /*
2762 * link/move the ref to the new place. If we have an orphan
2763 * inode, move it and update valid_path. If not, link or move
2764 * it depending on the inode mode.
2765 */
2766 if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
2767 ret = send_rename(sctx, valid_path, cur->full_path);
2768 if (ret < 0)
2769 goto out;
2770 is_orphan = 0;
2771 ret = fs_path_copy(valid_path, cur->full_path);
2772 if (ret < 0)
2773 goto out;
2774 } else {
2775 if (S_ISDIR(sctx->cur_inode_mode)) {
2776 /*
2777 * Dirs can't be linked, so move it. For moved
2778 * dirs, we always have one new and one deleted
2779 * ref. The deleted ref is ignored later.
2780 */
2781 ret = send_rename(sctx, valid_path,
2782 cur->full_path);
2783 if (ret < 0)
2784 goto out;
2785 ret = fs_path_copy(valid_path, cur->full_path);
2786 if (ret < 0)
2787 goto out;
2788 } else {
2789 ret = send_link(sctx, cur->full_path,
2790 valid_path);
2791 if (ret < 0)
2792 goto out;
2793 }
2794 }
2795 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2796 GFP_NOFS);
2797 if (ret < 0)
2798 goto out;
2799 }
2800
2801 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
2802 /*
2803 * Check if we can already rmdir the directory. If not,
2804 * orphanize it. For every dir item inside that gets deleted
2805 * later, we do this check again and rmdir it then if possible.
2806 * See the use of check_dirs for more details.
2807 */
2808 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
2809 if (ret < 0)
2810 goto out;
2811 if (ret) {
2812 ret = send_rmdir(sctx, valid_path);
2813 if (ret < 0)
2814 goto out;
2815 } else if (!is_orphan) {
2816 ret = orphanize_inode(sctx, sctx->cur_ino,
2817 sctx->cur_inode_gen, valid_path);
2818 if (ret < 0)
2819 goto out;
2820 is_orphan = 1;
2821 }
2822
2823 list_for_each_entry(cur, &sctx->deleted_refs, list) {
2824 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2825 GFP_NOFS);
2826 if (ret < 0)
2827 goto out;
2828 }
2829 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
2830 /*
2831 * We have a non dir inode. Go through all deleted refs and
2832 * unlink them if they were not already overwritten by other
2833 * inodes.
2834 */
2835 list_for_each_entry(cur, &sctx->deleted_refs, list) {
2836 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
2837 sctx->cur_ino, sctx->cur_inode_gen,
2838 cur->name, cur->name_len);
2839 if (ret < 0)
2840 goto out;
2841 if (!ret) {
2842 /*
2843 * In case the inode was moved to a directory
2844 * that was not created yet (see
2845 * __record_new_ref), we can not unlink the ref
2846 * as it will be needed later when the parent
2847 * directory is created, so that we can move in
2848 * the inode to the new dir.
2849 */
2850 if (!is_orphan &&
2851 sctx->cur_inode_first_ref_orphan) {
2852 ret = orphanize_inode(sctx,
2853 sctx->cur_ino,
2854 sctx->cur_inode_gen,
2855 cur->full_path);
2856 if (ret < 0)
2857 goto out;
2858 ret = gen_unique_name(sctx,
2859 sctx->cur_ino,
2860 sctx->cur_inode_gen,
2861 valid_path);
2862 if (ret < 0)
2863 goto out;
2864 is_orphan = 1;
2865
2866 } else {
2867 ret = send_unlink(sctx, cur->full_path);
2868 if (ret < 0)
2869 goto out;
2870 }
2871 }
2872 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2873 GFP_NOFS);
2874 if (ret < 0)
2875 goto out;
2876 }
2877
2878 /*
2879 * If the inode is still orphan, unlink the orphan. This may
2880 * happen when a previous inode did overwrite the first ref
2881 * of this inode and no new refs were added for the current
2882 * inode.
2883 * We can however not delete the orphan in case the inode relies
2884 * in a directory that was not created yet (see
2885 * __record_new_ref)
2886 */
2887 if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
2888 ret = send_unlink(sctx, valid_path);
2889 if (ret < 0)
2890 goto out;
2891 }
2892 }
2893
2894 /*
2895 * We did collect all parent dirs where cur_inode was once located. We
2896 * now go through all these dirs and check if they are pending for
2897 * deletion and if it's finally possible to perform the rmdir now.
2898 * We also update the inode stats of the parent dirs here.
2899 */
2900 ULIST_ITER_INIT(&uit);
2901 while ((un = ulist_next(check_dirs, &uit))) {
2902 if (un->val > sctx->cur_ino)
2903 continue;
2904
2905 ret = get_cur_inode_state(sctx, un->val, un->aux);
2906 if (ret < 0)
2907 goto out;
2908
2909 if (ret == inode_state_did_create ||
2910 ret == inode_state_no_change) {
2911 /* TODO delayed utimes */
2912 ret = send_utimes(sctx, un->val, un->aux);
2913 if (ret < 0)
2914 goto out;
2915 } else if (ret == inode_state_did_delete) {
2916 ret = can_rmdir(sctx, un->val, sctx->cur_ino);
2917 if (ret < 0)
2918 goto out;
2919 if (ret) {
2920 ret = get_cur_path(sctx, un->val, un->aux,
2921 valid_path);
2922 if (ret < 0)
2923 goto out;
2924 ret = send_rmdir(sctx, valid_path);
2925 if (ret < 0)
2926 goto out;
2927 }
2928 }
2929 }
2930
2931 /*
2932 * Current inode is now at it's new position, so we must increase
2933 * send_progress
2934 */
2935 sctx->send_progress = sctx->cur_ino + 1;
2936
2937 /*
2938 * We may have a directory here that has pending refs which could not
2939 * be created before (because the dir did not exist before, see
2940 * __record_new_ref). finish_outoforder_dir will link/move the pending
2941 * refs.
2942 */
2943 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
2944 ret = finish_outoforder_dir(sctx, sctx->cur_ino,
2945 sctx->cur_inode_gen);
2946 if (ret < 0)
2947 goto out;
2948 }
2949
2950 ret = 0;
2951
2952out:
2953 free_recorded_refs(sctx);
2954 ulist_free(check_dirs);
2955 fs_path_free(sctx, valid_path);
2956 return ret;
2957}
2958
2959static int __record_new_ref(int num, u64 dir, int index,
2960 struct fs_path *name,
2961 void *ctx)
2962{
2963 int ret = 0;
2964 struct send_ctx *sctx = ctx;
2965 struct fs_path *p;
2966 u64 gen;
2967
2968 p = fs_path_alloc(sctx);
2969 if (!p)
2970 return -ENOMEM;
2971
2972 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
2973 NULL);
2974 if (ret < 0)
2975 goto out;
2976
2977 /*
2978 * The parent may be non-existent at this point in time. This happens
2979 * if the ino of the parent dir is higher then the current ino. In this
2980 * case, we can not process this ref until the parent dir is finally
2981 * created. If we reach the parent dir later, process_recorded_refs
2982 * will go through all dir items and process the refs that could not be
2983 * processed before. In case this is the first ref, we set
2984 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
2985 * keep an orphan of the inode so that it later can be used for
2986 * link/move
2987 */
2988 ret = is_inode_existent(sctx, dir, gen);
2989 if (ret < 0)
2990 goto out;
2991 if (!ret) {
2992 ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
2993 name->start, fs_path_len(name));
2994 if (ret < 0)
2995 goto out;
2996 if (ret)
2997 sctx->cur_inode_first_ref_orphan = 1;
2998 ret = 0;
2999 goto out;
3000 }
3001
3002 ret = get_cur_path(sctx, dir, gen, p);
3003 if (ret < 0)
3004 goto out;
3005 ret = fs_path_add_path(p, name);
3006 if (ret < 0)
3007 goto out;
3008
3009 ret = record_ref(&sctx->new_refs, dir, gen, p);
3010
3011out:
3012 if (ret)
3013 fs_path_free(sctx, p);
3014 return ret;
3015}
3016
3017static int __record_deleted_ref(int num, u64 dir, int index,
3018 struct fs_path *name,
3019 void *ctx)
3020{
3021 int ret = 0;
3022 struct send_ctx *sctx = ctx;
3023 struct fs_path *p;
3024 u64 gen;
3025
3026 p = fs_path_alloc(sctx);
3027 if (!p)
3028 return -ENOMEM;
3029
3030 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3031 NULL);
3032 if (ret < 0)
3033 goto out;
3034
3035 ret = get_cur_path(sctx, dir, gen, p);
3036 if (ret < 0)
3037 goto out;
3038 ret = fs_path_add_path(p, name);
3039 if (ret < 0)
3040 goto out;
3041
3042 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3043
3044out:
3045 if (ret)
3046 fs_path_free(sctx, p);
3047 return ret;
3048}
3049
3050static int record_new_ref(struct send_ctx *sctx)
3051{
3052 int ret;
3053
3054 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
3055 sctx->cmp_key, 0, __record_new_ref, sctx);
3056 if (ret < 0)
3057 goto out;
3058 ret = 0;
3059
3060out:
3061 return ret;
3062}
3063
3064static int record_deleted_ref(struct send_ctx *sctx)
3065{
3066 int ret;
3067
3068 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
3069 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3070 if (ret < 0)
3071 goto out;
3072 ret = 0;
3073
3074out:
3075 return ret;
3076}
3077
3078struct find_ref_ctx {
3079 u64 dir;
3080 struct fs_path *name;
3081 int found_idx;
3082};
3083
3084static int __find_iref(int num, u64 dir, int index,
3085 struct fs_path *name,
3086 void *ctx_)
3087{
3088 struct find_ref_ctx *ctx = ctx_;
3089
3090 if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
3091 strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
3092 ctx->found_idx = num;
3093 return 1;
3094 }
3095 return 0;
3096}
3097
3098static int find_iref(struct send_ctx *sctx,
3099 struct btrfs_root *root,
3100 struct btrfs_path *path,
3101 struct btrfs_key *key,
3102 u64 dir, struct fs_path *name)
3103{
3104 int ret;
3105 struct find_ref_ctx ctx;
3106
3107 ctx.dir = dir;
3108 ctx.name = name;
3109 ctx.found_idx = -1;
3110
3111 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
3112 if (ret < 0)
3113 return ret;
3114
3115 if (ctx.found_idx == -1)
3116 return -ENOENT;
3117
3118 return ctx.found_idx;
3119}
3120
3121static int __record_changed_new_ref(int num, u64 dir, int index,
3122 struct fs_path *name,
3123 void *ctx)
3124{
3125 int ret;
3126 struct send_ctx *sctx = ctx;
3127
3128 ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
3129 sctx->cmp_key, dir, name);
3130 if (ret == -ENOENT)
3131 ret = __record_new_ref(num, dir, index, name, sctx);
3132 else if (ret > 0)
3133 ret = 0;
3134
3135 return ret;
3136}
3137
3138static int __record_changed_deleted_ref(int num, u64 dir, int index,
3139 struct fs_path *name,
3140 void *ctx)
3141{
3142 int ret;
3143 struct send_ctx *sctx = ctx;
3144
3145 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
3146 dir, name);
3147 if (ret == -ENOENT)
3148 ret = __record_deleted_ref(num, dir, index, name, sctx);
3149 else if (ret > 0)
3150 ret = 0;
3151
3152 return ret;
3153}
3154
3155static int record_changed_ref(struct send_ctx *sctx)
3156{
3157 int ret = 0;
3158
3159 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
3160 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3161 if (ret < 0)
3162 goto out;
3163 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
3164 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3165 if (ret < 0)
3166 goto out;
3167 ret = 0;
3168
3169out:
3170 return ret;
3171}
3172
3173/*
3174 * Record and process all refs at once. Needed when an inode changes the
3175 * generation number, which means that it was deleted and recreated.
3176 */
3177static int process_all_refs(struct send_ctx *sctx,
3178 enum btrfs_compare_tree_result cmd)
3179{
3180 int ret;
3181 struct btrfs_root *root;
3182 struct btrfs_path *path;
3183 struct btrfs_key key;
3184 struct btrfs_key found_key;
3185 struct extent_buffer *eb;
3186 int slot;
3187 iterate_inode_ref_t cb;
3188
3189 path = alloc_path_for_send();
3190 if (!path)
3191 return -ENOMEM;
3192
3193 if (cmd == BTRFS_COMPARE_TREE_NEW) {
3194 root = sctx->send_root;
3195 cb = __record_new_ref;
3196 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
3197 root = sctx->parent_root;
3198 cb = __record_deleted_ref;
3199 } else {
3200 BUG();
3201 }
3202
3203 key.objectid = sctx->cmp_key->objectid;
3204 key.type = BTRFS_INODE_REF_KEY;
3205 key.offset = 0;
3206 while (1) {
3207 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3208 if (ret < 0) {
3209 btrfs_release_path(path);
3210 goto out;
3211 }
3212 if (ret) {
3213 btrfs_release_path(path);
3214 break;
3215 }
3216
3217 eb = path->nodes[0];
3218 slot = path->slots[0];
3219 btrfs_item_key_to_cpu(eb, &found_key, slot);
3220
3221 if (found_key.objectid != key.objectid ||
3222 found_key.type != key.type) {
3223 btrfs_release_path(path);
3224 break;
3225 }
3226
3227 ret = iterate_inode_ref(sctx, sctx->parent_root, path,
3228 &found_key, 0, cb, sctx);
3229 btrfs_release_path(path);
3230 if (ret < 0)
3231 goto out;
3232
3233 key.offset = found_key.offset + 1;
3234 }
3235
3236 ret = process_recorded_refs(sctx);
3237
3238out:
3239 btrfs_free_path(path);
3240 return ret;
3241}
3242
3243static int send_set_xattr(struct send_ctx *sctx,
3244 struct fs_path *path,
3245 const char *name, int name_len,
3246 const char *data, int data_len)
3247{
3248 int ret = 0;
3249
3250 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
3251 if (ret < 0)
3252 goto out;
3253
3254 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
3255 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
3256 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
3257
3258 ret = send_cmd(sctx);
3259
3260tlv_put_failure:
3261out:
3262 return ret;
3263}
3264
3265static int send_remove_xattr(struct send_ctx *sctx,
3266 struct fs_path *path,
3267 const char *name, int name_len)
3268{
3269 int ret = 0;
3270
3271 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
3272 if (ret < 0)
3273 goto out;
3274
3275 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
3276 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
3277
3278 ret = send_cmd(sctx);
3279
3280tlv_put_failure:
3281out:
3282 return ret;
3283}
3284
3285static int __process_new_xattr(int num, struct btrfs_key *di_key,
3286 const char *name, int name_len,
3287 const char *data, int data_len,
3288 u8 type, void *ctx)
3289{
3290 int ret;
3291 struct send_ctx *sctx = ctx;
3292 struct fs_path *p;
3293 posix_acl_xattr_header dummy_acl;
3294
3295 p = fs_path_alloc(sctx);
3296 if (!p)
3297 return -ENOMEM;
3298
3299 /*
3300 * This hack is needed because empty acl's are stored as zero byte
3301 * data in xattrs. Problem with that is, that receiving these zero byte
3302 * acl's will fail later. To fix this, we send a dummy acl list that
3303 * only contains the version number and no entries.
3304 */
3305 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
3306 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
3307 if (data_len == 0) {
3308 dummy_acl.a_version =
3309 cpu_to_le32(POSIX_ACL_XATTR_VERSION);
3310 data = (char *)&dummy_acl;
3311 data_len = sizeof(dummy_acl);
3312 }
3313 }
3314
3315 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3316 if (ret < 0)
3317 goto out;
3318
3319 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3320
3321out:
3322 fs_path_free(sctx, p);
3323 return ret;
3324}
3325
3326static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3327 const char *name, int name_len,
3328 const char *data, int data_len,
3329 u8 type, void *ctx)
3330{
3331 int ret;
3332 struct send_ctx *sctx = ctx;
3333 struct fs_path *p;
3334
3335 p = fs_path_alloc(sctx);
3336 if (!p)
3337 return -ENOMEM;
3338
3339 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3340 if (ret < 0)
3341 goto out;
3342
3343 ret = send_remove_xattr(sctx, p, name, name_len);
3344
3345out:
3346 fs_path_free(sctx, p);
3347 return ret;
3348}
3349
3350static int process_new_xattr(struct send_ctx *sctx)
3351{
3352 int ret = 0;
3353
3354 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
3355 sctx->cmp_key, __process_new_xattr, sctx);
3356
3357 return ret;
3358}
3359
3360static int process_deleted_xattr(struct send_ctx *sctx)
3361{
3362 int ret;
3363
3364 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
3365 sctx->cmp_key, __process_deleted_xattr, sctx);
3366
3367 return ret;
3368}
3369
/* Search state shared between find_xattr() and its callback __find_xattr(). */
struct find_xattr_ctx {
	/* Name to look for and its length in bytes. */
	const char *name;
	int name_len;
	/* Iteration index of the match, -1 while nothing was found. */
	int found_idx;
	/* kmalloc'ed copy of the matching value and its length. */
	char *found_data;
	int found_data_len;
};
3377
3378static int __find_xattr(int num, struct btrfs_key *di_key,
3379 const char *name, int name_len,
3380 const char *data, int data_len,
3381 u8 type, void *vctx)
3382{
3383 struct find_xattr_ctx *ctx = vctx;
3384
3385 if (name_len == ctx->name_len &&
3386 strncmp(name, ctx->name, name_len) == 0) {
3387 ctx->found_idx = num;
3388 ctx->found_data_len = data_len;
3389 ctx->found_data = kmalloc(data_len, GFP_NOFS);
3390 if (!ctx->found_data)
3391 return -ENOMEM;
3392 memcpy(ctx->found_data, data, data_len);
3393 return 1;
3394 }
3395 return 0;
3396}
3397
3398static int find_xattr(struct send_ctx *sctx,
3399 struct btrfs_root *root,
3400 struct btrfs_path *path,
3401 struct btrfs_key *key,
3402 const char *name, int name_len,
3403 char **data, int *data_len)
3404{
3405 int ret;
3406 struct find_xattr_ctx ctx;
3407
3408 ctx.name = name;
3409 ctx.name_len = name_len;
3410 ctx.found_idx = -1;
3411 ctx.found_data = NULL;
3412 ctx.found_data_len = 0;
3413
3414 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
3415 if (ret < 0)
3416 return ret;
3417
3418 if (ctx.found_idx == -1)
3419 return -ENOENT;
3420 if (data) {
3421 *data = ctx.found_data;
3422 *data_len = ctx.found_data_len;
3423 } else {
3424 kfree(ctx.found_data);
3425 }
3426 return ctx.found_idx;
3427}
3428
3429
3430static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3431 const char *name, int name_len,
3432 const char *data, int data_len,
3433 u8 type, void *ctx)
3434{
3435 int ret;
3436 struct send_ctx *sctx = ctx;
3437 char *found_data = NULL;
3438 int found_data_len = 0;
3439 struct fs_path *p = NULL;
3440
3441 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
3442 sctx->cmp_key, name, name_len, &found_data,
3443 &found_data_len);
3444 if (ret == -ENOENT) {
3445 ret = __process_new_xattr(num, di_key, name, name_len, data,
3446 data_len, type, ctx);
3447 } else if (ret >= 0) {
3448 if (data_len != found_data_len ||
3449 memcmp(data, found_data, data_len)) {
3450 ret = __process_new_xattr(num, di_key, name, name_len,
3451 data, data_len, type, ctx);
3452 } else {
3453 ret = 0;
3454 }
3455 }
3456
3457 kfree(found_data);
3458 fs_path_free(sctx, p);
3459 return ret;
3460}
3461
3462static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3463 const char *name, int name_len,
3464 const char *data, int data_len,
3465 u8 type, void *ctx)
3466{
3467 int ret;
3468 struct send_ctx *sctx = ctx;
3469
3470 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
3471 name, name_len, NULL, NULL);
3472 if (ret == -ENOENT)
3473 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3474 data_len, type, ctx);
3475 else if (ret >= 0)
3476 ret = 0;
3477
3478 return ret;
3479}
3480
3481static int process_changed_xattr(struct send_ctx *sctx)
3482{
3483 int ret = 0;
3484
3485 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
3486 sctx->cmp_key, __process_changed_new_xattr, sctx);
3487 if (ret < 0)
3488 goto out;
3489 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
3490 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3491
3492out:
3493 return ret;
3494}
3495
3496static int process_all_new_xattrs(struct send_ctx *sctx)
3497{
3498 int ret;
3499 struct btrfs_root *root;
3500 struct btrfs_path *path;
3501 struct btrfs_key key;
3502 struct btrfs_key found_key;
3503 struct extent_buffer *eb;
3504 int slot;
3505
3506 path = alloc_path_for_send();
3507 if (!path)
3508 return -ENOMEM;
3509
3510 root = sctx->send_root;
3511
3512 key.objectid = sctx->cmp_key->objectid;
3513 key.type = BTRFS_XATTR_ITEM_KEY;
3514 key.offset = 0;
3515 while (1) {
3516 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3517 if (ret < 0)
3518 goto out;
3519 if (ret) {
3520 ret = 0;
3521 goto out;
3522 }
3523
3524 eb = path->nodes[0];
3525 slot = path->slots[0];
3526 btrfs_item_key_to_cpu(eb, &found_key, slot);
3527
3528 if (found_key.objectid != key.objectid ||
3529 found_key.type != key.type) {
3530 ret = 0;
3531 goto out;
3532 }
3533
3534 ret = iterate_dir_item(sctx, root, path, &found_key,
3535 __process_new_xattr, sctx);
3536 if (ret < 0)
3537 goto out;
3538
3539 btrfs_release_path(path);
3540 key.offset = found_key.offset + 1;
3541 }
3542
3543out:
3544 btrfs_free_path(path);
3545 return ret;
3546}
3547
3548/*
3549 * Read some bytes from the current inode/file and send a write command to
3550 * user space.
3551 */
3552static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3553{
3554 int ret = 0;
3555 struct fs_path *p;
3556 loff_t pos = offset;
3557 int readed = 0;
3558 mm_segment_t old_fs;
3559
3560 p = fs_path_alloc(sctx);
3561 if (!p)
3562 return -ENOMEM;
3563
3564 /*
3565 * vfs normally only accepts user space buffers for security reasons.
3566 * we only read from the file and also only provide the read_buf buffer
3567 * to vfs. As this buffer does not come from a user space call, it's
3568 * ok to temporary allow kernel space buffers.
3569 */
3570 old_fs = get_fs();
3571 set_fs(KERNEL_DS);
3572
3573verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3574
3575 ret = open_cur_inode_file(sctx);
3576 if (ret < 0)
3577 goto out;
3578
3579 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
3580 if (ret < 0)
3581 goto out;
3582 readed = ret;
3583 if (!readed)
3584 goto out;
3585
3586 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
3587 if (ret < 0)
3588 goto out;
3589
3590 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3591 if (ret < 0)
3592 goto out;
3593
3594 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3595 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3596 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
3597
3598 ret = send_cmd(sctx);
3599
3600tlv_put_failure:
3601out:
3602 fs_path_free(sctx, p);
3603 set_fs(old_fs);
3604 if (ret < 0)
3605 return ret;
3606 return readed;
3607}
3608
3609/*
3610 * Send a clone command to user space.
3611 */
3612static int send_clone(struct send_ctx *sctx,
3613 u64 offset, u32 len,
3614 struct clone_root *clone_root)
3615{
3616 int ret = 0;
3617 struct btrfs_root *clone_root2 = clone_root->root;
3618 struct fs_path *p;
3619 u64 gen;
3620
3621verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3622 "clone_inode=%llu, clone_offset=%llu\n", offset, len,
3623 clone_root->root->objectid, clone_root->ino,
3624 clone_root->offset);
3625
3626 p = fs_path_alloc(sctx);
3627 if (!p)
3628 return -ENOMEM;
3629
3630 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
3631 if (ret < 0)
3632 goto out;
3633
3634 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3635 if (ret < 0)
3636 goto out;
3637
3638 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3639 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
3640 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3641
3642 if (clone_root2 == sctx->send_root) {
3643 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
3644 &gen, NULL, NULL, NULL);
3645 if (ret < 0)
3646 goto out;
3647 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3648 } else {
3649 ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
3650 }
3651 if (ret < 0)
3652 goto out;
3653
3654 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3655 clone_root2->root_item.uuid);
3656 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3657 clone_root2->root_item.ctransid);
3658 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3659 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3660 clone_root->offset);
3661
3662 ret = send_cmd(sctx);
3663
3664tlv_put_failure:
3665out:
3666 fs_path_free(sctx, p);
3667 return ret;
3668}
3669
3670static int send_write_or_clone(struct send_ctx *sctx,
3671 struct btrfs_path *path,
3672 struct btrfs_key *key,
3673 struct clone_root *clone_root)
3674{
3675 int ret = 0;
3676 struct btrfs_file_extent_item *ei;
3677 u64 offset = key->offset;
3678 u64 pos = 0;
3679 u64 len;
3680 u32 l;
3681 u8 type;
3682
3683 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3684 struct btrfs_file_extent_item);
3685 type = btrfs_file_extent_type(path->nodes[0], ei);
3686 if (type == BTRFS_FILE_EXTENT_INLINE)
3687 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
3688 else
3689 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3690
3691 if (offset + len > sctx->cur_inode_size)
3692 len = sctx->cur_inode_size - offset;
3693 if (len == 0) {
3694 ret = 0;
3695 goto out;
3696 }
3697
3698 if (!clone_root) {
3699 while (pos < len) {
3700 l = len - pos;
3701 if (l > BTRFS_SEND_READ_SIZE)
3702 l = BTRFS_SEND_READ_SIZE;
3703 ret = send_write(sctx, pos + offset, l);
3704 if (ret < 0)
3705 goto out;
3706 if (!ret)
3707 break;
3708 pos += ret;
3709 }
3710 ret = 0;
3711 } else {
3712 ret = send_clone(sctx, offset, len, clone_root);
3713 }
3714
3715out:
3716 return ret;
3717}
3718
3719static int is_extent_unchanged(struct send_ctx *sctx,
3720 struct btrfs_path *left_path,
3721 struct btrfs_key *ekey)
3722{
3723 int ret = 0;
3724 struct btrfs_key key;
3725 struct btrfs_path *path = NULL;
3726 struct extent_buffer *eb;
3727 int slot;
3728 struct btrfs_key found_key;
3729 struct btrfs_file_extent_item *ei;
3730 u64 left_disknr;
3731 u64 right_disknr;
3732 u64 left_offset;
3733 u64 right_offset;
3734 u64 left_offset_fixed;
3735 u64 left_len;
3736 u64 right_len;
3737 u8 left_type;
3738 u8 right_type;
3739
3740 path = alloc_path_for_send();
3741 if (!path)
3742 return -ENOMEM;
3743
3744 eb = left_path->nodes[0];
3745 slot = left_path->slots[0];
3746
3747 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3748 left_type = btrfs_file_extent_type(eb, ei);
3749 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3750 left_len = btrfs_file_extent_num_bytes(eb, ei);
3751 left_offset = btrfs_file_extent_offset(eb, ei);
3752
3753 if (left_type != BTRFS_FILE_EXTENT_REG) {
3754 ret = 0;
3755 goto out;
3756 }
3757
3758 /*
3759 * Following comments will refer to these graphics. L is the left
3760 * extents which we are checking at the moment. 1-8 are the right
3761 * extents that we iterate.
3762 *
3763 * |-----L-----|
3764 * |-1-|-2a-|-3-|-4-|-5-|-6-|
3765 *
3766 * |-----L-----|
3767 * |--1--|-2b-|...(same as above)
3768 *
3769 * Alternative situation. Happens on files where extents got split.
3770 * |-----L-----|
3771 * |-----------7-----------|-6-|
3772 *
3773 * Alternative situation. Happens on files which got larger.
3774 * |-----L-----|
3775 * |-8-|
3776 * Nothing follows after 8.
3777 */
3778
3779 key.objectid = ekey->objectid;
3780 key.type = BTRFS_EXTENT_DATA_KEY;
3781 key.offset = ekey->offset;
3782 ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
3783 if (ret < 0)
3784 goto out;
3785 if (ret) {
3786 ret = 0;
3787 goto out;
3788 }
3789
3790 /*
3791 * Handle special case where the right side has no extents at all.
3792 */
3793 eb = path->nodes[0];
3794 slot = path->slots[0];
3795 btrfs_item_key_to_cpu(eb, &found_key, slot);
3796 if (found_key.objectid != key.objectid ||
3797 found_key.type != key.type) {
3798 ret = 0;
3799 goto out;
3800 }
3801
3802 /*
3803 * We're now on 2a, 2b or 7.
3804 */
3805 key = found_key;
3806 while (key.offset < ekey->offset + left_len) {
3807 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3808 right_type = btrfs_file_extent_type(eb, ei);
3809 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3810 right_len = btrfs_file_extent_num_bytes(eb, ei);
3811 right_offset = btrfs_file_extent_offset(eb, ei);
3812
3813 if (right_type != BTRFS_FILE_EXTENT_REG) {
3814 ret = 0;
3815 goto out;
3816 }
3817
3818 /*
3819 * Are we at extent 8? If yes, we know the extent is changed.
3820 * This may only happen on the first iteration.
3821 */
3822 if (found_key.offset + right_len < ekey->offset) {
3823 ret = 0;
3824 goto out;
3825 }
3826
3827 left_offset_fixed = left_offset;
3828 if (key.offset < ekey->offset) {
3829 /* Fix the right offset for 2a and 7. */
3830 right_offset += ekey->offset - key.offset;
3831 } else {
3832 /* Fix the left offset for all behind 2a and 2b */
3833 left_offset_fixed += key.offset - ekey->offset;
3834 }
3835
3836 /*
3837 * Check if we have the same extent.
3838 */
3839 if (left_disknr + left_offset_fixed !=
3840 right_disknr + right_offset) {
3841 ret = 0;
3842 goto out;
3843 }
3844
3845 /*
3846 * Go to the next extent.
3847 */
3848 ret = btrfs_next_item(sctx->parent_root, path);
3849 if (ret < 0)
3850 goto out;
3851 if (!ret) {
3852 eb = path->nodes[0];
3853 slot = path->slots[0];
3854 btrfs_item_key_to_cpu(eb, &found_key, slot);
3855 }
3856 if (ret || found_key.objectid != key.objectid ||
3857 found_key.type != key.type) {
3858 key.offset += right_len;
3859 break;
3860 } else {
3861 if (found_key.offset != key.offset + right_len) {
3862 /* Should really not happen */
3863 ret = -EIO;
3864 goto out;
3865 }
3866 }
3867 key = found_key;
3868 }
3869
3870 /*
3871 * We're now behind the left extent (treat as unchanged) or at the end
3872 * of the right side (treat as changed).
3873 */
3874 if (key.offset >= ekey->offset + left_len)
3875 ret = 1;
3876 else
3877 ret = 0;
3878
3879
3880out:
3881 btrfs_free_path(path);
3882 return ret;
3883}
3884
3885static int process_extent(struct send_ctx *sctx,
3886 struct btrfs_path *path,
3887 struct btrfs_key *key)
3888{
3889 int ret = 0;
3890 struct clone_root *found_clone = NULL;
3891
3892 if (S_ISLNK(sctx->cur_inode_mode))
3893 return 0;
3894
3895 if (sctx->parent_root && !sctx->cur_inode_new) {
3896 ret = is_extent_unchanged(sctx, path, key);
3897 if (ret < 0)
3898 goto out;
3899 if (ret) {
3900 ret = 0;
3901 goto out;
3902 }
3903 }
3904
3905 ret = find_extent_clone(sctx, path, key->objectid, key->offset,
3906 sctx->cur_inode_size, &found_clone);
3907 if (ret != -ENOENT && ret < 0)
3908 goto out;
3909
3910 ret = send_write_or_clone(sctx, path, key, found_clone);
3911
3912out:
3913 return ret;
3914}
3915
3916static int process_all_extents(struct send_ctx *sctx)
3917{
3918 int ret;
3919 struct btrfs_root *root;
3920 struct btrfs_path *path;
3921 struct btrfs_key key;
3922 struct btrfs_key found_key;
3923 struct extent_buffer *eb;
3924 int slot;
3925
3926 root = sctx->send_root;
3927 path = alloc_path_for_send();
3928 if (!path)
3929 return -ENOMEM;
3930
3931 key.objectid = sctx->cmp_key->objectid;
3932 key.type = BTRFS_EXTENT_DATA_KEY;
3933 key.offset = 0;
3934 while (1) {
3935 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3936 if (ret < 0)
3937 goto out;
3938 if (ret) {
3939 ret = 0;
3940 goto out;
3941 }
3942
3943 eb = path->nodes[0];
3944 slot = path->slots[0];
3945 btrfs_item_key_to_cpu(eb, &found_key, slot);
3946
3947 if (found_key.objectid != key.objectid ||
3948 found_key.type != key.type) {
3949 ret = 0;
3950 goto out;
3951 }
3952
3953 ret = process_extent(sctx, path, &found_key);
3954 if (ret < 0)
3955 goto out;
3956
3957 btrfs_release_path(path);
3958 key.offset = found_key.offset + 1;
3959 }
3960
3961out:
3962 btrfs_free_path(path);
3963 return ret;
3964}
3965
3966static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
3967{
3968 int ret = 0;
3969
3970 if (sctx->cur_ino == 0)
3971 goto out;
3972 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
3973 sctx->cmp_key->type <= BTRFS_INODE_REF_KEY)
3974 goto out;
3975 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
3976 goto out;
3977
3978 ret = process_recorded_refs(sctx);
3979
3980out:
3981 return ret;
3982}
3983
3984static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
3985{
3986 int ret = 0;
3987 u64 left_mode;
3988 u64 left_uid;
3989 u64 left_gid;
3990 u64 right_mode;
3991 u64 right_uid;
3992 u64 right_gid;
3993 int need_chmod = 0;
3994 int need_chown = 0;
3995
3996 ret = process_recorded_refs_if_needed(sctx, at_end);
3997 if (ret < 0)
3998 goto out;
3999
4000 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
4001 goto out;
4002 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
4003 goto out;
4004
4005 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
4006 &left_mode, &left_uid, &left_gid);
4007 if (ret < 0)
4008 goto out;
4009
4010 if (!S_ISLNK(sctx->cur_inode_mode)) {
4011 if (!sctx->parent_root || sctx->cur_inode_new) {
4012 need_chmod = 1;
4013 need_chown = 1;
4014 } else {
4015 ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
4016 NULL, NULL, &right_mode, &right_uid,
4017 &right_gid);
4018 if (ret < 0)
4019 goto out;
4020
4021 if (left_uid != right_uid || left_gid != right_gid)
4022 need_chown = 1;
4023 if (left_mode != right_mode)
4024 need_chmod = 1;
4025 }
4026 }
4027
4028 if (S_ISREG(sctx->cur_inode_mode)) {
4029 ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4030 sctx->cur_inode_size);
4031 if (ret < 0)
4032 goto out;
4033 }
4034
4035 if (need_chown) {
4036 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4037 left_uid, left_gid);
4038 if (ret < 0)
4039 goto out;
4040 }
4041 if (need_chmod) {
4042 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4043 left_mode);
4044 if (ret < 0)
4045 goto out;
4046 }
4047
4048 /*
4049 * Need to send that every time, no matter if it actually changed
4050 * between the two trees as we have done changes to the inode before.
4051 */
4052 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4053 if (ret < 0)
4054 goto out;
4055
4056out:
4057 return ret;
4058}
4059
4060static int changed_inode(struct send_ctx *sctx,
4061 enum btrfs_compare_tree_result result)
4062{
4063 int ret = 0;
4064 struct btrfs_key *key = sctx->cmp_key;
4065 struct btrfs_inode_item *left_ii = NULL;
4066 struct btrfs_inode_item *right_ii = NULL;
4067 u64 left_gen = 0;
4068 u64 right_gen = 0;
4069
4070 ret = close_cur_inode_file(sctx);
4071 if (ret < 0)
4072 goto out;
4073
4074 sctx->cur_ino = key->objectid;
4075 sctx->cur_inode_new_gen = 0;
4076 sctx->cur_inode_first_ref_orphan = 0;
4077 sctx->send_progress = sctx->cur_ino;
4078
4079 if (result == BTRFS_COMPARE_TREE_NEW ||
4080 result == BTRFS_COMPARE_TREE_CHANGED) {
4081 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
4082 sctx->left_path->slots[0],
4083 struct btrfs_inode_item);
4084 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
4085 left_ii);
4086 } else {
4087 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
4088 sctx->right_path->slots[0],
4089 struct btrfs_inode_item);
4090 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4091 right_ii);
4092 }
4093 if (result == BTRFS_COMPARE_TREE_CHANGED) {
4094 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
4095 sctx->right_path->slots[0],
4096 struct btrfs_inode_item);
4097
4098 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4099 right_ii);
4100 if (left_gen != right_gen)
4101 sctx->cur_inode_new_gen = 1;
4102 }
4103
4104 if (result == BTRFS_COMPARE_TREE_NEW) {
4105 sctx->cur_inode_gen = left_gen;
4106 sctx->cur_inode_new = 1;
4107 sctx->cur_inode_deleted = 0;
4108 sctx->cur_inode_size = btrfs_inode_size(
4109 sctx->left_path->nodes[0], left_ii);
4110 sctx->cur_inode_mode = btrfs_inode_mode(
4111 sctx->left_path->nodes[0], left_ii);
4112 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4113 ret = send_create_inode(sctx, sctx->left_path,
4114 sctx->cmp_key);
4115 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4116 sctx->cur_inode_gen = right_gen;
4117 sctx->cur_inode_new = 0;
4118 sctx->cur_inode_deleted = 1;
4119 sctx->cur_inode_size = btrfs_inode_size(
4120 sctx->right_path->nodes[0], right_ii);
4121 sctx->cur_inode_mode = btrfs_inode_mode(
4122 sctx->right_path->nodes[0], right_ii);
4123 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4124 if (sctx->cur_inode_new_gen) {
4125 sctx->cur_inode_gen = right_gen;
4126 sctx->cur_inode_new = 0;
4127 sctx->cur_inode_deleted = 1;
4128 sctx->cur_inode_size = btrfs_inode_size(
4129 sctx->right_path->nodes[0], right_ii);
4130 sctx->cur_inode_mode = btrfs_inode_mode(
4131 sctx->right_path->nodes[0], right_ii);
4132 ret = process_all_refs(sctx,
4133 BTRFS_COMPARE_TREE_DELETED);
4134 if (ret < 0)
4135 goto out;
4136
4137 sctx->cur_inode_gen = left_gen;
4138 sctx->cur_inode_new = 1;
4139 sctx->cur_inode_deleted = 0;
4140 sctx->cur_inode_size = btrfs_inode_size(
4141 sctx->left_path->nodes[0], left_ii);
4142 sctx->cur_inode_mode = btrfs_inode_mode(
4143 sctx->left_path->nodes[0], left_ii);
4144 ret = send_create_inode(sctx, sctx->left_path,
4145 sctx->cmp_key);
4146 if (ret < 0)
4147 goto out;
4148
4149 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4150 if (ret < 0)
4151 goto out;
4152 ret = process_all_extents(sctx);
4153 if (ret < 0)
4154 goto out;
4155 ret = process_all_new_xattrs(sctx);
4156 if (ret < 0)
4157 goto out;
4158 } else {
4159 sctx->cur_inode_gen = left_gen;
4160 sctx->cur_inode_new = 0;
4161 sctx->cur_inode_new_gen = 0;
4162 sctx->cur_inode_deleted = 0;
4163 sctx->cur_inode_size = btrfs_inode_size(
4164 sctx->left_path->nodes[0], left_ii);
4165 sctx->cur_inode_mode = btrfs_inode_mode(
4166 sctx->left_path->nodes[0], left_ii);
4167 }
4168 }
4169
4170out:
4171 return ret;
4172}
4173
4174static int changed_ref(struct send_ctx *sctx,
4175 enum btrfs_compare_tree_result result)
4176{
4177 int ret = 0;
4178
4179 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4180
4181 if (!sctx->cur_inode_new_gen &&
4182 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
4183 if (result == BTRFS_COMPARE_TREE_NEW)
4184 ret = record_new_ref(sctx);
4185 else if (result == BTRFS_COMPARE_TREE_DELETED)
4186 ret = record_deleted_ref(sctx);
4187 else if (result == BTRFS_COMPARE_TREE_CHANGED)
4188 ret = record_changed_ref(sctx);
4189 }
4190
4191 return ret;
4192}
4193
4194static int changed_xattr(struct send_ctx *sctx,
4195 enum btrfs_compare_tree_result result)
4196{
4197 int ret = 0;
4198
4199 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4200
4201 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
4202 if (result == BTRFS_COMPARE_TREE_NEW)
4203 ret = process_new_xattr(sctx);
4204 else if (result == BTRFS_COMPARE_TREE_DELETED)
4205 ret = process_deleted_xattr(sctx);
4206 else if (result == BTRFS_COMPARE_TREE_CHANGED)
4207 ret = process_changed_xattr(sctx);
4208 }
4209
4210 return ret;
4211}
4212
4213static int changed_extent(struct send_ctx *sctx,
4214 enum btrfs_compare_tree_result result)
4215{
4216 int ret = 0;
4217
4218 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4219
4220 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
4221 if (result != BTRFS_COMPARE_TREE_DELETED)
4222 ret = process_extent(sctx, sctx->left_path,
4223 sctx->cmp_key);
4224 }
4225
4226 return ret;
4227}
4228
4229
4230static int changed_cb(struct btrfs_root *left_root,
4231 struct btrfs_root *right_root,
4232 struct btrfs_path *left_path,
4233 struct btrfs_path *right_path,
4234 struct btrfs_key *key,
4235 enum btrfs_compare_tree_result result,
4236 void *ctx)
4237{
4238 int ret = 0;
4239 struct send_ctx *sctx = ctx;
4240
4241 sctx->left_path = left_path;
4242 sctx->right_path = right_path;
4243 sctx->cmp_key = key;
4244
4245 ret = finish_inode_if_needed(sctx, 0);
4246 if (ret < 0)
4247 goto out;
4248
4249 if (key->type == BTRFS_INODE_ITEM_KEY)
4250 ret = changed_inode(sctx, result);
4251 else if (key->type == BTRFS_INODE_REF_KEY)
4252 ret = changed_ref(sctx, result);
4253 else if (key->type == BTRFS_XATTR_ITEM_KEY)
4254 ret = changed_xattr(sctx, result);
4255 else if (key->type == BTRFS_EXTENT_DATA_KEY)
4256 ret = changed_extent(sctx, result);
4257
4258out:
4259 return ret;
4260}
4261
4262static int full_send_tree(struct send_ctx *sctx)
4263{
4264 int ret;
4265 struct btrfs_trans_handle *trans = NULL;
4266 struct btrfs_root *send_root = sctx->send_root;
4267 struct btrfs_key key;
4268 struct btrfs_key found_key;
4269 struct btrfs_path *path;
4270 struct extent_buffer *eb;
4271 int slot;
4272 u64 start_ctransid;
4273 u64 ctransid;
4274
4275 path = alloc_path_for_send();
4276 if (!path)
4277 return -ENOMEM;
4278
4279 spin_lock(&send_root->root_times_lock);
4280 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4281 spin_unlock(&send_root->root_times_lock);
4282
4283 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4284 key.type = BTRFS_INODE_ITEM_KEY;
4285 key.offset = 0;
4286
4287join_trans:
4288 /*
4289 * We need to make sure the transaction does not get committed
4290 * while we do anything on commit roots. Join a transaction to prevent
4291 * this.
4292 */
4293 trans = btrfs_join_transaction(send_root);
4294 if (IS_ERR(trans)) {
4295 ret = PTR_ERR(trans);
4296 trans = NULL;
4297 goto out;
4298 }
4299
4300 /*
4301 * Make sure the tree has not changed
4302 */
4303 spin_lock(&send_root->root_times_lock);
4304 ctransid = btrfs_root_ctransid(&send_root->root_item);
4305 spin_unlock(&send_root->root_times_lock);
4306
4307 if (ctransid != start_ctransid) {
4308 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
4309 "send was modified in between. This is "
4310 "probably a bug.\n");
4311 ret = -EIO;
4312 goto out;
4313 }
4314
4315 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
4316 if (ret < 0)
4317 goto out;
4318 if (ret)
4319 goto out_finish;
4320
4321 while (1) {
4322 /*
4323 * When someone want to commit while we iterate, end the
4324 * joined transaction and rejoin.
4325 */
4326 if (btrfs_should_end_transaction(trans, send_root)) {
4327 ret = btrfs_end_transaction(trans, send_root);
4328 trans = NULL;
4329 if (ret < 0)
4330 goto out;
4331 btrfs_release_path(path);
4332 goto join_trans;
4333 }
4334
4335 eb = path->nodes[0];
4336 slot = path->slots[0];
4337 btrfs_item_key_to_cpu(eb, &found_key, slot);
4338
4339 ret = changed_cb(send_root, NULL, path, NULL,
4340 &found_key, BTRFS_COMPARE_TREE_NEW, sctx);
4341 if (ret < 0)
4342 goto out;
4343
4344 key.objectid = found_key.objectid;
4345 key.type = found_key.type;
4346 key.offset = found_key.offset + 1;
4347
4348 ret = btrfs_next_item(send_root, path);
4349 if (ret < 0)
4350 goto out;
4351 if (ret) {
4352 ret = 0;
4353 break;
4354 }
4355 }
4356
4357out_finish:
4358 ret = finish_inode_if_needed(sctx, 1);
4359
4360out:
4361 btrfs_free_path(path);
4362 if (trans) {
4363 if (!ret)
4364 ret = btrfs_end_transaction(trans, send_root);
4365 else
4366 btrfs_end_transaction(trans, send_root);
4367 }
4368 return ret;
4369}
4370
4371static int send_subvol(struct send_ctx *sctx)
4372{
4373 int ret;
4374
4375 ret = send_header(sctx);
4376 if (ret < 0)
4377 goto out;
4378
4379 ret = send_subvol_begin(sctx);
4380 if (ret < 0)
4381 goto out;
4382
4383 if (sctx->parent_root) {
4384 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
4385 changed_cb, sctx);
4386 if (ret < 0)
4387 goto out;
4388 ret = finish_inode_if_needed(sctx, 1);
4389 if (ret < 0)
4390 goto out;
4391 } else {
4392 ret = full_send_tree(sctx);
4393 if (ret < 0)
4394 goto out;
4395 }
4396
4397out:
4398 if (!ret)
4399 ret = close_cur_inode_file(sctx);
4400 else
4401 close_cur_inode_file(sctx);
4402
4403 free_recorded_refs(sctx);
4404 return ret;
4405}
4406
4407long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4408{
4409 int ret = 0;
4410 struct btrfs_root *send_root;
4411 struct btrfs_root *clone_root;
4412 struct btrfs_fs_info *fs_info;
4413 struct btrfs_ioctl_send_args *arg = NULL;
4414 struct btrfs_key key;
4415 struct file *filp = NULL;
4416 struct send_ctx *sctx = NULL;
4417 u32 i;
4418 u64 *clone_sources_tmp = NULL;
4419
4420 if (!capable(CAP_SYS_ADMIN))
4421 return -EPERM;
4422
4423 send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
4424 fs_info = send_root->fs_info;
4425
4426 arg = memdup_user(arg_, sizeof(*arg));
4427 if (IS_ERR(arg)) {
4428 ret = PTR_ERR(arg);
4429 arg = NULL;
4430 goto out;
4431 }
4432
4433 if (!access_ok(VERIFY_READ, arg->clone_sources,
4434 sizeof(*arg->clone_sources *
4435 arg->clone_sources_count))) {
4436 ret = -EFAULT;
4437 goto out;
4438 }
4439
4440 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
4441 if (!sctx) {
4442 ret = -ENOMEM;
4443 goto out;
4444 }
4445
4446 INIT_LIST_HEAD(&sctx->new_refs);
4447 INIT_LIST_HEAD(&sctx->deleted_refs);
4448 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
4449 INIT_LIST_HEAD(&sctx->name_cache_list);
4450
4451 sctx->send_filp = fget(arg->send_fd);
4452 if (IS_ERR(sctx->send_filp)) {
4453 ret = PTR_ERR(sctx->send_filp);
4454 goto out;
4455 }
4456
4457 sctx->mnt = mnt_file->f_path.mnt;
4458
4459 sctx->send_root = send_root;
4460 sctx->clone_roots_cnt = arg->clone_sources_count;
4461
4462 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
4463 sctx->send_buf = vmalloc(sctx->send_max_size);
4464 if (!sctx->send_buf) {
4465 ret = -ENOMEM;
4466 goto out;
4467 }
4468
4469 sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
4470 if (!sctx->read_buf) {
4471 ret = -ENOMEM;
4472 goto out;
4473 }
4474
4475 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
4476 (arg->clone_sources_count + 1));
4477 if (!sctx->clone_roots) {
4478 ret = -ENOMEM;
4479 goto out;
4480 }
4481
4482 if (arg->clone_sources_count) {
4483 clone_sources_tmp = vmalloc(arg->clone_sources_count *
4484 sizeof(*arg->clone_sources));
4485 if (!clone_sources_tmp) {
4486 ret = -ENOMEM;
4487 goto out;
4488 }
4489
4490 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
4491 arg->clone_sources_count *
4492 sizeof(*arg->clone_sources));
4493 if (ret) {
4494 ret = -EFAULT;
4495 goto out;
4496 }
4497
4498 for (i = 0; i < arg->clone_sources_count; i++) {
4499 key.objectid = clone_sources_tmp[i];
4500 key.type = BTRFS_ROOT_ITEM_KEY;
4501 key.offset = (u64)-1;
4502 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4503 if (!clone_root) {
4504 ret = -EINVAL;
4505 goto out;
4506 }
4507 if (IS_ERR(clone_root)) {
4508 ret = PTR_ERR(clone_root);
4509 goto out;
4510 }
4511 sctx->clone_roots[i].root = clone_root;
4512 }
4513 vfree(clone_sources_tmp);
4514 clone_sources_tmp = NULL;
4515 }
4516
4517 if (arg->parent_root) {
4518 key.objectid = arg->parent_root;
4519 key.type = BTRFS_ROOT_ITEM_KEY;
4520 key.offset = (u64)-1;
4521 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4522 if (!sctx->parent_root) {
4523 ret = -EINVAL;
4524 goto out;
4525 }
4526 }
4527
4528 /*
4529 * Clones from send_root are allowed, but only if the clone source
4530 * is behind the current send position. This is checked while searching
4531 * for possible clone sources.
4532 */
4533 sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
4534
4535 /* We do a bsearch later */
4536 sort(sctx->clone_roots, sctx->clone_roots_cnt,
4537 sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
4538 NULL);
4539
4540 ret = send_subvol(sctx);
4541 if (ret < 0)
4542 goto out;
4543
4544 ret = begin_cmd(sctx, BTRFS_SEND_C_END);
4545 if (ret < 0)
4546 goto out;
4547 ret = send_cmd(sctx);
4548 if (ret < 0)
4549 goto out;
4550
4551out:
4552 if (filp)
4553 fput(filp);
4554 kfree(arg);
4555 vfree(clone_sources_tmp);
4556
4557 if (sctx) {
4558 if (sctx->send_filp)
4559 fput(sctx->send_filp);
4560
4561 vfree(sctx->clone_roots);
4562 vfree(sctx->send_buf);
4563 vfree(sctx->read_buf);
4564
4565 name_cache_free(sctx);
4566
4567 kfree(sctx);
4568 }
4569
4570 return ret;
4571}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
new file mode 100644
index 00000000000..9934e948e57
--- /dev/null
+++ b/fs/btrfs/send.h
@@ -0,0 +1,133 @@
1/*
2 * Copyright (C) 2012 Alexander Block. All rights reserved.
3 * Copyright (C) 2012 STRATO. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19
20#include "ctree.h"
21
22#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
23#define BTRFS_SEND_STREAM_VERSION 1
24
25#define BTRFS_SEND_BUF_SIZE (1024 * 64)
26#define BTRFS_SEND_READ_SIZE (1024 * 48)
27
28enum btrfs_tlv_type {
29 BTRFS_TLV_U8,
30 BTRFS_TLV_U16,
31 BTRFS_TLV_U32,
32 BTRFS_TLV_U64,
33 BTRFS_TLV_BINARY,
34 BTRFS_TLV_STRING,
35 BTRFS_TLV_UUID,
36 BTRFS_TLV_TIMESPEC,
37};
38
39struct btrfs_stream_header {
40 char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
41 __le32 version;
42} __attribute__ ((__packed__));
43
44struct btrfs_cmd_header {
45 /* len excluding the header */
46 __le32 len;
47 __le16 cmd;
48 /* crc including the header with zero crc field */
49 __le32 crc;
50} __attribute__ ((__packed__));
51
52struct btrfs_tlv_header {
53 __le16 tlv_type;
54 /* len excluding the header */
55 __le16 tlv_len;
56} __attribute__ ((__packed__));
57
58/* commands */
59enum btrfs_send_cmd {
60 BTRFS_SEND_C_UNSPEC,
61
62 BTRFS_SEND_C_SUBVOL,
63 BTRFS_SEND_C_SNAPSHOT,
64
65 BTRFS_SEND_C_MKFILE,
66 BTRFS_SEND_C_MKDIR,
67 BTRFS_SEND_C_MKNOD,
68 BTRFS_SEND_C_MKFIFO,
69 BTRFS_SEND_C_MKSOCK,
70 BTRFS_SEND_C_SYMLINK,
71
72 BTRFS_SEND_C_RENAME,
73 BTRFS_SEND_C_LINK,
74 BTRFS_SEND_C_UNLINK,
75 BTRFS_SEND_C_RMDIR,
76
77 BTRFS_SEND_C_SET_XATTR,
78 BTRFS_SEND_C_REMOVE_XATTR,
79
80 BTRFS_SEND_C_WRITE,
81 BTRFS_SEND_C_CLONE,
82
83 BTRFS_SEND_C_TRUNCATE,
84 BTRFS_SEND_C_CHMOD,
85 BTRFS_SEND_C_CHOWN,
86 BTRFS_SEND_C_UTIMES,
87
88 BTRFS_SEND_C_END,
89 __BTRFS_SEND_C_MAX,
90};
91#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
92
93/* attributes in send stream */
94enum {
95 BTRFS_SEND_A_UNSPEC,
96
97 BTRFS_SEND_A_UUID,
98 BTRFS_SEND_A_CTRANSID,
99
100 BTRFS_SEND_A_INO,
101 BTRFS_SEND_A_SIZE,
102 BTRFS_SEND_A_MODE,
103 BTRFS_SEND_A_UID,
104 BTRFS_SEND_A_GID,
105 BTRFS_SEND_A_RDEV,
106 BTRFS_SEND_A_CTIME,
107 BTRFS_SEND_A_MTIME,
108 BTRFS_SEND_A_ATIME,
109 BTRFS_SEND_A_OTIME,
110
111 BTRFS_SEND_A_XATTR_NAME,
112 BTRFS_SEND_A_XATTR_DATA,
113
114 BTRFS_SEND_A_PATH,
115 BTRFS_SEND_A_PATH_TO,
116 BTRFS_SEND_A_PATH_LINK,
117
118 BTRFS_SEND_A_FILE_OFFSET,
119 BTRFS_SEND_A_DATA,
120
121 BTRFS_SEND_A_CLONE_UUID,
122 BTRFS_SEND_A_CLONE_CTRANSID,
123 BTRFS_SEND_A_CLONE_PATH,
124 BTRFS_SEND_A_CLONE_OFFSET,
125 BTRFS_SEND_A_CLONE_LEN,
126
127 __BTRFS_SEND_A_MAX,
128};
129#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
130
131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133#endif
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c6ffa581241..b976597b072 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,15 +17,27 @@
17 */ 17 */
18 18
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <asm/unaligned.h>
20 21
21/* this is some deeply nasty code. ctree.h has a different 22#include "ctree.h"
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef 23
24static inline u8 get_unaligned_le8(const void *p)
25{
26 return *(u8 *)p;
27}
28
29static inline void put_unaligned_le8(u8 val, void *p)
30{
31 *(u8 *)p = val;
32}
33
34/*
35 * this is some deeply nasty code.
23 * 36 *
24 * The end result is that anyone who #includes ctree.h gets a 37 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions 38 * declaration for the btrfs_set_foo functions and btrfs_foo functions,
26 * 39 * which are wrappers of btrfs_set_token_#bits functions and
27 * This file declares the macros and then #includes ctree.h, which results 40 * btrfs_get_token_#bits functions, which are defined in this file.
28 * in cpp creating the function here based on the template below.
29 * 41 *
30 * These setget functions do all the extent_buffer related mapping 42 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent 43 * required to efficiently read and write specific fields in the extent
@@ -33,103 +45,93 @@
33 * an unsigned long offset into the extent buffer which has been 45 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking. 46 * cast to a specific type. This gives us all the gcc type checking.
35 * 47 *
36 * The extent buffer api is used to do all the kmapping and page 48 * The extent buffer api is used to do the page spanning work required to
37 * spanning work required to get extent buffers in highmem and have 49 * have a metadata blocksize different from the page size.
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */ 50 */
43 51
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 52#define DEFINE_BTRFS_SETGET_BITS(bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ 53u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ 54 unsigned long off, \
47void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ 55 struct btrfs_map_token *token) \
48u##bits btrfs_token_##name(struct extent_buffer *eb, \
49 type *s, struct btrfs_map_token *token) \
50{ \ 56{ \
51 unsigned long part_offset = (unsigned long)s; \ 57 unsigned long part_offset = (unsigned long)ptr; \
52 unsigned long offset = part_offset + offsetof(type, member); \ 58 unsigned long offset = part_offset + off; \
53 type *p; \ 59 void *p; \
54 int err; \ 60 int err; \
55 char *kaddr; \ 61 char *kaddr; \
56 unsigned long map_start; \ 62 unsigned long map_start; \
57 unsigned long map_len; \ 63 unsigned long map_len; \
58 unsigned long mem_len = sizeof(((type *)0)->member); \ 64 int size = sizeof(u##bits); \
59 u##bits res; \ 65 u##bits res; \
60 if (token && token->kaddr && token->offset <= offset && \ 66 \
61 token->eb == eb && \ 67 if (token && token->kaddr && token->offset <= offset && \
62 (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ 68 token->eb == eb && \
63 kaddr = token->kaddr; \ 69 (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
64 p = (type *)(kaddr + part_offset - token->offset); \ 70 kaddr = token->kaddr; \
65 res = le##bits##_to_cpu(p->member); \ 71 p = kaddr + part_offset - token->offset; \
66 return res; \ 72 res = get_unaligned_le##bits(p + off); \
67 } \ 73 return res; \
68 err = map_private_extent_buffer(eb, offset, \ 74 } \
69 mem_len, \ 75 err = map_private_extent_buffer(eb, offset, size, \
70 &kaddr, &map_start, &map_len); \ 76 &kaddr, &map_start, &map_len); \
71 if (err) { \ 77 if (err) { \
72 __le##bits leres; \ 78 __le##bits leres; \
73 read_eb_member(eb, s, type, member, &leres); \ 79 \
74 return le##bits##_to_cpu(leres); \ 80 read_extent_buffer(eb, &leres, offset, size); \
75 } \ 81 return le##bits##_to_cpu(leres); \
76 p = (type *)(kaddr + part_offset - map_start); \ 82 } \
77 res = le##bits##_to_cpu(p->member); \ 83 p = kaddr + part_offset - map_start; \
78 if (token) { \ 84 res = get_unaligned_le##bits(p + off); \
79 token->kaddr = kaddr; \ 85 if (token) { \
80 token->offset = map_start; \ 86 token->kaddr = kaddr; \
81 token->eb = eb; \ 87 token->offset = map_start; \
82 } \ 88 token->eb = eb; \
83 return res; \ 89 } \
90 return res; \
84} \ 91} \
85void btrfs_set_token_##name(struct extent_buffer *eb, \ 92void btrfs_set_token_##bits(struct extent_buffer *eb, \
86 type *s, u##bits val, struct btrfs_map_token *token) \ 93 void *ptr, unsigned long off, u##bits val, \
94 struct btrfs_map_token *token) \
87{ \ 95{ \
88 unsigned long part_offset = (unsigned long)s; \ 96 unsigned long part_offset = (unsigned long)ptr; \
89 unsigned long offset = part_offset + offsetof(type, member); \ 97 unsigned long offset = part_offset + off; \
90 type *p; \ 98 void *p; \
91 int err; \ 99 int err; \
92 char *kaddr; \ 100 char *kaddr; \
93 unsigned long map_start; \ 101 unsigned long map_start; \
94 unsigned long map_len; \ 102 unsigned long map_len; \
95 unsigned long mem_len = sizeof(((type *)0)->member); \ 103 int size = sizeof(u##bits); \
96 if (token && token->kaddr && token->offset <= offset && \ 104 \
97 token->eb == eb && \ 105 if (token && token->kaddr && token->offset <= offset && \
98 (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ 106 token->eb == eb && \
99 kaddr = token->kaddr; \ 107 (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
100 p = (type *)(kaddr + part_offset - token->offset); \ 108 kaddr = token->kaddr; \
101 p->member = cpu_to_le##bits(val); \ 109 p = kaddr + part_offset - token->offset; \
102 return; \ 110 put_unaligned_le##bits(val, p + off); \
103 } \ 111 return; \
104 err = map_private_extent_buffer(eb, offset, \ 112 } \
105 mem_len, \ 113 err = map_private_extent_buffer(eb, offset, size, \
106 &kaddr, &map_start, &map_len); \ 114 &kaddr, &map_start, &map_len); \
107 if (err) { \ 115 if (err) { \
108 __le##bits val2; \ 116 __le##bits val2; \
109 val2 = cpu_to_le##bits(val); \ 117 \
110 write_eb_member(eb, s, type, member, &val2); \ 118 val2 = cpu_to_le##bits(val); \
111 return; \ 119 write_extent_buffer(eb, &val2, offset, size); \
112 } \ 120 return; \
113 p = (type *)(kaddr + part_offset - map_start); \ 121 } \
114 p->member = cpu_to_le##bits(val); \ 122 p = kaddr + part_offset - map_start; \
115 if (token) { \ 123 put_unaligned_le##bits(val, p + off); \
116 token->kaddr = kaddr; \ 124 if (token) { \
117 token->offset = map_start; \ 125 token->kaddr = kaddr; \
118 token->eb = eb; \ 126 token->offset = map_start; \
119 } \ 127 token->eb = eb; \
120} \ 128 } \
121void btrfs_set_##name(struct extent_buffer *eb, \ 129}
122 type *s, u##bits val) \
123{ \
124 btrfs_set_token_##name(eb, s, val, NULL); \
125} \
126u##bits btrfs_##name(struct extent_buffer *eb, \
127 type *s) \
128{ \
129 return btrfs_token_##name(eb, s, NULL); \
130} \
131 130
132#include "ctree.h" 131DEFINE_BTRFS_SETGET_BITS(8)
132DEFINE_BTRFS_SETGET_BITS(16)
133DEFINE_BTRFS_SETGET_BITS(32)
134DEFINE_BTRFS_SETGET_BITS(64)
133 135
134void btrfs_node_key(struct extent_buffer *eb, 136void btrfs_node_key(struct extent_buffer *eb,
135 struct btrfs_disk_key *disk_key, int nr) 137 struct btrfs_disk_key *disk_key, int nr)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b19d7556772..fa61ef59cd6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -396,15 +396,23 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
396 strcmp(args[0].from, "zlib") == 0) { 396 strcmp(args[0].from, "zlib") == 0) {
397 compress_type = "zlib"; 397 compress_type = "zlib";
398 info->compress_type = BTRFS_COMPRESS_ZLIB; 398 info->compress_type = BTRFS_COMPRESS_ZLIB;
399 btrfs_set_opt(info->mount_opt, COMPRESS);
399 } else if (strcmp(args[0].from, "lzo") == 0) { 400 } else if (strcmp(args[0].from, "lzo") == 0) {
400 compress_type = "lzo"; 401 compress_type = "lzo";
401 info->compress_type = BTRFS_COMPRESS_LZO; 402 info->compress_type = BTRFS_COMPRESS_LZO;
403 btrfs_set_opt(info->mount_opt, COMPRESS);
404 btrfs_set_fs_incompat(info, COMPRESS_LZO);
405 } else if (strncmp(args[0].from, "no", 2) == 0) {
406 compress_type = "no";
407 info->compress_type = BTRFS_COMPRESS_NONE;
408 btrfs_clear_opt(info->mount_opt, COMPRESS);
409 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
410 compress_force = false;
402 } else { 411 } else {
403 ret = -EINVAL; 412 ret = -EINVAL;
404 goto out; 413 goto out;
405 } 414 }
406 415
407 btrfs_set_opt(info->mount_opt, COMPRESS);
408 if (compress_force) { 416 if (compress_force) {
409 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 417 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
410 pr_info("btrfs: force %s compression\n", 418 pr_info("btrfs: force %s compression\n",
@@ -1455,6 +1463,13 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1455 ret = btrfs_scan_one_device(vol->name, FMODE_READ, 1463 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1456 &btrfs_fs_type, &fs_devices); 1464 &btrfs_fs_type, &fs_devices);
1457 break; 1465 break;
1466 case BTRFS_IOC_DEVICES_READY:
1467 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1468 &btrfs_fs_type, &fs_devices);
1469 if (ret)
1470 break;
1471 ret = !(fs_devices->num_devices == fs_devices->total_devices);
1472 break;
1458 } 1473 }
1459 1474
1460 kfree(vol); 1475 kfree(vol);
@@ -1477,16 +1492,6 @@ static int btrfs_unfreeze(struct super_block *sb)
1477 return 0; 1492 return 0;
1478} 1493}
1479 1494
1480static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1481{
1482 int ret;
1483
1484 ret = btrfs_dirty_inode(inode);
1485 if (ret)
1486 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1487 "error %d\n", btrfs_ino(inode), ret);
1488}
1489
1490static int btrfs_show_devname(struct seq_file *m, struct dentry *root) 1495static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1491{ 1496{
1492 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 1497 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -1526,7 +1531,6 @@ static const struct super_operations btrfs_super_ops = {
1526 .show_options = btrfs_show_options, 1531 .show_options = btrfs_show_options,
1527 .show_devname = btrfs_show_devname, 1532 .show_devname = btrfs_show_devname,
1528 .write_inode = btrfs_write_inode, 1533 .write_inode = btrfs_write_inode,
1529 .dirty_inode = btrfs_fs_dirty_inode,
1530 .alloc_inode = btrfs_alloc_inode, 1534 .alloc_inode = btrfs_alloc_inode,
1531 .destroy_inode = btrfs_destroy_inode, 1535 .destroy_inode = btrfs_destroy_inode,
1532 .statfs = btrfs_statfs, 1536 .statfs = btrfs_statfs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b72b068183e..7ac7cdcc294 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -22,6 +22,7 @@
22#include <linux/writeback.h> 22#include <linux/writeback.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/uuid.h>
25#include "ctree.h" 26#include "ctree.h"
26#include "disk-io.h" 27#include "disk-io.h"
27#include "transaction.h" 28#include "transaction.h"
@@ -38,7 +39,6 @@ void put_transaction(struct btrfs_transaction *transaction)
38 if (atomic_dec_and_test(&transaction->use_count)) { 39 if (atomic_dec_and_test(&transaction->use_count)) {
39 BUG_ON(!list_empty(&transaction->list)); 40 BUG_ON(!list_empty(&transaction->list));
40 WARN_ON(transaction->delayed_refs.root.rb_node); 41 WARN_ON(transaction->delayed_refs.root.rb_node);
41 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
42 memset(transaction, 0, sizeof(*transaction)); 42 memset(transaction, 0, sizeof(*transaction));
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 43 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 44 }
@@ -100,8 +100,8 @@ loop:
100 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
101 cur_trans = fs_info->running_transaction; 101 cur_trans = fs_info->running_transaction;
102 goto loop; 102 goto loop;
103 } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 103 } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
104 spin_unlock(&root->fs_info->trans_lock); 104 spin_unlock(&fs_info->trans_lock);
105 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 105 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
106 return -EROFS; 106 return -EROFS;
107 } 107 }
@@ -126,7 +126,6 @@ loop:
126 cur_trans->delayed_refs.num_heads = 0; 126 cur_trans->delayed_refs.num_heads = 0;
127 cur_trans->delayed_refs.flushing = 0; 127 cur_trans->delayed_refs.flushing = 0;
128 cur_trans->delayed_refs.run_delayed_start = 0; 128 cur_trans->delayed_refs.run_delayed_start = 0;
129 cur_trans->delayed_refs.seq = 1;
130 129
131 /* 130 /*
132 * although the tree mod log is per file system and not per transaction, 131 * although the tree mod log is per file system and not per transaction,
@@ -145,10 +144,8 @@ loop:
145 } 144 }
146 atomic_set(&fs_info->tree_mod_seq, 0); 145 atomic_set(&fs_info->tree_mod_seq, 0);
147 146
148 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
149 spin_lock_init(&cur_trans->commit_lock); 147 spin_lock_init(&cur_trans->commit_lock);
150 spin_lock_init(&cur_trans->delayed_refs.lock); 148 spin_lock_init(&cur_trans->delayed_refs.lock);
151 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
152 149
153 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 150 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
154 list_add_tail(&cur_trans->list, &fs_info->trans_list); 151 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -299,6 +296,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
299 struct btrfs_transaction *cur_trans; 296 struct btrfs_transaction *cur_trans;
300 u64 num_bytes = 0; 297 u64 num_bytes = 0;
301 int ret; 298 int ret;
299 u64 qgroup_reserved = 0;
302 300
303 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 301 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
304 return ERR_PTR(-EROFS); 302 return ERR_PTR(-EROFS);
@@ -317,6 +315,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
317 * the appropriate flushing if need be. 315 * the appropriate flushing if need be.
318 */ 316 */
319 if (num_items > 0 && root != root->fs_info->chunk_root) { 317 if (num_items > 0 && root != root->fs_info->chunk_root) {
318 if (root->fs_info->quota_enabled &&
319 is_fstree(root->root_key.objectid)) {
320 qgroup_reserved = num_items * root->leafsize;
321 ret = btrfs_qgroup_reserve(root, qgroup_reserved);
322 if (ret)
323 return ERR_PTR(ret);
324 }
325
320 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
321 ret = btrfs_block_rsv_add(root, 327 ret = btrfs_block_rsv_add(root,
322 &root->fs_info->trans_block_rsv, 328 &root->fs_info->trans_block_rsv,
@@ -349,11 +355,16 @@ again:
349 h->transaction = cur_trans; 355 h->transaction = cur_trans;
350 h->blocks_used = 0; 356 h->blocks_used = 0;
351 h->bytes_reserved = 0; 357 h->bytes_reserved = 0;
358 h->root = root;
352 h->delayed_ref_updates = 0; 359 h->delayed_ref_updates = 0;
353 h->use_count = 1; 360 h->use_count = 1;
361 h->adding_csums = 0;
354 h->block_rsv = NULL; 362 h->block_rsv = NULL;
355 h->orig_rsv = NULL; 363 h->orig_rsv = NULL;
356 h->aborted = 0; 364 h->aborted = 0;
365 h->qgroup_reserved = qgroup_reserved;
366 h->delayed_ref_elem.seq = 0;
367 INIT_LIST_HEAD(&h->qgroup_ref_list);
357 368
358 smp_mb(); 369 smp_mb();
359 if (cur_trans->blocked && may_wait_transaction(root, type)) { 370 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -473,7 +484,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
473 struct btrfs_root *root) 484 struct btrfs_root *root)
474{ 485{
475 struct btrfs_transaction *cur_trans = trans->transaction; 486 struct btrfs_transaction *cur_trans = trans->transaction;
476 struct btrfs_block_rsv *rsv = trans->block_rsv;
477 int updates; 487 int updates;
478 int err; 488 int err;
479 489
@@ -481,12 +491,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
481 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 491 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
482 return 1; 492 return 1;
483 493
484 /*
485 * We need to do this in case we're deleting csums so the global block
486 * rsv get's used instead of the csum block rsv.
487 */
488 trans->block_rsv = NULL;
489
490 updates = trans->delayed_ref_updates; 494 updates = trans->delayed_ref_updates;
491 trans->delayed_ref_updates = 0; 495 trans->delayed_ref_updates = 0;
492 if (updates) { 496 if (updates) {
@@ -495,8 +499,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
495 return err; 499 return err;
496 } 500 }
497 501
498 trans->block_rsv = rsv;
499
500 return should_end_transaction(trans, root); 502 return should_end_transaction(trans, root);
501} 503}
502 504
@@ -513,8 +515,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
513 return 0; 515 return 0;
514 } 516 }
515 517
518 /*
519 * do the qgroup accounting as early as possible
520 */
521 err = btrfs_delayed_refs_qgroup_accounting(trans, info);
522
516 btrfs_trans_release_metadata(trans, root); 523 btrfs_trans_release_metadata(trans, root);
517 trans->block_rsv = NULL; 524 trans->block_rsv = NULL;
525 /*
526 * the same root has to be passed to start_transaction and
527 * end_transaction. Subvolume quota depends on this.
528 */
529 WARN_ON(trans->root != root);
530
531 if (trans->qgroup_reserved) {
532 btrfs_qgroup_free(root, trans->qgroup_reserved);
533 trans->qgroup_reserved = 0;
534 }
535
518 while (count < 2) { 536 while (count < 2) {
519 unsigned long cur = trans->delayed_ref_updates; 537 unsigned long cur = trans->delayed_ref_updates;
520 trans->delayed_ref_updates = 0; 538 trans->delayed_ref_updates = 0;
@@ -527,6 +545,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
527 } 545 }
528 count++; 546 count++;
529 } 547 }
548 btrfs_trans_release_metadata(trans, root);
549 trans->block_rsv = NULL;
530 550
531 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 551 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
532 should_end_transaction(trans, root)) { 552 should_end_transaction(trans, root)) {
@@ -567,6 +587,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
567 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 587 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
568 err = -EIO; 588 err = -EIO;
569 } 589 }
590 assert_qgroups_uptodate(trans);
570 591
571 memset(trans, 0, sizeof(*trans)); 592 memset(trans, 0, sizeof(*trans));
572 kmem_cache_free(btrfs_trans_handle_cachep, trans); 593 kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -785,6 +806,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
785 ret = btrfs_run_dev_stats(trans, root->fs_info); 806 ret = btrfs_run_dev_stats(trans, root->fs_info);
786 BUG_ON(ret); 807 BUG_ON(ret);
787 808
809 ret = btrfs_run_qgroups(trans, root->fs_info);
810 BUG_ON(ret);
811
812 /* run_qgroups might have added some more refs */
813 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
814 BUG_ON(ret);
815
788 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 816 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
789 next = fs_info->dirty_cowonly_roots.next; 817 next = fs_info->dirty_cowonly_roots.next;
790 list_del_init(next); 818 list_del_init(next);
@@ -926,11 +954,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
926 struct dentry *dentry; 954 struct dentry *dentry;
927 struct extent_buffer *tmp; 955 struct extent_buffer *tmp;
928 struct extent_buffer *old; 956 struct extent_buffer *old;
957 struct timespec cur_time = CURRENT_TIME;
929 int ret; 958 int ret;
930 u64 to_reserve = 0; 959 u64 to_reserve = 0;
931 u64 index = 0; 960 u64 index = 0;
932 u64 objectid; 961 u64 objectid;
933 u64 root_flags; 962 u64 root_flags;
963 uuid_le new_uuid;
934 964
935 rsv = trans->block_rsv; 965 rsv = trans->block_rsv;
936 966
@@ -957,6 +987,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
957 } 987 }
958 } 988 }
959 989
990 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
991 objectid, pending->inherit);
992 kfree(pending->inherit);
993 if (ret) {
994 pending->error = ret;
995 goto fail;
996 }
997
960 key.objectid = objectid; 998 key.objectid = objectid;
961 key.offset = (u64)-1; 999 key.offset = (u64)-1;
962 key.type = BTRFS_ROOT_ITEM_KEY; 1000 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1016,6 +1054,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1016 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; 1054 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
1017 btrfs_set_root_flags(new_root_item, root_flags); 1055 btrfs_set_root_flags(new_root_item, root_flags);
1018 1056
1057 btrfs_set_root_generation_v2(new_root_item,
1058 trans->transid);
1059 uuid_le_gen(&new_uuid);
1060 memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
1061 memcpy(new_root_item->parent_uuid, root->root_item.uuid,
1062 BTRFS_UUID_SIZE);
1063 new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
1064 new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
1065 btrfs_set_root_otransid(new_root_item, trans->transid);
1066 memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
1067 memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
1068 btrfs_set_root_stransid(new_root_item, 0);
1069 btrfs_set_root_rtransid(new_root_item, 0);
1070
1019 old = btrfs_lock_root_node(root); 1071 old = btrfs_lock_root_node(root);
1020 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); 1072 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
1021 if (ret) { 1073 if (ret) {
@@ -1269,9 +1321,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1269 1321
1270 btrfs_run_ordered_operations(root, 0); 1322 btrfs_run_ordered_operations(root, 0);
1271 1323
1272 btrfs_trans_release_metadata(trans, root);
1273 trans->block_rsv = NULL;
1274
1275 if (cur_trans->aborted) 1324 if (cur_trans->aborted)
1276 goto cleanup_transaction; 1325 goto cleanup_transaction;
1277 1326
@@ -1282,6 +1331,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1282 if (ret) 1331 if (ret)
1283 goto cleanup_transaction; 1332 goto cleanup_transaction;
1284 1333
1334 btrfs_trans_release_metadata(trans, root);
1335 trans->block_rsv = NULL;
1336
1285 cur_trans = trans->transaction; 1337 cur_trans = trans->transaction;
1286 1338
1287 /* 1339 /*
@@ -1330,7 +1382,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1330 spin_unlock(&root->fs_info->trans_lock); 1382 spin_unlock(&root->fs_info->trans_lock);
1331 } 1383 }
1332 1384
1333 if (now < cur_trans->start_time || now - cur_trans->start_time < 1) 1385 if (!btrfs_test_opt(root, SSD) &&
1386 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1334 should_grow = 1; 1387 should_grow = 1;
1335 1388
1336 do { 1389 do {
@@ -1352,6 +1405,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1352 goto cleanup_transaction; 1405 goto cleanup_transaction;
1353 1406
1354 /* 1407 /*
1408 * running the delayed items may have added new refs. account
1409 * them now so that they hinder processing of more delayed refs
1410 * as little as possible.
1411 */
1412 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1413
1414 /*
1355 * rename don't use btrfs_join_transaction, so, once we 1415 * rename don't use btrfs_join_transaction, so, once we
1356 * set the transaction to blocked above, we aren't going 1416 * set the transaction to blocked above, we aren't going
1357 * to get any new ordered operations. We can safely run 1417 * to get any new ordered operations. We can safely run
@@ -1463,6 +1523,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1463 root->fs_info->chunk_root->node); 1523 root->fs_info->chunk_root->node);
1464 switch_commit_root(root->fs_info->chunk_root); 1524 switch_commit_root(root->fs_info->chunk_root);
1465 1525
1526 assert_qgroups_uptodate(trans);
1466 update_super_roots(root); 1527 update_super_roots(root);
1467 1528
1468 if (!root->fs_info->log_root_recovering) { 1529 if (!root->fs_info->log_root_recovering) {
@@ -1532,6 +1593,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1532 return ret; 1593 return ret;
1533 1594
1534cleanup_transaction: 1595cleanup_transaction:
1596 btrfs_trans_release_metadata(trans, root);
1597 trans->block_rsv = NULL;
1535 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); 1598 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
1536// WARN_ON(1); 1599// WARN_ON(1);
1537 if (current->journal_info == trans) 1600 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fe27379e368..e8b8416c688 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -20,6 +20,7 @@
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h"
23 24
24struct btrfs_transaction { 25struct btrfs_transaction {
25 u64 transid; 26 u64 transid;
@@ -49,6 +50,7 @@ struct btrfs_transaction {
49struct btrfs_trans_handle { 50struct btrfs_trans_handle {
50 u64 transid; 51 u64 transid;
51 u64 bytes_reserved; 52 u64 bytes_reserved;
53 u64 qgroup_reserved;
52 unsigned long use_count; 54 unsigned long use_count;
53 unsigned long blocks_reserved; 55 unsigned long blocks_reserved;
54 unsigned long blocks_used; 56 unsigned long blocks_used;
@@ -57,12 +59,22 @@ struct btrfs_trans_handle {
57 struct btrfs_block_rsv *block_rsv; 59 struct btrfs_block_rsv *block_rsv;
58 struct btrfs_block_rsv *orig_rsv; 60 struct btrfs_block_rsv *orig_rsv;
59 int aborted; 61 int aborted;
62 int adding_csums;
63 /*
64 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction.
66 * Subvolume quota depends on this
67 */
68 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list;
60}; 71};
61 72
62struct btrfs_pending_snapshot { 73struct btrfs_pending_snapshot {
63 struct dentry *dentry; 74 struct dentry *dentry;
64 struct btrfs_root *root; 75 struct btrfs_root *root;
65 struct btrfs_root *snap; 76 struct btrfs_root *snap;
77 struct btrfs_qgroup_inherit *inherit;
66 /* block reservation for the operation */ 78 /* block reservation for the operation */
67 struct btrfs_block_rsv block_rsv; 79 struct btrfs_block_rsv block_rsv;
68 /* extra metadata reservation for relocation */ 80 /* extra metadata reservation for relocation */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8abeae4224f..c86670f4f28 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -637,7 +637,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
637 } 637 }
638 638
639 inode_set_bytes(inode, saved_nbytes); 639 inode_set_bytes(inode, saved_nbytes);
640 btrfs_update_inode(trans, root, inode); 640 ret = btrfs_update_inode(trans, root, inode);
641out: 641out:
642 if (inode) 642 if (inode)
643 iput(inode); 643 iput(inode);
@@ -1133,7 +1133,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1133 btrfs_release_path(path); 1133 btrfs_release_path(path);
1134 if (ret == 0) { 1134 if (ret == 0) {
1135 btrfs_inc_nlink(inode); 1135 btrfs_inc_nlink(inode);
1136 btrfs_update_inode(trans, root, inode); 1136 ret = btrfs_update_inode(trans, root, inode);
1137 } else if (ret == -EEXIST) { 1137 } else if (ret == -EEXIST) {
1138 ret = 0; 1138 ret = 0;
1139 } else { 1139 } else {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ecaad40e7ef..b8708f994e6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -429,6 +429,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
429 mutex_init(&fs_devices->device_list_mutex); 429 mutex_init(&fs_devices->device_list_mutex);
430 fs_devices->latest_devid = orig->latest_devid; 430 fs_devices->latest_devid = orig->latest_devid;
431 fs_devices->latest_trans = orig->latest_trans; 431 fs_devices->latest_trans = orig->latest_trans;
432 fs_devices->total_devices = orig->total_devices;
432 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 433 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
433 434
434 /* We have held the volume lock, it is safe to get the devices. */ 435 /* We have held the volume lock, it is safe to get the devices. */
@@ -739,6 +740,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
739 int ret; 740 int ret;
740 u64 devid; 741 u64 devid;
741 u64 transid; 742 u64 transid;
743 u64 total_devices;
742 744
743 flags |= FMODE_EXCL; 745 flags |= FMODE_EXCL;
744 bdev = blkdev_get_by_path(path, flags, holder); 746 bdev = blkdev_get_by_path(path, flags, holder);
@@ -760,6 +762,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
760 disk_super = (struct btrfs_super_block *)bh->b_data; 762 disk_super = (struct btrfs_super_block *)bh->b_data;
761 devid = btrfs_stack_device_id(&disk_super->dev_item); 763 devid = btrfs_stack_device_id(&disk_super->dev_item);
762 transid = btrfs_super_generation(disk_super); 764 transid = btrfs_super_generation(disk_super);
765 total_devices = btrfs_super_num_devices(disk_super);
763 if (disk_super->label[0]) 766 if (disk_super->label[0])
764 printk(KERN_INFO "device label %s ", disk_super->label); 767 printk(KERN_INFO "device label %s ", disk_super->label);
765 else 768 else
@@ -767,7 +770,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
767 printk(KERN_CONT "devid %llu transid %llu %s\n", 770 printk(KERN_CONT "devid %llu transid %llu %s\n",
768 (unsigned long long)devid, (unsigned long long)transid, path); 771 (unsigned long long)devid, (unsigned long long)transid, path);
769 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 772 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
770 773 if (!ret && fs_devices_ret)
774 (*fs_devices_ret)->total_devices = total_devices;
771 brelse(bh); 775 brelse(bh);
772error_close: 776error_close:
773 mutex_unlock(&uuid_mutex); 777 mutex_unlock(&uuid_mutex);
@@ -1433,6 +1437,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1433 list_del_rcu(&device->dev_list); 1437 list_del_rcu(&device->dev_list);
1434 1438
1435 device->fs_devices->num_devices--; 1439 device->fs_devices->num_devices--;
1440 device->fs_devices->total_devices--;
1436 1441
1437 if (device->missing) 1442 if (device->missing)
1438 root->fs_info->fs_devices->missing_devices--; 1443 root->fs_info->fs_devices->missing_devices--;
@@ -1550,6 +1555,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1550 fs_devices->seeding = 0; 1555 fs_devices->seeding = 0;
1551 fs_devices->num_devices = 0; 1556 fs_devices->num_devices = 0;
1552 fs_devices->open_devices = 0; 1557 fs_devices->open_devices = 0;
1558 fs_devices->total_devices = 0;
1553 fs_devices->seed = seed_devices; 1559 fs_devices->seed = seed_devices;
1554 1560
1555 generate_random_uuid(fs_devices->fsid); 1561 generate_random_uuid(fs_devices->fsid);
@@ -1749,6 +1755,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1749 root->fs_info->fs_devices->num_devices++; 1755 root->fs_info->fs_devices->num_devices++;
1750 root->fs_info->fs_devices->open_devices++; 1756 root->fs_info->fs_devices->open_devices++;
1751 root->fs_info->fs_devices->rw_devices++; 1757 root->fs_info->fs_devices->rw_devices++;
1758 root->fs_info->fs_devices->total_devices++;
1752 if (device->can_discard) 1759 if (device->can_discard)
1753 root->fs_info->fs_devices->num_can_discard++; 1760 root->fs_info->fs_devices->num_can_discard++;
1754 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1761 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
@@ -4736,9 +4743,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4736 key.offset = device->devid; 4743 key.offset = device->devid;
4737 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 4744 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4738 if (ret) { 4745 if (ret) {
4739 printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
4740 rcu_str_deref(device->name),
4741 (unsigned long long)device->devid);
4742 __btrfs_reset_dev_stats(device); 4746 __btrfs_reset_dev_stats(device);
4743 device->dev_stats_valid = 1; 4747 device->dev_stats_valid = 1;
4744 btrfs_release_path(path); 4748 btrfs_release_path(path);
@@ -4880,6 +4884,14 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4880 4884
4881static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 4885static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4882{ 4886{
4887 int i;
4888
4889 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4890 if (btrfs_dev_stat_read(dev, i) != 0)
4891 break;
4892 if (i == BTRFS_DEV_STAT_VALUES_MAX)
4893 return; /* all values == 0, suppress message */
4894
4883 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4895 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4884 rcu_str_deref(dev->name), 4896 rcu_str_deref(dev->name),
4885 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4897 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -4890,8 +4902,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4890} 4902}
4891 4903
4892int btrfs_get_dev_stats(struct btrfs_root *root, 4904int btrfs_get_dev_stats(struct btrfs_root *root,
4893 struct btrfs_ioctl_get_dev_stats *stats, 4905 struct btrfs_ioctl_get_dev_stats *stats)
4894 int reset_after_read)
4895{ 4906{
4896 struct btrfs_device *dev; 4907 struct btrfs_device *dev;
4897 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4908 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
@@ -4909,7 +4920,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4909 printk(KERN_WARNING 4920 printk(KERN_WARNING
4910 "btrfs: get dev_stats failed, not yet valid\n"); 4921 "btrfs: get dev_stats failed, not yet valid\n");
4911 return -ENODEV; 4922 return -ENODEV;
4912 } else if (reset_after_read) { 4923 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
4913 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 4924 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4914 if (stats->nr_items > i) 4925 if (stats->nr_items > i)
4915 stats->values[i] = 4926 stats->values[i] =
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 95f6637614d..5479325987b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -126,6 +126,7 @@ struct btrfs_fs_devices {
126 u64 missing_devices; 126 u64 missing_devices;
127 u64 total_rw_bytes; 127 u64 total_rw_bytes;
128 u64 num_can_discard; 128 u64 num_can_discard;
129 u64 total_devices;
129 struct block_device *latest_bdev; 130 struct block_device *latest_bdev;
130 131
131 /* all of the devices in the FS, protected by a mutex 132 /* all of the devices in the FS, protected by a mutex
@@ -293,8 +294,7 @@ struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
293void btrfs_dev_stat_print_on_error(struct btrfs_device *device); 294void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
294void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 295void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
295int btrfs_get_dev_stats(struct btrfs_root *root, 296int btrfs_get_dev_stats(struct btrfs_root *root,
296 struct btrfs_ioctl_get_dev_stats *stats, 297 struct btrfs_ioctl_get_dev_stats *stats);
297 int reset_after_read);
298int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 298int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
299int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 299int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
300 struct btrfs_fs_info *fs_info); 300 struct btrfs_fs_info *fs_info);
diff --git a/fs/inode.c b/fs/inode.c
index 775cbabd4fa..3cc50432046 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1551,6 +1551,8 @@ void touch_atime(struct path *path)
1551 * Btrfs), but since we touch atime while walking down the path we 1551 * Btrfs), but since we touch atime while walking down the path we
1552 * really don't care if we failed to update the atime of the file, 1552 * really don't care if we failed to update the atime of the file,
1553 * so just ignore the return value. 1553 * so just ignore the return value.
1554 * We may also fail on filesystems that have the ability to make parts
1555 * of the fs read only, e.g. subvolumes in Btrfs.
1554 */ 1556 */
1555 update_time(inode, &now, S_ATIME); 1557 update_time(inode, &now, S_ATIME);
1556 mnt_drop_write(mnt); 1558 mnt_drop_write(mnt);