aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/async-thread.c9
-rw-r--r--fs/btrfs/backref.c40
-rw-r--r--fs/btrfs/backref.h7
-rw-r--r--fs/btrfs/btrfs_inode.h14
-rw-r--r--fs/btrfs/check-integrity.c7
-rw-r--r--fs/btrfs/ctree.c775
-rw-r--r--fs/btrfs/ctree.h381
-rw-r--r--fs/btrfs/delayed-inode.c23
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c56
-rw-r--r--fs/btrfs/delayed-ref.h62
-rw-r--r--fs/btrfs/disk-io.c155
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c358
-rw-r--r--fs/btrfs/extent_io.c61
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c57
-rw-r--r--fs/btrfs/ioctl.c481
-rw-r--r--fs/btrfs/ioctl.h97
-rw-r--r--fs/btrfs/locking.c14
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/qgroup.c1571
-rw-r--r--fs/btrfs/relocation.c5
-rw-r--r--fs/btrfs/root-tree.c107
-rw-r--r--fs/btrfs/send.c4572
-rw-r--r--fs/btrfs/send.h133
-rw-r--r--fs/btrfs/struct-funcs.c196
-rw-r--r--fs/btrfs/super.c77
-rw-r--r--fs/btrfs/transaction.c108
-rw-r--r--fs/btrfs/transaction.h12
-rw-r--r--fs/btrfs/tree-log.c4
-rw-r--r--fs/btrfs/volumes.c29
-rw-r--r--fs/btrfs/volumes.h4
36 files changed, 8769 insertions, 667 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2befae7..d7fcdba141a2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o 11 reada.o backref.o ulist.o qgroup.o send.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 42704149b723..58b7d14b08ee 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers,
206 206
207 work->ordered_func(work); 207 work->ordered_func(work);
208 208
209 /* now take the lock again and call the freeing code */ 209 /* now take the lock again and drop our item from the list */
210 spin_lock(&workers->order_lock); 210 spin_lock(&workers->order_lock);
211 list_del(&work->order_list); 211 list_del(&work->order_list);
212 spin_unlock(&workers->order_lock);
213
214 /*
215 * we don't want to call the ordered free functions
216 * with the lock held though
217 */
212 work->ordered_free(work); 218 work->ordered_free(work);
219 spin_lock(&workers->order_lock);
213 } 220 }
214 221
215 spin_unlock(&workers->order_lock); 222 spin_unlock(&workers->order_lock);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a383c18e74e8..a256f3b2a845 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
773 */ 773 */
774static int find_parent_nodes(struct btrfs_trans_handle *trans, 774static int find_parent_nodes(struct btrfs_trans_handle *trans,
775 struct btrfs_fs_info *fs_info, u64 bytenr, 775 struct btrfs_fs_info *fs_info, u64 bytenr,
776 u64 delayed_ref_seq, u64 time_seq, 776 u64 time_seq, struct ulist *refs,
777 struct ulist *refs, struct ulist *roots, 777 struct ulist *roots, const u64 *extent_item_pos)
778 const u64 *extent_item_pos)
779{ 778{
780 struct btrfs_key key; 779 struct btrfs_key key;
781 struct btrfs_path *path; 780 struct btrfs_path *path;
@@ -837,7 +836,7 @@ again:
837 btrfs_put_delayed_ref(&head->node); 836 btrfs_put_delayed_ref(&head->node);
838 goto again; 837 goto again;
839 } 838 }
840 ret = __add_delayed_refs(head, delayed_ref_seq, 839 ret = __add_delayed_refs(head, time_seq,
841 &prefs_delayed); 840 &prefs_delayed);
842 mutex_unlock(&head->mutex); 841 mutex_unlock(&head->mutex);
843 if (ret) { 842 if (ret) {
@@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks)
981 */ 980 */
982static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, 981static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
983 struct btrfs_fs_info *fs_info, u64 bytenr, 982 struct btrfs_fs_info *fs_info, u64 bytenr,
984 u64 delayed_ref_seq, u64 time_seq, 983 u64 time_seq, struct ulist **leafs,
985 struct ulist **leafs,
986 const u64 *extent_item_pos) 984 const u64 *extent_item_pos)
987{ 985{
988 struct ulist *tmp; 986 struct ulist *tmp;
@@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
997 return -ENOMEM; 995 return -ENOMEM;
998 } 996 }
999 997
1000 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, 998 ret = find_parent_nodes(trans, fs_info, bytenr,
1001 time_seq, *leafs, tmp, extent_item_pos); 999 time_seq, *leafs, tmp, extent_item_pos);
1002 ulist_free(tmp); 1000 ulist_free(tmp);
1003 1001
@@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1024 */ 1022 */
1025int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1023int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1026 struct btrfs_fs_info *fs_info, u64 bytenr, 1024 struct btrfs_fs_info *fs_info, u64 bytenr,
1027 u64 delayed_ref_seq, u64 time_seq, 1025 u64 time_seq, struct ulist **roots)
1028 struct ulist **roots)
1029{ 1026{
1030 struct ulist *tmp; 1027 struct ulist *tmp;
1031 struct ulist_node *node = NULL; 1028 struct ulist_node *node = NULL;
@@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1043 1040
1044 ULIST_ITER_INIT(&uiter); 1041 ULIST_ITER_INIT(&uiter);
1045 while (1) { 1042 while (1) {
1046 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, 1043 ret = find_parent_nodes(trans, fs_info, bytenr,
1047 time_seq, tmp, *roots, NULL); 1044 time_seq, tmp, *roots, NULL);
1048 if (ret < 0 && ret != -ENOENT) { 1045 if (ret < 0 && ret != -ENOENT) {
1049 ulist_free(tmp); 1046 ulist_free(tmp);
@@ -1125,10 +1122,10 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1125 * required for the path to fit into the buffer. in that case, the returned 1122 * required for the path to fit into the buffer. in that case, the returned
1126 * value will be smaller than dest. callers must check this! 1123 * value will be smaller than dest. callers must check this!
1127 */ 1124 */
1128static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, 1125char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1129 struct btrfs_inode_ref *iref, 1126 struct btrfs_inode_ref *iref,
1130 struct extent_buffer *eb_in, u64 parent, 1127 struct extent_buffer *eb_in, u64 parent,
1131 char *dest, u32 size) 1128 char *dest, u32 size)
1132{ 1129{
1133 u32 len; 1130 u32 len;
1134 int slot; 1131 int slot;
@@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1376 struct ulist *roots = NULL; 1373 struct ulist *roots = NULL;
1377 struct ulist_node *ref_node = NULL; 1374 struct ulist_node *ref_node = NULL;
1378 struct ulist_node *root_node = NULL; 1375 struct ulist_node *root_node = NULL;
1379 struct seq_list seq_elem = {};
1380 struct seq_list tree_mod_seq_elem = {}; 1376 struct seq_list tree_mod_seq_elem = {};
1381 struct ulist_iterator ref_uiter; 1377 struct ulist_iterator ref_uiter;
1382 struct ulist_iterator root_uiter; 1378 struct ulist_iterator root_uiter;
1383 struct btrfs_delayed_ref_root *delayed_refs = NULL;
1384 1379
1385 pr_debug("resolving all inodes for extent %llu\n", 1380 pr_debug("resolving all inodes for extent %llu\n",
1386 extent_item_objectid); 1381 extent_item_objectid);
@@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1391 trans = btrfs_join_transaction(fs_info->extent_root); 1386 trans = btrfs_join_transaction(fs_info->extent_root);
1392 if (IS_ERR(trans)) 1387 if (IS_ERR(trans))
1393 return PTR_ERR(trans); 1388 return PTR_ERR(trans);
1394
1395 delayed_refs = &trans->transaction->delayed_refs;
1396 spin_lock(&delayed_refs->lock);
1397 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
1398 spin_unlock(&delayed_refs->lock);
1399 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1389 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1400 } 1390 }
1401 1391
1402 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1392 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1403 seq_elem.seq, tree_mod_seq_elem.seq, &refs, 1393 tree_mod_seq_elem.seq, &refs,
1404 &extent_item_pos); 1394 &extent_item_pos);
1405 if (ret) 1395 if (ret)
1406 goto out; 1396 goto out;
@@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1408 ULIST_ITER_INIT(&ref_uiter); 1398 ULIST_ITER_INIT(&ref_uiter);
1409 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { 1399 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
1410 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, 1400 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
1411 seq_elem.seq, 1401 tree_mod_seq_elem.seq, &roots);
1412 tree_mod_seq_elem.seq, &roots);
1413 if (ret) 1402 if (ret)
1414 break; 1403 break;
1415 ULIST_ITER_INIT(&root_uiter); 1404 ULIST_ITER_INIT(&root_uiter);
@@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1431out: 1420out:
1432 if (!search_commit_root) { 1421 if (!search_commit_root) {
1433 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1422 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1434 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
1435 btrfs_end_transaction(trans, fs_info->extent_root); 1423 btrfs_end_transaction(trans, fs_info->extent_root);
1436 } 1424 }
1437 1425
@@ -1543,7 +1531,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
1543 ipath->fspath->bytes_left - s_ptr : 0; 1531 ipath->fspath->bytes_left - s_ptr : 0;
1544 1532
1545 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; 1533 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
1546 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, 1534 fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
1547 inum, fspath_min, bytes_left); 1535 inum, fspath_min, bytes_left);
1548 if (IS_ERR(fspath)) 1536 if (IS_ERR(fspath))
1549 return PTR_ERR(fspath); 1537 return PTR_ERR(fspath);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac7b795..032f4dc7eab8 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -21,6 +21,7 @@
21 21
22#include "ioctl.h" 22#include "ioctl.h"
23#include "ulist.h" 23#include "ulist.h"
24#include "extent_io.h"
24 25
25#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) 26#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
26 27
@@ -58,8 +59,10 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
58 59
59int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 60int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
60 struct btrfs_fs_info *fs_info, u64 bytenr, 61 struct btrfs_fs_info *fs_info, u64 bytenr,
61 u64 delayed_ref_seq, u64 time_seq, 62 u64 time_seq, struct ulist **roots);
62 struct ulist **roots); 63char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
64 struct btrfs_inode_ref *iref, struct extent_buffer *eb,
65 u64 parent, char *dest, u32 size);
63 66
64struct btrfs_data_container *init_data_container(u32 total_bytes); 67struct btrfs_data_container *init_data_container(u32 total_bytes);
65struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, 68struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 12394a90d60f..5b2ad6bc4fe7 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -87,9 +87,6 @@ struct btrfs_inode {
87 /* node for the red-black tree that links inodes in subvolume root */ 87 /* node for the red-black tree that links inodes in subvolume root */
88 struct rb_node rb_node; 88 struct rb_node rb_node;
89 89
90 /* the space_info for where this inode's data allocations are done */
91 struct btrfs_space_info *space_info;
92
93 unsigned long runtime_flags; 90 unsigned long runtime_flags;
94 91
95 /* full 64 bit generation number, struct vfs_inode doesn't have a big 92 /* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -191,11 +188,14 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
191 BTRFS_I(inode)->disk_i_size = size; 188 BTRFS_I(inode)->disk_i_size = size;
192} 189}
193 190
194static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, 191static inline bool btrfs_is_free_space_inode(struct inode *inode)
195 struct inode *inode)
196{ 192{
197 if (root == root->fs_info->tree_root || 193 struct btrfs_root *root = BTRFS_I(inode)->root;
198 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) 194
195 if (root == root->fs_info->tree_root &&
196 btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
197 return true;
198 if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
199 return true; 199 return true;
200 return false; 200 return false;
201} 201}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index da6e9364a5e3..9197e2e33407 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1032,6 +1032,7 @@ continue_with_current_leaf_stack_frame:
1032 struct btrfs_disk_key *disk_key; 1032 struct btrfs_disk_key *disk_key;
1033 u8 type; 1033 u8 type;
1034 u32 item_offset; 1034 u32 item_offset;
1035 u32 item_size;
1035 1036
1036 if (disk_item_offset + sizeof(struct btrfs_item) > 1037 if (disk_item_offset + sizeof(struct btrfs_item) >
1037 sf->block_ctx->len) { 1038 sf->block_ctx->len) {
@@ -1047,6 +1048,7 @@ leaf_item_out_of_bounce_error:
1047 disk_item_offset, 1048 disk_item_offset,
1048 sizeof(struct btrfs_item)); 1049 sizeof(struct btrfs_item));
1049 item_offset = le32_to_cpu(disk_item.offset); 1050 item_offset = le32_to_cpu(disk_item.offset);
1051 item_size = le32_to_cpu(disk_item.size);
1050 disk_key = &disk_item.key; 1052 disk_key = &disk_item.key;
1051 type = disk_key->type; 1053 type = disk_key->type;
1052 1054
@@ -1057,14 +1059,13 @@ leaf_item_out_of_bounce_error:
1057 1059
1058 root_item_offset = item_offset + 1060 root_item_offset = item_offset +
1059 offsetof(struct btrfs_leaf, items); 1061 offsetof(struct btrfs_leaf, items);
1060 if (root_item_offset + 1062 if (root_item_offset + item_size >
1061 sizeof(struct btrfs_root_item) >
1062 sf->block_ctx->len) 1063 sf->block_ctx->len)
1063 goto leaf_item_out_of_bounce_error; 1064 goto leaf_item_out_of_bounce_error;
1064 btrfsic_read_from_block_data( 1065 btrfsic_read_from_block_data(
1065 sf->block_ctx, &root_item, 1066 sf->block_ctx, &root_item,
1066 root_item_offset, 1067 root_item_offset,
1067 sizeof(struct btrfs_root_item)); 1068 item_size);
1068 next_bytenr = le64_to_cpu(root_item.bytenr); 1069 next_bytenr = le64_to_cpu(root_item.bytenr);
1069 1070
1070 sf->error = 1071 sf->error =
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b3900587..9d7621f271ff 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -321,7 +321,7 @@ struct tree_mod_root {
321struct tree_mod_elem { 321struct tree_mod_elem {
322 struct rb_node node; 322 struct rb_node node;
323 u64 index; /* shifted logical */ 323 u64 index; /* shifted logical */
324 struct seq_list elem; 324 u64 seq;
325 enum mod_log_op op; 325 enum mod_log_op op;
326 326
327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ 327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +341,50 @@ struct tree_mod_elem {
341 struct tree_mod_root old_root; 341 struct tree_mod_root old_root;
342}; 342};
343 343
344static inline void 344static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
345__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
346{ 345{
347 elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); 346 read_lock(&fs_info->tree_mod_log_lock);
348 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
349} 347}
350 348
351void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 349static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
352 struct seq_list *elem) 350{
351 read_unlock(&fs_info->tree_mod_log_lock);
352}
353
354static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
355{
356 write_lock(&fs_info->tree_mod_log_lock);
357}
358
359static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
360{
361 write_unlock(&fs_info->tree_mod_log_lock);
362}
363
364/*
365 * This adds a new blocker to the tree mod log's blocker list if the @elem
366 * passed does not already have a sequence number set. So when a caller expects
367 * to record tree modifications, it should ensure to set elem->seq to zero
368 * before calling btrfs_get_tree_mod_seq.
369 * Returns a fresh, unused tree log modification sequence number, even if no new
370 * blocker was added.
371 */
372u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
373 struct seq_list *elem)
353{ 374{
354 elem->flags = 1; 375 u64 seq;
376
377 tree_mod_log_write_lock(fs_info);
355 spin_lock(&fs_info->tree_mod_seq_lock); 378 spin_lock(&fs_info->tree_mod_seq_lock);
356 __get_tree_mod_seq(fs_info, elem); 379 if (!elem->seq) {
380 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
381 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
382 }
383 seq = btrfs_inc_tree_mod_seq(fs_info);
357 spin_unlock(&fs_info->tree_mod_seq_lock); 384 spin_unlock(&fs_info->tree_mod_seq_lock);
385 tree_mod_log_write_unlock(fs_info);
386
387 return seq;
358} 388}
359 389
360void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 390void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
371 if (!seq_putting) 401 if (!seq_putting)
372 return; 402 return;
373 403
374 BUG_ON(!(elem->flags & 1));
375 spin_lock(&fs_info->tree_mod_seq_lock); 404 spin_lock(&fs_info->tree_mod_seq_lock);
376 list_del(&elem->list); 405 list_del(&elem->list);
406 elem->seq = 0;
377 407
378 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { 408 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
379 if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { 409 if (cur_elem->seq < min_seq) {
380 if (seq_putting > cur_elem->seq) { 410 if (seq_putting > cur_elem->seq) {
381 /* 411 /*
382 * blocker with lower sequence number exists, we 412 * blocker with lower sequence number exists, we
383 * cannot remove anything from the log 413 * cannot remove anything from the log
384 */ 414 */
385 goto out; 415 spin_unlock(&fs_info->tree_mod_seq_lock);
416 return;
386 } 417 }
387 min_seq = cur_elem->seq; 418 min_seq = cur_elem->seq;
388 } 419 }
389 } 420 }
421 spin_unlock(&fs_info->tree_mod_seq_lock);
422
423 /*
424 * we removed the lowest blocker from the blocker list, so there may be
425 * more processible delayed refs.
426 */
427 wake_up(&fs_info->tree_mod_seq_wait);
390 428
391 /* 429 /*
392 * anything that's lower than the lowest existing (read: blocked) 430 * anything that's lower than the lowest existing (read: blocked)
393 * sequence number can be removed from the tree. 431 * sequence number can be removed from the tree.
394 */ 432 */
395 write_lock(&fs_info->tree_mod_log_lock); 433 tree_mod_log_write_lock(fs_info);
396 tm_root = &fs_info->tree_mod_log; 434 tm_root = &fs_info->tree_mod_log;
397 for (node = rb_first(tm_root); node; node = next) { 435 for (node = rb_first(tm_root); node; node = next) {
398 next = rb_next(node); 436 next = rb_next(node);
399 tm = container_of(node, struct tree_mod_elem, node); 437 tm = container_of(node, struct tree_mod_elem, node);
400 if (tm->elem.seq > min_seq) 438 if (tm->seq > min_seq)
401 continue; 439 continue;
402 rb_erase(node, tm_root); 440 rb_erase(node, tm_root);
403 list_del(&tm->elem.list);
404 kfree(tm); 441 kfree(tm);
405 } 442 }
406 write_unlock(&fs_info->tree_mod_log_lock); 443 tree_mod_log_write_unlock(fs_info);
407out:
408 spin_unlock(&fs_info->tree_mod_seq_lock);
409} 444}
410 445
411/* 446/*
@@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
423 struct rb_node **new; 458 struct rb_node **new;
424 struct rb_node *parent = NULL; 459 struct rb_node *parent = NULL;
425 struct tree_mod_elem *cur; 460 struct tree_mod_elem *cur;
426 int ret = 0;
427 461
428 BUG_ON(!tm || !tm->elem.seq); 462 BUG_ON(!tm || !tm->seq);
429 463
430 write_lock(&fs_info->tree_mod_log_lock);
431 tm_root = &fs_info->tree_mod_log; 464 tm_root = &fs_info->tree_mod_log;
432 new = &tm_root->rb_node; 465 new = &tm_root->rb_node;
433 while (*new) { 466 while (*new) {
@@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
437 new = &((*new)->rb_left); 470 new = &((*new)->rb_left);
438 else if (cur->index > tm->index) 471 else if (cur->index > tm->index)
439 new = &((*new)->rb_right); 472 new = &((*new)->rb_right);
440 else if (cur->elem.seq < tm->elem.seq) 473 else if (cur->seq < tm->seq)
441 new = &((*new)->rb_left); 474 new = &((*new)->rb_left);
442 else if (cur->elem.seq > tm->elem.seq) 475 else if (cur->seq > tm->seq)
443 new = &((*new)->rb_right); 476 new = &((*new)->rb_right);
444 else { 477 else {
445 kfree(tm); 478 kfree(tm);
446 ret = -EEXIST; 479 return -EEXIST;
447 goto unlock;
448 } 480 }
449 } 481 }
450 482
451 rb_link_node(&tm->node, parent, new); 483 rb_link_node(&tm->node, parent, new);
452 rb_insert_color(&tm->node, tm_root); 484 rb_insert_color(&tm->node, tm_root);
453unlock: 485 return 0;
454 write_unlock(&fs_info->tree_mod_log_lock);
455 return ret;
456} 486}
457 487
488/*
489 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
490 * returns zero with the tree_mod_log_lock acquired. The caller must hold
491 * this until all tree mod log insertions are recorded in the rb tree and then
492 * call tree_mod_log_write_unlock() to release.
493 */
458static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, 494static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 struct extent_buffer *eb) { 495 struct extent_buffer *eb) {
460 smp_mb(); 496 smp_mb();
461 if (list_empty(&(fs_info)->tree_mod_seq_list)) 497 if (list_empty(&(fs_info)->tree_mod_seq_list))
462 return 1; 498 return 1;
463 if (!eb) 499 if (eb && btrfs_header_level(eb) == 0)
464 return 0; 500 return 1;
465 if (btrfs_header_level(eb) == 0) 501
502 tree_mod_log_write_lock(fs_info);
503 if (list_empty(&fs_info->tree_mod_seq_list)) {
504 /*
505 * someone emptied the list while we were waiting for the lock.
506 * we must not add to the list when no blocker exists.
507 */
508 tree_mod_log_write_unlock(fs_info);
466 return 1; 509 return 1;
510 }
511
467 return 0; 512 return 0;
468} 513}
469 514
470/* 515/*
471 * This allocates memory and gets a tree modification sequence number when 516 * This allocates memory and gets a tree modification sequence number.
472 * needed.
473 * 517 *
474 * Returns 0 when no sequence number is needed, < 0 on error. 518 * Returns <0 on error.
475 * Returns 1 when a sequence number was added. In this case, 519 * Returns >0 (the added sequence number) on success.
476 * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
477 * after inserting into the rb tree.
478 */ 520 */
479static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, 521static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
480 struct tree_mod_elem **tm_ret) 522 struct tree_mod_elem **tm_ret)
481{ 523{
482 struct tree_mod_elem *tm; 524 struct tree_mod_elem *tm;
483 int seq;
484 525
485 if (tree_mod_dont_log(fs_info, NULL)) 526 /*
486 return 0; 527 * once we switch from spin locks to something different, we should
487 528 * honor the flags parameter here.
488 tm = *tm_ret = kzalloc(sizeof(*tm), flags); 529 */
530 tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
489 if (!tm) 531 if (!tm)
490 return -ENOMEM; 532 return -ENOMEM;
491 533
492 tm->elem.flags = 0; 534 tm->seq = btrfs_inc_tree_mod_seq(fs_info);
493 spin_lock(&fs_info->tree_mod_seq_lock); 535 return tm->seq;
494 if (list_empty(&fs_info->tree_mod_seq_list)) {
495 /*
496 * someone emptied the list while we were waiting for the lock.
497 * we must not add to the list, because no blocker exists. items
498 * are removed from the list only when the existing blocker is
499 * removed from the list.
500 */
501 kfree(tm);
502 seq = 0;
503 spin_unlock(&fs_info->tree_mod_seq_lock);
504 } else {
505 __get_tree_mod_seq(fs_info, &tm->elem);
506 seq = tm->elem.seq;
507 }
508
509 return seq;
510} 536}
511 537
512static noinline int 538static inline int
513tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, 539__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
514 struct extent_buffer *eb, int slot, 540 struct extent_buffer *eb, int slot,
515 enum mod_log_op op, gfp_t flags) 541 enum mod_log_op op, gfp_t flags)
516{ 542{
517 struct tree_mod_elem *tm;
518 int ret; 543 int ret;
544 struct tree_mod_elem *tm;
519 545
520 ret = tree_mod_alloc(fs_info, flags, &tm); 546 ret = tree_mod_alloc(fs_info, flags, &tm);
521 if (ret <= 0) 547 if (ret < 0)
522 return ret; 548 return ret;
523 549
524 tm->index = eb->start >> PAGE_CACHE_SHIFT; 550 tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
530 tm->slot = slot; 556 tm->slot = slot;
531 tm->generation = btrfs_node_ptr_generation(eb, slot); 557 tm->generation = btrfs_node_ptr_generation(eb, slot);
532 558
533 ret = __tree_mod_log_insert(fs_info, tm); 559 return __tree_mod_log_insert(fs_info, tm);
534 spin_unlock(&fs_info->tree_mod_seq_lock); 560}
561
562static noinline int
563tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
564 struct extent_buffer *eb, int slot,
565 enum mod_log_op op, gfp_t flags)
566{
567 int ret;
568
569 if (tree_mod_dont_log(fs_info, eb))
570 return 0;
571
572 ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
573
574 tree_mod_log_write_unlock(fs_info);
535 return ret; 575 return ret;
536} 576}
537 577
@@ -543,6 +583,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
543} 583}
544 584
545static noinline int 585static noinline int
586tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
587 struct extent_buffer *eb, int slot,
588 enum mod_log_op op)
589{
590 return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
591}
592
593static noinline int
546tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, 594tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
547 struct extent_buffer *eb, int dst_slot, int src_slot, 595 struct extent_buffer *eb, int dst_slot, int src_slot,
548 int nr_items, gfp_t flags) 596 int nr_items, gfp_t flags)
@@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
555 return 0; 603 return 0;
556 604
557 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 605 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
558 ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, 606 ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
559 MOD_LOG_KEY_REMOVE_WHILE_MOVING); 607 MOD_LOG_KEY_REMOVE_WHILE_MOVING);
560 BUG_ON(ret < 0); 608 BUG_ON(ret < 0);
561 } 609 }
562 610
563 ret = tree_mod_alloc(fs_info, flags, &tm); 611 ret = tree_mod_alloc(fs_info, flags, &tm);
564 if (ret <= 0) 612 if (ret < 0)
565 return ret; 613 goto out;
566 614
567 tm->index = eb->start >> PAGE_CACHE_SHIFT; 615 tm->index = eb->start >> PAGE_CACHE_SHIFT;
568 tm->slot = src_slot; 616 tm->slot = src_slot;
@@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
571 tm->op = MOD_LOG_MOVE_KEYS; 619 tm->op = MOD_LOG_MOVE_KEYS;
572 620
573 ret = __tree_mod_log_insert(fs_info, tm); 621 ret = __tree_mod_log_insert(fs_info, tm);
574 spin_unlock(&fs_info->tree_mod_seq_lock); 622out:
623 tree_mod_log_write_unlock(fs_info);
575 return ret; 624 return ret;
576} 625}
577 626
627static inline void
628__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
629{
630 int i;
631 u32 nritems;
632 int ret;
633
634 nritems = btrfs_header_nritems(eb);
635 for (i = nritems - 1; i >= 0; i--) {
636 ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
637 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
638 BUG_ON(ret < 0);
639 }
640}
641
578static noinline int 642static noinline int
579tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, 643tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
580 struct extent_buffer *old_root, 644 struct extent_buffer *old_root,
@@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
583 struct tree_mod_elem *tm; 647 struct tree_mod_elem *tm;
584 int ret; 648 int ret;
585 649
650 if (tree_mod_dont_log(fs_info, NULL))
651 return 0;
652
653 __tree_mod_log_free_eb(fs_info, old_root);
654
586 ret = tree_mod_alloc(fs_info, flags, &tm); 655 ret = tree_mod_alloc(fs_info, flags, &tm);
587 if (ret <= 0) 656 if (ret < 0)
588 return ret; 657 goto out;
589 658
590 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 659 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
591 tm->old_root.logical = old_root->start; 660 tm->old_root.logical = old_root->start;
@@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
594 tm->op = MOD_LOG_ROOT_REPLACE; 663 tm->op = MOD_LOG_ROOT_REPLACE;
595 664
596 ret = __tree_mod_log_insert(fs_info, tm); 665 ret = __tree_mod_log_insert(fs_info, tm);
597 spin_unlock(&fs_info->tree_mod_seq_lock); 666out:
667 tree_mod_log_write_unlock(fs_info);
598 return ret; 668 return ret;
599} 669}
600 670
@@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
608 struct tree_mod_elem *found = NULL; 678 struct tree_mod_elem *found = NULL;
609 u64 index = start >> PAGE_CACHE_SHIFT; 679 u64 index = start >> PAGE_CACHE_SHIFT;
610 680
611 read_lock(&fs_info->tree_mod_log_lock); 681 tree_mod_log_read_lock(fs_info);
612 tm_root = &fs_info->tree_mod_log; 682 tm_root = &fs_info->tree_mod_log;
613 node = tm_root->rb_node; 683 node = tm_root->rb_node;
614 while (node) { 684 while (node) {
@@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
617 node = node->rb_left; 687 node = node->rb_left;
618 } else if (cur->index > index) { 688 } else if (cur->index > index) {
619 node = node->rb_right; 689 node = node->rb_right;
620 } else if (cur->elem.seq < min_seq) { 690 } else if (cur->seq < min_seq) {
621 node = node->rb_left; 691 node = node->rb_left;
622 } else if (!smallest) { 692 } else if (!smallest) {
623 /* we want the node with the highest seq */ 693 /* we want the node with the highest seq */
624 if (found) 694 if (found)
625 BUG_ON(found->elem.seq > cur->elem.seq); 695 BUG_ON(found->seq > cur->seq);
626 found = cur; 696 found = cur;
627 node = node->rb_left; 697 node = node->rb_left;
628 } else if (cur->elem.seq > min_seq) { 698 } else if (cur->seq > min_seq) {
629 /* we want the node with the smallest seq */ 699 /* we want the node with the smallest seq */
630 if (found) 700 if (found)
631 BUG_ON(found->elem.seq < cur->elem.seq); 701 BUG_ON(found->seq < cur->seq);
632 found = cur; 702 found = cur;
633 node = node->rb_right; 703 node = node->rb_right;
634 } else { 704 } else {
@@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
636 break; 706 break;
637 } 707 }
638 } 708 }
639 read_unlock(&fs_info->tree_mod_log_lock); 709 tree_mod_log_read_unlock(fs_info);
640 710
641 return found; 711 return found;
642} 712}
@@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
664 return __tree_mod_log_search(fs_info, start, min_seq, 0); 734 return __tree_mod_log_search(fs_info, start, min_seq, 0);
665} 735}
666 736
667static inline void 737static noinline void
668tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, 738tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
669 struct extent_buffer *src, unsigned long dst_offset, 739 struct extent_buffer *src, unsigned long dst_offset,
670 unsigned long src_offset, int nr_items) 740 unsigned long src_offset, int nr_items)
@@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
675 if (tree_mod_dont_log(fs_info, NULL)) 745 if (tree_mod_dont_log(fs_info, NULL))
676 return; 746 return;
677 747
678 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 748 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
749 tree_mod_log_write_unlock(fs_info);
679 return; 750 return;
751 }
680 752
681 /* speed this up by single seq for all operations? */
682 for (i = 0; i < nr_items; i++) { 753 for (i = 0; i < nr_items; i++) {
683 ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, 754 ret = tree_mod_log_insert_key_locked(fs_info, src,
684 MOD_LOG_KEY_REMOVE); 755 i + src_offset,
756 MOD_LOG_KEY_REMOVE);
685 BUG_ON(ret < 0); 757 BUG_ON(ret < 0);
686 ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, 758 ret = tree_mod_log_insert_key_locked(fs_info, dst,
687 MOD_LOG_KEY_ADD); 759 i + dst_offset,
760 MOD_LOG_KEY_ADD);
688 BUG_ON(ret < 0); 761 BUG_ON(ret < 0);
689 } 762 }
763
764 tree_mod_log_write_unlock(fs_info);
690} 765}
691 766
692static inline void 767static inline void
@@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
699 BUG_ON(ret < 0); 774 BUG_ON(ret < 0);
700} 775}
701 776
702static inline void 777static noinline void
703tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
704 struct extent_buffer *eb, 779 struct extent_buffer *eb,
705 struct btrfs_disk_key *disk_key, int slot, int atomic) 780 struct btrfs_disk_key *disk_key, int slot, int atomic)
@@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
712 BUG_ON(ret < 0); 787 BUG_ON(ret < 0);
713} 788}
714 789
715static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 790static noinline void
716 struct extent_buffer *eb) 791tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
717{ 792{
718 int i;
719 int ret;
720 u32 nritems;
721
722 if (tree_mod_dont_log(fs_info, eb)) 793 if (tree_mod_dont_log(fs_info, eb))
723 return; 794 return;
724 795
725 nritems = btrfs_header_nritems(eb); 796 __tree_mod_log_free_eb(fs_info, eb);
726 for (i = nritems - 1; i >= 0; i--) { 797
727 ret = tree_mod_log_insert_key(fs_info, eb, i, 798 tree_mod_log_write_unlock(fs_info);
728 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
729 BUG_ON(ret < 0);
730 }
731} 799}
732 800
733static inline void 801static noinline void
734tree_mod_log_set_root_pointer(struct btrfs_root *root, 802tree_mod_log_set_root_pointer(struct btrfs_root *root,
735 struct extent_buffer *new_root_node) 803 struct extent_buffer *new_root_node)
736{ 804{
737 int ret; 805 int ret;
738 tree_mod_log_free_eb(root->fs_info, root->node);
739 ret = tree_mod_log_insert_root(root->fs_info, root->node, 806 ret = tree_mod_log_insert_root(root->fs_info, root->node,
740 new_root_node, GFP_NOFS); 807 new_root_node, GFP_NOFS);
741 BUG_ON(ret < 0); 808 BUG_ON(ret < 0);
@@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1069 unsigned long p_size = sizeof(struct btrfs_key_ptr); 1136 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1070 1137
1071 n = btrfs_header_nritems(eb); 1138 n = btrfs_header_nritems(eb);
1072 while (tm && tm->elem.seq >= time_seq) { 1139 while (tm && tm->seq >= time_seq) {
1073 /* 1140 /*
1074 * all the operations are recorded with the operator used for 1141 * all the operations are recorded with the operator used for
1075 * the modification. as we're going backwards, we do the 1142 * the modification. as we're going backwards, we do the
@@ -2722,6 +2789,80 @@ done:
2722} 2789}
2723 2790
2724/* 2791/*
2792 * helper to use instead of search slot if no exact match is needed but
2793 * instead the next or previous item should be returned.
2794 * When find_higher is true, the next higher item is returned, the next lower
2795 * otherwise.
2796 * When return_any and find_higher are both true, and no higher item is found,
2797 * return the next lower instead.
2798 * When return_any is true and find_higher is false, and no lower item is found,
2799 * return the next higher instead.
2800 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2801 * < 0 on error
2802 */
2803int btrfs_search_slot_for_read(struct btrfs_root *root,
2804 struct btrfs_key *key, struct btrfs_path *p,
2805 int find_higher, int return_any)
2806{
2807 int ret;
2808 struct extent_buffer *leaf;
2809
2810again:
2811 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2812 if (ret <= 0)
2813 return ret;
2814 /*
2815 * a return value of 1 means the path is at the position where the
2816 * item should be inserted. Normally this is the next bigger item,
2817 * but in case the previous item is the last in a leaf, path points
2818 * to the first free slot in the previous leaf, i.e. at an invalid
2819 * item.
2820 */
2821 leaf = p->nodes[0];
2822
2823 if (find_higher) {
2824 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2825 ret = btrfs_next_leaf(root, p);
2826 if (ret <= 0)
2827 return ret;
2828 if (!return_any)
2829 return 1;
2830 /*
2831 * no higher item found, return the next
2832 * lower instead
2833 */
2834 return_any = 0;
2835 find_higher = 0;
2836 btrfs_release_path(p);
2837 goto again;
2838 }
2839 } else {
2840 if (p->slots[0] == 0) {
2841 ret = btrfs_prev_leaf(root, p);
2842 if (ret < 0)
2843 return ret;
2844 if (!ret) {
2845 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2846 return 0;
2847 }
2848 if (!return_any)
2849 return 1;
2850 /*
2851 * no lower item found, return the next
2852 * higher instead
2853 */
2854 return_any = 0;
2855 find_higher = 1;
2856 btrfs_release_path(p);
2857 goto again;
2858 } else {
2859 --p->slots[0];
2860 }
2861 }
2862 return 0;
2863}
2864
2865/*
2725 * adjust the pointers going up the tree, starting at level 2866 * adjust the pointers going up the tree, starting at level
2726 * making sure the right key of each node is points to 'key'. 2867 * making sure the right key of each node is points to 'key'.
2727 * This is used after shifting pointers to the left, so it stops 2868 * This is used after shifting pointers to the left, so it stops
@@ -4931,6 +5072,431 @@ out:
4931 return ret; 5072 return ret;
4932} 5073}
4933 5074
5075static void tree_move_down(struct btrfs_root *root,
5076 struct btrfs_path *path,
5077 int *level, int root_level)
5078{
5079 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5080 path->slots[*level]);
5081 path->slots[*level - 1] = 0;
5082 (*level)--;
5083}
5084
5085static int tree_move_next_or_upnext(struct btrfs_root *root,
5086 struct btrfs_path *path,
5087 int *level, int root_level)
5088{
5089 int ret = 0;
5090 int nritems;
5091 nritems = btrfs_header_nritems(path->nodes[*level]);
5092
5093 path->slots[*level]++;
5094
5095 while (path->slots[*level] == nritems) {
5096 if (*level == root_level)
5097 return -1;
5098
5099 /* move upnext */
5100 path->slots[*level] = 0;
5101 free_extent_buffer(path->nodes[*level]);
5102 path->nodes[*level] = NULL;
5103 (*level)++;
5104 path->slots[*level]++;
5105
5106 nritems = btrfs_header_nritems(path->nodes[*level]);
5107 ret = 1;
5108 }
5109 return ret;
5110}
5111
5112/*
5113 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5114 * or down.
5115 */
5116static int tree_advance(struct btrfs_root *root,
5117 struct btrfs_path *path,
5118 int *level, int root_level,
5119 int allow_down,
5120 struct btrfs_key *key)
5121{
5122 int ret;
5123
5124 if (*level == 0 || !allow_down) {
5125 ret = tree_move_next_or_upnext(root, path, level, root_level);
5126 } else {
5127 tree_move_down(root, path, level, root_level);
5128 ret = 0;
5129 }
5130 if (ret >= 0) {
5131 if (*level == 0)
5132 btrfs_item_key_to_cpu(path->nodes[*level], key,
5133 path->slots[*level]);
5134 else
5135 btrfs_node_key_to_cpu(path->nodes[*level], key,
5136 path->slots[*level]);
5137 }
5138 return ret;
5139}
5140
5141static int tree_compare_item(struct btrfs_root *left_root,
5142 struct btrfs_path *left_path,
5143 struct btrfs_path *right_path,
5144 char *tmp_buf)
5145{
5146 int cmp;
5147 int len1, len2;
5148 unsigned long off1, off2;
5149
5150 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5151 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5152 if (len1 != len2)
5153 return 1;
5154
5155 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5156 off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5157 right_path->slots[0]);
5158
5159 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5160
5161 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5162 if (cmp)
5163 return 1;
5164 return 0;
5165}
5166
5167#define ADVANCE 1
5168#define ADVANCE_ONLY_NEXT -1
5169
5170/*
5171 * This function compares two trees and calls the provided callback for
5172 * every changed/new/deleted item it finds.
5173 * If shared tree blocks are encountered, whole subtrees are skipped, making
5174 * the compare pretty fast on snapshotted subvolumes.
5175 *
5176 * This currently works on commit roots only. As commit roots are read only,
5177 * we don't do any locking. The commit roots are protected with transactions.
5178 * Transactions are ended and rejoined when a commit is tried in between.
5179 *
5180 * This function checks for modifications done to the trees while comparing.
5181 * If it detects a change, it aborts immediately.
5182 */
5183int btrfs_compare_trees(struct btrfs_root *left_root,
5184 struct btrfs_root *right_root,
5185 btrfs_changed_cb_t changed_cb, void *ctx)
5186{
5187 int ret;
5188 int cmp;
5189 struct btrfs_trans_handle *trans = NULL;
5190 struct btrfs_path *left_path = NULL;
5191 struct btrfs_path *right_path = NULL;
5192 struct btrfs_key left_key;
5193 struct btrfs_key right_key;
5194 char *tmp_buf = NULL;
5195 int left_root_level;
5196 int right_root_level;
5197 int left_level;
5198 int right_level;
5199 int left_end_reached;
5200 int right_end_reached;
5201 int advance_left;
5202 int advance_right;
5203 u64 left_blockptr;
5204 u64 right_blockptr;
5205 u64 left_start_ctransid;
5206 u64 right_start_ctransid;
5207 u64 ctransid;
5208
5209 left_path = btrfs_alloc_path();
5210 if (!left_path) {
5211 ret = -ENOMEM;
5212 goto out;
5213 }
5214 right_path = btrfs_alloc_path();
5215 if (!right_path) {
5216 ret = -ENOMEM;
5217 goto out;
5218 }
5219
5220 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5221 if (!tmp_buf) {
5222 ret = -ENOMEM;
5223 goto out;
5224 }
5225
5226 left_path->search_commit_root = 1;
5227 left_path->skip_locking = 1;
5228 right_path->search_commit_root = 1;
5229 right_path->skip_locking = 1;
5230
5231 spin_lock(&left_root->root_times_lock);
5232 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5233 spin_unlock(&left_root->root_times_lock);
5234
5235 spin_lock(&right_root->root_times_lock);
5236 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5237 spin_unlock(&right_root->root_times_lock);
5238
5239 trans = btrfs_join_transaction(left_root);
5240 if (IS_ERR(trans)) {
5241 ret = PTR_ERR(trans);
5242 trans = NULL;
5243 goto out;
5244 }
5245
5246 /*
5247 * Strategy: Go to the first items of both trees. Then do
5248 *
5249 * If both trees are at level 0
5250 * Compare keys of current items
5251 * If left < right treat left item as new, advance left tree
5252 * and repeat
5253 * If left > right treat right item as deleted, advance right tree
5254 * and repeat
5255 * If left == right do deep compare of items, treat as changed if
5256 * needed, advance both trees and repeat
5257 * If both trees are at the same level but not at level 0
5258 * Compare keys of current nodes/leafs
5259 * If left < right advance left tree and repeat
5260 * If left > right advance right tree and repeat
5261 * If left == right compare blockptrs of the next nodes/leafs
5262 * If they match advance both trees but stay at the same level
5263 * and repeat
5264 * If they don't match advance both trees while allowing to go
5265 * deeper and repeat
5266 * If tree levels are different
5267 * Advance the tree that needs it and repeat
5268 *
5269 * Advancing a tree means:
5270 * If we are at level 0, try to go to the next slot. If that's not
5271 * possible, go one level up and repeat. Stop when we found a level
5272 * where we could go to the next slot. We may at this point be on a
5273 * node or a leaf.
5274 *
5275 * If we are not at level 0 and not on shared tree blocks, go one
5276 * level deeper.
5277 *
5278 * If we are not at level 0 and on shared tree blocks, go one slot to
5279 * the right if possible or go up and right.
5280 */
5281
5282 left_level = btrfs_header_level(left_root->commit_root);
5283 left_root_level = left_level;
5284 left_path->nodes[left_level] = left_root->commit_root;
5285 extent_buffer_get(left_path->nodes[left_level]);
5286
5287 right_level = btrfs_header_level(right_root->commit_root);
5288 right_root_level = right_level;
5289 right_path->nodes[right_level] = right_root->commit_root;
5290 extent_buffer_get(right_path->nodes[right_level]);
5291
5292 if (left_level == 0)
5293 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5294 &left_key, left_path->slots[left_level]);
5295 else
5296 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5297 &left_key, left_path->slots[left_level]);
5298 if (right_level == 0)
5299 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5300 &right_key, right_path->slots[right_level]);
5301 else
5302 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5303 &right_key, right_path->slots[right_level]);
5304
5305 left_end_reached = right_end_reached = 0;
5306 advance_left = advance_right = 0;
5307
5308 while (1) {
5309 /*
5310 * We need to make sure the transaction does not get committed
5311 * while we do anything on commit roots. This means, we need to
5312 * join and leave transactions for every item that we process.
5313 */
5314 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5315 btrfs_release_path(left_path);
5316 btrfs_release_path(right_path);
5317
5318 ret = btrfs_end_transaction(trans, left_root);
5319 trans = NULL;
5320 if (ret < 0)
5321 goto out;
5322 }
5323 /* now rejoin the transaction */
5324 if (!trans) {
5325 trans = btrfs_join_transaction(left_root);
5326 if (IS_ERR(trans)) {
5327 ret = PTR_ERR(trans);
5328 trans = NULL;
5329 goto out;
5330 }
5331
5332 spin_lock(&left_root->root_times_lock);
5333 ctransid = btrfs_root_ctransid(&left_root->root_item);
5334 spin_unlock(&left_root->root_times_lock);
5335 if (ctransid != left_start_ctransid)
5336 left_start_ctransid = 0;
5337
5338 spin_lock(&right_root->root_times_lock);
5339 ctransid = btrfs_root_ctransid(&right_root->root_item);
5340 spin_unlock(&right_root->root_times_lock);
5341 if (ctransid != right_start_ctransid)
5342 right_start_ctransid = 0;
5343
5344 if (!left_start_ctransid || !right_start_ctransid) {
5345 WARN(1, KERN_WARNING
5346 "btrfs: btrfs_compare_tree detected "
5347 "a change in one of the trees while "
5348 "iterating. This is probably a "
5349 "bug.\n");
5350 ret = -EIO;
5351 goto out;
5352 }
5353
5354 /*
5355 * the commit root may have changed, so start again
5356 * where we stopped
5357 */
5358 left_path->lowest_level = left_level;
5359 right_path->lowest_level = right_level;
5360 ret = btrfs_search_slot(NULL, left_root,
5361 &left_key, left_path, 0, 0);
5362 if (ret < 0)
5363 goto out;
5364 ret = btrfs_search_slot(NULL, right_root,
5365 &right_key, right_path, 0, 0);
5366 if (ret < 0)
5367 goto out;
5368 }
5369
5370 if (advance_left && !left_end_reached) {
5371 ret = tree_advance(left_root, left_path, &left_level,
5372 left_root_level,
5373 advance_left != ADVANCE_ONLY_NEXT,
5374 &left_key);
5375 if (ret < 0)
5376 left_end_reached = ADVANCE;
5377 advance_left = 0;
5378 }
5379 if (advance_right && !right_end_reached) {
5380 ret = tree_advance(right_root, right_path, &right_level,
5381 right_root_level,
5382 advance_right != ADVANCE_ONLY_NEXT,
5383 &right_key);
5384 if (ret < 0)
5385 right_end_reached = ADVANCE;
5386 advance_right = 0;
5387 }
5388
5389 if (left_end_reached && right_end_reached) {
5390 ret = 0;
5391 goto out;
5392 } else if (left_end_reached) {
5393 if (right_level == 0) {
5394 ret = changed_cb(left_root, right_root,
5395 left_path, right_path,
5396 &right_key,
5397 BTRFS_COMPARE_TREE_DELETED,
5398 ctx);
5399 if (ret < 0)
5400 goto out;
5401 }
5402 advance_right = ADVANCE;
5403 continue;
5404 } else if (right_end_reached) {
5405 if (left_level == 0) {
5406 ret = changed_cb(left_root, right_root,
5407 left_path, right_path,
5408 &left_key,
5409 BTRFS_COMPARE_TREE_NEW,
5410 ctx);
5411 if (ret < 0)
5412 goto out;
5413 }
5414 advance_left = ADVANCE;
5415 continue;
5416 }
5417
5418 if (left_level == 0 && right_level == 0) {
5419 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5420 if (cmp < 0) {
5421 ret = changed_cb(left_root, right_root,
5422 left_path, right_path,
5423 &left_key,
5424 BTRFS_COMPARE_TREE_NEW,
5425 ctx);
5426 if (ret < 0)
5427 goto out;
5428 advance_left = ADVANCE;
5429 } else if (cmp > 0) {
5430 ret = changed_cb(left_root, right_root,
5431 left_path, right_path,
5432 &right_key,
5433 BTRFS_COMPARE_TREE_DELETED,
5434 ctx);
5435 if (ret < 0)
5436 goto out;
5437 advance_right = ADVANCE;
5438 } else {
5439 ret = tree_compare_item(left_root, left_path,
5440 right_path, tmp_buf);
5441 if (ret) {
5442 ret = changed_cb(left_root, right_root,
5443 left_path, right_path,
5444 &left_key,
5445 BTRFS_COMPARE_TREE_CHANGED,
5446 ctx);
5447 if (ret < 0)
5448 goto out;
5449 }
5450 advance_left = ADVANCE;
5451 advance_right = ADVANCE;
5452 }
5453 } else if (left_level == right_level) {
5454 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5455 if (cmp < 0) {
5456 advance_left = ADVANCE;
5457 } else if (cmp > 0) {
5458 advance_right = ADVANCE;
5459 } else {
5460 left_blockptr = btrfs_node_blockptr(
5461 left_path->nodes[left_level],
5462 left_path->slots[left_level]);
5463 right_blockptr = btrfs_node_blockptr(
5464 right_path->nodes[right_level],
5465 right_path->slots[right_level]);
5466 if (left_blockptr == right_blockptr) {
5467 /*
5468 * As we're on a shared block, don't
5469 * allow to go deeper.
5470 */
5471 advance_left = ADVANCE_ONLY_NEXT;
5472 advance_right = ADVANCE_ONLY_NEXT;
5473 } else {
5474 advance_left = ADVANCE;
5475 advance_right = ADVANCE;
5476 }
5477 }
5478 } else if (left_level < right_level) {
5479 advance_right = ADVANCE;
5480 } else {
5481 advance_left = ADVANCE;
5482 }
5483 }
5484
5485out:
5486 btrfs_free_path(left_path);
5487 btrfs_free_path(right_path);
5488 kfree(tmp_buf);
5489
5490 if (trans) {
5491 if (!ret)
5492 ret = btrfs_end_transaction(trans, left_root);
5493 else
5494 btrfs_end_transaction(trans, left_root);
5495 }
5496
5497 return ret;
5498}
5499
4934/* 5500/*
4935 * this is similar to btrfs_next_leaf, but does not try to preserve 5501 * this is similar to btrfs_next_leaf, but does not try to preserve
4936 * and fixup the path. It looks for and returns the next key in the 5502 * and fixup the path. It looks for and returns the next key in the
@@ -5127,6 +5693,7 @@ again:
5127 * locked. To solve this situation, we give up 5693 * locked. To solve this situation, we give up
5128 * on our lock and cycle. 5694 * on our lock and cycle.
5129 */ 5695 */
5696 free_extent_buffer(next);
5130 btrfs_release_path(path); 5697 btrfs_release_path(path);
5131 cond_resched(); 5698 cond_resched();
5132 goto again; 5699 goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b39075..4bab807227ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
91/* for storing balance parameters in the root tree */ 91/* for storing balance parameters in the root tree */
92#define BTRFS_BALANCE_OBJECTID -4ULL 92#define BTRFS_BALANCE_OBJECTID -4ULL
93 93
94/* holds quota configuration and tracking */
95#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
96
94/* orhpan objectid for tracking unlinked/truncated files */ 97/* orhpan objectid for tracking unlinked/truncated files */
95#define BTRFS_ORPHAN_OBJECTID -5ULL 98#define BTRFS_ORPHAN_OBJECTID -5ULL
96 99
@@ -709,6 +712,36 @@ struct btrfs_root_item {
709 struct btrfs_disk_key drop_progress; 712 struct btrfs_disk_key drop_progress;
710 u8 drop_level; 713 u8 drop_level;
711 u8 level; 714 u8 level;
715
716 /*
717 * The following fields appear after subvol_uuids+subvol_times
718 * were introduced.
719 */
720
721 /*
722 * This generation number is used to test if the new fields are valid
723 * and up to date while reading the root item. Everytime the root item
724 * is written out, the "generation" field is copied into this field. If
725 * anyone ever mounted the fs with an older kernel, we will have
726 * mismatching generation values here and thus must invalidate the
727 * new fields. See btrfs_update_root and btrfs_find_last_root for
728 * details.
729 * the offset of generation_v2 is also used as the start for the memset
730 * when invalidating the fields.
731 */
732 __le64 generation_v2;
733 u8 uuid[BTRFS_UUID_SIZE];
734 u8 parent_uuid[BTRFS_UUID_SIZE];
735 u8 received_uuid[BTRFS_UUID_SIZE];
736 __le64 ctransid; /* updated when an inode changes */
737 __le64 otransid; /* trans when created */
738 __le64 stransid; /* trans when sent. non-zero for received subvol */
739 __le64 rtransid; /* trans when received. non-zero for received subvol */
740 struct btrfs_timespec ctime;
741 struct btrfs_timespec otime;
742 struct btrfs_timespec stime;
743 struct btrfs_timespec rtime;
744 __le64 reserved[8]; /* for future */
712} __attribute__ ((__packed__)); 745} __attribute__ ((__packed__));
713 746
714/* 747/*
@@ -883,6 +916,72 @@ struct btrfs_block_group_item {
883 __le64 flags; 916 __le64 flags;
884} __attribute__ ((__packed__)); 917} __attribute__ ((__packed__));
885 918
919/*
920 * is subvolume quota turned on?
921 */
922#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
923/*
924 * SCANNING is set during the initialization phase
925 */
926#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1)
927/*
928 * Some qgroup entries are known to be out of date,
929 * either because the configuration has changed in a way that
930 * makes a rescan necessary, or because the fs has been mounted
931 * with a non-qgroup-aware version.
932 * Turning qouta off and on again makes it inconsistent, too.
933 */
934#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
935
936#define BTRFS_QGROUP_STATUS_VERSION 1
937
938struct btrfs_qgroup_status_item {
939 __le64 version;
940 /*
941 * the generation is updated during every commit. As older
942 * versions of btrfs are not aware of qgroups, it will be
943 * possible to detect inconsistencies by checking the
944 * generation on mount time
945 */
946 __le64 generation;
947
948 /* flag definitions see above */
949 __le64 flags;
950
951 /*
952 * only used during scanning to record the progress
953 * of the scan. It contains a logical address
954 */
955 __le64 scan;
956} __attribute__ ((__packed__));
957
958struct btrfs_qgroup_info_item {
959 __le64 generation;
960 __le64 rfer;
961 __le64 rfer_cmpr;
962 __le64 excl;
963 __le64 excl_cmpr;
964} __attribute__ ((__packed__));
965
966/* flags definition for qgroup limits */
967#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
968#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
969#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
970#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
971#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
972#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
973
974struct btrfs_qgroup_limit_item {
975 /*
976 * only updated when any of the other values change
977 */
978 __le64 flags;
979 __le64 max_rfer;
980 __le64 max_excl;
981 __le64 rsv_rfer;
982 __le64 rsv_excl;
983} __attribute__ ((__packed__));
984
886struct btrfs_space_info { 985struct btrfs_space_info {
887 u64 flags; 986 u64 flags;
888 987
@@ -1030,6 +1129,13 @@ struct btrfs_block_group_cache {
1030 struct list_head cluster_list; 1129 struct list_head cluster_list;
1031}; 1130};
1032 1131
1132/* delayed seq elem */
1133struct seq_list {
1134 struct list_head list;
1135 u64 seq;
1136};
1137
1138/* fs_info */
1033struct reloc_control; 1139struct reloc_control;
1034struct btrfs_device; 1140struct btrfs_device;
1035struct btrfs_fs_devices; 1141struct btrfs_fs_devices;
@@ -1044,6 +1150,7 @@ struct btrfs_fs_info {
1044 struct btrfs_root *dev_root; 1150 struct btrfs_root *dev_root;
1045 struct btrfs_root *fs_root; 1151 struct btrfs_root *fs_root;
1046 struct btrfs_root *csum_root; 1152 struct btrfs_root *csum_root;
1153 struct btrfs_root *quota_root;
1047 1154
1048 /* the log root tree is a directory of all the other log roots */ 1155 /* the log root tree is a directory of all the other log roots */
1049 struct btrfs_root *log_root_tree; 1156 struct btrfs_root *log_root_tree;
@@ -1144,6 +1251,8 @@ struct btrfs_fs_info {
1144 spinlock_t tree_mod_seq_lock; 1251 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq; 1252 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list; 1253 struct list_head tree_mod_seq_list;
1254 struct seq_list tree_mod_seq_elem;
1255 wait_queue_head_t tree_mod_seq_wait;
1147 1256
1148 /* this protects tree_mod_log */ 1257 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock; 1258 rwlock_t tree_mod_log_lock;
@@ -1240,6 +1349,8 @@ struct btrfs_fs_info {
1240 */ 1349 */
1241 struct list_head space_info; 1350 struct list_head space_info;
1242 1351
1352 struct btrfs_space_info *data_sinfo;
1353
1243 struct reloc_control *reloc_ctl; 1354 struct reloc_control *reloc_ctl;
1244 1355
1245 spinlock_t delalloc_lock; 1356 spinlock_t delalloc_lock;
@@ -1296,6 +1407,29 @@ struct btrfs_fs_info {
1296#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1407#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1297 u32 check_integrity_print_mask; 1408 u32 check_integrity_print_mask;
1298#endif 1409#endif
1410 /*
1411 * quota information
1412 */
1413 unsigned int quota_enabled:1;
1414
1415 /*
1416 * quota_enabled only changes state after a commit. This holds the
1417 * next state.
1418 */
1419 unsigned int pending_quota_state:1;
1420
1421 /* is qgroup tracking in a consistent state? */
1422 u64 qgroup_flags;
1423
1424 /* holds configuration and tracking. Protected by qgroup_lock */
1425 struct rb_root qgroup_tree;
1426 spinlock_t qgroup_lock;
1427
1428 /* list of dirty qgroups to be written at next commit */
1429 struct list_head dirty_qgroups;
1430
1431 /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
1432 u64 qgroup_seq;
1299 1433
1300 /* filesystem state */ 1434 /* filesystem state */
1301 u64 fs_state; 1435 u64 fs_state;
@@ -1416,6 +1550,8 @@ struct btrfs_root {
1416 dev_t anon_dev; 1550 dev_t anon_dev;
1417 1551
1418 int force_cow; 1552 int force_cow;
1553
1554 spinlock_t root_times_lock;
1419}; 1555};
1420 1556
1421struct btrfs_ioctl_defrag_range_args { 1557struct btrfs_ioctl_defrag_range_args {
@@ -1525,6 +1661,30 @@ struct btrfs_ioctl_defrag_range_args {
1525#define BTRFS_DEV_ITEM_KEY 216 1661#define BTRFS_DEV_ITEM_KEY 216
1526#define BTRFS_CHUNK_ITEM_KEY 228 1662#define BTRFS_CHUNK_ITEM_KEY 228
1527 1663
1664/*
1665 * Records the overall state of the qgroups.
1666 * There's only one instance of this key present,
1667 * (0, BTRFS_QGROUP_STATUS_KEY, 0)
1668 */
1669#define BTRFS_QGROUP_STATUS_KEY 240
1670/*
1671 * Records the currently used space of the qgroup.
1672 * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
1673 */
1674#define BTRFS_QGROUP_INFO_KEY 242
1675/*
1676 * Contains the user configured limits for the qgroup.
1677 * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
1678 */
1679#define BTRFS_QGROUP_LIMIT_KEY 244
1680/*
1681 * Records the child-parent relationship of qgroups. For
1682 * each relation, 2 keys are present:
1683 * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
1684 * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
1685 */
1686#define BTRFS_QGROUP_RELATION_KEY 246
1687
1528#define BTRFS_BALANCE_ITEM_KEY 248 1688#define BTRFS_BALANCE_ITEM_KEY 248
1529 1689
1530/* 1690/*
@@ -1621,13 +1781,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1621 offsetof(type, member), \ 1781 offsetof(type, member), \
1622 sizeof(((type *)0)->member))) 1782 sizeof(((type *)0)->member)))
1623 1783
1624#ifndef BTRFS_SETGET_FUNCS 1784#define DECLARE_BTRFS_SETGET_BITS(bits) \
1785u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
1786 unsigned long off, \
1787 struct btrfs_map_token *token); \
1788void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
1789 unsigned long off, u##bits val, \
1790 struct btrfs_map_token *token); \
1791static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
1792 unsigned long off) \
1793{ \
1794 return btrfs_get_token_##bits(eb, ptr, off, NULL); \
1795} \
1796static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
1797 unsigned long off, u##bits val) \
1798{ \
1799 btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
1800}
1801
1802DECLARE_BTRFS_SETGET_BITS(8)
1803DECLARE_BTRFS_SETGET_BITS(16)
1804DECLARE_BTRFS_SETGET_BITS(32)
1805DECLARE_BTRFS_SETGET_BITS(64)
1806
1625#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 1807#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
1626u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ 1808static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
1627u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ 1809{ \
1628void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ 1810 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1629void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); 1811 return btrfs_get_##bits(eb, s, offsetof(type, member)); \
1630#endif 1812} \
1813static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
1814 u##bits val) \
1815{ \
1816 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1817 btrfs_set_##bits(eb, s, offsetof(type, member), val); \
1818} \
1819static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
1820 struct btrfs_map_token *token) \
1821{ \
1822 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1823 return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
1824} \
1825static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
1826 type *s, u##bits val, \
1827 struct btrfs_map_token *token) \
1828{ \
1829 BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
1830 btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
1831}
1631 1832
1632#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ 1833#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1633static inline u##bits btrfs_##name(struct extent_buffer *eb) \ 1834static inline u##bits btrfs_##name(struct extent_buffer *eb) \
@@ -2189,6 +2390,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
2189BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); 2390BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
2190BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 2391BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
2191 last_snapshot, 64); 2392 last_snapshot, 64);
2393BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
2394 generation_v2, 64);
2395BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
2396 ctransid, 64);
2397BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
2398 otransid, 64);
2399BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
2400 stransid, 64);
2401BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
2402 rtransid, 64);
2192 2403
2193static inline bool btrfs_root_readonly(struct btrfs_root *root) 2404static inline bool btrfs_root_readonly(struct btrfs_root *root)
2194{ 2405{
@@ -2465,6 +2676,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
2465 sizeof(val)); 2676 sizeof(val));
2466} 2677}
2467 2678
2679/* btrfs_qgroup_status_item */
2680BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
2681 generation, 64);
2682BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
2683 version, 64);
2684BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
2685 flags, 64);
2686BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
2687 scan, 64);
2688
2689/* btrfs_qgroup_info_item */
2690BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
2691 generation, 64);
2692BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
2693BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
2694 rfer_cmpr, 64);
2695BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
2696BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
2697 excl_cmpr, 64);
2698
2699BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
2700 struct btrfs_qgroup_info_item, generation, 64);
2701BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
2702 rfer, 64);
2703BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
2704 struct btrfs_qgroup_info_item, rfer_cmpr, 64);
2705BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
2706 excl, 64);
2707BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
2708 struct btrfs_qgroup_info_item, excl_cmpr, 64);
2709
2710/* btrfs_qgroup_limit_item */
2711BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
2712 flags, 64);
2713BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
2714 max_rfer, 64);
2715BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
2716 max_excl, 64);
2717BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2718 rsv_rfer, 64);
2719BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2720 rsv_excl, 64);
2721
2468static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2722static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2469{ 2723{
2470 return sb->s_fs_info; 2724 return sb->s_fs_info;
@@ -2607,7 +2861,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2607 struct btrfs_root *root, u64 group_start); 2861 struct btrfs_root *root, u64 group_start);
2608u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2862u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2609u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2863u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2610void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2611void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2864void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2612int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2865int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2613void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2866void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
@@ -2661,6 +2914,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2661int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); 2914int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2662 2915
2663int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 2916int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2917int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2918 struct btrfs_fs_info *fs_info);
2664/* ctree.c */ 2919/* ctree.c */
2665int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2920int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2666 int level, int *slot); 2921 int level, int *slot);
@@ -2680,6 +2935,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
2680 struct btrfs_key *max_key, 2935 struct btrfs_key *max_key,
2681 struct btrfs_path *path, int cache_only, 2936 struct btrfs_path *path, int cache_only,
2682 u64 min_trans); 2937 u64 min_trans);
2938enum btrfs_compare_tree_result {
2939 BTRFS_COMPARE_TREE_NEW,
2940 BTRFS_COMPARE_TREE_DELETED,
2941 BTRFS_COMPARE_TREE_CHANGED,
2942};
2943typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
2944 struct btrfs_root *right_root,
2945 struct btrfs_path *left_path,
2946 struct btrfs_path *right_path,
2947 struct btrfs_key *key,
2948 enum btrfs_compare_tree_result result,
2949 void *ctx);
2950int btrfs_compare_trees(struct btrfs_root *left_root,
2951 struct btrfs_root *right_root,
2952 btrfs_changed_cb_t cb, void *ctx);
2683int btrfs_cow_block(struct btrfs_trans_handle *trans, 2953int btrfs_cow_block(struct btrfs_trans_handle *trans,
2684 struct btrfs_root *root, struct extent_buffer *buf, 2954 struct btrfs_root *root, struct extent_buffer *buf,
2685 struct extent_buffer *parent, int parent_slot, 2955 struct extent_buffer *parent, int parent_slot,
@@ -2711,6 +2981,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2711 ins_len, int cow); 2981 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 2982int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq); 2983 struct btrfs_path *p, u64 time_seq);
2984int btrfs_search_slot_for_read(struct btrfs_root *root,
2985 struct btrfs_key *key, struct btrfs_path *p,
2986 int find_higher, int return_any);
2714int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2987int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2715 struct btrfs_root *root, struct extent_buffer *parent, 2988 struct btrfs_root *root, struct extent_buffer *parent,
2716 int start_slot, int cache_only, u64 *last_ret, 2989 int start_slot, int cache_only, u64 *last_ret,
@@ -2793,11 +3066,22 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2793 kfree(fs_info->chunk_root); 3066 kfree(fs_info->chunk_root);
2794 kfree(fs_info->dev_root); 3067 kfree(fs_info->dev_root);
2795 kfree(fs_info->csum_root); 3068 kfree(fs_info->csum_root);
3069 kfree(fs_info->quota_root);
2796 kfree(fs_info->super_copy); 3070 kfree(fs_info->super_copy);
2797 kfree(fs_info->super_for_commit); 3071 kfree(fs_info->super_for_commit);
2798 kfree(fs_info); 3072 kfree(fs_info);
2799} 3073}
2800 3074
3075/* tree mod log functions from ctree.c */
3076u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3077 struct seq_list *elem);
3078void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3079 struct seq_list *elem);
3080static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
3081{
3082 return atomic_inc_return(&fs_info->tree_mod_seq);
3083}
3084
2801/* root-item.c */ 3085/* root-item.c */
2802int btrfs_find_root_ref(struct btrfs_root *tree_root, 3086int btrfs_find_root_ref(struct btrfs_root *tree_root,
2803 struct btrfs_path *path, 3087 struct btrfs_path *path,
@@ -2819,6 +3103,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
2819 struct btrfs_root *root, 3103 struct btrfs_root *root,
2820 struct btrfs_key *key, 3104 struct btrfs_key *key,
2821 struct btrfs_root_item *item); 3105 struct btrfs_root_item *item);
3106void btrfs_read_root_item(struct btrfs_root *root,
3107 struct extent_buffer *eb, int slot,
3108 struct btrfs_root_item *item);
2822int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 3109int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2823 btrfs_root_item *item, struct btrfs_key *key); 3110 btrfs_root_item *item, struct btrfs_key *key);
2824int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 3111int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2826,6 +3113,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2826void btrfs_set_root_node(struct btrfs_root_item *item, 3113void btrfs_set_root_node(struct btrfs_root_item *item,
2827 struct extent_buffer *node); 3114 struct extent_buffer *node);
2828void btrfs_check_and_init_root_item(struct btrfs_root_item *item); 3115void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
3116void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3117 struct btrfs_root *root);
2829 3118
2830/* dir-item.c */ 3119/* dir-item.c */
2831int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3120int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -3053,14 +3342,43 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
3053/* super.c */ 3342/* super.c */
3054int btrfs_parse_options(struct btrfs_root *root, char *options); 3343int btrfs_parse_options(struct btrfs_root *root, char *options);
3055int btrfs_sync_fs(struct super_block *sb, int wait); 3344int btrfs_sync_fs(struct super_block *sb, int wait);
3345
3346#ifdef CONFIG_PRINTK
3347__printf(2, 3)
3056void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); 3348void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
3349#else
3350static inline __printf(2, 3)
3351void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
3352{
3353}
3354#endif
3355
3356__printf(5, 6)
3057void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 3357void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
3058 unsigned int line, int errno, const char *fmt, ...); 3358 unsigned int line, int errno, const char *fmt, ...);
3059 3359
3360
3060void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, 3361void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
3061 struct btrfs_root *root, const char *function, 3362 struct btrfs_root *root, const char *function,
3062 unsigned int line, int errno); 3363 unsigned int line, int errno);
3063 3364
3365#define btrfs_set_fs_incompat(__fs_info, opt) \
3366 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
3367
3368static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3369 u64 flag)
3370{
3371 struct btrfs_super_block *disk_super;
3372 u64 features;
3373
3374 disk_super = fs_info->super_copy;
3375 features = btrfs_super_incompat_flags(disk_super);
3376 if (!(features & flag)) {
3377 features |= flag;
3378 btrfs_set_super_incompat_flags(disk_super, features);
3379 }
3380}
3381
3064#define btrfs_abort_transaction(trans, root, errno) \ 3382#define btrfs_abort_transaction(trans, root, errno) \
3065do { \ 3383do { \
3066 __btrfs_abort_transaction(trans, root, __func__, \ 3384 __btrfs_abort_transaction(trans, root, __func__, \
@@ -3080,6 +3398,7 @@ do { \
3080 (errno), fmt, ##args); \ 3398 (errno), fmt, ##args); \
3081} while (0) 3399} while (0)
3082 3400
3401__printf(5, 6)
3083void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, 3402void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
3084 unsigned int line, int errno, const char *fmt, ...); 3403 unsigned int line, int errno, const char *fmt, ...);
3085 3404
@@ -3156,17 +3475,49 @@ void btrfs_reada_detach(void *handle);
3156int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3475int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3157 u64 start, int err); 3476 u64 start, int err);
3158 3477
3159/* delayed seq elem */ 3478/* qgroup.c */
3160struct seq_list { 3479struct qgroup_update {
3161 struct list_head list; 3480 struct list_head list;
3162 u64 seq; 3481 struct btrfs_delayed_ref_node *node;
3163 u32 flags; 3482 struct btrfs_delayed_extent_op *extent_op;
3164}; 3483};
3165 3484
3166void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 3485int btrfs_quota_enable(struct btrfs_trans_handle *trans,
3167 struct seq_list *elem); 3486 struct btrfs_fs_info *fs_info);
3168void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 3487int btrfs_quota_disable(struct btrfs_trans_handle *trans,
3169 struct seq_list *elem); 3488 struct btrfs_fs_info *fs_info);
3489int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
3490int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
3491 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3492int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
3493 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3494int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
3495 struct btrfs_fs_info *fs_info, u64 qgroupid,
3496 char *name);
3497int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
3498 struct btrfs_fs_info *fs_info, u64 qgroupid);
3499int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
3500 struct btrfs_fs_info *fs_info, u64 qgroupid,
3501 struct btrfs_qgroup_limit *limit);
3502int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
3503void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
3504struct btrfs_delayed_extent_op;
3505int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
3506 struct btrfs_delayed_ref_node *node,
3507 struct btrfs_delayed_extent_op *extent_op);
3508int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
3509 struct btrfs_fs_info *fs_info,
3510 struct btrfs_delayed_ref_node *node,
3511 struct btrfs_delayed_extent_op *extent_op);
3512int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
3513 struct btrfs_fs_info *fs_info);
3514int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
3515 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
3516 struct btrfs_qgroup_inherit *inherit);
3517int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
3518void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
3519
3520void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
3170 3521
3171static inline int is_fstree(u64 rootid) 3522static inline int is_fstree(u64 rootid)
3172{ 3523{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2399f4086915..335605c8ceab 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node(
62 INIT_LIST_HEAD(&delayed_node->n_list); 62 INIT_LIST_HEAD(&delayed_node->n_list);
63 INIT_LIST_HEAD(&delayed_node->p_list); 63 INIT_LIST_HEAD(&delayed_node->p_list);
64 delayed_node->bytes_reserved = 0; 64 delayed_node->bytes_reserved = 0;
65 memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
65} 66}
66 67
67static inline int btrfs_is_continuous_delayed_item( 68static inline int btrfs_is_continuous_delayed_item(
@@ -1113,8 +1114,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1113 * Returns < 0 on error and returns with an aborted transaction with any 1114 * Returns < 0 on error and returns with an aborted transaction with any
1114 * outstanding delayed items cleaned up. 1115 * outstanding delayed items cleaned up.
1115 */ 1116 */
1116int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 1117static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1117 struct btrfs_root *root) 1118 struct btrfs_root *root, int nr)
1118{ 1119{
1119 struct btrfs_root *curr_root = root; 1120 struct btrfs_root *curr_root = root;
1120 struct btrfs_delayed_root *delayed_root; 1121 struct btrfs_delayed_root *delayed_root;
@@ -1122,6 +1123,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1122 struct btrfs_path *path; 1123 struct btrfs_path *path;
1123 struct btrfs_block_rsv *block_rsv; 1124 struct btrfs_block_rsv *block_rsv;
1124 int ret = 0; 1125 int ret = 0;
1126 bool count = (nr > 0);
1125 1127
1126 if (trans->aborted) 1128 if (trans->aborted)
1127 return -EIO; 1129 return -EIO;
@@ -1137,7 +1139,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1137 delayed_root = btrfs_get_delayed_root(root); 1139 delayed_root = btrfs_get_delayed_root(root);
1138 1140
1139 curr_node = btrfs_first_delayed_node(delayed_root); 1141 curr_node = btrfs_first_delayed_node(delayed_root);
1140 while (curr_node) { 1142 while (curr_node && (!count || (count && nr--))) {
1141 curr_root = curr_node->root; 1143 curr_root = curr_node->root;
1142 ret = btrfs_insert_delayed_items(trans, path, curr_root, 1144 ret = btrfs_insert_delayed_items(trans, path, curr_root,
1143 curr_node); 1145 curr_node);
@@ -1149,6 +1151,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1149 path, curr_node); 1151 path, curr_node);
1150 if (ret) { 1152 if (ret) {
1151 btrfs_release_delayed_node(curr_node); 1153 btrfs_release_delayed_node(curr_node);
1154 curr_node = NULL;
1152 btrfs_abort_transaction(trans, root, ret); 1155 btrfs_abort_transaction(trans, root, ret);
1153 break; 1156 break;
1154 } 1157 }
@@ -1158,12 +1161,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1158 btrfs_release_delayed_node(prev_node); 1161 btrfs_release_delayed_node(prev_node);
1159 } 1162 }
1160 1163
1164 if (curr_node)
1165 btrfs_release_delayed_node(curr_node);
1161 btrfs_free_path(path); 1166 btrfs_free_path(path);
1162 trans->block_rsv = block_rsv; 1167 trans->block_rsv = block_rsv;
1163 1168
1164 return ret; 1169 return ret;
1165} 1170}
1166 1171
1172int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1173 struct btrfs_root *root)
1174{
1175 return __btrfs_run_delayed_items(trans, root, -1);
1176}
1177
1178int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
1179 struct btrfs_root *root, int nr)
1180{
1181 return __btrfs_run_delayed_items(trans, root, nr);
1182}
1183
1167static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1184static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1168 struct btrfs_delayed_node *node) 1185 struct btrfs_delayed_node *node)
1169{ 1186{
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f5aa4023d3e1..4f808e1baeed 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode);
107 107
108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, 108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root); 109 struct btrfs_root *root);
110int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root, int nr);
110 112
111void btrfs_balance_delayed_items(struct btrfs_root *root); 113void btrfs_balance_delayed_items(struct btrfs_root *root);
112 114
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b04790e..da7419ed01bb 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
233 return 0; 233 return 0;
234} 234}
235 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 236int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
237 struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq) 238 u64 seq)
238{ 239{
239 struct seq_list *elem; 240 struct seq_list *elem;
240 241 int ret = 0;
241 assert_spin_locked(&delayed_refs->lock); 242
242 if (list_empty(&delayed_refs->seq_head)) 243 spin_lock(&fs_info->tree_mod_seq_lock);
243 return 0; 244 if (!list_empty(&fs_info->tree_mod_seq_list)) {
244 245 elem = list_first_entry(&fs_info->tree_mod_seq_list,
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); 246 struct seq_list, list);
246 if (seq >= elem->seq) { 247 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", 248 pr_debug("holding back delayed_ref %llu, lowest is "
248 seq, elem->seq, delayed_refs); 249 "%llu (%p)\n", seq, elem->seq, delayed_refs);
249 return 1; 250 ret = 1;
251 }
250 } 252 }
251 return 0; 253
254 spin_unlock(&fs_info->tree_mod_seq_lock);
255 return ret;
252} 256}
253 257
254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 258int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -525,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 529 ref->is_head = 0;
526 ref->in_tree = 1; 530 ref->in_tree = 1;
527 531
528 if (is_fstree(ref_root)) 532 if (need_ref_seq(for_cow, ref_root))
529 seq = inc_delayed_seq(delayed_refs); 533 seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
530 ref->seq = seq; 534 ref->seq = seq;
531 535
532 full_ref = btrfs_delayed_node_to_tree_ref(ref); 536 full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -584,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 588 ref->is_head = 0;
585 ref->in_tree = 1; 589 ref->in_tree = 1;
586 590
587 if (is_fstree(ref_root)) 591 if (need_ref_seq(for_cow, ref_root))
588 seq = inc_delayed_seq(delayed_refs); 592 seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
589 ref->seq = seq; 593 ref->seq = seq;
590 594
591 full_ref = btrfs_delayed_node_to_data_ref(ref); 595 full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -658,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 662 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 663 num_bytes, parent, ref_root, level, action,
660 for_cow); 664 for_cow);
661 if (!is_fstree(ref_root) && 665 if (!need_ref_seq(for_cow, ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 666 waitqueue_active(&fs_info->tree_mod_seq_wait))
663 wake_up(&delayed_refs->seq_wait); 667 wake_up(&fs_info->tree_mod_seq_wait);
664 spin_unlock(&delayed_refs->lock); 668 spin_unlock(&delayed_refs->lock);
669 if (need_ref_seq(for_cow, ref_root))
670 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
665 671
666 return 0; 672 return 0;
667} 673}
@@ -707,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 713 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
708 num_bytes, parent, ref_root, owner, offset, 714 num_bytes, parent, ref_root, owner, offset,
709 action, for_cow); 715 action, for_cow);
710 if (!is_fstree(ref_root) && 716 if (!need_ref_seq(for_cow, ref_root) &&
711 waitqueue_active(&delayed_refs->seq_wait)) 717 waitqueue_active(&fs_info->tree_mod_seq_wait))
712 wake_up(&delayed_refs->seq_wait); 718 wake_up(&fs_info->tree_mod_seq_wait);
713 spin_unlock(&delayed_refs->lock); 719 spin_unlock(&delayed_refs->lock);
720 if (need_ref_seq(for_cow, ref_root))
721 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
714 722
715 return 0; 723 return 0;
716} 724}
@@ -736,8 +744,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
736 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
737 extent_op->is_data); 745 extent_op->is_data);
738 746
739 if (waitqueue_active(&delayed_refs->seq_wait)) 747 if (waitqueue_active(&fs_info->tree_mod_seq_wait))
740 wake_up(&delayed_refs->seq_wait); 748 wake_up(&fs_info->tree_mod_seq_wait);
741 spin_unlock(&delayed_refs->lock); 749 spin_unlock(&delayed_refs->lock);
742 return 0; 750 return 0;
743} 751}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927fb9957..0d7c90c366b6 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
139 int flushing; 139 int flushing;
140 140
141 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
162}; 142};
163 143
164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 144static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -195,34 +175,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 175int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 176 struct list_head *cluster, u64 search_start);
197 177
198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 178int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
199{ 179 struct btrfs_delayed_ref_root *delayed_refs,
200 assert_spin_locked(&delayed_refs->lock); 180 u64 seq);
201 ++delayed_refs->seq;
202 return delayed_refs->seq;
203}
204 181
205static inline void 182/*
206btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 183 * delayed refs with a ref_seq > 0 must be held back during backref walking.
207 struct seq_list *elem) 184 * this only applies to items in one of the fs-trees. for_cow items never need
185 * to be held back, so they won't get a ref_seq number.
186 */
187static inline int need_ref_seq(int for_cow, u64 rootid)
208{ 188{
209 assert_spin_locked(&delayed_refs->lock); 189 if (for_cow)
210 elem->seq = delayed_refs->seq; 190 return 0;
211 list_add_tail(&elem->list, &delayed_refs->seq_head);
212}
213 191
214static inline void 192 if (rootid == BTRFS_FS_TREE_OBJECTID)
215btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 193 return 1;
216 struct seq_list *elem)
217{
218 spin_lock(&delayed_refs->lock);
219 list_del(&elem->list);
220 wake_up(&delayed_refs->seq_wait);
221 spin_unlock(&delayed_refs->lock);
222}
223 194
224int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 195 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
225 u64 seq); 196 return 1;
197
198 return 0;
199}
226 200
227/* 201/*
228 * a node might live in a head or a regular ref, this lets you 202 * a node might live in a head or a regular ref, this lets you
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2936ca49b3b4..62e0cafd6e25 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
407 break; 407 break;
408 } 408 }
409 409
410 if (failed && !ret) 410 if (failed && !ret && failed_mirror)
411 repair_eb_io_failure(root, eb, failed_mirror); 411 repair_eb_io_failure(root, eb, failed_mirror);
412 412
413 return ret; 413 return ret;
@@ -1114,7 +1114,7 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1114 spin_unlock(&root->fs_info->delalloc_lock); 1114 spin_unlock(&root->fs_info->delalloc_lock);
1115 btrfs_panic(root->fs_info, -EOVERFLOW, 1115 btrfs_panic(root->fs_info, -EOVERFLOW,
1116 "Can't clear %lu bytes from " 1116 "Can't clear %lu bytes from "
1117 " dirty_mdatadata_bytes (%lu)", 1117 " dirty_mdatadata_bytes (%llu)",
1118 buf->len, 1118 buf->len,
1119 root->fs_info->dirty_metadata_bytes); 1119 root->fs_info->dirty_metadata_bytes);
1120 } 1120 }
@@ -1182,6 +1182,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1182 root->defrag_running = 0; 1182 root->defrag_running = 0;
1183 root->root_key.objectid = objectid; 1183 root->root_key.objectid = objectid;
1184 root->anon_dev = 0; 1184 root->anon_dev = 0;
1185
1186 spin_lock_init(&root->root_times_lock);
1185} 1187}
1186 1188
1187static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1189static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1225,6 +1227,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1225 return root; 1227 return root;
1226} 1228}
1227 1229
1230struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1231 struct btrfs_fs_info *fs_info,
1232 u64 objectid)
1233{
1234 struct extent_buffer *leaf;
1235 struct btrfs_root *tree_root = fs_info->tree_root;
1236 struct btrfs_root *root;
1237 struct btrfs_key key;
1238 int ret = 0;
1239 u64 bytenr;
1240
1241 root = btrfs_alloc_root(fs_info);
1242 if (!root)
1243 return ERR_PTR(-ENOMEM);
1244
1245 __setup_root(tree_root->nodesize, tree_root->leafsize,
1246 tree_root->sectorsize, tree_root->stripesize,
1247 root, fs_info, objectid);
1248 root->root_key.objectid = objectid;
1249 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1250 root->root_key.offset = 0;
1251
1252 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
1253 0, objectid, NULL, 0, 0, 0);
1254 if (IS_ERR(leaf)) {
1255 ret = PTR_ERR(leaf);
1256 goto fail;
1257 }
1258
1259 bytenr = leaf->start;
1260 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1261 btrfs_set_header_bytenr(leaf, leaf->start);
1262 btrfs_set_header_generation(leaf, trans->transid);
1263 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1264 btrfs_set_header_owner(leaf, objectid);
1265 root->node = leaf;
1266
1267 write_extent_buffer(leaf, fs_info->fsid,
1268 (unsigned long)btrfs_header_fsid(leaf),
1269 BTRFS_FSID_SIZE);
1270 write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
1271 (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
1272 BTRFS_UUID_SIZE);
1273 btrfs_mark_buffer_dirty(leaf);
1274
1275 root->commit_root = btrfs_root_node(root);
1276 root->track_dirty = 1;
1277
1278
1279 root->root_item.flags = 0;
1280 root->root_item.byte_limit = 0;
1281 btrfs_set_root_bytenr(&root->root_item, leaf->start);
1282 btrfs_set_root_generation(&root->root_item, trans->transid);
1283 btrfs_set_root_level(&root->root_item, 0);
1284 btrfs_set_root_refs(&root->root_item, 1);
1285 btrfs_set_root_used(&root->root_item, leaf->len);
1286 btrfs_set_root_last_snapshot(&root->root_item, 0);
1287 btrfs_set_root_dirid(&root->root_item, 0);
1288 root->root_item.drop_level = 0;
1289
1290 key.objectid = objectid;
1291 key.type = BTRFS_ROOT_ITEM_KEY;
1292 key.offset = 0;
1293 ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1294 if (ret)
1295 goto fail;
1296
1297 btrfs_tree_unlock(leaf);
1298
1299fail:
1300 if (ret)
1301 return ERR_PTR(ret);
1302
1303 return root;
1304}
1305
1228static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1306static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1229 struct btrfs_fs_info *fs_info) 1307 struct btrfs_fs_info *fs_info)
1230{ 1308{
@@ -1326,6 +1404,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1326 u64 generation; 1404 u64 generation;
1327 u32 blocksize; 1405 u32 blocksize;
1328 int ret = 0; 1406 int ret = 0;
1407 int slot;
1329 1408
1330 root = btrfs_alloc_root(fs_info); 1409 root = btrfs_alloc_root(fs_info);
1331 if (!root) 1410 if (!root)
@@ -1352,9 +1431,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1352 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1431 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1353 if (ret == 0) { 1432 if (ret == 0) {
1354 l = path->nodes[0]; 1433 l = path->nodes[0];
1355 read_extent_buffer(l, &root->root_item, 1434 slot = path->slots[0];
1356 btrfs_item_ptr_offset(l, path->slots[0]), 1435 btrfs_read_root_item(tree_root, l, slot, &root->root_item);
1357 sizeof(root->root_item));
1358 memcpy(&root->root_key, location, sizeof(*location)); 1436 memcpy(&root->root_key, location, sizeof(*location));
1359 } 1437 }
1360 btrfs_free_path(path); 1438 btrfs_free_path(path);
@@ -1396,6 +1474,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1396 return fs_info->dev_root; 1474 return fs_info->dev_root;
1397 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) 1475 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1398 return fs_info->csum_root; 1476 return fs_info->csum_root;
1477 if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1478 return fs_info->quota_root ? fs_info->quota_root :
1479 ERR_PTR(-ENOENT);
1399again: 1480again:
1400 spin_lock(&fs_info->fs_roots_radix_lock); 1481 spin_lock(&fs_info->fs_roots_radix_lock);
1401 root = radix_tree_lookup(&fs_info->fs_roots_radix, 1482 root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1533,8 +1614,6 @@ static int cleaner_kthread(void *arg)
1533 struct btrfs_root *root = arg; 1614 struct btrfs_root *root = arg;
1534 1615
1535 do { 1616 do {
1536 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1537
1538 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1617 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1539 mutex_trylock(&root->fs_info->cleaner_mutex)) { 1618 mutex_trylock(&root->fs_info->cleaner_mutex)) {
1540 btrfs_run_delayed_iputs(root); 1619 btrfs_run_delayed_iputs(root);
@@ -1566,7 +1645,6 @@ static int transaction_kthread(void *arg)
1566 do { 1645 do {
1567 cannot_commit = false; 1646 cannot_commit = false;
1568 delay = HZ * 30; 1647 delay = HZ * 30;
1569 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1570 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1648 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1571 1649
1572 spin_lock(&root->fs_info->trans_lock); 1650 spin_lock(&root->fs_info->trans_lock);
@@ -1823,6 +1901,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1823 free_extent_buffer(info->extent_root->commit_root); 1901 free_extent_buffer(info->extent_root->commit_root);
1824 free_extent_buffer(info->csum_root->node); 1902 free_extent_buffer(info->csum_root->node);
1825 free_extent_buffer(info->csum_root->commit_root); 1903 free_extent_buffer(info->csum_root->commit_root);
1904 if (info->quota_root) {
1905 free_extent_buffer(info->quota_root->node);
1906 free_extent_buffer(info->quota_root->commit_root);
1907 }
1826 1908
1827 info->tree_root->node = NULL; 1909 info->tree_root->node = NULL;
1828 info->tree_root->commit_root = NULL; 1910 info->tree_root->commit_root = NULL;
@@ -1832,6 +1914,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1832 info->extent_root->commit_root = NULL; 1914 info->extent_root->commit_root = NULL;
1833 info->csum_root->node = NULL; 1915 info->csum_root->node = NULL;
1834 info->csum_root->commit_root = NULL; 1916 info->csum_root->commit_root = NULL;
1917 if (info->quota_root) {
1918 info->quota_root->node = NULL;
1919 info->quota_root->commit_root = NULL;
1920 }
1835 1921
1836 if (chunk_root) { 1922 if (chunk_root) {
1837 free_extent_buffer(info->chunk_root->node); 1923 free_extent_buffer(info->chunk_root->node);
@@ -1862,6 +1948,7 @@ int open_ctree(struct super_block *sb,
1862 struct btrfs_root *csum_root; 1948 struct btrfs_root *csum_root;
1863 struct btrfs_root *chunk_root; 1949 struct btrfs_root *chunk_root;
1864 struct btrfs_root *dev_root; 1950 struct btrfs_root *dev_root;
1951 struct btrfs_root *quota_root;
1865 struct btrfs_root *log_tree_root; 1952 struct btrfs_root *log_tree_root;
1866 int ret; 1953 int ret;
1867 int err = -EINVAL; 1954 int err = -EINVAL;
@@ -1873,9 +1960,10 @@ int open_ctree(struct super_block *sb,
1873 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); 1960 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1874 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 1961 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1875 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 1962 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1963 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
1876 1964
1877 if (!tree_root || !extent_root || !csum_root || 1965 if (!tree_root || !extent_root || !csum_root ||
1878 !chunk_root || !dev_root) { 1966 !chunk_root || !dev_root || !quota_root) {
1879 err = -ENOMEM; 1967 err = -ENOMEM;
1880 goto fail; 1968 goto fail;
1881 } 1969 }
@@ -1944,6 +2032,8 @@ int open_ctree(struct super_block *sb,
1944 fs_info->free_chunk_space = 0; 2032 fs_info->free_chunk_space = 0;
1945 fs_info->tree_mod_log = RB_ROOT; 2033 fs_info->tree_mod_log = RB_ROOT;
1946 2034
2035 init_waitqueue_head(&fs_info->tree_mod_seq_wait);
2036
1947 /* readahead state */ 2037 /* readahead state */
1948 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2038 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1949 spin_lock_init(&fs_info->reada_lock); 2039 spin_lock_init(&fs_info->reada_lock);
@@ -2032,6 +2122,13 @@ int open_ctree(struct super_block *sb,
2032 init_rwsem(&fs_info->cleanup_work_sem); 2122 init_rwsem(&fs_info->cleanup_work_sem);
2033 init_rwsem(&fs_info->subvol_sem); 2123 init_rwsem(&fs_info->subvol_sem);
2034 2124
2125 spin_lock_init(&fs_info->qgroup_lock);
2126 fs_info->qgroup_tree = RB_ROOT;
2127 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2128 fs_info->qgroup_seq = 1;
2129 fs_info->quota_enabled = 0;
2130 fs_info->pending_quota_state = 0;
2131
2035 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2132 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2036 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 2133 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2037 2134
@@ -2244,7 +2341,7 @@ int open_ctree(struct super_block *sb,
2244 ret |= btrfs_start_workers(&fs_info->caching_workers); 2341 ret |= btrfs_start_workers(&fs_info->caching_workers);
2245 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2342 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2246 if (ret) { 2343 if (ret) {
2247 ret = -ENOMEM; 2344 err = -ENOMEM;
2248 goto fail_sb_buffer; 2345 goto fail_sb_buffer;
2249 } 2346 }
2250 2347
@@ -2356,6 +2453,17 @@ retry_root_backup:
2356 goto recovery_tree_root; 2453 goto recovery_tree_root;
2357 csum_root->track_dirty = 1; 2454 csum_root->track_dirty = 1;
2358 2455
2456 ret = find_and_setup_root(tree_root, fs_info,
2457 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
2458 if (ret) {
2459 kfree(quota_root);
2460 quota_root = fs_info->quota_root = NULL;
2461 } else {
2462 quota_root->track_dirty = 1;
2463 fs_info->quota_enabled = 1;
2464 fs_info->pending_quota_state = 1;
2465 }
2466
2359 fs_info->generation = generation; 2467 fs_info->generation = generation;
2360 fs_info->last_trans_committed = generation; 2468 fs_info->last_trans_committed = generation;
2361 2469
@@ -2415,6 +2523,9 @@ retry_root_backup:
2415 " integrity check module %s\n", sb->s_id); 2523 " integrity check module %s\n", sb->s_id);
2416 } 2524 }
2417#endif 2525#endif
2526 ret = btrfs_read_qgroup_config(fs_info);
2527 if (ret)
2528 goto fail_trans_kthread;
2418 2529
2419 /* do not make disk changes in broken FS */ 2530 /* do not make disk changes in broken FS */
2420 if (btrfs_super_log_root(disk_super) != 0 && 2531 if (btrfs_super_log_root(disk_super) != 0 &&
@@ -2425,7 +2536,7 @@ retry_root_backup:
2425 printk(KERN_WARNING "Btrfs log replay required " 2536 printk(KERN_WARNING "Btrfs log replay required "
2426 "on RO media\n"); 2537 "on RO media\n");
2427 err = -EIO; 2538 err = -EIO;
2428 goto fail_trans_kthread; 2539 goto fail_qgroup;
2429 } 2540 }
2430 blocksize = 2541 blocksize =
2431 btrfs_level_size(tree_root, 2542 btrfs_level_size(tree_root,
@@ -2434,7 +2545,7 @@ retry_root_backup:
2434 log_tree_root = btrfs_alloc_root(fs_info); 2545 log_tree_root = btrfs_alloc_root(fs_info);
2435 if (!log_tree_root) { 2546 if (!log_tree_root) {
2436 err = -ENOMEM; 2547 err = -ENOMEM;
2437 goto fail_trans_kthread; 2548 goto fail_qgroup;
2438 } 2549 }
2439 2550
2440 __setup_root(nodesize, leafsize, sectorsize, stripesize, 2551 __setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2466,15 +2577,15 @@ retry_root_backup:
2466 2577
2467 if (!(sb->s_flags & MS_RDONLY)) { 2578 if (!(sb->s_flags & MS_RDONLY)) {
2468 ret = btrfs_cleanup_fs_roots(fs_info); 2579 ret = btrfs_cleanup_fs_roots(fs_info);
2469 if (ret) { 2580 if (ret)
2470 } 2581 goto fail_trans_kthread;
2471 2582
2472 ret = btrfs_recover_relocation(tree_root); 2583 ret = btrfs_recover_relocation(tree_root);
2473 if (ret < 0) { 2584 if (ret < 0) {
2474 printk(KERN_WARNING 2585 printk(KERN_WARNING
2475 "btrfs: failed to recover relocation\n"); 2586 "btrfs: failed to recover relocation\n");
2476 err = -EINVAL; 2587 err = -EINVAL;
2477 goto fail_trans_kthread; 2588 goto fail_qgroup;
2478 } 2589 }
2479 } 2590 }
2480 2591
@@ -2484,10 +2595,10 @@ retry_root_backup:
2484 2595
2485 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2596 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2486 if (!fs_info->fs_root) 2597 if (!fs_info->fs_root)
2487 goto fail_trans_kthread; 2598 goto fail_qgroup;
2488 if (IS_ERR(fs_info->fs_root)) { 2599 if (IS_ERR(fs_info->fs_root)) {
2489 err = PTR_ERR(fs_info->fs_root); 2600 err = PTR_ERR(fs_info->fs_root);
2490 goto fail_trans_kthread; 2601 goto fail_qgroup;
2491 } 2602 }
2492 2603
2493 if (sb->s_flags & MS_RDONLY) 2604 if (sb->s_flags & MS_RDONLY)
@@ -2511,6 +2622,8 @@ retry_root_backup:
2511 2622
2512 return 0; 2623 return 0;
2513 2624
2625fail_qgroup:
2626 btrfs_free_qgroup_config(fs_info);
2514fail_trans_kthread: 2627fail_trans_kthread:
2515 kthread_stop(fs_info->transaction_kthread); 2628 kthread_stop(fs_info->transaction_kthread);
2516fail_cleaner: 2629fail_cleaner:
@@ -3109,6 +3222,8 @@ int close_ctree(struct btrfs_root *root)
3109 fs_info->closing = 2; 3222 fs_info->closing = 2;
3110 smp_mb(); 3223 smp_mb();
3111 3224
3225 btrfs_free_qgroup_config(root->fs_info);
3226
3112 if (fs_info->delalloc_bytes) { 3227 if (fs_info->delalloc_bytes) {
3113 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3228 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3114 (unsigned long long)fs_info->delalloc_bytes); 3229 (unsigned long long)fs_info->delalloc_bytes);
@@ -3128,6 +3243,10 @@ int close_ctree(struct btrfs_root *root)
3128 free_extent_buffer(fs_info->dev_root->commit_root); 3243 free_extent_buffer(fs_info->dev_root->commit_root);
3129 free_extent_buffer(fs_info->csum_root->node); 3244 free_extent_buffer(fs_info->csum_root->node);
3130 free_extent_buffer(fs_info->csum_root->commit_root); 3245 free_extent_buffer(fs_info->csum_root->commit_root);
3246 if (fs_info->quota_root) {
3247 free_extent_buffer(fs_info->quota_root->node);
3248 free_extent_buffer(fs_info->quota_root->commit_root);
3249 }
3131 3250
3132 btrfs_free_block_groups(fs_info); 3251 btrfs_free_block_groups(fs_info);
3133 3252
@@ -3258,7 +3377,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3258 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3377 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3259} 3378}
3260 3379
3261static int btree_lock_page_hook(struct page *page, void *data, 3380int btree_lock_page_hook(struct page *page, void *data,
3262 void (*flush_fn)(void *)) 3381 void (*flush_fn)(void *))
3263{ 3382{
3264 struct inode *inode = page->mapping->host; 3383 struct inode *inode = page->mapping->host;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b3fab39f7e..95e147eea239 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
89int btrfs_cleanup_transaction(struct btrfs_root *root); 89int btrfs_cleanup_transaction(struct btrfs_root *root);
90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
91 struct btrfs_root *root); 91 struct btrfs_root *root);
92void btrfs_abort_devices(struct btrfs_root *root);
93struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
94 struct btrfs_fs_info *fs_info,
95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *));
92 98
93#ifdef CONFIG_DEBUG_LOCK_ALLOC 99#ifdef CONFIG_DEBUG_LOCK_ALLOC
94void btrfs_init_lockdep(void); 100void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff7..4e1b153b7c47 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36 36
37#undef SCRAMBLE_DELAYED_REFS
38
37/* 39/*
38 * control flags for do_chunk_alloc's force field 40 * control flags for do_chunk_alloc's force field
39 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 41 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2217 struct btrfs_delayed_ref_node *ref; 2219 struct btrfs_delayed_ref_node *ref;
2218 struct btrfs_delayed_ref_head *locked_ref = NULL; 2220 struct btrfs_delayed_ref_head *locked_ref = NULL;
2219 struct btrfs_delayed_extent_op *extent_op; 2221 struct btrfs_delayed_extent_op *extent_op;
2222 struct btrfs_fs_info *fs_info = root->fs_info;
2220 int ret; 2223 int ret;
2221 int count = 0; 2224 int count = 0;
2222 int must_insert_reserved = 0; 2225 int must_insert_reserved = 0;
@@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2255 ref = select_delayed_ref(locked_ref); 2258 ref = select_delayed_ref(locked_ref);
2256 2259
2257 if (ref && ref->seq && 2260 if (ref && ref->seq &&
2258 btrfs_check_delayed_seq(delayed_refs, ref->seq)) { 2261 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2259 /* 2262 /*
2260 * there are still refs with lower seq numbers in the 2263 * there are still refs with lower seq numbers in the
2261 * process of being added. Don't run this ref yet. 2264 * process of being added. Don't run this ref yet.
@@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 } 2340 }
2338 2341
2339next: 2342next:
2340 do_chunk_alloc(trans, root->fs_info->extent_root, 2343 do_chunk_alloc(trans, fs_info->extent_root,
2341 2 * 1024 * 1024, 2344 2 * 1024 * 1024,
2342 btrfs_get_alloc_profile(root, 0), 2345 btrfs_get_alloc_profile(root, 0),
2343 CHUNK_ALLOC_NO_FORCE); 2346 CHUNK_ALLOC_NO_FORCE);
@@ -2347,21 +2350,99 @@ next:
2347 return count; 2350 return count;
2348} 2351}
2349 2352
2350static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, 2353static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
2354 struct btrfs_delayed_ref_root *delayed_refs,
2351 unsigned long num_refs, 2355 unsigned long num_refs,
2352 struct list_head *first_seq) 2356 struct list_head *first_seq)
2353{ 2357{
2354 spin_unlock(&delayed_refs->lock); 2358 spin_unlock(&delayed_refs->lock);
2355 pr_debug("waiting for more refs (num %ld, first %p)\n", 2359 pr_debug("waiting for more refs (num %ld, first %p)\n",
2356 num_refs, first_seq); 2360 num_refs, first_seq);
2357 wait_event(delayed_refs->seq_wait, 2361 wait_event(fs_info->tree_mod_seq_wait,
2358 num_refs != delayed_refs->num_entries || 2362 num_refs != delayed_refs->num_entries ||
2359 delayed_refs->seq_head.next != first_seq); 2363 fs_info->tree_mod_seq_list.next != first_seq);
2360 pr_debug("done waiting for more refs (num %ld, first %p)\n", 2364 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2361 delayed_refs->num_entries, delayed_refs->seq_head.next); 2365 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
2362 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2363} 2367}
2364 2368
2369#ifdef SCRAMBLE_DELAYED_REFS
2370/*
2371 * Normally delayed refs get processed in ascending bytenr order. This
2372 * correlates in most cases to the order added. To expose dependencies on this
2373 * order, we start to process the tree in the middle instead of the beginning
2374 */
2375static u64 find_middle(struct rb_root *root)
2376{
2377 struct rb_node *n = root->rb_node;
2378 struct btrfs_delayed_ref_node *entry;
2379 int alt = 1;
2380 u64 middle;
2381 u64 first = 0, last = 0;
2382
2383 n = rb_first(root);
2384 if (n) {
2385 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2386 first = entry->bytenr;
2387 }
2388 n = rb_last(root);
2389 if (n) {
2390 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2391 last = entry->bytenr;
2392 }
2393 n = root->rb_node;
2394
2395 while (n) {
2396 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2397 WARN_ON(!entry->in_tree);
2398
2399 middle = entry->bytenr;
2400
2401 if (alt)
2402 n = n->rb_left;
2403 else
2404 n = n->rb_right;
2405
2406 alt = 1 - alt;
2407 }
2408 return middle;
2409}
2410#endif
2411
2412int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2413 struct btrfs_fs_info *fs_info)
2414{
2415 struct qgroup_update *qgroup_update;
2416 int ret = 0;
2417
2418 if (list_empty(&trans->qgroup_ref_list) !=
2419 !trans->delayed_ref_elem.seq) {
2420 /* list without seq or seq without list */
2421 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2422 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2423 trans->delayed_ref_elem.seq);
2424 BUG();
2425 }
2426
2427 if (!trans->delayed_ref_elem.seq)
2428 return 0;
2429
2430 while (!list_empty(&trans->qgroup_ref_list)) {
2431 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2432 struct qgroup_update, list);
2433 list_del(&qgroup_update->list);
2434 if (!ret)
2435 ret = btrfs_qgroup_account_ref(
2436 trans, fs_info, qgroup_update->node,
2437 qgroup_update->extent_op);
2438 kfree(qgroup_update);
2439 }
2440
2441 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2442
2443 return ret;
2444}
2445
2365/* 2446/*
2366 * this starts processing the delayed reference count updates and 2447 * this starts processing the delayed reference count updates and
2367 * extent insertions we have queued up so far. count can be 2448 * extent insertions we have queued up so far. count can be
@@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2398 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), 2479 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2399 CHUNK_ALLOC_NO_FORCE); 2480 CHUNK_ALLOC_NO_FORCE);
2400 2481
2482 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2483
2401 delayed_refs = &trans->transaction->delayed_refs; 2484 delayed_refs = &trans->transaction->delayed_refs;
2402 INIT_LIST_HEAD(&cluster); 2485 INIT_LIST_HEAD(&cluster);
2403again: 2486again:
2404 consider_waiting = 0; 2487 consider_waiting = 0;
2405 spin_lock(&delayed_refs->lock); 2488 spin_lock(&delayed_refs->lock);
2489
2490#ifdef SCRAMBLE_DELAYED_REFS
2491 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2492#endif
2493
2406 if (count == 0) { 2494 if (count == 0) {
2407 count = delayed_refs->num_entries * 2; 2495 count = delayed_refs->num_entries * 2;
2408 run_most = 1; 2496 run_most = 1;
@@ -2437,7 +2525,7 @@ again:
2437 num_refs = delayed_refs->num_entries; 2525 num_refs = delayed_refs->num_entries;
2438 first_seq = root->fs_info->tree_mod_seq_list.next; 2526 first_seq = root->fs_info->tree_mod_seq_list.next;
2439 } else { 2527 } else {
2440 wait_for_more_refs(delayed_refs, 2528 wait_for_more_refs(root->fs_info, delayed_refs,
2441 num_refs, first_seq); 2529 num_refs, first_seq);
2442 /* 2530 /*
2443 * after waiting, things have changed. we 2531 * after waiting, things have changed. we
@@ -2502,6 +2590,7 @@ again:
2502 } 2590 }
2503out: 2591out:
2504 spin_unlock(&delayed_refs->lock); 2592 spin_unlock(&delayed_refs->lock);
2593 assert_qgroups_uptodate(trans);
2505 return 0; 2594 return 0;
2506} 2595}
2507 2596
@@ -2581,8 +2670,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2581 2670
2582 node = rb_prev(node); 2671 node = rb_prev(node);
2583 if (node) { 2672 if (node) {
2673 int seq = ref->seq;
2674
2584 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2675 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2585 if (ref->bytenr == bytenr) 2676 if (ref->bytenr == bytenr && ref->seq == seq)
2586 goto out_unlock; 2677 goto out_unlock;
2587 } 2678 }
2588 2679
@@ -2903,8 +2994,13 @@ again:
2903 } 2994 }
2904 2995
2905 spin_lock(&block_group->lock); 2996 spin_lock(&block_group->lock);
2906 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2997 if (block_group->cached != BTRFS_CACHE_FINISHED ||
2907 /* We're not cached, don't bother trying to write stuff out */ 2998 !btrfs_test_opt(root, SPACE_CACHE)) {
2999 /*
3000 * don't bother trying to write stuff out _if_
3001 * a) we're not cached,
3002 * b) we're with nospace_cache mount option.
3003 */
2908 dcs = BTRFS_DC_WRITTEN; 3004 dcs = BTRFS_DC_WRITTEN;
2909 spin_unlock(&block_group->lock); 3005 spin_unlock(&block_group->lock);
2910 goto out_put; 3006 goto out_put;
@@ -3134,6 +3230,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3134 init_waitqueue_head(&found->wait); 3230 init_waitqueue_head(&found->wait);
3135 *space_info = found; 3231 *space_info = found;
3136 list_add_rcu(&found->list, &info->space_info); 3232 list_add_rcu(&found->list, &info->space_info);
3233 if (flags & BTRFS_BLOCK_GROUP_DATA)
3234 info->data_sinfo = found;
3137 return 0; 3235 return 0;
3138} 3236}
3139 3237
@@ -3263,12 +3361,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3263 return get_alloc_profile(root, flags); 3361 return get_alloc_profile(root, flags);
3264} 3362}
3265 3363
3266void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3267{
3268 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3269 BTRFS_BLOCK_GROUP_DATA);
3270}
3271
3272/* 3364/*
3273 * This will check the space that the inode allocates from to make sure we have 3365 * This will check the space that the inode allocates from to make sure we have
3274 * enough space for bytes. 3366 * enough space for bytes.
@@ -3277,6 +3369,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3277{ 3369{
3278 struct btrfs_space_info *data_sinfo; 3370 struct btrfs_space_info *data_sinfo;
3279 struct btrfs_root *root = BTRFS_I(inode)->root; 3371 struct btrfs_root *root = BTRFS_I(inode)->root;
3372 struct btrfs_fs_info *fs_info = root->fs_info;
3280 u64 used; 3373 u64 used;
3281 int ret = 0, committed = 0, alloc_chunk = 1; 3374 int ret = 0, committed = 0, alloc_chunk = 1;
3282 3375
@@ -3289,7 +3382,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3289 committed = 1; 3382 committed = 1;
3290 } 3383 }
3291 3384
3292 data_sinfo = BTRFS_I(inode)->space_info; 3385 data_sinfo = fs_info->data_sinfo;
3293 if (!data_sinfo) 3386 if (!data_sinfo)
3294 goto alloc; 3387 goto alloc;
3295 3388
@@ -3330,10 +3423,9 @@ alloc:
3330 goto commit_trans; 3423 goto commit_trans;
3331 } 3424 }
3332 3425
3333 if (!data_sinfo) { 3426 if (!data_sinfo)
3334 btrfs_set_inode_space_info(root, inode); 3427 data_sinfo = fs_info->data_sinfo;
3335 data_sinfo = BTRFS_I(inode)->space_info; 3428
3336 }
3337 goto again; 3429 goto again;
3338 } 3430 }
3339 3431
@@ -3380,7 +3472,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3380 /* make sure bytes are sectorsize aligned */ 3472 /* make sure bytes are sectorsize aligned */
3381 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3473 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3382 3474
3383 data_sinfo = BTRFS_I(inode)->space_info; 3475 data_sinfo = root->fs_info->data_sinfo;
3384 spin_lock(&data_sinfo->lock); 3476 spin_lock(&data_sinfo->lock);
3385 data_sinfo->bytes_may_use -= bytes; 3477 data_sinfo->bytes_may_use -= bytes;
3386 trace_btrfs_space_reservation(root->fs_info, "space_info", 3478 trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3586,89 +3678,58 @@ out:
3586/* 3678/*
3587 * shrink metadata reservation for delalloc 3679 * shrink metadata reservation for delalloc
3588 */ 3680 */
3589static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, 3681static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3590 bool wait_ordered) 3682 bool wait_ordered)
3591{ 3683{
3592 struct btrfs_block_rsv *block_rsv; 3684 struct btrfs_block_rsv *block_rsv;
3593 struct btrfs_space_info *space_info; 3685 struct btrfs_space_info *space_info;
3594 struct btrfs_trans_handle *trans; 3686 struct btrfs_trans_handle *trans;
3595 u64 reserved; 3687 u64 delalloc_bytes;
3596 u64 max_reclaim; 3688 u64 max_reclaim;
3597 u64 reclaimed = 0;
3598 long time_left; 3689 long time_left;
3599 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3690 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3600 int loops = 0; 3691 int loops = 0;
3601 unsigned long progress;
3602 3692
3603 trans = (struct btrfs_trans_handle *)current->journal_info; 3693 trans = (struct btrfs_trans_handle *)current->journal_info;
3604 block_rsv = &root->fs_info->delalloc_block_rsv; 3694 block_rsv = &root->fs_info->delalloc_block_rsv;
3605 space_info = block_rsv->space_info; 3695 space_info = block_rsv->space_info;
3606 3696
3607 smp_mb(); 3697 smp_mb();
3608 reserved = space_info->bytes_may_use; 3698 delalloc_bytes = root->fs_info->delalloc_bytes;
3609 progress = space_info->reservation_progress; 3699 if (delalloc_bytes == 0) {
3610
3611 if (reserved == 0)
3612 return 0;
3613
3614 smp_mb();
3615 if (root->fs_info->delalloc_bytes == 0) {
3616 if (trans) 3700 if (trans)
3617 return 0; 3701 return;
3618 btrfs_wait_ordered_extents(root, 0, 0); 3702 btrfs_wait_ordered_extents(root, 0, 0);
3619 return 0; 3703 return;
3620 } 3704 }
3621 3705
3622 max_reclaim = min(reserved, to_reclaim); 3706 while (delalloc_bytes && loops < 3) {
3623 nr_pages = max_t(unsigned long, nr_pages, 3707 max_reclaim = min(delalloc_bytes, to_reclaim);
3624 max_reclaim >> PAGE_CACHE_SHIFT); 3708 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3625 while (loops < 1024) {
3626 /* have the flusher threads jump in and do some IO */
3627 smp_mb();
3628 nr_pages = min_t(unsigned long, nr_pages,
3629 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3630 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3709 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3631 WB_REASON_FS_FREE_SPACE); 3710 WB_REASON_FS_FREE_SPACE);
3632 3711
3633 spin_lock(&space_info->lock); 3712 spin_lock(&space_info->lock);
3634 if (reserved > space_info->bytes_may_use) 3713 if (space_info->bytes_used + space_info->bytes_reserved +
3635 reclaimed += reserved - space_info->bytes_may_use; 3714 space_info->bytes_pinned + space_info->bytes_readonly +
3636 reserved = space_info->bytes_may_use; 3715 space_info->bytes_may_use + orig <=
3716 space_info->total_bytes) {
3717 spin_unlock(&space_info->lock);
3718 break;
3719 }
3637 spin_unlock(&space_info->lock); 3720 spin_unlock(&space_info->lock);
3638 3721
3639 loops++; 3722 loops++;
3640
3641 if (reserved == 0 || reclaimed >= max_reclaim)
3642 break;
3643
3644 if (trans && trans->transaction->blocked)
3645 return -EAGAIN;
3646
3647 if (wait_ordered && !trans) { 3723 if (wait_ordered && !trans) {
3648 btrfs_wait_ordered_extents(root, 0, 0); 3724 btrfs_wait_ordered_extents(root, 0, 0);
3649 } else { 3725 } else {
3650 time_left = schedule_timeout_interruptible(1); 3726 time_left = schedule_timeout_killable(1);
3651
3652 /* We were interrupted, exit */
3653 if (time_left) 3727 if (time_left)
3654 break; 3728 break;
3655 } 3729 }
3656 3730 smp_mb();
3657 /* we've kicked the IO a few times, if anything has been freed, 3731 delalloc_bytes = root->fs_info->delalloc_bytes;
3658 * exit. There is no sense in looping here for a long time
3659 * when we really need to commit the transaction, or there are
3660 * just too many writers without enough free space
3661 */
3662
3663 if (loops > 3) {
3664 smp_mb();
3665 if (progress != space_info->reservation_progress)
3666 break;
3667 }
3668
3669 } 3732 }
3670
3671 return reclaimed >= to_reclaim;
3672} 3733}
3673 3734
3674/** 3735/**
@@ -3728,6 +3789,58 @@ commit:
3728 return btrfs_commit_transaction(trans, root); 3789 return btrfs_commit_transaction(trans, root);
3729} 3790}
3730 3791
3792enum flush_state {
3793 FLUSH_DELALLOC = 1,
3794 FLUSH_DELALLOC_WAIT = 2,
3795 FLUSH_DELAYED_ITEMS_NR = 3,
3796 FLUSH_DELAYED_ITEMS = 4,
3797 COMMIT_TRANS = 5,
3798};
3799
3800static int flush_space(struct btrfs_root *root,
3801 struct btrfs_space_info *space_info, u64 num_bytes,
3802 u64 orig_bytes, int state)
3803{
3804 struct btrfs_trans_handle *trans;
3805 int nr;
3806 int ret = 0;
3807
3808 switch (state) {
3809 case FLUSH_DELALLOC:
3810 case FLUSH_DELALLOC_WAIT:
3811 shrink_delalloc(root, num_bytes, orig_bytes,
3812 state == FLUSH_DELALLOC_WAIT);
3813 break;
3814 case FLUSH_DELAYED_ITEMS_NR:
3815 case FLUSH_DELAYED_ITEMS:
3816 if (state == FLUSH_DELAYED_ITEMS_NR) {
3817 u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3818
3819 nr = (int)div64_u64(num_bytes, bytes);
3820 if (!nr)
3821 nr = 1;
3822 nr *= 2;
3823 } else {
3824 nr = -1;
3825 }
3826 trans = btrfs_join_transaction(root);
3827 if (IS_ERR(trans)) {
3828 ret = PTR_ERR(trans);
3829 break;
3830 }
3831 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3832 btrfs_end_transaction(trans, root);
3833 break;
3834 case COMMIT_TRANS:
3835 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3836 break;
3837 default:
3838 ret = -ENOSPC;
3839 break;
3840 }
3841
3842 return ret;
3843}
3731/** 3844/**
3732 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 3845 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3733 * @root - the root we're allocating for 3846 * @root - the root we're allocating for
@@ -3749,11 +3862,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3749 struct btrfs_space_info *space_info = block_rsv->space_info; 3862 struct btrfs_space_info *space_info = block_rsv->space_info;
3750 u64 used; 3863 u64 used;
3751 u64 num_bytes = orig_bytes; 3864 u64 num_bytes = orig_bytes;
3752 int retries = 0; 3865 int flush_state = FLUSH_DELALLOC;
3753 int ret = 0; 3866 int ret = 0;
3754 bool committed = false;
3755 bool flushing = false; 3867 bool flushing = false;
3756 bool wait_ordered = false; 3868 bool committed = false;
3757 3869
3758again: 3870again:
3759 ret = 0; 3871 ret = 0;
@@ -3812,9 +3924,8 @@ again:
3812 * amount plus the amount of bytes that we need for this 3924 * amount plus the amount of bytes that we need for this
3813 * reservation. 3925 * reservation.
3814 */ 3926 */
3815 wait_ordered = true;
3816 num_bytes = used - space_info->total_bytes + 3927 num_bytes = used - space_info->total_bytes +
3817 (orig_bytes * (retries + 1)); 3928 (orig_bytes * 2);
3818 } 3929 }
3819 3930
3820 if (ret) { 3931 if (ret) {
@@ -3867,8 +3978,6 @@ again:
3867 trace_btrfs_space_reservation(root->fs_info, 3978 trace_btrfs_space_reservation(root->fs_info,
3868 "space_info", space_info->flags, orig_bytes, 1); 3979 "space_info", space_info->flags, orig_bytes, 1);
3869 ret = 0; 3980 ret = 0;
3870 } else {
3871 wait_ordered = true;
3872 } 3981 }
3873 } 3982 }
3874 3983
@@ -3887,36 +3996,13 @@ again:
3887 if (!ret || !flush) 3996 if (!ret || !flush)
3888 goto out; 3997 goto out;
3889 3998
3890 /* 3999 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3891 * We do synchronous shrinking since we don't actually unreserve 4000 flush_state);
3892 * metadata until after the IO is completed. 4001 flush_state++;
3893 */ 4002 if (!ret)
3894 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3895 if (ret < 0)
3896 goto out;
3897
3898 ret = 0;
3899
3900 /*
3901 * So if we were overcommitted it's possible that somebody else flushed
3902 * out enough space and we simply didn't have enough space to reclaim,
3903 * so go back around and try again.
3904 */
3905 if (retries < 2) {
3906 wait_ordered = true;
3907 retries++;
3908 goto again; 4003 goto again;
3909 } 4004 else if (flush_state <= COMMIT_TRANS)
3910
3911 ret = -ENOSPC;
3912 if (committed)
3913 goto out;
3914
3915 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3916 if (!ret) {
3917 committed = true;
3918 goto again; 4005 goto again;
3919 }
3920 4006
3921out: 4007out:
3922 if (flushing) { 4008 if (flushing) {
@@ -3934,7 +4020,10 @@ static struct btrfs_block_rsv *get_block_rsv(
3934{ 4020{
3935 struct btrfs_block_rsv *block_rsv = NULL; 4021 struct btrfs_block_rsv *block_rsv = NULL;
3936 4022
3937 if (root->ref_cows || root == root->fs_info->csum_root) 4023 if (root->ref_cows)
4024 block_rsv = trans->block_rsv;
4025
4026 if (root == root->fs_info->csum_root && trans->adding_csums)
3938 block_rsv = trans->block_rsv; 4027 block_rsv = trans->block_rsv;
3939 4028
3940 if (!block_rsv) 4029 if (!block_rsv)
@@ -4286,6 +4375,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4286void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4375void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4287 struct btrfs_root *root) 4376 struct btrfs_root *root)
4288{ 4377{
4378 if (!trans->block_rsv)
4379 return;
4380
4289 if (!trans->bytes_reserved) 4381 if (!trans->bytes_reserved)
4290 return; 4382 return;
4291 4383
@@ -4444,7 +4536,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4444 int ret; 4536 int ret;
4445 4537
4446 /* Need to be holding the i_mutex here if we aren't free space cache */ 4538 /* Need to be holding the i_mutex here if we aren't free space cache */
4447 if (btrfs_is_free_space_inode(root, inode)) 4539 if (btrfs_is_free_space_inode(inode))
4448 flush = 0; 4540 flush = 0;
4449 4541
4450 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4542 if (flush && btrfs_transaction_in_commit(root->fs_info))
@@ -4476,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4476 csum_bytes = BTRFS_I(inode)->csum_bytes; 4568 csum_bytes = BTRFS_I(inode)->csum_bytes;
4477 spin_unlock(&BTRFS_I(inode)->lock); 4569 spin_unlock(&BTRFS_I(inode)->lock);
4478 4570
4571 if (root->fs_info->quota_enabled) {
4572 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize);
4574 if (ret)
4575 return ret;
4576 }
4577
4479 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4578 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4480 if (ret) { 4579 if (ret) {
4481 u64 to_free = 0; 4580 u64 to_free = 0;
@@ -4554,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4554 4653
4555 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4654 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4556 btrfs_ino(inode), to_free, 0); 4655 btrfs_ino(inode), to_free, 0);
4656 if (root->fs_info->quota_enabled) {
4657 btrfs_qgroup_free(root, num_bytes +
4658 dropped * root->leafsize);
4659 }
4660
4557 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4661 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4558 to_free); 4662 to_free);
4559} 4663}
@@ -5190,8 +5294,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5190 rb_erase(&head->node.rb_node, &delayed_refs->root); 5294 rb_erase(&head->node.rb_node, &delayed_refs->root);
5191 5295
5192 delayed_refs->num_entries--; 5296 delayed_refs->num_entries--;
5193 if (waitqueue_active(&delayed_refs->seq_wait)) 5297 smp_mb();
5194 wake_up(&delayed_refs->seq_wait); 5298 if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
5299 wake_up(&root->fs_info->tree_mod_seq_wait);
5195 5300
5196 /* 5301 /*
5197 * we don't take a ref on the node because we're removing it from the 5302 * we don't take a ref on the node because we're removing it from the
@@ -5748,7 +5853,11 @@ loop:
5748 ret = do_chunk_alloc(trans, root, num_bytes + 5853 ret = do_chunk_alloc(trans, root, num_bytes +
5749 2 * 1024 * 1024, data, 5854 2 * 1024 * 1024, data,
5750 CHUNK_ALLOC_LIMITED); 5855 CHUNK_ALLOC_LIMITED);
5751 if (ret < 0) { 5856 /*
5857 * Do not bail out on ENOSPC since we
5858 * can do more things.
5859 */
5860 if (ret < 0 && ret != -ENOSPC) {
5752 btrfs_abort_transaction(trans, 5861 btrfs_abort_transaction(trans,
5753 root, ret); 5862 root, ret);
5754 goto out; 5863 goto out;
@@ -5816,13 +5925,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5816again: 5925again:
5817 list_for_each_entry(cache, &info->block_groups[index], list) { 5926 list_for_each_entry(cache, &info->block_groups[index], list) {
5818 spin_lock(&cache->lock); 5927 spin_lock(&cache->lock);
5819 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 5928 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5820 "%llu pinned %llu reserved\n",
5821 (unsigned long long)cache->key.objectid, 5929 (unsigned long long)cache->key.objectid,
5822 (unsigned long long)cache->key.offset, 5930 (unsigned long long)cache->key.offset,
5823 (unsigned long long)btrfs_block_group_used(&cache->item), 5931 (unsigned long long)btrfs_block_group_used(&cache->item),
5824 (unsigned long long)cache->pinned, 5932 (unsigned long long)cache->pinned,
5825 (unsigned long long)cache->reserved); 5933 (unsigned long long)cache->reserved,
5934 cache->ro ? "[readonly]" : "");
5826 btrfs_dump_free_space(cache, bytes); 5935 btrfs_dump_free_space(cache, bytes);
5827 spin_unlock(&cache->lock); 5936 spin_unlock(&cache->lock);
5828 } 5937 }
@@ -7610,8 +7719,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7610 INIT_LIST_HEAD(&cache->list); 7719 INIT_LIST_HEAD(&cache->list);
7611 INIT_LIST_HEAD(&cache->cluster_list); 7720 INIT_LIST_HEAD(&cache->cluster_list);
7612 7721
7613 if (need_clear) 7722 if (need_clear) {
7723 /*
7724 * When we mount with old space cache, we need to
7725 * set BTRFS_DC_CLEAR and set dirty flag.
7726 *
7727 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7728 * truncate the old free space cache inode and
7729 * setup a new one.
7730 * b) Setting 'dirty flag' makes sure that we flush
7731 * the new space cache info onto disk.
7732 */
7614 cache->disk_cache_state = BTRFS_DC_CLEAR; 7733 cache->disk_cache_state = BTRFS_DC_CLEAR;
7734 if (btrfs_test_opt(root, SPACE_CACHE))
7735 cache->dirty = 1;
7736 }
7615 7737
7616 read_extent_buffer(leaf, &cache->item, 7738 read_extent_buffer(leaf, &cache->item,
7617 btrfs_item_ptr_offset(leaf, path->slots[0]), 7739 btrfs_item_ptr_offset(leaf, path->slots[0]),
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 01c21b6c6d43..45c81bb4ac82 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -929,7 +929,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
929 929
930 930
931/** 931/**
932 * convert_extent - convert all bits in a given range from one bit to another 932 * convert_extent_bit - convert all bits in a given range from one bit to
933 * another
933 * @tree: the io tree to search 934 * @tree: the io tree to search
934 * @start: the start offset in bytes 935 * @start: the start offset in bytes
935 * @end: the end offset in bytes (inclusive) 936 * @end: the end offset in bytes (inclusive)
@@ -1918,7 +1919,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1918 return -EIO; 1919 return -EIO;
1919 } 1920 }
1920 1921
1921 printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 1922 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
1922 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 1923 "(dev %s sector %llu)\n", page->mapping->host->i_ino,
1923 start, rcu_str_deref(dev->name), sector); 1924 start, rcu_str_deref(dev->name), sector);
1924 1925
@@ -3077,8 +3078,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3077 } 3078 }
3078 } 3079 }
3079 3080
3081 /*
3082 * We need to do this to prevent races in people who check if the eb is
3083 * under IO since we can end up having no IO bits set for a short period
3084 * of time.
3085 */
3086 spin_lock(&eb->refs_lock);
3080 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3087 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3081 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3088 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3089 spin_unlock(&eb->refs_lock);
3082 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3090 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3083 spin_lock(&fs_info->delalloc_lock); 3091 spin_lock(&fs_info->delalloc_lock);
3084 if (fs_info->dirty_metadata_bytes >= eb->len) 3092 if (fs_info->dirty_metadata_bytes >= eb->len)
@@ -3087,6 +3095,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3087 WARN_ON(1); 3095 WARN_ON(1);
3088 spin_unlock(&fs_info->delalloc_lock); 3096 spin_unlock(&fs_info->delalloc_lock);
3089 ret = 1; 3097 ret = 1;
3098 } else {
3099 spin_unlock(&eb->refs_lock);
3090 } 3100 }
3091 3101
3092 btrfs_tree_unlock(eb); 3102 btrfs_tree_unlock(eb);
@@ -3557,19 +3567,38 @@ int extent_readpages(struct extent_io_tree *tree,
3557 struct bio *bio = NULL; 3567 struct bio *bio = NULL;
3558 unsigned page_idx; 3568 unsigned page_idx;
3559 unsigned long bio_flags = 0; 3569 unsigned long bio_flags = 0;
3570 struct page *pagepool[16];
3571 struct page *page;
3572 int i = 0;
3573 int nr = 0;
3560 3574
3561 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3575 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3562 struct page *page = list_entry(pages->prev, struct page, lru); 3576 page = list_entry(pages->prev, struct page, lru);
3563 3577
3564 prefetchw(&page->flags); 3578 prefetchw(&page->flags);
3565 list_del(&page->lru); 3579 list_del(&page->lru);
3566 if (!add_to_page_cache_lru(page, mapping, 3580 if (add_to_page_cache_lru(page, mapping,
3567 page->index, GFP_NOFS)) { 3581 page->index, GFP_NOFS)) {
3568 __extent_read_full_page(tree, page, get_extent, 3582 page_cache_release(page);
3569 &bio, 0, &bio_flags); 3583 continue;
3570 } 3584 }
3571 page_cache_release(page); 3585
3586 pagepool[nr++] = page;
3587 if (nr < ARRAY_SIZE(pagepool))
3588 continue;
3589 for (i = 0; i < nr; i++) {
3590 __extent_read_full_page(tree, pagepool[i], get_extent,
3591 &bio, 0, &bio_flags);
3592 page_cache_release(pagepool[i]);
3593 }
3594 nr = 0;
3595 }
3596 for (i = 0; i < nr; i++) {
3597 __extent_read_full_page(tree, pagepool[i], get_extent,
3598 &bio, 0, &bio_flags);
3599 page_cache_release(pagepool[i]);
3572 } 3600 }
3601
3573 BUG_ON(!list_empty(pages)); 3602 BUG_ON(!list_empty(pages));
3574 if (bio) 3603 if (bio)
3575 return submit_one_bio(READ, bio, 0, bio_flags); 3604 return submit_one_bio(READ, bio, 0, bio_flags);
@@ -4123,11 +4152,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4123 * So bump the ref count first, then set the bit. If someone 4152 * So bump the ref count first, then set the bit. If someone
4124 * beat us to it, drop the ref we added. 4153 * beat us to it, drop the ref we added.
4125 */ 4154 */
4126 if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4155 spin_lock(&eb->refs_lock);
4156 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4127 atomic_inc(&eb->refs); 4157 atomic_inc(&eb->refs);
4128 if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4158 spin_unlock(&eb->refs_lock);
4129 atomic_dec(&eb->refs);
4130 }
4131} 4159}
4132 4160
4133static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4161static void mark_extent_buffer_accessed(struct extent_buffer *eb)
@@ -4239,9 +4267,7 @@ again:
4239 goto free_eb; 4267 goto free_eb;
4240 } 4268 }
4241 /* add one reference for the tree */ 4269 /* add one reference for the tree */
4242 spin_lock(&eb->refs_lock);
4243 check_buffer_tree_ref(eb); 4270 check_buffer_tree_ref(eb);
4244 spin_unlock(&eb->refs_lock);
4245 spin_unlock(&tree->buffer_lock); 4271 spin_unlock(&tree->buffer_lock);
4246 radix_tree_preload_end(); 4272 radix_tree_preload_end();
4247 4273
@@ -4300,7 +4326,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4300} 4326}
4301 4327
4302/* Expects to have eb->eb_lock already held */ 4328/* Expects to have eb->eb_lock already held */
4303static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) 4329static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4304{ 4330{
4305 WARN_ON(atomic_read(&eb->refs) == 0); 4331 WARN_ON(atomic_read(&eb->refs) == 0);
4306 if (atomic_dec_and_test(&eb->refs)) { 4332 if (atomic_dec_and_test(&eb->refs)) {
@@ -4321,9 +4347,11 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4321 btrfs_release_extent_buffer_page(eb, 0); 4347 btrfs_release_extent_buffer_page(eb, 0);
4322 4348
4323 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4349 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4324 return; 4350 return 1;
4325 } 4351 }
4326 spin_unlock(&eb->refs_lock); 4352 spin_unlock(&eb->refs_lock);
4353
4354 return 0;
4327} 4355}
4328 4356
4329void free_extent_buffer(struct extent_buffer *eb) 4357void free_extent_buffer(struct extent_buffer *eb)
@@ -4962,7 +4990,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
4962 spin_unlock(&eb->refs_lock); 4990 spin_unlock(&eb->refs_lock);
4963 return 0; 4991 return 0;
4964 } 4992 }
4965 release_extent_buffer(eb, mask);
4966 4993
4967 return 1; 4994 return release_extent_buffer(eb, mask);
4968} 4995}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5d158d320233..b45b9de0c21d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -183,7 +183,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
183 * read from the commit root and sidestep a nasty deadlock 183 * read from the commit root and sidestep a nasty deadlock
184 * between reading the free space cache and updating the csum tree. 184 * between reading the free space cache and updating the csum tree.
185 */ 185 */
186 if (btrfs_is_free_space_inode(root, inode)) { 186 if (btrfs_is_free_space_inode(inode)) {
187 path->search_commit_root = 1; 187 path->search_commit_root = 1;
188 path->skip_locking = 1; 188 path->skip_locking = 1;
189 } 189 }
@@ -690,6 +690,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
690 return -ENOMEM; 690 return -ENOMEM;
691 691
692 sector_sum = sums->sums; 692 sector_sum = sums->sums;
693 trans->adding_csums = 1;
693again: 694again:
694 next_offset = (u64)-1; 695 next_offset = (u64)-1;
695 found_next = 0; 696 found_next = 0;
@@ -853,6 +854,7 @@ next_sector:
853 goto again; 854 goto again;
854 } 855 }
855out: 856out:
857 trans->adding_csums = 0;
856 btrfs_free_path(path); 858 btrfs_free_path(path);
857 return ret; 859 return ret;
858 860
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9aa01ec2138d..5caf285c6e4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1379 ssize_t err = 0; 1379 ssize_t err = 0;
1380 size_t count, ocount; 1380 size_t count, ocount;
1381 1381
1382 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1382 sb_start_write(inode->i_sb);
1383 1383
1384 mutex_lock(&inode->i_mutex); 1384 mutex_lock(&inode->i_mutex);
1385 1385
@@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1469 num_written = err; 1469 num_written = err;
1470 } 1470 }
1471out: 1471out:
1472 sb_end_write(inode->i_sb);
1472 current->backing_dev_info = NULL; 1473 current->backing_dev_info = NULL;
1473 return num_written ? num_written : err; 1474 return num_written ? num_written : err;
1474} 1475}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6c4e2baa9290..6b10acfc2f5c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1968,7 +1968,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
1968 1968
1969 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 1969 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
1970 info = rb_entry(n, struct btrfs_free_space, offset_index); 1970 info = rb_entry(n, struct btrfs_free_space, offset_index);
1971 if (info->bytes >= bytes) 1971 if (info->bytes >= bytes && !block_group->ro)
1972 count++; 1972 count++;
1973 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", 1973 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
1974 (unsigned long long)info->offset, 1974 (unsigned long long)info->offset,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a7d1921ac76b..6e8f416773d4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -324,7 +324,8 @@ static noinline int add_async_extent(struct async_cow *cow,
324 * If this code finds it can't get good compression, it puts an 324 * If this code finds it can't get good compression, it puts an
325 * entry onto the work queue to write the uncompressed bytes. This 325 * entry onto the work queue to write the uncompressed bytes. This
326 * makes sure that both compressed inodes and uncompressed inodes 326 * makes sure that both compressed inodes and uncompressed inodes
327 * are written in the same order that pdflush sent them down. 327 * are written in the same order that the flusher thread sent them
328 * down.
328 */ 329 */
329static noinline int compress_file_range(struct inode *inode, 330static noinline int compress_file_range(struct inode *inode,
330 struct page *locked_page, 331 struct page *locked_page,
@@ -825,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode,
825 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 826 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
826 int ret = 0; 827 int ret = 0;
827 828
828 BUG_ON(btrfs_is_free_space_inode(root, inode)); 829 BUG_ON(btrfs_is_free_space_inode(inode));
829 trans = btrfs_join_transaction(root); 830 trans = btrfs_join_transaction(root);
830 if (IS_ERR(trans)) { 831 if (IS_ERR(trans)) {
831 extent_clear_unlock_delalloc(inode, 832 extent_clear_unlock_delalloc(inode,
@@ -1010,7 +1011,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); 1011 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
1011 1012
1012 if (atomic_read(&root->fs_info->async_delalloc_pages) < 1013 if (atomic_read(&root->fs_info->async_delalloc_pages) <
1013 5 * 1042 * 1024 && 1014 5 * 1024 * 1024 &&
1014 waitqueue_active(&root->fs_info->async_submit_wait)) 1015 waitqueue_active(&root->fs_info->async_submit_wait))
1015 wake_up(&root->fs_info->async_submit_wait); 1016 wake_up(&root->fs_info->async_submit_wait);
1016 1017
@@ -1035,7 +1036,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1035 struct btrfs_root *root = BTRFS_I(inode)->root; 1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1036 unsigned long nr_pages; 1037 unsigned long nr_pages;
1037 u64 cur_end; 1038 u64 cur_end;
1038 int limit = 10 * 1024 * 1042; 1039 int limit = 10 * 1024 * 1024;
1039 1040
1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1041 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1041 1, 0, NULL, GFP_NOFS); 1042 1, 0, NULL, GFP_NOFS);
@@ -1153,7 +1154,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1153 return -ENOMEM; 1154 return -ENOMEM;
1154 } 1155 }
1155 1156
1156 nolock = btrfs_is_free_space_inode(root, inode); 1157 nolock = btrfs_is_free_space_inode(inode);
1157 1158
1158 if (nolock) 1159 if (nolock)
1159 trans = btrfs_join_transaction_nolock(root); 1160 trans = btrfs_join_transaction_nolock(root);
@@ -1466,7 +1467,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
1466 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1467 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1467 struct btrfs_root *root = BTRFS_I(inode)->root; 1468 struct btrfs_root *root = BTRFS_I(inode)->root;
1468 u64 len = state->end + 1 - state->start; 1469 u64 len = state->end + 1 - state->start;
1469 bool do_list = !btrfs_is_free_space_inode(root, inode); 1470 bool do_list = !btrfs_is_free_space_inode(inode);
1470 1471
1471 if (*bits & EXTENT_FIRST_DELALLOC) { 1472 if (*bits & EXTENT_FIRST_DELALLOC) {
1472 *bits &= ~EXTENT_FIRST_DELALLOC; 1473 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1501,7 +1502,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1501 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1502 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1502 struct btrfs_root *root = BTRFS_I(inode)->root; 1503 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 u64 len = state->end + 1 - state->start; 1504 u64 len = state->end + 1 - state->start;
1504 bool do_list = !btrfs_is_free_space_inode(root, inode); 1505 bool do_list = !btrfs_is_free_space_inode(inode);
1505 1506
1506 if (*bits & EXTENT_FIRST_DELALLOC) { 1507 if (*bits & EXTENT_FIRST_DELALLOC) {
1507 *bits &= ~EXTENT_FIRST_DELALLOC; 1508 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1612,7 +1613,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1612 1613
1613 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1614 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1614 1615
1615 if (btrfs_is_free_space_inode(root, inode)) 1616 if (btrfs_is_free_space_inode(inode))
1616 metadata = 2; 1617 metadata = 2;
1617 1618
1618 if (!(rw & REQ_WRITE)) { 1619 if (!(rw & REQ_WRITE)) {
@@ -1869,7 +1870,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1869 int ret; 1870 int ret;
1870 bool nolock; 1871 bool nolock;
1871 1872
1872 nolock = btrfs_is_free_space_inode(root, inode); 1873 nolock = btrfs_is_free_space_inode(inode);
1873 1874
1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1875 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1875 ret = -EIO; 1876 ret = -EIO;
@@ -2007,7 +2008,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2007 ordered_extent->work.func = finish_ordered_fn; 2008 ordered_extent->work.func = finish_ordered_fn;
2008 ordered_extent->work.flags = 0; 2009 ordered_extent->work.flags = 0;
2009 2010
2010 if (btrfs_is_free_space_inode(root, inode)) 2011 if (btrfs_is_free_space_inode(inode))
2011 workers = &root->fs_info->endio_freespace_worker; 2012 workers = &root->fs_info->endio_freespace_worker;
2012 else 2013 else
2013 workers = &root->fs_info->endio_write_workers; 2014 workers = &root->fs_info->endio_write_workers;
@@ -2732,8 +2733,10 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2732 * The data relocation inode should also be directly updated 2733 * The data relocation inode should also be directly updated
2733 * without delay 2734 * without delay
2734 */ 2735 */
2735 if (!btrfs_is_free_space_inode(root, inode) 2736 if (!btrfs_is_free_space_inode(inode)
2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2737 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2738 btrfs_update_root_times(trans, root);
2739
2737 ret = btrfs_delayed_update_inode(trans, root, inode); 2740 ret = btrfs_delayed_update_inode(trans, root, inode);
2738 if (!ret) 2741 if (!ret)
2739 btrfs_set_inode_last_trans(trans, inode); 2742 btrfs_set_inode_last_trans(trans, inode);
@@ -2833,7 +2836,7 @@ err:
2833 inode_inc_iversion(inode); 2836 inode_inc_iversion(inode);
2834 inode_inc_iversion(dir); 2837 inode_inc_iversion(dir);
2835 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2838 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2836 btrfs_update_inode(trans, root, dir); 2839 ret = btrfs_update_inode(trans, root, dir);
2837out: 2840out:
2838 return ret; 2841 return ret;
2839} 2842}
@@ -3743,7 +3746,7 @@ void btrfs_evict_inode(struct inode *inode)
3743 3746
3744 truncate_inode_pages(&inode->i_data, 0); 3747 truncate_inode_pages(&inode->i_data, 0);
3745 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3748 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3746 btrfs_is_free_space_inode(root, inode))) 3749 btrfs_is_free_space_inode(inode)))
3747 goto no_delete; 3750 goto no_delete;
3748 3751
3749 if (is_bad_inode(inode)) { 3752 if (is_bad_inode(inode)) {
@@ -4082,7 +4085,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
4082 struct btrfs_iget_args *args = p; 4085 struct btrfs_iget_args *args = p;
4083 inode->i_ino = args->ino; 4086 inode->i_ino = args->ino;
4084 BTRFS_I(inode)->root = args->root; 4087 BTRFS_I(inode)->root = args->root;
4085 btrfs_set_inode_space_info(args->root, inode);
4086 return 0; 4088 return 0;
4087} 4089}
4088 4090
@@ -4247,7 +4249,7 @@ static void btrfs_dentry_release(struct dentry *dentry)
4247} 4249}
4248 4250
4249static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4251static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4250 struct nameidata *nd) 4252 unsigned int flags)
4251{ 4253{
4252 struct dentry *ret; 4254 struct dentry *ret;
4253 4255
@@ -4457,7 +4459,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4457 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4459 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4458 return 0; 4460 return 0;
4459 4461
4460 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4462 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
4461 nolock = true; 4463 nolock = true;
4462 4464
4463 if (wbc->sync_mode == WB_SYNC_ALL) { 4465 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4518,6 +4520,11 @@ int btrfs_dirty_inode(struct inode *inode)
4518static int btrfs_update_time(struct inode *inode, struct timespec *now, 4520static int btrfs_update_time(struct inode *inode, struct timespec *now,
4519 int flags) 4521 int flags)
4520{ 4522{
4523 struct btrfs_root *root = BTRFS_I(inode)->root;
4524
4525 if (btrfs_root_readonly(root))
4526 return -EROFS;
4527
4521 if (flags & S_VERSION) 4528 if (flags & S_VERSION)
4522 inode_inc_iversion(inode); 4529 inode_inc_iversion(inode);
4523 if (flags & S_CTIME) 4530 if (flags & S_CTIME)
@@ -4662,7 +4669,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4662 BTRFS_I(inode)->root = root; 4669 BTRFS_I(inode)->root = root;
4663 BTRFS_I(inode)->generation = trans->transid; 4670 BTRFS_I(inode)->generation = trans->transid;
4664 inode->i_generation = BTRFS_I(inode)->generation; 4671 inode->i_generation = BTRFS_I(inode)->generation;
4665 btrfs_set_inode_space_info(root, inode);
4666 4672
4667 if (S_ISDIR(mode)) 4673 if (S_ISDIR(mode))
4668 owner = 0; 4674 owner = 0;
@@ -4690,6 +4696,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4690 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4696 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4691 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4697 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4692 struct btrfs_inode_item); 4698 struct btrfs_inode_item);
4699 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
4700 sizeof(*inode_item));
4693 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4701 fill_inode_item(trans, path->nodes[0], inode_item, inode);
4694 4702
4695 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4703 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
@@ -4723,6 +4731,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4723 trace_btrfs_inode_new(inode); 4731 trace_btrfs_inode_new(inode);
4724 btrfs_set_inode_last_trans(trans, inode); 4732 btrfs_set_inode_last_trans(trans, inode);
4725 4733
4734 btrfs_update_root_times(trans, root);
4735
4726 return inode; 4736 return inode;
4727fail: 4737fail:
4728 if (dir) 4738 if (dir)
@@ -4893,7 +4903,7 @@ out_unlock:
4893} 4903}
4894 4904
4895static int btrfs_create(struct inode *dir, struct dentry *dentry, 4905static int btrfs_create(struct inode *dir, struct dentry *dentry,
4896 umode_t mode, struct nameidata *nd) 4906 umode_t mode, bool excl)
4897{ 4907{
4898 struct btrfs_trans_handle *trans; 4908 struct btrfs_trans_handle *trans;
4899 struct btrfs_root *root = BTRFS_I(dir)->root; 4909 struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -6620,6 +6630,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6620 u64 page_start; 6630 u64 page_start;
6621 u64 page_end; 6631 u64 page_end;
6622 6632
6633 sb_start_pagefault(inode->i_sb);
6623 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6634 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6624 if (!ret) { 6635 if (!ret) {
6625 ret = file_update_time(vma->vm_file); 6636 ret = file_update_time(vma->vm_file);
@@ -6709,12 +6720,15 @@ again:
6709 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6720 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6710 6721
6711out_unlock: 6722out_unlock:
6712 if (!ret) 6723 if (!ret) {
6724 sb_end_pagefault(inode->i_sb);
6713 return VM_FAULT_LOCKED; 6725 return VM_FAULT_LOCKED;
6726 }
6714 unlock_page(page); 6727 unlock_page(page);
6715out: 6728out:
6716 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6729 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6717out_noreserve: 6730out_noreserve:
6731 sb_end_pagefault(inode->i_sb);
6718 return ret; 6732 return ret;
6719} 6733}
6720 6734
@@ -6939,7 +6953,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6939 return NULL; 6953 return NULL;
6940 6954
6941 ei->root = NULL; 6955 ei->root = NULL;
6942 ei->space_info = NULL;
6943 ei->generation = 0; 6956 ei->generation = 0;
6944 ei->last_trans = 0; 6957 ei->last_trans = 0;
6945 ei->last_sub_trans = 0; 6958 ei->last_sub_trans = 0;
@@ -6987,7 +7000,7 @@ void btrfs_destroy_inode(struct inode *inode)
6987 struct btrfs_ordered_extent *ordered; 7000 struct btrfs_ordered_extent *ordered;
6988 struct btrfs_root *root = BTRFS_I(inode)->root; 7001 struct btrfs_root *root = BTRFS_I(inode)->root;
6989 7002
6990 WARN_ON(!list_empty(&inode->i_dentry)); 7003 WARN_ON(!hlist_empty(&inode->i_dentry));
6991 WARN_ON(inode->i_data.nrpages); 7004 WARN_ON(inode->i_data.nrpages);
6992 WARN_ON(BTRFS_I(inode)->outstanding_extents); 7005 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6993 WARN_ON(BTRFS_I(inode)->reserved_extents); 7006 WARN_ON(BTRFS_I(inode)->reserved_extents);
@@ -7046,7 +7059,7 @@ int btrfs_drop_inode(struct inode *inode)
7046 struct btrfs_root *root = BTRFS_I(inode)->root; 7059 struct btrfs_root *root = BTRFS_I(inode)->root;
7047 7060
7048 if (btrfs_root_refs(&root->root_item) == 0 && 7061 if (btrfs_root_refs(&root->root_item) == 0 &&
7049 !btrfs_is_free_space_inode(root, inode)) 7062 !btrfs_is_free_space_inode(inode))
7050 return 1; 7063 return 1;
7051 else 7064 else
7052 return generic_drop_inode(inode); 7065 return generic_drop_inode(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0e92e5763005..7bb755677a22 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -41,6 +41,7 @@
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h>
44#include "compat.h" 45#include "compat.h"
45#include "ctree.h" 46#include "ctree.h"
46#include "disk-io.h" 47#include "disk-io.h"
@@ -53,6 +54,7 @@
53#include "inode-map.h" 54#include "inode-map.h"
54#include "backref.h" 55#include "backref.h"
55#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h"
56 58
57/* Mask out flags that are inappropriate for the given type of inode. */ 59/* Mask out flags that are inappropriate for the given type of inode. */
58static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -193,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
193 if (!inode_owner_or_capable(inode)) 195 if (!inode_owner_or_capable(inode))
194 return -EACCES; 196 return -EACCES;
195 197
198 ret = mnt_want_write_file(file);
199 if (ret)
200 return ret;
201
196 mutex_lock(&inode->i_mutex); 202 mutex_lock(&inode->i_mutex);
197 203
198 ip_oldflags = ip->flags; 204 ip_oldflags = ip->flags;
@@ -207,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
207 } 213 }
208 } 214 }
209 215
210 ret = mnt_want_write_file(file);
211 if (ret)
212 goto out_unlock;
213
214 if (flags & FS_SYNC_FL) 216 if (flags & FS_SYNC_FL)
215 ip->flags |= BTRFS_INODE_SYNC; 217 ip->flags |= BTRFS_INODE_SYNC;
216 else 218 else
@@ -273,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
273 inode->i_flags = i_oldflags; 275 inode->i_flags = i_oldflags;
274 } 276 }
275 277
276 mnt_drop_write_file(file);
277 out_unlock: 278 out_unlock:
278 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
280 mnt_drop_write_file(file);
279 return ret; 281 return ret;
280} 282}
281 283
@@ -336,7 +338,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
336static noinline int create_subvol(struct btrfs_root *root, 338static noinline int create_subvol(struct btrfs_root *root,
337 struct dentry *dentry, 339 struct dentry *dentry,
338 char *name, int namelen, 340 char *name, int namelen,
339 u64 *async_transid) 341 u64 *async_transid,
342 struct btrfs_qgroup_inherit **inherit)
340{ 343{
341 struct btrfs_trans_handle *trans; 344 struct btrfs_trans_handle *trans;
342 struct btrfs_key key; 345 struct btrfs_key key;
@@ -346,11 +349,13 @@ static noinline int create_subvol(struct btrfs_root *root,
346 struct btrfs_root *new_root; 349 struct btrfs_root *new_root;
347 struct dentry *parent = dentry->d_parent; 350 struct dentry *parent = dentry->d_parent;
348 struct inode *dir; 351 struct inode *dir;
352 struct timespec cur_time = CURRENT_TIME;
349 int ret; 353 int ret;
350 int err; 354 int err;
351 u64 objectid; 355 u64 objectid;
352 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 356 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
353 u64 index = 0; 357 u64 index = 0;
358 uuid_le new_uuid;
354 359
355 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 360 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
356 if (ret) 361 if (ret)
@@ -368,6 +373,11 @@ static noinline int create_subvol(struct btrfs_root *root,
368 if (IS_ERR(trans)) 373 if (IS_ERR(trans))
369 return PTR_ERR(trans); 374 return PTR_ERR(trans);
370 375
376 ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
377 inherit ? *inherit : NULL);
378 if (ret)
379 goto fail;
380
371 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 381 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
372 0, objectid, NULL, 0, 0, 0); 382 0, objectid, NULL, 0, 0, 0);
373 if (IS_ERR(leaf)) { 383 if (IS_ERR(leaf)) {
@@ -389,8 +399,9 @@ static noinline int create_subvol(struct btrfs_root *root,
389 BTRFS_UUID_SIZE); 399 BTRFS_UUID_SIZE);
390 btrfs_mark_buffer_dirty(leaf); 400 btrfs_mark_buffer_dirty(leaf);
391 401
402 memset(&root_item, 0, sizeof(root_item));
403
392 inode_item = &root_item.inode; 404 inode_item = &root_item.inode;
393 memset(inode_item, 0, sizeof(*inode_item));
394 inode_item->generation = cpu_to_le64(1); 405 inode_item->generation = cpu_to_le64(1);
395 inode_item->size = cpu_to_le64(3); 406 inode_item->size = cpu_to_le64(3);
396 inode_item->nlink = cpu_to_le32(1); 407 inode_item->nlink = cpu_to_le32(1);
@@ -408,8 +419,15 @@ static noinline int create_subvol(struct btrfs_root *root,
408 btrfs_set_root_used(&root_item, leaf->len); 419 btrfs_set_root_used(&root_item, leaf->len);
409 btrfs_set_root_last_snapshot(&root_item, 0); 420 btrfs_set_root_last_snapshot(&root_item, 0);
410 421
411 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 422 btrfs_set_root_generation_v2(&root_item,
412 root_item.drop_level = 0; 423 btrfs_root_generation(&root_item));
424 uuid_le_gen(&new_uuid);
425 memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
426 root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
427 root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec);
428 root_item.ctime = root_item.otime;
429 btrfs_set_root_ctransid(&root_item, trans->transid);
430 btrfs_set_root_otransid(&root_item, trans->transid);
413 431
414 btrfs_tree_unlock(leaf); 432 btrfs_tree_unlock(leaf);
415 free_extent_buffer(leaf); 433 free_extent_buffer(leaf);
@@ -484,7 +502,7 @@ fail:
484 502
485static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 503static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
486 char *name, int namelen, u64 *async_transid, 504 char *name, int namelen, u64 *async_transid,
487 bool readonly) 505 bool readonly, struct btrfs_qgroup_inherit **inherit)
488{ 506{
489 struct inode *inode; 507 struct inode *inode;
490 struct btrfs_pending_snapshot *pending_snapshot; 508 struct btrfs_pending_snapshot *pending_snapshot;
@@ -502,6 +520,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
502 pending_snapshot->dentry = dentry; 520 pending_snapshot->dentry = dentry;
503 pending_snapshot->root = root; 521 pending_snapshot->root = root;
504 pending_snapshot->readonly = readonly; 522 pending_snapshot->readonly = readonly;
523 if (inherit) {
524 pending_snapshot->inherit = *inherit;
525 *inherit = NULL; /* take responsibility to free it */
526 }
505 527
506 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
507 if (IS_ERR(trans)) { 529 if (IS_ERR(trans)) {
@@ -635,7 +657,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
635static noinline int btrfs_mksubvol(struct path *parent, 657static noinline int btrfs_mksubvol(struct path *parent,
636 char *name, int namelen, 658 char *name, int namelen,
637 struct btrfs_root *snap_src, 659 struct btrfs_root *snap_src,
638 u64 *async_transid, bool readonly) 660 u64 *async_transid, bool readonly,
661 struct btrfs_qgroup_inherit **inherit)
639{ 662{
640 struct inode *dir = parent->dentry->d_inode; 663 struct inode *dir = parent->dentry->d_inode;
641 struct dentry *dentry; 664 struct dentry *dentry;
@@ -652,13 +675,9 @@ static noinline int btrfs_mksubvol(struct path *parent,
652 if (dentry->d_inode) 675 if (dentry->d_inode)
653 goto out_dput; 676 goto out_dput;
654 677
655 error = mnt_want_write(parent->mnt);
656 if (error)
657 goto out_dput;
658
659 error = btrfs_may_create(dir, dentry); 678 error = btrfs_may_create(dir, dentry);
660 if (error) 679 if (error)
661 goto out_drop_write; 680 goto out_dput;
662 681
663 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 682 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
664 683
@@ -666,18 +685,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
666 goto out_up_read; 685 goto out_up_read;
667 686
668 if (snap_src) { 687 if (snap_src) {
669 error = create_snapshot(snap_src, dentry, 688 error = create_snapshot(snap_src, dentry, name, namelen,
670 name, namelen, async_transid, readonly); 689 async_transid, readonly, inherit);
671 } else { 690 } else {
672 error = create_subvol(BTRFS_I(dir)->root, dentry, 691 error = create_subvol(BTRFS_I(dir)->root, dentry,
673 name, namelen, async_transid); 692 name, namelen, async_transid, inherit);
674 } 693 }
675 if (!error) 694 if (!error)
676 fsnotify_mkdir(dir, dentry); 695 fsnotify_mkdir(dir, dentry);
677out_up_read: 696out_up_read:
678 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 697 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
679out_drop_write:
680 mnt_drop_write(parent->mnt);
681out_dput: 698out_dput:
682 dput(dentry); 699 dput(dentry);
683out_unlock: 700out_unlock:
@@ -832,7 +849,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
832} 849}
833 850
834static int should_defrag_range(struct inode *inode, u64 start, int thresh, 851static int should_defrag_range(struct inode *inode, u64 start, int thresh,
835 u64 *last_len, u64 *skip, u64 *defrag_end) 852 u64 *last_len, u64 *skip, u64 *defrag_end,
853 int compress)
836{ 854{
837 struct extent_map *em; 855 struct extent_map *em;
838 int ret = 1; 856 int ret = 1;
@@ -863,7 +881,7 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
863 * we hit a real extent, if it is big or the next extent is not a 881 * we hit a real extent, if it is big or the next extent is not a
864 * real extent, don't bother defragging it 882 * real extent, don't bother defragging it
865 */ 883 */
866 if ((*last_len == 0 || *last_len >= thresh) && 884 if (!compress && (*last_len == 0 || *last_len >= thresh) &&
867 (em->len >= thresh || !next_mergeable)) 885 (em->len >= thresh || !next_mergeable))
868 ret = 0; 886 ret = 0;
869out: 887out:
@@ -1047,11 +1065,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1047 u64 newer_than, unsigned long max_to_defrag) 1065 u64 newer_than, unsigned long max_to_defrag)
1048{ 1066{
1049 struct btrfs_root *root = BTRFS_I(inode)->root; 1067 struct btrfs_root *root = BTRFS_I(inode)->root;
1050 struct btrfs_super_block *disk_super;
1051 struct file_ra_state *ra = NULL; 1068 struct file_ra_state *ra = NULL;
1052 unsigned long last_index; 1069 unsigned long last_index;
1053 u64 isize = i_size_read(inode); 1070 u64 isize = i_size_read(inode);
1054 u64 features;
1055 u64 last_len = 0; 1071 u64 last_len = 0;
1056 u64 skip = 0; 1072 u64 skip = 0;
1057 u64 defrag_end = 0; 1073 u64 defrag_end = 0;
@@ -1145,7 +1161,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1145 1161
1146 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1162 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1147 extent_thresh, &last_len, &skip, 1163 extent_thresh, &last_len, &skip,
1148 &defrag_end)) { 1164 &defrag_end, range->flags &
1165 BTRFS_DEFRAG_RANGE_COMPRESS)) {
1149 unsigned long next; 1166 unsigned long next;
1150 /* 1167 /*
1151 * the should_defrag function tells us how much to skip 1168 * the should_defrag function tells us how much to skip
@@ -1237,11 +1254,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1237 mutex_unlock(&inode->i_mutex); 1254 mutex_unlock(&inode->i_mutex);
1238 } 1255 }
1239 1256
1240 disk_super = root->fs_info->super_copy;
1241 features = btrfs_super_incompat_flags(disk_super);
1242 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1257 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1243 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1258 btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
1244 btrfs_set_super_incompat_flags(disk_super, features);
1245 } 1259 }
1246 1260
1247 ret = defrag_count; 1261 ret = defrag_count;
@@ -1379,41 +1393,39 @@ out:
1379} 1393}
1380 1394
1381static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1395static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1382 char *name, 1396 char *name, unsigned long fd, int subvol,
1383 unsigned long fd, 1397 u64 *transid, bool readonly,
1384 int subvol, 1398 struct btrfs_qgroup_inherit **inherit)
1385 u64 *transid,
1386 bool readonly)
1387{ 1399{
1388 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1389 struct file *src_file; 1400 struct file *src_file;
1390 int namelen; 1401 int namelen;
1391 int ret = 0; 1402 int ret = 0;
1392 1403
1393 if (root->fs_info->sb->s_flags & MS_RDONLY) 1404 ret = mnt_want_write_file(file);
1394 return -EROFS; 1405 if (ret)
1406 goto out;
1395 1407
1396 namelen = strlen(name); 1408 namelen = strlen(name);
1397 if (strchr(name, '/')) { 1409 if (strchr(name, '/')) {
1398 ret = -EINVAL; 1410 ret = -EINVAL;
1399 goto out; 1411 goto out_drop_write;
1400 } 1412 }
1401 1413
1402 if (name[0] == '.' && 1414 if (name[0] == '.' &&
1403 (namelen == 1 || (name[1] == '.' && namelen == 2))) { 1415 (namelen == 1 || (name[1] == '.' && namelen == 2))) {
1404 ret = -EEXIST; 1416 ret = -EEXIST;
1405 goto out; 1417 goto out_drop_write;
1406 } 1418 }
1407 1419
1408 if (subvol) { 1420 if (subvol) {
1409 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1421 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1410 NULL, transid, readonly); 1422 NULL, transid, readonly, inherit);
1411 } else { 1423 } else {
1412 struct inode *src_inode; 1424 struct inode *src_inode;
1413 src_file = fget(fd); 1425 src_file = fget(fd);
1414 if (!src_file) { 1426 if (!src_file) {
1415 ret = -EINVAL; 1427 ret = -EINVAL;
1416 goto out; 1428 goto out_drop_write;
1417 } 1429 }
1418 1430
1419 src_inode = src_file->f_path.dentry->d_inode; 1431 src_inode = src_file->f_path.dentry->d_inode;
@@ -1422,13 +1434,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1422 "another FS\n"); 1434 "another FS\n");
1423 ret = -EINVAL; 1435 ret = -EINVAL;
1424 fput(src_file); 1436 fput(src_file);
1425 goto out; 1437 goto out_drop_write;
1426 } 1438 }
1427 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1439 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1428 BTRFS_I(src_inode)->root, 1440 BTRFS_I(src_inode)->root,
1429 transid, readonly); 1441 transid, readonly, inherit);
1430 fput(src_file); 1442 fput(src_file);
1431 } 1443 }
1444out_drop_write:
1445 mnt_drop_write_file(file);
1432out: 1446out:
1433 return ret; 1447 return ret;
1434} 1448}
@@ -1446,7 +1460,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
1446 1460
1447 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1461 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1448 vol_args->fd, subvol, 1462 vol_args->fd, subvol,
1449 NULL, false); 1463 NULL, false, NULL);
1450 1464
1451 kfree(vol_args); 1465 kfree(vol_args);
1452 return ret; 1466 return ret;
@@ -1460,6 +1474,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1460 u64 transid = 0; 1474 u64 transid = 0;
1461 u64 *ptr = NULL; 1475 u64 *ptr = NULL;
1462 bool readonly = false; 1476 bool readonly = false;
1477 struct btrfs_qgroup_inherit *inherit = NULL;
1463 1478
1464 vol_args = memdup_user(arg, sizeof(*vol_args)); 1479 vol_args = memdup_user(arg, sizeof(*vol_args));
1465 if (IS_ERR(vol_args)) 1480 if (IS_ERR(vol_args))
@@ -1467,7 +1482,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1467 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 1482 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1468 1483
1469 if (vol_args->flags & 1484 if (vol_args->flags &
1470 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { 1485 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1486 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1471 ret = -EOPNOTSUPP; 1487 ret = -EOPNOTSUPP;
1472 goto out; 1488 goto out;
1473 } 1489 }
@@ -1476,10 +1492,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1476 ptr = &transid; 1492 ptr = &transid;
1477 if (vol_args->flags & BTRFS_SUBVOL_RDONLY) 1493 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1478 readonly = true; 1494 readonly = true;
1495 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1496 if (vol_args->size > PAGE_CACHE_SIZE) {
1497 ret = -EINVAL;
1498 goto out;
1499 }
1500 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1501 if (IS_ERR(inherit)) {
1502 ret = PTR_ERR(inherit);
1503 goto out;
1504 }
1505 }
1479 1506
1480 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1507 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1481 vol_args->fd, subvol, 1508 vol_args->fd, subvol, ptr,
1482 ptr, readonly); 1509 readonly, &inherit);
1483 1510
1484 if (ret == 0 && ptr && 1511 if (ret == 0 && ptr &&
1485 copy_to_user(arg + 1512 copy_to_user(arg +
@@ -1488,6 +1515,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1488 ret = -EFAULT; 1515 ret = -EFAULT;
1489out: 1516out:
1490 kfree(vol_args); 1517 kfree(vol_args);
1518 kfree(inherit);
1491 return ret; 1519 return ret;
1492} 1520}
1493 1521
@@ -1523,29 +1551,40 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1523 u64 flags; 1551 u64 flags;
1524 int ret = 0; 1552 int ret = 0;
1525 1553
1526 if (root->fs_info->sb->s_flags & MS_RDONLY) 1554 ret = mnt_want_write_file(file);
1527 return -EROFS; 1555 if (ret)
1556 goto out;
1528 1557
1529 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) 1558 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
1530 return -EINVAL; 1559 ret = -EINVAL;
1560 goto out_drop_write;
1561 }
1531 1562
1532 if (copy_from_user(&flags, arg, sizeof(flags))) 1563 if (copy_from_user(&flags, arg, sizeof(flags))) {
1533 return -EFAULT; 1564 ret = -EFAULT;
1565 goto out_drop_write;
1566 }
1534 1567
1535 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) 1568 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
1536 return -EINVAL; 1569 ret = -EINVAL;
1570 goto out_drop_write;
1571 }
1537 1572
1538 if (flags & ~BTRFS_SUBVOL_RDONLY) 1573 if (flags & ~BTRFS_SUBVOL_RDONLY) {
1539 return -EOPNOTSUPP; 1574 ret = -EOPNOTSUPP;
1575 goto out_drop_write;
1576 }
1540 1577
1541 if (!inode_owner_or_capable(inode)) 1578 if (!inode_owner_or_capable(inode)) {
1542 return -EACCES; 1579 ret = -EACCES;
1580 goto out_drop_write;
1581 }
1543 1582
1544 down_write(&root->fs_info->subvol_sem); 1583 down_write(&root->fs_info->subvol_sem);
1545 1584
1546 /* nothing to do */ 1585 /* nothing to do */
1547 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) 1586 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1548 goto out; 1587 goto out_drop_sem;
1549 1588
1550 root_flags = btrfs_root_flags(&root->root_item); 1589 root_flags = btrfs_root_flags(&root->root_item);
1551 if (flags & BTRFS_SUBVOL_RDONLY) 1590 if (flags & BTRFS_SUBVOL_RDONLY)
@@ -1568,8 +1607,11 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1568out_reset: 1607out_reset:
1569 if (ret) 1608 if (ret)
1570 btrfs_set_root_flags(&root->root_item, root_flags); 1609 btrfs_set_root_flags(&root->root_item, root_flags);
1571out: 1610out_drop_sem:
1572 up_write(&root->fs_info->subvol_sem); 1611 up_write(&root->fs_info->subvol_sem);
1612out_drop_write:
1613 mnt_drop_write_file(file);
1614out:
1573 return ret; 1615 return ret;
1574} 1616}
1575 1617
@@ -2340,6 +2382,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2340 goto out_drop_write; 2382 goto out_drop_write;
2341 } 2383 }
2342 2384
2385 ret = -EXDEV;
2386 if (src_file->f_path.mnt != file->f_path.mnt)
2387 goto out_fput;
2388
2343 src = src_file->f_dentry->d_inode; 2389 src = src_file->f_dentry->d_inode;
2344 2390
2345 ret = -EINVAL; 2391 ret = -EINVAL;
@@ -2360,7 +2406,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2360 goto out_fput; 2406 goto out_fput;
2361 2407
2362 ret = -EXDEV; 2408 ret = -EXDEV;
2363 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) 2409 if (src->i_sb != inode->i_sb)
2364 goto out_fput; 2410 goto out_fput;
2365 2411
2366 ret = -ENOMEM; 2412 ret = -ENOMEM;
@@ -2434,13 +2480,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2434 * note the key will change type as we walk through the 2480 * note the key will change type as we walk through the
2435 * tree. 2481 * tree.
2436 */ 2482 */
2437 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2483 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
2484 0, 0);
2438 if (ret < 0) 2485 if (ret < 0)
2439 goto out; 2486 goto out;
2440 2487
2441 nritems = btrfs_header_nritems(path->nodes[0]); 2488 nritems = btrfs_header_nritems(path->nodes[0]);
2442 if (path->slots[0] >= nritems) { 2489 if (path->slots[0] >= nritems) {
2443 ret = btrfs_next_leaf(root, path); 2490 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2444 if (ret < 0) 2491 if (ret < 0)
2445 goto out; 2492 goto out;
2446 if (ret > 0) 2493 if (ret > 0)
@@ -2749,8 +2796,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2749 struct btrfs_path *path; 2796 struct btrfs_path *path;
2750 struct btrfs_key location; 2797 struct btrfs_key location;
2751 struct btrfs_disk_key disk_key; 2798 struct btrfs_disk_key disk_key;
2752 struct btrfs_super_block *disk_super;
2753 u64 features;
2754 u64 objectid = 0; 2799 u64 objectid = 0;
2755 u64 dir_id; 2800 u64 dir_id;
2756 2801
@@ -2801,12 +2846,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2801 btrfs_mark_buffer_dirty(path->nodes[0]); 2846 btrfs_mark_buffer_dirty(path->nodes[0]);
2802 btrfs_free_path(path); 2847 btrfs_free_path(path);
2803 2848
2804 disk_super = root->fs_info->super_copy; 2849 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2805 features = btrfs_super_incompat_flags(disk_super);
2806 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2807 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
2808 btrfs_set_super_incompat_flags(disk_super, features);
2809 }
2810 btrfs_end_transaction(trans, root); 2850 btrfs_end_transaction(trans, root);
2811 2851
2812 return 0; 2852 return 0;
@@ -3063,19 +3103,21 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3063} 3103}
3064 3104
3065static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, 3105static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3066 void __user *arg, int reset_after_read) 3106 void __user *arg)
3067{ 3107{
3068 struct btrfs_ioctl_get_dev_stats *sa; 3108 struct btrfs_ioctl_get_dev_stats *sa;
3069 int ret; 3109 int ret;
3070 3110
3071 if (reset_after_read && !capable(CAP_SYS_ADMIN))
3072 return -EPERM;
3073
3074 sa = memdup_user(arg, sizeof(*sa)); 3111 sa = memdup_user(arg, sizeof(*sa));
3075 if (IS_ERR(sa)) 3112 if (IS_ERR(sa))
3076 return PTR_ERR(sa); 3113 return PTR_ERR(sa);
3077 3114
3078 ret = btrfs_get_dev_stats(root, sa, reset_after_read); 3115 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
3116 kfree(sa);
3117 return -EPERM;
3118 }
3119
3120 ret = btrfs_get_dev_stats(root, sa);
3079 3121
3080 if (copy_to_user(arg, sa, sizeof(*sa))) 3122 if (copy_to_user(arg, sa, sizeof(*sa)))
3081 ret = -EFAULT; 3123 ret = -EFAULT;
@@ -3265,10 +3307,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3265 if (!capable(CAP_SYS_ADMIN)) 3307 if (!capable(CAP_SYS_ADMIN))
3266 return -EPERM; 3308 return -EPERM;
3267 3309
3268 if (fs_info->sb->s_flags & MS_RDONLY) 3310 ret = mnt_want_write_file(file);
3269 return -EROFS;
3270
3271 ret = mnt_want_write(file->f_path.mnt);
3272 if (ret) 3311 if (ret)
3273 return ret; 3312 return ret;
3274 3313
@@ -3338,7 +3377,7 @@ out_bargs:
3338out: 3377out:
3339 mutex_unlock(&fs_info->balance_mutex); 3378 mutex_unlock(&fs_info->balance_mutex);
3340 mutex_unlock(&fs_info->volume_mutex); 3379 mutex_unlock(&fs_info->volume_mutex);
3341 mnt_drop_write(file->f_path.mnt); 3380 mnt_drop_write_file(file);
3342 return ret; 3381 return ret;
3343} 3382}
3344 3383
@@ -3390,6 +3429,264 @@ out:
3390 return ret; 3429 return ret;
3391} 3430}
3392 3431
3432static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3433{
3434 struct btrfs_ioctl_quota_ctl_args *sa;
3435 struct btrfs_trans_handle *trans = NULL;
3436 int ret;
3437 int err;
3438
3439 if (!capable(CAP_SYS_ADMIN))
3440 return -EPERM;
3441
3442 if (root->fs_info->sb->s_flags & MS_RDONLY)
3443 return -EROFS;
3444
3445 sa = memdup_user(arg, sizeof(*sa));
3446 if (IS_ERR(sa))
3447 return PTR_ERR(sa);
3448
3449 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3450 trans = btrfs_start_transaction(root, 2);
3451 if (IS_ERR(trans)) {
3452 ret = PTR_ERR(trans);
3453 goto out;
3454 }
3455 }
3456
3457 switch (sa->cmd) {
3458 case BTRFS_QUOTA_CTL_ENABLE:
3459 ret = btrfs_quota_enable(trans, root->fs_info);
3460 break;
3461 case BTRFS_QUOTA_CTL_DISABLE:
3462 ret = btrfs_quota_disable(trans, root->fs_info);
3463 break;
3464 case BTRFS_QUOTA_CTL_RESCAN:
3465 ret = btrfs_quota_rescan(root->fs_info);
3466 break;
3467 default:
3468 ret = -EINVAL;
3469 break;
3470 }
3471
3472 if (copy_to_user(arg, sa, sizeof(*sa)))
3473 ret = -EFAULT;
3474
3475 if (trans) {
3476 err = btrfs_commit_transaction(trans, root);
3477 if (err && !ret)
3478 ret = err;
3479 }
3480
3481out:
3482 kfree(sa);
3483 return ret;
3484}
3485
3486static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3487{
3488 struct btrfs_ioctl_qgroup_assign_args *sa;
3489 struct btrfs_trans_handle *trans;
3490 int ret;
3491 int err;
3492
3493 if (!capable(CAP_SYS_ADMIN))
3494 return -EPERM;
3495
3496 if (root->fs_info->sb->s_flags & MS_RDONLY)
3497 return -EROFS;
3498
3499 sa = memdup_user(arg, sizeof(*sa));
3500 if (IS_ERR(sa))
3501 return PTR_ERR(sa);
3502
3503 trans = btrfs_join_transaction(root);
3504 if (IS_ERR(trans)) {
3505 ret = PTR_ERR(trans);
3506 goto out;
3507 }
3508
3509 /* FIXME: check if the IDs really exist */
3510 if (sa->assign) {
3511 ret = btrfs_add_qgroup_relation(trans, root->fs_info,
3512 sa->src, sa->dst);
3513 } else {
3514 ret = btrfs_del_qgroup_relation(trans, root->fs_info,
3515 sa->src, sa->dst);
3516 }
3517
3518 err = btrfs_end_transaction(trans, root);
3519 if (err && !ret)
3520 ret = err;
3521
3522out:
3523 kfree(sa);
3524 return ret;
3525}
3526
3527static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3528{
3529 struct btrfs_ioctl_qgroup_create_args *sa;
3530 struct btrfs_trans_handle *trans;
3531 int ret;
3532 int err;
3533
3534 if (!capable(CAP_SYS_ADMIN))
3535 return -EPERM;
3536
3537 if (root->fs_info->sb->s_flags & MS_RDONLY)
3538 return -EROFS;
3539
3540 sa = memdup_user(arg, sizeof(*sa));
3541 if (IS_ERR(sa))
3542 return PTR_ERR(sa);
3543
3544 trans = btrfs_join_transaction(root);
3545 if (IS_ERR(trans)) {
3546 ret = PTR_ERR(trans);
3547 goto out;
3548 }
3549
3550 /* FIXME: check if the IDs really exist */
3551 if (sa->create) {
3552 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
3553 NULL);
3554 } else {
3555 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
3556 }
3557
3558 err = btrfs_end_transaction(trans, root);
3559 if (err && !ret)
3560 ret = err;
3561
3562out:
3563 kfree(sa);
3564 return ret;
3565}
3566
3567static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3568{
3569 struct btrfs_ioctl_qgroup_limit_args *sa;
3570 struct btrfs_trans_handle *trans;
3571 int ret;
3572 int err;
3573 u64 qgroupid;
3574
3575 if (!capable(CAP_SYS_ADMIN))
3576 return -EPERM;
3577
3578 if (root->fs_info->sb->s_flags & MS_RDONLY)
3579 return -EROFS;
3580
3581 sa = memdup_user(arg, sizeof(*sa));
3582 if (IS_ERR(sa))
3583 return PTR_ERR(sa);
3584
3585 trans = btrfs_join_transaction(root);
3586 if (IS_ERR(trans)) {
3587 ret = PTR_ERR(trans);
3588 goto out;
3589 }
3590
3591 qgroupid = sa->qgroupid;
3592 if (!qgroupid) {
3593 /* take the current subvol as qgroup */
3594 qgroupid = root->root_key.objectid;
3595 }
3596
3597 /* FIXME: check if the IDs really exist */
3598 ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
3599
3600 err = btrfs_end_transaction(trans, root);
3601 if (err && !ret)
3602 ret = err;
3603
3604out:
3605 kfree(sa);
3606 return ret;
3607}
3608
3609static long btrfs_ioctl_set_received_subvol(struct file *file,
3610 void __user *arg)
3611{
3612 struct btrfs_ioctl_received_subvol_args *sa = NULL;
3613 struct inode *inode = fdentry(file)->d_inode;
3614 struct btrfs_root *root = BTRFS_I(inode)->root;
3615 struct btrfs_root_item *root_item = &root->root_item;
3616 struct btrfs_trans_handle *trans;
3617 struct timespec ct = CURRENT_TIME;
3618 int ret = 0;
3619
3620 ret = mnt_want_write_file(file);
3621 if (ret < 0)
3622 return ret;
3623
3624 down_write(&root->fs_info->subvol_sem);
3625
3626 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
3627 ret = -EINVAL;
3628 goto out;
3629 }
3630
3631 if (btrfs_root_readonly(root)) {
3632 ret = -EROFS;
3633 goto out;
3634 }
3635
3636 if (!inode_owner_or_capable(inode)) {
3637 ret = -EACCES;
3638 goto out;
3639 }
3640
3641 sa = memdup_user(arg, sizeof(*sa));
3642 if (IS_ERR(sa)) {
3643 ret = PTR_ERR(sa);
3644 sa = NULL;
3645 goto out;
3646 }
3647
3648 trans = btrfs_start_transaction(root, 1);
3649 if (IS_ERR(trans)) {
3650 ret = PTR_ERR(trans);
3651 trans = NULL;
3652 goto out;
3653 }
3654
3655 sa->rtransid = trans->transid;
3656 sa->rtime.sec = ct.tv_sec;
3657 sa->rtime.nsec = ct.tv_nsec;
3658
3659 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
3660 btrfs_set_root_stransid(root_item, sa->stransid);
3661 btrfs_set_root_rtransid(root_item, sa->rtransid);
3662 root_item->stime.sec = cpu_to_le64(sa->stime.sec);
3663 root_item->stime.nsec = cpu_to_le32(sa->stime.nsec);
3664 root_item->rtime.sec = cpu_to_le64(sa->rtime.sec);
3665 root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec);
3666
3667 ret = btrfs_update_root(trans, root->fs_info->tree_root,
3668 &root->root_key, &root->root_item);
3669 if (ret < 0) {
3670 btrfs_end_transaction(trans, root);
3671 trans = NULL;
3672 goto out;
3673 } else {
3674 ret = btrfs_commit_transaction(trans, root);
3675 if (ret < 0)
3676 goto out;
3677 }
3678
3679 ret = copy_to_user(arg, sa, sizeof(*sa));
3680 if (ret)
3681 ret = -EFAULT;
3682
3683out:
3684 kfree(sa);
3685 up_write(&root->fs_info->subvol_sem);
3686 mnt_drop_write_file(file);
3687 return ret;
3688}
3689
3393long btrfs_ioctl(struct file *file, unsigned int 3690long btrfs_ioctl(struct file *file, unsigned int
3394 cmd, unsigned long arg) 3691 cmd, unsigned long arg)
3395{ 3692{
@@ -3411,6 +3708,8 @@ long btrfs_ioctl(struct file *file, unsigned int
3411 return btrfs_ioctl_snap_create_v2(file, argp, 0); 3708 return btrfs_ioctl_snap_create_v2(file, argp, 0);
3412 case BTRFS_IOC_SUBVOL_CREATE: 3709 case BTRFS_IOC_SUBVOL_CREATE:
3413 return btrfs_ioctl_snap_create(file, argp, 1); 3710 return btrfs_ioctl_snap_create(file, argp, 1);
3711 case BTRFS_IOC_SUBVOL_CREATE_V2:
3712 return btrfs_ioctl_snap_create_v2(file, argp, 1);
3414 case BTRFS_IOC_SNAP_DESTROY: 3713 case BTRFS_IOC_SNAP_DESTROY:
3415 return btrfs_ioctl_snap_destroy(file, argp); 3714 return btrfs_ioctl_snap_destroy(file, argp);
3416 case BTRFS_IOC_SUBVOL_GETFLAGS: 3715 case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -3472,10 +3771,20 @@ long btrfs_ioctl(struct file *file, unsigned int
3472 return btrfs_ioctl_balance_ctl(root, arg); 3771 return btrfs_ioctl_balance_ctl(root, arg);
3473 case BTRFS_IOC_BALANCE_PROGRESS: 3772 case BTRFS_IOC_BALANCE_PROGRESS:
3474 return btrfs_ioctl_balance_progress(root, argp); 3773 return btrfs_ioctl_balance_progress(root, argp);
3774 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
3775 return btrfs_ioctl_set_received_subvol(file, argp);
3776 case BTRFS_IOC_SEND:
3777 return btrfs_ioctl_send(file, argp);
3475 case BTRFS_IOC_GET_DEV_STATS: 3778 case BTRFS_IOC_GET_DEV_STATS:
3476 return btrfs_ioctl_get_dev_stats(root, argp, 0); 3779 return btrfs_ioctl_get_dev_stats(root, argp);
3477 case BTRFS_IOC_GET_AND_RESET_DEV_STATS: 3780 case BTRFS_IOC_QUOTA_CTL:
3478 return btrfs_ioctl_get_dev_stats(root, argp, 1); 3781 return btrfs_ioctl_quota_ctl(root, argp);
3782 case BTRFS_IOC_QGROUP_ASSIGN:
3783 return btrfs_ioctl_qgroup_assign(root, argp);
3784 case BTRFS_IOC_QGROUP_CREATE:
3785 return btrfs_ioctl_qgroup_create(root, argp);
3786 case BTRFS_IOC_QGROUP_LIMIT:
3787 return btrfs_ioctl_qgroup_limit(root, argp);
3479 } 3788 }
3480 3789
3481 return -ENOTTY; 3790 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e440aa653c30..731e2875ab93 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -32,15 +32,46 @@ struct btrfs_ioctl_vol_args {
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
35#define BTRFS_FSID_SIZE 16 36#define BTRFS_FSID_SIZE 16
36#define BTRFS_UUID_SIZE 16 37#define BTRFS_UUID_SIZE 16
37 38
39#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
40
41struct btrfs_qgroup_limit {
42 __u64 flags;
43 __u64 max_rfer;
44 __u64 max_excl;
45 __u64 rsv_rfer;
46 __u64 rsv_excl;
47};
48
49struct btrfs_qgroup_inherit {
50 __u64 flags;
51 __u64 num_qgroups;
52 __u64 num_ref_copies;
53 __u64 num_excl_copies;
54 struct btrfs_qgroup_limit lim;
55 __u64 qgroups[0];
56};
57
58struct btrfs_ioctl_qgroup_limit_args {
59 __u64 qgroupid;
60 struct btrfs_qgroup_limit lim;
61};
62
38#define BTRFS_SUBVOL_NAME_MAX 4039 63#define BTRFS_SUBVOL_NAME_MAX 4039
39struct btrfs_ioctl_vol_args_v2 { 64struct btrfs_ioctl_vol_args_v2 {
40 __s64 fd; 65 __s64 fd;
41 __u64 transid; 66 __u64 transid;
42 __u64 flags; 67 __u64 flags;
43 __u64 unused[4]; 68 union {
69 struct {
70 __u64 size;
71 struct btrfs_qgroup_inherit __user *qgroup_inherit;
72 };
73 __u64 unused[4];
74 };
44 char name[BTRFS_SUBVOL_NAME_MAX + 1]; 75 char name[BTRFS_SUBVOL_NAME_MAX + 1];
45}; 76};
46 77
@@ -285,9 +316,13 @@ enum btrfs_dev_stat_values {
285 BTRFS_DEV_STAT_VALUES_MAX 316 BTRFS_DEV_STAT_VALUES_MAX
286}; 317};
287 318
319/* Reset statistics after reading; needs SYS_ADMIN capability */
320#define BTRFS_DEV_STATS_RESET (1ULL << 0)
321
288struct btrfs_ioctl_get_dev_stats { 322struct btrfs_ioctl_get_dev_stats {
289 __u64 devid; /* in */ 323 __u64 devid; /* in */
290 __u64 nr_items; /* in/out */ 324 __u64 nr_items; /* in/out */
325 __u64 flags; /* in/out */
291 326
292 /* out values: */ 327 /* out values: */
293 __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; 328 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
@@ -295,6 +330,48 @@ struct btrfs_ioctl_get_dev_stats {
295 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ 330 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
296}; 331};
297 332
333#define BTRFS_QUOTA_CTL_ENABLE 1
334#define BTRFS_QUOTA_CTL_DISABLE 2
335#define BTRFS_QUOTA_CTL_RESCAN 3
336struct btrfs_ioctl_quota_ctl_args {
337 __u64 cmd;
338 __u64 status;
339};
340
341struct btrfs_ioctl_qgroup_assign_args {
342 __u64 assign;
343 __u64 src;
344 __u64 dst;
345};
346
347struct btrfs_ioctl_qgroup_create_args {
348 __u64 create;
349 __u64 qgroupid;
350};
351struct btrfs_ioctl_timespec {
352 __u64 sec;
353 __u32 nsec;
354};
355
356struct btrfs_ioctl_received_subvol_args {
357 char uuid[BTRFS_UUID_SIZE]; /* in */
358 __u64 stransid; /* in */
359 __u64 rtransid; /* out */
360 struct btrfs_ioctl_timespec stime; /* in */
361 struct btrfs_ioctl_timespec rtime; /* out */
362 __u64 flags; /* in */
363 __u64 reserved[16]; /* in */
364};
365
366struct btrfs_ioctl_send_args {
367 __s64 send_fd; /* in */
368 __u64 clone_sources_count; /* in */
369 __u64 __user *clone_sources; /* in */
370 __u64 parent_root; /* in */
371 __u64 flags; /* in */
372 __u64 reserved[4]; /* in */
373};
374
298#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 375#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
299 struct btrfs_ioctl_vol_args) 376 struct btrfs_ioctl_vol_args)
300#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 377#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -339,6 +416,8 @@ struct btrfs_ioctl_get_dev_stats {
339#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 416#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
340#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 417#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
341 struct btrfs_ioctl_vol_args_v2) 418 struct btrfs_ioctl_vol_args_v2)
419#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
420 struct btrfs_ioctl_vol_args_v2)
342#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) 421#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
343#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) 422#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
344#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ 423#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
@@ -359,9 +438,19 @@ struct btrfs_ioctl_get_dev_stats {
359 struct btrfs_ioctl_ino_path_args) 438 struct btrfs_ioctl_ino_path_args)
360#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 439#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
361 struct btrfs_ioctl_ino_path_args) 440 struct btrfs_ioctl_ino_path_args)
441#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
442 struct btrfs_ioctl_received_subvol_args)
443#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
444#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
445 struct btrfs_ioctl_vol_args)
446#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
447 struct btrfs_ioctl_quota_ctl_args)
448#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
449 struct btrfs_ioctl_qgroup_assign_args)
450#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
451 struct btrfs_ioctl_qgroup_create_args)
452#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
453 struct btrfs_ioctl_qgroup_limit_args)
362#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
363 struct btrfs_ioctl_get_dev_stats) 455 struct btrfs_ioctl_get_dev_stats)
364#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
365 struct btrfs_ioctl_get_dev_stats)
366
367#endif 456#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 272f911203ff..a44eff074805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -78,13 +78,15 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
78 write_lock(&eb->lock); 78 write_lock(&eb->lock);
79 WARN_ON(atomic_read(&eb->spinning_writers)); 79 WARN_ON(atomic_read(&eb->spinning_writers));
80 atomic_inc(&eb->spinning_writers); 80 atomic_inc(&eb->spinning_writers);
81 if (atomic_dec_and_test(&eb->blocking_writers)) 81 if (atomic_dec_and_test(&eb->blocking_writers) &&
82 waitqueue_active(&eb->write_lock_wq))
82 wake_up(&eb->write_lock_wq); 83 wake_up(&eb->write_lock_wq);
83 } else if (rw == BTRFS_READ_LOCK_BLOCKING) { 84 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
84 BUG_ON(atomic_read(&eb->blocking_readers) == 0); 85 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
85 read_lock(&eb->lock); 86 read_lock(&eb->lock);
86 atomic_inc(&eb->spinning_readers); 87 atomic_inc(&eb->spinning_readers);
87 if (atomic_dec_and_test(&eb->blocking_readers)) 88 if (atomic_dec_and_test(&eb->blocking_readers) &&
89 waitqueue_active(&eb->read_lock_wq))
88 wake_up(&eb->read_lock_wq); 90 wake_up(&eb->read_lock_wq);
89 } 91 }
90 return; 92 return;
@@ -199,7 +201,8 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
199 } 201 }
200 btrfs_assert_tree_read_locked(eb); 202 btrfs_assert_tree_read_locked(eb);
201 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 203 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
202 if (atomic_dec_and_test(&eb->blocking_readers)) 204 if (atomic_dec_and_test(&eb->blocking_readers) &&
205 waitqueue_active(&eb->read_lock_wq))
203 wake_up(&eb->read_lock_wq); 206 wake_up(&eb->read_lock_wq);
204 atomic_dec(&eb->read_locks); 207 atomic_dec(&eb->read_locks);
205} 208}
@@ -247,8 +250,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
247 if (blockers) { 250 if (blockers) {
248 WARN_ON(atomic_read(&eb->spinning_writers)); 251 WARN_ON(atomic_read(&eb->spinning_writers));
249 atomic_dec(&eb->blocking_writers); 252 atomic_dec(&eb->blocking_writers);
250 smp_wmb(); 253 smp_mb();
251 wake_up(&eb->write_lock_wq); 254 if (waitqueue_active(&eb->write_lock_wq))
255 wake_up(&eb->write_lock_wq);
252 } else { 256 } else {
253 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 257 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
254 atomic_dec(&eb->spinning_writers); 258 atomic_dec(&eb->spinning_writers);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 643335a4fe3c..051c7fe551dd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -596,7 +596,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
596 /* 596 /*
597 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
598 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
599 * for pdflush to find them 599 * for the flusher thread to find them
600 */ 600 */
601 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) 601 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
602 filemap_fdatawrite_range(inode->i_mapping, start, end); 602 filemap_fdatawrite_range(inode->i_mapping, start, end);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 000000000000..bc424ae5a81a
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,1571 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26
27#include "ctree.h"
28#include "transaction.h"
29#include "disk-io.h"
30#include "locking.h"
31#include "ulist.h"
32#include "ioctl.h"
33#include "backref.h"
34
35/* TODO XXX FIXME
36 * - subvol delete -> delete when ref goes to 0? delete limits also?
37 * - reorganize keys
38 * - compressed
39 * - sync
40 * - rescan
41 * - copy also limits on subvol creation
42 * - limit
43 * - caches fuer ulists
44 * - performance benchmarks
45 * - check all ioctl parameters
46 */
47
/*
 * one struct for each qgroup, organized in fs_info->qgroup_tree.
 * Nodes are allocated with GFP_ATOMIC in add_qgroup_rb() because callers
 * hold the qgroup spinlock.
 */
struct btrfs_qgroup {
	u64 qgroupid;

	/*
	 * state
	 */
	u64 rfer;	/* referenced */
	u64 rfer_cmpr;	/* referenced compressed */
	u64 excl;	/* exclusive */
	u64 excl_cmpr;	/* exclusive compressed */

	/*
	 * limits
	 */
	u64 lim_flags;	/* which limits are set */
	u64 max_rfer;
	u64 max_excl;
	u64 rsv_rfer;
	u64 rsv_excl;

	/*
	 * reservation tracking
	 */
	u64 reserved;

	/*
	 * lists
	 */
	struct list_head groups;  /* groups this group is member of */
	struct list_head members; /* groups that are members of this group */
	struct list_head dirty;   /* dirty groups */
	struct rb_node node;	  /* tree of qgroups */

	/*
	 * temp variables for accounting operations
	 */
	u64 tag;
	u64 refcnt;
};
90
/*
 * glue structure to represent the relations between qgroups.  Each relation
 * is linked into two lists at once: into member->groups via next_group and
 * into group->members via next_member (see add_relation_rb()).
 */
struct btrfs_qgroup_list {
	struct list_head next_group;	/* link in member->groups */
	struct list_head next_member;	/* link in group->members */
	struct btrfs_qgroup *group;	/* the parent side of the relation */
	struct btrfs_qgroup *member;	/* the member side of the relation */
};
100
101/* must be called with qgroup_lock held */
102static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
103 u64 qgroupid)
104{
105 struct rb_node *n = fs_info->qgroup_tree.rb_node;
106 struct btrfs_qgroup *qgroup;
107
108 while (n) {
109 qgroup = rb_entry(n, struct btrfs_qgroup, node);
110 if (qgroup->qgroupid < qgroupid)
111 n = n->rb_left;
112 else if (qgroup->qgroupid > qgroupid)
113 n = n->rb_right;
114 else
115 return qgroup;
116 }
117 return NULL;
118}
119
120/* must be called with qgroup_lock held */
121static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
122 u64 qgroupid)
123{
124 struct rb_node **p = &fs_info->qgroup_tree.rb_node;
125 struct rb_node *parent = NULL;
126 struct btrfs_qgroup *qgroup;
127
128 while (*p) {
129 parent = *p;
130 qgroup = rb_entry(parent, struct btrfs_qgroup, node);
131
132 if (qgroup->qgroupid < qgroupid)
133 p = &(*p)->rb_left;
134 else if (qgroup->qgroupid > qgroupid)
135 p = &(*p)->rb_right;
136 else
137 return qgroup;
138 }
139
140 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
141 if (!qgroup)
142 return ERR_PTR(-ENOMEM);
143
144 qgroup->qgroupid = qgroupid;
145 INIT_LIST_HEAD(&qgroup->groups);
146 INIT_LIST_HEAD(&qgroup->members);
147 INIT_LIST_HEAD(&qgroup->dirty);
148
149 rb_link_node(&qgroup->node, parent, p);
150 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
151
152 return qgroup;
153}
154
155/* must be called with qgroup_lock held */
156static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
157{
158 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
159 struct btrfs_qgroup_list *list;
160
161 if (!qgroup)
162 return -ENOENT;
163
164 rb_erase(&qgroup->node, &fs_info->qgroup_tree);
165 list_del(&qgroup->dirty);
166
167 while (!list_empty(&qgroup->groups)) {
168 list = list_first_entry(&qgroup->groups,
169 struct btrfs_qgroup_list, next_group);
170 list_del(&list->next_group);
171 list_del(&list->next_member);
172 kfree(list);
173 }
174
175 while (!list_empty(&qgroup->members)) {
176 list = list_first_entry(&qgroup->members,
177 struct btrfs_qgroup_list, next_member);
178 list_del(&list->next_group);
179 list_del(&list->next_member);
180 kfree(list);
181 }
182 kfree(qgroup);
183
184 return 0;
185}
186
187/* must be called with qgroup_lock held */
188static int add_relation_rb(struct btrfs_fs_info *fs_info,
189 u64 memberid, u64 parentid)
190{
191 struct btrfs_qgroup *member;
192 struct btrfs_qgroup *parent;
193 struct btrfs_qgroup_list *list;
194
195 member = find_qgroup_rb(fs_info, memberid);
196 parent = find_qgroup_rb(fs_info, parentid);
197 if (!member || !parent)
198 return -ENOENT;
199
200 list = kzalloc(sizeof(*list), GFP_ATOMIC);
201 if (!list)
202 return -ENOMEM;
203
204 list->group = parent;
205 list->member = member;
206 list_add_tail(&list->next_group, &member->groups);
207 list_add_tail(&list->next_member, &parent->members);
208
209 return 0;
210}
211
212/* must be called with qgroup_lock held */
213static int del_relation_rb(struct btrfs_fs_info *fs_info,
214 u64 memberid, u64 parentid)
215{
216 struct btrfs_qgroup *member;
217 struct btrfs_qgroup *parent;
218 struct btrfs_qgroup_list *list;
219
220 member = find_qgroup_rb(fs_info, memberid);
221 parent = find_qgroup_rb(fs_info, parentid);
222 if (!member || !parent)
223 return -ENOENT;
224
225 list_for_each_entry(list, &member->groups, next_group) {
226 if (list->group == parent) {
227 list_del(&list->next_group);
228 list_del(&list->next_member);
229 kfree(list);
230 return 0;
231 }
232 }
233 return -ENOENT;
234}
235
236/*
237 * The full config is read in one go, only called from open_ctree()
238 * It doesn't use any locking, as at this point we're still single-threaded
239 */
240int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
241{
242 struct btrfs_key key;
243 struct btrfs_key found_key;
244 struct btrfs_root *quota_root = fs_info->quota_root;
245 struct btrfs_path *path = NULL;
246 struct extent_buffer *l;
247 int slot;
248 int ret = 0;
249 u64 flags = 0;
250
251 if (!fs_info->quota_enabled)
252 return 0;
253
254 path = btrfs_alloc_path();
255 if (!path) {
256 ret = -ENOMEM;
257 goto out;
258 }
259
260 /* default this to quota off, in case no status key is found */
261 fs_info->qgroup_flags = 0;
262
263 /*
264 * pass 1: read status, all qgroup infos and limits
265 */
266 key.objectid = 0;
267 key.type = 0;
268 key.offset = 0;
269 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
270 if (ret)
271 goto out;
272
273 while (1) {
274 struct btrfs_qgroup *qgroup;
275
276 slot = path->slots[0];
277 l = path->nodes[0];
278 btrfs_item_key_to_cpu(l, &found_key, slot);
279
280 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
281 struct btrfs_qgroup_status_item *ptr;
282
283 ptr = btrfs_item_ptr(l, slot,
284 struct btrfs_qgroup_status_item);
285
286 if (btrfs_qgroup_status_version(l, ptr) !=
287 BTRFS_QGROUP_STATUS_VERSION) {
288 printk(KERN_ERR
289 "btrfs: old qgroup version, quota disabled\n");
290 goto out;
291 }
292 if (btrfs_qgroup_status_generation(l, ptr) !=
293 fs_info->generation) {
294 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
295 printk(KERN_ERR
296 "btrfs: qgroup generation mismatch, "
297 "marked as inconsistent\n");
298 }
299 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
300 ptr);
301 /* FIXME read scan element */
302 goto next1;
303 }
304
305 if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
306 found_key.type != BTRFS_QGROUP_LIMIT_KEY)
307 goto next1;
308
309 qgroup = find_qgroup_rb(fs_info, found_key.offset);
310 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
311 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
312 printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
313 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
314 }
315 if (!qgroup) {
316 qgroup = add_qgroup_rb(fs_info, found_key.offset);
317 if (IS_ERR(qgroup)) {
318 ret = PTR_ERR(qgroup);
319 goto out;
320 }
321 }
322 switch (found_key.type) {
323 case BTRFS_QGROUP_INFO_KEY: {
324 struct btrfs_qgroup_info_item *ptr;
325
326 ptr = btrfs_item_ptr(l, slot,
327 struct btrfs_qgroup_info_item);
328 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
329 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
330 qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
331 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
332 /* generation currently unused */
333 break;
334 }
335 case BTRFS_QGROUP_LIMIT_KEY: {
336 struct btrfs_qgroup_limit_item *ptr;
337
338 ptr = btrfs_item_ptr(l, slot,
339 struct btrfs_qgroup_limit_item);
340 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
341 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
342 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
343 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
344 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
345 break;
346 }
347 }
348next1:
349 ret = btrfs_next_item(quota_root, path);
350 if (ret < 0)
351 goto out;
352 if (ret)
353 break;
354 }
355 btrfs_release_path(path);
356
357 /*
358 * pass 2: read all qgroup relations
359 */
360 key.objectid = 0;
361 key.type = BTRFS_QGROUP_RELATION_KEY;
362 key.offset = 0;
363 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
364 if (ret)
365 goto out;
366 while (1) {
367 slot = path->slots[0];
368 l = path->nodes[0];
369 btrfs_item_key_to_cpu(l, &found_key, slot);
370
371 if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
372 goto next2;
373
374 if (found_key.objectid > found_key.offset) {
375 /* parent <- member, not needed to build config */
376 /* FIXME should we omit the key completely? */
377 goto next2;
378 }
379
380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset);
382 if (ret)
383 goto out;
384next2:
385 ret = btrfs_next_item(quota_root, path);
386 if (ret < 0)
387 goto out;
388 if (ret)
389 break;
390 }
391out:
392 fs_info->qgroup_flags |= flags;
393 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
394 fs_info->quota_enabled = 0;
395 fs_info->pending_quota_state = 0;
396 }
397 btrfs_free_path(path);
398
399 return ret < 0 ? ret : 0;
400}
401
402/*
403 * This is only called from close_ctree() or open_ctree(), both in single-
404 * treaded paths. Clean up the in-memory structures. No locking needed.
405 */
406void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
407{
408 struct rb_node *n;
409 struct btrfs_qgroup *qgroup;
410 struct btrfs_qgroup_list *list;
411
412 while ((n = rb_first(&fs_info->qgroup_tree))) {
413 qgroup = rb_entry(n, struct btrfs_qgroup, node);
414 rb_erase(n, &fs_info->qgroup_tree);
415
416 WARN_ON(!list_empty(&qgroup->dirty));
417
418 while (!list_empty(&qgroup->groups)) {
419 list = list_first_entry(&qgroup->groups,
420 struct btrfs_qgroup_list,
421 next_group);
422 list_del(&list->next_group);
423 list_del(&list->next_member);
424 kfree(list);
425 }
426
427 while (!list_empty(&qgroup->members)) {
428 list = list_first_entry(&qgroup->members,
429 struct btrfs_qgroup_list,
430 next_member);
431 list_del(&list->next_group);
432 list_del(&list->next_member);
433 kfree(list);
434 }
435 kfree(qgroup);
436 }
437}
438
439static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
440 struct btrfs_root *quota_root,
441 u64 src, u64 dst)
442{
443 int ret;
444 struct btrfs_path *path;
445 struct btrfs_key key;
446
447 path = btrfs_alloc_path();
448 if (!path)
449 return -ENOMEM;
450
451 key.objectid = src;
452 key.type = BTRFS_QGROUP_RELATION_KEY;
453 key.offset = dst;
454
455 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
456
457 btrfs_mark_buffer_dirty(path->nodes[0]);
458
459 btrfs_free_path(path);
460 return ret;
461}
462
463static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
464 struct btrfs_root *quota_root,
465 u64 src, u64 dst)
466{
467 int ret;
468 struct btrfs_path *path;
469 struct btrfs_key key;
470
471 path = btrfs_alloc_path();
472 if (!path)
473 return -ENOMEM;
474
475 key.objectid = src;
476 key.type = BTRFS_QGROUP_RELATION_KEY;
477 key.offset = dst;
478
479 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
480 if (ret < 0)
481 goto out;
482
483 if (ret > 0) {
484 ret = -ENOENT;
485 goto out;
486 }
487
488 ret = btrfs_del_item(trans, quota_root, path);
489out:
490 btrfs_free_path(path);
491 return ret;
492}
493
494static int add_qgroup_item(struct btrfs_trans_handle *trans,
495 struct btrfs_root *quota_root, u64 qgroupid)
496{
497 int ret;
498 struct btrfs_path *path;
499 struct btrfs_qgroup_info_item *qgroup_info;
500 struct btrfs_qgroup_limit_item *qgroup_limit;
501 struct extent_buffer *leaf;
502 struct btrfs_key key;
503
504 path = btrfs_alloc_path();
505 if (!path)
506 return -ENOMEM;
507
508 key.objectid = 0;
509 key.type = BTRFS_QGROUP_INFO_KEY;
510 key.offset = qgroupid;
511
512 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
513 sizeof(*qgroup_info));
514 if (ret)
515 goto out;
516
517 leaf = path->nodes[0];
518 qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
519 struct btrfs_qgroup_info_item);
520 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
521 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
522 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
523 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
524 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
525
526 btrfs_mark_buffer_dirty(leaf);
527
528 btrfs_release_path(path);
529
530 key.type = BTRFS_QGROUP_LIMIT_KEY;
531 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
532 sizeof(*qgroup_limit));
533 if (ret)
534 goto out;
535
536 leaf = path->nodes[0];
537 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
538 struct btrfs_qgroup_limit_item);
539 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
540 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
541 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
542 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
543 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
544
545 btrfs_mark_buffer_dirty(leaf);
546
547 ret = 0;
548out:
549 btrfs_free_path(path);
550 return ret;
551}
552
553static int del_qgroup_item(struct btrfs_trans_handle *trans,
554 struct btrfs_root *quota_root, u64 qgroupid)
555{
556 int ret;
557 struct btrfs_path *path;
558 struct btrfs_key key;
559
560 path = btrfs_alloc_path();
561 if (!path)
562 return -ENOMEM;
563
564 key.objectid = 0;
565 key.type = BTRFS_QGROUP_INFO_KEY;
566 key.offset = qgroupid;
567 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
568 if (ret < 0)
569 goto out;
570
571 if (ret > 0) {
572 ret = -ENOENT;
573 goto out;
574 }
575
576 ret = btrfs_del_item(trans, quota_root, path);
577 if (ret)
578 goto out;
579
580 btrfs_release_path(path);
581
582 key.type = BTRFS_QGROUP_LIMIT_KEY;
583 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
584 if (ret < 0)
585 goto out;
586
587 if (ret > 0) {
588 ret = -ENOENT;
589 goto out;
590 }
591
592 ret = btrfs_del_item(trans, quota_root, path);
593
594out:
595 btrfs_free_path(path);
596 return ret;
597}
598
599static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
600 struct btrfs_root *root, u64 qgroupid,
601 u64 flags, u64 max_rfer, u64 max_excl,
602 u64 rsv_rfer, u64 rsv_excl)
603{
604 struct btrfs_path *path;
605 struct btrfs_key key;
606 struct extent_buffer *l;
607 struct btrfs_qgroup_limit_item *qgroup_limit;
608 int ret;
609 int slot;
610
611 key.objectid = 0;
612 key.type = BTRFS_QGROUP_LIMIT_KEY;
613 key.offset = qgroupid;
614
615 path = btrfs_alloc_path();
616 BUG_ON(!path);
617 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
618 if (ret > 0)
619 ret = -ENOENT;
620
621 if (ret)
622 goto out;
623
624 l = path->nodes[0];
625 slot = path->slots[0];
626 qgroup_limit = btrfs_item_ptr(l, path->slots[0],
627 struct btrfs_qgroup_limit_item);
628 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
629 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
630 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
631 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
632 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
633
634 btrfs_mark_buffer_dirty(l);
635
636out:
637 btrfs_free_path(path);
638 return ret;
639}
640
641static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
642 struct btrfs_root *root,
643 struct btrfs_qgroup *qgroup)
644{
645 struct btrfs_path *path;
646 struct btrfs_key key;
647 struct extent_buffer *l;
648 struct btrfs_qgroup_info_item *qgroup_info;
649 int ret;
650 int slot;
651
652 key.objectid = 0;
653 key.type = BTRFS_QGROUP_INFO_KEY;
654 key.offset = qgroup->qgroupid;
655
656 path = btrfs_alloc_path();
657 BUG_ON(!path);
658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
659 if (ret > 0)
660 ret = -ENOENT;
661
662 if (ret)
663 goto out;
664
665 l = path->nodes[0];
666 slot = path->slots[0];
667 qgroup_info = btrfs_item_ptr(l, path->slots[0],
668 struct btrfs_qgroup_info_item);
669 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
670 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
671 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
672 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
673 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
674
675 btrfs_mark_buffer_dirty(l);
676
677out:
678 btrfs_free_path(path);
679 return ret;
680}
681
682static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
683 struct btrfs_fs_info *fs_info,
684 struct btrfs_root *root)
685{
686 struct btrfs_path *path;
687 struct btrfs_key key;
688 struct extent_buffer *l;
689 struct btrfs_qgroup_status_item *ptr;
690 int ret;
691 int slot;
692
693 key.objectid = 0;
694 key.type = BTRFS_QGROUP_STATUS_KEY;
695 key.offset = 0;
696
697 path = btrfs_alloc_path();
698 BUG_ON(!path);
699 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
700 if (ret > 0)
701 ret = -ENOENT;
702
703 if (ret)
704 goto out;
705
706 l = path->nodes[0];
707 slot = path->slots[0];
708 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
709 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
710 btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
711 /* XXX scan */
712
713 btrfs_mark_buffer_dirty(l);
714
715out:
716 btrfs_free_path(path);
717 return ret;
718}
719
720/*
721 * called with qgroup_lock held
722 */
723static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
724 struct btrfs_root *root)
725{
726 struct btrfs_path *path;
727 struct btrfs_key key;
728 int ret;
729
730 if (!root)
731 return -EINVAL;
732
733 path = btrfs_alloc_path();
734 if (!path)
735 return -ENOMEM;
736
737 while (1) {
738 key.objectid = 0;
739 key.offset = 0;
740 key.type = 0;
741
742 path->leave_spinning = 1;
743 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
744 if (ret > 0) {
745 if (path->slots[0] == 0)
746 break;
747 path->slots[0]--;
748 } else if (ret < 0) {
749 break;
750 }
751
752 ret = btrfs_del_item(trans, root, path);
753 if (ret)
754 goto out;
755 btrfs_release_path(path);
756 }
757 ret = 0;
758out:
759 root->fs_info->pending_quota_state = 0;
760 btrfs_free_path(path);
761 return ret;
762}
763
764int btrfs_quota_enable(struct btrfs_trans_handle *trans,
765 struct btrfs_fs_info *fs_info)
766{
767 struct btrfs_root *quota_root;
768 struct btrfs_path *path = NULL;
769 struct btrfs_qgroup_status_item *ptr;
770 struct extent_buffer *leaf;
771 struct btrfs_key key;
772 int ret = 0;
773
774 spin_lock(&fs_info->qgroup_lock);
775 if (fs_info->quota_root) {
776 fs_info->pending_quota_state = 1;
777 spin_unlock(&fs_info->qgroup_lock);
778 goto out;
779 }
780 spin_unlock(&fs_info->qgroup_lock);
781
782 /*
783 * initially create the quota tree
784 */
785 quota_root = btrfs_create_tree(trans, fs_info,
786 BTRFS_QUOTA_TREE_OBJECTID);
787 if (IS_ERR(quota_root)) {
788 ret = PTR_ERR(quota_root);
789 goto out;
790 }
791
792 path = btrfs_alloc_path();
793 if (!path)
794 return -ENOMEM;
795
796 key.objectid = 0;
797 key.type = BTRFS_QGROUP_STATUS_KEY;
798 key.offset = 0;
799
800 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
801 sizeof(*ptr));
802 if (ret)
803 goto out;
804
805 leaf = path->nodes[0];
806 ptr = btrfs_item_ptr(leaf, path->slots[0],
807 struct btrfs_qgroup_status_item);
808 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
809 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
810 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
811 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
812 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
813 btrfs_set_qgroup_status_scan(leaf, ptr, 0);
814
815 btrfs_mark_buffer_dirty(leaf);
816
817 spin_lock(&fs_info->qgroup_lock);
818 fs_info->quota_root = quota_root;
819 fs_info->pending_quota_state = 1;
820 spin_unlock(&fs_info->qgroup_lock);
821out:
822 btrfs_free_path(path);
823 return ret;
824}
825
/*
 * Disable quota accounting: detach and free the in-memory qgroup config,
 * then empty the quota tree on disk, delete its root item and free the
 * now-unused tree root block.
 *
 * Returns 0 on success, -EINVAL if quota was not enabled, or a negative
 * errno from the on-disk cleanup.
 */
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *quota_root;
	int ret = 0;

	/*
	 * Clear the enabled flags and detach the quota root under the lock,
	 * so concurrent readers see a consistent "quota off" state before we
	 * start tearing down the on-disk tree.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_enabled = 0;
	fs_info->pending_quota_state = 0;
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	btrfs_free_qgroup_config(fs_info);
	spin_unlock(&fs_info->qgroup_lock);

	if (!quota_root)
		return -EINVAL;

	/* remove every item from the quota tree */
	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret)
		goto out;

	/* drop the quota tree's root item from the tree of tree roots */
	ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
	if (ret)
		goto out;

	list_del(&quota_root->dirty_list);

	/* the (now empty) root node itself can be freed */
	btrfs_tree_lock(quota_root->node);
	clean_tree_block(trans, tree_root, quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);

	free_extent_buffer(quota_root->node);
	free_extent_buffer(quota_root->commit_root);
	kfree(quota_root);
out:
	return ret;
}
865
/*
 * Trigger a rescan of all qgroup accounting data.
 *
 * Not implemented yet; returns 0 so callers treat it as a successful
 * no-op.
 */
int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
{
	/* FIXME */
	return 0;
}
871
872int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
873 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
874{
875 struct btrfs_root *quota_root;
876 int ret = 0;
877
878 quota_root = fs_info->quota_root;
879 if (!quota_root)
880 return -EINVAL;
881
882 ret = add_qgroup_relation_item(trans, quota_root, src, dst);
883 if (ret)
884 return ret;
885
886 ret = add_qgroup_relation_item(trans, quota_root, dst, src);
887 if (ret) {
888 del_qgroup_relation_item(trans, quota_root, src, dst);
889 return ret;
890 }
891
892 spin_lock(&fs_info->qgroup_lock);
893 ret = add_relation_rb(quota_root->fs_info, src, dst);
894 spin_unlock(&fs_info->qgroup_lock);
895
896 return ret;
897}
898
899int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
900 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
901{
902 struct btrfs_root *quota_root;
903 int ret = 0;
904 int err;
905
906 quota_root = fs_info->quota_root;
907 if (!quota_root)
908 return -EINVAL;
909
910 ret = del_qgroup_relation_item(trans, quota_root, src, dst);
911 err = del_qgroup_relation_item(trans, quota_root, dst, src);
912 if (err && !ret)
913 ret = err;
914
915 spin_lock(&fs_info->qgroup_lock);
916 del_relation_rb(fs_info, src, dst);
917
918 spin_unlock(&fs_info->qgroup_lock);
919
920 return ret;
921}
922
923int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
924 struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
925{
926 struct btrfs_root *quota_root;
927 struct btrfs_qgroup *qgroup;
928 int ret = 0;
929
930 quota_root = fs_info->quota_root;
931 if (!quota_root)
932 return -EINVAL;
933
934 ret = add_qgroup_item(trans, quota_root, qgroupid);
935
936 spin_lock(&fs_info->qgroup_lock);
937 qgroup = add_qgroup_rb(fs_info, qgroupid);
938 spin_unlock(&fs_info->qgroup_lock);
939
940 if (IS_ERR(qgroup))
941 ret = PTR_ERR(qgroup);
942
943 return ret;
944}
945
/*
 * Delete a qgroup: remove its items from the quota tree and drop the
 * in-memory rb-tree entry.
 *
 * NOTE(review): the return value of del_qgroup_item() is propagated, but
 * the rb entry is removed even when the on-disk deletion failed — confirm
 * this best-effort behavior is intended.
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_root *quota_root;
	int ret = 0;

	quota_root = fs_info->quota_root;
	if (!quota_root)
		return -EINVAL;

	ret = del_qgroup_item(trans, quota_root, qgroupid);

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(quota_root->fs_info, qgroupid);

	spin_unlock(&fs_info->qgroup_lock);

	return ret;
}
965
/*
 * Set the limits for a qgroup: write the limit item to the quota tree and
 * mirror the values into the in-memory qgroup.
 *
 * On item-update failure the qgroup status is marked inconsistent but the
 * in-memory limits are still applied. Returns 0, -EINVAL if quota is not
 * enabled, or -ENOENT if the qgroup does not exist in memory.
 */
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_qgroup *qgroup;
	int ret = 0;

	if (!quota_root)
		return -EINVAL;

	/* persist the new limits in the quota tree */
	ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
				       limit->flags, limit->max_rfer,
				       limit->max_excl, limit->rsv_rfer,
				       limit->rsv_excl);
	if (ret) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		printk(KERN_INFO "unable to update quota limit for %llu\n",
		       (unsigned long long)qgroupid);
	}

	spin_lock(&fs_info->qgroup_lock);

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto unlock;
	}
	/* mirror the on-disk limits into the in-memory qgroup */
	qgroup->lim_flags = limit->flags;
	qgroup->max_rfer = limit->max_rfer;
	qgroup->max_excl = limit->max_excl;
	qgroup->rsv_rfer = limit->rsv_rfer;
	qgroup->rsv_excl = limit->rsv_excl;

unlock:
	spin_unlock(&fs_info->qgroup_lock);

	return ret;
}
1005
1006static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1007 struct btrfs_qgroup *qgroup)
1008{
1009 if (list_empty(&qgroup->dirty))
1010 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1011}
1012
1013/*
1014 * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
1015 * the modification into a list that's later used by btrfs_end_transaction to
1016 * pass the recorded modifications on to btrfs_qgroup_account_ref.
1017 */
1018int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_delayed_ref_node *node,
1020 struct btrfs_delayed_extent_op *extent_op)
1021{
1022 struct qgroup_update *u;
1023
1024 BUG_ON(!trans->delayed_ref_elem.seq);
1025 u = kmalloc(sizeof(*u), GFP_NOFS);
1026 if (!u)
1027 return -ENOMEM;
1028
1029 u->node = node;
1030 u->extent_op = extent_op;
1031 list_add_tail(&u->list, &trans->qgroup_ref_list);
1032
1033 return 0;
1034}
1035
1036/*
1037 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1038 * from the fs. First, all roots referencing the extent are searched, and
1039 * then the space is accounted accordingly to the different roots. The
1040 * accounting algorithm works in 3 steps documented inline.
1041 */
/*
 * Account one delayed ref (add or drop) to the qgroups of all roots that
 * reference the extent. The three accounting steps are documented inline.
 *
 * Returns 0 on success (including "nothing to account") or a negative
 * errno.
 */
int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info,
			     struct btrfs_delayed_ref_node *node,
			     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_key ins;
	struct btrfs_root *quota_root;
	u64 ref_root;
	struct btrfs_qgroup *qgroup;
	struct ulist_node *unode;
	struct ulist *roots = NULL;
	struct ulist *tmp = NULL;
	struct ulist_iterator uiter;
	u64 seq;
	int ret = 0;
	int sgn;

	if (!fs_info->quota_enabled)
		return 0;

	BUG_ON(!fs_info->quota_root);

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	/* extract the root this ref points into, for tree and data refs */
	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		struct btrfs_delayed_tree_ref *ref;
		ref = btrfs_delayed_node_to_tree_ref(node);
		ref_root = ref->root;
	} else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		   node->type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_delayed_data_ref *ref;
		ref = btrfs_delayed_node_to_data_ref(node);
		ref_root = ref->root;
	} else {
		BUG();
	}

	if (!is_fstree(ref_root)) {
		/*
		 * non-fs-trees are not being accounted
		 */
		return 0;
	}

	/* sgn is +1 for added references and -1 for dropped ones */
	switch (node->action) {
	case BTRFS_ADD_DELAYED_REF:
	case BTRFS_ADD_DELAYED_EXTENT:
		sgn = 1;
		break;
	case BTRFS_DROP_DELAYED_REF:
		sgn = -1;
		break;
	case BTRFS_UPDATE_DELAYED_HEAD:
		return 0;
	default:
		BUG();
	}

	/*
	 * the delayed ref sequence number we pass depends on the direction of
	 * the operation. for add operations, we pass (node->seq - 1) to skip
	 * the delayed ref's current sequence number, because we need the state
	 * of the tree before the add operation. for delete operations, we pass
	 * (node->seq) to include the delayed ref's current sequence number,
	 * because we need the state of the tree after the delete operation.
	 */
	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
				   sgn > 0 ? node->seq - 1 : node->seq, &roots);
	if (ret < 0)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	if (!quota_root)
		goto unlock;

	/* the qgroup of the root this ref belongs to */
	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto unlock;

	/*
	 * step 1: for each old ref, visit all nodes once and inc refcnt
	 */
	tmp = ulist_alloc(GFP_ATOMIC);
	if (!tmp) {
		ret = -ENOMEM;
		goto unlock;
	}
	/* reserve a window of seq numbers; refcnt/tag comparisons below use it */
	seq = fs_info->qgroup_seq;
	fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(roots, &uiter))) {
		struct ulist_node *tmp_unode;
		struct ulist_iterator tmp_uiter;
		struct btrfs_qgroup *qg;

		qg = find_qgroup_rb(fs_info, unode->val);
		if (!qg)
			continue;

		ulist_reinit(tmp);
		/* XXX id not needed */
		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
		ULIST_ITER_INIT(&tmp_uiter);
		/* walk this qgroup and all its ancestors */
		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
			struct btrfs_qgroup_list *glist;

			qg = (struct btrfs_qgroup *)tmp_unode->aux;
			/* first visit in this round starts at seq + 1 */
			if (qg->refcnt < seq)
				qg->refcnt = seq + 1;
			else
				++qg->refcnt;

			list_for_each_entry(glist, &qg->groups, next_group) {
				ulist_add(tmp, glist->group->qgroupid,
					  (unsigned long)glist->group,
					  GFP_ATOMIC);
			}
		}
	}

	/*
	 * step 2: walk from the new root
	 */
	ulist_reinit(tmp);
	ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(tmp, &uiter))) {
		struct btrfs_qgroup *qg;
		struct btrfs_qgroup_list *glist;

		qg = (struct btrfs_qgroup *)unode->aux;
		if (qg->refcnt < seq) {
			/* not visited by step 1 */
			qg->rfer += sgn * node->num_bytes;
			qg->rfer_cmpr += sgn * node->num_bytes;
			/* no other roots reference the extent: it is exclusive */
			if (roots->nnodes == 0) {
				qg->excl += sgn * node->num_bytes;
				qg->excl_cmpr += sgn * node->num_bytes;
			}
			qgroup_dirty(fs_info, qg);
		}
		/* tag marks groups reached from the new root, for step 3 */
		WARN_ON(qg->tag >= seq);
		qg->tag = seq;

		list_for_each_entry(glist, &qg->groups, next_group) {
			ulist_add(tmp, glist->group->qgroupid,
				  (unsigned long)glist->group, GFP_ATOMIC);
		}
	}

	/*
	 * step 3: walk again from old refs
	 */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(roots, &uiter))) {
		struct btrfs_qgroup *qg;
		struct ulist_node *tmp_unode;
		struct ulist_iterator tmp_uiter;

		qg = find_qgroup_rb(fs_info, unode->val);
		if (!qg)
			continue;

		ulist_reinit(tmp);
		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
		ULIST_ITER_INIT(&tmp_uiter);
		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
			struct btrfs_qgroup_list *glist;

			qg = (struct btrfs_qgroup *)tmp_unode->aux;
			/* skip groups already handled via the new root */
			if (qg->tag == seq)
				continue;

			/* visited by every old root: extent was exclusive */
			if (qg->refcnt - seq == roots->nnodes) {
				qg->excl -= sgn * node->num_bytes;
				qg->excl_cmpr -= sgn * node->num_bytes;
				qgroup_dirty(fs_info, qg);
			}

			list_for_each_entry(glist, &qg->groups, next_group) {
				ulist_add(tmp, glist->group->qgroupid,
					  (unsigned long)glist->group,
					  GFP_ATOMIC);
			}
		}
	}
	ret = 0;
unlock:
	spin_unlock(&fs_info->qgroup_lock);
out:
	ulist_free(roots);
	ulist_free(tmp);

	return ret;
}
1242
1243/*
1244 * called from commit_transaction. Writes all changed qgroups to disk.
1245 */
/*
 * Write all dirty qgroup info items and the status item to the quota
 * tree. Returns 0 or the last error from the status-item update; info-
 * item failures only mark the status inconsistent.
 */
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root = fs_info->quota_root;
	int ret = 0;

	if (!quota_root)
		goto out;

	/* a pending enable/disable takes effect at commit time */
	fs_info->quota_enabled = fs_info->pending_quota_state;

	spin_lock(&fs_info->qgroup_lock);
	while (!list_empty(&fs_info->dirty_qgroups)) {
		struct btrfs_qgroup *qgroup;
		qgroup = list_first_entry(&fs_info->dirty_qgroups,
					  struct btrfs_qgroup, dirty);
		list_del_init(&qgroup->dirty);
		/* drop the spinlock around the tree update, which may block */
		spin_unlock(&fs_info->qgroup_lock);
		ret = update_qgroup_info_item(trans, quota_root, qgroup);
		if (ret)
			fs_info->qgroup_flags |=
					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		spin_lock(&fs_info->qgroup_lock);
	}
	if (fs_info->quota_enabled)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
	else
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_status_item(trans, fs_info, quota_root);
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;

out:

	return ret;
}
1284
/*
 * Copy the accounting information between qgroups. This is necessary
 * when a snapshot or a subvolume is created.
 */
1289int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1290 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
1291 struct btrfs_qgroup_inherit *inherit)
1292{
1293 int ret = 0;
1294 int i;
1295 u64 *i_qgroups;
1296 struct btrfs_root *quota_root = fs_info->quota_root;
1297 struct btrfs_qgroup *srcgroup;
1298 struct btrfs_qgroup *dstgroup;
1299 u32 level_size = 0;
1300
1301 if (!fs_info->quota_enabled)
1302 return 0;
1303
1304 if (!quota_root)
1305 return -EINVAL;
1306
1307 /*
1308 * create a tracking group for the subvol itself
1309 */
1310 ret = add_qgroup_item(trans, quota_root, objectid);
1311 if (ret)
1312 goto out;
1313
1314 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
1315 ret = update_qgroup_limit_item(trans, quota_root, objectid,
1316 inherit->lim.flags,
1317 inherit->lim.max_rfer,
1318 inherit->lim.max_excl,
1319 inherit->lim.rsv_rfer,
1320 inherit->lim.rsv_excl);
1321 if (ret)
1322 goto out;
1323 }
1324
1325 if (srcid) {
1326 struct btrfs_root *srcroot;
1327 struct btrfs_key srckey;
1328 int srcroot_level;
1329
1330 srckey.objectid = srcid;
1331 srckey.type = BTRFS_ROOT_ITEM_KEY;
1332 srckey.offset = (u64)-1;
1333 srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
1334 if (IS_ERR(srcroot)) {
1335 ret = PTR_ERR(srcroot);
1336 goto out;
1337 }
1338
1339 rcu_read_lock();
1340 srcroot_level = btrfs_header_level(srcroot->node);
1341 level_size = btrfs_level_size(srcroot, srcroot_level);
1342 rcu_read_unlock();
1343 }
1344
1345 /*
1346 * add qgroup to all inherited groups
1347 */
1348 if (inherit) {
1349 i_qgroups = (u64 *)(inherit + 1);
1350 for (i = 0; i < inherit->num_qgroups; ++i) {
1351 ret = add_qgroup_relation_item(trans, quota_root,
1352 objectid, *i_qgroups);
1353 if (ret)
1354 goto out;
1355 ret = add_qgroup_relation_item(trans, quota_root,
1356 *i_qgroups, objectid);
1357 if (ret)
1358 goto out;
1359 ++i_qgroups;
1360 }
1361 }
1362
1363
1364 spin_lock(&fs_info->qgroup_lock);
1365
1366 dstgroup = add_qgroup_rb(fs_info, objectid);
1367 if (!dstgroup)
1368 goto unlock;
1369
1370 if (srcid) {
1371 srcgroup = find_qgroup_rb(fs_info, srcid);
1372 if (!srcgroup)
1373 goto unlock;
1374 dstgroup->rfer = srcgroup->rfer - level_size;
1375 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
1376 srcgroup->excl = level_size;
1377 srcgroup->excl_cmpr = level_size;
1378 qgroup_dirty(fs_info, dstgroup);
1379 qgroup_dirty(fs_info, srcgroup);
1380 }
1381
1382 if (!inherit)
1383 goto unlock;
1384
1385 i_qgroups = (u64 *)(inherit + 1);
1386 for (i = 0; i < inherit->num_qgroups; ++i) {
1387 ret = add_relation_rb(quota_root->fs_info, objectid,
1388 *i_qgroups);
1389 if (ret)
1390 goto unlock;
1391 ++i_qgroups;
1392 }
1393
1394 for (i = 0; i < inherit->num_ref_copies; ++i) {
1395 struct btrfs_qgroup *src;
1396 struct btrfs_qgroup *dst;
1397
1398 src = find_qgroup_rb(fs_info, i_qgroups[0]);
1399 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
1400
1401 if (!src || !dst) {
1402 ret = -EINVAL;
1403 goto unlock;
1404 }
1405
1406 dst->rfer = src->rfer - level_size;
1407 dst->rfer_cmpr = src->rfer_cmpr - level_size;
1408 i_qgroups += 2;
1409 }
1410 for (i = 0; i < inherit->num_excl_copies; ++i) {
1411 struct btrfs_qgroup *src;
1412 struct btrfs_qgroup *dst;
1413
1414 src = find_qgroup_rb(fs_info, i_qgroups[0]);
1415 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
1416
1417 if (!src || !dst) {
1418 ret = -EINVAL;
1419 goto unlock;
1420 }
1421
1422 dst->excl = src->excl + level_size;
1423 dst->excl_cmpr = src->excl_cmpr + level_size;
1424 i_qgroups += 2;
1425 }
1426
1427unlock:
1428 spin_unlock(&fs_info->qgroup_lock);
1429out:
1430 return ret;
1431}
1432
1433/*
1434 * reserve some space for a qgroup and all its parents. The reservation takes
1435 * place with start_transaction or dealloc_reserve, similar to ENOSPC
1436 * accounting. If not enough space is available, EDQUOT is returned.
1437 * We assume that the requested space is new for all qgroups.
1438 */
1439int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1440{
1441 struct btrfs_root *quota_root;
1442 struct btrfs_qgroup *qgroup;
1443 struct btrfs_fs_info *fs_info = root->fs_info;
1444 u64 ref_root = root->root_key.objectid;
1445 int ret = 0;
1446 struct ulist *ulist = NULL;
1447 struct ulist_node *unode;
1448 struct ulist_iterator uiter;
1449
1450 if (!is_fstree(ref_root))
1451 return 0;
1452
1453 if (num_bytes == 0)
1454 return 0;
1455
1456 spin_lock(&fs_info->qgroup_lock);
1457 quota_root = fs_info->quota_root;
1458 if (!quota_root)
1459 goto out;
1460
1461 qgroup = find_qgroup_rb(fs_info, ref_root);
1462 if (!qgroup)
1463 goto out;
1464
1465 /*
1466 * in a first step, we check all affected qgroups if any limits would
1467 * be exceeded
1468 */
1469 ulist = ulist_alloc(GFP_ATOMIC);
1470 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1471 ULIST_ITER_INIT(&uiter);
1472 while ((unode = ulist_next(ulist, &uiter))) {
1473 struct btrfs_qgroup *qg;
1474 struct btrfs_qgroup_list *glist;
1475
1476 qg = (struct btrfs_qgroup *)unode->aux;
1477
1478 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1479 qg->reserved + qg->rfer + num_bytes >
1480 qg->max_rfer)
1481 ret = -EDQUOT;
1482
1483 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
1484 qg->reserved + qg->excl + num_bytes >
1485 qg->max_excl)
1486 ret = -EDQUOT;
1487
1488 list_for_each_entry(glist, &qg->groups, next_group) {
1489 ulist_add(ulist, glist->group->qgroupid,
1490 (unsigned long)glist->group, GFP_ATOMIC);
1491 }
1492 }
1493 if (ret)
1494 goto out;
1495
1496 /*
1497 * no limits exceeded, now record the reservation into all qgroups
1498 */
1499 ULIST_ITER_INIT(&uiter);
1500 while ((unode = ulist_next(ulist, &uiter))) {
1501 struct btrfs_qgroup *qg;
1502
1503 qg = (struct btrfs_qgroup *)unode->aux;
1504
1505 qg->reserved += num_bytes;
1506 }
1507
1508out:
1509 spin_unlock(&fs_info->qgroup_lock);
1510 ulist_free(ulist);
1511
1512 return ret;
1513}
1514
1515void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1516{
1517 struct btrfs_root *quota_root;
1518 struct btrfs_qgroup *qgroup;
1519 struct btrfs_fs_info *fs_info = root->fs_info;
1520 struct ulist *ulist = NULL;
1521 struct ulist_node *unode;
1522 struct ulist_iterator uiter;
1523 u64 ref_root = root->root_key.objectid;
1524
1525 if (!is_fstree(ref_root))
1526 return;
1527
1528 if (num_bytes == 0)
1529 return;
1530
1531 spin_lock(&fs_info->qgroup_lock);
1532
1533 quota_root = fs_info->quota_root;
1534 if (!quota_root)
1535 goto out;
1536
1537 qgroup = find_qgroup_rb(fs_info, ref_root);
1538 if (!qgroup)
1539 goto out;
1540
1541 ulist = ulist_alloc(GFP_ATOMIC);
1542 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
1543 ULIST_ITER_INIT(&uiter);
1544 while ((unode = ulist_next(ulist, &uiter))) {
1545 struct btrfs_qgroup *qg;
1546 struct btrfs_qgroup_list *glist;
1547
1548 qg = (struct btrfs_qgroup *)unode->aux;
1549
1550 qg->reserved -= num_bytes;
1551
1552 list_for_each_entry(glist, &qg->groups, next_group) {
1553 ulist_add(ulist, glist->group->qgroupid,
1554 (unsigned long)glist->group, GFP_ATOMIC);
1555 }
1556 }
1557
1558out:
1559 spin_unlock(&fs_info->qgroup_lock);
1560 ulist_free(ulist);
1561}
1562
1563void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1564{
1565 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1566 return;
1567 printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
1568 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1569 trans->delayed_ref_elem.seq);
1570 BUG();
1571}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 646ee21bb035..4da08652004d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1239,10 +1239,11 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
1239 node->bytenr, &node->rb_node); 1239 node->bytenr, &node->rb_node);
1240 spin_unlock(&rc->reloc_root_tree.lock); 1240 spin_unlock(&rc->reloc_root_tree.lock);
1241 if (rb_node) { 1241 if (rb_node) {
1242 kfree(node);
1243 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " 1242 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1244 "for start=%llu while inserting into relocation " 1243 "for start=%llu while inserting into relocation "
1245 "tree\n"); 1244 "tree\n", node->bytenr);
1245 kfree(node);
1246 return -EEXIST;
1246 } 1247 }
1247 1248
1248 list_add_tail(&root->root_list, &rc->reloc_roots); 1249 list_add_tail(&root->root_list, &rc->reloc_roots);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 24fb8ce4e071..6bb465cca20f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,12 +16,55 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/uuid.h>
19#include "ctree.h" 20#include "ctree.h"
20#include "transaction.h" 21#include "transaction.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
24/* 25/*
26 * Read a root item from the tree. In case we detect a root item smaller then
27 * sizeof(root_item), we know it's an old version of the root structure and
28 * initialize all new fields to zero. The same happens if we detect mismatching
29 * generation numbers as then we know the root was once mounted with an older
30 * kernel that was not aware of the root item structure change.
31 */
32void btrfs_read_root_item(struct btrfs_root *root,
33 struct extent_buffer *eb, int slot,
34 struct btrfs_root_item *item)
35{
36 uuid_le uuid;
37 int len;
38 int need_reset = 0;
39
40 len = btrfs_item_size_nr(eb, slot);
41 read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
42 min_t(int, len, (int)sizeof(*item)));
43 if (len < sizeof(*item))
44 need_reset = 1;
45 if (!need_reset && btrfs_root_generation(item)
46 != btrfs_root_generation_v2(item)) {
47 if (btrfs_root_generation_v2(item) != 0) {
48 printk(KERN_WARNING "btrfs: mismatching "
49 "generation and generation_v2 "
50 "found in root item. This root "
51 "was probably mounted with an "
52 "older kernel. Resetting all "
53 "new fields.\n");
54 }
55 need_reset = 1;
56 }
57 if (need_reset) {
58 memset(&item->generation_v2, 0,
59 sizeof(*item) - offsetof(struct btrfs_root_item,
60 generation_v2));
61
62 uuid_le_gen(&uuid);
63 memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
64 }
65}
66
67/*
25 * lookup the root with the highest offset for a given objectid. The key we do 68 * lookup the root with the highest offset for a given objectid. The key we do
26 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 69 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
27 * on error. 70 * on error.
@@ -61,10 +104,10 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
61 goto out; 104 goto out;
62 } 105 }
63 if (item) 106 if (item)
64 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), 107 btrfs_read_root_item(root, l, slot, item);
65 sizeof(*item));
66 if (key) 108 if (key)
67 memcpy(key, &found_key, sizeof(found_key)); 109 memcpy(key, &found_key, sizeof(found_key));
110
68 ret = 0; 111 ret = 0;
69out: 112out:
70 btrfs_free_path(path); 113 btrfs_free_path(path);
@@ -91,16 +134,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
91 int ret; 134 int ret;
92 int slot; 135 int slot;
93 unsigned long ptr; 136 unsigned long ptr;
137 int old_len;
94 138
95 path = btrfs_alloc_path(); 139 path = btrfs_alloc_path();
96 if (!path) 140 if (!path)
97 return -ENOMEM; 141 return -ENOMEM;
98 142
99 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
100 if (ret < 0) { 144 if (ret < 0)
101 btrfs_abort_transaction(trans, root, ret); 145 goto out_abort;
102 goto out;
103 }
104 146
105 if (ret != 0) { 147 if (ret != 0) {
106 btrfs_print_leaf(root, path->nodes[0]); 148 btrfs_print_leaf(root, path->nodes[0]);
@@ -113,16 +155,56 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
113 l = path->nodes[0]; 155 l = path->nodes[0];
114 slot = path->slots[0]; 156 slot = path->slots[0];
115 ptr = btrfs_item_ptr_offset(l, slot); 157 ptr = btrfs_item_ptr_offset(l, slot);
158 old_len = btrfs_item_size_nr(l, slot);
159
160 /*
161 * If this is the first time we update the root item which originated
162 * from an older kernel, we need to enlarge the item size to make room
163 * for the added fields.
164 */
165 if (old_len < sizeof(*item)) {
166 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1);
169 if (ret < 0)
170 goto out_abort;
171 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0)
173 goto out_abort;
174 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item));
177 if (ret < 0)
178 goto out_abort;
179 l = path->nodes[0];
180 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot);
182 }
183
184 /*
185 * Update generation_v2 so at the next mount we know the new root
186 * fields are valid.
187 */
188 btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
189
116 write_extent_buffer(l, item, ptr, sizeof(*item)); 190 write_extent_buffer(l, item, ptr, sizeof(*item));
117 btrfs_mark_buffer_dirty(path->nodes[0]); 191 btrfs_mark_buffer_dirty(path->nodes[0]);
118out: 192out:
119 btrfs_free_path(path); 193 btrfs_free_path(path);
120 return ret; 194 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
121} 199}
122 200
123int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
124 struct btrfs_key *key, struct btrfs_root_item *item) 202 struct btrfs_key *key, struct btrfs_root_item *item)
125{ 203{
204 /*
205 * Make sure generation v1 and v2 match. See update_root for details.
206 */
207 btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
126 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 208 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
127} 209}
128 210
@@ -454,3 +536,16 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
454 root_item->byte_limit = 0; 536 root_item->byte_limit = 0;
455 } 537 }
456} 538}
539
540void btrfs_update_root_times(struct btrfs_trans_handle *trans,
541 struct btrfs_root *root)
542{
543 struct btrfs_root_item *item = &root->root_item;
544 struct timespec ct = CURRENT_TIME;
545
546 spin_lock(&root->root_times_lock);
547 item->ctransid = trans->transid;
548 item->ctime.sec = cpu_to_le64(ct.tv_sec);
549 item->ctime.nsec = cpu_to_le64(ct.tv_nsec);
550 spin_unlock(&root->root_times_lock);
551}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 000000000000..fb5ffe95f869
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,4572 @@
1/*
2 * Copyright (C) 2012 Alexander Block. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bsearch.h>
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/sort.h>
23#include <linux/mount.h>
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26#include <linux/radix-tree.h>
27#include <linux/crc32c.h>
28#include <linux/vmalloc.h>
29
30#include "send.h"
31#include "backref.h"
32#include "locking.h"
33#include "disk-io.h"
34#include "btrfs_inode.h"
35#include "transaction.h"
36
/* Set to non-zero for noisy debugging output while building the stream. */
static int g_verbose = 0;

#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__)
40
41/*
42 * A fs_path is a helper to dynamically build path names with unknown size.
43 * It reallocates the internal buffer on demand.
44 * It allows fast adding of path elements on the right side (normal path) and
45 * fast adding to the left side (reversed path). A reversed path can also be
46 * unreversed if needed.
47 */
struct fs_path {
	union {
		struct {
			char *start;		/* first char of the built name */
			char *end;		/* points at the terminating NUL */
			char *prepared;		/* scratch: where the next component goes */

			char *buf;		/* current backing buffer */
			int buf_len;		/* size of buf in bytes */
			/*
			 * NOTE(review): signed 1-bit bitfields hold 0/-1;
			 * both are only ever tested for truth here, so this
			 * works, but unsigned would be cleaner.
			 */
			int reversed:1;		/* path grows right-to-left */
			int virtual_mem:1;	/* buf came from vmalloc, not kmalloc */
			char inline_buf[];	/* small-path storage inside pad[] */
		};
		/* the union pads the struct so inline_buf fills one page */
		char pad[PAGE_SIZE];
	};
};
/* bytes of inline_buf available before we have to go to the heap */
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
66
67
/* reused for each extent */
struct clone_root {
	struct btrfs_root *root;	/* candidate clone source root */
	u64 ino;			/* best inode found in this root */
	u64 offset;			/* file offset of the found extent */

	u64 found_refs;			/* how many refs to the extent were seen */
};

/* shrink the name cache back to CLEAN_SIZE once it exceeds MAX */
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
79
/* All state for one send operation lives here. */
struct send_ctx {
	struct file *send_filp;		/* stream is written to this file */
	loff_t send_off;		/* current write offset in send_filp */
	char *send_buf;			/* staging buffer for one command */
	u32 send_size;			/* bytes currently staged in send_buf */
	u32 send_max_size;		/* capacity of send_buf */
	u64 total_send_size;		/* stats: bytes sent overall */
	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];	/* stats per command */

	struct vfsmount *mnt;

	struct btrfs_root *send_root;	/* snapshot being sent */
	struct btrfs_root *parent_root;	/* base snapshot for incremental send */
	struct clone_roots *clone_roots;
	int clone_roots_cnt;

	/* current state of the compare_tree call */
	struct btrfs_path *left_path;
	struct btrfs_path *right_path;
	struct btrfs_key *cmp_key;

	/*
	 * infos of the currently processed inode. In case of deleted inodes,
	 * these are the values from the deleted inode.
	 */
	u64 cur_ino;
	u64 cur_inode_gen;
	int cur_inode_new;
	int cur_inode_new_gen;
	int cur_inode_deleted;
	int cur_inode_first_ref_orphan;
	u64 cur_inode_size;
	u64 cur_inode_mode;

	u64 send_progress;		/* inodes below this are fully processed */

	struct list_head new_refs;	/* refs added relative to parent */
	struct list_head deleted_refs;	/* refs removed relative to parent */

	/* cache of already resolved (ino,gen) -> name lookups */
	struct radix_tree_root name_cache;
	struct list_head name_cache_list;
	int name_cache_size;

	struct file *cur_inode_filp;	/* open file for reading extent data */
	char *read_buf;			/* buffer for send_write payloads */
};
126
/* One cached result of a (ino,gen) -> (parent,name) resolution. */
struct name_cache_entry {
	struct list_head list;		/* radix tree bucket linkage */
	struct list_head use_list;	/* LRU list for cache cleaning */
	u64 ino;			/* inode this name belongs to */
	u64 gen;			/* generation of that inode */
	u64 parent_ino;			/* directory that holds the name */
	u64 parent_gen;
	int ret;			/* cached return code of the lookup */
	int need_later_update;		/* name may change once ino is processed */
	int name_len;
	char name[];			/* name bytes, not NUL terminated */
};
139
140static void fs_path_reset(struct fs_path *p)
141{
142 if (p->reversed) {
143 p->start = p->buf + p->buf_len - 1;
144 p->end = p->start;
145 *p->start = 0;
146 } else {
147 p->start = p->buf;
148 p->end = p->start;
149 *p->start = 0;
150 }
151}
152
153static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
154{
155 struct fs_path *p;
156
157 p = kmalloc(sizeof(*p), GFP_NOFS);
158 if (!p)
159 return NULL;
160 p->reversed = 0;
161 p->virtual_mem = 0;
162 p->buf = p->inline_buf;
163 p->buf_len = FS_PATH_INLINE_SIZE;
164 fs_path_reset(p);
165 return p;
166}
167
168static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
169{
170 struct fs_path *p;
171
172 p = fs_path_alloc(sctx);
173 if (!p)
174 return NULL;
175 p->reversed = 1;
176 fs_path_reset(p);
177 return p;
178}
179
180static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
181{
182 if (!p)
183 return;
184 if (p->buf != p->inline_buf) {
185 if (p->virtual_mem)
186 vfree(p->buf);
187 else
188 kfree(p->buf);
189 }
190 kfree(p);
191}
192
193static int fs_path_len(struct fs_path *p)
194{
195 return p->end - p->start;
196}
197
/*
 * Grow the path buffer so it can hold at least len characters plus the
 * terminating NUL. Promotes from the inline buffer to kmalloc, falling
 * back to vmalloc when kmalloc fails. The already built path is preserved;
 * for reversed paths it is relocated to the end of the new buffer.
 * Returns 0 on success, -ENOMEM on allocation failure (in which case the
 * old buffer and contents stay valid).
 */
static int fs_path_ensure_buf(struct fs_path *p, int len)
{
	char *tmp_buf;
	int path_len;
	int old_buf_len;

	len++;	/* room for the terminating NUL */

	if (p->buf_len >= len)
		return 0;

	path_len = p->end - p->start;
	old_buf_len = p->buf_len;
	len = PAGE_ALIGN(len);

	if (p->buf == p->inline_buf) {
		/* first spill from the inline buffer to the heap */
		tmp_buf = kmalloc(len, GFP_NOFS);
		if (!tmp_buf) {
			tmp_buf = vmalloc(len);
			if (!tmp_buf)
				return -ENOMEM;
			p->virtual_mem = 1;
		}
		memcpy(tmp_buf, p->buf, p->buf_len);
		p->buf = tmp_buf;
		p->buf_len = len;
	} else {
		if (p->virtual_mem) {
			/* no vrealloc: allocate, copy, free by hand */
			tmp_buf = vmalloc(len);
			if (!tmp_buf)
				return -ENOMEM;
			memcpy(tmp_buf, p->buf, p->buf_len);
			vfree(p->buf);
		} else {
			tmp_buf = krealloc(p->buf, len, GFP_NOFS);
			if (!tmp_buf) {
				/* kmalloc exhausted, degrade to vmalloc */
				tmp_buf = vmalloc(len);
				if (!tmp_buf)
					return -ENOMEM;
				memcpy(tmp_buf, p->buf, p->buf_len);
				kfree(p->buf);
				p->virtual_mem = 1;
			}
		}
		p->buf = tmp_buf;
		p->buf_len = len;
	}
	if (p->reversed) {
		/*
		 * A reversed path lives at the end of the buffer. The copy
		 * above placed it at old_buf_len's end position; move it
		 * (including the NUL) to the end of the grown buffer.
		 */
		tmp_buf = p->buf + old_buf_len - path_len - 1;
		p->end = p->buf + p->buf_len - 1;
		p->start = p->end - path_len;
		memmove(p->start, tmp_buf, path_len + 1);
	} else {
		p->start = p->buf;
		p->end = p->start + path_len;
	}
	return 0;
}
256
257static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
258{
259 int ret;
260 int new_len;
261
262 new_len = p->end - p->start + name_len;
263 if (p->start != p->end)
264 new_len++;
265 ret = fs_path_ensure_buf(p, new_len);
266 if (ret < 0)
267 goto out;
268
269 if (p->reversed) {
270 if (p->start != p->end)
271 *--p->start = '/';
272 p->start -= name_len;
273 p->prepared = p->start;
274 } else {
275 if (p->start != p->end)
276 *p->end++ = '/';
277 p->prepared = p->end;
278 p->end += name_len;
279 *p->end = 0;
280 }
281
282out:
283 return ret;
284}
285
286static int fs_path_add(struct fs_path *p, const char *name, int name_len)
287{
288 int ret;
289
290 ret = fs_path_prepare_for_add(p, name_len);
291 if (ret < 0)
292 goto out;
293 memcpy(p->prepared, name, name_len);
294 p->prepared = NULL;
295
296out:
297 return ret;
298}
299
300static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
301{
302 int ret;
303
304 ret = fs_path_prepare_for_add(p, p2->end - p2->start);
305 if (ret < 0)
306 goto out;
307 memcpy(p->prepared, p2->start, p2->end - p2->start);
308 p->prepared = NULL;
309
310out:
311 return ret;
312}
313
314static int fs_path_add_from_extent_buffer(struct fs_path *p,
315 struct extent_buffer *eb,
316 unsigned long off, int len)
317{
318 int ret;
319
320 ret = fs_path_prepare_for_add(p, len);
321 if (ret < 0)
322 goto out;
323
324 read_extent_buffer(eb, p->prepared, off, len);
325 p->prepared = NULL;
326
327out:
328 return ret;
329}
330
331static void fs_path_remove(struct fs_path *p)
332{
333 BUG_ON(p->reversed);
334 while (p->start != p->end && *p->end != '/')
335 p->end--;
336 *p->end = 0;
337}
338
339static int fs_path_copy(struct fs_path *p, struct fs_path *from)
340{
341 int ret;
342
343 p->reversed = from->reversed;
344 fs_path_reset(p);
345
346 ret = fs_path_add_path(p, from);
347
348 return ret;
349}
350
351
352static void fs_path_unreverse(struct fs_path *p)
353{
354 char *tmp;
355 int len;
356
357 if (!p->reversed)
358 return;
359
360 tmp = p->start;
361 len = p->end - p->start;
362 p->start = p->buf;
363 p->end = p->start + len;
364 memmove(p->start, tmp, len + 1);
365 p->reversed = 0;
366}
367
368static struct btrfs_path *alloc_path_for_send(void)
369{
370 struct btrfs_path *path;
371
372 path = btrfs_alloc_path();
373 if (!path)
374 return NULL;
375 path->search_commit_root = 1;
376 path->skip_locking = 1;
377 return path;
378}
379
/*
 * Write len bytes of buf to the send file, advancing sctx->send_off.
 * Loops until everything is written; a zero-byte write is treated as -EIO.
 * Temporarily switches to KERNEL_DS because vfs_write is handed a kernel
 * buffer. Returns 0 on success or a negative errno.
 */
static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
{
	int ret;
	mm_segment_t old_fs;
	u32 pos = 0;

	old_fs = get_fs();
	set_fs(KERNEL_DS);

	while (pos < len) {
		ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
				&sctx->send_off);
		/* TODO handle that correctly */
		/*if (ret == -ERESTARTSYS) {
			continue;
		}*/
		if (ret < 0)
			goto out;
		if (ret == 0) {
			ret = -EIO;
			goto out;
		}
		pos += ret;
	}

	ret = 0;

out:
	/* always restore the previous address limit */
	set_fs(old_fs);
	return ret;
}
411
412static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
413{
414 struct btrfs_tlv_header *hdr;
415 int total_len = sizeof(*hdr) + len;
416 int left = sctx->send_max_size - sctx->send_size;
417
418 if (unlikely(left < total_len))
419 return -EOVERFLOW;
420
421 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
422 hdr->tlv_type = cpu_to_le16(attr);
423 hdr->tlv_len = cpu_to_le16(len);
424 memcpy(hdr + 1, data, len);
425 sctx->send_size += total_len;
426
427 return 0;
428}
429
#if 0
/*
 * Currently unused small-integer TLV emitters, kept (disabled) for future
 * stream attributes.
 */
static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
{
	return tlv_put(sctx, attr, &value, sizeof(value));
}

static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
{
	__le16 tmp = cpu_to_le16(value);
	return tlv_put(sctx, attr, &tmp, sizeof(tmp));
}

static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
{
	__le32 tmp = cpu_to_le32(value);
	return tlv_put(sctx, attr, &tmp, sizeof(tmp));
}
#endif
448
449static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
450{
451 __le64 tmp = cpu_to_le64(value);
452 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
453}
454
455static int tlv_put_string(struct send_ctx *sctx, u16 attr,
456 const char *str, int len)
457{
458 if (len == -1)
459 len = strlen(str);
460 return tlv_put(sctx, attr, str, len);
461}
462
463static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
464 const u8 *uuid)
465{
466 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
467}
468
#if 0
/*
 * Unused: emit an in-memory timespec as an on-disk btrfs_timespec.
 * The active path uses tlv_put_btrfs_timespec instead, which reads the
 * value straight out of an extent buffer.
 */
static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
			    struct timespec *ts)
{
	struct btrfs_timespec bts;
	bts.sec = cpu_to_le64(ts->tv_sec);
	bts.nsec = cpu_to_le32(ts->tv_nsec);
	return tlv_put(sctx, attr, &bts, sizeof(bts));
}
#endif
479
480static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
481 struct extent_buffer *eb,
482 struct btrfs_timespec *ts)
483{
484 struct btrfs_timespec bts;
485 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
486 return tlv_put(sctx, attr, &bts, sizeof(bts));
487}
488
489
/*
 * Convenience wrappers around the tlv_put* helpers. On failure they store
 * the error in a local 'ret' and "goto tlv_put_failure" -- every caller
 * must declare 'int ret' and provide that label.
 *
 * NOTE(review): TLV_PUT's middle parameters are named (attrlen, data) but
 * forward positionally to tlv_put(sctx, attr, data, len); callers pass
 * (data, len) in that order, so behavior is correct -- only the parameter
 * names are misleading. TLV_PUT_TIMESPEC references tlv_put_timespec,
 * which is currently compiled out (#if 0), so it must stay unused.
 */
#define TLV_PUT(sctx, attrtype, attrlen, data) \
	do { \
		ret = tlv_put(sctx, attrtype, attrlen, data); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, p->start, \
			p->end - p->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while(0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
	do { \
		ret = tlv_put_timespec(sctx, attrtype, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
539
540static int send_header(struct send_ctx *sctx)
541{
542 struct btrfs_stream_header hdr;
543
544 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
545 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
546
547 return write_buf(sctx, &hdr, sizeof(hdr));
548}
549
550/*
551 * For each command/item we want to send to userspace, we call this function.
552 */
553static int begin_cmd(struct send_ctx *sctx, int cmd)
554{
555 struct btrfs_cmd_header *hdr;
556
557 if (!sctx->send_buf) {
558 WARN_ON(1);
559 return -EINVAL;
560 }
561
562 BUG_ON(sctx->send_size);
563
564 sctx->send_size += sizeof(*hdr);
565 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
566 hdr->cmd = cpu_to_le16(cmd);
567
568 return 0;
569}
570
571static int send_cmd(struct send_ctx *sctx)
572{
573 int ret;
574 struct btrfs_cmd_header *hdr;
575 u32 crc;
576
577 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
578 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
579 hdr->crc = 0;
580
581 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
582 hdr->crc = cpu_to_le32(crc);
583
584 ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
585
586 sctx->total_send_size += sctx->send_size;
587 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
588 sctx->send_size = 0;
589
590 return ret;
591}
592
593/*
594 * Sends a move instruction to user space
595 */
/*
 * Sends a move instruction to user space
 *
 * Emits a RENAME command with the old path and the new path; the receiver
 * performs rename(from, to). The TLV_PUT_PATH macros jump to
 * tlv_put_failure on error with 'ret' already set.
 */
static int send_rename(struct send_ctx *sctx,
		     struct fs_path *from, struct fs_path *to)
{
	int ret;

verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
616
617/*
618 * Sends a link instruction to user space
619 */
/*
 * Sends a link instruction to user space
 *
 * Emits a LINK command; the receiver creates a hardlink at 'path'
 * pointing to the existing file 'lnk'.
 */
static int send_link(struct send_ctx *sctx,
		     struct fs_path *path, struct fs_path *lnk)
{
	int ret;

verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
640
641/*
642 * Sends an unlink instruction to user space
643 */
/*
 * Sends an unlink instruction to user space
 *
 * Emits an UNLINK command; the receiver removes the file at 'path'.
 */
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
	int ret;

verbose_printk("btrfs: send_unlink %s\n", path->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
662
663/*
664 * Sends a rmdir instruction to user space
665 */
/*
 * Sends a rmdir instruction to user space
 *
 * Emits a RMDIR command; the receiver removes the (empty) directory at
 * 'path'.
 */
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
	int ret;

verbose_printk("btrfs: send_rmdir %s\n", path->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
684
685/*
686 * Helper function to retrieve some fields from an inode item.
687 */
688static int get_inode_info(struct btrfs_root *root,
689 u64 ino, u64 *size, u64 *gen,
690 u64 *mode, u64 *uid, u64 *gid)
691{
692 int ret;
693 struct btrfs_inode_item *ii;
694 struct btrfs_key key;
695 struct btrfs_path *path;
696
697 path = alloc_path_for_send();
698 if (!path)
699 return -ENOMEM;
700
701 key.objectid = ino;
702 key.type = BTRFS_INODE_ITEM_KEY;
703 key.offset = 0;
704 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
705 if (ret < 0)
706 goto out;
707 if (ret) {
708 ret = -ENOENT;
709 goto out;
710 }
711
712 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
713 struct btrfs_inode_item);
714 if (size)
715 *size = btrfs_inode_size(path->nodes[0], ii);
716 if (gen)
717 *gen = btrfs_inode_generation(path->nodes[0], ii);
718 if (mode)
719 *mode = btrfs_inode_mode(path->nodes[0], ii);
720 if (uid)
721 *uid = btrfs_inode_uid(path->nodes[0], ii);
722 if (gid)
723 *gid = btrfs_inode_gid(path->nodes[0], ii);
724
725out:
726 btrfs_free_path(path);
727 return ret;
728}
729
/* Callback invoked for each name stored in one INODE_REF item. */
typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
				   struct fs_path *p,
				   void *ctx);

/*
 * Helper function to iterate the entries in ONE btrfs_inode_ref.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the INODE_REF when called.
 *
 * With resolve set, each name is expanded to a full path via
 * btrfs_iref_to_path (retried once with a grown buffer on overflow);
 * otherwise just the raw name from the ref is handed to the callback.
 */
static int iterate_inode_ref(struct send_ctx *sctx,
			     struct btrfs_root *root, struct btrfs_path *path,
			     struct btrfs_key *found_key, int resolve,
			     iterate_inode_ref_t iterate, void *ctx)
{
	struct extent_buffer *eb;
	struct btrfs_item *item;
	struct btrfs_inode_ref *iref;
	struct btrfs_path *tmp_path;
	struct fs_path *p;
	u32 cur;
	u32 len;
	u32 total;
	int slot;
	u32 name_len;
	char *start;
	int ret = 0;
	int num;
	int index;

	p = fs_path_alloc_reversed(sctx);
	if (!p)
		return -ENOMEM;

	tmp_path = alloc_path_for_send();
	if (!tmp_path) {
		fs_path_free(sctx, p);
		return -ENOMEM;
	}

	eb = path->nodes[0];
	slot = path->slots[0];
	item = btrfs_item_nr(eb, slot);
	iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
	cur = 0;
	len = 0;
	total = btrfs_item_size(eb, item);

	num = 0;
	/* one INODE_REF item packs several (name, index) entries */
	while (cur < total) {
		fs_path_reset(p);

		name_len = btrfs_inode_ref_name_len(eb, iref);
		index = btrfs_inode_ref_index(eb, iref);
		if (resolve) {
			start = btrfs_iref_to_path(root, tmp_path, iref, eb,
						found_key->offset, p->buf,
						p->buf_len);
			if (IS_ERR(start)) {
				ret = PTR_ERR(start);
				goto out;
			}
			if (start < p->buf) {
				/* overflow , try again with larger buffer */
				ret = fs_path_ensure_buf(p,
						p->buf_len + p->buf - start);
				if (ret < 0)
					goto out;
				start = btrfs_iref_to_path(root, tmp_path, iref,
						eb, found_key->offset, p->buf,
						p->buf_len);
				if (IS_ERR(start)) {
					ret = PTR_ERR(start);
					goto out;
				}
				/* the grown buffer must be big enough now */
				BUG_ON(start < p->buf);
			}
			p->start = start;
		} else {
			ret = fs_path_add_from_extent_buffer(p, eb,
					(unsigned long)(iref + 1), name_len);
			if (ret < 0)
				goto out;
		}


		/* advance to the next packed ref entry */
		len = sizeof(*iref) + name_len;
		iref = (struct btrfs_inode_ref *)((char *)iref + len);
		cur += len;

		ret = iterate(num, found_key->offset, index, p, ctx);
		if (ret)
			goto out;

		num++;
	}

out:
	btrfs_free_path(tmp_path);
	fs_path_free(sctx, p);
	return ret;
}
833
/* Callback invoked for each entry packed into one dir item. */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  u8 type, void *ctx);

/*
 * Helper function to iterate the entries in ONE btrfs_dir_item.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the dir item when called.
 *
 * name+data of each entry are copied into a scratch buffer that starts at
 * one page and is grown (kmalloc first, vmalloc fallback) as needed.
 */
static int iterate_dir_item(struct send_ctx *sctx,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *found_key,
			    iterate_dir_item_t iterate, void *ctx)
{
	int ret = 0;
	struct extent_buffer *eb;
	struct btrfs_item *item;
	struct btrfs_dir_item *di;
	struct btrfs_path *tmp_path = NULL;
	struct btrfs_key di_key;
	char *buf = NULL;
	char *buf2 = NULL;
	int buf_len;
	int buf_virtual = 0;	/* tracks which free function buf needs */
	u32 name_len;
	u32 data_len;
	u32 cur;
	u32 len;
	u32 total;
	int slot;
	int num;
	u8 type;

	buf_len = PAGE_SIZE;
	buf = kmalloc(buf_len, GFP_NOFS);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	tmp_path = alloc_path_for_send();
	if (!tmp_path) {
		ret = -ENOMEM;
		goto out;
	}

	eb = path->nodes[0];
	slot = path->slots[0];
	item = btrfs_item_nr(eb, slot);
	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	cur = 0;
	len = 0;
	total = btrfs_item_size(eb, item);

	num = 0;
	while (cur < total) {
		name_len = btrfs_dir_name_len(eb, di);
		data_len = btrfs_dir_data_len(eb, di);
		type = btrfs_dir_type(eb, di);
		btrfs_dir_item_key_to_cpu(eb, di, &di_key);

		if (name_len + data_len > buf_len) {
			/*
			 * Grow the scratch buffer. Old contents need not be
			 * preserved: it is refilled below for every entry.
			 */
			buf_len = PAGE_ALIGN(name_len + data_len);
			if (buf_virtual) {
				buf2 = vmalloc(buf_len);
				if (!buf2) {
					ret = -ENOMEM;
					goto out;
				}
				vfree(buf);
			} else {
				buf2 = krealloc(buf, buf_len, GFP_NOFS);
				if (!buf2) {
					/* kmalloc exhausted, switch to vmalloc */
					buf2 = vmalloc(buf_len);
					if (!buf2) {
						ret = -ENOMEM;
						goto out;
					}
					kfree(buf);
					buf_virtual = 1;
				}
			}

			buf = buf2;
			buf2 = NULL;
		}

		read_extent_buffer(eb, buf, (unsigned long)(di + 1),
				name_len + data_len);

		len = sizeof(*di) + name_len + data_len;
		di = (struct btrfs_dir_item *)((char *)di + len);
		cur += len;

		ret = iterate(num, &di_key, buf, name_len, buf + name_len,
				data_len, type, ctx);
		if (ret < 0)
			goto out;
		if (ret) {
			/* callback asked to stop early, not an error */
			ret = 0;
			goto out;
		}

		num++;
	}

out:
	btrfs_free_path(tmp_path);
	if (buf_virtual)
		vfree(buf);
	else
		kfree(buf);
	return ret;
}
951
952static int __copy_first_ref(int num, u64 dir, int index,
953 struct fs_path *p, void *ctx)
954{
955 int ret;
956 struct fs_path *pt = ctx;
957
958 ret = fs_path_copy(pt, p);
959 if (ret < 0)
960 return ret;
961
962 /* we want the first only */
963 return 1;
964}
965
966/*
967 * Retrieve the first path of an inode. If an inode has more then one
968 * ref/hardlink, this is ignored.
969 */
/*
 * Retrieve the first path of an inode. If an inode has more then one
 * ref/hardlink, this is ignored.
 *
 * Returns 0 on success, 1 if the inode has no INODE_REF at all (caller
 * treats that as "no path"), -ENOENT if the found item belongs to a
 * different inode, or a negative errno on search failure.
 */
static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
			  u64 ino, struct fs_path *path)
{
	int ret;
	struct btrfs_key key, found_key;
	struct btrfs_path *p;

	p = alloc_path_for_send();
	if (!p)
		return -ENOMEM;

	fs_path_reset(path);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		/* nothing at or after the key: no refs for this inode */
		ret = 1;
		goto out;
	}
	btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
	if (found_key.objectid != ino ||
	    found_key.type != BTRFS_INODE_REF_KEY) {
		ret = -ENOENT;
		goto out;
	}

	/* resolve=1: expand the ref into a full path */
	ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
			__copy_first_ref, path);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	btrfs_free_path(p);
	return ret;
}
1011
/* State shared with __iterate_backrefs while resolving one extent. */
struct backref_ctx {
	struct send_ctx *sctx;

	/* number of total found references */
	u64 found;

	/*
	 * used for clones found in send_root. clones found behind cur_objectid
	 * and cur_offset are not considered as allowed clones.
	 */
	u64 cur_objectid;
	u64 cur_offset;

	/* may be truncated in case it's the last extent in a file */
	u64 extent_len;

	/* Just to check for bugs in backref resolving */
	int found_in_send_root;
};
1031
1032static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1033{
1034 u64 root = (u64)key;
1035 struct clone_root *cr = (struct clone_root *)elt;
1036
1037 if (root < cr->root->objectid)
1038 return -1;
1039 if (root > cr->root->objectid)
1040 return 1;
1041 return 0;
1042}
1043
1044static int __clone_root_cmp_sort(const void *e1, const void *e2)
1045{
1046 struct clone_root *cr1 = (struct clone_root *)e1;
1047 struct clone_root *cr2 = (struct clone_root *)e2;
1048
1049 if (cr1->root->objectid < cr2->root->objectid)
1050 return -1;
1051 if (cr1->root->objectid > cr2->root->objectid)
1052 return 1;
1053 return 0;
1054}
1055
1056/*
1057 * Called for every backref that is found for the current extent.
1058 */
/*
 * Called for every backref that is found for the current extent.
 *
 * Records, per accepted clone root, the best (lowest-inode) location of
 * the extent. References behind a file's i_size, and references in
 * send_root at or after the inode currently being sent, are rejected as
 * clone sources.
 */
static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
{
	struct backref_ctx *bctx = ctx_;
	struct clone_root *found;
	int ret;
	u64 i_size;

	/* First check if the root is in the list of accepted clone sources */
	found = bsearch((void *)root, bctx->sctx->clone_roots,
			bctx->sctx->clone_roots_cnt,
			sizeof(struct clone_root),
			__clone_root_cmp_bsearch);
	if (!found)
		return 0;

	/* sanity flag: the extent must reference the inode being sent */
	if (found->root == bctx->sctx->send_root &&
	    ino == bctx->cur_objectid &&
	    offset == bctx->cur_offset) {
		bctx->found_in_send_root = 1;
	}

	/*
	 * There are inodes that have extents that lie behind it's i_size. Don't
	 * accept clones from these extents.
	 */
	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
	if (ret < 0)
		return ret;

	if (offset + bctx->extent_len > i_size)
		return 0;

	/*
	 * Make sure we don't consider clones from send_root that are
	 * behind the current inode/offset.
	 */
	if (found->root == bctx->sctx->send_root) {
		/*
		 * TODO for the moment we don't accept clones from the inode
		 * that is currently send. We may change this when
		 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
		 * file.
		 */
		if (ino >= bctx->cur_objectid)
			return 0;
		/*if (ino > ctx->cur_objectid)
			return 0;
		if (offset + ctx->extent_len > ctx->cur_offset)
			return 0;*/

		bctx->found++;
		found->found_refs++;
		found->ino = ino;
		found->offset = offset;
		return 0;
	}

	bctx->found++;
	found->found_refs++;
	if (ino < found->ino) {
		/* prefer the lowest inode number as clone source */
		found->ino = ino;
		found->offset = offset;
	} else if (found->ino == ino) {
		/*
		 * same extent found more then once in the same file.
		 */
		if (found->offset > offset + bctx->extent_len)
			found->offset = offset;
	}

	return 0;
}
1131
1132/*
1133 * path must point to the extent item when called.
1134 */
/*
 * path must point to the extent item when called.
 *
 * Resolves all backrefs of the extent at (ino, data_offset) and, among the
 * configured clone roots that reference it, picks one to use as a clone
 * source. Returns 0 with *found set on success, -ENOENT when no usable
 * clone source exists, negative errno on error.
 */
static int find_extent_clone(struct send_ctx *sctx,
			     struct btrfs_path *path,
			     u64 ino, u64 data_offset,
			     u64 ino_size,
			     struct clone_root **found)
{
	int ret;
	int extent_type;
	u64 logical;
	u64 num_bytes;
	u64 extent_item_pos;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *eb = path->nodes[0];
	struct backref_ctx backref_ctx;
	struct clone_root *cur_clone_root;
	struct btrfs_key found_key;
	struct btrfs_path *tmp_path;
	u32 i;

	tmp_path = alloc_path_for_send();
	if (!tmp_path)
		return -ENOMEM;

	if (data_offset >= ino_size) {
		/*
		 * There may be extents that lie behind the file's size.
		 * I at least had this in combination with snapshotting while
		 * writing large files.
		 */
		ret = 0;
		goto out;
	}

	fi = btrfs_item_ptr(eb, path->slots[0],
			struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(eb, fi);
	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents can't be cloned */
		ret = -ENOENT;
		goto out;
	}

	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
	logical = btrfs_file_extent_disk_bytenr(eb, fi);
	if (logical == 0) {
		/* hole/preallocated, nothing to clone from */
		ret = -ENOENT;
		goto out;
	}
	logical += btrfs_file_extent_offset(eb, fi);

	ret = extent_from_logical(sctx->send_root->fs_info,
				  logical, tmp_path, &found_key);
	btrfs_release_path(tmp_path);

	if (ret < 0)
		goto out;
	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		/* a data extent must never resolve to a tree block */
		ret = -EIO;
		goto out;
	}

	/*
	 * Setup the clone roots.
	 */
	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		cur_clone_root = sctx->clone_roots + i;
		cur_clone_root->ino = (u64)-1;
		cur_clone_root->offset = 0;
		cur_clone_root->found_refs = 0;
	}

	backref_ctx.sctx = sctx;
	backref_ctx.found = 0;
	backref_ctx.cur_objectid = ino;
	backref_ctx.cur_offset = data_offset;
	backref_ctx.found_in_send_root = 0;
	backref_ctx.extent_len = num_bytes;

	/*
	 * The last extent of a file may be too large due to page alignment.
	 * We need to adjust extent_len in this case so that the checks in
	 * __iterate_backrefs work.
	 */
	if (data_offset + num_bytes >= ino_size)
		backref_ctx.extent_len = ino_size - data_offset;

	/*
	 * Now collect all backrefs.
	 */
	extent_item_pos = logical - found_key.objectid;
	ret = iterate_extent_inodes(sctx->send_root->fs_info,
					found_key.objectid, extent_item_pos, 1,
					__iterate_backrefs, &backref_ctx);
	if (ret < 0)
		goto out;

	if (!backref_ctx.found_in_send_root) {
		/* found a bug in backref code? */
		ret = -EIO;
		printk(KERN_ERR "btrfs: ERROR did not find backref in "
				"send_root. inode=%llu, offset=%llu, "
				"logical=%llu\n",
				ino, data_offset, logical);
		goto out;
	}

verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
		"ino=%llu, "
		"num_bytes=%llu, logical=%llu\n",
		data_offset, ino, num_bytes, logical);

	if (!backref_ctx.found)
		verbose_printk("btrfs: no clones found\n");

	cur_clone_root = NULL;
	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		if (sctx->clone_roots[i].found_refs) {
			if (!cur_clone_root)
				cur_clone_root = sctx->clone_roots + i;
			else if (sctx->clone_roots[i].root == sctx->send_root)
				/* prefer clones from send_root over others */
				cur_clone_root = sctx->clone_roots + i;
			/*
			 * NOTE(review): this break fires on the first entry
			 * with found_refs, so the "prefer send_root" else-if
			 * above can never execute -- looks like the break
			 * was meant to follow the send_root match only.
			 * Confirm against the intended selection policy.
			 */
			break;
		}

	}

	if (cur_clone_root) {
		*found = cur_clone_root;
		ret = 0;
	} else {
		ret = -ENOENT;
	}

out:
	btrfs_free_path(tmp_path);
	return ret;
}
1272
1273static int read_symlink(struct send_ctx *sctx,
1274 struct btrfs_root *root,
1275 u64 ino,
1276 struct fs_path *dest)
1277{
1278 int ret;
1279 struct btrfs_path *path;
1280 struct btrfs_key key;
1281 struct btrfs_file_extent_item *ei;
1282 u8 type;
1283 u8 compression;
1284 unsigned long off;
1285 int len;
1286
1287 path = alloc_path_for_send();
1288 if (!path)
1289 return -ENOMEM;
1290
1291 key.objectid = ino;
1292 key.type = BTRFS_EXTENT_DATA_KEY;
1293 key.offset = 0;
1294 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1295 if (ret < 0)
1296 goto out;
1297 BUG_ON(ret);
1298
1299 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1300 struct btrfs_file_extent_item);
1301 type = btrfs_file_extent_type(path->nodes[0], ei);
1302 compression = btrfs_file_extent_compression(path->nodes[0], ei);
1303 BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
1304 BUG_ON(compression);
1305
1306 off = btrfs_file_extent_inline_start(ei);
1307 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1308
1309 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1310 if (ret < 0)
1311 goto out;
1312
1313out:
1314 btrfs_free_path(path);
1315 return ret;
1316}
1317
1318/*
1319 * Helper function to generate a file name that is unique in the root of
1320 * send_root and parent_root. This is used to generate names for orphan inodes.
1321 */
1322static int gen_unique_name(struct send_ctx *sctx,
1323 u64 ino, u64 gen,
1324 struct fs_path *dest)
1325{
1326 int ret = 0;
1327 struct btrfs_path *path;
1328 struct btrfs_dir_item *di;
1329 char tmp[64];
1330 int len;
1331 u64 idx = 0;
1332
1333 path = alloc_path_for_send();
1334 if (!path)
1335 return -ENOMEM;
1336
1337 while (1) {
1338 len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
1339 ino, gen, idx);
1340 if (len >= sizeof(tmp)) {
1341 /* should really not happen */
1342 ret = -EOVERFLOW;
1343 goto out;
1344 }
1345
1346 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1347 path, BTRFS_FIRST_FREE_OBJECTID,
1348 tmp, strlen(tmp), 0);
1349 btrfs_release_path(path);
1350 if (IS_ERR(di)) {
1351 ret = PTR_ERR(di);
1352 goto out;
1353 }
1354 if (di) {
1355 /* not unique, try again */
1356 idx++;
1357 continue;
1358 }
1359
1360 if (!sctx->parent_root) {
1361 /* unique */
1362 ret = 0;
1363 break;
1364 }
1365
1366 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1367 path, BTRFS_FIRST_FREE_OBJECTID,
1368 tmp, strlen(tmp), 0);
1369 btrfs_release_path(path);
1370 if (IS_ERR(di)) {
1371 ret = PTR_ERR(di);
1372 goto out;
1373 }
1374 if (di) {
1375 /* not unique, try again */
1376 idx++;
1377 continue;
1378 }
1379 /* unique */
1380 break;
1381 }
1382
1383 ret = fs_path_add(dest, tmp, strlen(tmp));
1384
1385out:
1386 btrfs_free_path(path);
1387 return ret;
1388}
1389
/*
 * State of an inode relative to sctx->send_progress, as computed by
 * get_cur_inode_state(): whether the inode exists only in the send root
 * (create), only in the parent root (delete), or in both with the same
 * generation (no change), and whether the receiver has already passed it.
 */
enum inode_state {
	inode_state_no_change,		/* same ino/gen in both roots */
	inode_state_will_create,	/* only in send root, not yet processed */
	inode_state_did_create,		/* only in send root, already processed */
	inode_state_will_delete,	/* only in parent root, not yet processed */
	inode_state_did_delete,		/* only in parent root, already processed */
};
1397
/*
 * Determine the inode_state of (ino, gen) by looking the inode up in both
 * the send root and the parent root and comparing generations against the
 * current send progress.
 *
 * Returns an inode_state value (>= 0), or a negative errno. -ENOENT means
 * the inode with this generation exists in neither root.
 *
 * NOTE: left_gen/right_gen are only written when the corresponding lookup
 * succeeds; the branch structure below deliberately never reads a gen
 * whose lookup returned -ENOENT.
 */
static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
{
	int ret;
	int left_ret;
	int right_ret;
	u64 left_gen;
	u64 right_gen;

	/* "left" = send root */
	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
			NULL);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	left_ret = ret;

	/* "right" = parent root; treat a missing parent root as -ENOENT */
	if (!sctx->parent_root) {
		right_ret = -ENOENT;
	} else {
		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
				NULL, NULL, NULL);
		if (ret < 0 && ret != -ENOENT)
			goto out;
		right_ret = ret;
	}

	if (!left_ret && !right_ret) {
		/* inode number exists in both roots */
		if (left_gen == gen && right_gen == gen)
			ret = inode_state_no_change;
		else if (left_gen == gen) {
			/* same ino was deleted and recreated in send root */
			if (ino < sctx->send_progress)
				ret = inode_state_did_create;
			else
				ret = inode_state_will_create;
		} else if (right_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_delete;
			else
				ret = inode_state_will_delete;
		} else {
			ret = -ENOENT;
		}
	} else if (!left_ret) {
		/* only in send root */
		if (left_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_create;
			else
				ret = inode_state_will_create;
		} else {
			ret = -ENOENT;
		}
	} else if (!right_ret) {
		/* only in parent root */
		if (right_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_delete;
			else
				ret = inode_state_will_delete;
		} else {
			ret = -ENOENT;
		}
	} else {
		ret = -ENOENT;
	}

out:
	return ret;
}
1463
1464static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
1465{
1466 int ret;
1467
1468 ret = get_cur_inode_state(sctx, ino, gen);
1469 if (ret < 0)
1470 goto out;
1471
1472 if (ret == inode_state_no_change ||
1473 ret == inode_state_did_create ||
1474 ret == inode_state_will_delete)
1475 ret = 1;
1476 else
1477 ret = 0;
1478
1479out:
1480 return ret;
1481}
1482
1483/*
1484 * Helper function to lookup a dir item in a dir.
1485 */
1486static int lookup_dir_item_inode(struct btrfs_root *root,
1487 u64 dir, const char *name, int name_len,
1488 u64 *found_inode,
1489 u8 *found_type)
1490{
1491 int ret = 0;
1492 struct btrfs_dir_item *di;
1493 struct btrfs_key key;
1494 struct btrfs_path *path;
1495
1496 path = alloc_path_for_send();
1497 if (!path)
1498 return -ENOMEM;
1499
1500 di = btrfs_lookup_dir_item(NULL, root, path,
1501 dir, name, name_len, 0);
1502 if (!di) {
1503 ret = -ENOENT;
1504 goto out;
1505 }
1506 if (IS_ERR(di)) {
1507 ret = PTR_ERR(di);
1508 goto out;
1509 }
1510 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1511 *found_inode = key.objectid;
1512 *found_type = btrfs_dir_type(path->nodes[0], di);
1513
1514out:
1515 btrfs_free_path(path);
1516 return ret;
1517}
1518
/*
 * Find the first INODE_REF of 'ino' in 'root' and report the parent
 * directory ('dir'), the parent's generation ('dir_gen') and the ref name
 * (appended to 'name').
 *
 * Returns 0 on success, -ENOENT if the inode has no refs, or another
 * negative errno.
 */
static int get_first_ref(struct send_ctx *sctx,
			 struct btrfs_root *root, u64 ino,
			 u64 *dir, u64 *dir_gen, struct fs_path *name)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	struct btrfs_inode_ref *iref;
	int len;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (!ret)
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				path->slots[0]);
	/*
	 * found_key is only initialized when ret == 0; the short-circuit
	 * on ret below guarantees it is never read otherwise.
	 */
	if (ret || found_key.objectid != key.objectid ||
	    found_key.type != key.type) {
		ret = -ENOENT;
		goto out;
	}

	/* the ref name is stored right after the btrfs_inode_ref struct */
	iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
			struct btrfs_inode_ref);
	len = btrfs_inode_ref_name_len(path->nodes[0], iref);
	ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
			(unsigned long)(iref + 1), len);
	if (ret < 0)
		goto out;
	btrfs_release_path(path);

	/* key.offset of an INODE_REF is the parent directory's inode number */
	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
			NULL);
	if (ret < 0)
		goto out;

	*dir = found_key.offset;

out:
	btrfs_free_path(path);
	return ret;
}
1570
1571static int is_first_ref(struct send_ctx *sctx,
1572 struct btrfs_root *root,
1573 u64 ino, u64 dir,
1574 const char *name, int name_len)
1575{
1576 int ret;
1577 struct fs_path *tmp_name;
1578 u64 tmp_dir;
1579 u64 tmp_dir_gen;
1580
1581 tmp_name = fs_path_alloc(sctx);
1582 if (!tmp_name)
1583 return -ENOMEM;
1584
1585 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1586 if (ret < 0)
1587 goto out;
1588
1589 if (name_len != fs_path_len(tmp_name)) {
1590 ret = 0;
1591 goto out;
1592 }
1593
1594 ret = memcmp(tmp_name->start, name, name_len);
1595 if (ret)
1596 ret = 0;
1597 else
1598 ret = 1;
1599
1600out:
1601 fs_path_free(sctx, tmp_name);
1602 return ret;
1603}
1604
1605static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1606 const char *name, int name_len,
1607 u64 *who_ino, u64 *who_gen)
1608{
1609 int ret = 0;
1610 u64 other_inode = 0;
1611 u8 other_type = 0;
1612
1613 if (!sctx->parent_root)
1614 goto out;
1615
1616 ret = is_inode_existent(sctx, dir, dir_gen);
1617 if (ret <= 0)
1618 goto out;
1619
1620 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
1621 &other_inode, &other_type);
1622 if (ret < 0 && ret != -ENOENT)
1623 goto out;
1624 if (ret) {
1625 ret = 0;
1626 goto out;
1627 }
1628
1629 if (other_inode > sctx->send_progress) {
1630 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1631 who_gen, NULL, NULL, NULL);
1632 if (ret < 0)
1633 goto out;
1634
1635 ret = 1;
1636 *who_ino = other_inode;
1637 } else {
1638 ret = 0;
1639 }
1640
1641out:
1642 return ret;
1643}
1644
1645static int did_overwrite_ref(struct send_ctx *sctx,
1646 u64 dir, u64 dir_gen,
1647 u64 ino, u64 ino_gen,
1648 const char *name, int name_len)
1649{
1650 int ret = 0;
1651 u64 gen;
1652 u64 ow_inode;
1653 u8 other_type;
1654
1655 if (!sctx->parent_root)
1656 goto out;
1657
1658 ret = is_inode_existent(sctx, dir, dir_gen);
1659 if (ret <= 0)
1660 goto out;
1661
1662 /* check if the ref was overwritten by another ref */
1663 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
1664 &ow_inode, &other_type);
1665 if (ret < 0 && ret != -ENOENT)
1666 goto out;
1667 if (ret) {
1668 /* was never and will never be overwritten */
1669 ret = 0;
1670 goto out;
1671 }
1672
1673 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1674 NULL);
1675 if (ret < 0)
1676 goto out;
1677
1678 if (ow_inode == ino && gen == ino_gen) {
1679 ret = 0;
1680 goto out;
1681 }
1682
1683 /* we know that it is or will be overwritten. check this now */
1684 if (ow_inode < sctx->send_progress)
1685 ret = 1;
1686 else
1687 ret = 0;
1688
1689out:
1690 return ret;
1691}
1692
/*
 * Check whether the first ref of (ino, gen) as found in the parent root
 * was already overwritten on the receiving side.
 *
 * Returns 1 if overwritten, 0 if not, or a negative errno.
 */
static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
{
	int ret = 0;
	struct fs_path *name = NULL;
	u64 dir;
	u64 dir_gen;

	/*
	 * Without a parent root nothing can have been overwritten. Jumping
	 * to out with name == NULL relies on fs_path_free() being NULL-safe
	 * -- NOTE(review): confirm against its definition.
	 */
	if (!sctx->parent_root)
		goto out;

	name = fs_path_alloc(sctx);
	if (!name)
		return -ENOMEM;

	ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
	if (ret < 0)
		goto out;

	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
			name->start, fs_path_len(name));
	if (ret < 0)
		goto out;

out:
	fs_path_free(sctx, name);
	return ret;
}
1720
1721static int name_cache_insert(struct send_ctx *sctx,
1722 struct name_cache_entry *nce)
1723{
1724 int ret = 0;
1725 struct name_cache_entry **ncea;
1726
1727 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1728 if (ncea) {
1729 if (!ncea[0])
1730 ncea[0] = nce;
1731 else if (!ncea[1])
1732 ncea[1] = nce;
1733 else
1734 BUG();
1735 } else {
1736 ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
1737 if (!ncea)
1738 return -ENOMEM;
1739
1740 ncea[0] = nce;
1741 ncea[1] = NULL;
1742 ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
1743 if (ret < 0)
1744 return ret;
1745 }
1746 list_add_tail(&nce->list, &sctx->name_cache_list);
1747 sctx->name_cache_size++;
1748
1749 return ret;
1750}
1751
1752static void name_cache_delete(struct send_ctx *sctx,
1753 struct name_cache_entry *nce)
1754{
1755 struct name_cache_entry **ncea;
1756
1757 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1758 BUG_ON(!ncea);
1759
1760 if (ncea[0] == nce)
1761 ncea[0] = NULL;
1762 else if (ncea[1] == nce)
1763 ncea[1] = NULL;
1764 else
1765 BUG();
1766
1767 if (!ncea[0] && !ncea[1]) {
1768 radix_tree_delete(&sctx->name_cache, nce->ino);
1769 kfree(ncea);
1770 }
1771
1772 list_del(&nce->list);
1773
1774 sctx->name_cache_size--;
1775}
1776
1777static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
1778 u64 ino, u64 gen)
1779{
1780 struct name_cache_entry **ncea;
1781
1782 ncea = radix_tree_lookup(&sctx->name_cache, ino);
1783 if (!ncea)
1784 return NULL;
1785
1786 if (ncea[0] && ncea[0]->gen == gen)
1787 return ncea[0];
1788 else if (ncea[1] && ncea[1]->gen == gen)
1789 return ncea[1];
1790 return NULL;
1791}
1792
1793static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
1794{
1795 list_del(&nce->list);
1796 list_add_tail(&nce->list, &sctx->name_cache_list);
1797}
1798
1799static void name_cache_clean_unused(struct send_ctx *sctx)
1800{
1801 struct name_cache_entry *nce;
1802
1803 if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
1804 return;
1805
1806 while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
1807 nce = list_entry(sctx->name_cache_list.next,
1808 struct name_cache_entry, list);
1809 name_cache_delete(sctx, nce);
1810 kfree(nce);
1811 }
1812}
1813
1814static void name_cache_free(struct send_ctx *sctx)
1815{
1816 struct name_cache_entry *nce;
1817 struct name_cache_entry *tmp;
1818
1819 list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
1820 name_cache_delete(sctx, nce);
1821 }
1822}
1823
/*
 * Resolve one path component of (ino, gen) as seen by the receiver at the
 * current send progress: append the inode's current name to 'dest' and
 * report its parent via *parent_ino/*parent_gen. Results are memoized in
 * the name cache.
 *
 * Returns 0 when a regular name was resolved, 1 when an orphan name was
 * generated (the caller must stop walking up), or a negative errno.
 */
static int __get_cur_name_and_parent(struct send_ctx *sctx,
				     u64 ino, u64 gen,
				     u64 *parent_ino,
				     u64 *parent_gen,
				     struct fs_path *dest)
{
	int ret;
	int nce_ret;
	struct btrfs_path *path = NULL;
	struct name_cache_entry *nce = NULL;

	/* fast path: answer from the name cache if possible */
	nce = name_cache_search(sctx, ino, gen);
	if (nce) {
		if (ino < sctx->send_progress && nce->need_later_update) {
			/*
			 * The entry was cached before the inode was
			 * processed and is stale now; drop and recompute.
			 */
			name_cache_delete(sctx, nce);
			kfree(nce);
			nce = NULL;
		} else {
			name_cache_used(sctx, nce);
			*parent_ino = nce->parent_ino;
			*parent_gen = nce->parent_gen;
			ret = fs_path_add(dest, nce->name, nce->name_len);
			if (ret < 0)
				goto out;
			/* replay the cached resolution result (0 or 1) */
			ret = nce->ret;
			goto out;
		}
	}

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	/*
	 * Deleted or not-yet-created inodes get an orphan name instead of
	 * a real one.
	 */
	ret = is_inode_existent(sctx, ino, gen);
	if (ret < 0)
		goto out;

	if (!ret) {
		ret = gen_unique_name(sctx, ino, gen, dest);
		if (ret < 0)
			goto out;
		ret = 1;
		goto out_cache;
	}

	/*
	 * Already-processed inodes are named as in the send root; pending
	 * ones still carry their parent-root name.
	 */
	if (ino < sctx->send_progress)
		ret = get_first_ref(sctx, sctx->send_root, ino,
				parent_ino, parent_gen, dest);
	else
		ret = get_first_ref(sctx, sctx->parent_root, ino,
				parent_ino, parent_gen, dest);
	if (ret < 0)
		goto out;

	/*
	 * If the name was already overwritten on the receiving side, fall
	 * back to the orphan name.
	 */
	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
			dest->start, dest->end - dest->start);
	if (ret < 0)
		goto out;
	if (ret) {
		fs_path_reset(dest);
		ret = gen_unique_name(sctx, ino, gen, dest);
		if (ret < 0)
			goto out;
		ret = 1;
	}

out_cache:
	/* cache the result; the name is stored inline after the struct */
	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
	if (!nce) {
		ret = -ENOMEM;
		goto out;
	}

	nce->ino = ino;
	nce->gen = gen;
	nce->parent_ino = *parent_ino;
	nce->parent_gen = *parent_gen;
	nce->name_len = fs_path_len(dest);
	nce->ret = ret;
	strcpy(nce->name, dest->start);
	/*
	 * NOTE(review): this clears use_list while name_cache_insert()
	 * links nce->list -- verify against struct name_cache_entry which
	 * field the LRU actually uses.
	 */
	memset(&nce->use_list, 0, sizeof(nce->use_list));

	if (ino < sctx->send_progress)
		nce->need_later_update = 0;
	else
		nce->need_later_update = 1;

	nce_ret = name_cache_insert(sctx, nce);
	if (nce_ret < 0)
		ret = nce_ret;
	name_cache_clean_unused(sctx);

out:
	btrfs_free_path(path);
	return ret;
}
1920
1921/*
1922 * Magic happens here. This function returns the first ref to an inode as it
1923 * would look like while receiving the stream at this point in time.
1924 * We walk the path up to the root. For every inode in between, we check if it
1925 * was already processed/sent. If yes, we continue with the parent as found
1926 * in send_root. If not, we continue with the parent as found in parent_root.
1927 * If we encounter an inode that was deleted at this point in time, we use the
1928 * inodes "orphan" name instead of the real name and stop. Same with new inodes
1929 * that were not created yet and overwritten inodes/refs.
1930 *
 * When do we have orphan inodes:
1932 * 1. When an inode is freshly created and thus no valid refs are available yet
 * 2. When a directory lost all its refs (deleted) but still has dir items
1934 * inside which were not processed yet (pending for move/delete). If anyone
1935 * tried to get the path to the dir items, it would get a path inside that
1936 * orphan directory.
1937 * 3. When an inode is moved around or gets new links, it may overwrite the ref
1938 * of an unprocessed inode. If in that case the first ref would be
1939 * overwritten, the overwritten inode gets "orphanized". Later when we
1940 * process this overwritten inode, it is restored at a new place by moving
1941 * the orphan inode.
1942 *
1943 * sctx->send_progress tells this function at which point in time receiving
1944 * would be.
1945 */
1946static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
1947 struct fs_path *dest)
1948{
1949 int ret = 0;
1950 struct fs_path *name = NULL;
1951 u64 parent_inode = 0;
1952 u64 parent_gen = 0;
1953 int stop = 0;
1954
1955 name = fs_path_alloc(sctx);
1956 if (!name) {
1957 ret = -ENOMEM;
1958 goto out;
1959 }
1960
1961 dest->reversed = 1;
1962 fs_path_reset(dest);
1963
1964 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
1965 fs_path_reset(name);
1966
1967 ret = __get_cur_name_and_parent(sctx, ino, gen,
1968 &parent_inode, &parent_gen, name);
1969 if (ret < 0)
1970 goto out;
1971 if (ret)
1972 stop = 1;
1973
1974 ret = fs_path_add_path(dest, name);
1975 if (ret < 0)
1976 goto out;
1977
1978 ino = parent_inode;
1979 gen = parent_gen;
1980 }
1981
1982out:
1983 fs_path_free(sctx, name);
1984 if (!ret)
1985 fs_path_unreverse(dest);
1986 return ret;
1987}
1988
1989/*
1990 * Called for regular files when sending extents data. Opens a struct file
1991 * to read from the file.
1992 */
/*
 * Called for regular files when sending extents data. Opens a struct file
 * to read from the file. The open file handle is cached in
 * sctx->cur_inode_filp until close_cur_inode_file() is called.
 */
static int open_cur_inode_file(struct send_ctx *sctx)
{
	int ret = 0;
	struct btrfs_key key;
	struct path path;
	struct inode *inode;
	struct dentry *dentry;
	struct file *filp;
	int new = 0;

	/* already open from a previous extent of the same inode */
	if (sctx->cur_inode_filp)
		goto out;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
			&new);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto out;
	}

	/* d_obtain_alias() consumes the inode reference, even on error */
	dentry = d_obtain_alias(inode);
	inode = NULL;
	if (IS_ERR(dentry)) {
		ret = PTR_ERR(dentry);
		goto out;
	}

	path.mnt = sctx->mnt;
	path.dentry = dentry;
	/* dentry_open() takes its own reference on the path */
	filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
	dput(dentry);
	dentry = NULL;
	if (IS_ERR(filp)) {
		ret = PTR_ERR(filp);
		goto out;
	}
	sctx->cur_inode_filp = filp;

out:
	/*
	 * no xxxput required here as every vfs op
	 * does it by itself on failure
	 */
	return ret;
}
2042
2043/*
2044 * Closes the struct file that was created in open_cur_inode_file
2045 */
2046static int close_cur_inode_file(struct send_ctx *sctx)
2047{
2048 int ret = 0;
2049
2050 if (!sctx->cur_inode_filp)
2051 goto out;
2052
2053 ret = filp_close(sctx->cur_inode_filp, NULL);
2054 sctx->cur_inode_filp = NULL;
2055
2056out:
2057 return ret;
2058}
2059
2060/*
2061 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2062 */
2063static int send_subvol_begin(struct send_ctx *sctx)
2064{
2065 int ret;
2066 struct btrfs_root *send_root = sctx->send_root;
2067 struct btrfs_root *parent_root = sctx->parent_root;
2068 struct btrfs_path *path;
2069 struct btrfs_key key;
2070 struct btrfs_root_ref *ref;
2071 struct extent_buffer *leaf;
2072 char *name = NULL;
2073 int namelen;
2074
2075 path = alloc_path_for_send();
2076 if (!path)
2077 return -ENOMEM;
2078
2079 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
2080 if (!name) {
2081 btrfs_free_path(path);
2082 return -ENOMEM;
2083 }
2084
2085 key.objectid = send_root->objectid;
2086 key.type = BTRFS_ROOT_BACKREF_KEY;
2087 key.offset = 0;
2088
2089 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2090 &key, path, 1, 0);
2091 if (ret < 0)
2092 goto out;
2093 if (ret) {
2094 ret = -ENOENT;
2095 goto out;
2096 }
2097
2098 leaf = path->nodes[0];
2099 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2100 if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2101 key.objectid != send_root->objectid) {
2102 ret = -ENOENT;
2103 goto out;
2104 }
2105 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2106 namelen = btrfs_root_ref_name_len(leaf, ref);
2107 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2108 btrfs_release_path(path);
2109
2110 if (ret < 0)
2111 goto out;
2112
2113 if (parent_root) {
2114 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2115 if (ret < 0)
2116 goto out;
2117 } else {
2118 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2119 if (ret < 0)
2120 goto out;
2121 }
2122
2123 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2124 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2125 sctx->send_root->root_item.uuid);
2126 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2127 sctx->send_root->root_item.ctransid);
2128 if (parent_root) {
2129 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2130 sctx->parent_root->root_item.uuid);
2131 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2132 sctx->parent_root->root_item.ctransid);
2133 }
2134
2135 ret = send_cmd(sctx);
2136
2137tlv_put_failure:
2138out:
2139 btrfs_free_path(path);
2140 kfree(name);
2141 return ret;
2142}
2143
2144static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2145{
2146 int ret = 0;
2147 struct fs_path *p;
2148
2149verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2150
2151 p = fs_path_alloc(sctx);
2152 if (!p)
2153 return -ENOMEM;
2154
2155 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2156 if (ret < 0)
2157 goto out;
2158
2159 ret = get_cur_path(sctx, ino, gen, p);
2160 if (ret < 0)
2161 goto out;
2162 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2163 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2164
2165 ret = send_cmd(sctx);
2166
2167tlv_put_failure:
2168out:
2169 fs_path_free(sctx, p);
2170 return ret;
2171}
2172
2173static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2174{
2175 int ret = 0;
2176 struct fs_path *p;
2177
2178verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2179
2180 p = fs_path_alloc(sctx);
2181 if (!p)
2182 return -ENOMEM;
2183
2184 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2185 if (ret < 0)
2186 goto out;
2187
2188 ret = get_cur_path(sctx, ino, gen, p);
2189 if (ret < 0)
2190 goto out;
2191 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2192 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2193
2194 ret = send_cmd(sctx);
2195
2196tlv_put_failure:
2197out:
2198 fs_path_free(sctx, p);
2199 return ret;
2200}
2201
2202static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2203{
2204 int ret = 0;
2205 struct fs_path *p;
2206
2207verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2208
2209 p = fs_path_alloc(sctx);
2210 if (!p)
2211 return -ENOMEM;
2212
2213 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2214 if (ret < 0)
2215 goto out;
2216
2217 ret = get_cur_path(sctx, ino, gen, p);
2218 if (ret < 0)
2219 goto out;
2220 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2221 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2222 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2223
2224 ret = send_cmd(sctx);
2225
2226tlv_put_failure:
2227out:
2228 fs_path_free(sctx, p);
2229 return ret;
2230}
2231
2232static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2233{
2234 int ret = 0;
2235 struct fs_path *p = NULL;
2236 struct btrfs_inode_item *ii;
2237 struct btrfs_path *path = NULL;
2238 struct extent_buffer *eb;
2239 struct btrfs_key key;
2240 int slot;
2241
2242verbose_printk("btrfs: send_utimes %llu\n", ino);
2243
2244 p = fs_path_alloc(sctx);
2245 if (!p)
2246 return -ENOMEM;
2247
2248 path = alloc_path_for_send();
2249 if (!path) {
2250 ret = -ENOMEM;
2251 goto out;
2252 }
2253
2254 key.objectid = ino;
2255 key.type = BTRFS_INODE_ITEM_KEY;
2256 key.offset = 0;
2257 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2258 if (ret < 0)
2259 goto out;
2260
2261 eb = path->nodes[0];
2262 slot = path->slots[0];
2263 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2264
2265 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2266 if (ret < 0)
2267 goto out;
2268
2269 ret = get_cur_path(sctx, ino, gen, p);
2270 if (ret < 0)
2271 goto out;
2272 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2273 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
2274 btrfs_inode_atime(ii));
2275 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
2276 btrfs_inode_mtime(ii));
2277 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2278 btrfs_inode_ctime(ii));
2279 /* TODO otime? */
2280
2281 ret = send_cmd(sctx);
2282
2283tlv_put_failure:
2284out:
2285 fs_path_free(sctx, p);
2286 btrfs_free_path(path);
2287 return ret;
2288}
2289
2290/*
2291 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2292 * a valid path yet because we did not process the refs yet. So, the inode
2293 * is created as orphan.
2294 */
2295static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
2296 struct btrfs_key *key)
2297{
2298 int ret = 0;
2299 struct extent_buffer *eb = path->nodes[0];
2300 struct btrfs_inode_item *ii;
2301 struct fs_path *p;
2302 int slot = path->slots[0];
2303 int cmd;
2304 u64 mode;
2305
2306verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
2307
2308 p = fs_path_alloc(sctx);
2309 if (!p)
2310 return -ENOMEM;
2311
2312 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2313 mode = btrfs_inode_mode(eb, ii);
2314
2315 if (S_ISREG(mode))
2316 cmd = BTRFS_SEND_C_MKFILE;
2317 else if (S_ISDIR(mode))
2318 cmd = BTRFS_SEND_C_MKDIR;
2319 else if (S_ISLNK(mode))
2320 cmd = BTRFS_SEND_C_SYMLINK;
2321 else if (S_ISCHR(mode) || S_ISBLK(mode))
2322 cmd = BTRFS_SEND_C_MKNOD;
2323 else if (S_ISFIFO(mode))
2324 cmd = BTRFS_SEND_C_MKFIFO;
2325 else if (S_ISSOCK(mode))
2326 cmd = BTRFS_SEND_C_MKSOCK;
2327 else {
2328 printk(KERN_WARNING "btrfs: unexpected inode type %o",
2329 (int)(mode & S_IFMT));
2330 ret = -ENOTSUPP;
2331 goto out;
2332 }
2333
2334 ret = begin_cmd(sctx, cmd);
2335 if (ret < 0)
2336 goto out;
2337
2338 ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
2339 if (ret < 0)
2340 goto out;
2341
2342 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2343 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
2344
2345 if (S_ISLNK(mode)) {
2346 fs_path_reset(p);
2347 ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
2348 if (ret < 0)
2349 goto out;
2350 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2351 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2352 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2353 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
2354 }
2355
2356 ret = send_cmd(sctx);
2357 if (ret < 0)
2358 goto out;
2359
2360
2361tlv_put_failure:
2362out:
2363 fs_path_free(sctx, p);
2364 return ret;
2365}
2366
/*
 * One recorded inode ref, queued on sctx->new_refs or sctx->deleted_refs
 * until process_recorded_refs() runs. dir_path and name are pointers into
 * full_path (see record_ref()); full_path is owned by the entry and freed
 * in __free_recorded_refs().
 */
struct recorded_ref {
	struct list_head list;		/* entry in new_refs/deleted_refs */
	char *dir_path;			/* directory part, inside full_path */
	char *name;			/* basename, inside full_path */
	struct fs_path *full_path;	/* owned full path of the ref */
	u64 dir;			/* parent directory inode number */
	u64 dir_gen;			/* parent directory generation */
	int dir_path_len;
	int name_len;
};
2377
2378/*
2379 * We need to process new refs before deleted refs, but compare_tree gives us
2380 * everything mixed. So we first record all refs and later process them.
2381 * This function is a helper to record one ref.
2382 */
2383static int record_ref(struct list_head *head, u64 dir,
2384 u64 dir_gen, struct fs_path *path)
2385{
2386 struct recorded_ref *ref;
2387 char *tmp;
2388
2389 ref = kmalloc(sizeof(*ref), GFP_NOFS);
2390 if (!ref)
2391 return -ENOMEM;
2392
2393 ref->dir = dir;
2394 ref->dir_gen = dir_gen;
2395 ref->full_path = path;
2396
2397 tmp = strrchr(ref->full_path->start, '/');
2398 if (!tmp) {
2399 ref->name_len = ref->full_path->end - ref->full_path->start;
2400 ref->name = ref->full_path->start;
2401 ref->dir_path_len = 0;
2402 ref->dir_path = ref->full_path->start;
2403 } else {
2404 tmp++;
2405 ref->name_len = ref->full_path->end - tmp;
2406 ref->name = tmp;
2407 ref->dir_path = ref->full_path->start;
2408 ref->dir_path_len = ref->full_path->end -
2409 ref->full_path->start - 1 - ref->name_len;
2410 }
2411
2412 list_add_tail(&ref->list, head);
2413 return 0;
2414}
2415
2416static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2417{
2418 struct recorded_ref *cur;
2419 struct recorded_ref *tmp;
2420
2421 list_for_each_entry_safe(cur, tmp, head, list) {
2422 fs_path_free(sctx, cur->full_path);
2423 kfree(cur);
2424 }
2425 INIT_LIST_HEAD(head);
2426}
2427
/* Drop all refs recorded for the current inode (both new and deleted). */
static void free_recorded_refs(struct send_ctx *sctx)
{
	__free_recorded_refs(sctx, &sctx->new_refs);
	__free_recorded_refs(sctx, &sctx->deleted_refs);
}
2433
2434/*
2435 * Renames/moves a file/dir to it's orphan name. Used when the first
2436 * ref of an unprocessed inode gets overwritten and for all non empty
2437 * directories.
2438 */
2439static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2440 struct fs_path *path)
2441{
2442 int ret;
2443 struct fs_path *orphan;
2444
2445 orphan = fs_path_alloc(sctx);
2446 if (!orphan)
2447 return -ENOMEM;
2448
2449 ret = gen_unique_name(sctx, ino, gen, orphan);
2450 if (ret < 0)
2451 goto out;
2452
2453 ret = send_rename(sctx, path, orphan);
2454
2455out:
2456 fs_path_free(sctx, orphan);
2457 return ret;
2458}
2459
2460/*
2461 * Returns 1 if a directory can be removed at this point in time.
2462 * We check this by iterating all dir items and checking if the inode behind
2463 * the dir item was already processed.
2464 */
/*
 * Returns 1 if a directory can be removed at this point in time.
 * We check this by iterating all dir items and checking if the inode behind
 * the dir item was already processed.
 */
static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
{
	int ret = 0;
	struct btrfs_root *root = sctx->parent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key loc;
	struct btrfs_dir_item *di;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = dir;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = 0;

	/* walk all DIR_INDEX items of the directory */
	while (1) {
		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
		if (ret < 0)
			goto out;
		if (!ret) {
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					path->slots[0]);
		}
		/*
		 * found_key is only valid when ret == 0; the short-circuit
		 * on ret protects the reads below.
		 */
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			break;
		}

		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
				struct btrfs_dir_item);
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);

		/* an unprocessed child inode blocks the rmdir */
		if (loc.objectid > send_progress) {
			ret = 0;
			goto out;
		}

		btrfs_release_path(path);
		key.offset = found_key.offset + 1;
	}

	ret = 1;

out:
	btrfs_free_path(path);
	return ret;
}
2515
/*
 * Per-directory state shared between finish_outoforder_dir() and its
 * iterate_dir_item() callback __finish_unordered_dir().
 */
struct finish_unordered_dir_ctx {
	struct send_ctx *sctx;
	struct fs_path *cur_path;	/* scratch: path of the current child */
	struct fs_path *dir_path;	/* path of the directory being fixed up */
	u64 dir_ino;			/* inode number of that directory */
	int need_delete;		/* set when pass 1 left orphans to unlink */
	int delete_pass;		/* 0 = link pass, 1 = unlink pass */
};
2524
2525int __finish_unordered_dir(int num, struct btrfs_key *di_key,
2526 const char *name, int name_len,
2527 const char *data, int data_len,
2528 u8 type, void *ctx)
2529{
2530 int ret = 0;
2531 struct finish_unordered_dir_ctx *fctx = ctx;
2532 struct send_ctx *sctx = fctx->sctx;
2533 u64 di_gen;
2534 u64 di_mode;
2535 int is_orphan = 0;
2536
2537 if (di_key->objectid >= fctx->dir_ino)
2538 goto out;
2539
2540 fs_path_reset(fctx->cur_path);
2541
2542 ret = get_inode_info(sctx->send_root, di_key->objectid,
2543 NULL, &di_gen, &di_mode, NULL, NULL);
2544 if (ret < 0)
2545 goto out;
2546
2547 ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
2548 fctx->dir_ino, name, name_len);
2549 if (ret < 0)
2550 goto out;
2551 if (ret) {
2552 is_orphan = 1;
2553 ret = gen_unique_name(sctx, di_key->objectid, di_gen,
2554 fctx->cur_path);
2555 } else {
2556 ret = get_cur_path(sctx, di_key->objectid, di_gen,
2557 fctx->cur_path);
2558 }
2559 if (ret < 0)
2560 goto out;
2561
2562 ret = fs_path_add(fctx->dir_path, name, name_len);
2563 if (ret < 0)
2564 goto out;
2565
2566 if (!fctx->delete_pass) {
2567 if (S_ISDIR(di_mode)) {
2568 ret = send_rename(sctx, fctx->cur_path,
2569 fctx->dir_path);
2570 } else {
2571 ret = send_link(sctx, fctx->dir_path,
2572 fctx->cur_path);
2573 if (is_orphan)
2574 fctx->need_delete = 1;
2575 }
2576 } else if (!S_ISDIR(di_mode)) {
2577 ret = send_unlink(sctx, fctx->cur_path);
2578 } else {
2579 ret = 0;
2580 }
2581
2582 fs_path_remove(fctx->dir_path);
2583
2584out:
2585 return ret;
2586}
2587
2588/*
2589 * Go through all dir items and see if we find refs which could not be created
2590 * in the past because the dir did not exist at that time.
2591 */
/*
 * Go through all dir items and see if we find refs which could not be
 * created in the past because the dir did not exist at that time.
 */
static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
{
	int ret = 0;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct finish_unordered_dir_ctx fctx;
	int slot;

	path = alloc_path_for_send();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	memset(&fctx, 0, sizeof(fctx));
	fctx.sctx = sctx;
	fctx.cur_path = fs_path_alloc(sctx);
	fctx.dir_path = fs_path_alloc(sctx);
	if (!fctx.cur_path || !fctx.dir_path) {
		ret = -ENOMEM;
		goto out;
	}
	fctx.dir_ino = dir;

	ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
	if (ret < 0)
		goto out;

	/*
	 * We do two passes. The first links in the new refs and the second
	 * deletes orphans if required. Deletion of orphans is not required for
	 * directory inodes, as we always have only one ref and use rename
	 * instead of link for those.
	 */

again:
	key.objectid = dir;
	key.type = BTRFS_DIR_ITEM_KEY;
	key.offset = 0;
	while (1) {
		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
				1, 0);
		if (ret < 0)
			goto out;
		/*
		 * NOTE(review): a return of 1 (no item found) is not
		 * handled separately; the key read below may then refer to
		 * an unrelated slot -- the objectid/type check is relied on
		 * to terminate the loop. Confirm search_slot_for_read's
		 * contract for the past-the-end case.
		 */
		eb = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);

		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			btrfs_release_path(path);
			break;
		}

		ret = iterate_dir_item(sctx, sctx->send_root, path,
				&found_key, __finish_unordered_dir,
				&fctx);
		if (ret < 0)
			goto out;

		key.offset = found_key.offset + 1;
		btrfs_release_path(path);
	}

	/* run the second (delete) pass only when pass 1 left orphans */
	if (!fctx.delete_pass && fctx.need_delete) {
		fctx.delete_pass = 1;
		goto again;
	}

out:
	btrfs_free_path(path);
	fs_path_free(sctx, fctx.cur_path);
	fs_path_free(sctx, fctx.dir_path);
	return ret;
}
2669
/*
 * This does all the move/link/unlink/rmdir magic.
 *
 * Replays the refs recorded in sctx->new_refs and sctx->deleted_refs for the
 * current inode as send stream commands (rename/link/unlink/rmdir), resolving
 * overwrite conflicts by orphanizing inodes where needed. Parent dirs that
 * gained or lost entries are collected in check_dirs and afterwards either
 * get their utimes refreshed or are rmdir'ed once empty.
 */
static int process_recorded_refs(struct send_ctx *sctx)
{
	int ret = 0;
	struct recorded_ref *cur;
	struct ulist *check_dirs = NULL;
	struct ulist_iterator uit;
	struct ulist_node *un;
	struct fs_path *valid_path = NULL;
	u64 ow_inode = 0;
	u64 ow_gen;
	int did_overwrite = 0;
	int is_orphan = 0;

verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);

	valid_path = fs_path_alloc(sctx);
	if (!valid_path) {
		ret = -ENOMEM;
		goto out;
	}

	check_dirs = ulist_alloc(GFP_NOFS);
	if (!check_dirs) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * First, check if the first ref of the current inode was overwritten
	 * before. If yes, we know that the current inode was already orphanized
	 * and thus use the orphan name. If not, we can use get_cur_path to
	 * get the path of the first ref as it would like while receiving at
	 * this point in time.
	 * New inodes are always orphan at the beginning, so force to use the
	 * orphan name in this case.
	 * The first ref is stored in valid_path and will be updated if it
	 * gets moved around.
	 */
	if (!sctx->cur_inode_new) {
		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
				sctx->cur_inode_gen);
		if (ret < 0)
			goto out;
		if (ret)
			did_overwrite = 1;
	}
	if (sctx->cur_inode_new || did_overwrite) {
		ret = gen_unique_name(sctx, sctx->cur_ino,
				sctx->cur_inode_gen, valid_path);
		if (ret < 0)
			goto out;
		is_orphan = 1;
	} else {
		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				valid_path);
		if (ret < 0)
			goto out;
	}

	list_for_each_entry(cur, &sctx->new_refs, list) {
		/*
		 * Check if this new ref would overwrite the first ref of
		 * another unprocessed inode. If yes, orphanize the
		 * overwritten inode. If we find an overwritten ref that is
		 * not the first ref, simply unlink it.
		 */
		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
				cur->name, cur->name_len,
				&ow_inode, &ow_gen);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = is_first_ref(sctx, sctx->parent_root,
					ow_inode, cur->dir, cur->name,
					cur->name_len);
			if (ret < 0)
				goto out;
			if (ret) {
				ret = orphanize_inode(sctx, ow_inode, ow_gen,
						cur->full_path);
				if (ret < 0)
					goto out;
			} else {
				ret = send_unlink(sctx, cur->full_path);
				if (ret < 0)
					goto out;
			}
		}

		/*
		 * link/move the ref to the new place. If we have an orphan
		 * inode, move it and update valid_path. If not, link or move
		 * it depending on the inode mode.
		 */
		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
			ret = send_rename(sctx, valid_path, cur->full_path);
			if (ret < 0)
				goto out;
			is_orphan = 0;
			ret = fs_path_copy(valid_path, cur->full_path);
			if (ret < 0)
				goto out;
		} else {
			if (S_ISDIR(sctx->cur_inode_mode)) {
				/*
				 * Dirs can't be linked, so move it. For moved
				 * dirs, we always have one new and one deleted
				 * ref. The deleted ref is ignored later.
				 */
				ret = send_rename(sctx, valid_path,
						cur->full_path);
				if (ret < 0)
					goto out;
				ret = fs_path_copy(valid_path, cur->full_path);
				if (ret < 0)
					goto out;
			} else {
				ret = send_link(sctx, cur->full_path,
						valid_path);
				if (ret < 0)
					goto out;
			}
		}
		/* Remember the parent dir for the utimes/rmdir pass below. */
		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
				GFP_NOFS);
		if (ret < 0)
			goto out;
	}

	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
		/*
		 * Check if we can already rmdir the directory. If not,
		 * orphanize it. For every dir item inside that gets deleted
		 * later, we do this check again and rmdir it then if possible.
		 * See the use of check_dirs for more details.
		 */
		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = send_rmdir(sctx, valid_path);
			if (ret < 0)
				goto out;
		} else if (!is_orphan) {
			ret = orphanize_inode(sctx, sctx->cur_ino,
					sctx->cur_inode_gen, valid_path);
			if (ret < 0)
				goto out;
			is_orphan = 1;
		}

		list_for_each_entry(cur, &sctx->deleted_refs, list) {
			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
					GFP_NOFS);
			if (ret < 0)
				goto out;
		}
	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
		/*
		 * We have a non dir inode. Go through all deleted refs and
		 * unlink them if they were not already overwritten by other
		 * inodes.
		 */
		list_for_each_entry(cur, &sctx->deleted_refs, list) {
			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
					sctx->cur_ino, sctx->cur_inode_gen,
					cur->name, cur->name_len);
			if (ret < 0)
				goto out;
			if (!ret) {
				/*
				 * In case the inode was moved to a directory
				 * that was not created yet (see
				 * __record_new_ref), we can not unlink the ref
				 * as it will be needed later when the parent
				 * directory is created, so that we can move in
				 * the inode to the new dir.
				 */
				if (!is_orphan &&
				    sctx->cur_inode_first_ref_orphan) {
					ret = orphanize_inode(sctx,
							sctx->cur_ino,
							sctx->cur_inode_gen,
							cur->full_path);
					if (ret < 0)
						goto out;
					ret = gen_unique_name(sctx,
							sctx->cur_ino,
							sctx->cur_inode_gen,
							valid_path);
					if (ret < 0)
						goto out;
					is_orphan = 1;

				} else {
					ret = send_unlink(sctx, cur->full_path);
					if (ret < 0)
						goto out;
				}
			}
			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
					GFP_NOFS);
			if (ret < 0)
				goto out;
		}

		/*
		 * If the inode is still orphan, unlink the orphan. This may
		 * happen when a previous inode did overwrite the first ref
		 * of this inode and no new refs were added for the current
		 * inode.
		 * We can however not delete the orphan in case the inode relies
		 * in a directory that was not created yet (see
		 * __record_new_ref)
		 */
		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
			ret = send_unlink(sctx, valid_path);
			if (ret < 0)
				goto out;
		}
	}

	/*
	 * We did collect all parent dirs where cur_inode was once located. We
	 * now go through all these dirs and check if they are pending for
	 * deletion and if it's finally possible to perform the rmdir now.
	 * We also update the inode stats of the parent dirs here.
	 */
	ULIST_ITER_INIT(&uit);
	while ((un = ulist_next(check_dirs, &uit))) {
		/* Dirs with a higher ino were not processed yet; skip them. */
		if (un->val > sctx->cur_ino)
			continue;

		ret = get_cur_inode_state(sctx, un->val, un->aux);
		if (ret < 0)
			goto out;

		if (ret == inode_state_did_create ||
		    ret == inode_state_no_change) {
			/* TODO delayed utimes */
			ret = send_utimes(sctx, un->val, un->aux);
			if (ret < 0)
				goto out;
		} else if (ret == inode_state_did_delete) {
			ret = can_rmdir(sctx, un->val, sctx->cur_ino);
			if (ret < 0)
				goto out;
			if (ret) {
				ret = get_cur_path(sctx, un->val, un->aux,
						valid_path);
				if (ret < 0)
					goto out;
				ret = send_rmdir(sctx, valid_path);
				if (ret < 0)
					goto out;
			}
		}
	}

	/*
	 * Current inode is now at it's new position, so we must increase
	 * send_progress
	 */
	sctx->send_progress = sctx->cur_ino + 1;

	/*
	 * We may have a directory here that has pending refs which could not
	 * be created before (because the dir did not exist before, see
	 * __record_new_ref). finish_outoforder_dir will link/move the pending
	 * refs.
	 */
	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
		ret = finish_outoforder_dir(sctx, sctx->cur_ino,
				sctx->cur_inode_gen);
		if (ret < 0)
			goto out;
	}

	ret = 0;

out:
	free_recorded_refs(sctx);
	ulist_free(check_dirs);
	fs_path_free(sctx, valid_path);
	return ret;
}
2959
2960static int __record_new_ref(int num, u64 dir, int index,
2961 struct fs_path *name,
2962 void *ctx)
2963{
2964 int ret = 0;
2965 struct send_ctx *sctx = ctx;
2966 struct fs_path *p;
2967 u64 gen;
2968
2969 p = fs_path_alloc(sctx);
2970 if (!p)
2971 return -ENOMEM;
2972
2973 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
2974 NULL);
2975 if (ret < 0)
2976 goto out;
2977
2978 /*
2979 * The parent may be non-existent at this point in time. This happens
2980 * if the ino of the parent dir is higher then the current ino. In this
2981 * case, we can not process this ref until the parent dir is finally
2982 * created. If we reach the parent dir later, process_recorded_refs
2983 * will go through all dir items and process the refs that could not be
2984 * processed before. In case this is the first ref, we set
2985 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
2986 * keep an orphan of the inode so that it later can be used for
2987 * link/move
2988 */
2989 ret = is_inode_existent(sctx, dir, gen);
2990 if (ret < 0)
2991 goto out;
2992 if (!ret) {
2993 ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
2994 name->start, fs_path_len(name));
2995 if (ret < 0)
2996 goto out;
2997 if (ret)
2998 sctx->cur_inode_first_ref_orphan = 1;
2999 ret = 0;
3000 goto out;
3001 }
3002
3003 ret = get_cur_path(sctx, dir, gen, p);
3004 if (ret < 0)
3005 goto out;
3006 ret = fs_path_add_path(p, name);
3007 if (ret < 0)
3008 goto out;
3009
3010 ret = record_ref(&sctx->new_refs, dir, gen, p);
3011
3012out:
3013 if (ret)
3014 fs_path_free(sctx, p);
3015 return ret;
3016}
3017
3018static int __record_deleted_ref(int num, u64 dir, int index,
3019 struct fs_path *name,
3020 void *ctx)
3021{
3022 int ret = 0;
3023 struct send_ctx *sctx = ctx;
3024 struct fs_path *p;
3025 u64 gen;
3026
3027 p = fs_path_alloc(sctx);
3028 if (!p)
3029 return -ENOMEM;
3030
3031 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3032 NULL);
3033 if (ret < 0)
3034 goto out;
3035
3036 ret = get_cur_path(sctx, dir, gen, p);
3037 if (ret < 0)
3038 goto out;
3039 ret = fs_path_add_path(p, name);
3040 if (ret < 0)
3041 goto out;
3042
3043 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3044
3045out:
3046 if (ret)
3047 fs_path_free(sctx, p);
3048 return ret;
3049}
3050
3051static int record_new_ref(struct send_ctx *sctx)
3052{
3053 int ret;
3054
3055 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
3056 sctx->cmp_key, 0, __record_new_ref, sctx);
3057 if (ret < 0)
3058 goto out;
3059 ret = 0;
3060
3061out:
3062 return ret;
3063}
3064
3065static int record_deleted_ref(struct send_ctx *sctx)
3066{
3067 int ret;
3068
3069 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
3070 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3071 if (ret < 0)
3072 goto out;
3073 ret = 0;
3074
3075out:
3076 return ret;
3077}
3078
/* Context for __find_iref(): what to search for and where it was found. */
struct find_ref_ctx {
	u64 dir;		/* parent dir the ref must live in */
	struct fs_path *name;	/* ref name to match */
	int found_idx;		/* index of the matching ref, -1 if none */
};
3084
3085static int __find_iref(int num, u64 dir, int index,
3086 struct fs_path *name,
3087 void *ctx_)
3088{
3089 struct find_ref_ctx *ctx = ctx_;
3090
3091 if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
3092 strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
3093 ctx->found_idx = num;
3094 return 1;
3095 }
3096 return 0;
3097}
3098
3099static int find_iref(struct send_ctx *sctx,
3100 struct btrfs_root *root,
3101 struct btrfs_path *path,
3102 struct btrfs_key *key,
3103 u64 dir, struct fs_path *name)
3104{
3105 int ret;
3106 struct find_ref_ctx ctx;
3107
3108 ctx.dir = dir;
3109 ctx.name = name;
3110 ctx.found_idx = -1;
3111
3112 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
3113 if (ret < 0)
3114 return ret;
3115
3116 if (ctx.found_idx == -1)
3117 return -ENOENT;
3118
3119 return ctx.found_idx;
3120}
3121
3122static int __record_changed_new_ref(int num, u64 dir, int index,
3123 struct fs_path *name,
3124 void *ctx)
3125{
3126 int ret;
3127 struct send_ctx *sctx = ctx;
3128
3129 ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
3130 sctx->cmp_key, dir, name);
3131 if (ret == -ENOENT)
3132 ret = __record_new_ref(num, dir, index, name, sctx);
3133 else if (ret > 0)
3134 ret = 0;
3135
3136 return ret;
3137}
3138
3139static int __record_changed_deleted_ref(int num, u64 dir, int index,
3140 struct fs_path *name,
3141 void *ctx)
3142{
3143 int ret;
3144 struct send_ctx *sctx = ctx;
3145
3146 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
3147 dir, name);
3148 if (ret == -ENOENT)
3149 ret = __record_deleted_ref(num, dir, index, name, sctx);
3150 else if (ret > 0)
3151 ret = 0;
3152
3153 return ret;
3154}
3155
3156static int record_changed_ref(struct send_ctx *sctx)
3157{
3158 int ret = 0;
3159
3160 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
3161 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3162 if (ret < 0)
3163 goto out;
3164 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
3165 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3166 if (ret < 0)
3167 goto out;
3168 ret = 0;
3169
3170out:
3171 return ret;
3172}
3173
3174/*
3175 * Record and process all refs at once. Needed when an inode changes the
3176 * generation number, which means that it was deleted and recreated.
3177 */
3178static int process_all_refs(struct send_ctx *sctx,
3179 enum btrfs_compare_tree_result cmd)
3180{
3181 int ret;
3182 struct btrfs_root *root;
3183 struct btrfs_path *path;
3184 struct btrfs_key key;
3185 struct btrfs_key found_key;
3186 struct extent_buffer *eb;
3187 int slot;
3188 iterate_inode_ref_t cb;
3189
3190 path = alloc_path_for_send();
3191 if (!path)
3192 return -ENOMEM;
3193
3194 if (cmd == BTRFS_COMPARE_TREE_NEW) {
3195 root = sctx->send_root;
3196 cb = __record_new_ref;
3197 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
3198 root = sctx->parent_root;
3199 cb = __record_deleted_ref;
3200 } else {
3201 BUG();
3202 }
3203
3204 key.objectid = sctx->cmp_key->objectid;
3205 key.type = BTRFS_INODE_REF_KEY;
3206 key.offset = 0;
3207 while (1) {
3208 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3209 if (ret < 0) {
3210 btrfs_release_path(path);
3211 goto out;
3212 }
3213 if (ret) {
3214 btrfs_release_path(path);
3215 break;
3216 }
3217
3218 eb = path->nodes[0];
3219 slot = path->slots[0];
3220 btrfs_item_key_to_cpu(eb, &found_key, slot);
3221
3222 if (found_key.objectid != key.objectid ||
3223 found_key.type != key.type) {
3224 btrfs_release_path(path);
3225 break;
3226 }
3227
3228 ret = iterate_inode_ref(sctx, sctx->parent_root, path,
3229 &found_key, 0, cb, sctx);
3230 btrfs_release_path(path);
3231 if (ret < 0)
3232 goto out;
3233
3234 key.offset = found_key.offset + 1;
3235 }
3236
3237 ret = process_recorded_refs(sctx);
3238
3239out:
3240 btrfs_free_path(path);
3241 return ret;
3242}
3243
/*
 * Send a SET_XATTR command for @path with the given name/value pair.
 * Note: the TLV_PUT* macros jump to tlv_put_failure when the command
 * buffer overflows.
 */
static int send_set_xattr(struct send_ctx *sctx,
			  struct fs_path *path,
			  const char *name, int name_len,
			  const char *data, int data_len)
{
	int ret = 0;

	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
3265
/*
 * Send a REMOVE_XATTR command for @path and the given xattr name.
 * Note: the TLV_PUT* macros jump to tlv_put_failure when the command
 * buffer overflows.
 */
static int send_remove_xattr(struct send_ctx *sctx,
			     struct fs_path *path,
			     const char *name, int name_len)
{
	int ret = 0;

	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}
3285
/*
 * iterate_dir_item() callback: emit a SET_XATTR command for one xattr of
 * the current inode.
 */
static int __process_new_xattr(int num, struct btrfs_key *di_key,
			       const char *name, int name_len,
			       const char *data, int data_len,
			       u8 type, void *ctx)
{
	int ret;
	struct send_ctx *sctx = ctx;
	struct fs_path *p;
	posix_acl_xattr_header dummy_acl;

	p = fs_path_alloc(sctx);
	if (!p)
		return -ENOMEM;

	/*
	 * This hack is needed because empty acl's are stored as zero byte
	 * data in xattrs. Problem with that is, that receiving these zero byte
	 * acl's will fail later. To fix this, we send a dummy acl list that
	 * only contains the version number and no entries.
	 */
	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
		if (data_len == 0) {
			/* dummy_acl stays in scope until send_set_xattr(). */
			dummy_acl.a_version =
					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
			data = (char *)&dummy_acl;
			data_len = sizeof(dummy_acl);
		}
	}

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);

out:
	fs_path_free(sctx, p);
	return ret;
}
3326
3327static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3328 const char *name, int name_len,
3329 const char *data, int data_len,
3330 u8 type, void *ctx)
3331{
3332 int ret;
3333 struct send_ctx *sctx = ctx;
3334 struct fs_path *p;
3335
3336 p = fs_path_alloc(sctx);
3337 if (!p)
3338 return -ENOMEM;
3339
3340 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
3341 if (ret < 0)
3342 goto out;
3343
3344 ret = send_remove_xattr(sctx, p, name, name_len);
3345
3346out:
3347 fs_path_free(sctx, p);
3348 return ret;
3349}
3350
3351static int process_new_xattr(struct send_ctx *sctx)
3352{
3353 int ret = 0;
3354
3355 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
3356 sctx->cmp_key, __process_new_xattr, sctx);
3357
3358 return ret;
3359}
3360
3361static int process_deleted_xattr(struct send_ctx *sctx)
3362{
3363 int ret;
3364
3365 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
3366 sctx->cmp_key, __process_deleted_xattr, sctx);
3367
3368 return ret;
3369}
3370
/*
 * Context for __find_xattr(): the name to look for and, on success, the
 * index of the match plus a kmalloc'ed copy of its value.
 */
struct find_xattr_ctx {
	const char *name;	/* xattr name to match */
	int name_len;
	int found_idx;		/* index of the match, -1 if not found */
	char *found_data;	/* kmalloc'ed copy of the value; caller frees */
	int found_data_len;
};
3378
3379static int __find_xattr(int num, struct btrfs_key *di_key,
3380 const char *name, int name_len,
3381 const char *data, int data_len,
3382 u8 type, void *vctx)
3383{
3384 struct find_xattr_ctx *ctx = vctx;
3385
3386 if (name_len == ctx->name_len &&
3387 strncmp(name, ctx->name, name_len) == 0) {
3388 ctx->found_idx = num;
3389 ctx->found_data_len = data_len;
3390 ctx->found_data = kmalloc(data_len, GFP_NOFS);
3391 if (!ctx->found_data)
3392 return -ENOMEM;
3393 memcpy(ctx->found_data, data, data_len);
3394 return 1;
3395 }
3396 return 0;
3397}
3398
3399static int find_xattr(struct send_ctx *sctx,
3400 struct btrfs_root *root,
3401 struct btrfs_path *path,
3402 struct btrfs_key *key,
3403 const char *name, int name_len,
3404 char **data, int *data_len)
3405{
3406 int ret;
3407 struct find_xattr_ctx ctx;
3408
3409 ctx.name = name;
3410 ctx.name_len = name_len;
3411 ctx.found_idx = -1;
3412 ctx.found_data = NULL;
3413 ctx.found_data_len = 0;
3414
3415 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
3416 if (ret < 0)
3417 return ret;
3418
3419 if (ctx.found_idx == -1)
3420 return -ENOENT;
3421 if (data) {
3422 *data = ctx.found_data;
3423 *data_len = ctx.found_data_len;
3424 } else {
3425 kfree(ctx.found_data);
3426 }
3427 return ctx.found_idx;
3428}
3429
3430
3431static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3432 const char *name, int name_len,
3433 const char *data, int data_len,
3434 u8 type, void *ctx)
3435{
3436 int ret;
3437 struct send_ctx *sctx = ctx;
3438 char *found_data = NULL;
3439 int found_data_len = 0;
3440 struct fs_path *p = NULL;
3441
3442 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
3443 sctx->cmp_key, name, name_len, &found_data,
3444 &found_data_len);
3445 if (ret == -ENOENT) {
3446 ret = __process_new_xattr(num, di_key, name, name_len, data,
3447 data_len, type, ctx);
3448 } else if (ret >= 0) {
3449 if (data_len != found_data_len ||
3450 memcmp(data, found_data, data_len)) {
3451 ret = __process_new_xattr(num, di_key, name, name_len,
3452 data, data_len, type, ctx);
3453 } else {
3454 ret = 0;
3455 }
3456 }
3457
3458 kfree(found_data);
3459 fs_path_free(sctx, p);
3460 return ret;
3461}
3462
3463static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3464 const char *name, int name_len,
3465 const char *data, int data_len,
3466 u8 type, void *ctx)
3467{
3468 int ret;
3469 struct send_ctx *sctx = ctx;
3470
3471 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
3472 name, name_len, NULL, NULL);
3473 if (ret == -ENOENT)
3474 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3475 data_len, type, ctx);
3476 else if (ret >= 0)
3477 ret = 0;
3478
3479 return ret;
3480}
3481
3482static int process_changed_xattr(struct send_ctx *sctx)
3483{
3484 int ret = 0;
3485
3486 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
3487 sctx->cmp_key, __process_changed_new_xattr, sctx);
3488 if (ret < 0)
3489 goto out;
3490 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
3491 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3492
3493out:
3494 return ret;
3495}
3496
/*
 * Emit SET_XATTR commands for every xattr item of the current inode in the
 * send root. Used when the whole inode is processed at once (no per-item
 * tree compare available).
 */
static int process_all_new_xattrs(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	int slot;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	root = sctx->send_root;

	key.objectid = sctx->cmp_key->objectid;
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;
	while (1) {
		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
		if (ret < 0)
			goto out;
		if (ret) {
			/* Ran past the end of the tree: done. */
			ret = 0;
			goto out;
		}

		eb = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);

		/* Stop once we leave this inode's xattr items. */
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			goto out;
		}

		ret = iterate_dir_item(sctx, root, path, &found_key,
				       __process_new_xattr, sctx);
		if (ret < 0)
			goto out;

		btrfs_release_path(path);
		key.offset = found_key.offset + 1;
	}

out:
	btrfs_free_path(path);
	return ret;
}
3548
/*
 * Read some bytes from the current inode/file and send a write command to
 * user space.
 *
 * Returns the number of bytes actually read and sent (may be less than
 * @len; 0 at EOF), or a negative errno.
 */
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
	int ret = 0;
	struct fs_path *p;
	loff_t pos = offset;
	int readed = 0;		/* bytes returned by vfs_read() */
	mm_segment_t old_fs;

	p = fs_path_alloc(sctx);
	if (!p)
		return -ENOMEM;

	/*
	 * vfs normally only accepts user space buffers for security reasons.
	 * we only read from the file and also only provide the read_buf buffer
	 * to vfs. As this buffer does not come from a user space call, it's
	 * ok to temporary allow kernel space buffers.
	 */
	old_fs = get_fs();
	set_fs(KERNEL_DS);

verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);

	ret = open_cur_inode_file(sctx);
	if (ret < 0)
		goto out;

	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
	if (ret < 0)
		goto out;
	readed = ret;
	if (!readed)
		goto out;	/* EOF: nothing to send */

	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(sctx, p);
	set_fs(old_fs);		/* restore the original address limit */
	if (ret < 0)
		return ret;
	return readed;
}
3609
/*
 * Send a clone command to user space.
 *
 * Instructs the receiver to clone @len bytes from clone_root's inode at
 * clone_root->offset into the current inode at @offset. The source root is
 * identified by UUID and ctransid so the receiver can locate the matching
 * (sub)volume.
 */
static int send_clone(struct send_ctx *sctx,
		      u64 offset, u32 len,
		      struct clone_root *clone_root)
{
	int ret = 0;
	struct btrfs_root *clone_root2 = clone_root->root;
	struct fs_path *p;
	u64 gen;

verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
	       "clone_inode=%llu, clone_offset=%llu\n", offset, len,
		clone_root->root->objectid, clone_root->ino,
		clone_root->offset);

	p = fs_path_alloc(sctx);
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);

	/*
	 * Resolve the clone source path: inside the send root we must use
	 * the receiver-visible path (get_cur_path); for other roots the
	 * plain inode path is used.
	 */
	if (clone_root2 == sctx->send_root) {
		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
				&gen, NULL, NULL, NULL);
		if (ret < 0)
			goto out;
		ret = get_cur_path(sctx, clone_root->ino, gen, p);
	} else {
		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
	}
	if (ret < 0)
		goto out;

	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			clone_root2->root_item.uuid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
			clone_root2->root_item.ctransid);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
			clone_root->offset);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(sctx, p);
	return ret;
}
3670
3671static int send_write_or_clone(struct send_ctx *sctx,
3672 struct btrfs_path *path,
3673 struct btrfs_key *key,
3674 struct clone_root *clone_root)
3675{
3676 int ret = 0;
3677 struct btrfs_file_extent_item *ei;
3678 u64 offset = key->offset;
3679 u64 pos = 0;
3680 u64 len;
3681 u32 l;
3682 u8 type;
3683
3684 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3685 struct btrfs_file_extent_item);
3686 type = btrfs_file_extent_type(path->nodes[0], ei);
3687 if (type == BTRFS_FILE_EXTENT_INLINE)
3688 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
3689 else
3690 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3691
3692 if (offset + len > sctx->cur_inode_size)
3693 len = sctx->cur_inode_size - offset;
3694 if (len == 0) {
3695 ret = 0;
3696 goto out;
3697 }
3698
3699 if (!clone_root) {
3700 while (pos < len) {
3701 l = len - pos;
3702 if (l > BTRFS_SEND_READ_SIZE)
3703 l = BTRFS_SEND_READ_SIZE;
3704 ret = send_write(sctx, pos + offset, l);
3705 if (ret < 0)
3706 goto out;
3707 if (!ret)
3708 break;
3709 pos += ret;
3710 }
3711 ret = 0;
3712 } else {
3713 ret = send_clone(sctx, offset, len, clone_root);
3714 }
3715
3716out:
3717 return ret;
3718}
3719
/*
 * Check if the file data covered by the extent item at left_path/ekey in
 * the send root is backed by the same disk bytes in the parent root.
 * Returns 1 if unchanged, 0 if changed (or not comparable as regular
 * extents), or a negative errno.
 */
static int is_extent_unchanged(struct send_ctx *sctx,
			       struct btrfs_path *left_path,
			       struct btrfs_key *ekey)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_key found_key;
	struct btrfs_file_extent_item *ei;
	u64 left_disknr;
	u64 right_disknr;
	u64 left_offset;
	u64 right_offset;
	u64 left_offset_fixed;
	u64 left_len;
	u64 right_len;
	u8 left_type;
	u8 right_type;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	eb = left_path->nodes[0];
	slot = left_path->slots[0];

	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	left_type = btrfs_file_extent_type(eb, ei);
	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
	left_len = btrfs_file_extent_num_bytes(eb, ei);
	left_offset = btrfs_file_extent_offset(eb, ei);

	/* Only regular extents can be compared byte-wise. */
	if (left_type != BTRFS_FILE_EXTENT_REG) {
		ret = 0;
		goto out;
	}

	/*
	 * Following comments will refer to these graphics. L is the left
	 * extents which we are checking at the moment. 1-8 are the right
	 * extents that we iterate.
	 *
	 *       |-----L-----|
	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
	 *
	 *       |-----L-----|
	 * |--1--|-2b-|...(same as above)
	 *
	 * Alternative situation. Happens on files where extents got split.
	 *       |-----L-----|
	 * |-----------7-----------|-6-|
	 *
	 * Alternative situation. Happens on files which got larger.
	 *       |-----L-----|
	 * |-8-|
	 * Nothing follows after 8.
	 */

	key.objectid = ekey->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = ekey->offset;
	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		ret = 0;
		goto out;
	}

	/*
	 * Handle special case where the right side has no extents at all.
	 */
	eb = path->nodes[0];
	slot = path->slots[0];
	btrfs_item_key_to_cpu(eb, &found_key, slot);
	if (found_key.objectid != key.objectid ||
	    found_key.type != key.type) {
		ret = 0;
		goto out;
	}

	/*
	 * We're now on 2a, 2b or 7.
	 */
	key = found_key;
	while (key.offset < ekey->offset + left_len) {
		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		right_type = btrfs_file_extent_type(eb, ei);
		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
		right_len = btrfs_file_extent_num_bytes(eb, ei);
		right_offset = btrfs_file_extent_offset(eb, ei);

		if (right_type != BTRFS_FILE_EXTENT_REG) {
			ret = 0;
			goto out;
		}

		/*
		 * Are we at extent 8? If yes, we know the extent is changed.
		 * This may only happen on the first iteration.
		 */
		if (found_key.offset + right_len < ekey->offset) {
			ret = 0;
			goto out;
		}

		left_offset_fixed = left_offset;
		if (key.offset < ekey->offset) {
			/* Fix the right offset for 2a and 7. */
			right_offset += ekey->offset - key.offset;
		} else {
			/* Fix the left offset for all behind 2a and 2b */
			left_offset_fixed += key.offset - ekey->offset;
		}

		/*
		 * Check if we have the same extent.
		 */
		if (left_disknr + left_offset_fixed !=
		    right_disknr + right_offset) {
			ret = 0;
			goto out;
		}

		/*
		 * Go to the next extent.
		 */
		ret = btrfs_next_item(sctx->parent_root, path);
		if (ret < 0)
			goto out;
		if (!ret) {
			eb = path->nodes[0];
			slot = path->slots[0];
			btrfs_item_key_to_cpu(eb, &found_key, slot);
		}
		/* Ran out of right-side extents or left the file's range. */
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			key.offset += right_len;
			break;
		} else {
			if (found_key.offset != key.offset + right_len) {
				/* Should really not happen */
				ret = -EIO;
				goto out;
			}
		}
		key = found_key;
	}

	/*
	 * We're now behind the left extent (treat as unchanged) or at the end
	 * of the right side (treat as changed).
	 */
	if (key.offset >= ekey->offset + left_len)
		ret = 1;
	else
		ret = 0;


out:
	btrfs_free_path(path);
	return ret;
}
3885
3886static int process_extent(struct send_ctx *sctx,
3887 struct btrfs_path *path,
3888 struct btrfs_key *key)
3889{
3890 int ret = 0;
3891 struct clone_root *found_clone = NULL;
3892
3893 if (S_ISLNK(sctx->cur_inode_mode))
3894 return 0;
3895
3896 if (sctx->parent_root && !sctx->cur_inode_new) {
3897 ret = is_extent_unchanged(sctx, path, key);
3898 if (ret < 0)
3899 goto out;
3900 if (ret) {
3901 ret = 0;
3902 goto out;
3903 }
3904 }
3905
3906 ret = find_extent_clone(sctx, path, key->objectid, key->offset,
3907 sctx->cur_inode_size, &found_clone);
3908 if (ret != -ENOENT && ret < 0)
3909 goto out;
3910
3911 ret = send_write_or_clone(sctx, path, key, found_clone);
3912
3913out:
3914 return ret;
3915}
3916
3917static int process_all_extents(struct send_ctx *sctx)
3918{
3919 int ret;
3920 struct btrfs_root *root;
3921 struct btrfs_path *path;
3922 struct btrfs_key key;
3923 struct btrfs_key found_key;
3924 struct extent_buffer *eb;
3925 int slot;
3926
3927 root = sctx->send_root;
3928 path = alloc_path_for_send();
3929 if (!path)
3930 return -ENOMEM;
3931
3932 key.objectid = sctx->cmp_key->objectid;
3933 key.type = BTRFS_EXTENT_DATA_KEY;
3934 key.offset = 0;
3935 while (1) {
3936 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3937 if (ret < 0)
3938 goto out;
3939 if (ret) {
3940 ret = 0;
3941 goto out;
3942 }
3943
3944 eb = path->nodes[0];
3945 slot = path->slots[0];
3946 btrfs_item_key_to_cpu(eb, &found_key, slot);
3947
3948 if (found_key.objectid != key.objectid ||
3949 found_key.type != key.type) {
3950 ret = 0;
3951 goto out;
3952 }
3953
3954 ret = process_extent(sctx, path, &found_key);
3955 if (ret < 0)
3956 goto out;
3957
3958 btrfs_release_path(path);
3959 key.offset = found_key.offset + 1;
3960 }
3961
3962out:
3963 btrfs_free_path(path);
3964 return ret;
3965}
3966
3967static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
3968{
3969 int ret = 0;
3970
3971 if (sctx->cur_ino == 0)
3972 goto out;
3973 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
3974 sctx->cmp_key->type <= BTRFS_INODE_REF_KEY)
3975 goto out;
3976 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
3977 goto out;
3978
3979 ret = process_recorded_refs(sctx);
3980
3981out:
3982 return ret;
3983}
3984
/*
 * Finalize the current inode once all of its items were processed: flush
 * recorded refs, then emit truncate (regular files), chown/chmod (when
 * needed) and always utimes. Called with at_end == 0 from changed_cb()
 * before moving to another item and with at_end == 1 after the whole
 * tree compare finished.
 */
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
	int ret = 0;
	u64 left_mode;
	u64 left_uid;
	u64 left_gid;
	u64 right_mode;
	u64 right_uid;
	u64 right_gid;
	int need_chmod = 0;
	int need_chown = 0;

	/* Rename/link/unlink commands must go out before the finishing ones. */
	ret = process_recorded_refs_if_needed(sctx, at_end);
	if (ret < 0)
		goto out;

	/* Nothing to finish when no inode was started or it got deleted. */
	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
		goto out;
	/* Still on the same inode and not at the end: more items may follow. */
	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
		goto out;

	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
			&left_mode, &left_uid, &left_gid);
	if (ret < 0)
		goto out;

	/* Symlinks get mode/owner fixed at creation; never chmod/chown them. */
	if (!S_ISLNK(sctx->cur_inode_mode)) {
		if (!sctx->parent_root || sctx->cur_inode_new) {
			/* Full send or brand-new inode: always set both. */
			need_chmod = 1;
			need_chown = 1;
		} else {
			/* Incremental: only send what differs from parent. */
			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
					NULL, NULL, &right_mode, &right_uid,
					&right_gid);
			if (ret < 0)
				goto out;

			if (left_uid != right_uid || left_gid != right_gid)
				need_chown = 1;
			if (left_mode != right_mode)
				need_chmod = 1;
		}
	}

	if (S_ISREG(sctx->cur_inode_mode)) {
		/* Pin the final i_size on the receiving side. */
		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				sctx->cur_inode_size);
		if (ret < 0)
			goto out;
	}

	if (need_chown) {
		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				left_uid, left_gid);
		if (ret < 0)
			goto out;
	}
	if (need_chmod) {
		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				left_mode);
		if (ret < 0)
			goto out;
	}

	/*
	 * Need to send that every time, no matter if it actually changed
	 * between the two trees as we have done changes to the inode before.
	 */
	ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
	if (ret < 0)
		goto out;

out:
	return ret;
}
4060
/*
 * Handle a changed BTRFS_INODE_ITEM_KEY item from the tree compare.
 * Records the state of the inode we're now working on in sctx (gen, size,
 * mode, new/deleted flags) and emits a create command when a new inode
 * appears. An inode whose generation differs between the trees was
 * deleted and recreated; it is handled as delete + create from scratch.
 */
static int changed_inode(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	int ret = 0;
	struct btrfs_key *key = sctx->cmp_key;
	struct btrfs_inode_item *left_ii = NULL;
	struct btrfs_inode_item *right_ii = NULL;
	u64 left_gen = 0;
	u64 right_gen = 0;

	/* A new inode item means we're done writing the previous inode. */
	ret = close_cur_inode_file(sctx);
	if (ret < 0)
		goto out;

	sctx->cur_ino = key->objectid;
	sctx->cur_inode_new_gen = 0;
	sctx->cur_inode_first_ref_orphan = 0;
	/* Clone sources may only lie behind this position. */
	sctx->send_progress = sctx->cur_ino;

	/* NEW/CHANGED have a left (send side) item; DELETED only a right one. */
	if (result == BTRFS_COMPARE_TREE_NEW ||
	    result == BTRFS_COMPARE_TREE_CHANGED) {
		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
				sctx->left_path->slots[0],
				struct btrfs_inode_item);
		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
				left_ii);
	} else {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
				sctx->right_path->slots[0],
				struct btrfs_inode_item);
		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
				right_ii);
	}
	if (result == BTRFS_COMPARE_TREE_CHANGED) {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
				sctx->right_path->slots[0],
				struct btrfs_inode_item);

		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
				right_ii);
		/*
		 * Different generations under the same inode number mean the
		 * inode was deleted and recreated in between.
		 */
		if (left_gen != right_gen)
			sctx->cur_inode_new_gen = 1;
	}

	if (result == BTRFS_COMPARE_TREE_NEW) {
		sctx->cur_inode_gen = left_gen;
		sctx->cur_inode_new = 1;
		sctx->cur_inode_deleted = 0;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->left_path->nodes[0], left_ii);
		/* The subvolume root inode itself is never created. */
		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			ret = send_create_inode(sctx, sctx->left_path,
					sctx->cmp_key);
	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
		sctx->cur_inode_gen = right_gen;
		sctx->cur_inode_new = 0;
		sctx->cur_inode_deleted = 1;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->right_path->nodes[0], right_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->right_path->nodes[0], right_ii);
	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
		if (sctx->cur_inode_new_gen) {
			/* First process the old inode as if it was deleted. */
			sctx->cur_inode_gen = right_gen;
			sctx->cur_inode_new = 0;
			sctx->cur_inode_deleted = 1;
			sctx->cur_inode_size = btrfs_inode_size(
					sctx->right_path->nodes[0], right_ii);
			sctx->cur_inode_mode = btrfs_inode_mode(
					sctx->right_path->nodes[0], right_ii);
			ret = process_all_refs(sctx,
					BTRFS_COMPARE_TREE_DELETED);
			if (ret < 0)
				goto out;

			/* Then process the new inode as if it was created. */
			sctx->cur_inode_gen = left_gen;
			sctx->cur_inode_new = 1;
			sctx->cur_inode_deleted = 0;
			sctx->cur_inode_size = btrfs_inode_size(
					sctx->left_path->nodes[0], left_ii);
			sctx->cur_inode_mode = btrfs_inode_mode(
					sctx->left_path->nodes[0], left_ii);
			ret = send_create_inode(sctx, sctx->left_path,
					sctx->cmp_key);
			if (ret < 0)
				goto out;

			/* Send everything of the inode from scratch. */
			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
			if (ret < 0)
				goto out;
			ret = process_all_extents(sctx);
			if (ret < 0)
				goto out;
			ret = process_all_new_xattrs(sctx);
			if (ret < 0)
				goto out;
		} else {
			/* Same generation: just record the current state. */
			sctx->cur_inode_gen = left_gen;
			sctx->cur_inode_new = 0;
			sctx->cur_inode_new_gen = 0;
			sctx->cur_inode_deleted = 0;
			sctx->cur_inode_size = btrfs_inode_size(
					sctx->left_path->nodes[0], left_ii);
			sctx->cur_inode_mode = btrfs_inode_mode(
					sctx->left_path->nodes[0], left_ii);
		}
	}

out:
	return ret;
}
4174
4175static int changed_ref(struct send_ctx *sctx,
4176 enum btrfs_compare_tree_result result)
4177{
4178 int ret = 0;
4179
4180 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4181
4182 if (!sctx->cur_inode_new_gen &&
4183 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
4184 if (result == BTRFS_COMPARE_TREE_NEW)
4185 ret = record_new_ref(sctx);
4186 else if (result == BTRFS_COMPARE_TREE_DELETED)
4187 ret = record_deleted_ref(sctx);
4188 else if (result == BTRFS_COMPARE_TREE_CHANGED)
4189 ret = record_changed_ref(sctx);
4190 }
4191
4192 return ret;
4193}
4194
4195static int changed_xattr(struct send_ctx *sctx,
4196 enum btrfs_compare_tree_result result)
4197{
4198 int ret = 0;
4199
4200 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4201
4202 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
4203 if (result == BTRFS_COMPARE_TREE_NEW)
4204 ret = process_new_xattr(sctx);
4205 else if (result == BTRFS_COMPARE_TREE_DELETED)
4206 ret = process_deleted_xattr(sctx);
4207 else if (result == BTRFS_COMPARE_TREE_CHANGED)
4208 ret = process_changed_xattr(sctx);
4209 }
4210
4211 return ret;
4212}
4213
4214static int changed_extent(struct send_ctx *sctx,
4215 enum btrfs_compare_tree_result result)
4216{
4217 int ret = 0;
4218
4219 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
4220
4221 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
4222 if (result != BTRFS_COMPARE_TREE_DELETED)
4223 ret = process_extent(sctx, sctx->left_path,
4224 sctx->cmp_key);
4225 }
4226
4227 return ret;
4228}
4229
4230
4231static int changed_cb(struct btrfs_root *left_root,
4232 struct btrfs_root *right_root,
4233 struct btrfs_path *left_path,
4234 struct btrfs_path *right_path,
4235 struct btrfs_key *key,
4236 enum btrfs_compare_tree_result result,
4237 void *ctx)
4238{
4239 int ret = 0;
4240 struct send_ctx *sctx = ctx;
4241
4242 sctx->left_path = left_path;
4243 sctx->right_path = right_path;
4244 sctx->cmp_key = key;
4245
4246 ret = finish_inode_if_needed(sctx, 0);
4247 if (ret < 0)
4248 goto out;
4249
4250 if (key->type == BTRFS_INODE_ITEM_KEY)
4251 ret = changed_inode(sctx, result);
4252 else if (key->type == BTRFS_INODE_REF_KEY)
4253 ret = changed_ref(sctx, result);
4254 else if (key->type == BTRFS_XATTR_ITEM_KEY)
4255 ret = changed_xattr(sctx, result);
4256 else if (key->type == BTRFS_EXTENT_DATA_KEY)
4257 ret = changed_extent(sctx, result);
4258
4259out:
4260 return ret;
4261}
4262
/*
 * Send the whole send_root from scratch (no parent snapshot to diff
 * against): iterate every item of the root and feed each one to
 * changed_cb() as BTRFS_COMPARE_TREE_NEW. A transaction is joined while
 * iterating so the commit root cannot change underneath us; it is dropped
 * and re-joined whenever a commit wants to proceed.
 */
static int full_send_tree(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *send_root = sctx->send_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	struct extent_buffer *eb;
	int slot;
	u64 start_ctransid;
	u64 ctransid;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	/* Remember the commit generation to detect concurrent modification. */
	spin_lock(&send_root->root_times_lock);
	start_ctransid = btrfs_root_ctransid(&send_root->root_item);
	spin_unlock(&send_root->root_times_lock);

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

join_trans:
	/*
	 * We need to make sure the transaction does not get committed
	 * while we do anything on commit roots. Join a transaction to prevent
	 * this.
	 */
	trans = btrfs_join_transaction(send_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	/*
	 * Make sure the tree has not changed (the sent root is expected to
	 * be read-only for the duration of the send).
	 */
	spin_lock(&send_root->root_times_lock);
	ctransid = btrfs_root_ctransid(&send_root->root_item);
	spin_unlock(&send_root->root_times_lock);

	if (ctransid != start_ctransid) {
		WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
				     "send was modified in between. This is "
				     "probably a bug.\n");
		ret = -EIO;
		goto out;
	}

	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (ret)
		goto out_finish;	/* no items at all */

	while (1) {
		/*
		 * When someone want to commit while we iterate, end the
		 * joined transaction and rejoin.
		 */
		if (btrfs_should_end_transaction(trans, send_root)) {
			ret = btrfs_end_transaction(trans, send_root);
			trans = NULL;
			if (ret < 0)
				goto out;
			btrfs_release_path(path);
			goto join_trans;
		}

		eb = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);

		/* Without a parent snapshot every item looks "new". */
		ret = changed_cb(send_root, NULL, path, NULL,
				&found_key, BTRFS_COMPARE_TREE_NEW, sctx);
		if (ret < 0)
			goto out;

		/* Remember where to restart the search after a rejoin. */
		key.objectid = found_key.objectid;
		key.type = found_key.type;
		key.offset = found_key.offset + 1;

		ret = btrfs_next_item(send_root, path);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = 0;
			break;
		}
	}

out_finish:
	/* Finalize the very last inode (truncate/chmod/chown/utimes). */
	ret = finish_inode_if_needed(sctx, 1);

out:
	btrfs_free_path(path);
	if (trans) {
		if (!ret)
			ret = btrfs_end_transaction(trans, send_root);
		else
			btrfs_end_transaction(trans, send_root);
	}
	return ret;
}
4371
4372static int send_subvol(struct send_ctx *sctx)
4373{
4374 int ret;
4375
4376 ret = send_header(sctx);
4377 if (ret < 0)
4378 goto out;
4379
4380 ret = send_subvol_begin(sctx);
4381 if (ret < 0)
4382 goto out;
4383
4384 if (sctx->parent_root) {
4385 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
4386 changed_cb, sctx);
4387 if (ret < 0)
4388 goto out;
4389 ret = finish_inode_if_needed(sctx, 1);
4390 if (ret < 0)
4391 goto out;
4392 } else {
4393 ret = full_send_tree(sctx);
4394 if (ret < 0)
4395 goto out;
4396 }
4397
4398out:
4399 if (!ret)
4400 ret = close_cur_inode_file(sctx);
4401 else
4402 close_cur_inode_file(sctx);
4403
4404 free_recorded_refs(sctx);
4405 return ret;
4406}
4407
4408long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4409{
4410 int ret = 0;
4411 struct btrfs_root *send_root;
4412 struct btrfs_root *clone_root;
4413 struct btrfs_fs_info *fs_info;
4414 struct btrfs_ioctl_send_args *arg = NULL;
4415 struct btrfs_key key;
4416 struct file *filp = NULL;
4417 struct send_ctx *sctx = NULL;
4418 u32 i;
4419 u64 *clone_sources_tmp = NULL;
4420
4421 if (!capable(CAP_SYS_ADMIN))
4422 return -EPERM;
4423
4424 send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
4425 fs_info = send_root->fs_info;
4426
4427 arg = memdup_user(arg_, sizeof(*arg));
4428 if (IS_ERR(arg)) {
4429 ret = PTR_ERR(arg);
4430 arg = NULL;
4431 goto out;
4432 }
4433
4434 if (!access_ok(VERIFY_READ, arg->clone_sources,
4435 sizeof(*arg->clone_sources *
4436 arg->clone_sources_count))) {
4437 ret = -EFAULT;
4438 goto out;
4439 }
4440
4441 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
4442 if (!sctx) {
4443 ret = -ENOMEM;
4444 goto out;
4445 }
4446
4447 INIT_LIST_HEAD(&sctx->new_refs);
4448 INIT_LIST_HEAD(&sctx->deleted_refs);
4449 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
4450 INIT_LIST_HEAD(&sctx->name_cache_list);
4451
4452 sctx->send_filp = fget(arg->send_fd);
4453 if (IS_ERR(sctx->send_filp)) {
4454 ret = PTR_ERR(sctx->send_filp);
4455 goto out;
4456 }
4457
4458 sctx->mnt = mnt_file->f_path.mnt;
4459
4460 sctx->send_root = send_root;
4461 sctx->clone_roots_cnt = arg->clone_sources_count;
4462
4463 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
4464 sctx->send_buf = vmalloc(sctx->send_max_size);
4465 if (!sctx->send_buf) {
4466 ret = -ENOMEM;
4467 goto out;
4468 }
4469
4470 sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
4471 if (!sctx->read_buf) {
4472 ret = -ENOMEM;
4473 goto out;
4474 }
4475
4476 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
4477 (arg->clone_sources_count + 1));
4478 if (!sctx->clone_roots) {
4479 ret = -ENOMEM;
4480 goto out;
4481 }
4482
4483 if (arg->clone_sources_count) {
4484 clone_sources_tmp = vmalloc(arg->clone_sources_count *
4485 sizeof(*arg->clone_sources));
4486 if (!clone_sources_tmp) {
4487 ret = -ENOMEM;
4488 goto out;
4489 }
4490
4491 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
4492 arg->clone_sources_count *
4493 sizeof(*arg->clone_sources));
4494 if (ret) {
4495 ret = -EFAULT;
4496 goto out;
4497 }
4498
4499 for (i = 0; i < arg->clone_sources_count; i++) {
4500 key.objectid = clone_sources_tmp[i];
4501 key.type = BTRFS_ROOT_ITEM_KEY;
4502 key.offset = (u64)-1;
4503 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4504 if (!clone_root) {
4505 ret = -EINVAL;
4506 goto out;
4507 }
4508 if (IS_ERR(clone_root)) {
4509 ret = PTR_ERR(clone_root);
4510 goto out;
4511 }
4512 sctx->clone_roots[i].root = clone_root;
4513 }
4514 vfree(clone_sources_tmp);
4515 clone_sources_tmp = NULL;
4516 }
4517
4518 if (arg->parent_root) {
4519 key.objectid = arg->parent_root;
4520 key.type = BTRFS_ROOT_ITEM_KEY;
4521 key.offset = (u64)-1;
4522 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4523 if (!sctx->parent_root) {
4524 ret = -EINVAL;
4525 goto out;
4526 }
4527 }
4528
4529 /*
4530 * Clones from send_root are allowed, but only if the clone source
4531 * is behind the current send position. This is checked while searching
4532 * for possible clone sources.
4533 */
4534 sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
4535
4536 /* We do a bsearch later */
4537 sort(sctx->clone_roots, sctx->clone_roots_cnt,
4538 sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
4539 NULL);
4540
4541 ret = send_subvol(sctx);
4542 if (ret < 0)
4543 goto out;
4544
4545 ret = begin_cmd(sctx, BTRFS_SEND_C_END);
4546 if (ret < 0)
4547 goto out;
4548 ret = send_cmd(sctx);
4549 if (ret < 0)
4550 goto out;
4551
4552out:
4553 if (filp)
4554 fput(filp);
4555 kfree(arg);
4556 vfree(clone_sources_tmp);
4557
4558 if (sctx) {
4559 if (sctx->send_filp)
4560 fput(sctx->send_filp);
4561
4562 vfree(sctx->clone_roots);
4563 vfree(sctx->send_buf);
4564 vfree(sctx->read_buf);
4565
4566 name_cache_free(sctx);
4567
4568 kfree(sctx);
4569 }
4570
4571 return ret;
4572}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
new file mode 100644
index 000000000000..9934e948e57f
--- /dev/null
+++ b/fs/btrfs/send.h
@@ -0,0 +1,133 @@
/*
 * Copyright (C) 2012 Alexander Block. All rights reserved.
 * Copyright (C) 2012 STRATO. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include "ctree.h"

/*
 * On-the-wire definitions of the btrfs send stream, shared between the
 * kernel sender and userspace receivers. The stream is a header followed
 * by commands, each carrying TLV-encoded attributes.
 */

#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
#define BTRFS_SEND_STREAM_VERSION 1

/* Size of the command build buffer and of the file-data read chunks. */
#define BTRFS_SEND_BUF_SIZE (1024 * 64)
#define BTRFS_SEND_READ_SIZE (1024 * 48)

/* Value encodings usable in a TLV attribute. */
enum btrfs_tlv_type {
	BTRFS_TLV_U8,
	BTRFS_TLV_U16,
	BTRFS_TLV_U32,
	BTRFS_TLV_U64,
	BTRFS_TLV_BINARY,
	BTRFS_TLV_STRING,
	BTRFS_TLV_UUID,
	BTRFS_TLV_TIMESPEC,
};

/* First bytes of every send stream. Note: magic includes the trailing NUL. */
struct btrfs_stream_header {
	char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
	__le32 version;
} __attribute__ ((__packed__));

/* Precedes every command's attribute data. */
struct btrfs_cmd_header {
	/* len excluding the header */
	__le32 len;
	__le16 cmd;
	/* crc including the header with zero crc field */
	__le32 crc;
} __attribute__ ((__packed__));

/* Precedes every attribute value. */
struct btrfs_tlv_header {
	__le16 tlv_type;
	/* len excluding the header */
	__le16 tlv_len;
} __attribute__ ((__packed__));

/*
 * commands
 *
 * These values are part of the stream format; only append new entries,
 * never reorder or remove existing ones.
 */
enum btrfs_send_cmd {
	BTRFS_SEND_C_UNSPEC,

	BTRFS_SEND_C_SUBVOL,
	BTRFS_SEND_C_SNAPSHOT,

	BTRFS_SEND_C_MKFILE,
	BTRFS_SEND_C_MKDIR,
	BTRFS_SEND_C_MKNOD,
	BTRFS_SEND_C_MKFIFO,
	BTRFS_SEND_C_MKSOCK,
	BTRFS_SEND_C_SYMLINK,

	BTRFS_SEND_C_RENAME,
	BTRFS_SEND_C_LINK,
	BTRFS_SEND_C_UNLINK,
	BTRFS_SEND_C_RMDIR,

	BTRFS_SEND_C_SET_XATTR,
	BTRFS_SEND_C_REMOVE_XATTR,

	BTRFS_SEND_C_WRITE,
	BTRFS_SEND_C_CLONE,

	BTRFS_SEND_C_TRUNCATE,
	BTRFS_SEND_C_CHMOD,
	BTRFS_SEND_C_CHOWN,
	BTRFS_SEND_C_UTIMES,

	BTRFS_SEND_C_END,
	__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)

/* attributes in send stream — same append-only rule as the commands. */
enum {
	BTRFS_SEND_A_UNSPEC,

	BTRFS_SEND_A_UUID,
	BTRFS_SEND_A_CTRANSID,

	BTRFS_SEND_A_INO,
	BTRFS_SEND_A_SIZE,
	BTRFS_SEND_A_MODE,
	BTRFS_SEND_A_UID,
	BTRFS_SEND_A_GID,
	BTRFS_SEND_A_RDEV,
	BTRFS_SEND_A_CTIME,
	BTRFS_SEND_A_MTIME,
	BTRFS_SEND_A_ATIME,
	BTRFS_SEND_A_OTIME,

	BTRFS_SEND_A_XATTR_NAME,
	BTRFS_SEND_A_XATTR_DATA,

	BTRFS_SEND_A_PATH,
	BTRFS_SEND_A_PATH_TO,
	BTRFS_SEND_A_PATH_LINK,

	BTRFS_SEND_A_FILE_OFFSET,
	BTRFS_SEND_A_DATA,

	BTRFS_SEND_A_CLONE_UUID,
	BTRFS_SEND_A_CLONE_CTRANSID,
	BTRFS_SEND_A_CLONE_PATH,
	BTRFS_SEND_A_CLONE_OFFSET,
	BTRFS_SEND_A_CLONE_LEN,

	__BTRFS_SEND_A_MAX,
};
#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)

#ifdef __KERNEL__
long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
#endif
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c6ffa5812419..b976597b0721 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,15 +17,27 @@
17 */ 17 */
18 18
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <asm/unaligned.h>
20 21
21/* this is some deeply nasty code. ctree.h has a different 22#include "ctree.h"
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef 23
24static inline u8 get_unaligned_le8(const void *p)
25{
26 return *(u8 *)p;
27}
28
29static inline void put_unaligned_le8(u8 val, void *p)
30{
31 *(u8 *)p = val;
32}
33
34/*
35 * this is some deeply nasty code.
23 * 36 *
24 * The end result is that anyone who #includes ctree.h gets a 37 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions 38 * declaration for the btrfs_set_foo functions and btrfs_foo functions,
26 * 39 * which are wappers of btrfs_set_token_#bits functions and
27 * This file declares the macros and then #includes ctree.h, which results 40 * btrfs_get_token_#bits functions, which are defined in this file.
28 * in cpp creating the function here based on the template below.
29 * 41 *
30 * These setget functions do all the extent_buffer related mapping 42 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent 43 * required to efficiently read and write specific fields in the extent
@@ -33,103 +45,93 @@
33 * an unsigned long offset into the extent buffer which has been 45 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking. 46 * cast to a specific type. This gives us all the gcc type checking.
35 * 47 *
36 * The extent buffer api is used to do all the kmapping and page 48 * The extent buffer api is used to do the page spanning work required to
37 * spanning work required to get extent buffers in highmem and have 49 * have a metadata blocksize different from the page size.
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */ 50 */
43 51
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 52#define DEFINE_BTRFS_SETGET_BITS(bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ 53u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ 54 unsigned long off, \
47void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ 55 struct btrfs_map_token *token) \
48u##bits btrfs_token_##name(struct extent_buffer *eb, \
49 type *s, struct btrfs_map_token *token) \
50{ \ 56{ \
51 unsigned long part_offset = (unsigned long)s; \ 57 unsigned long part_offset = (unsigned long)ptr; \
52 unsigned long offset = part_offset + offsetof(type, member); \ 58 unsigned long offset = part_offset + off; \
53 type *p; \ 59 void *p; \
54 int err; \ 60 int err; \
55 char *kaddr; \ 61 char *kaddr; \
56 unsigned long map_start; \ 62 unsigned long map_start; \
57 unsigned long map_len; \ 63 unsigned long map_len; \
58 unsigned long mem_len = sizeof(((type *)0)->member); \ 64 int size = sizeof(u##bits); \
59 u##bits res; \ 65 u##bits res; \
60 if (token && token->kaddr && token->offset <= offset && \ 66 \
61 token->eb == eb && \ 67 if (token && token->kaddr && token->offset <= offset && \
62 (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ 68 token->eb == eb && \
63 kaddr = token->kaddr; \ 69 (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
64 p = (type *)(kaddr + part_offset - token->offset); \ 70 kaddr = token->kaddr; \
65 res = le##bits##_to_cpu(p->member); \ 71 p = kaddr + part_offset - token->offset; \
66 return res; \ 72 res = get_unaligned_le##bits(p + off); \
67 } \ 73 return res; \
68 err = map_private_extent_buffer(eb, offset, \ 74 } \
69 mem_len, \ 75 err = map_private_extent_buffer(eb, offset, size, \
70 &kaddr, &map_start, &map_len); \ 76 &kaddr, &map_start, &map_len); \
71 if (err) { \ 77 if (err) { \
72 __le##bits leres; \ 78 __le##bits leres; \
73 read_eb_member(eb, s, type, member, &leres); \ 79 \
74 return le##bits##_to_cpu(leres); \ 80 read_extent_buffer(eb, &leres, offset, size); \
75 } \ 81 return le##bits##_to_cpu(leres); \
76 p = (type *)(kaddr + part_offset - map_start); \ 82 } \
77 res = le##bits##_to_cpu(p->member); \ 83 p = kaddr + part_offset - map_start; \
78 if (token) { \ 84 res = get_unaligned_le##bits(p + off); \
79 token->kaddr = kaddr; \ 85 if (token) { \
80 token->offset = map_start; \ 86 token->kaddr = kaddr; \
81 token->eb = eb; \ 87 token->offset = map_start; \
82 } \ 88 token->eb = eb; \
83 return res; \ 89 } \
90 return res; \
84} \ 91} \
85void btrfs_set_token_##name(struct extent_buffer *eb, \ 92void btrfs_set_token_##bits(struct extent_buffer *eb, \
86 type *s, u##bits val, struct btrfs_map_token *token) \ 93 void *ptr, unsigned long off, u##bits val, \
94 struct btrfs_map_token *token) \
87{ \ 95{ \
88 unsigned long part_offset = (unsigned long)s; \ 96 unsigned long part_offset = (unsigned long)ptr; \
89 unsigned long offset = part_offset + offsetof(type, member); \ 97 unsigned long offset = part_offset + off; \
90 type *p; \ 98 void *p; \
91 int err; \ 99 int err; \
92 char *kaddr; \ 100 char *kaddr; \
93 unsigned long map_start; \ 101 unsigned long map_start; \
94 unsigned long map_len; \ 102 unsigned long map_len; \
95 unsigned long mem_len = sizeof(((type *)0)->member); \ 103 int size = sizeof(u##bits); \
96 if (token && token->kaddr && token->offset <= offset && \ 104 \
97 token->eb == eb && \ 105 if (token && token->kaddr && token->offset <= offset && \
98 (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ 106 token->eb == eb && \
99 kaddr = token->kaddr; \ 107 (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
100 p = (type *)(kaddr + part_offset - token->offset); \ 108 kaddr = token->kaddr; \
101 p->member = cpu_to_le##bits(val); \ 109 p = kaddr + part_offset - token->offset; \
102 return; \ 110 put_unaligned_le##bits(val, p + off); \
103 } \ 111 return; \
104 err = map_private_extent_buffer(eb, offset, \ 112 } \
105 mem_len, \ 113 err = map_private_extent_buffer(eb, offset, size, \
106 &kaddr, &map_start, &map_len); \ 114 &kaddr, &map_start, &map_len); \
107 if (err) { \ 115 if (err) { \
108 __le##bits val2; \ 116 __le##bits val2; \
109 val2 = cpu_to_le##bits(val); \ 117 \
110 write_eb_member(eb, s, type, member, &val2); \ 118 val2 = cpu_to_le##bits(val); \
111 return; \ 119 write_extent_buffer(eb, &val2, offset, size); \
112 } \ 120 return; \
113 p = (type *)(kaddr + part_offset - map_start); \ 121 } \
114 p->member = cpu_to_le##bits(val); \ 122 p = kaddr + part_offset - map_start; \
115 if (token) { \ 123 put_unaligned_le##bits(val, p + off); \
116 token->kaddr = kaddr; \ 124 if (token) { \
117 token->offset = map_start; \ 125 token->kaddr = kaddr; \
118 token->eb = eb; \ 126 token->offset = map_start; \
119 } \ 127 token->eb = eb; \
120} \ 128 } \
121void btrfs_set_##name(struct extent_buffer *eb, \ 129}
122 type *s, u##bits val) \
123{ \
124 btrfs_set_token_##name(eb, s, val, NULL); \
125} \
126u##bits btrfs_##name(struct extent_buffer *eb, \
127 type *s) \
128{ \
129 return btrfs_token_##name(eb, s, NULL); \
130} \
131 130
132#include "ctree.h" 131DEFINE_BTRFS_SETGET_BITS(8)
132DEFINE_BTRFS_SETGET_BITS(16)
133DEFINE_BTRFS_SETGET_BITS(32)
134DEFINE_BTRFS_SETGET_BITS(64)
133 135
134void btrfs_node_key(struct extent_buffer *eb, 136void btrfs_node_key(struct extent_buffer *eb,
135 struct btrfs_disk_key *disk_key, int nr) 137 struct btrfs_disk_key *disk_key, int nr)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e23991574fdf..f2eb24c477a3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -100,10 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
100 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; 100 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
101} 101}
102 102
103/* NOTE:
104 * We move write_super stuff at umount in order to avoid deadlock
105 * for umount hold all lock.
106 */
107static void save_error_info(struct btrfs_fs_info *fs_info) 103static void save_error_info(struct btrfs_fs_info *fs_info)
108{ 104{
109 __save_error_info(fs_info); 105 __save_error_info(fs_info);
@@ -125,6 +121,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
125 } 121 }
126} 122}
127 123
124#ifdef CONFIG_PRINTK
128/* 125/*
129 * __btrfs_std_error decodes expected errors from the caller and 126 * __btrfs_std_error decodes expected errors from the caller and
130 * invokes the approciate error response. 127 * invokes the approciate error response.
@@ -167,7 +164,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
167 va_end(args); 164 va_end(args);
168} 165}
169 166
170const char *logtypes[] = { 167static const char * const logtypes[] = {
171 "emergency", 168 "emergency",
172 "alert", 169 "alert",
173 "critical", 170 "critical",
@@ -185,21 +182,49 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
185 struct va_format vaf; 182 struct va_format vaf;
186 va_list args; 183 va_list args;
187 const char *type = logtypes[4]; 184 const char *type = logtypes[4];
185 int kern_level;
188 186
189 va_start(args, fmt); 187 va_start(args, fmt);
190 188
191 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { 189 kern_level = printk_get_level(fmt);
192 memcpy(lvl, fmt, 3); 190 if (kern_level) {
193 lvl[3] = '\0'; 191 size_t size = printk_skip_level(fmt) - fmt;
194 fmt += 3; 192 memcpy(lvl, fmt, size);
195 type = logtypes[fmt[1] - '0']; 193 lvl[size] = '\0';
194 fmt += size;
195 type = logtypes[kern_level - '0'];
196 } else 196 } else
197 *lvl = '\0'; 197 *lvl = '\0';
198 198
199 vaf.fmt = fmt; 199 vaf.fmt = fmt;
200 vaf.va = &args; 200 vaf.va = &args;
201
201 printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); 202 printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
203
204 va_end(args);
205}
206
207#else
208
209void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
210 unsigned int line, int errno, const char *fmt, ...)
211{
212 struct super_block *sb = fs_info->sb;
213
214 /*
215 * Special case: if the error is EROFS, and we're already
216 * under MS_RDONLY, then it is safe here.
217 */
218 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
219 return;
220
221 /* Don't go through full error handling during mount */
222 if (sb->s_flags & MS_BORN) {
223 save_error_info(fs_info);
224 btrfs_handle_error(fs_info);
225 }
202} 226}
227#endif
203 228
204/* 229/*
205 * We only mark the transaction aborted and then set the file system read-only. 230 * We only mark the transaction aborted and then set the file system read-only.
@@ -396,15 +421,23 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
396 strcmp(args[0].from, "zlib") == 0) { 421 strcmp(args[0].from, "zlib") == 0) {
397 compress_type = "zlib"; 422 compress_type = "zlib";
398 info->compress_type = BTRFS_COMPRESS_ZLIB; 423 info->compress_type = BTRFS_COMPRESS_ZLIB;
424 btrfs_set_opt(info->mount_opt, COMPRESS);
399 } else if (strcmp(args[0].from, "lzo") == 0) { 425 } else if (strcmp(args[0].from, "lzo") == 0) {
400 compress_type = "lzo"; 426 compress_type = "lzo";
401 info->compress_type = BTRFS_COMPRESS_LZO; 427 info->compress_type = BTRFS_COMPRESS_LZO;
428 btrfs_set_opt(info->mount_opt, COMPRESS);
429 btrfs_set_fs_incompat(info, COMPRESS_LZO);
430 } else if (strncmp(args[0].from, "no", 2) == 0) {
431 compress_type = "no";
432 info->compress_type = BTRFS_COMPRESS_NONE;
433 btrfs_clear_opt(info->mount_opt, COMPRESS);
434 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
435 compress_force = false;
402 } else { 436 } else {
403 ret = -EINVAL; 437 ret = -EINVAL;
404 goto out; 438 goto out;
405 } 439 }
406 440
407 btrfs_set_opt(info->mount_opt, COMPRESS);
408 if (compress_force) { 441 if (compress_force) {
409 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 442 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
410 pr_info("btrfs: force %s compression\n", 443 pr_info("btrfs: force %s compression\n",
@@ -1068,7 +1101,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1068 } 1101 }
1069 1102
1070 bdev = fs_devices->latest_bdev; 1103 bdev = fs_devices->latest_bdev;
1071 s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); 1104 s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
1105 fs_info);
1072 if (IS_ERR(s)) { 1106 if (IS_ERR(s)) {
1073 error = PTR_ERR(s); 1107 error = PTR_ERR(s);
1074 goto error_close_devices; 1108 goto error_close_devices;
@@ -1082,7 +1116,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1082 } else { 1116 } else {
1083 char b[BDEVNAME_SIZE]; 1117 char b[BDEVNAME_SIZE];
1084 1118
1085 s->s_flags = flags | MS_NOSEC;
1086 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 1119 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1087 btrfs_sb(s)->bdev_holder = fs_type; 1120 btrfs_sb(s)->bdev_holder = fs_type;
1088 error = btrfs_fill_super(s, fs_devices, data, 1121 error = btrfs_fill_super(s, fs_devices, data,
@@ -1455,6 +1488,13 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1455 ret = btrfs_scan_one_device(vol->name, FMODE_READ, 1488 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1456 &btrfs_fs_type, &fs_devices); 1489 &btrfs_fs_type, &fs_devices);
1457 break; 1490 break;
1491 case BTRFS_IOC_DEVICES_READY:
1492 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1493 &btrfs_fs_type, &fs_devices);
1494 if (ret)
1495 break;
1496 ret = !(fs_devices->num_devices == fs_devices->total_devices);
1497 break;
1458 } 1498 }
1459 1499
1460 kfree(vol); 1500 kfree(vol);
@@ -1477,16 +1517,6 @@ static int btrfs_unfreeze(struct super_block *sb)
1477 return 0; 1517 return 0;
1478} 1518}
1479 1519
1480static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1481{
1482 int ret;
1483
1484 ret = btrfs_dirty_inode(inode);
1485 if (ret)
1486 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1487 "error %d\n", btrfs_ino(inode), ret);
1488}
1489
1490static int btrfs_show_devname(struct seq_file *m, struct dentry *root) 1520static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1491{ 1521{
1492 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 1522 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -1526,7 +1556,6 @@ static const struct super_operations btrfs_super_ops = {
1526 .show_options = btrfs_show_options, 1556 .show_options = btrfs_show_options,
1527 .show_devname = btrfs_show_devname, 1557 .show_devname = btrfs_show_devname,
1528 .write_inode = btrfs_write_inode, 1558 .write_inode = btrfs_write_inode,
1529 .dirty_inode = btrfs_fs_dirty_inode,
1530 .alloc_inode = btrfs_alloc_inode, 1559 .alloc_inode = btrfs_alloc_inode,
1531 .destroy_inode = btrfs_destroy_inode, 1560 .destroy_inode = btrfs_destroy_inode,
1532 .statfs = btrfs_statfs, 1561 .statfs = btrfs_statfs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b72b068183ec..17be3dedacba 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -22,6 +22,7 @@
22#include <linux/writeback.h> 22#include <linux/writeback.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/uuid.h>
25#include "ctree.h" 26#include "ctree.h"
26#include "disk-io.h" 27#include "disk-io.h"
27#include "transaction.h" 28#include "transaction.h"
@@ -38,7 +39,6 @@ void put_transaction(struct btrfs_transaction *transaction)
38 if (atomic_dec_and_test(&transaction->use_count)) { 39 if (atomic_dec_and_test(&transaction->use_count)) {
39 BUG_ON(!list_empty(&transaction->list)); 40 BUG_ON(!list_empty(&transaction->list));
40 WARN_ON(transaction->delayed_refs.root.rb_node); 41 WARN_ON(transaction->delayed_refs.root.rb_node);
41 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
42 memset(transaction, 0, sizeof(*transaction)); 42 memset(transaction, 0, sizeof(*transaction));
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 43 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 44 }
@@ -100,8 +100,8 @@ loop:
100 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
101 cur_trans = fs_info->running_transaction; 101 cur_trans = fs_info->running_transaction;
102 goto loop; 102 goto loop;
103 } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 103 } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
104 spin_unlock(&root->fs_info->trans_lock); 104 spin_unlock(&fs_info->trans_lock);
105 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 105 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
106 return -EROFS; 106 return -EROFS;
107 } 107 }
@@ -126,7 +126,6 @@ loop:
126 cur_trans->delayed_refs.num_heads = 0; 126 cur_trans->delayed_refs.num_heads = 0;
127 cur_trans->delayed_refs.flushing = 0; 127 cur_trans->delayed_refs.flushing = 0;
128 cur_trans->delayed_refs.run_delayed_start = 0; 128 cur_trans->delayed_refs.run_delayed_start = 0;
129 cur_trans->delayed_refs.seq = 1;
130 129
131 /* 130 /*
132 * although the tree mod log is per file system and not per transaction, 131 * although the tree mod log is per file system and not per transaction,
@@ -145,10 +144,8 @@ loop:
145 } 144 }
146 atomic_set(&fs_info->tree_mod_seq, 0); 145 atomic_set(&fs_info->tree_mod_seq, 0);
147 146
148 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
149 spin_lock_init(&cur_trans->commit_lock); 147 spin_lock_init(&cur_trans->commit_lock);
150 spin_lock_init(&cur_trans->delayed_refs.lock); 148 spin_lock_init(&cur_trans->delayed_refs.lock);
151 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
152 149
153 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 150 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
154 list_add_tail(&cur_trans->list, &fs_info->trans_list); 151 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -299,6 +296,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
299 struct btrfs_transaction *cur_trans; 296 struct btrfs_transaction *cur_trans;
300 u64 num_bytes = 0; 297 u64 num_bytes = 0;
301 int ret; 298 int ret;
299 u64 qgroup_reserved = 0;
302 300
303 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 301 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
304 return ERR_PTR(-EROFS); 302 return ERR_PTR(-EROFS);
@@ -317,6 +315,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
317 * the appropriate flushing if need be. 315 * the appropriate flushing if need be.
318 */ 316 */
319 if (num_items > 0 && root != root->fs_info->chunk_root) { 317 if (num_items > 0 && root != root->fs_info->chunk_root) {
318 if (root->fs_info->quota_enabled &&
319 is_fstree(root->root_key.objectid)) {
320 qgroup_reserved = num_items * root->leafsize;
321 ret = btrfs_qgroup_reserve(root, qgroup_reserved);
322 if (ret)
323 return ERR_PTR(ret);
324 }
325
320 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
321 ret = btrfs_block_rsv_add(root, 327 ret = btrfs_block_rsv_add(root,
322 &root->fs_info->trans_block_rsv, 328 &root->fs_info->trans_block_rsv,
@@ -329,6 +335,8 @@ again:
329 if (!h) 335 if (!h)
330 return ERR_PTR(-ENOMEM); 336 return ERR_PTR(-ENOMEM);
331 337
338 sb_start_intwrite(root->fs_info->sb);
339
332 if (may_wait_transaction(root, type)) 340 if (may_wait_transaction(root, type))
333 wait_current_trans(root); 341 wait_current_trans(root);
334 342
@@ -339,6 +347,7 @@ again:
339 } while (ret == -EBUSY); 347 } while (ret == -EBUSY);
340 348
341 if (ret < 0) { 349 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb);
342 kmem_cache_free(btrfs_trans_handle_cachep, h); 351 kmem_cache_free(btrfs_trans_handle_cachep, h);
343 return ERR_PTR(ret); 352 return ERR_PTR(ret);
344 } 353 }
@@ -349,11 +358,16 @@ again:
349 h->transaction = cur_trans; 358 h->transaction = cur_trans;
350 h->blocks_used = 0; 359 h->blocks_used = 0;
351 h->bytes_reserved = 0; 360 h->bytes_reserved = 0;
361 h->root = root;
352 h->delayed_ref_updates = 0; 362 h->delayed_ref_updates = 0;
353 h->use_count = 1; 363 h->use_count = 1;
364 h->adding_csums = 0;
354 h->block_rsv = NULL; 365 h->block_rsv = NULL;
355 h->orig_rsv = NULL; 366 h->orig_rsv = NULL;
356 h->aborted = 0; 367 h->aborted = 0;
368 h->qgroup_reserved = qgroup_reserved;
369 h->delayed_ref_elem.seq = 0;
370 INIT_LIST_HEAD(&h->qgroup_ref_list);
357 371
358 smp_mb(); 372 smp_mb();
359 if (cur_trans->blocked && may_wait_transaction(root, type)) { 373 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -473,7 +487,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
473 struct btrfs_root *root) 487 struct btrfs_root *root)
474{ 488{
475 struct btrfs_transaction *cur_trans = trans->transaction; 489 struct btrfs_transaction *cur_trans = trans->transaction;
476 struct btrfs_block_rsv *rsv = trans->block_rsv;
477 int updates; 490 int updates;
478 int err; 491 int err;
479 492
@@ -481,12 +494,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
481 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 494 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
482 return 1; 495 return 1;
483 496
484 /*
485 * We need to do this in case we're deleting csums so the global block
486 * rsv get's used instead of the csum block rsv.
487 */
488 trans->block_rsv = NULL;
489
490 updates = trans->delayed_ref_updates; 497 updates = trans->delayed_ref_updates;
491 trans->delayed_ref_updates = 0; 498 trans->delayed_ref_updates = 0;
492 if (updates) { 499 if (updates) {
@@ -495,8 +502,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
495 return err; 502 return err;
496 } 503 }
497 504
498 trans->block_rsv = rsv;
499
500 return should_end_transaction(trans, root); 505 return should_end_transaction(trans, root);
501} 506}
502 507
@@ -513,8 +518,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
513 return 0; 518 return 0;
514 } 519 }
515 520
521 /*
522 * do the qgroup accounting as early as possible
523 */
524 err = btrfs_delayed_refs_qgroup_accounting(trans, info);
525
516 btrfs_trans_release_metadata(trans, root); 526 btrfs_trans_release_metadata(trans, root);
517 trans->block_rsv = NULL; 527 trans->block_rsv = NULL;
528 /*
529 * the same root has to be passed to start_transaction and
530 * end_transaction. Subvolume quota depends on this.
531 */
532 WARN_ON(trans->root != root);
533
534 if (trans->qgroup_reserved) {
535 btrfs_qgroup_free(root, trans->qgroup_reserved);
536 trans->qgroup_reserved = 0;
537 }
538
518 while (count < 2) { 539 while (count < 2) {
519 unsigned long cur = trans->delayed_ref_updates; 540 unsigned long cur = trans->delayed_ref_updates;
520 trans->delayed_ref_updates = 0; 541 trans->delayed_ref_updates = 0;
@@ -527,6 +548,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
527 } 548 }
528 count++; 549 count++;
529 } 550 }
551 btrfs_trans_release_metadata(trans, root);
552 trans->block_rsv = NULL;
553
554 sb_end_intwrite(root->fs_info->sb);
530 555
531 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
532 should_end_transaction(trans, root)) { 557 should_end_transaction(trans, root)) {
@@ -567,6 +592,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
567 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 592 root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
568 err = -EIO; 593 err = -EIO;
569 } 594 }
595 assert_qgroups_uptodate(trans);
570 596
571 memset(trans, 0, sizeof(*trans)); 597 memset(trans, 0, sizeof(*trans));
572 kmem_cache_free(btrfs_trans_handle_cachep, trans); 598 kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -785,6 +811,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
785 ret = btrfs_run_dev_stats(trans, root->fs_info); 811 ret = btrfs_run_dev_stats(trans, root->fs_info);
786 BUG_ON(ret); 812 BUG_ON(ret);
787 813
814 ret = btrfs_run_qgroups(trans, root->fs_info);
815 BUG_ON(ret);
816
817 /* run_qgroups might have added some more refs */
818 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
819 BUG_ON(ret);
820
788 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 821 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
789 next = fs_info->dirty_cowonly_roots.next; 822 next = fs_info->dirty_cowonly_roots.next;
790 list_del_init(next); 823 list_del_init(next);
@@ -926,11 +959,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
926 struct dentry *dentry; 959 struct dentry *dentry;
927 struct extent_buffer *tmp; 960 struct extent_buffer *tmp;
928 struct extent_buffer *old; 961 struct extent_buffer *old;
962 struct timespec cur_time = CURRENT_TIME;
929 int ret; 963 int ret;
930 u64 to_reserve = 0; 964 u64 to_reserve = 0;
931 u64 index = 0; 965 u64 index = 0;
932 u64 objectid; 966 u64 objectid;
933 u64 root_flags; 967 u64 root_flags;
968 uuid_le new_uuid;
934 969
935 rsv = trans->block_rsv; 970 rsv = trans->block_rsv;
936 971
@@ -957,6 +992,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
957 } 992 }
958 } 993 }
959 994
995 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
996 objectid, pending->inherit);
997 kfree(pending->inherit);
998 if (ret) {
999 pending->error = ret;
1000 goto fail;
1001 }
1002
960 key.objectid = objectid; 1003 key.objectid = objectid;
961 key.offset = (u64)-1; 1004 key.offset = (u64)-1;
962 key.type = BTRFS_ROOT_ITEM_KEY; 1005 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1016,6 +1059,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1016 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; 1059 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
1017 btrfs_set_root_flags(new_root_item, root_flags); 1060 btrfs_set_root_flags(new_root_item, root_flags);
1018 1061
1062 btrfs_set_root_generation_v2(new_root_item,
1063 trans->transid);
1064 uuid_le_gen(&new_uuid);
1065 memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
1066 memcpy(new_root_item->parent_uuid, root->root_item.uuid,
1067 BTRFS_UUID_SIZE);
1068 new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
1069 new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
1070 btrfs_set_root_otransid(new_root_item, trans->transid);
1071 memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
1072 memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
1073 btrfs_set_root_stransid(new_root_item, 0);
1074 btrfs_set_root_rtransid(new_root_item, 0);
1075
1019 old = btrfs_lock_root_node(root); 1076 old = btrfs_lock_root_node(root);
1020 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); 1077 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
1021 if (ret) { 1078 if (ret) {
@@ -1269,9 +1326,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1269 1326
1270 btrfs_run_ordered_operations(root, 0); 1327 btrfs_run_ordered_operations(root, 0);
1271 1328
1272 btrfs_trans_release_metadata(trans, root);
1273 trans->block_rsv = NULL;
1274
1275 if (cur_trans->aborted) 1329 if (cur_trans->aborted)
1276 goto cleanup_transaction; 1330 goto cleanup_transaction;
1277 1331
@@ -1282,6 +1336,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1282 if (ret) 1336 if (ret)
1283 goto cleanup_transaction; 1337 goto cleanup_transaction;
1284 1338
1339 btrfs_trans_release_metadata(trans, root);
1340 trans->block_rsv = NULL;
1341
1285 cur_trans = trans->transaction; 1342 cur_trans = trans->transaction;
1286 1343
1287 /* 1344 /*
@@ -1330,7 +1387,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1330 spin_unlock(&root->fs_info->trans_lock); 1387 spin_unlock(&root->fs_info->trans_lock);
1331 } 1388 }
1332 1389
1333 if (now < cur_trans->start_time || now - cur_trans->start_time < 1) 1390 if (!btrfs_test_opt(root, SSD) &&
1391 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1334 should_grow = 1; 1392 should_grow = 1;
1335 1393
1336 do { 1394 do {
@@ -1352,6 +1410,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1352 goto cleanup_transaction; 1410 goto cleanup_transaction;
1353 1411
1354 /* 1412 /*
1413 * running the delayed items may have added new refs. account
1414 * them now so that they hinder processing of more delayed refs
1415 * as little as possible.
1416 */
1417 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1418
1419 /*
1355 * rename don't use btrfs_join_transaction, so, once we 1420 * rename don't use btrfs_join_transaction, so, once we
1356 * set the transaction to blocked above, we aren't going 1421 * set the transaction to blocked above, we aren't going
1357 * to get any new ordered operations. We can safely run 1422 * to get any new ordered operations. We can safely run
@@ -1463,6 +1528,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1463 root->fs_info->chunk_root->node); 1528 root->fs_info->chunk_root->node);
1464 switch_commit_root(root->fs_info->chunk_root); 1529 switch_commit_root(root->fs_info->chunk_root);
1465 1530
1531 assert_qgroups_uptodate(trans);
1466 update_super_roots(root); 1532 update_super_roots(root);
1467 1533
1468 if (!root->fs_info->log_root_recovering) { 1534 if (!root->fs_info->log_root_recovering) {
@@ -1517,6 +1583,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1517 put_transaction(cur_trans); 1583 put_transaction(cur_trans);
1518 put_transaction(cur_trans); 1584 put_transaction(cur_trans);
1519 1585
1586 sb_end_intwrite(root->fs_info->sb);
1587
1520 trace_btrfs_transaction_commit(root); 1588 trace_btrfs_transaction_commit(root);
1521 1589
1522 btrfs_scrub_continue(root); 1590 btrfs_scrub_continue(root);
@@ -1532,6 +1600,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1532 return ret; 1600 return ret;
1533 1601
1534cleanup_transaction: 1602cleanup_transaction:
1603 btrfs_trans_release_metadata(trans, root);
1604 trans->block_rsv = NULL;
1535 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); 1605 btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
1536// WARN_ON(1); 1606// WARN_ON(1);
1537 if (current->journal_info == trans) 1607 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fe27379e368b..e8b8416c688b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -20,6 +20,7 @@
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h"
23 24
24struct btrfs_transaction { 25struct btrfs_transaction {
25 u64 transid; 26 u64 transid;
@@ -49,6 +50,7 @@ struct btrfs_transaction {
49struct btrfs_trans_handle { 50struct btrfs_trans_handle {
50 u64 transid; 51 u64 transid;
51 u64 bytes_reserved; 52 u64 bytes_reserved;
53 u64 qgroup_reserved;
52 unsigned long use_count; 54 unsigned long use_count;
53 unsigned long blocks_reserved; 55 unsigned long blocks_reserved;
54 unsigned long blocks_used; 56 unsigned long blocks_used;
@@ -57,12 +59,22 @@ struct btrfs_trans_handle {
57 struct btrfs_block_rsv *block_rsv; 59 struct btrfs_block_rsv *block_rsv;
58 struct btrfs_block_rsv *orig_rsv; 60 struct btrfs_block_rsv *orig_rsv;
59 int aborted; 61 int aborted;
62 int adding_csums;
63 /*
64 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction.
66 * Subvolume quota depends on this
67 */
68 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list;
60}; 71};
61 72
62struct btrfs_pending_snapshot { 73struct btrfs_pending_snapshot {
63 struct dentry *dentry; 74 struct dentry *dentry;
64 struct btrfs_root *root; 75 struct btrfs_root *root;
65 struct btrfs_root *snap; 76 struct btrfs_root *snap;
77 struct btrfs_qgroup_inherit *inherit;
66 /* block reservation for the operation */ 78 /* block reservation for the operation */
67 struct btrfs_block_rsv block_rsv; 79 struct btrfs_block_rsv block_rsv;
68 /* extra metadata reseration for relocation */ 80 /* extra metadata reseration for relocation */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8abeae4224f9..c86670f4f285 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -637,7 +637,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
637 } 637 }
638 638
639 inode_set_bytes(inode, saved_nbytes); 639 inode_set_bytes(inode, saved_nbytes);
640 btrfs_update_inode(trans, root, inode); 640 ret = btrfs_update_inode(trans, root, inode);
641out: 641out:
642 if (inode) 642 if (inode)
643 iput(inode); 643 iput(inode);
@@ -1133,7 +1133,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1133 btrfs_release_path(path); 1133 btrfs_release_path(path);
1134 if (ret == 0) { 1134 if (ret == 0) {
1135 btrfs_inc_nlink(inode); 1135 btrfs_inc_nlink(inode);
1136 btrfs_update_inode(trans, root, inode); 1136 ret = btrfs_update_inode(trans, root, inode);
1137 } else if (ret == -EEXIST) { 1137 } else if (ret == -EEXIST) {
1138 ret = 0; 1138 ret = 0;
1139 } else { 1139 } else {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ecaad40e7ef4..e86ae04abe6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -429,6 +429,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
429 mutex_init(&fs_devices->device_list_mutex); 429 mutex_init(&fs_devices->device_list_mutex);
430 fs_devices->latest_devid = orig->latest_devid; 430 fs_devices->latest_devid = orig->latest_devid;
431 fs_devices->latest_trans = orig->latest_trans; 431 fs_devices->latest_trans = orig->latest_trans;
432 fs_devices->total_devices = orig->total_devices;
432 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 433 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
433 434
434 /* We have held the volume lock, it is safe to get the devices. */ 435 /* We have held the volume lock, it is safe to get the devices. */
@@ -739,6 +740,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
739 int ret; 740 int ret;
740 u64 devid; 741 u64 devid;
741 u64 transid; 742 u64 transid;
743 u64 total_devices;
742 744
743 flags |= FMODE_EXCL; 745 flags |= FMODE_EXCL;
744 bdev = blkdev_get_by_path(path, flags, holder); 746 bdev = blkdev_get_by_path(path, flags, holder);
@@ -760,6 +762,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
760 disk_super = (struct btrfs_super_block *)bh->b_data; 762 disk_super = (struct btrfs_super_block *)bh->b_data;
761 devid = btrfs_stack_device_id(&disk_super->dev_item); 763 devid = btrfs_stack_device_id(&disk_super->dev_item);
762 transid = btrfs_super_generation(disk_super); 764 transid = btrfs_super_generation(disk_super);
765 total_devices = btrfs_super_num_devices(disk_super);
763 if (disk_super->label[0]) 766 if (disk_super->label[0])
764 printk(KERN_INFO "device label %s ", disk_super->label); 767 printk(KERN_INFO "device label %s ", disk_super->label);
765 else 768 else
@@ -767,7 +770,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
767 printk(KERN_CONT "devid %llu transid %llu %s\n", 770 printk(KERN_CONT "devid %llu transid %llu %s\n",
768 (unsigned long long)devid, (unsigned long long)transid, path); 771 (unsigned long long)devid, (unsigned long long)transid, path);
769 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 772 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
770 773 if (!ret && fs_devices_ret)
774 (*fs_devices_ret)->total_devices = total_devices;
771 brelse(bh); 775 brelse(bh);
772error_close: 776error_close:
773 mutex_unlock(&uuid_mutex); 777 mutex_unlock(&uuid_mutex);
@@ -1433,6 +1437,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1433 list_del_rcu(&device->dev_list); 1437 list_del_rcu(&device->dev_list);
1434 1438
1435 device->fs_devices->num_devices--; 1439 device->fs_devices->num_devices--;
1440 device->fs_devices->total_devices--;
1436 1441
1437 if (device->missing) 1442 if (device->missing)
1438 root->fs_info->fs_devices->missing_devices--; 1443 root->fs_info->fs_devices->missing_devices--;
@@ -1550,6 +1555,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1550 fs_devices->seeding = 0; 1555 fs_devices->seeding = 0;
1551 fs_devices->num_devices = 0; 1556 fs_devices->num_devices = 0;
1552 fs_devices->open_devices = 0; 1557 fs_devices->open_devices = 0;
1558 fs_devices->total_devices = 0;
1553 fs_devices->seed = seed_devices; 1559 fs_devices->seed = seed_devices;
1554 1560
1555 generate_random_uuid(fs_devices->fsid); 1561 generate_random_uuid(fs_devices->fsid);
@@ -1738,10 +1744,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1738 1744
1739 device->fs_devices = root->fs_info->fs_devices; 1745 device->fs_devices = root->fs_info->fs_devices;
1740 1746
1741 /*
1742 * we don't want write_supers to jump in here with our device
1743 * half setup
1744 */
1745 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1747 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1746 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1748 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1747 list_add(&device->dev_alloc_list, 1749 list_add(&device->dev_alloc_list,
@@ -1749,6 +1751,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1749 root->fs_info->fs_devices->num_devices++; 1751 root->fs_info->fs_devices->num_devices++;
1750 root->fs_info->fs_devices->open_devices++; 1752 root->fs_info->fs_devices->open_devices++;
1751 root->fs_info->fs_devices->rw_devices++; 1753 root->fs_info->fs_devices->rw_devices++;
1754 root->fs_info->fs_devices->total_devices++;
1752 if (device->can_discard) 1755 if (device->can_discard)
1753 root->fs_info->fs_devices->num_can_discard++; 1756 root->fs_info->fs_devices->num_can_discard++;
1754 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1757 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
@@ -4736,9 +4739,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4736 key.offset = device->devid; 4739 key.offset = device->devid;
4737 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 4740 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4738 if (ret) { 4741 if (ret) {
4739 printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
4740 rcu_str_deref(device->name),
4741 (unsigned long long)device->devid);
4742 __btrfs_reset_dev_stats(device); 4742 __btrfs_reset_dev_stats(device);
4743 device->dev_stats_valid = 1; 4743 device->dev_stats_valid = 1;
4744 btrfs_release_path(path); 4744 btrfs_release_path(path);
@@ -4880,6 +4880,14 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4880 4880
4881static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 4881static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4882{ 4882{
4883 int i;
4884
4885 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4886 if (btrfs_dev_stat_read(dev, i) != 0)
4887 break;
4888 if (i == BTRFS_DEV_STAT_VALUES_MAX)
4889 return; /* all values == 0, suppress message */
4890
4883 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4891 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4884 rcu_str_deref(dev->name), 4892 rcu_str_deref(dev->name),
4885 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4893 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -4890,8 +4898,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4890} 4898}
4891 4899
4892int btrfs_get_dev_stats(struct btrfs_root *root, 4900int btrfs_get_dev_stats(struct btrfs_root *root,
4893 struct btrfs_ioctl_get_dev_stats *stats, 4901 struct btrfs_ioctl_get_dev_stats *stats)
4894 int reset_after_read)
4895{ 4902{
4896 struct btrfs_device *dev; 4903 struct btrfs_device *dev;
4897 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4904 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
@@ -4909,7 +4916,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4909 printk(KERN_WARNING 4916 printk(KERN_WARNING
4910 "btrfs: get dev_stats failed, not yet valid\n"); 4917 "btrfs: get dev_stats failed, not yet valid\n");
4911 return -ENODEV; 4918 return -ENODEV;
4912 } else if (reset_after_read) { 4919 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
4913 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 4920 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4914 if (stats->nr_items > i) 4921 if (stats->nr_items > i)
4915 stats->values[i] = 4922 stats->values[i] =
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 95f6637614db..5479325987b3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -126,6 +126,7 @@ struct btrfs_fs_devices {
126 u64 missing_devices; 126 u64 missing_devices;
127 u64 total_rw_bytes; 127 u64 total_rw_bytes;
128 u64 num_can_discard; 128 u64 num_can_discard;
129 u64 total_devices;
129 struct block_device *latest_bdev; 130 struct block_device *latest_bdev;
130 131
131 /* all of the devices in the FS, protected by a mutex 132 /* all of the devices in the FS, protected by a mutex
@@ -293,8 +294,7 @@ struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
293void btrfs_dev_stat_print_on_error(struct btrfs_device *device); 294void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
294void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 295void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
295int btrfs_get_dev_stats(struct btrfs_root *root, 296int btrfs_get_dev_stats(struct btrfs_root *root,
296 struct btrfs_ioctl_get_dev_stats *stats, 297 struct btrfs_ioctl_get_dev_stats *stats);
297 int reset_after_read);
298int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 298int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
299int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 299int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
300 struct btrfs_fs_info *fs_info); 300 struct btrfs_fs_info *fs_info);