Diffstat (limited to 'fs/btrfs')
 fs/btrfs/Makefile             |    2
 fs/btrfs/acl.c                |    7
 fs/btrfs/backref.c            |   41
 fs/btrfs/backref.h            |    8
 fs/btrfs/btrfs_inode.h        |    4
 fs/btrfs/check-integrity.c    |    5
 fs/btrfs/compression.c        |    6
 fs/btrfs/ctree.c              |  104
 fs/btrfs/ctree.h              |  143
 fs/btrfs/delayed-inode.c      |    7
 fs/btrfs/delayed-ref.c        |   39
 fs/btrfs/delayed-ref.h        |   24
 fs/btrfs/dev-replace.c        |    2
 fs/btrfs/disk-io.c            |  113
 fs/btrfs/disk-io.h            |    1
 fs/btrfs/extent-tree.c        |  470
 fs/btrfs/extent_io.c          |  414
 fs/btrfs/extent_io.h          |    2
 fs/btrfs/file-item.c          |   80
 fs/btrfs/file.c               |  157
 fs/btrfs/free-space-cache.c   |  312
 fs/btrfs/inode-map.c          |    2
 fs/btrfs/inode.c              |  299
 fs/btrfs/ioctl.c              |  398
 fs/btrfs/lzo.c                |   14
 fs/btrfs/ordered-data.c       |    2
 fs/btrfs/qgroup.c             |  937
 fs/btrfs/qgroup.h             |  107
 fs/btrfs/relocation.c         |   21
 fs/btrfs/root-tree.c          |    2
 fs/btrfs/scrub.c              |    9
 fs/btrfs/send.c               |  297
 fs/btrfs/super.c              |   13
 fs/btrfs/sysfs.c              |   50
 fs/btrfs/tests/btrfs-tests.c  |   97
 fs/btrfs/tests/btrfs-tests.h  |    9
 fs/btrfs/tests/inode-tests.c  |   35
 fs/btrfs/tests/qgroup-tests.c |  468
 fs/btrfs/transaction.c        |  113
 fs/btrfs/transaction.h        |    1
 fs/btrfs/tree-defrag.c        |    2
 fs/btrfs/tree-log.c           |   49
 fs/btrfs/tree-log.h           |   16
 fs/btrfs/volumes.c            |  122
 fs/btrfs/volumes.h            |    1
 fs/btrfs/zlib.c               |   26
 46 files changed, 3713 insertions(+), 1318 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f341a98031d2..6d1d0b93b1aa 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
-	tests/extent-io-tests.o tests/inode-tests.o
+	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index ff9b3995d453..9a0124a95851 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 	const char *name;
 	char *value = NULL;
 
-	if (acl) {
-		ret = posix_acl_valid(acl);
-		if (ret < 0)
-			return ret;
-		ret = 0;
-	}
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
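
Note: the posix_acl_valid() call removed above is redundant rather than lost. With the generic ->set_acl() infrastructure, the VFS validates the ACL before the filesystem hook is reached; roughly this caller-side shape (paraphrased from fs/posix_acl.c, not part of this patch):

	/* sketch of the generic xattr path that guards ->set_acl() */
	if (acl) {
		ret = posix_acl_valid(acl);
		if (ret)
			return ret;	/* invalid ACLs never reach btrfs */
	}
	ret = inode->i_op->set_acl(inode, acl, type);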
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 10db21fa0926..e25564bfcb46 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -900,7 +900,11 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
 	if (trans) {
+#endif
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
@@ -984,11 +988,12 @@ again:
 			goto out;
 		}
 		if (ref->count && ref->parent) {
-			if (extent_item_pos && !ref->inode_list) {
+			if (extent_item_pos && !ref->inode_list &&
+			    ref->level == 0) {
 				u32 bsz;
 				struct extent_buffer *eb;
 				bsz = btrfs_level_size(fs_info->extent_root,
-						       info_level);
+						       ref->level);
 				eb = read_tree_block(fs_info->extent_root,
 						     ref->parent, bsz, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
@@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
  * returns <0 on error
  */
 static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
-				   struct btrfs_extent_item *ei, u32 item_size,
-				   struct btrfs_extent_inline_ref **out_eiref,
-				   int *out_type)
+				   struct btrfs_key *key,
+				   struct btrfs_extent_item *ei, u32 item_size,
+				   struct btrfs_extent_inline_ref **out_eiref,
+				   int *out_type)
 {
 	unsigned long end;
 	u64 flags;
@@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
 		/* first call */
 		flags = btrfs_extent_flags(eb, ei);
 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-			info = (struct btrfs_tree_block_info *)(ei + 1);
-			*out_eiref =
-				(struct btrfs_extent_inline_ref *)(info + 1);
+			if (key->type == BTRFS_METADATA_ITEM_KEY) {
+				/* a skinny metadata extent */
+				*out_eiref =
+				     (struct btrfs_extent_inline_ref *)(ei + 1);
+			} else {
+				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+				info = (struct btrfs_tree_block_info *)(ei + 1);
+				*out_eiref =
+				   (struct btrfs_extent_inline_ref *)(info + 1);
+			}
 		} else {
 			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
 		}
 		*ptr = (unsigned long)*out_eiref;
-		if ((void *)*ptr >= (void *)ei + item_size)
+		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
 			return -ENOENT;
 	}
 
 	end = (unsigned long)ei + item_size;
-	*out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
 	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
 
 	*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
  * <0 on error.
  */
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
-			    struct btrfs_extent_item *ei, u32 item_size,
-			    u64 *out_root, u8 *out_level)
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
+			    u32 item_size, u64 *out_root, u8 *out_level)
 {
 	int ret;
 	int type;
@@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 		return 1;
 
 	while (1) {
-		ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
-					      &eiref, &type);
+		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+					      &eiref, &type);
 		if (ret < 0)
 			return ret;
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index a910b27a8ad9..86fc20fec282 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
-			    struct btrfs_extent_item *ei, u32 item_size,
-			    u64 *out_root, u8 *out_level);
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
+			    u32 item_size, u64 *out_root, u8 *out_level);
 
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			  u64 extent_item_objectid,
@@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
-			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 time_seq, struct ulist **roots);
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 time_seq, struct ulist **roots);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c9a24444ec9a..4794923c410c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -279,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
 
 static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
 {
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 		  &BTRFS_I(inode)->runtime_flags);
 }
 
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
 #endif
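
Note: the barrier rename follows the kernel-wide cleanup that merged the operation-specific barriers (smp_mb__before_clear_bit() and friends) into one generic smp_mb__before_atomic()/smp_mb__after_atomic() pair. The usage pattern is unchanged; a generic sketch of the pairing (illustrative, not the exact btrfs counterpart):

	/* writer: order earlier stores before the RMW bit operation */
	smp_mb__before_atomic();
	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
		  &BTRFS_I(inode)->runtime_flags);

	/* the other direction uses smp_mb__after_atomic() after the bitop */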
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0e8388e72d8d..ce92ae30250f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error:
 				next_stack =
 				    btrfsic_stack_frame_alloc();
 				if (NULL == next_stack) {
+					sf->error = -1;
 					btrfsic_release_block_ctx(
 					    &sf->
 					     next_block_ctx);
@@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame:
 			    sf->next_block_ctx.datav[0];
 
 			next_stack = btrfsic_stack_frame_alloc();
-			if (NULL == next_stack)
+			if (NULL == next_stack) {
+				sf->error = -1;
 				goto one_stack_frame_backwards;
+			}
 
 			next_stack->i = -1;
 			next_stack->block = sf->next_block;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d43c544d3b68..92371c414228 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
 
 	workspace = find_workspace(type);
 	if (IS_ERR(workspace))
-		return -1;
+		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 							start, len, pages,
@@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
 
 	workspace = find_workspace(type);
 	if (IS_ERR(workspace))
-		return -ENOMEM;
+		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
 							   disk_start,
@@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 
 	workspace = find_workspace(type);
 	if (IS_ERR(workspace))
-		return -ENOMEM;
+		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
 						    dest_page, start_byte,
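
Note: all three conversions are the same idiom. find_workspace() hands back an ERR_PTR-encoded errno, and PTR_ERR() propagates that exact value instead of a made-up -1 or a guessed -ENOMEM:

	struct list_head *workspace;

	workspace = find_workspace(type);
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);	/* propagate the encoded errno */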
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1bcfcdb23cf4..aeab453b8e24 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
 	spin_lock(&root->fs_info->trans_lock);
-	if (root->track_dirty && list_empty(&root->dirty_list)) {
+	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
+	    list_empty(&root->dirty_list)) {
 		list_add(&root->dirty_list,
 			 &root->fs_info->dirty_cowonly_roots);
 	}
@@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	int level;
 	struct btrfs_disk_key disk_key;
 
-	WARN_ON(root->ref_cows && trans->transid !=
-		root->fs_info->running_transaction->transid);
-	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->fs_info->running_transaction->transid);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
 	if (level == 0)
@@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
- */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
-	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
-	seq &= 0xffffffff00000000ull;
-	seq += 1ull << 32;
-	atomic64_set(&fs_info->tree_mod_seq, seq);
-	return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
+ * Pull a new tree mod seq number for our operation.
  */
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 {
 	return atomic64_inc_return(&fs_info->tree_mod_seq);
 }
 
 /*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
-	return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
-/*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
  * to record tree modifications, it should ensure to set elem->seq to zero
@@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem)
 {
-	u64 seq;
-
 	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	if (!elem->seq) {
-		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 	}
-	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 	tree_mod_log_write_unlock(fs_info);
 
-	return seq;
+	return elem->seq;
 }
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 
 	BUG_ON(!tm);
 
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 
 	tm_root = &fs_info->tree_mod_log;
 	new = &tm_root->rb_node;
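
Note: with the major/minor split gone, blockers and log entries draw from one monotonic counter. Roughly how the two remaining entry points relate (a sketch of usage, not new code from this patch):

	struct seq_list elem = {};

	/* blocker: takes the next seq once and joins the blocker list;
	 * later calls with the same elem just return elem.seq again */
	seq = btrfs_get_tree_mod_seq(fs_info, &elem);

	/* every tree mod log entry gets its own, newer seq */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	/* entries newer than the oldest live blocker stay in the log
	 * until that blocker drops out */
	btrfs_put_tree_mod_seq(fs_info, &elem);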
@@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 	 * snapshot and the block was not allocated by tree relocation,
 	 * we know the block is not shared.
 	 */
-	if (root->ref_cows &&
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 	    buf != root->node && buf != root->commit_root &&
 	    (btrfs_header_generation(buf) <=
 	     btrfs_root_last_snapshot(&root->root_item) ||
 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 		return 1;
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-	if (root->ref_cows &&
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 		return 1;
 #endif
@@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	btrfs_assert_tree_locked(buf);
 
-	WARN_ON(root->ref_cows && trans->transid !=
-		root->fs_info->running_transaction->transid);
-	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->fs_info->running_transaction->transid);
+	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+		trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
 
@@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	if (root->ref_cows) {
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
 		if (ret)
 			return ret;
@@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root,
 				    struct extent_buffer *buf)
 {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+		return 0;
+#endif
 	/* ensure we can see the force_cow */
 	smp_rmb();
 
@@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
-	    !root->force_cow)
+	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
 		return 0;
 	return 1;
 }
@@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		return ret;
 	btrfs_item_key(path->nodes[0], &found_key, 0);
 	ret = comp_keys(&found_key, &key);
-	if (ret < 0)
+	/*
+	 * We might have had an item with the previous key in the tree right
+	 * before we released our path. And after we released our path, that
+	 * item might have been pushed to the first slot (0) of the leaf we
+	 * were holding due to a tree balance. Alternatively, an item with the
+	 * previous key can exist as the only element of a leaf (big fat item).
+	 * Therefore account for these 2 cases, so that our callers (like
+	 * btrfs_previous_item) don't miss an existing item with a key matching
+	 * the previous key we computed above.
+	 */
+	if (ret <= 0)
 		return 0;
 	return 1;
 }
@@ -5736,6 +5718,24 @@ again:
 		ret = 0;
 		goto done;
 	}
+	/*
+	 * So the above check misses one case:
+	 * - after releasing the path above, someone has removed the item that
+	 *   used to be at the very end of the block, and balance between leafs
+	 *   gets another one with bigger key.offset to replace it.
+	 *
+	 * This one should be returned as well, or we can get leaf corruption
+	 * later(esp. in __btrfs_drop_extents()).
+	 *
+	 * And a bit more explanation about this check,
+	 * with ret > 0, the key isn't found, the path points to the slot
+	 * where it should be inserted, so the path->slots[0] item must be the
+	 * bigger one.
+	 */
+	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+		ret = 0;
+		goto done;
+	}
 
 	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level]) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ba6b88528dc7..b7e2c1c1ef36 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
 #include <linux/btrfs.h>
+#include <linux/workqueue.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -756,6 +757,12 @@ struct btrfs_dir_item {
 
 #define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
 
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD		(1ULL << 48)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -840,7 +847,10 @@ struct btrfs_disk_balance_args {
 	/* BTRFS_BALANCE_ARGS_* */
 	__le64 flags;
 
-	__le64 unused[8];
+	/* BTRFS_BALANCE_ARGS_LIMIT value */
+	__le64 limit;
+
+	__le64 unused[7];
 } __attribute__ ((__packed__));
 
 /*
@@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item {
 	__le64 rsv_excl;
 } __attribute__ ((__packed__));
 
+/* For raid type sysfs entries */
+struct raid_kobject {
+	int raid_type;
+	struct kobject kobj;
+};
+
 struct btrfs_space_info {
 	spinlock_t lock;
 
@@ -1163,7 +1179,7 @@ struct btrfs_space_info {
 	wait_queue_head_t wait;
 
 	struct kobject kobj;
-	struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES];
+	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
 };
 
 #define BTRFS_BLOCK_RSV_GLOBAL		1
@@ -1313,6 +1329,8 @@ struct btrfs_stripe_hash_table {
 
 #define BTRFS_STRIPE_HASH_TABLE_BITS 11
 
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1534,6 +1552,9 @@ struct btrfs_fs_info {
 	 */
 	struct btrfs_workqueue *fixup_workers;
 	struct btrfs_workqueue *delayed_workers;
+
+	/* the extent workers do delayed refs on the extent allocation tree */
+	struct btrfs_workqueue *extent_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
@@ -1636,7 +1657,10 @@ struct btrfs_fs_info {
 
 	/* holds configuration and tracking. Protected by qgroup_lock */
 	struct rb_root qgroup_tree;
+	struct rb_root qgroup_op_tree;
 	spinlock_t qgroup_lock;
+	spinlock_t qgroup_op_lock;
+	atomic_t qgroup_op_seq;
 
 	/*
 	 * used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1688,6 +1712,9 @@ struct btrfs_fs_info {
 
 	struct semaphore uuid_tree_rescan_sem;
 	unsigned int update_uuid_tree_gen:1;
+
+	/* Used to reclaim the metadata space in the background. */
+	struct work_struct async_reclaim_work;
 };
 
 struct btrfs_subvolume_writers {
@@ -1696,6 +1723,26 @@ struct btrfs_subvolume_writers {
 };
 
 /*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race is very small, and only the first time the root
+ * is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP	0
+#define BTRFS_ROOT_REF_COWS		1
+#define BTRFS_ROOT_TRACK_DIRTY		2
+#define BTRFS_ROOT_IN_RADIX		3
+#define BTRFS_ROOT_DUMMY_ROOT		4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED	5
+#define BTRFS_ROOT_DEFRAG_RUNNING	6
+#define BTRFS_ROOT_FORCE_COW		7
+#define BTRFS_ROOT_MULTI_LOG_TASKS	8
+
+/*
  * in ram representation of the tree. extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
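
Note: these bits replace the int fields deleted from struct btrfs_root further down. The conversion pattern, repeated throughout the rest of this series:

	/* was: root->ref_cows = 1; */
	set_bit(BTRFS_ROOT_REF_COWS, &root->state);

	/* was: if (root->track_dirty) ... */
	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		add_root_to_dirty_list(root);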
@@ -1706,6 +1753,7 @@ struct btrfs_root {
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;
 
+	unsigned long state;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -1740,7 +1788,6 @@ struct btrfs_root {
 	/* Just be updated when the commit succeeds. */
 	int last_log_commit;
 	pid_t log_start_pid;
-	bool log_multiple_pids;
 
 	u64 objectid;
 	u64 last_trans;
@@ -1760,23 +1807,13 @@ struct btrfs_root {
 
 	u64 highest_objectid;
 
-	/* btrfs_record_root_in_trans is a multi-step process,
-	 * and it can race with the balancing code. But the
-	 * race is very small, and only the first time the root
-	 * is added to each transaction. So in_trans_setup
-	 * is used to tell us when more checks are required
-	 */
-	unsigned long in_trans_setup;
-	int ref_cows;
-	int track_dirty;
-	int in_radix;
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	int dummy_root;
+	u64 alloc_bytenr;
 #endif
+
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
-	int defrag_running;
 	char *name;
 
 	/* the dirty list is only used by non-reference counted roots */
@@ -1790,7 +1827,6 @@ struct btrfs_root {
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
-	int orphan_item_inserted;
 	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
@@ -1808,8 +1844,6 @@ struct btrfs_root {
 	 */
 	dev_t anon_dev;
 
-	int force_cow;
-
 	spinlock_t root_item_lock;
 	atomic_t refs;
 
@@ -2788,6 +2822,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
 }
 
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
 /* struct btrfs_root_backup */
 BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
 		   tree_root, 64);
@@ -2897,6 +2936,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
 	cpu->vend = le64_to_cpu(disk->vend);
 	cpu->target = le64_to_cpu(disk->target);
 	cpu->flags = le64_to_cpu(disk->flags);
+	cpu->limit = le64_to_cpu(disk->limit);
 }
 
 static inline void
@@ -2914,6 +2954,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
 	disk->vend = cpu_to_le64(cpu->vend);
 	disk->target = cpu_to_le64(cpu->target);
 	disk->flags = cpu_to_le64(cpu->flags);
+	disk->limit = cpu_to_le64(cpu->limit);
 }
 
 /* struct btrfs_super_block */
@@ -3236,6 +3277,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+				 unsigned long count, int wait);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
@@ -3275,9 +3318,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
 			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
 			 struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int for_cow);
+		  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int for_cow);
+		  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
@@ -3285,7 +3328,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int for_cow);
+		      u64 owner, u64 offset, int no_quota);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -3297,7 +3340,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset, int for_cow);
+			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
@@ -3385,7 +3428,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-
 int btrfs_start_nocow_write(struct btrfs_root *root);
 void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
@@ -3561,7 +3603,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem);
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			    struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
@@ -3708,6 +3749,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			   struct bio *bio, u64 file_start, int contig);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+				     const struct btrfs_path *path,
+				     struct btrfs_file_extent_item *fi,
+				     const bool new_inline,
+				     struct extent_map *em);
+
 /* inode.c */
 struct btrfs_delalloc_work {
 	struct inode *inode;
@@ -4069,52 +4116,6 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 			 u64 start, int err);
 
-/* qgroup.c */
-struct qgroup_update {
-	struct list_head list;
-	struct btrfs_delayed_ref_node *node;
-	struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid,
-			char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info, u64 qgroupid,
-		       struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_delayed_ref_node *node,
-			    struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info,
-			     struct btrfs_delayed_ref_node *node,
-			     struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
-		      struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
-			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
-			 struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -4131,6 +4132,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 /* Sanity test specific functions */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_destroy_inode(struct inode *inode);
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+			       u64 rfer, u64 excl);
 #endif
 
 #endif
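
Note: the qgroup prototypes deleted above are moved, not dropped. The diffstat lists the new fs/btrfs/qgroup.h (107 lines), and callers such as disk-io.c below now pick the declarations up with:

	#include "qgroup.h"	/* formerly declared in ctree.h */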
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 33e561a84013..da775bfdebc9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -149,8 +149,8 @@ again:
 	spin_lock(&root->inode_lock);
 	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
 	if (ret == -EEXIST) {
-		kmem_cache_free(delayed_node_cache, node);
 		spin_unlock(&root->inode_lock);
+		kmem_cache_free(delayed_node_cache, node);
 		radix_tree_preload_end();
 		goto again;
 	}
@@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node(
 	mutex_unlock(&delayed_node->mutex);
 
 	if (atomic_dec_and_test(&delayed_node->refs)) {
+		bool free = false;
 		struct btrfs_root *root = delayed_node->root;
 		spin_lock(&root->inode_lock);
 		if (atomic_read(&delayed_node->refs) == 0) {
 			radix_tree_delete(&root->delayed_nodes_tree,
 					  delayed_node->inode_id);
-			kmem_cache_free(delayed_node_cache, delayed_node);
+			free = true;
 		}
 		spin_unlock(&root->inode_lock);
+		if (free)
+			kmem_cache_free(delayed_node_cache, delayed_node);
 	}
 }
 
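
Note: both hunks are the same class of fix. kmem_cache_free() moves out from under root->inode_lock, so the node is only freed once it can no longer be found through the structures that lock protects. The resulting shape:

	bool free = false;

	spin_lock(&root->inode_lock);
	if (atomic_read(&delayed_node->refs) == 0) {
		radix_tree_delete(&root->delayed_nodes_tree,
				  delayed_node->inode_id);
		free = true;		/* now unreachable, safe to free */
	}
	spin_unlock(&root->inode_lock);
	if (free)
		kmem_cache_free(delayed_node_cache, delayed_node);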
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 31299646024d..6d16bea94e1c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 		return -1;
 	if (ref1->type > ref2->type)
 		return 1;
+	if (ref1->no_quota > ref2->no_quota)
+		return 1;
+	if (ref1->no_quota < ref2->no_quota)
+		return -1;
 	/* merging of sequenced refs is not allowed */
 	if (compare_seq) {
 		if (ref1->seq < ref2->seq)
@@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			 struct btrfs_delayed_ref_head *head_ref,
 			 struct btrfs_delayed_ref_node *ref, u64 bytenr,
 			 u64 num_bytes, u64 parent, u64 ref_root, int level,
-			 int action, int for_cow)
+			 int action, int no_quota)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
@@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
 
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
 	delayed_refs = &trans->transaction->delayed_refs;
 
 	/* first set the basic ref node struct up */
@@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-
-	if (need_ref_seq(for_cow, ref_root))
-		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			 struct btrfs_delayed_ref_head *head_ref,
 			 struct btrfs_delayed_ref_node *ref, u64 bytenr,
 			 u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-			 u64 offset, int action, int for_cow)
+			 u64 offset, int action, int no_quota)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
@@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
+
 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
@@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-
-	if (need_ref_seq(for_cow, ref_root))
-		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow)
+			       int no_quota)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, level, action,
-			     for_cow);
+			     no_quota);
 	spin_unlock(&delayed_refs->lock);
-	if (need_ref_seq(for_cow, ref_root))
-		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
@@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow)
+			       int no_quota)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
 	BUG_ON(extent_op && !extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, owner, offset,
-			     action, for_cow);
+			     action, no_quota);
 	spin_unlock(&delayed_refs->lock);
-	if (need_ref_seq(for_cow, ref_root))
-		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 4ba9b93022ff..a764e2340d48 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {
 
 	unsigned int action:8;
 	unsigned int type:8;
+	unsigned int no_quota:1;
 	/* is this node still in the rbtree? */
 	unsigned int is_head:1;
 	unsigned int in_tree:1;
@@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow);
+			       int no_quota);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow);
+			       int no_quota);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
@@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    u64 seq);
 
 /*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-	if (for_cow)
-		return 0;
-
-	if (rootid == BTRFS_FS_TREE_OBJECTID)
-		return 1;
-
-	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-		return 1;
-
-	return 0;
-}
-
-/*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
  */
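
Note: need_ref_seq() is not replaced by another helper; its decision (only fs-tree refs participate in sequencing and quota accounting) is inlined at the call sites in delayed-ref.c above and cached per-ref in the new no_quota bit:

	/* at btrfs_add_delayed_{tree,data}_ref() entry */
	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
		no_quota = 0;

	/* when the ref node is filled in */
	if (is_fstree(ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);
	ref->no_quota = no_quota;
	ref->seq = seq;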
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9f2290509aca..2af6e66fe788 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,7 +313,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 
 	if (btrfs_fs_incompat(fs_info, RAID56)) {
 		btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	switch (args->start.cont_reading_from_srcdev_mode) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 983314932af3..8bb4aa19898f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -49,6 +49,7 @@
49#include "dev-replace.h" 49#include "dev-replace.h"
50#include "raid56.h" 50#include "raid56.h"
51#include "sysfs.h" 51#include "sysfs.h"
52#include "qgroup.h"
52 53
53#ifdef CONFIG_X86 54#ifdef CONFIG_X86
54#include <asm/cpufeature.h> 55#include <asm/cpufeature.h>
@@ -1109,6 +1110,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize)
 {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+		return alloc_test_extent_buffer(root->fs_info, bytenr,
+						blocksize);
+#endif
 	return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
 }
 
@@ -1201,10 +1207,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
 	root->stripesize = stripesize;
-	root->ref_cows = 0;
-	root->track_dirty = 0;
-	root->in_radix = 0;
-	root->orphan_item_inserted = 0;
+	root->state = 0;
 	root->orphan_cleanup_state = 0;
 
 	root->objectid = objectid;
@@ -1265,7 +1268,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	else
 		root->defrag_trans_start = 0;
 	init_completion(&root->kobj_unregister);
-	root->defrag_running = 0;
 	root->root_key.objectid = objectid;
 	root->anon_dev = 0;
 
@@ -1290,7 +1292,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
-	root->dummy_root = 1;
+	set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+	root->alloc_bytenr = 0;
 
 	return root;
 }
@@ -1341,8 +1344,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 
 	root->commit_root = btrfs_root_node(root);
-	root->track_dirty = 1;
-
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 
 	root->root_item.flags = 0;
 	root->root_item.byte_limit = 0;
@@ -1371,6 +1373,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 fail:
 	if (leaf) {
 		btrfs_tree_unlock(leaf);
+		free_extent_buffer(root->commit_root);
 		free_extent_buffer(leaf);
 	}
 	kfree(root);
@@ -1396,13 +1399,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1396 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1399 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1397 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1400 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1398 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1401 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1402
1399 /* 1403 /*
1404 * DON'T set REF_COWS for log trees
1405 *
1400 * log trees do not get reference counted because they go away 1406 * log trees do not get reference counted because they go away
1401 * before a real commit is actually done. They do store pointers 1407 * before a real commit is actually done. They do store pointers
1402 * to file data extents, and those reference counts still get 1408 * to file data extents, and those reference counts still get
1403 * updated (along with back refs to the log tree). 1409 * updated (along with back refs to the log tree).
1404 */ 1410 */
1405 root->ref_cows = 0;
1406 1411
1407 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1412 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1408 BTRFS_TREE_LOG_OBJECTID, NULL, 1413 BTRFS_TREE_LOG_OBJECTID, NULL,
@@ -1536,7 +1541,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1536 return root; 1541 return root;
1537 1542
1538 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 1543 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1539 root->ref_cows = 1; 1544 set_bit(BTRFS_ROOT_REF_COWS, &root->state);
1540 btrfs_check_and_init_root_item(&root->root_item); 1545 btrfs_check_and_init_root_item(&root->root_item);
1541 } 1546 }
1542 1547
@@ -1606,7 +1611,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1606 (unsigned long)root->root_key.objectid, 1611 (unsigned long)root->root_key.objectid,
1607 root); 1612 root);
1608 if (ret == 0) 1613 if (ret == 0)
1609 root->in_radix = 1; 1614 set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1610 spin_unlock(&fs_info->fs_roots_radix_lock); 1615 spin_unlock(&fs_info->fs_roots_radix_lock);
1611 radix_tree_preload_end(); 1616 radix_tree_preload_end();
1612 1617
@@ -1662,7 +1667,7 @@ again:
1662 if (ret < 0) 1667 if (ret < 0)
1663 goto fail; 1668 goto fail;
1664 if (ret == 0) 1669 if (ret == 0)
1665 root->orphan_item_inserted = 1; 1670 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1666 1671
1667 ret = btrfs_insert_fs_root(fs_info, root); 1672 ret = btrfs_insert_fs_root(fs_info, root);
1668 if (ret) { 1673 if (ret) {
@@ -2064,6 +2069,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2064 btrfs_destroy_workqueue(fs_info->readahead_workers); 2069 btrfs_destroy_workqueue(fs_info->readahead_workers);
2065 btrfs_destroy_workqueue(fs_info->flush_workers); 2070 btrfs_destroy_workqueue(fs_info->flush_workers);
2066 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); 2071 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2072 btrfs_destroy_workqueue(fs_info->extent_workers);
2067} 2073}
2068 2074
2069static void free_root_extent_buffers(struct btrfs_root *root) 2075static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2090,7 +2096,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
2090 free_root_extent_buffers(info->chunk_root); 2096 free_root_extent_buffers(info->chunk_root);
2091} 2097}
2092 2098
2093static void del_fs_roots(struct btrfs_fs_info *fs_info) 2099void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2094{ 2100{
2095 int ret; 2101 int ret;
2096 struct btrfs_root *gang[8]; 2102 struct btrfs_root *gang[8];
@@ -2101,7 +2107,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2101 struct btrfs_root, root_list); 2107 struct btrfs_root, root_list);
2102 list_del(&gang[0]->root_list); 2108 list_del(&gang[0]->root_list);
2103 2109
2104 if (gang[0]->in_radix) { 2110 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
2105 btrfs_drop_and_free_fs_root(fs_info, gang[0]); 2111 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2106 } else { 2112 } else {
2107 free_extent_buffer(gang[0]->node); 2113 free_extent_buffer(gang[0]->node);
@@ -2221,6 +2227,7 @@ int open_ctree(struct super_block *sb,
2221 spin_lock_init(&fs_info->free_chunk_lock); 2227 spin_lock_init(&fs_info->free_chunk_lock);
2222 spin_lock_init(&fs_info->tree_mod_seq_lock); 2228 spin_lock_init(&fs_info->tree_mod_seq_lock);
2223 spin_lock_init(&fs_info->super_lock); 2229 spin_lock_init(&fs_info->super_lock);
2230 spin_lock_init(&fs_info->qgroup_op_lock);
2224 spin_lock_init(&fs_info->buffer_lock); 2231 spin_lock_init(&fs_info->buffer_lock);
2225 rwlock_init(&fs_info->tree_mod_log_lock); 2232 rwlock_init(&fs_info->tree_mod_log_lock);
2226 mutex_init(&fs_info->reloc_mutex); 2233 mutex_init(&fs_info->reloc_mutex);
@@ -2246,6 +2253,7 @@ int open_ctree(struct super_block *sb,
2246 atomic_set(&fs_info->async_submit_draining, 0); 2253 atomic_set(&fs_info->async_submit_draining, 0);
2247 atomic_set(&fs_info->nr_async_bios, 0); 2254 atomic_set(&fs_info->nr_async_bios, 0);
2248 atomic_set(&fs_info->defrag_running, 0); 2255 atomic_set(&fs_info->defrag_running, 0);
2256 atomic_set(&fs_info->qgroup_op_seq, 0);
2249 atomic64_set(&fs_info->tree_mod_seq, 0); 2257 atomic64_set(&fs_info->tree_mod_seq, 0);
2250 fs_info->sb = sb; 2258 fs_info->sb = sb;
2251 fs_info->max_inline = 8192 * 1024; 2259 fs_info->max_inline = 8192 * 1024;
@@ -2291,6 +2299,7 @@ int open_ctree(struct super_block *sb,
2291 atomic_set(&fs_info->balance_cancel_req, 0); 2299 atomic_set(&fs_info->balance_cancel_req, 0);
2292 fs_info->balance_ctl = NULL; 2300 fs_info->balance_ctl = NULL;
2293 init_waitqueue_head(&fs_info->balance_wait_q); 2301 init_waitqueue_head(&fs_info->balance_wait_q);
2302 btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2294 2303
2295 sb->s_blocksize = 4096; 2304 sb->s_blocksize = 4096;
2296 sb->s_blocksize_bits = blksize_bits(4096); 2305 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2354,6 +2363,7 @@ int open_ctree(struct super_block *sb,
2354 spin_lock_init(&fs_info->qgroup_lock); 2363 spin_lock_init(&fs_info->qgroup_lock);
2355 mutex_init(&fs_info->qgroup_ioctl_lock); 2364 mutex_init(&fs_info->qgroup_ioctl_lock);
2356 fs_info->qgroup_tree = RB_ROOT; 2365 fs_info->qgroup_tree = RB_ROOT;
2366 fs_info->qgroup_op_tree = RB_ROOT;
2357 INIT_LIST_HEAD(&fs_info->dirty_qgroups); 2367 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2358 fs_info->qgroup_seq = 1; 2368 fs_info->qgroup_seq = 1;
2359 fs_info->quota_enabled = 0; 2369 fs_info->quota_enabled = 0;
@@ -2577,6 +2587,10 @@ int open_ctree(struct super_block *sb,
2577 btrfs_alloc_workqueue("readahead", flags, max_active, 2); 2587 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2578 fs_info->qgroup_rescan_workers = 2588 fs_info->qgroup_rescan_workers =
2579 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); 2589 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2590 fs_info->extent_workers =
2591 btrfs_alloc_workqueue("extent-refs", flags,
2592 min_t(u64, fs_devices->num_devices,
2593 max_active), 8);
2580 2594
2581 if (!(fs_info->workers && fs_info->delalloc_workers && 2595 if (!(fs_info->workers && fs_info->delalloc_workers &&
2582 fs_info->submit_workers && fs_info->flush_workers && 2596 fs_info->submit_workers && fs_info->flush_workers &&
@@ -2586,6 +2600,7 @@ int open_ctree(struct super_block *sb,
2586 fs_info->endio_freespace_worker && fs_info->rmw_workers && 2600 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2587 fs_info->caching_workers && fs_info->readahead_workers && 2601 fs_info->caching_workers && fs_info->readahead_workers &&
2588 fs_info->fixup_workers && fs_info->delayed_workers && 2602 fs_info->fixup_workers && fs_info->delayed_workers &&
 2603 fs_info->extent_workers &&
2589 fs_info->qgroup_rescan_workers)) { 2604 fs_info->qgroup_rescan_workers)) {
2590 err = -ENOMEM; 2605 err = -ENOMEM;
2591 goto fail_sb_buffer; 2606 goto fail_sb_buffer;
@@ -2693,7 +2708,7 @@ retry_root_backup:
2693 ret = PTR_ERR(extent_root); 2708 ret = PTR_ERR(extent_root);
2694 goto recovery_tree_root; 2709 goto recovery_tree_root;
2695 } 2710 }
2696 extent_root->track_dirty = 1; 2711 set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
2697 fs_info->extent_root = extent_root; 2712 fs_info->extent_root = extent_root;
2698 2713
2699 location.objectid = BTRFS_DEV_TREE_OBJECTID; 2714 location.objectid = BTRFS_DEV_TREE_OBJECTID;
@@ -2702,7 +2717,7 @@ retry_root_backup:
2702 ret = PTR_ERR(dev_root); 2717 ret = PTR_ERR(dev_root);
2703 goto recovery_tree_root; 2718 goto recovery_tree_root;
2704 } 2719 }
2705 dev_root->track_dirty = 1; 2720 set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
2706 fs_info->dev_root = dev_root; 2721 fs_info->dev_root = dev_root;
2707 btrfs_init_devices_late(fs_info); 2722 btrfs_init_devices_late(fs_info);
2708 2723
@@ -2712,13 +2727,13 @@ retry_root_backup:
2712 ret = PTR_ERR(csum_root); 2727 ret = PTR_ERR(csum_root);
2713 goto recovery_tree_root; 2728 goto recovery_tree_root;
2714 } 2729 }
2715 csum_root->track_dirty = 1; 2730 set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
2716 fs_info->csum_root = csum_root; 2731 fs_info->csum_root = csum_root;
2717 2732
2718 location.objectid = BTRFS_QUOTA_TREE_OBJECTID; 2733 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2719 quota_root = btrfs_read_tree_root(tree_root, &location); 2734 quota_root = btrfs_read_tree_root(tree_root, &location);
2720 if (!IS_ERR(quota_root)) { 2735 if (!IS_ERR(quota_root)) {
2721 quota_root->track_dirty = 1; 2736 set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
2722 fs_info->quota_enabled = 1; 2737 fs_info->quota_enabled = 1;
2723 fs_info->pending_quota_state = 1; 2738 fs_info->pending_quota_state = 1;
2724 fs_info->quota_root = quota_root; 2739 fs_info->quota_root = quota_root;
@@ -2733,7 +2748,7 @@ retry_root_backup:
2733 create_uuid_tree = true; 2748 create_uuid_tree = true;
2734 check_uuid_tree = false; 2749 check_uuid_tree = false;
2735 } else { 2750 } else {
2736 uuid_root->track_dirty = 1; 2751 set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
2737 fs_info->uuid_root = uuid_root; 2752 fs_info->uuid_root = uuid_root;
2738 create_uuid_tree = false; 2753 create_uuid_tree = false;
2739 check_uuid_tree = 2754 check_uuid_tree =
@@ -2966,7 +2981,7 @@ fail_qgroup:
2966fail_trans_kthread: 2981fail_trans_kthread:
2967 kthread_stop(fs_info->transaction_kthread); 2982 kthread_stop(fs_info->transaction_kthread);
2968 btrfs_cleanup_transaction(fs_info->tree_root); 2983 btrfs_cleanup_transaction(fs_info->tree_root);
2969 del_fs_roots(fs_info); 2984 btrfs_free_fs_roots(fs_info);
2970fail_cleaner: 2985fail_cleaner:
2971 kthread_stop(fs_info->cleaner_kthread); 2986 kthread_stop(fs_info->cleaner_kthread);
2972 2987
@@ -3501,8 +3516,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3501 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 3516 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3502 btrfs_free_log(NULL, root); 3517 btrfs_free_log(NULL, root);
3503 3518
3504 __btrfs_remove_free_space_cache(root->free_ino_pinned); 3519 if (root->free_ino_pinned)
3505 __btrfs_remove_free_space_cache(root->free_ino_ctl); 3520 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3521 if (root->free_ino_ctl)
3522 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3506 free_fs_root(root); 3523 free_fs_root(root);
3507} 3524}
3508 3525
@@ -3533,28 +3550,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3533{ 3550{
3534 u64 root_objectid = 0; 3551 u64 root_objectid = 0;
3535 struct btrfs_root *gang[8]; 3552 struct btrfs_root *gang[8];
3536 int i; 3553 int i = 0;
3537 int ret; 3554 int err = 0;
3555 unsigned int ret = 0;
3556 int index;
3538 3557
3539 while (1) { 3558 while (1) {
3559 index = srcu_read_lock(&fs_info->subvol_srcu);
3540 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 3560 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3541 (void **)gang, root_objectid, 3561 (void **)gang, root_objectid,
3542 ARRAY_SIZE(gang)); 3562 ARRAY_SIZE(gang));
3543 if (!ret) 3563 if (!ret) {
3564 srcu_read_unlock(&fs_info->subvol_srcu, index);
3544 break; 3565 break;
3545 3566 }
3546 root_objectid = gang[ret - 1]->root_key.objectid + 1; 3567 root_objectid = gang[ret - 1]->root_key.objectid + 1;
3568
3547 for (i = 0; i < ret; i++) { 3569 for (i = 0; i < ret; i++) {
 3548 int err; 3570 /* Avoid grabbing roots in dead_roots */
3571 if (btrfs_root_refs(&gang[i]->root_item) == 0) {
3572 gang[i] = NULL;
3573 continue;
3574 }
 3575 /* grab all the search results for later use */
3576 gang[i] = btrfs_grab_fs_root(gang[i]);
3577 }
3578 srcu_read_unlock(&fs_info->subvol_srcu, index);
3549 3579
3580 for (i = 0; i < ret; i++) {
3581 if (!gang[i])
3582 continue;
3550 root_objectid = gang[i]->root_key.objectid; 3583 root_objectid = gang[i]->root_key.objectid;
3551 err = btrfs_orphan_cleanup(gang[i]); 3584 err = btrfs_orphan_cleanup(gang[i]);
3552 if (err) 3585 if (err)
3553 return err; 3586 break;
3587 btrfs_put_fs_root(gang[i]);
3554 } 3588 }
3555 root_objectid++; 3589 root_objectid++;
3556 } 3590 }
3557 return 0; 3591
 3592 /* release the uncleaned roots due to the error */
3593 for (; i < ret; i++) {
3594 if (gang[i])
3595 btrfs_put_fs_root(gang[i]);
3596 }
3597 return err;
3558} 3598}
3559 3599
3560int btrfs_commit_super(struct btrfs_root *root) 3600int btrfs_commit_super(struct btrfs_root *root)
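
The rewritten btrfs_cleanup_fs_roots splits each batch into two phases: under srcu_read_lock it gang-looks-up roots and pins every live one with btrfs_grab_fs_root, then drops the SRCU lock before running the potentially blocking orphan cleanup, and finally puts whatever references are left if a cleanup fails. A condensed sketch of that lock/grab/process shape for a single batch (the real loop also advances root_objectid and skips roots already on dead_roots):

	static int cleanup_one_batch(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_root *gang[8];
		int idx, i, n, err = 0;

		idx = srcu_read_lock(&fs_info->subvol_srcu);
		n = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					   (void **)gang, 0, ARRAY_SIZE(gang));
		for (i = 0; i < n; i++)		/* pin while still under SRCU */
			gang[i] = btrfs_grab_fs_root(gang[i]);
		srcu_read_unlock(&fs_info->subvol_srcu, idx);

		for (i = 0; i < n; i++) {	/* may block; SRCU is dropped */
			if (!gang[i])
				continue;
			err = btrfs_orphan_cleanup(gang[i]);
			btrfs_put_fs_root(gang[i]);
			gang[i] = NULL;
			if (err)
				break;
		}
		for (i = 0; i < n; i++)		/* release the untouched tail */
			if (gang[i])
				btrfs_put_fs_root(gang[i]);
		return err;
	}
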
@@ -3603,6 +3643,8 @@ int close_ctree(struct btrfs_root *root)
3603 /* clear out the rbtree of defraggable inodes */ 3643 /* clear out the rbtree of defraggable inodes */
3604 btrfs_cleanup_defrag_inodes(fs_info); 3644 btrfs_cleanup_defrag_inodes(fs_info);
3605 3645
3646 cancel_work_sync(&fs_info->async_reclaim_work);
3647
3606 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3648 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3607 ret = btrfs_commit_super(root); 3649 ret = btrfs_commit_super(root);
3608 if (ret) 3650 if (ret)
@@ -3627,12 +3669,17 @@ int close_ctree(struct btrfs_root *root)
3627 3669
3628 btrfs_sysfs_remove_one(fs_info); 3670 btrfs_sysfs_remove_one(fs_info);
3629 3671
3630 del_fs_roots(fs_info); 3672 btrfs_free_fs_roots(fs_info);
3631 3673
3632 btrfs_put_block_group_cache(fs_info); 3674 btrfs_put_block_group_cache(fs_info);
3633 3675
3634 btrfs_free_block_groups(fs_info); 3676 btrfs_free_block_groups(fs_info);
3635 3677
3678 /*
 3679 * we must make sure there are no read requests left to
 3680 * submit after we stop all workers.
3681 */
3682 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3636 btrfs_stop_all_workers(fs_info); 3683 btrfs_stop_all_workers(fs_info);
3637 3684
3638 free_root_pointers(fs_info, 1); 3685 free_root_pointers(fs_info, 1);
@@ -3709,6 +3756,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3709 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, 3756 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3710 buf->len, 3757 buf->len,
3711 root->fs_info->dirty_metadata_batch); 3758 root->fs_info->dirty_metadata_batch);
3759#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3760 if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
3761 btrfs_print_leaf(root, buf);
3762 ASSERT(0);
3763 }
3764#endif
3712} 3765}
3713 3766
3714static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3767static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 53059df350f8..23ce3ceba0a9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
68int btrfs_init_fs_root(struct btrfs_root *root); 68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, 69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root); 70 struct btrfs_root *root);
71void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
71 72
72struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, 73struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
73 struct btrfs_key *key, 74 struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5590af92094b..fafb3e53ecde 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,16 +26,16 @@
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
28#include "hash.h" 28#include "hash.h"
29#include "ctree.h" 29#include "tree-log.h"
30#include "disk-io.h" 30#include "disk-io.h"
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h"
33#include "volumes.h" 32#include "volumes.h"
34#include "raid56.h" 33#include "raid56.h"
35#include "locking.h" 34#include "locking.h"
36#include "free-space-cache.h" 35#include "free-space-cache.h"
37#include "math.h" 36#include "math.h"
38#include "sysfs.h" 37#include "sysfs.h"
38#include "qgroup.h"
39 39
40#undef SCRAMBLE_DELAYED_REFS 40#undef SCRAMBLE_DELAYED_REFS
41 41
@@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
81 u64 bytenr, u64 num_bytes, u64 parent, 81 u64 bytenr, u64 num_bytes, u64 parent,
82 u64 root_objectid, u64 owner_objectid, 82 u64 root_objectid, u64 owner_objectid,
83 u64 owner_offset, int refs_to_drop, 83 u64 owner_offset, int refs_to_drop,
84 struct btrfs_delayed_extent_op *extra_op); 84 struct btrfs_delayed_extent_op *extra_op,
85 int no_quota);
85static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 86static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
86 struct extent_buffer *leaf, 87 struct extent_buffer *leaf,
87 struct btrfs_extent_item *ei); 88 struct btrfs_extent_item *ei);
@@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root, 95 struct btrfs_root *root,
95 u64 parent, u64 root_objectid, 96 u64 parent, u64 root_objectid,
96 u64 flags, struct btrfs_disk_key *key, 97 u64 flags, struct btrfs_disk_key *key,
97 int level, struct btrfs_key *ins); 98 int level, struct btrfs_key *ins,
99 int no_quota);
98static int do_chunk_alloc(struct btrfs_trans_handle *trans, 100static int do_chunk_alloc(struct btrfs_trans_handle *trans,
99 struct btrfs_root *extent_root, u64 flags, 101 struct btrfs_root *extent_root, u64 flags,
100 int force); 102 int force);
@@ -1271,7 +1273,7 @@ fail:
1271static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1273static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1272 struct btrfs_root *root, 1274 struct btrfs_root *root,
1273 struct btrfs_path *path, 1275 struct btrfs_path *path,
1274 int refs_to_drop) 1276 int refs_to_drop, int *last_ref)
1275{ 1277{
1276 struct btrfs_key key; 1278 struct btrfs_key key;
1277 struct btrfs_extent_data_ref *ref1 = NULL; 1279 struct btrfs_extent_data_ref *ref1 = NULL;
@@ -1307,6 +1309,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1307 1309
1308 if (num_refs == 0) { 1310 if (num_refs == 0) {
1309 ret = btrfs_del_item(trans, root, path); 1311 ret = btrfs_del_item(trans, root, path);
1312 *last_ref = 1;
1310 } else { 1313 } else {
1311 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1314 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1312 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1315 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1764,7 +1767,8 @@ void update_inline_extent_backref(struct btrfs_root *root,
1764 struct btrfs_path *path, 1767 struct btrfs_path *path,
1765 struct btrfs_extent_inline_ref *iref, 1768 struct btrfs_extent_inline_ref *iref,
1766 int refs_to_mod, 1769 int refs_to_mod,
1767 struct btrfs_delayed_extent_op *extent_op) 1770 struct btrfs_delayed_extent_op *extent_op,
1771 int *last_ref)
1768{ 1772{
1769 struct extent_buffer *leaf; 1773 struct extent_buffer *leaf;
1770 struct btrfs_extent_item *ei; 1774 struct btrfs_extent_item *ei;
@@ -1808,6 +1812,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
1808 else 1812 else
1809 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1813 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1810 } else { 1814 } else {
1815 *last_ref = 1;
1811 size = btrfs_extent_inline_ref_size(type); 1816 size = btrfs_extent_inline_ref_size(type);
1812 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1817 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1813 ptr = (unsigned long)iref; 1818 ptr = (unsigned long)iref;
@@ -1839,7 +1844,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1839 if (ret == 0) { 1844 if (ret == 0) {
1840 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1845 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1841 update_inline_extent_backref(root, path, iref, 1846 update_inline_extent_backref(root, path, iref,
1842 refs_to_add, extent_op); 1847 refs_to_add, extent_op, NULL);
1843 } else if (ret == -ENOENT) { 1848 } else if (ret == -ENOENT) {
1844 setup_inline_extent_backref(root, path, iref, parent, 1849 setup_inline_extent_backref(root, path, iref, parent,
1845 root_objectid, owner, offset, 1850 root_objectid, owner, offset,
@@ -1872,17 +1877,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1872 struct btrfs_root *root, 1877 struct btrfs_root *root,
1873 struct btrfs_path *path, 1878 struct btrfs_path *path,
1874 struct btrfs_extent_inline_ref *iref, 1879 struct btrfs_extent_inline_ref *iref,
1875 int refs_to_drop, int is_data) 1880 int refs_to_drop, int is_data, int *last_ref)
1876{ 1881{
1877 int ret = 0; 1882 int ret = 0;
1878 1883
1879 BUG_ON(!is_data && refs_to_drop != 1); 1884 BUG_ON(!is_data && refs_to_drop != 1);
1880 if (iref) { 1885 if (iref) {
1881 update_inline_extent_backref(root, path, iref, 1886 update_inline_extent_backref(root, path, iref,
1882 -refs_to_drop, NULL); 1887 -refs_to_drop, NULL, last_ref);
1883 } else if (is_data) { 1888 } else if (is_data) {
1884 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1889 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1890 last_ref);
1885 } else { 1891 } else {
1892 *last_ref = 1;
1886 ret = btrfs_del_item(trans, root, path); 1893 ret = btrfs_del_item(trans, root, path);
1887 } 1894 }
1888 return ret; 1895 return ret;
@@ -1946,7 +1953,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1946int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1953int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1947 struct btrfs_root *root, 1954 struct btrfs_root *root,
1948 u64 bytenr, u64 num_bytes, u64 parent, 1955 u64 bytenr, u64 num_bytes, u64 parent,
1949 u64 root_objectid, u64 owner, u64 offset, int for_cow) 1956 u64 root_objectid, u64 owner, u64 offset,
1957 int no_quota)
1950{ 1958{
1951 int ret; 1959 int ret;
1952 struct btrfs_fs_info *fs_info = root->fs_info; 1960 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1958,12 +1966,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1958 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1966 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1959 num_bytes, 1967 num_bytes,
1960 parent, root_objectid, (int)owner, 1968 parent, root_objectid, (int)owner,
1961 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1969 BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1962 } else { 1970 } else {
1963 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1971 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1964 num_bytes, 1972 num_bytes,
1965 parent, root_objectid, owner, offset, 1973 parent, root_objectid, owner, offset,
1966 BTRFS_ADD_DELAYED_REF, NULL, for_cow); 1974 BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1967 } 1975 }
1968 return ret; 1976 return ret;
1969} 1977}
@@ -1973,31 +1981,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1973 u64 bytenr, u64 num_bytes, 1981 u64 bytenr, u64 num_bytes,
1974 u64 parent, u64 root_objectid, 1982 u64 parent, u64 root_objectid,
1975 u64 owner, u64 offset, int refs_to_add, 1983 u64 owner, u64 offset, int refs_to_add,
1984 int no_quota,
1976 struct btrfs_delayed_extent_op *extent_op) 1985 struct btrfs_delayed_extent_op *extent_op)
1977{ 1986{
1987 struct btrfs_fs_info *fs_info = root->fs_info;
1978 struct btrfs_path *path; 1988 struct btrfs_path *path;
1979 struct extent_buffer *leaf; 1989 struct extent_buffer *leaf;
1980 struct btrfs_extent_item *item; 1990 struct btrfs_extent_item *item;
1991 struct btrfs_key key;
1981 u64 refs; 1992 u64 refs;
1982 int ret; 1993 int ret;
1994 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
1983 1995
1984 path = btrfs_alloc_path(); 1996 path = btrfs_alloc_path();
1985 if (!path) 1997 if (!path)
1986 return -ENOMEM; 1998 return -ENOMEM;
1987 1999
2000 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
2001 no_quota = 1;
2002
1988 path->reada = 1; 2003 path->reada = 1;
1989 path->leave_spinning = 1; 2004 path->leave_spinning = 1;
1990 /* this will setup the path even if it fails to insert the back ref */ 2005 /* this will setup the path even if it fails to insert the back ref */
1991 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 2006 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
1992 path, bytenr, num_bytes, parent, 2007 bytenr, num_bytes, parent,
1993 root_objectid, owner, offset, 2008 root_objectid, owner, offset,
1994 refs_to_add, extent_op); 2009 refs_to_add, extent_op);
1995 if (ret != -EAGAIN) 2010 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
1996 goto out; 2011 goto out;
2012 /*
 2013 * Ok we were able to insert an inline extent ref and it appears to be a new
2014 * reference, deal with the qgroup accounting.
2015 */
2016 if (!ret && !no_quota) {
2017 ASSERT(root->fs_info->quota_enabled);
2018 leaf = path->nodes[0];
2019 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2020 item = btrfs_item_ptr(leaf, path->slots[0],
2021 struct btrfs_extent_item);
2022 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
2023 type = BTRFS_QGROUP_OPER_ADD_SHARED;
2024 btrfs_release_path(path);
1997 2025
2026 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2027 bytenr, num_bytes, type, 0);
2028 goto out;
2029 }
2030
2031 /*
 2032 * Ok we had -EAGAIN which means we didn't have space to insert an
2033 * inline extent ref, so just update the reference count and add a
2034 * normal backref.
2035 */
1998 leaf = path->nodes[0]; 2036 leaf = path->nodes[0];
2037 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1999 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2038 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2000 refs = btrfs_extent_refs(leaf, item); 2039 refs = btrfs_extent_refs(leaf, item);
2040 if (refs)
2041 type = BTRFS_QGROUP_OPER_ADD_SHARED;
2001 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2042 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2002 if (extent_op) 2043 if (extent_op)
2003 __run_delayed_extent_op(extent_op, leaf, item); 2044 __run_delayed_extent_op(extent_op, leaf, item);
@@ -2005,9 +2046,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2005 btrfs_mark_buffer_dirty(leaf); 2046 btrfs_mark_buffer_dirty(leaf);
2006 btrfs_release_path(path); 2047 btrfs_release_path(path);
2007 2048
2049 if (!no_quota) {
2050 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2051 bytenr, num_bytes, type, 0);
2052 if (ret)
2053 goto out;
2054 }
2055
2008 path->reada = 1; 2056 path->reada = 1;
2009 path->leave_spinning = 1; 2057 path->leave_spinning = 1;
2010
2011 /* now insert the actual backref */ 2058 /* now insert the actual backref */
2012 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2059 ret = insert_extent_backref(trans, root->fs_info->extent_root,
2013 path, bytenr, parent, root_objectid, 2060 path, bytenr, parent, root_objectid,
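
On the add side the new qgroup hook has to tell exclusive from shared ownership: if the extent item ends up holding more references than the ones just added, another root already points at the extent and the operation is recorded as ADD_SHARED; otherwise the extent is new to this root and recorded as ADD_EXCL. Distilled into a helper (a sketch of the decision only, using the btrfs_qgroup_record_ref API this series introduces; the function name is illustrative):

	static int record_ref_added(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    u64 root_objectid, u64 bytenr,
				    u64 num_bytes, u64 refs_after,
				    int refs_added)
	{
		enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;

		/* More refs than we just added => another root shares it. */
		if (refs_after > (u64)refs_added)
			type = BTRFS_QGROUP_OPER_ADD_SHARED;

		return btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
					       bytenr, num_bytes, type, 0);
	}
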
@@ -2041,8 +2088,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2041 2088
2042 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2089 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2043 parent = ref->parent; 2090 parent = ref->parent;
2044 else 2091 ref_root = ref->root;
2045 ref_root = ref->root;
2046 2092
2047 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2093 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2048 if (extent_op) 2094 if (extent_op)
@@ -2056,13 +2102,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2056 node->num_bytes, parent, 2102 node->num_bytes, parent,
2057 ref_root, ref->objectid, 2103 ref_root, ref->objectid,
2058 ref->offset, node->ref_mod, 2104 ref->offset, node->ref_mod,
2059 extent_op); 2105 node->no_quota, extent_op);
2060 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2106 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2061 ret = __btrfs_free_extent(trans, root, node->bytenr, 2107 ret = __btrfs_free_extent(trans, root, node->bytenr,
2062 node->num_bytes, parent, 2108 node->num_bytes, parent,
2063 ref_root, ref->objectid, 2109 ref_root, ref->objectid,
2064 ref->offset, node->ref_mod, 2110 ref->offset, node->ref_mod,
2065 extent_op); 2111 extent_op, node->no_quota);
2066 } else { 2112 } else {
2067 BUG(); 2113 BUG();
2068 } 2114 }
@@ -2199,8 +2245,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2199 2245
2200 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2246 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2201 parent = ref->parent; 2247 parent = ref->parent;
2202 else 2248 ref_root = ref->root;
2203 ref_root = ref->root;
2204 2249
2205 ins.objectid = node->bytenr; 2250 ins.objectid = node->bytenr;
2206 if (skinny_metadata) { 2251 if (skinny_metadata) {
@@ -2218,15 +2263,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2218 parent, ref_root, 2263 parent, ref_root,
2219 extent_op->flags_to_set, 2264 extent_op->flags_to_set,
2220 &extent_op->key, 2265 &extent_op->key,
2221 ref->level, &ins); 2266 ref->level, &ins,
2267 node->no_quota);
2222 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2268 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2223 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2269 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2224 node->num_bytes, parent, ref_root, 2270 node->num_bytes, parent, ref_root,
2225 ref->level, 0, 1, extent_op); 2271 ref->level, 0, 1, node->no_quota,
2272 extent_op);
2226 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2273 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2227 ret = __btrfs_free_extent(trans, root, node->bytenr, 2274 ret = __btrfs_free_extent(trans, root, node->bytenr,
2228 node->num_bytes, parent, ref_root, 2275 node->num_bytes, parent, ref_root,
2229 ref->level, 0, 1, extent_op); 2276 ref->level, 0, 1, extent_op,
2277 node->no_quota);
2230 } else { 2278 } else {
2231 BUG(); 2279 BUG();
2232 } 2280 }
@@ -2574,42 +2622,6 @@ static u64 find_middle(struct rb_root *root)
2574} 2622}
2575#endif 2623#endif
2576 2624
2577int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2578 struct btrfs_fs_info *fs_info)
2579{
2580 struct qgroup_update *qgroup_update;
2581 int ret = 0;
2582
2583 if (list_empty(&trans->qgroup_ref_list) !=
2584 !trans->delayed_ref_elem.seq) {
2585 /* list without seq or seq without list */
2586 btrfs_err(fs_info,
2587 "qgroup accounting update error, list is%s empty, seq is %#x.%x",
2588 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2589 (u32)(trans->delayed_ref_elem.seq >> 32),
2590 (u32)trans->delayed_ref_elem.seq);
2591 BUG();
2592 }
2593
2594 if (!trans->delayed_ref_elem.seq)
2595 return 0;
2596
2597 while (!list_empty(&trans->qgroup_ref_list)) {
2598 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2599 struct qgroup_update, list);
2600 list_del(&qgroup_update->list);
2601 if (!ret)
2602 ret = btrfs_qgroup_account_ref(
2603 trans, fs_info, qgroup_update->node,
2604 qgroup_update->extent_op);
2605 kfree(qgroup_update);
2606 }
2607
2608 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2609
2610 return ret;
2611}
2612
2613static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2625static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2614{ 2626{
2615 u64 num_bytes; 2627 u64 num_bytes;
@@ -2662,15 +2674,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2662 u64 num_entries = 2674 u64 num_entries =
2663 atomic_read(&trans->transaction->delayed_refs.num_entries); 2675 atomic_read(&trans->transaction->delayed_refs.num_entries);
2664 u64 avg_runtime; 2676 u64 avg_runtime;
2677 u64 val;
2665 2678
2666 smp_mb(); 2679 smp_mb();
2667 avg_runtime = fs_info->avg_delayed_ref_runtime; 2680 avg_runtime = fs_info->avg_delayed_ref_runtime;
2681 val = num_entries * avg_runtime;
2668 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2682 if (num_entries * avg_runtime >= NSEC_PER_SEC)
2669 return 1; 2683 return 1;
2684 if (val >= NSEC_PER_SEC / 2)
2685 return 2;
2670 2686
2671 return btrfs_check_space_for_delayed_refs(trans, root); 2687 return btrfs_check_space_for_delayed_refs(trans, root);
2672} 2688}
2673 2689
2690struct async_delayed_refs {
2691 struct btrfs_root *root;
2692 int count;
2693 int error;
2694 int sync;
2695 struct completion wait;
2696 struct btrfs_work work;
2697};
2698
2699static void delayed_ref_async_start(struct btrfs_work *work)
2700{
2701 struct async_delayed_refs *async;
2702 struct btrfs_trans_handle *trans;
2703 int ret;
2704
2705 async = container_of(work, struct async_delayed_refs, work);
2706
2707 trans = btrfs_join_transaction(async->root);
2708 if (IS_ERR(trans)) {
2709 async->error = PTR_ERR(trans);
2710 goto done;
2711 }
2712
2713 /*
 2714 * trans->sync means that when we call end_transaction, we won't
2715 * wait on delayed refs
2716 */
2717 trans->sync = true;
2718 ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2719 if (ret)
2720 async->error = ret;
2721
2722 ret = btrfs_end_transaction(trans, async->root);
2723 if (ret && !async->error)
2724 async->error = ret;
2725done:
2726 if (async->sync)
2727 complete(&async->wait);
2728 else
2729 kfree(async);
2730}
2731
2732int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2733 unsigned long count, int wait)
2734{
2735 struct async_delayed_refs *async;
2736 int ret;
2737
2738 async = kmalloc(sizeof(*async), GFP_NOFS);
2739 if (!async)
2740 return -ENOMEM;
2741
2742 async->root = root->fs_info->tree_root;
2743 async->count = count;
2744 async->error = 0;
2745 if (wait)
2746 async->sync = 1;
2747 else
2748 async->sync = 0;
2749 init_completion(&async->wait);
2750
2751 btrfs_init_work(&async->work, delayed_ref_async_start,
2752 NULL, NULL);
2753
2754 btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2755
2756 if (wait) {
2757 wait_for_completion(&async->wait);
2758 ret = async->error;
2759 kfree(async);
2760 return ret;
2761 }
2762 return 0;
2763}
2764
2674/* 2765/*
2675 * this starts processing the delayed reference count updates and 2766 * this starts processing the delayed reference count updates and
2676 * extent insertions we have queued up so far. count can be 2767 * extent insertions we have queued up so far. count can be
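
btrfs_async_run_delayed_refs above packages a delayed-ref run into a btrfs_work item with two ownership modes: fire-and-forget callers let the worker kfree the item when it finishes, while callers passing wait=1 sleep on a completion and collect the error before freeing it themselves. The same pattern reduced to plain workqueues (the ellipsis stands for the real work; error handling kept minimal on purpose):

	#include <linux/workqueue.h>
	#include <linux/completion.h>
	#include <linux/slab.h>

	struct async_job {
		int error;
		int sync;			/* is a caller waiting? */
		struct completion done;
		struct work_struct work;
	};

	static void job_fn(struct work_struct *work)
	{
		struct async_job *job = container_of(work, struct async_job, work);

		job->error = 0;			/* ... do the actual work ... */
		if (job->sync)
			complete(&job->done);	/* the waiter frees the job */
		else
			kfree(job);		/* nobody waits; free ourselves */
	}

	static int run_job(struct workqueue_struct *wq, int wait)
	{
		struct async_job *job = kmalloc(sizeof(*job), GFP_NOFS);

		if (!job)
			return -ENOMEM;
		job->sync = wait;
		init_completion(&job->done);
		INIT_WORK(&job->work, job_fn);
		queue_work(wq, &job->work);

		if (wait) {
			int err;

			wait_for_completion(&job->done);
			err = job->error;
			kfree(job);
			return err;
		}
		return 0;
	}

The patch itself uses btrfs_init_work/btrfs_queue_work and the new extent_workers queue; INIT_WORK/queue_work here are just the closest generic equivalents.
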
@@ -2698,8 +2789,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2698 if (root == root->fs_info->extent_root) 2789 if (root == root->fs_info->extent_root)
2699 root = root->fs_info->tree_root; 2790 root = root->fs_info->tree_root;
2700 2791
2701 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2702
2703 delayed_refs = &trans->transaction->delayed_refs; 2792 delayed_refs = &trans->transaction->delayed_refs;
2704 if (count == 0) { 2793 if (count == 0) {
2705 count = atomic_read(&delayed_refs->num_entries) * 2; 2794 count = atomic_read(&delayed_refs->num_entries) * 2;
@@ -2758,6 +2847,9 @@ again:
2758 goto again; 2847 goto again;
2759 } 2848 }
2760out: 2849out:
2850 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
2851 if (ret)
2852 return ret;
2761 assert_qgroups_uptodate(trans); 2853 assert_qgroups_uptodate(trans);
2762 return 0; 2854 return 0;
2763} 2855}
@@ -2964,7 +3056,7 @@ out:
2964static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3056static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2965 struct btrfs_root *root, 3057 struct btrfs_root *root,
2966 struct extent_buffer *buf, 3058 struct extent_buffer *buf,
2967 int full_backref, int inc, int for_cow) 3059 int full_backref, int inc, int no_quota)
2968{ 3060{
2969 u64 bytenr; 3061 u64 bytenr;
2970 u64 num_bytes; 3062 u64 num_bytes;
@@ -2979,11 +3071,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2979 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3071 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2980 u64, u64, u64, u64, u64, u64, int); 3072 u64, u64, u64, u64, u64, u64, int);
2981 3073
3074#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3075 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
3076 return 0;
3077#endif
2982 ref_root = btrfs_header_owner(buf); 3078 ref_root = btrfs_header_owner(buf);
2983 nritems = btrfs_header_nritems(buf); 3079 nritems = btrfs_header_nritems(buf);
2984 level = btrfs_header_level(buf); 3080 level = btrfs_header_level(buf);
2985 3081
2986 if (!root->ref_cows && level == 0) 3082 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2987 return 0; 3083 return 0;
2988 3084
2989 if (inc) 3085 if (inc)
@@ -3014,7 +3110,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3014 key.offset -= btrfs_file_extent_offset(buf, fi); 3110 key.offset -= btrfs_file_extent_offset(buf, fi);
3015 ret = process_func(trans, root, bytenr, num_bytes, 3111 ret = process_func(trans, root, bytenr, num_bytes,
3016 parent, ref_root, key.objectid, 3112 parent, ref_root, key.objectid,
3017 key.offset, for_cow); 3113 key.offset, no_quota);
3018 if (ret) 3114 if (ret)
3019 goto fail; 3115 goto fail;
3020 } else { 3116 } else {
@@ -3022,7 +3118,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3022 num_bytes = btrfs_level_size(root, level - 1); 3118 num_bytes = btrfs_level_size(root, level - 1);
3023 ret = process_func(trans, root, bytenr, num_bytes, 3119 ret = process_func(trans, root, bytenr, num_bytes,
3024 parent, ref_root, level - 1, 0, 3120 parent, ref_root, level - 1, 0,
3025 for_cow); 3121 no_quota);
3026 if (ret) 3122 if (ret)
3027 goto fail; 3123 goto fail;
3028 } 3124 }
@@ -3033,15 +3129,15 @@ fail:
3033} 3129}
3034 3130
3035int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3131int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3036 struct extent_buffer *buf, int full_backref, int for_cow) 3132 struct extent_buffer *buf, int full_backref, int no_quota)
3037{ 3133{
3038 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); 3134 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
3039} 3135}
3040 3136
3041int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3137int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3042 struct extent_buffer *buf, int full_backref, int for_cow) 3138 struct extent_buffer *buf, int full_backref, int no_quota)
3043{ 3139{
3044 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); 3140 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
3045} 3141}
3046 3142
3047static int write_one_cache_group(struct btrfs_trans_handle *trans, 3143static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3401,10 +3497,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3401 return ret; 3497 return ret;
3402 } 3498 }
3403 3499
3404 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 3500 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3405 INIT_LIST_HEAD(&found->block_groups[i]); 3501 INIT_LIST_HEAD(&found->block_groups[i]);
3406 kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
3407 }
3408 init_rwsem(&found->groups_sem); 3502 init_rwsem(&found->groups_sem);
3409 spin_lock_init(&found->lock); 3503 spin_lock_init(&found->lock);
3410 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3504 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -4204,6 +4298,104 @@ static int flush_space(struct btrfs_root *root,
4204 4298
4205 return ret; 4299 return ret;
4206} 4300}
4301
4302static inline u64
4303btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4304 struct btrfs_space_info *space_info)
4305{
4306 u64 used;
4307 u64 expected;
4308 u64 to_reclaim;
4309
4310 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4311 16 * 1024 * 1024);
4312 spin_lock(&space_info->lock);
4313 if (can_overcommit(root, space_info, to_reclaim,
4314 BTRFS_RESERVE_FLUSH_ALL)) {
4315 to_reclaim = 0;
4316 goto out;
4317 }
4318
4319 used = space_info->bytes_used + space_info->bytes_reserved +
4320 space_info->bytes_pinned + space_info->bytes_readonly +
4321 space_info->bytes_may_use;
4322 if (can_overcommit(root, space_info, 1024 * 1024,
4323 BTRFS_RESERVE_FLUSH_ALL))
4324 expected = div_factor_fine(space_info->total_bytes, 95);
4325 else
4326 expected = div_factor_fine(space_info->total_bytes, 90);
4327
4328 if (used > expected)
4329 to_reclaim = used - expected;
4330 else
4331 to_reclaim = 0;
4332 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4333 space_info->bytes_reserved);
4334out:
4335 spin_unlock(&space_info->lock);
4336
4337 return to_reclaim;
4338}
4339
4340static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4341 struct btrfs_fs_info *fs_info, u64 used)
4342{
4343 return (used >= div_factor_fine(space_info->total_bytes, 98) &&
4344 !btrfs_fs_closing(fs_info) &&
4345 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4346}
4347
4348static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4349 struct btrfs_fs_info *fs_info)
4350{
4351 u64 used;
4352
4353 spin_lock(&space_info->lock);
4354 used = space_info->bytes_used + space_info->bytes_reserved +
4355 space_info->bytes_pinned + space_info->bytes_readonly +
4356 space_info->bytes_may_use;
4357 if (need_do_async_reclaim(space_info, fs_info, used)) {
4358 spin_unlock(&space_info->lock);
4359 return 1;
4360 }
4361 spin_unlock(&space_info->lock);
4362
4363 return 0;
4364}
4365
4366static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4367{
4368 struct btrfs_fs_info *fs_info;
4369 struct btrfs_space_info *space_info;
4370 u64 to_reclaim;
4371 int flush_state;
4372
4373 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4374 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4375
4376 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4377 space_info);
4378 if (!to_reclaim)
4379 return;
4380
4381 flush_state = FLUSH_DELAYED_ITEMS_NR;
4382 do {
4383 flush_space(fs_info->fs_root, space_info, to_reclaim,
4384 to_reclaim, flush_state);
4385 flush_state++;
4386 if (!btrfs_need_do_async_reclaim(space_info, fs_info))
4387 return;
4388 } while (flush_state <= COMMIT_TRANS);
4389
4390 if (btrfs_need_do_async_reclaim(space_info, fs_info))
4391 queue_work(system_unbound_wq, work);
4392}
4393
4394void btrfs_init_async_reclaim_work(struct work_struct *work)
4395{
4396 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4397}
4398
4207/** 4399/**
4208 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4400 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4209 * @root - the root we're allocating for 4401 * @root - the root we're allocating for
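
For sizing, btrfs_calc_reclaim_metadata_size starts from min(num_online_cpus() * 1 MiB, 16 MiB); if the space_info cannot overcommit even that much, it compares current usage against 95% (or, with less headroom, 90%) of total_bytes and reclaims the overshoot, clamped to bytes_may_use + bytes_reserved. Worked numbers, assuming div_factor_fine(n, f) computes n * f / 100 as in fs/btrfs/math.h:

	/* Hypothetical figures: 8 CPUs, a 100 GiB metadata space_info
	 * that is 97 GiB used and cannot overcommit any further. */
	u64 total = 100ULL << 30;
	u64 used = 97ULL << 30;
	u64 to_reclaim = min_t(u64, 8ULL * 1024 * 1024,
			       16ULL * 1024 * 1024);	/* 8 MiB floor */
	u64 expected = total * 95 / 100;		/* div_factor_fine(total, 95) */

	if (used > expected)
		to_reclaim = used - expected;		/* 2 GiB to reclaim */

need_do_async_reclaim then keeps the worker re-queueing itself only while usage stays at or above 98% of the space and the filesystem is neither closing nor remounting.
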
@@ -4311,8 +4503,13 @@ again:
4311 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4503 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4312 flushing = true; 4504 flushing = true;
4313 space_info->flush = 1; 4505 space_info->flush = 1;
4506 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4507 used += orig_bytes;
4508 if (need_do_async_reclaim(space_info, root->fs_info, used) &&
4509 !work_busy(&root->fs_info->async_reclaim_work))
4510 queue_work(system_unbound_wq,
4511 &root->fs_info->async_reclaim_work);
4314 } 4512 }
4315
4316 spin_unlock(&space_info->lock); 4513 spin_unlock(&space_info->lock);
4317 4514
4318 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4515 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
@@ -4369,7 +4566,7 @@ static struct btrfs_block_rsv *get_block_rsv(
4369{ 4566{
4370 struct btrfs_block_rsv *block_rsv = NULL; 4567 struct btrfs_block_rsv *block_rsv = NULL;
4371 4568
4372 if (root->ref_cows) 4569 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4373 block_rsv = trans->block_rsv; 4570 block_rsv = trans->block_rsv;
4374 4571
4375 if (root == root->fs_info->csum_root && trans->adding_csums) 4572 if (root == root->fs_info->csum_root && trans->adding_csums)
@@ -5621,7 +5818,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5621 u64 bytenr, u64 num_bytes, u64 parent, 5818 u64 bytenr, u64 num_bytes, u64 parent,
5622 u64 root_objectid, u64 owner_objectid, 5819 u64 root_objectid, u64 owner_objectid,
5623 u64 owner_offset, int refs_to_drop, 5820 u64 owner_offset, int refs_to_drop,
5624 struct btrfs_delayed_extent_op *extent_op) 5821 struct btrfs_delayed_extent_op *extent_op,
5822 int no_quota)
5625{ 5823{
5626 struct btrfs_key key; 5824 struct btrfs_key key;
5627 struct btrfs_path *path; 5825 struct btrfs_path *path;
@@ -5637,9 +5835,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5637 int num_to_del = 1; 5835 int num_to_del = 1;
5638 u32 item_size; 5836 u32 item_size;
5639 u64 refs; 5837 u64 refs;
5838 int last_ref = 0;
5839 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
5640 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5840 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5641 SKINNY_METADATA); 5841 SKINNY_METADATA);
5642 5842
5843 if (!info->quota_enabled || !is_fstree(root_objectid))
5844 no_quota = 1;
5845
5643 path = btrfs_alloc_path(); 5846 path = btrfs_alloc_path();
5644 if (!path) 5847 if (!path)
5645 return -ENOMEM; 5848 return -ENOMEM;
@@ -5687,7 +5890,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5687 BUG_ON(iref); 5890 BUG_ON(iref);
5688 ret = remove_extent_backref(trans, extent_root, path, 5891 ret = remove_extent_backref(trans, extent_root, path,
5689 NULL, refs_to_drop, 5892 NULL, refs_to_drop,
5690 is_data); 5893 is_data, &last_ref);
5691 if (ret) { 5894 if (ret) {
5692 btrfs_abort_transaction(trans, extent_root, ret); 5895 btrfs_abort_transaction(trans, extent_root, ret);
5693 goto out; 5896 goto out;
@@ -5806,7 +6009,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5806 refs = btrfs_extent_refs(leaf, ei); 6009 refs = btrfs_extent_refs(leaf, ei);
5807 if (refs < refs_to_drop) { 6010 if (refs < refs_to_drop) {
5808 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6011 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
5809 "for bytenr %Lu\n", refs_to_drop, refs, bytenr); 6012 "for bytenr %Lu", refs_to_drop, refs, bytenr);
5810 ret = -EINVAL; 6013 ret = -EINVAL;
5811 btrfs_abort_transaction(trans, extent_root, ret); 6014 btrfs_abort_transaction(trans, extent_root, ret);
5812 goto out; 6015 goto out;
@@ -5814,6 +6017,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5814 refs -= refs_to_drop; 6017 refs -= refs_to_drop;
5815 6018
5816 if (refs > 0) { 6019 if (refs > 0) {
6020 type = BTRFS_QGROUP_OPER_SUB_SHARED;
5817 if (extent_op) 6021 if (extent_op)
5818 __run_delayed_extent_op(extent_op, leaf, ei); 6022 __run_delayed_extent_op(extent_op, leaf, ei);
5819 /* 6023 /*
@@ -5829,7 +6033,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5829 if (found_extent) { 6033 if (found_extent) {
5830 ret = remove_extent_backref(trans, extent_root, path, 6034 ret = remove_extent_backref(trans, extent_root, path,
5831 iref, refs_to_drop, 6035 iref, refs_to_drop,
5832 is_data); 6036 is_data, &last_ref);
5833 if (ret) { 6037 if (ret) {
5834 btrfs_abort_transaction(trans, extent_root, ret); 6038 btrfs_abort_transaction(trans, extent_root, ret);
5835 goto out; 6039 goto out;
@@ -5850,6 +6054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5850 } 6054 }
5851 } 6055 }
5852 6056
6057 last_ref = 1;
5853 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6058 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5854 num_to_del); 6059 num_to_del);
5855 if (ret) { 6060 if (ret) {
@@ -5872,6 +6077,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5872 goto out; 6077 goto out;
5873 } 6078 }
5874 } 6079 }
6080 btrfs_release_path(path);
6081
6082 /* Deal with the quota accounting */
6083 if (!ret && last_ref && !no_quota) {
6084 int mod_seq = 0;
6085
6086 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
6087 type == BTRFS_QGROUP_OPER_SUB_SHARED)
6088 mod_seq = 1;
6089
6090 ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
6091 bytenr, num_bytes, type,
6092 mod_seq);
6093 }
5875out: 6094out:
5876 btrfs_free_path(path); 6095 btrfs_free_path(path);
5877 return ret; 6096 return ret;
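
The drop path mirrors the add path: the backref helpers now report through *last_ref whether the reference just removed was this root's final one, and only then is a qgroup SUB operation recorded: SUB_SHARED when other references remain on the extent item (with mod_seq set for data extents so the accounting can reconstruct the old reference holders), SUB_EXCL when the extent goes away entirely. As a sketch, mirroring the add-side helper suggested earlier (the function name is illustrative):

	static int record_ref_dropped(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 root_objectid, u64 bytenr,
				      u64 num_bytes, u64 owner,
				      int last_ref, u64 refs_remaining)
	{
		enum btrfs_qgroup_operation_type type;
		int mod_seq = 0;

		if (!last_ref)		/* this root still holds a ref */
			return 0;
		type = refs_remaining ? BTRFS_QGROUP_OPER_SUB_SHARED
				      : BTRFS_QGROUP_OPER_SUB_EXCL;
		if (owner >= BTRFS_FIRST_FREE_OBJECTID &&
		    type == BTRFS_QGROUP_OPER_SUB_SHARED)
			mod_seq = 1;	/* data extent: snapshot old roots */

		return btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
					       bytenr, num_bytes, type,
					       mod_seq);
	}
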
@@ -6008,11 +6227,15 @@ out:
6008/* Can return -ENOMEM */ 6227/* Can return -ENOMEM */
6009int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6228int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6010 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6229 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6011 u64 owner, u64 offset, int for_cow) 6230 u64 owner, u64 offset, int no_quota)
6012{ 6231{
6013 int ret; 6232 int ret;
6014 struct btrfs_fs_info *fs_info = root->fs_info; 6233 struct btrfs_fs_info *fs_info = root->fs_info;
6015 6234
6235#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6236 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
6237 return 0;
6238#endif
6016 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6239 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6017 6240
6018 /* 6241 /*
@@ -6028,13 +6251,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6028 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6251 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6029 num_bytes, 6252 num_bytes,
6030 parent, root_objectid, (int)owner, 6253 parent, root_objectid, (int)owner,
6031 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 6254 BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6032 } else { 6255 } else {
6033 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6256 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6034 num_bytes, 6257 num_bytes,
6035 parent, root_objectid, owner, 6258 parent, root_objectid, owner,
6036 offset, BTRFS_DROP_DELAYED_REF, 6259 offset, BTRFS_DROP_DELAYED_REF,
6037 NULL, for_cow); 6260 NULL, no_quota);
6038 } 6261 }
6039 return ret; 6262 return ret;
6040} 6263}
@@ -6514,8 +6737,14 @@ loop:
6514 loop++; 6737 loop++;
6515 if (loop == LOOP_ALLOC_CHUNK) { 6738 if (loop == LOOP_ALLOC_CHUNK) {
6516 struct btrfs_trans_handle *trans; 6739 struct btrfs_trans_handle *trans;
6740 int exist = 0;
6741
6742 trans = current->journal_info;
6743 if (trans)
6744 exist = 1;
6745 else
6746 trans = btrfs_join_transaction(root);
6517 6747
6518 trans = btrfs_join_transaction(root);
6519 if (IS_ERR(trans)) { 6748 if (IS_ERR(trans)) {
6520 ret = PTR_ERR(trans); 6749 ret = PTR_ERR(trans);
6521 goto out; 6750 goto out;
@@ -6532,7 +6761,8 @@ loop:
6532 root, ret); 6761 root, ret);
6533 else 6762 else
6534 ret = 0; 6763 ret = 0;
6535 btrfs_end_transaction(trans, root); 6764 if (!exist)
6765 btrfs_end_transaction(trans, root);
6536 if (ret) 6766 if (ret)
6537 goto out; 6767 goto out;
6538 } 6768 }
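
The chunk-allocation fallback used to join a fresh transaction unconditionally; since this path can now be reached while the task already holds one (tracked in current->journal_info), it reuses that handle and ends only a transaction it opened itself, avoiding a recursive join. The shape of the guard, as a sketch:

	/* Sketch: reuse the task's running transaction when there is one. */
	struct btrfs_trans_handle *trans = current->journal_info;
	bool started = false;

	if (!trans) {
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		started = true;
	}
	/* ... do_chunk_alloc() and friends ... */
	if (started)
		btrfs_end_transaction(trans, root);
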
@@ -6733,6 +6963,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6733 btrfs_mark_buffer_dirty(path->nodes[0]); 6963 btrfs_mark_buffer_dirty(path->nodes[0]);
6734 btrfs_free_path(path); 6964 btrfs_free_path(path);
6735 6965
 6966 /* Always set parent to 0 here since it's exclusive anyway. */
6967 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
6968 ins->objectid, ins->offset,
6969 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
6970 if (ret)
6971 return ret;
6972
6736 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6973 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6737 if (ret) { /* -ENOENT, logic error */ 6974 if (ret) { /* -ENOENT, logic error */
6738 btrfs_err(fs_info, "update block group failed for %llu %llu", 6975 btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6747,7 +6984,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6747 struct btrfs_root *root, 6984 struct btrfs_root *root,
6748 u64 parent, u64 root_objectid, 6985 u64 parent, u64 root_objectid,
6749 u64 flags, struct btrfs_disk_key *key, 6986 u64 flags, struct btrfs_disk_key *key,
6750 int level, struct btrfs_key *ins) 6987 int level, struct btrfs_key *ins,
6988 int no_quota)
6751{ 6989{
6752 int ret; 6990 int ret;
6753 struct btrfs_fs_info *fs_info = root->fs_info; 6991 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6757,6 +6995,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6757 struct btrfs_path *path; 6995 struct btrfs_path *path;
6758 struct extent_buffer *leaf; 6996 struct extent_buffer *leaf;
6759 u32 size = sizeof(*extent_item) + sizeof(*iref); 6997 u32 size = sizeof(*extent_item) + sizeof(*iref);
6998 u64 num_bytes = ins->offset;
6760 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6999 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6761 SKINNY_METADATA); 7000 SKINNY_METADATA);
6762 7001
@@ -6790,6 +7029,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6790 7029
6791 if (skinny_metadata) { 7030 if (skinny_metadata) {
6792 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7031 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7032 num_bytes = root->leafsize;
6793 } else { 7033 } else {
6794 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7034 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6795 btrfs_set_tree_block_key(leaf, block_info, key); 7035 btrfs_set_tree_block_key(leaf, block_info, key);
@@ -6811,6 +7051,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6811 btrfs_mark_buffer_dirty(leaf); 7051 btrfs_mark_buffer_dirty(leaf);
6812 btrfs_free_path(path); 7052 btrfs_free_path(path);
6813 7053
7054 if (!no_quota) {
7055 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
7056 ins->objectid, num_bytes,
7057 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
7058 if (ret)
7059 return ret;
7060 }
7061
6814 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7062 ret = update_block_group(root, ins->objectid, root->leafsize, 1);
6815 if (ret) { /* -ENOENT, logic error */ 7063 if (ret) { /* -ENOENT, logic error */
6816 btrfs_err(fs_info, "update block group failed for %llu %llu", 7064 btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6994,6 +7242,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6994 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7242 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6995 SKINNY_METADATA); 7243 SKINNY_METADATA);
6996 7244
7245#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7246 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
7247 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7248 blocksize, level);
7249 if (!IS_ERR(buf))
7250 root->alloc_bytenr += blocksize;
7251 return buf;
7252 }
7253#endif
6997 block_rsv = use_block_rsv(trans, root, blocksize); 7254 block_rsv = use_block_rsv(trans, root, blocksize);
6998 if (IS_ERR(block_rsv)) 7255 if (IS_ERR(block_rsv))
6999 return ERR_CAST(block_rsv); 7256 return ERR_CAST(block_rsv);
@@ -7735,7 +7992,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7735 } 7992 }
7736 } 7993 }
7737 7994
7738 if (root->in_radix) { 7995 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
7739 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 7996 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7740 } else { 7997 } else {
7741 free_extent_buffer(root->node); 7998 free_extent_buffer(root->node);
@@ -8327,8 +8584,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8327 list_del(&space_info->list); 8584 list_del(&space_info->list);
8328 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8585 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8329 struct kobject *kobj; 8586 struct kobject *kobj;
8330 kobj = &space_info->block_group_kobjs[i]; 8587 kobj = space_info->block_group_kobjs[i];
8331 if (kobj->parent) { 8588 space_info->block_group_kobjs[i] = NULL;
8589 if (kobj) {
8332 kobject_del(kobj); 8590 kobject_del(kobj);
8333 kobject_put(kobj); 8591 kobject_put(kobj);
8334 } 8592 }
@@ -8352,17 +8610,26 @@ static void __link_block_group(struct btrfs_space_info *space_info,
8352 up_write(&space_info->groups_sem); 8610 up_write(&space_info->groups_sem);
8353 8611
8354 if (first) { 8612 if (first) {
8355 struct kobject *kobj = &space_info->block_group_kobjs[index]; 8613 struct raid_kobject *rkobj;
8356 int ret; 8614 int ret;
8357 8615
8358 kobject_get(&space_info->kobj); /* put in release */ 8616 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
8359 ret = kobject_add(kobj, &space_info->kobj, "%s", 8617 if (!rkobj)
8360 get_raid_name(index)); 8618 goto out_err;
8619 rkobj->raid_type = index;
8620 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
8621 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
8622 "%s", get_raid_name(index));
8361 if (ret) { 8623 if (ret) {
8362 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); 8624 kobject_put(&rkobj->kobj);
8363 kobject_put(&space_info->kobj); 8625 goto out_err;
8364 } 8626 }
8627 space_info->block_group_kobjs[index] = &rkobj->kobj;
8365 } 8628 }
8629
8630 return;
8631out_err:
8632 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8366} 8633}
8367 8634
8368static struct btrfs_block_group_cache * 8635static struct btrfs_block_group_cache *
@@ -8611,7 +8878,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8611 8878
8612 extent_root = root->fs_info->extent_root; 8879 extent_root = root->fs_info->extent_root;
8613 8880
8614 root->fs_info->last_trans_log_full_commit = trans->transid; 8881 btrfs_set_log_full_commit(root->fs_info, trans);
8615 8882
8616 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 8883 cache = btrfs_create_block_group_cache(root, chunk_offset, size);
8617 if (!cache) 8884 if (!cache)
@@ -8697,6 +8964,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8697 struct btrfs_root *tree_root = root->fs_info->tree_root; 8964 struct btrfs_root *tree_root = root->fs_info->tree_root;
8698 struct btrfs_key key; 8965 struct btrfs_key key;
8699 struct inode *inode; 8966 struct inode *inode;
8967 struct kobject *kobj = NULL;
8700 int ret; 8968 int ret;
8701 int index; 8969 int index;
8702 int factor; 8970 int factor;
@@ -8796,11 +9064,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8796 */ 9064 */
8797 list_del_init(&block_group->list); 9065 list_del_init(&block_group->list);
8798 if (list_empty(&block_group->space_info->block_groups[index])) { 9066 if (list_empty(&block_group->space_info->block_groups[index])) {
8799 kobject_del(&block_group->space_info->block_group_kobjs[index]); 9067 kobj = block_group->space_info->block_group_kobjs[index];
8800 kobject_put(&block_group->space_info->block_group_kobjs[index]); 9068 block_group->space_info->block_group_kobjs[index] = NULL;
8801 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9069 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8802 } 9070 }
8803 up_write(&block_group->space_info->groups_sem); 9071 up_write(&block_group->space_info->groups_sem);
9072 if (kobj) {
9073 kobject_del(kobj);
9074 kobject_put(kobj);
9075 }
8804 9076
8805 if (block_group->cached == BTRFS_CACHE_STARTED) 9077 if (block_group->cached == BTRFS_CACHE_STARTED)
8806 wait_block_group_cache_done(block_group); 9078 wait_block_group_cache_done(block_group);
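[Editor's note] The remaining extent-tree.c hunks rework the per-raid-type sysfs objects: instead of kobjects embedded in struct btrfs_space_info, __link_block_group() now kzallocs a raid_kobject, initializes and adds it, and stores the pointer in block_group_kobjs[]; both teardown paths (btrfs_free_block_groups() and btrfs_remove_block_group()) first NULL the slot and only then call kobject_del()/kobject_put(). A small userspace model of that detach-then-release discipline follows, using a bare refcount in place of the kobject machinery; all names here are illustrative.

    /* Userspace model of the raid kobject fix: detach the shared pointer
     * first, then drop the reference. Names are illustrative only. */
    #include <stdio.h>
    #include <stdlib.h>

    struct raid_kobj {
            int refcount;
            int raid_type;
    };

    struct space_info_model {
            struct raid_kobj *block_group_kobjs[4];
    };

    static struct raid_kobj *raid_kobj_create(int raid_type)
    {
            struct raid_kobj *k = calloc(1, sizeof(*k));

            if (k) {
                    k->refcount = 1;        /* held by the owner's slot */
                    k->raid_type = raid_type;
            }
            return k;
    }

    static void raid_kobj_put(struct raid_kobj *k)
    {
            if (k && --k->refcount == 0)
                    free(k);
    }

    static void remove_raid_kobj(struct space_info_model *si, int index)
    {
            /* as in the diff: empty the slot before releasing the ref so
             * nobody can observe a dying object through the owner */
            struct raid_kobj *k = si->block_group_kobjs[index];

            si->block_group_kobjs[index] = NULL;
            raid_kobj_put(k);
    }

    int main(void)
    {
            struct space_info_model si = { { NULL } };

            si.block_group_kobjs[0] = raid_kobj_create(0);
            remove_raid_kobj(&si, 0);
            remove_raid_kobj(&si, 0);       /* repeat removal is harmless */
            printf("teardown ok\n");
            return 0;
    }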
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3955e475ceec..f25a9092b946 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1693,6 +1693,7 @@ again:
1693 * shortening the size of the delalloc range we're searching 1693 * shortening the size of the delalloc range we're searching
1694 */ 1694 */
1695 free_extent_state(cached_state); 1695 free_extent_state(cached_state);
1696 cached_state = NULL;
1696 if (!loops) { 1697 if (!loops) {
1697 max_bytes = PAGE_CACHE_SIZE; 1698 max_bytes = PAGE_CACHE_SIZE;
1698 loops = 1; 1699 loops = 1;
@@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2367 if (!uptodate) { 2368 if (!uptodate) {
2368 ClearPageUptodate(page); 2369 ClearPageUptodate(page);
2369 SetPageError(page); 2370 SetPageError(page);
2371 ret = ret < 0 ? ret : -EIO;
2372 mapping_set_error(page->mapping, ret);
2370 } 2373 }
2371 return 0; 2374 return 0;
2372} 2375}
@@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,
3098} 3101}
3099 3102
3100/* 3103/*
3101 * the writepage semantics are similar to regular writepage. extent 3104 * helper for __extent_writepage, doing all of the delayed allocation setup.
3102 * records are inserted to lock ranges in the tree, and as dirty areas 3105 *
3103 * are found, they are marked writeback. Then the lock bits are removed 3106 * This returns 1 if our fill_delalloc function did all the work required
3104 * and the end_io handler clears the writeback ranges 3107 * to write the page (copy into inline extent). In this case the IO has
3108 * been started and the page is already unlocked.
3109 *
3110 * This returns 0 if all went well (page still locked)
3111 * This returns < 0 if there were errors (page still locked)
3105 */ 3112 */
3106static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3113static noinline_for_stack int writepage_delalloc(struct inode *inode,
3107 void *data) 3114 struct page *page, struct writeback_control *wbc,
3115 struct extent_page_data *epd,
3116 u64 delalloc_start,
3117 unsigned long *nr_written)
3118{
3119 struct extent_io_tree *tree = epd->tree;
3120 u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
3121 u64 nr_delalloc;
3122 u64 delalloc_to_write = 0;
3123 u64 delalloc_end = 0;
3124 int ret;
3125 int page_started = 0;
3126
3127 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
3128 return 0;
3129
3130 while (delalloc_end < page_end) {
3131 nr_delalloc = find_lock_delalloc_range(inode, tree,
3132 page,
3133 &delalloc_start,
3134 &delalloc_end,
3135 128 * 1024 * 1024);
3136 if (nr_delalloc == 0) {
3137 delalloc_start = delalloc_end + 1;
3138 continue;
3139 }
3140 ret = tree->ops->fill_delalloc(inode, page,
3141 delalloc_start,
3142 delalloc_end,
3143 &page_started,
3144 nr_written);
3145 /* File system has been set read-only */
3146 if (ret) {
3147 SetPageError(page);
3148 /* fill_delalloc should return < 0 for error
3149 * but just in case, we use > 0 here meaning the
3150 * IO is started, so we don't want to return > 0
3151 * unless things are going well.
3152 */
3153 ret = ret < 0 ? ret : -EIO;
3154 goto done;
3155 }
3156 /*
3157 * delalloc_end is already one less than the total
3158 * length, so we don't subtract one from
3159 * PAGE_CACHE_SIZE
3160 */
3161 delalloc_to_write += (delalloc_end - delalloc_start +
3162 PAGE_CACHE_SIZE) >>
3163 PAGE_CACHE_SHIFT;
3164 delalloc_start = delalloc_end + 1;
3165 }
3166 if (wbc->nr_to_write < delalloc_to_write) {
3167 int thresh = 8192;
3168
3169 if (delalloc_to_write < thresh * 2)
3170 thresh = delalloc_to_write;
3171 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3172 thresh);
3173 }
3174
3175 /* did the fill delalloc function already unlock and start
3176 * the IO?
3177 */
3178 if (page_started) {
3179 /*
3180 * we've unlocked the page, so we can't update
3181 * the mapping's writeback index, just update
3182 * nr_to_write.
3183 */
3184 wbc->nr_to_write -= *nr_written;
3185 return 1;
3186 }
3187
3188 ret = 0;
3189
3190done:
3191 return ret;
3192}
3193
3194/*
3195 * helper for __extent_writepage. This calls the writepage start hooks,
3196 * and does the loop to map the page into extents and bios.
3197 *
3198 * We return 1 if the IO is started and the page is unlocked,
3199 * 0 if all went well (page still locked)
3200 * < 0 if there were errors (page still locked)
3201 */
3202static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3203 struct page *page,
3204 struct writeback_control *wbc,
3205 struct extent_page_data *epd,
3206 loff_t i_size,
3207 unsigned long nr_written,
3208 int write_flags, int *nr_ret)
3108{ 3209{
3109 struct inode *inode = page->mapping->host;
3110 struct extent_page_data *epd = data;
3111 struct extent_io_tree *tree = epd->tree; 3210 struct extent_io_tree *tree = epd->tree;
3112 u64 start = page_offset(page); 3211 u64 start = page_offset(page);
3113 u64 delalloc_start;
3114 u64 page_end = start + PAGE_CACHE_SIZE - 1; 3212 u64 page_end = start + PAGE_CACHE_SIZE - 1;
3115 u64 end; 3213 u64 end;
3116 u64 cur = start; 3214 u64 cur = start;
3117 u64 extent_offset; 3215 u64 extent_offset;
3118 u64 last_byte = i_size_read(inode);
3119 u64 block_start; 3216 u64 block_start;
3120 u64 iosize; 3217 u64 iosize;
3121 sector_t sector; 3218 sector_t sector;
3122 struct extent_state *cached_state = NULL; 3219 struct extent_state *cached_state = NULL;
3123 struct extent_map *em; 3220 struct extent_map *em;
3124 struct block_device *bdev; 3221 struct block_device *bdev;
3125 int ret;
3126 int nr = 0;
3127 size_t pg_offset = 0; 3222 size_t pg_offset = 0;
3128 size_t blocksize; 3223 size_t blocksize;
3129 loff_t i_size = i_size_read(inode); 3224 int ret = 0;
3130 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 3225 int nr = 0;
3131 u64 nr_delalloc; 3226 bool compressed;
3132 u64 delalloc_end;
3133 int page_started;
3134 int compressed;
3135 int write_flags;
3136 unsigned long nr_written = 0;
3137 bool fill_delalloc = true;
3138
3139 if (wbc->sync_mode == WB_SYNC_ALL)
3140 write_flags = WRITE_SYNC;
3141 else
3142 write_flags = WRITE;
3143
3144 trace___extent_writepage(page, inode, wbc);
3145
3146 WARN_ON(!PageLocked(page));
3147
3148 ClearPageError(page);
3149
3150 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
3151 if (page->index > end_index ||
3152 (page->index == end_index && !pg_offset)) {
3153 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
3154 unlock_page(page);
3155 return 0;
3156 }
3157
3158 if (page->index == end_index) {
3159 char *userpage;
3160
3161 userpage = kmap_atomic(page);
3162 memset(userpage + pg_offset, 0,
3163 PAGE_CACHE_SIZE - pg_offset);
3164 kunmap_atomic(userpage);
3165 flush_dcache_page(page);
3166 }
3167 pg_offset = 0;
3168
3169 set_page_extent_mapped(page);
3170
3171 if (!tree->ops || !tree->ops->fill_delalloc)
3172 fill_delalloc = false;
3173
3174 delalloc_start = start;
3175 delalloc_end = 0;
3176 page_started = 0;
3177 if (!epd->extent_locked && fill_delalloc) {
3178 u64 delalloc_to_write = 0;
3179 /*
3180 * make sure the wbc mapping index is at least updated
3181 * to this page.
3182 */
3183 update_nr_written(page, wbc, 0);
3184
3185 while (delalloc_end < page_end) {
3186 nr_delalloc = find_lock_delalloc_range(inode, tree,
3187 page,
3188 &delalloc_start,
3189 &delalloc_end,
3190 128 * 1024 * 1024);
3191 if (nr_delalloc == 0) {
3192 delalloc_start = delalloc_end + 1;
3193 continue;
3194 }
3195 ret = tree->ops->fill_delalloc(inode, page,
3196 delalloc_start,
3197 delalloc_end,
3198 &page_started,
3199 &nr_written);
3200 /* File system has been set read-only */
3201 if (ret) {
3202 SetPageError(page);
3203 goto done;
3204 }
3205 /*
3206 * delalloc_end is already one less than the total
3207 * length, so we don't subtract one from
3208 * PAGE_CACHE_SIZE
3209 */
3210 delalloc_to_write += (delalloc_end - delalloc_start +
3211 PAGE_CACHE_SIZE) >>
3212 PAGE_CACHE_SHIFT;
3213 delalloc_start = delalloc_end + 1;
3214 }
3215 if (wbc->nr_to_write < delalloc_to_write) {
3216 int thresh = 8192;
3217
3218 if (delalloc_to_write < thresh * 2)
3219 thresh = delalloc_to_write;
3220 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3221 thresh);
3222 }
3223 3227
3224 /* did the fill delalloc function already unlock and start
3225 * the IO?
3226 */
3227 if (page_started) {
3228 ret = 0;
3229 /*
3230 * we've unlocked the page, so we can't update
3231 * the mapping's writeback index, just update
3232 * nr_to_write.
3233 */
3234 wbc->nr_to_write -= nr_written;
3235 goto done_unlocked;
3236 }
3237 }
3238 if (tree->ops && tree->ops->writepage_start_hook) { 3228 if (tree->ops && tree->ops->writepage_start_hook) {
3239 ret = tree->ops->writepage_start_hook(page, start, 3229 ret = tree->ops->writepage_start_hook(page, start,
3240 page_end); 3230 page_end);
@@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3244 wbc->pages_skipped++; 3234 wbc->pages_skipped++;
3245 else 3235 else
3246 redirty_page_for_writepage(wbc, page); 3236 redirty_page_for_writepage(wbc, page);
3237
3247 update_nr_written(page, wbc, nr_written); 3238 update_nr_written(page, wbc, nr_written);
3248 unlock_page(page); 3239 unlock_page(page);
3249 ret = 0; 3240 ret = 1;
3250 goto done_unlocked; 3241 goto done_unlocked;
3251 } 3242 }
3252 } 3243 }
@@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3258 update_nr_written(page, wbc, nr_written + 1); 3249 update_nr_written(page, wbc, nr_written + 1);
3259 3250
3260 end = page_end; 3251 end = page_end;
3261 if (last_byte <= start) { 3252 if (i_size <= start) {
3262 if (tree->ops && tree->ops->writepage_end_io_hook) 3253 if (tree->ops && tree->ops->writepage_end_io_hook)
3263 tree->ops->writepage_end_io_hook(page, start, 3254 tree->ops->writepage_end_io_hook(page, start,
3264 page_end, NULL, 1); 3255 page_end, NULL, 1);
@@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3268 blocksize = inode->i_sb->s_blocksize; 3259 blocksize = inode->i_sb->s_blocksize;
3269 3260
3270 while (cur <= end) { 3261 while (cur <= end) {
3271 if (cur >= last_byte) { 3262 u64 em_end;
3263 if (cur >= i_size) {
3272 if (tree->ops && tree->ops->writepage_end_io_hook) 3264 if (tree->ops && tree->ops->writepage_end_io_hook)
3273 tree->ops->writepage_end_io_hook(page, cur, 3265 tree->ops->writepage_end_io_hook(page, cur,
3274 page_end, NULL, 1); 3266 page_end, NULL, 1);
@@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3278 end - cur + 1, 1); 3270 end - cur + 1, 1);
3279 if (IS_ERR_OR_NULL(em)) { 3271 if (IS_ERR_OR_NULL(em)) {
3280 SetPageError(page); 3272 SetPageError(page);
3273 ret = PTR_ERR_OR_ZERO(em);
3281 break; 3274 break;
3282 } 3275 }
3283 3276
3284 extent_offset = cur - em->start; 3277 extent_offset = cur - em->start;
3285 BUG_ON(extent_map_end(em) <= cur); 3278 em_end = extent_map_end(em);
3279 BUG_ON(em_end <= cur);
3286 BUG_ON(end < cur); 3280 BUG_ON(end < cur);
3287 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3281 iosize = min(em_end - cur, end - cur + 1);
3288 iosize = ALIGN(iosize, blocksize); 3282 iosize = ALIGN(iosize, blocksize);
3289 sector = (em->block_start + extent_offset) >> 9; 3283 sector = (em->block_start + extent_offset) >> 9;
3290 bdev = em->bdev; 3284 bdev = em->bdev;
@@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3320 pg_offset += iosize; 3314 pg_offset += iosize;
3321 continue; 3315 continue;
3322 } 3316 }
3323 /* leave this out until we have a page_mkwrite call */
3324 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
3325 EXTENT_DIRTY, 0, NULL)) {
3326 cur = cur + iosize;
3327 pg_offset += iosize;
3328 continue;
3329 }
3330 3317
3331 if (tree->ops && tree->ops->writepage_io_hook) { 3318 if (tree->ops && tree->ops->writepage_io_hook) {
3332 ret = tree->ops->writepage_io_hook(page, cur, 3319 ret = tree->ops->writepage_io_hook(page, cur,
@@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3337 if (ret) { 3324 if (ret) {
3338 SetPageError(page); 3325 SetPageError(page);
3339 } else { 3326 } else {
3340 unsigned long max_nr = end_index + 1; 3327 unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
3341 3328
3342 set_range_writeback(tree, cur, cur + iosize - 1); 3329 set_range_writeback(tree, cur, cur + iosize - 1);
3343 if (!PageWriteback(page)) { 3330 if (!PageWriteback(page)) {
@@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3359 nr++; 3346 nr++;
3360 } 3347 }
3361done: 3348done:
3349 *nr_ret = nr;
3350
3351done_unlocked:
3352
3353 /* drop our reference on any cached states */
3354 free_extent_state(cached_state);
3355 return ret;
3356}
3357
3358/*
3359 * the writepage semantics are similar to regular writepage. extent
3360 * records are inserted to lock ranges in the tree, and as dirty areas
3361 * are found, they are marked writeback. Then the lock bits are removed
3362 * and the end_io handler clears the writeback ranges
3363 */
3364static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3365 void *data)
3366{
3367 struct inode *inode = page->mapping->host;
3368 struct extent_page_data *epd = data;
3369 u64 start = page_offset(page);
3370 u64 page_end = start + PAGE_CACHE_SIZE - 1;
3371 int ret;
3372 int nr = 0;
3373 size_t pg_offset = 0;
3374 loff_t i_size = i_size_read(inode);
3375 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
3376 int write_flags;
3377 unsigned long nr_written = 0;
3378
3379 if (wbc->sync_mode == WB_SYNC_ALL)
3380 write_flags = WRITE_SYNC;
3381 else
3382 write_flags = WRITE;
3383
3384 trace___extent_writepage(page, inode, wbc);
3385
3386 WARN_ON(!PageLocked(page));
3387
3388 ClearPageError(page);
3389
3390 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
3391 if (page->index > end_index ||
3392 (page->index == end_index && !pg_offset)) {
3393 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
3394 unlock_page(page);
3395 return 0;
3396 }
3397
3398 if (page->index == end_index) {
3399 char *userpage;
3400
3401 userpage = kmap_atomic(page);
3402 memset(userpage + pg_offset, 0,
3403 PAGE_CACHE_SIZE - pg_offset);
3404 kunmap_atomic(userpage);
3405 flush_dcache_page(page);
3406 }
3407
3408 pg_offset = 0;
3409
3410 set_page_extent_mapped(page);
3411
3412 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
3413 if (ret == 1)
3414 goto done_unlocked;
3415 if (ret)
3416 goto done;
3417
3418 ret = __extent_writepage_io(inode, page, wbc, epd,
3419 i_size, nr_written, write_flags, &nr);
3420 if (ret == 1)
3421 goto done_unlocked;
3422
3423done:
3362 if (nr == 0) { 3424 if (nr == 0) {
3363 /* make sure the mapping tag for page dirty gets cleared */ 3425 /* make sure the mapping tag for page dirty gets cleared */
3364 set_page_writeback(page); 3426 set_page_writeback(page);
3365 end_page_writeback(page); 3427 end_page_writeback(page);
3366 } 3428 }
3429 if (PageError(page)) {
3430 ret = ret < 0 ? ret : -EIO;
3431 end_extent_writepage(page, ret, start, page_end);
3432 }
3367 unlock_page(page); 3433 unlock_page(page);
3434 return ret;
3368 3435
3369done_unlocked: 3436done_unlocked:
3370
3371 /* drop our reference on any cached states */
3372 free_extent_state(cached_state);
3373 return 0; 3437 return 0;
3374} 3438}
3375 3439
@@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3385 TASK_UNINTERRUPTIBLE); 3449 TASK_UNINTERRUPTIBLE);
3386} 3450}
3387 3451
3388static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3452static noinline_for_stack int
3389 struct btrfs_fs_info *fs_info, 3453lock_extent_buffer_for_io(struct extent_buffer *eb,
3390 struct extent_page_data *epd) 3454 struct btrfs_fs_info *fs_info,
3455 struct extent_page_data *epd)
3391{ 3456{
3392 unsigned long i, num_pages; 3457 unsigned long i, num_pages;
3393 int flush = 0; 3458 int flush = 0;
@@ -3458,7 +3523,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3458static void end_extent_buffer_writeback(struct extent_buffer *eb) 3523static void end_extent_buffer_writeback(struct extent_buffer *eb)
3459{ 3524{
3460 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3525 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3461 smp_mb__after_clear_bit(); 3526 smp_mb__after_atomic();
3462 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3527 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3463} 3528}
3464 3529
@@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3492 bio_put(bio); 3557 bio_put(bio);
3493} 3558}
3494 3559
3495static int write_one_eb(struct extent_buffer *eb, 3560static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3496 struct btrfs_fs_info *fs_info, 3561 struct btrfs_fs_info *fs_info,
3497 struct writeback_control *wbc, 3562 struct writeback_control *wbc,
3498 struct extent_page_data *epd) 3563 struct extent_page_data *epd)
@@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
3690 struct inode *inode = mapping->host; 3755 struct inode *inode = mapping->host;
3691 int ret = 0; 3756 int ret = 0;
3692 int done = 0; 3757 int done = 0;
3758 int err = 0;
3693 int nr_to_write_done = 0; 3759 int nr_to_write_done = 0;
3694 struct pagevec pvec; 3760 struct pagevec pvec;
3695 int nr_pages; 3761 int nr_pages;
@@ -3776,8 +3842,8 @@ retry:
3776 unlock_page(page); 3842 unlock_page(page);
3777 ret = 0; 3843 ret = 0;
3778 } 3844 }
3779 if (ret) 3845 if (!err && ret < 0)
3780 done = 1; 3846 err = ret;
3781 3847
3782 /* 3848 /*
3783 * the filesystem may choose to bump up nr_to_write. 3849 * the filesystem may choose to bump up nr_to_write.
@@ -3789,7 +3855,7 @@ retry:
3789 pagevec_release(&pvec); 3855 pagevec_release(&pvec);
3790 cond_resched(); 3856 cond_resched();
3791 } 3857 }
3792 if (!scanned && !done) { 3858 if (!scanned && !done && !err) {
3793 /* 3859 /*
3794 * We hit the last page and there is more work to be done: wrap 3860 * We hit the last page and there is more work to be done: wrap
3795 * back to the start of the file 3861 * back to the start of the file
@@ -3799,7 +3865,7 @@ retry:
3799 goto retry; 3865 goto retry;
3800 } 3866 }
3801 btrfs_add_delayed_iput(inode); 3867 btrfs_add_delayed_iput(inode);
3802 return ret; 3868 return err;
3803} 3869}
3804 3870
3805static void flush_epd_write_bio(struct extent_page_data *epd) 3871static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4510,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4510 spin_unlock(&eb->refs_lock); 4576 spin_unlock(&eb->refs_lock);
4511} 4577}
4512 4578
4513static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4579static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4580 struct page *accessed)
4514{ 4581{
4515 unsigned long num_pages, i; 4582 unsigned long num_pages, i;
4516 4583
@@ -4519,7 +4586,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4519 num_pages = num_extent_pages(eb->start, eb->len); 4586 num_pages = num_extent_pages(eb->start, eb->len);
4520 for (i = 0; i < num_pages; i++) { 4587 for (i = 0; i < num_pages; i++) {
4521 struct page *p = extent_buffer_page(eb, i); 4588 struct page *p = extent_buffer_page(eb, i);
4522 mark_page_accessed(p); 4589 if (p != accessed)
4590 mark_page_accessed(p);
4523 } 4591 }
4524} 4592}
4525 4593
@@ -4533,7 +4601,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4533 start >> PAGE_CACHE_SHIFT); 4601 start >> PAGE_CACHE_SHIFT);
4534 if (eb && atomic_inc_not_zero(&eb->refs)) { 4602 if (eb && atomic_inc_not_zero(&eb->refs)) {
4535 rcu_read_unlock(); 4603 rcu_read_unlock();
4536 mark_extent_buffer_accessed(eb); 4604 mark_extent_buffer_accessed(eb, NULL);
4537 return eb; 4605 return eb;
4538 } 4606 }
4539 rcu_read_unlock(); 4607 rcu_read_unlock();
@@ -4541,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4541 return NULL; 4609 return NULL;
4542} 4610}
4543 4611
4612#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4613struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4614 u64 start, unsigned long len)
4615{
4616 struct extent_buffer *eb, *exists = NULL;
4617 int ret;
4618
4619 eb = find_extent_buffer(fs_info, start);
4620 if (eb)
4621 return eb;
4622 eb = alloc_dummy_extent_buffer(start, len);
4623 if (!eb)
4624 return NULL;
4625 eb->fs_info = fs_info;
4626again:
4627 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4628 if (ret)
4629 goto free_eb;
4630 spin_lock(&fs_info->buffer_lock);
4631 ret = radix_tree_insert(&fs_info->buffer_radix,
4632 start >> PAGE_CACHE_SHIFT, eb);
4633 spin_unlock(&fs_info->buffer_lock);
4634 radix_tree_preload_end();
4635 if (ret == -EEXIST) {
4636 exists = find_extent_buffer(fs_info, start);
4637 if (exists)
4638 goto free_eb;
4639 else
4640 goto again;
4641 }
4642 check_buffer_tree_ref(eb);
4643 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4644
4645 /*
4646 * We will free dummy extent buffers if they come into
4647 * free_extent_buffer with a ref count of 2, but if we are using this we
4648 * want the buffers to stay in memory until we're done with them, so
4649 * bump the ref count again.
4650 */
4651 atomic_inc(&eb->refs);
4652 return eb;
4653free_eb:
4654 btrfs_release_extent_buffer(eb);
4655 return exists;
4656}
4657#endif
4658
4544struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4659struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4545 u64 start, unsigned long len) 4660 u64 start, unsigned long len)
4546{ 4661{
@@ -4581,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4581 spin_unlock(&mapping->private_lock); 4696 spin_unlock(&mapping->private_lock);
4582 unlock_page(p); 4697 unlock_page(p);
4583 page_cache_release(p); 4698 page_cache_release(p);
4584 mark_extent_buffer_accessed(exists); 4699 mark_extent_buffer_accessed(exists, p);
4585 goto free_eb; 4700 goto free_eb;
4586 } 4701 }
4587 4702
@@ -4596,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4596 attach_extent_buffer_page(eb, p); 4711 attach_extent_buffer_page(eb, p);
4597 spin_unlock(&mapping->private_lock); 4712 spin_unlock(&mapping->private_lock);
4598 WARN_ON(PageDirty(p)); 4713 WARN_ON(PageDirty(p));
4599 mark_page_accessed(p);
4600 eb->pages[i] = p; 4714 eb->pages[i] = p;
4601 if (!PageUptodate(p)) 4715 if (!PageUptodate(p))
4602 uptodate = 0; 4716 uptodate = 0;
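[Editor's note] The extent_io.c refactor splits __extent_writepage() into writepage_delalloc() and __extent_writepage_io(), both sharing one convention: return 1 when the helper finished the page itself (IO started, page already unlocked), 0 on success with the page still locked, negative on error with the page still locked. extent_write_cache_pages() then latches the first negative return into err while continuing the loop. Below is a toy dispatcher showing how a caller folds those three outcomes together; the helper bodies are hypothetical stand-ins, not the real logic.

    /* Sketch of the 1 / 0 / < 0 return convention introduced above. */
    #include <errno.h>
    #include <stdio.h>

    /* Stand-in helpers: 1 = page finished and unlocked by the helper,
     * 0 = success with the page still locked, < 0 = error, page locked. */
    static int do_delalloc(int scenario) { return scenario == 1 ? 1 : 0; }
    static int do_page_io(int scenario)  { return scenario == 2 ? -EIO : 0; }

    static int writepage(int scenario)
    {
            int ret;

            ret = do_delalloc(scenario);
            if (ret == 1)
                    return 0;       /* helper already unlocked the page */
            if (ret < 0)
                    goto done;

            ret = do_page_io(scenario);
            if (ret == 1)
                    return 0;
    done:
            /* page is still locked on this path: record errors, unlock */
            return ret;
    }

    int main(void)
    {
            for (int s = 0; s < 3; s++)
                    printf("scenario %d -> %d\n", s, writepage(s));
            return 0;
    }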
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c488b45237bf..8b63f2d46518 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -350,5 +350,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
350 struct extent_io_tree *tree, 350 struct extent_io_tree *tree,
351 struct page *locked_page, u64 *start, 351 struct page *locked_page, u64 *start,
352 u64 *end, u64 max_bytes); 352 u64 *end, u64 max_bytes);
353struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
354 u64 start, unsigned long len);
353#endif 355#endif
354#endif 356#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 127555b29f58..f46cfe45d686 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
281found: 281found:
282 csum += count * csum_size; 282 csum += count * csum_size;
283 nblocks -= count; 283 nblocks -= count;
284 bio_index += count;
284 while (count--) { 285 while (count--) {
285 disk_bytenr += bvec->bv_len; 286 disk_bytenr += bvec->bv_len;
286 offset += bvec->bv_len; 287 offset += bvec->bv_len;
287 bio_index++;
288 bvec++; 288 bvec++;
289 } 289 }
290 } 290 }
@@ -750,7 +750,7 @@ again:
750 int slot = path->slots[0] + 1; 750 int slot = path->slots[0] + 1;
751 /* we didn't find a csum item, insert one */ 751 /* we didn't find a csum item, insert one */
752 nritems = btrfs_header_nritems(path->nodes[0]); 752 nritems = btrfs_header_nritems(path->nodes[0]);
753 if (path->slots[0] >= nritems - 1) { 753 if (!nritems || (path->slots[0] >= nritems - 1)) {
754 ret = btrfs_next_leaf(root, path); 754 ret = btrfs_next_leaf(root, path);
755 if (ret == 1) 755 if (ret == 1)
756 found_next = 1; 756 found_next = 1;
@@ -885,3 +885,79 @@ out:
885fail_unlock: 885fail_unlock:
886 goto out; 886 goto out;
887} 887}
888
889void btrfs_extent_item_to_extent_map(struct inode *inode,
890 const struct btrfs_path *path,
891 struct btrfs_file_extent_item *fi,
892 const bool new_inline,
893 struct extent_map *em)
894{
895 struct btrfs_root *root = BTRFS_I(inode)->root;
896 struct extent_buffer *leaf = path->nodes[0];
897 const int slot = path->slots[0];
898 struct btrfs_key key;
899 u64 extent_start, extent_end;
900 u64 bytenr;
901 u8 type = btrfs_file_extent_type(leaf, fi);
902 int compress_type = btrfs_file_extent_compression(leaf, fi);
903
904 em->bdev = root->fs_info->fs_devices->latest_bdev;
905 btrfs_item_key_to_cpu(leaf, &key, slot);
906 extent_start = key.offset;
907
908 if (type == BTRFS_FILE_EXTENT_REG ||
909 type == BTRFS_FILE_EXTENT_PREALLOC) {
910 extent_end = extent_start +
911 btrfs_file_extent_num_bytes(leaf, fi);
912 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
913 size_t size;
914 size = btrfs_file_extent_inline_len(leaf, slot, fi);
915 extent_end = ALIGN(extent_start + size, root->sectorsize);
916 }
917
918 em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
919 if (type == BTRFS_FILE_EXTENT_REG ||
920 type == BTRFS_FILE_EXTENT_PREALLOC) {
921 em->start = extent_start;
922 em->len = extent_end - extent_start;
923 em->orig_start = extent_start -
924 btrfs_file_extent_offset(leaf, fi);
925 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
926 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
927 if (bytenr == 0) {
928 em->block_start = EXTENT_MAP_HOLE;
929 return;
930 }
931 if (compress_type != BTRFS_COMPRESS_NONE) {
932 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
933 em->compress_type = compress_type;
934 em->block_start = bytenr;
935 em->block_len = em->orig_block_len;
936 } else {
937 bytenr += btrfs_file_extent_offset(leaf, fi);
938 em->block_start = bytenr;
939 em->block_len = em->len;
940 if (type == BTRFS_FILE_EXTENT_PREALLOC)
941 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
942 }
943 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
944 em->block_start = EXTENT_MAP_INLINE;
945 em->start = extent_start;
946 em->len = extent_end - extent_start;
947 /*
948 * Initialize orig_start and block_len with the same values
949 * as in inode.c:btrfs_get_extent().
950 */
951 em->orig_start = EXTENT_MAP_HOLE;
952 em->block_len = (u64)-1;
953 if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
954 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
955 em->compress_type = compress_type;
956 }
957 } else {
958 btrfs_err(root->fs_info,
959 "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
960 type, btrfs_ino(inode), extent_start,
961 root->root_key.objectid);
962 }
963}
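[Editor's note] btrfs_extent_item_to_extent_map(), added above, centralizes the translation from an on-disk file extent item to an in-memory extent map: regular and prealloc extents take block_start from disk_bytenr (plus the file-extent offset when uncompressed), disk_bytenr == 0 encodes a hole, and inline extents are tagged with EXTENT_MAP_INLINE. A toy userspace rendering of those cases follows; the struct layouts and sentinel values are simplified stand-ins for the kernel's definitions.

    /* Toy translation of a file extent item into an extent map, mirroring
     * the cases above. Sentinels and layouts are simplified. */
    #include <stdio.h>
    #include <stdint.h>

    #define EXTENT_MAP_HOLE   ((uint64_t)-3)   /* illustrative sentinel */
    #define EXTENT_MAP_INLINE ((uint64_t)-2)   /* illustrative sentinel */

    enum item_type { EXTENT_REG, EXTENT_INLINE };

    struct toy_item {                /* what sits in the btree leaf */
            enum item_type type;
            uint64_t key_offset;     /* logical start in the file */
            uint64_t num_bytes;
            uint64_t disk_bytenr;    /* 0 encodes a hole */
            uint64_t offset;         /* offset into the disk extent */
    };

    struct toy_em {                  /* what the caller gets back */
            uint64_t start, len, block_start;
    };

    static void item_to_em(const struct toy_item *fi, struct toy_em *em)
    {
            em->start = fi->key_offset;
            em->len = fi->num_bytes;
            if (fi->type == EXTENT_INLINE)
                    em->block_start = EXTENT_MAP_INLINE;
            else if (fi->disk_bytenr == 0)
                    em->block_start = EXTENT_MAP_HOLE;  /* sparse region */
            else    /* uncompressed: skip into the disk extent */
                    em->block_start = fi->disk_bytenr + fi->offset;
    }

    int main(void)
    {
            struct toy_item hole = { EXTENT_REG, 4096, 8192, 0, 0 };
            struct toy_em em;

            item_to_em(&hole, &em);
            printf("start=%llu len=%llu is_hole=%d\n",
                   (unsigned long long)em.start, (unsigned long long)em.len,
                   em.block_start == EXTENT_MAP_HOLE);
            return 0;
    }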
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 17e7393c50f0..1f2b99cb55ea 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,7 @@
40#include "tree-log.h" 40#include "tree-log.h"
41#include "locking.h" 41#include "locking.h"
42#include "volumes.h" 42#include "volumes.h"
43#include "qgroup.h"
43 44
44static struct kmem_cache *btrfs_inode_defrag_cachep; 45static struct kmem_cache *btrfs_inode_defrag_cachep;
45/* 46/*
@@ -470,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
470 for (i = 0; i < num_pages; i++) { 471 for (i = 0; i < num_pages; i++) {
471 /* page checked is some magic around finding pages that 472 /* page checked is some magic around finding pages that
472 * have been modified without going through btrfs_set_page_dirty 473 * have been modified without going through btrfs_set_page_dirty
473 * clear it here 474 * clear it here. There should be no need to mark the pages
475 * accessed here, as prepare_pages already marked them
476 * accessed via find_or_create_page()
474 */ 477 */
475 ClearPageChecked(pages[i]); 478 ClearPageChecked(pages[i]);
476 unlock_page(pages[i]); 479 unlock_page(pages[i]);
477 mark_page_accessed(pages[i]);
478 page_cache_release(pages[i]); 480 page_cache_release(pages[i]);
479 } 481 }
480} 482}
@@ -714,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
714 int recow; 716 int recow;
715 int ret; 717 int ret;
716 int modify_tree = -1; 718 int modify_tree = -1;
717 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 719 int update_refs;
718 int found = 0; 720 int found = 0;
719 int leafs_visited = 0; 721 int leafs_visited = 0;
720 722
@@ -724,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
724 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) 726 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
725 modify_tree = 0; 727 modify_tree = 0;
726 728
729 update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
730 root == root->fs_info->tree_root);
727 while (1) { 731 while (1) {
728 recow = 0; 732 recow = 0;
729 ret = btrfs_lookup_file_extent(trans, root, path, ino, 733 ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -780,6 +784,18 @@ next_slot:
780 extent_end = search_start; 784 extent_end = search_start;
781 } 785 }
782 786
787 /*
788 * Don't skip extent items representing 0 byte lengths. They
789 * used to be created (bug) if, while punching holes, we hit
790 * an -ENOSPC condition. So if we find one here, just ensure we
791 * delete it, otherwise we would insert a new file extent item
792 * with the same key (offset) as that 0 bytes length file
793 * extent item in the call to setup_items_for_insert() later
794 * in this function.
795 */
796 if (extent_end == key.offset && extent_end >= search_start)
797 goto delete_extent_item;
798
783 if (extent_end <= search_start) { 799 if (extent_end <= search_start) {
784 path->slots[0]++; 800 path->slots[0]++;
785 goto next_slot; 801 goto next_slot;
@@ -835,7 +851,7 @@ next_slot:
835 disk_bytenr, num_bytes, 0, 851 disk_bytenr, num_bytes, 0,
836 root->root_key.objectid, 852 root->root_key.objectid,
837 new_key.objectid, 853 new_key.objectid,
838 start - extent_offset, 0); 854 start - extent_offset, 1);
839 BUG_ON(ret); /* -ENOMEM */ 855 BUG_ON(ret); /* -ENOMEM */
840 } 856 }
841 key.offset = start; 857 key.offset = start;
@@ -893,6 +909,7 @@ next_slot:
893 * | ------ extent ------ | 909 * | ------ extent ------ |
894 */ 910 */
895 if (start <= key.offset && end >= extent_end) { 911 if (start <= key.offset && end >= extent_end) {
912delete_extent_item:
896 if (del_nr == 0) { 913 if (del_nr == 0) {
897 del_slot = path->slots[0]; 914 del_slot = path->slots[0];
898 del_nr = 1; 915 del_nr = 1;
@@ -1191,7 +1208,7 @@ again:
1191 1208
1192 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 1209 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1193 root->root_key.objectid, 1210 root->root_key.objectid,
1194 ino, orig_offset, 0); 1211 ino, orig_offset, 1);
1195 BUG_ON(ret); /* -ENOMEM */ 1212 BUG_ON(ret); /* -ENOMEM */
1196 1213
1197 if (split == start) { 1214 if (split == start) {
@@ -1994,8 +2011,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1994 if (!full_sync) { 2011 if (!full_sync) {
1995 ret = btrfs_wait_ordered_range(inode, start, 2012 ret = btrfs_wait_ordered_range(inode, start,
1996 end - start + 1); 2013 end - start + 1);
1997 if (ret) 2014 if (ret) {
2015 btrfs_end_transaction(trans, root);
1998 goto out; 2016 goto out;
2017 }
1999 } 2018 }
2000 ret = btrfs_commit_transaction(trans, root); 2019 ret = btrfs_commit_transaction(trans, root);
2001 } else { 2020 } else {
@@ -2153,6 +2172,37 @@ out:
2153 return 0; 2172 return 0;
2154} 2173}
2155 2174
2175/*
2176 * Find a hole extent on the given inode and change start/len to the end of
2177 * the hole extent (a hole/vacuum extent whose em->start <= start &&
2178 * em->start + em->len > start).
2179 * When a hole extent is found, return 1 and modify start/len.
2180 */
2181static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2182{
2183 struct extent_map *em;
2184 int ret = 0;
2185
2186 em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
2187 if (IS_ERR_OR_NULL(em)) {
2188 if (!em)
2189 ret = -ENOMEM;
2190 else
2191 ret = PTR_ERR(em);
2192 return ret;
2193 }
2194
2195 /* Hole or vacuum extent (only exists in no-holes mode) */
2196 if (em->block_start == EXTENT_MAP_HOLE) {
2197 ret = 1;
2198 *len = em->start + em->len > *start + *len ?
2199 0 : *start + *len - em->start - em->len;
2200 *start = em->start + em->len;
2201 }
2202 free_extent_map(em);
2203 return ret;
2204}
2205
2156static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 2206static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157{ 2207{
2158 struct btrfs_root *root = BTRFS_I(inode)->root; 2208 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2160,25 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2160 struct btrfs_path *path; 2210 struct btrfs_path *path;
2161 struct btrfs_block_rsv *rsv; 2211 struct btrfs_block_rsv *rsv;
2162 struct btrfs_trans_handle *trans; 2212 struct btrfs_trans_handle *trans;
2163 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2213 u64 lockstart;
2164 u64 lockend = round_down(offset + len, 2214 u64 lockend;
2165 BTRFS_I(inode)->root->sectorsize) - 1; 2215 u64 tail_start;
2166 u64 cur_offset = lockstart; 2216 u64 tail_len;
2217 u64 orig_start = offset;
2218 u64 cur_offset;
2167 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 2219 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2168 u64 drop_end; 2220 u64 drop_end;
2169 int ret = 0; 2221 int ret = 0;
2170 int err = 0; 2222 int err = 0;
2171 int rsv_count; 2223 int rsv_count;
2172 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2224 bool same_page;
2173 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2174 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2225 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2175 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2226 u64 ino_size;
2176 2227
2177 ret = btrfs_wait_ordered_range(inode, offset, len); 2228 ret = btrfs_wait_ordered_range(inode, offset, len);
2178 if (ret) 2229 if (ret)
2179 return ret; 2230 return ret;
2180 2231
2181 mutex_lock(&inode->i_mutex); 2232 mutex_lock(&inode->i_mutex);
2233 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2234 ret = find_first_non_hole(inode, &offset, &len);
2235 if (ret < 0)
2236 goto out_only_mutex;
2237 if (ret && !len) {
2238 /* Already in a large hole */
2239 ret = 0;
2240 goto out_only_mutex;
2241 }
2242
2243 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2244 lockend = round_down(offset + len,
2245 BTRFS_I(inode)->root->sectorsize) - 1;
2246 same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2247 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2248
2182 /* 2249 /*
2183 * We needn't truncate any page which is beyond the end of the file 2250 * We needn't truncate any page which is beyond the end of the file
2184 * because we are sure there is no data there. 2251 * because we are sure there is no data there.
@@ -2190,8 +2257,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2190 if (same_page && len < PAGE_CACHE_SIZE) { 2257 if (same_page && len < PAGE_CACHE_SIZE) {
2191 if (offset < ino_size) 2258 if (offset < ino_size)
2192 ret = btrfs_truncate_page(inode, offset, len, 0); 2259 ret = btrfs_truncate_page(inode, offset, len, 0);
2193 mutex_unlock(&inode->i_mutex); 2260 goto out_only_mutex;
2194 return ret;
2195 } 2261 }
2196 2262
2197 /* zero back part of the first page */ 2263 /* zero back part of the first page */
@@ -2203,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2203 } 2269 }
2204 } 2270 }
2205 2271
2206 /* zero the front end of the last page */ 2272 /* Check the aligned pages after the first unaligned page.
2207 if (offset + len < ino_size) { 2273 * If offset != orig_start, the first unaligned page and
2208 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2274 * several following pages are already holes, so the
2209 if (ret) { 2275 * extra check can be skipped */
2210 mutex_unlock(&inode->i_mutex); 2276 if (offset == orig_start) {
2211 return ret; 2277 /* after truncate page, check hole again */
2278 len = offset + len - lockstart;
2279 offset = lockstart;
2280 ret = find_first_non_hole(inode, &offset, &len);
2281 if (ret < 0)
2282 goto out_only_mutex;
2283 if (ret && !len) {
2284 ret = 0;
2285 goto out_only_mutex;
2286 }
2287 lockstart = offset;
2288 }
2289
2290 /* Check the tail unaligned part is in a hole */
2291 tail_start = lockend + 1;
2292 tail_len = offset + len - tail_start;
2293 if (tail_len) {
2294 ret = find_first_non_hole(inode, &tail_start, &tail_len);
2295 if (unlikely(ret < 0))
2296 goto out_only_mutex;
2297 if (!ret) {
2298 /* zero the front end of the last page */
2299 if (tail_start + tail_len < ino_size) {
2300 ret = btrfs_truncate_page(inode,
2301 tail_start + tail_len, 0, 1);
2302 if (ret)
2303 goto out_only_mutex;
2304 }
2212 } 2305 }
2213 } 2306 }
2214 2307
@@ -2234,9 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2234 if ((!ordered || 2327 if ((!ordered ||
2235 (ordered->file_offset + ordered->len <= lockstart || 2328 (ordered->file_offset + ordered->len <= lockstart ||
2236 ordered->file_offset > lockend)) && 2329 ordered->file_offset > lockend)) &&
2237 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, 2330 !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
2238 lockend, EXTENT_UPTODATE, 0,
2239 cached_state)) {
2240 if (ordered) 2331 if (ordered)
2241 btrfs_put_ordered_extent(ordered); 2332 btrfs_put_ordered_extent(ordered);
2242 break; 2333 break;
@@ -2284,6 +2375,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2284 BUG_ON(ret); 2375 BUG_ON(ret);
2285 trans->block_rsv = rsv; 2376 trans->block_rsv = rsv;
2286 2377
2378 cur_offset = lockstart;
2379 len = lockend - cur_offset;
2287 while (cur_offset < lockend) { 2380 while (cur_offset < lockend) {
2288 ret = __btrfs_drop_extents(trans, root, inode, path, 2381 ret = __btrfs_drop_extents(trans, root, inode, path,
2289 cur_offset, lockend + 1, 2382 cur_offset, lockend + 1,
@@ -2324,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2324 rsv, min_size); 2417 rsv, min_size);
2325 BUG_ON(ret); /* shouldn't happen */ 2418 BUG_ON(ret); /* shouldn't happen */
2326 trans->block_rsv = rsv; 2419 trans->block_rsv = rsv;
2420
2421 ret = find_first_non_hole(inode, &cur_offset, &len);
2422 if (unlikely(ret < 0))
2423 break;
2424 if (ret && !len) {
2425 ret = 0;
2426 break;
2427 }
2327 } 2428 }
2328 2429
2329 if (ret) { 2430 if (ret) {
@@ -2332,7 +2433,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2332 } 2433 }
2333 2434
2334 trans->block_rsv = &root->fs_info->trans_block_rsv; 2435 trans->block_rsv = &root->fs_info->trans_block_rsv;
2335 if (cur_offset < ino_size) { 2436 /*
2437 * Don't insert file hole extent item if it's for a range beyond eof
2438 * (because it's useless) or if it represents a 0 bytes range (when
2439 * cur_offset == drop_end).
2440 */
2441 if (cur_offset < ino_size && cur_offset < drop_end) {
2336 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2442 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2337 if (ret) { 2443 if (ret) {
2338 err = ret; 2444 err = ret;
@@ -2357,6 +2463,7 @@ out_free:
2357out: 2463out:
2358 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2464 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2359 &cached_state, GFP_NOFS); 2465 &cached_state, GFP_NOFS);
2466out_only_mutex:
2360 mutex_unlock(&inode->i_mutex); 2467 mutex_unlock(&inode->i_mutex);
2361 if (ret && !err) 2468 if (ret && !err)
2362 err = ret; 2469 err = ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 73f3de7a083c..372b05ff1943 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -831,7 +831,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
831 831
832 if (!matched) { 832 if (!matched) {
833 __btrfs_remove_free_space_cache(ctl); 833 __btrfs_remove_free_space_cache(ctl);
834 btrfs_err(fs_info, "block group %llu has wrong amount of free space", 834 btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
835 block_group->key.objectid); 835 block_group->key.objectid);
836 ret = -1; 836 ret = -1;
837 } 837 }
@@ -843,7 +843,7 @@ out:
843 spin_unlock(&block_group->lock); 843 spin_unlock(&block_group->lock);
844 ret = 0; 844 ret = 0;
845 845
846 btrfs_err(fs_info, "failed to load free space cache for block group %llu", 846 btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
847 block_group->key.objectid); 847 block_group->key.objectid);
848 } 848 }
849 849
@@ -851,90 +851,44 @@ out:
851 return ret; 851 return ret;
852} 852}
853 853
854/** 854static noinline_for_stack
855 * __btrfs_write_out_cache - write out cached info to an inode 855int write_cache_extent_entries(struct io_ctl *io_ctl,
856 * @root - the root the inode belongs to 856 struct btrfs_free_space_ctl *ctl,
857 * @ctl - the free space cache we are going to write out 857 struct btrfs_block_group_cache *block_group,
858 * @block_group - the block_group for this cache if it belongs to a block_group 858 int *entries, int *bitmaps,
859 * @trans - the trans handle 859 struct list_head *bitmap_list)
860 * @path - the path to use
861 * @offset - the offset for the key we'll insert
862 *
863 * This function writes out a free space cache struct to disk for quick recovery
864 * on mount. This will return 0 if it was successful in writing the cache out,
865 * and -1 if it was not.
866 */
867static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
868 struct btrfs_free_space_ctl *ctl,
869 struct btrfs_block_group_cache *block_group,
870 struct btrfs_trans_handle *trans,
871 struct btrfs_path *path, u64 offset)
872{ 860{
873 struct btrfs_free_space_header *header;
874 struct extent_buffer *leaf;
875 struct rb_node *node;
876 struct list_head *pos, *n;
877 struct extent_state *cached_state = NULL;
878 struct btrfs_free_cluster *cluster = NULL;
879 struct extent_io_tree *unpin = NULL;
880 struct io_ctl io_ctl;
881 struct list_head bitmap_list;
882 struct btrfs_key key;
883 u64 start, extent_start, extent_end, len;
884 int entries = 0;
885 int bitmaps = 0;
886 int ret; 861 int ret;
887 int err = -1; 862 struct btrfs_free_cluster *cluster = NULL;
888 863 struct rb_node *node = rb_first(&ctl->free_space_offset);
889 INIT_LIST_HEAD(&bitmap_list);
890
891 if (!i_size_read(inode))
892 return -1;
893
894 ret = io_ctl_init(&io_ctl, inode, root);
895 if (ret)
896 return -1;
897 864
898 /* Get the cluster for this block_group if it exists */ 865 /* Get the cluster for this block_group if it exists */
899 if (block_group && !list_empty(&block_group->cluster_list)) 866 if (block_group && !list_empty(&block_group->cluster_list)) {
900 cluster = list_entry(block_group->cluster_list.next, 867 cluster = list_entry(block_group->cluster_list.next,
901 struct btrfs_free_cluster, 868 struct btrfs_free_cluster,
902 block_group_list); 869 block_group_list);
870 }
903 871
904 /* Lock all pages first so we can lock the extent safely. */
905 io_ctl_prepare_pages(&io_ctl, inode, 0);
906
907 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
908 0, &cached_state);
909
910 node = rb_first(&ctl->free_space_offset);
911 if (!node && cluster) { 872 if (!node && cluster) {
912 node = rb_first(&cluster->root); 873 node = rb_first(&cluster->root);
913 cluster = NULL; 874 cluster = NULL;
914 } 875 }
915 876
916 /* Make sure we can fit our crcs into the first page */
917 if (io_ctl.check_crcs &&
918 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
919 goto out_nospc;
920
921 io_ctl_set_generation(&io_ctl, trans->transid);
922
923 /* Write out the extent entries */ 877 /* Write out the extent entries */
924 while (node) { 878 while (node) {
925 struct btrfs_free_space *e; 879 struct btrfs_free_space *e;
926 880
927 e = rb_entry(node, struct btrfs_free_space, offset_index); 881 e = rb_entry(node, struct btrfs_free_space, offset_index);
928 entries++; 882 *entries += 1;
929 883
930 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, 884 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
931 e->bitmap); 885 e->bitmap);
932 if (ret) 886 if (ret)
933 goto out_nospc; 887 goto fail;
934 888
935 if (e->bitmap) { 889 if (e->bitmap) {
936 list_add_tail(&e->list, &bitmap_list); 890 list_add_tail(&e->list, bitmap_list);
937 bitmaps++; 891 *bitmaps += 1;
938 } 892 }
939 node = rb_next(node); 893 node = rb_next(node);
940 if (!node && cluster) { 894 if (!node && cluster) {
@@ -942,13 +896,84 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
942 cluster = NULL; 896 cluster = NULL;
943 } 897 }
944 } 898 }
899 return 0;
900fail:
901 return -ENOSPC;
902}
903
904static noinline_for_stack int
905update_cache_item(struct btrfs_trans_handle *trans,
906 struct btrfs_root *root,
907 struct inode *inode,
908 struct btrfs_path *path, u64 offset,
909 int entries, int bitmaps)
910{
911 struct btrfs_key key;
912 struct btrfs_free_space_header *header;
913 struct extent_buffer *leaf;
914 int ret;
915
916 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
917 key.offset = offset;
918 key.type = 0;
919
920 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
921 if (ret < 0) {
922 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
923 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
924 GFP_NOFS);
925 goto fail;
926 }
927 leaf = path->nodes[0];
928 if (ret > 0) {
929 struct btrfs_key found_key;
930 ASSERT(path->slots[0]);
931 path->slots[0]--;
932 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
933 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
934 found_key.offset != offset) {
935 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
936 inode->i_size - 1,
937 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
938 NULL, GFP_NOFS);
939 btrfs_release_path(path);
940 goto fail;
941 }
942 }
943
944 BTRFS_I(inode)->generation = trans->transid;
945 header = btrfs_item_ptr(leaf, path->slots[0],
946 struct btrfs_free_space_header);
947 btrfs_set_free_space_entries(leaf, header, entries);
948 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
949 btrfs_set_free_space_generation(leaf, header, trans->transid);
950 btrfs_mark_buffer_dirty(leaf);
951 btrfs_release_path(path);
952
953 return 0;
954
955fail:
956 return -1;
957}
958
959static noinline_for_stack int
960add_ioctl_entries(struct btrfs_root *root,
961 struct inode *inode,
962 struct btrfs_block_group_cache *block_group,
963 struct io_ctl *io_ctl,
964 struct extent_state **cached_state,
965 struct list_head *bitmap_list,
966 int *entries)
967{
968 u64 start, extent_start, extent_end, len;
969 struct list_head *pos, *n;
970 struct extent_io_tree *unpin = NULL;
971 int ret;
945 972
946 /* 973 /*
947 * We want to add any pinned extents to our free space cache 974 * We want to add any pinned extents to our free space cache
948 * so we don't leak the space 975 * so we don't leak the space
949 */ 976 *
950
951 /*
952 * We shouldn't have switched the pinned extents yet so this is the 977 * We shouldn't have switched the pinned extents yet so this is the
953 * right one 978 * right one
954 */ 979 */
@@ -977,8 +1002,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
977 block_group->key.offset, extent_end + 1); 1002 block_group->key.offset, extent_end + 1);
978 len = extent_end - extent_start; 1003 len = extent_end - extent_start;
979 1004
980 entries++; 1005 *entries += 1;
981 ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); 1006 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
982 if (ret) 1007 if (ret)
983 goto out_nospc; 1008 goto out_nospc;
984 1009
@@ -986,74 +1011,129 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
986 } 1011 }
987 1012
988 /* Write out the bitmaps */ 1013 /* Write out the bitmaps */
989 list_for_each_safe(pos, n, &bitmap_list) { 1014 list_for_each_safe(pos, n, bitmap_list) {
990 struct btrfs_free_space *entry = 1015 struct btrfs_free_space *entry =
991 list_entry(pos, struct btrfs_free_space, list); 1016 list_entry(pos, struct btrfs_free_space, list);
992 1017
993 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); 1018 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
994 if (ret) 1019 if (ret)
995 goto out_nospc; 1020 goto out_nospc;
996 list_del_init(&entry->list); 1021 list_del_init(&entry->list);
997 } 1022 }
998 1023
999 /* Zero out the rest of the pages just to make sure */ 1024 /* Zero out the rest of the pages just to make sure */
1000 io_ctl_zero_remaining_pages(&io_ctl); 1025 io_ctl_zero_remaining_pages(io_ctl);
1001 1026
1002 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 1027 ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1003 0, i_size_read(inode), &cached_state); 1028 0, i_size_read(inode), cached_state);
1004 io_ctl_drop_pages(&io_ctl); 1029 io_ctl_drop_pages(io_ctl);
1005 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1030 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1006 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 1031 i_size_read(inode) - 1, cached_state, GFP_NOFS);
1007 1032
1008 if (ret) 1033 if (ret)
1009 goto out; 1034 goto fail;
1010 1035
1011 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1036 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1012 if (ret) { 1037 if (ret) {
1013 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1038 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1014 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 1039 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1015 GFP_NOFS); 1040 GFP_NOFS);
1016 goto out; 1041 goto fail;
1017 } 1042 }
1043 return 0;
1018 1044
1019 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1045fail:
1020 key.offset = offset; 1046 return -1;
1021 key.type = 0;
1022 1047
1023 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1048out_nospc:
1024 if (ret < 0) { 1049 return -ENOSPC;
1025 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1050}
1026 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 1051
1027 GFP_NOFS); 1052static void noinline_for_stack
1028 goto out; 1053cleanup_write_cache_enospc(struct inode *inode,
1029 } 1054 struct io_ctl *io_ctl,
1030 leaf = path->nodes[0]; 1055 struct extent_state **cached_state,
1031 if (ret > 0) { 1056 struct list_head *bitmap_list)
1032 struct btrfs_key found_key; 1057{
1033 ASSERT(path->slots[0]); 1058 struct list_head *pos, *n;
1034 path->slots[0]--; 1059 list_for_each_safe(pos, n, bitmap_list) {
1035 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1060 struct btrfs_free_space *entry =
1036 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1061 list_entry(pos, struct btrfs_free_space, list);
1037 found_key.offset != offset) { 1062 list_del_init(&entry->list);
1038 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1039 inode->i_size - 1,
1040 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
1041 NULL, GFP_NOFS);
1042 btrfs_release_path(path);
1043 goto out;
1044 }
1045 } 1063 }
1064 io_ctl_drop_pages(io_ctl);
1065 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1066 i_size_read(inode) - 1, cached_state,
1067 GFP_NOFS);
1068}
1046 1069
1047 BTRFS_I(inode)->generation = trans->transid; 1070/**
1048 header = btrfs_item_ptr(leaf, path->slots[0], 1071 * __btrfs_write_out_cache - write out cached info to an inode
1049 struct btrfs_free_space_header); 1072 * @root - the root the inode belongs to
1050 btrfs_set_free_space_entries(leaf, header, entries); 1073 * @ctl - the free space cache we are going to write out
1051 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1074 * @block_group - the block_group for this cache if it belongs to a block_group
1052 btrfs_set_free_space_generation(leaf, header, trans->transid); 1075 * @trans - the trans handle
1053 btrfs_mark_buffer_dirty(leaf); 1076 * @path - the path to use
1054 btrfs_release_path(path); 1077 * @offset - the offset for the key we'll insert
1078 *
1079 * This function writes out a free space cache struct to disk for quick recovery
1080 * on mount. This will return 0 if it was successful in writing the cache out,
1081 * and -1 if it was not.
1082 */
1083static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1084 struct btrfs_free_space_ctl *ctl,
1085 struct btrfs_block_group_cache *block_group,
1086 struct btrfs_trans_handle *trans,
1087 struct btrfs_path *path, u64 offset)
1088{
1089 struct extent_state *cached_state = NULL;
1090 struct io_ctl io_ctl;
1091 struct list_head bitmap_list;
1092 int entries = 0;
1093 int bitmaps = 0;
1094 int ret;
1095 int err = -1;
1096
1097 INIT_LIST_HEAD(&bitmap_list);
1098
1099 if (!i_size_read(inode))
1100 return -1;
1101
1102 ret = io_ctl_init(&io_ctl, inode, root);
1103 if (ret)
1104 return -1;
1105
1106 /* Lock all pages first so we can lock the extent safely. */
1107 io_ctl_prepare_pages(&io_ctl, inode, 0);
1108
1109 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1110 0, &cached_state);
1111
1112
1113 /* Make sure we can fit our crcs into the first page */
1114 if (io_ctl.check_crcs &&
1115 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
1116 goto out_nospc;
1117
1118 io_ctl_set_generation(&io_ctl, trans->transid);
1119
1120 ret = write_cache_extent_entries(&io_ctl, ctl,
1121 block_group, &entries, &bitmaps,
1122 &bitmap_list);
1123 if (ret)
1124 goto out_nospc;
1125
1126 ret = add_ioctl_entries(root, inode, block_group, &io_ctl,
1127 &cached_state, &bitmap_list, &entries);
1128
1129 if (ret == -ENOSPC)
1130 goto out_nospc;
1131 else if (ret)
1132 goto out;
1133
1134 err = update_cache_item(trans, root, inode, path, offset,
1135 entries, bitmaps);
1055 1136
1056 err = 0;
1057out: 1137out:
1058 io_ctl_free(&io_ctl); 1138 io_ctl_free(&io_ctl);
1059 if (err) { 1139 if (err) {
@@ -1064,14 +1144,8 @@ out:
1064 return err; 1144 return err;
1065 1145
1066out_nospc: 1146out_nospc:
1067 list_for_each_safe(pos, n, &bitmap_list) { 1147
1068 struct btrfs_free_space *entry = 1148 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
1069 list_entry(pos, struct btrfs_free_space, list);
1070 list_del_init(&entry->list);
1071 }
1072 io_ctl_drop_pages(&io_ctl);
1073 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1074 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1075 goto out; 1149 goto out;
1076} 1150}
1077 1151
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 86935f5ae291..888fbe19079f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -174,7 +174,7 @@ static void start_caching(struct btrfs_root *root)
174 BTRFS_LAST_FREE_OBJECTID - objectid + 1); 174 BTRFS_LAST_FREE_OBJECTID - objectid + 1);
175 } 175 }
176 176
177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", 177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
178 root->root_key.objectid); 178 root->root_key.objectid);
179 if (IS_ERR(tsk)) { 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task"); 180 btrfs_warn(root->fs_info, "failed to start inode caching task");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c8386f1961f0..8925f66a1411 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
125 * the btree. The caller should have done a btrfs_drop_extents so that 125 * the btree. The caller should have done a btrfs_drop_extents so that
126 * no overlapping inline items exist in the btree 126 * no overlapping inline items exist in the btree
127 */ 127 */
128static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 128static int insert_inline_extent(struct btrfs_trans_handle *trans,
129 struct btrfs_path *path, int extent_inserted, 129 struct btrfs_path *path, int extent_inserted,
130 struct btrfs_root *root, struct inode *inode, 130 struct btrfs_root *root, struct inode *inode,
131 u64 start, size_t size, size_t compressed_size, 131 u64 start, size_t size, size_t compressed_size,
@@ -2678,6 +2678,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2678 trans = NULL; 2678 trans = NULL;
2679 goto out_unlock; 2679 goto out_unlock;
2680 } 2680 }
2681
2681 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2682 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2682 2683
2683 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2947,14 +2948,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2947 root->orphan_block_rsv = NULL; 2948 root->orphan_block_rsv = NULL;
2948 spin_unlock(&root->orphan_lock); 2949 spin_unlock(&root->orphan_lock);
2949 2950
2950 if (root->orphan_item_inserted && 2951 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
2951 btrfs_root_refs(&root->root_item) > 0) { 2952 btrfs_root_refs(&root->root_item) > 0) {
2952 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2953 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2953 root->root_key.objectid); 2954 root->root_key.objectid);
2954 if (ret) 2955 if (ret)
2955 btrfs_abort_transaction(trans, root, ret); 2956 btrfs_abort_transaction(trans, root, ret);
2956 else 2957 else
2957 root->orphan_item_inserted = 0; 2958 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
2959 &root->state);
2958 } 2960 }
2959 2961
2960 if (block_rsv) { 2962 if (block_rsv) {
@@ -3271,7 +3273,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3271 btrfs_block_rsv_release(root, root->orphan_block_rsv, 3273 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3272 (u64)-1); 3274 (u64)-1);
3273 3275
3274 if (root->orphan_block_rsv || root->orphan_item_inserted) { 3276 if (root->orphan_block_rsv ||
3277 test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3275 trans = btrfs_join_transaction(root); 3278 trans = btrfs_join_transaction(root);
3276 if (!IS_ERR(trans)) 3279 if (!IS_ERR(trans))
3277 btrfs_end_transaction(trans, root); 3280 btrfs_end_transaction(trans, root);
@@ -3473,7 +3476,7 @@ cache_acl:
3473 ret = btrfs_load_inode_props(inode, path); 3476 ret = btrfs_load_inode_props(inode, path);
3474 if (ret) 3477 if (ret)
3475 btrfs_err(root->fs_info, 3478 btrfs_err(root->fs_info,
3476 "error loading props for ino %llu (root %llu): %d\n", 3479 "error loading props for ino %llu (root %llu): %d",
3477 btrfs_ino(inode), 3480 btrfs_ino(inode),
3478 root->root_key.objectid, ret); 3481 root->root_key.objectid, ret);
3479 } 3482 }
@@ -3998,7 +4001,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3998 * not block aligned since we will be keeping the last block of the 4001 * not block aligned since we will be keeping the last block of the
3999 * extent just the way it is. 4002 * extent just the way it is.
4000 */ 4003 */
4001 if (root->ref_cows || root == root->fs_info->tree_root) 4004 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4005 root == root->fs_info->tree_root)
4002 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4006 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4003 root->sectorsize), (u64)-1, 0); 4007 root->sectorsize), (u64)-1, 0);
4004 4008
@@ -4091,7 +4095,9 @@ search_again:
4091 extent_num_bytes); 4095 extent_num_bytes);
4092 num_dec = (orig_num_bytes - 4096 num_dec = (orig_num_bytes -
4093 extent_num_bytes); 4097 extent_num_bytes);
4094 if (root->ref_cows && extent_start != 0) 4098 if (test_bit(BTRFS_ROOT_REF_COWS,
4099 &root->state) &&
4100 extent_start != 0)
4095 inode_sub_bytes(inode, num_dec); 4101 inode_sub_bytes(inode, num_dec);
4096 btrfs_mark_buffer_dirty(leaf); 4102 btrfs_mark_buffer_dirty(leaf);
4097 } else { 4103 } else {
@@ -4105,7 +4111,8 @@ search_again:
4105 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4111 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4106 if (extent_start != 0) { 4112 if (extent_start != 0) {
4107 found_extent = 1; 4113 found_extent = 1;
4108 if (root->ref_cows) 4114 if (test_bit(BTRFS_ROOT_REF_COWS,
4115 &root->state))
4109 inode_sub_bytes(inode, num_dec); 4116 inode_sub_bytes(inode, num_dec);
4110 } 4117 }
4111 } 4118 }
@@ -4120,10 +4127,9 @@ search_again:
4120 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4127 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4121 u32 size = new_size - found_key.offset; 4128 u32 size = new_size - found_key.offset;
4122 4129
4123 if (root->ref_cows) { 4130 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4124 inode_sub_bytes(inode, item_end + 1 - 4131 inode_sub_bytes(inode, item_end + 1 -
4125 new_size); 4132 new_size);
4126 }
4127 4133
4128 /* 4134 /*
4129 * update the ram bytes to properly reflect 4135 * update the ram bytes to properly reflect
@@ -4133,7 +4139,8 @@ search_again:
4133 size = 4139 size =
4134 btrfs_file_extent_calc_inline_size(size); 4140 btrfs_file_extent_calc_inline_size(size);
4135 btrfs_truncate_item(root, path, size, 1); 4141 btrfs_truncate_item(root, path, size, 1);
4136 } else if (root->ref_cows) { 4142 } else if (test_bit(BTRFS_ROOT_REF_COWS,
4143 &root->state)) {
4137 inode_sub_bytes(inode, item_end + 1 - 4144 inode_sub_bytes(inode, item_end + 1 -
4138 found_key.offset); 4145 found_key.offset);
4139 } 4146 }
@@ -4155,8 +4162,9 @@ delete:
4155 } else { 4162 } else {
4156 break; 4163 break;
4157 } 4164 }
4158 if (found_extent && (root->ref_cows || 4165 if (found_extent &&
4159 root == root->fs_info->tree_root)) { 4166 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4167 root == root->fs_info->tree_root)) {
4160 btrfs_set_path_blocking(path); 4168 btrfs_set_path_blocking(path);
4161 ret = btrfs_free_extent(trans, root, extent_start, 4169 ret = btrfs_free_extent(trans, root, extent_start,
4162 extent_num_bytes, 0, 4170 extent_num_bytes, 0,
@@ -5168,8 +5176,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
5168 5176
5169static void btrfs_dentry_release(struct dentry *dentry) 5177static void btrfs_dentry_release(struct dentry *dentry)
5170{ 5178{
5171 if (dentry->d_fsdata) 5179 kfree(dentry->d_fsdata);
5172 kfree(dentry->d_fsdata);
5173} 5180}
5174 5181
5175static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5182static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
@@ -5553,6 +5560,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5553 struct btrfs_inode_ref *ref; 5560 struct btrfs_inode_ref *ref;
5554 struct btrfs_key key[2]; 5561 struct btrfs_key key[2];
5555 u32 sizes[2]; 5562 u32 sizes[2];
5563 int nitems = name ? 2 : 1;
5556 unsigned long ptr; 5564 unsigned long ptr;
5557 int ret; 5565 int ret;
5558 5566
@@ -5572,7 +5580,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5572 */ 5580 */
5573 inode->i_ino = objectid; 5581 inode->i_ino = objectid;
5574 5582
5575 if (dir) { 5583 if (dir && name) {
5576 trace_btrfs_inode_request(dir); 5584 trace_btrfs_inode_request(dir);
5577 5585
5578 ret = btrfs_set_inode_index(dir, index); 5586 ret = btrfs_set_inode_index(dir, index);
@@ -5581,6 +5589,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5581 iput(inode); 5589 iput(inode);
5582 return ERR_PTR(ret); 5590 return ERR_PTR(ret);
5583 } 5591 }
5592 } else if (dir) {
5593 *index = 0;
5584 } 5594 }
5585 /* 5595 /*
5586 * index_cnt is ignored for everything but a dir, 5596 * index_cnt is ignored for everything but a dir,
@@ -5605,21 +5615,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5605 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5615 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5606 key[0].offset = 0; 5616 key[0].offset = 0;
5607 5617
5608 /*
5609 * Start new inodes with an inode_ref. This is slightly more
5610 * efficient for small numbers of hard links since they will
5611 * be packed into one item. Extended refs will kick in if we
5612 * add more hard links than can fit in the ref item.
5613 */
5614 key[1].objectid = objectid;
5615 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5616 key[1].offset = ref_objectid;
5617
5618 sizes[0] = sizeof(struct btrfs_inode_item); 5618 sizes[0] = sizeof(struct btrfs_inode_item);
5619 sizes[1] = name_len + sizeof(*ref); 5619
5620 if (name) {
5621 /*
5622 * Start new inodes with an inode_ref. This is slightly more
5623 * efficient for small numbers of hard links since they will
5624 * be packed into one item. Extended refs will kick in if we
5625 * add more hard links than can fit in the ref item.
5626 */
5627 key[1].objectid = objectid;
5628 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5629 key[1].offset = ref_objectid;
5630
5631 sizes[1] = name_len + sizeof(*ref);
5632 }
5620 5633
5621 path->leave_spinning = 1; 5634 path->leave_spinning = 1;
5622 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 5635 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
5623 if (ret != 0) 5636 if (ret != 0)
5624 goto fail; 5637 goto fail;
5625 5638
@@ -5632,12 +5645,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5632 sizeof(*inode_item)); 5645 sizeof(*inode_item));
5633 fill_inode_item(trans, path->nodes[0], inode_item, inode); 5646 fill_inode_item(trans, path->nodes[0], inode_item, inode);
5634 5647
5635 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 5648 if (name) {
5636 struct btrfs_inode_ref); 5649 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5637 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 5650 struct btrfs_inode_ref);
5638 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 5651 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5639 ptr = (unsigned long)(ref + 1); 5652 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5640 write_extent_buffer(path->nodes[0], name, ptr, name_len); 5653 ptr = (unsigned long)(ref + 1);
5654 write_extent_buffer(path->nodes[0], name, ptr, name_len);
5655 }
5641 5656
5642 btrfs_mark_buffer_dirty(path->nodes[0]); 5657 btrfs_mark_buffer_dirty(path->nodes[0]);
5643 btrfs_free_path(path); 5658 btrfs_free_path(path);
@@ -5673,7 +5688,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5673 5688
5674 return inode; 5689 return inode;
5675fail: 5690fail:
5676 if (dir) 5691 if (dir && name)
5677 BTRFS_I(dir)->index_cnt--; 5692 BTRFS_I(dir)->index_cnt--;
5678 btrfs_free_path(path); 5693 btrfs_free_path(path);
5679 iput(inode); 5694 iput(inode);
@@ -5958,6 +5973,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5958 err = btrfs_update_inode(trans, root, inode); 5973 err = btrfs_update_inode(trans, root, inode);
5959 if (err) 5974 if (err)
5960 goto fail; 5975 goto fail;
5976 if (inode->i_nlink == 1) {
5977 /*
 5978 * If the new hard link count is 1, it's a file created
 5979 * with open(2)'s O_TMPFILE flag.
5980 */
5981 err = btrfs_orphan_del(trans, inode);
5982 if (err)
5983 goto fail;
5984 }
5961 d_instantiate(dentry, inode); 5985 d_instantiate(dentry, inode);
5962 btrfs_log_new_name(trans, inode, NULL, parent); 5986 btrfs_log_new_name(trans, inode, NULL, parent);
5963 } 5987 }
@@ -6086,16 +6110,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
6086 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 6110 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6087 ret = btrfs_decompress(compress_type, tmp, page, 6111 ret = btrfs_decompress(compress_type, tmp, page,
6088 extent_offset, inline_size, max_size); 6112 extent_offset, inline_size, max_size);
6089 if (ret) {
6090 char *kaddr = kmap_atomic(page);
6091 unsigned long copy_size = min_t(u64,
6092 PAGE_CACHE_SIZE - pg_offset,
6093 max_size - extent_offset);
6094 memset(kaddr + pg_offset, 0, copy_size);
6095 kunmap_atomic(kaddr);
6096 }
6097 kfree(tmp); 6113 kfree(tmp);
6098 return 0; 6114 return ret;
6099} 6115}
6100 6116
6101/* 6117/*
@@ -6113,7 +6129,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6113{ 6129{
6114 int ret; 6130 int ret;
6115 int err = 0; 6131 int err = 0;
6116 u64 bytenr;
6117 u64 extent_start = 0; 6132 u64 extent_start = 0;
6118 u64 extent_end = 0; 6133 u64 extent_end = 0;
6119 u64 objectid = btrfs_ino(inode); 6134 u64 objectid = btrfs_ino(inode);
@@ -6127,7 +6142,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6127 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 6142 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6128 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6143 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6129 struct btrfs_trans_handle *trans = NULL; 6144 struct btrfs_trans_handle *trans = NULL;
6130 int compress_type; 6145 const bool new_inline = !page || create;
6131 6146
6132again: 6147again:
6133 read_lock(&em_tree->lock); 6148 read_lock(&em_tree->lock);
@@ -6201,7 +6216,6 @@ again:
6201 6216
6202 found_type = btrfs_file_extent_type(leaf, item); 6217 found_type = btrfs_file_extent_type(leaf, item);
6203 extent_start = found_key.offset; 6218 extent_start = found_key.offset;
6204 compress_type = btrfs_file_extent_compression(leaf, item);
6205 if (found_type == BTRFS_FILE_EXTENT_REG || 6219 if (found_type == BTRFS_FILE_EXTENT_REG ||
6206 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6220 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6207 extent_end = extent_start + 6221 extent_end = extent_start +
@@ -6236,32 +6250,10 @@ next:
6236 goto not_found_em; 6250 goto not_found_em;
6237 } 6251 }
6238 6252
6239 em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 6253 btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6254
6240 if (found_type == BTRFS_FILE_EXTENT_REG || 6255 if (found_type == BTRFS_FILE_EXTENT_REG ||
6241 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6256 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6242 em->start = extent_start;
6243 em->len = extent_end - extent_start;
6244 em->orig_start = extent_start -
6245 btrfs_file_extent_offset(leaf, item);
6246 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
6247 item);
6248 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
6249 if (bytenr == 0) {
6250 em->block_start = EXTENT_MAP_HOLE;
6251 goto insert;
6252 }
6253 if (compress_type != BTRFS_COMPRESS_NONE) {
6254 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6255 em->compress_type = compress_type;
6256 em->block_start = bytenr;
6257 em->block_len = em->orig_block_len;
6258 } else {
6259 bytenr += btrfs_file_extent_offset(leaf, item);
6260 em->block_start = bytenr;
6261 em->block_len = em->len;
6262 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
6263 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6264 }
6265 goto insert; 6257 goto insert;
6266 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6258 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6267 unsigned long ptr; 6259 unsigned long ptr;
@@ -6270,12 +6262,8 @@ next:
6270 size_t extent_offset; 6262 size_t extent_offset;
6271 size_t copy_size; 6263 size_t copy_size;
6272 6264
6273 em->block_start = EXTENT_MAP_INLINE; 6265 if (new_inline)
6274 if (!page || create) {
6275 em->start = extent_start;
6276 em->len = extent_end - extent_start;
6277 goto out; 6266 goto out;
6278 }
6279 6267
6280 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); 6268 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6281 extent_offset = page_offset(page) + pg_offset - extent_start; 6269 extent_offset = page_offset(page) + pg_offset - extent_start;
@@ -6285,10 +6273,6 @@ next:
6285 em->len = ALIGN(copy_size, root->sectorsize); 6273 em->len = ALIGN(copy_size, root->sectorsize);
6286 em->orig_block_len = em->len; 6274 em->orig_block_len = em->len;
6287 em->orig_start = em->start; 6275 em->orig_start = em->start;
6288 if (compress_type) {
6289 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6290 em->compress_type = compress_type;
6291 }
6292 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6276 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6293 if (create == 0 && !PageUptodate(page)) { 6277 if (create == 0 && !PageUptodate(page)) {
6294 if (btrfs_file_extent_compression(leaf, item) != 6278 if (btrfs_file_extent_compression(leaf, item) !=
@@ -6296,7 +6280,10 @@ next:
6296 ret = uncompress_inline(path, inode, page, 6280 ret = uncompress_inline(path, inode, page,
6297 pg_offset, 6281 pg_offset,
6298 extent_offset, item); 6282 extent_offset, item);
6299 BUG_ON(ret); /* -ENOMEM */ 6283 if (ret) {
6284 err = ret;
6285 goto out;
6286 }
6300 } else { 6287 } else {
6301 map = kmap(page); 6288 map = kmap(page);
6302 read_extent_buffer(leaf, map + pg_offset, ptr, 6289 read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6332,8 +6319,6 @@ next:
6332 set_extent_uptodate(io_tree, em->start, 6319 set_extent_uptodate(io_tree, em->start,
6333 extent_map_end(em) - 1, NULL, GFP_NOFS); 6320 extent_map_end(em) - 1, NULL, GFP_NOFS);
6334 goto insert; 6321 goto insert;
6335 } else {
6336 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
6337 } 6322 }
6338not_found: 6323not_found:
6339 em->start = start; 6324 em->start = start;
@@ -6717,6 +6702,76 @@ out:
6717 return ret; 6702 return ret;
6718} 6703}
6719 6704
6705bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
6706{
6707 struct radix_tree_root *root = &inode->i_mapping->page_tree;
6708 int found = false;
6709 void **pagep = NULL;
6710 struct page *page = NULL;
6711 int start_idx;
6712 int end_idx;
6713
6714 start_idx = start >> PAGE_CACHE_SHIFT;
6715
6716 /*
6717 * end is the last byte in the last page. end == start is legal
6718 */
6719 end_idx = end >> PAGE_CACHE_SHIFT;
6720
6721 rcu_read_lock();
6722
6723 /* Most of the code in this while loop is lifted from
6724 * find_get_page. It's been modified to begin searching from a
6725 * page and return just the first page found in that range. If the
6726 * found idx is less than or equal to the end idx then we know that
6727 * a page exists. If no pages are found or if those pages are
6728 * outside of the range then we're fine (yay!) */
6729 while (page == NULL &&
6730 radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
6731 page = radix_tree_deref_slot(pagep);
6732 if (unlikely(!page))
6733 break;
6734
6735 if (radix_tree_exception(page)) {
6736 if (radix_tree_deref_retry(page)) {
6737 page = NULL;
6738 continue;
6739 }
6740 /*
6741 * Otherwise, shmem/tmpfs must be storing a swap entry
6742 * here as an exceptional entry: so return it without
6743 * attempting to raise page count.
6744 */
6745 page = NULL;
6746 break; /* TODO: Is this relevant for this use case? */
6747 }
6748
6749 if (!page_cache_get_speculative(page)) {
6750 page = NULL;
6751 continue;
6752 }
6753
6754 /*
6755 * Has the page moved?
6756 * This is part of the lockless pagecache protocol. See
6757 * include/linux/pagemap.h for details.
6758 */
6759 if (unlikely(page != *pagep)) {
6760 page_cache_release(page);
6761 page = NULL;
6762 }
6763 }
6764
6765 if (page) {
6766 if (page->index <= end_idx)
6767 found = true;
6768 page_cache_release(page);
6769 }
6770
6771 rcu_read_unlock();
6772 return found;
6773}
6774
6720static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 6775static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6721 struct extent_state **cached_state, int writing) 6776 struct extent_state **cached_state, int writing)
6722{ 6777{
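
The loop in btrfs_page_exists_in_range() above follows the lockless
pagecache protocol: take a speculative reference on whatever the slot
currently holds, then re-check the slot and retry if the page moved. A
minimal userspace analogue of that shape, using C11 atomics instead of
the kernel's radix tree and page_cache_get_speculative(); the names are
illustrative, and the zero-refcount case the kernel protocol guards
against is deliberately left out:

    #include <stdatomic.h>
    #include <stdio.h>

    struct obj {
        atomic_int refcount;
        int index;
    };

    /* Speculative get: take a ref, then verify the slot still points
     * at the same object; if it moved, drop the ref and retry. */
    static struct obj *try_get(struct obj *_Atomic *slot)
    {
        for (;;) {
            struct obj *o = atomic_load(slot);

            if (!o)
                return NULL;
            atomic_fetch_add(&o->refcount, 1);
            if (atomic_load(slot) == o)
                return o;   /* slot unchanged: our ref is valid */
            atomic_fetch_sub(&o->refcount, 1);
        }
    }

    int main(void)
    {
        struct obj o = { 1, 42 };
        struct obj *_Atomic slot = &o;
        struct obj *got = try_get(&slot);

        printf("found index %d\n", got ? got->index : -1);
        return 0;
    }
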
@@ -6741,10 +6796,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6741 * invalidate needs to happen so that reads after a write do not 6796 * invalidate needs to happen so that reads after a write do not
6742 * get stale data. 6797 * get stale data.
6743 */ 6798 */
6744 if (!ordered && (!writing || 6799 if (!ordered &&
6745 !test_range_bit(&BTRFS_I(inode)->io_tree, 6800 (!writing ||
6746 lockstart, lockend, EXTENT_UPTODATE, 0, 6801 !btrfs_page_exists_in_range(inode, lockstart, lockend)))
6747 *cached_state)))
6748 break; 6802 break;
6749 6803
6750 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6804 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -7126,7 +7180,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
7126 * before atomic variable goto zero, we must make sure 7180 * before atomic variable goto zero, we must make sure
7127 * dip->errors is perceived to be set. 7181 * dip->errors is perceived to be set.
7128 */ 7182 */
7129 smp_mb__before_atomic_dec(); 7183 smp_mb__before_atomic();
7130 } 7184 }
7131 7185
7132 /* if there are more bios still pending for this dio, just exit */ 7186 /* if there are more bios still pending for this dio, just exit */
@@ -7306,7 +7360,7 @@ out_err:
7306 * before atomic variable goto zero, we must 7360 * before atomic variable goto zero, we must
7307 * make sure dip->errors is perceived to be set. 7361 * make sure dip->errors is perceived to be set.
7308 */ 7362 */
7309 smp_mb__before_atomic_dec(); 7363 smp_mb__before_atomic();
7310 if (atomic_dec_and_test(&dip->pending_bios)) 7364 if (atomic_dec_and_test(&dip->pending_bios))
7311 bio_io_error(dip->orig_bio); 7365 bio_io_error(dip->orig_bio);
7312 7366
@@ -7438,7 +7492,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7438 return 0; 7492 return 0;
7439 7493
7440 atomic_inc(&inode->i_dio_count); 7494 atomic_inc(&inode->i_dio_count);
7441 smp_mb__after_atomic_inc(); 7495 smp_mb__after_atomic();
7442 7496
7443 /* 7497 /*
7444 * The generic stuff only does filemap_write_and_wait_range, which 7498 * The generic stuff only does filemap_write_and_wait_range, which
@@ -7981,7 +8035,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7981 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8035 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
7982 if (err) 8036 if (err)
7983 btrfs_err(new_root->fs_info, 8037 btrfs_err(new_root->fs_info,
7984 "error inheriting subvolume %llu properties: %d\n", 8038 "error inheriting subvolume %llu properties: %d",
7985 new_root->root_key.objectid, err); 8039 new_root->root_key.objectid, err);
7986 8040
7987 err = btrfs_update_inode(trans, new_root, inode); 8041 err = btrfs_update_inode(trans, new_root, inode);
@@ -8300,7 +8354,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8300 BTRFS_I(old_inode)->dir_index = 0ULL; 8354 BTRFS_I(old_inode)->dir_index = 0ULL;
8301 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8355 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8302 /* force full log commit if subvolume involved. */ 8356 /* force full log commit if subvolume involved. */
8303 root->fs_info->last_trans_log_full_commit = trans->transid; 8357 btrfs_set_log_full_commit(root->fs_info, trans);
8304 } else { 8358 } else {
8305 ret = btrfs_insert_inode_ref(trans, dest, 8359 ret = btrfs_insert_inode_ref(trans, dest,
8306 new_dentry->d_name.name, 8360 new_dentry->d_name.name,
@@ -8878,6 +8932,66 @@ static int btrfs_permission(struct inode *inode, int mask)
8878 return generic_permission(inode, mask); 8932 return generic_permission(inode, mask);
8879} 8933}
8880 8934
8935static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8936{
8937 struct btrfs_trans_handle *trans;
8938 struct btrfs_root *root = BTRFS_I(dir)->root;
8939 struct inode *inode = NULL;
8940 u64 objectid;
8941 u64 index;
8942 int ret = 0;
8943
8944 /*
8945 * 5 units required for adding orphan entry
8946 */
8947 trans = btrfs_start_transaction(root, 5);
8948 if (IS_ERR(trans))
8949 return PTR_ERR(trans);
8950
8951 ret = btrfs_find_free_ino(root, &objectid);
8952 if (ret)
8953 goto out;
8954
8955 inode = btrfs_new_inode(trans, root, dir, NULL, 0,
8956 btrfs_ino(dir), objectid, mode, &index);
8957 if (IS_ERR(inode)) {
8958 ret = PTR_ERR(inode);
8959 inode = NULL;
8960 goto out;
8961 }
8962
8963 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8964 if (ret)
8965 goto out;
8966
8967 ret = btrfs_update_inode(trans, root, inode);
8968 if (ret)
8969 goto out;
8970
8971 inode->i_fop = &btrfs_file_operations;
8972 inode->i_op = &btrfs_file_inode_operations;
8973
8974 inode->i_mapping->a_ops = &btrfs_aops;
8975 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8976 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8977
8978 ret = btrfs_orphan_add(trans, inode);
8979 if (ret)
8980 goto out;
8981
8982 d_tmpfile(dentry, inode);
8983 mark_inode_dirty(inode);
8984
8985out:
8986 btrfs_end_transaction(trans, root);
8987 if (ret)
8988 iput(inode);
8989 btrfs_balance_delayed_items(root);
8990 btrfs_btree_balance_dirty(root);
8991
8992 return ret;
8993}
8994
8881static const struct inode_operations btrfs_dir_inode_operations = { 8995static const struct inode_operations btrfs_dir_inode_operations = {
8882 .getattr = btrfs_getattr, 8996 .getattr = btrfs_getattr,
8883 .lookup = btrfs_lookup, 8997 .lookup = btrfs_lookup,
@@ -8898,6 +9012,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
8898 .get_acl = btrfs_get_acl, 9012 .get_acl = btrfs_get_acl,
8899 .set_acl = btrfs_set_acl, 9013 .set_acl = btrfs_set_acl,
8900 .update_time = btrfs_update_time, 9014 .update_time = btrfs_update_time,
9015 .tmpfile = btrfs_tmpfile,
8901}; 9016};
8902static const struct inode_operations btrfs_dir_ro_inode_operations = { 9017static const struct inode_operations btrfs_dir_ro_inode_operations = {
8903 .lookup = btrfs_lookup, 9018 .lookup = btrfs_lookup,
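
Taken together, btrfs_tmpfile() above and the new i_nlink == 1 branch in
btrfs_link() implement O_TMPFILE support: the inode starts life on the
orphan list and leaves it the moment it gains a name. A small
self-contained userspace demo of that flow; this is the standard Linux
API, nothing btrfs-specific, and the paths are placeholders:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char path[64];
        int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

        if (fd < 0) {
            perror("open(O_TMPFILE)");
            return 1;
        }
        if (write(fd, "hello\n", 6) != 6)
            perror("write");

        /* Give the anonymous inode a name; in btrfs this is the point
         * where btrfs_link() drops the orphan item that btrfs_tmpfile()
         * added. */
        snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
        if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/tmpfile-demo",
                   AT_SYMLINK_FOLLOW) < 0)
            perror("linkat");
        close(fd);
        return 0;
    }
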
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2f6d7b13b5bd..82c18ba12e3f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -58,6 +58,7 @@
58#include "dev-replace.h" 58#include "dev-replace.h"
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61#include "qgroup.h"
61 62
62#ifdef CONFIG_64BIT 63#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 64/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -638,11 +639,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
638 struct btrfs_trans_handle *trans; 639 struct btrfs_trans_handle *trans;
639 int ret; 640 int ret;
640 641
641 if (!root->ref_cows) 642 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
642 return -EINVAL; 643 return -EINVAL;
643 644
644 atomic_inc(&root->will_be_snapshoted); 645 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc(); 646 smp_mb__after_atomic();
646 btrfs_wait_nocow_write(root); 647 btrfs_wait_nocow_write(root);
647 648
648 ret = btrfs_start_delalloc_inodes(root, 0); 649 ret = btrfs_start_delalloc_inodes(root, 0);
@@ -711,6 +712,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
711 if (ret) 712 if (ret)
712 goto fail; 713 goto fail;
713 714
715 /*
716 * If orphan cleanup did remove any orphans, it means the tree was
717 * modified and therefore the commit root is not the same as the
718 * current root anymore. This is a problem, because send uses the
719 * commit root and therefore can see inode items that don't exist
720 * in the current root anymore, and for example make calls to
721 * btrfs_iget, which will do tree lookups based on the current root
722 * and not on the commit root. Those lookups will fail, returning a
723 * -ESTALE error, and making send fail with that error. So make sure
724 * a send does not see any orphans we have just removed, and that it
725 * will see the same inodes regardless of whether a transaction
726 * commit happened before it started (meaning that the commit root
727 * will be the same as the current root) or not.
728 */
729 if (readonly && pending_snapshot->snap->node !=
730 pending_snapshot->snap->commit_root) {
731 trans = btrfs_join_transaction(pending_snapshot->snap);
732 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
733 ret = PTR_ERR(trans);
734 goto fail;
735 }
736 if (!IS_ERR(trans)) {
737 ret = btrfs_commit_transaction(trans,
738 pending_snapshot->snap);
739 if (ret)
740 goto fail;
741 }
742 }
743
714 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 744 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
715 if (IS_ERR(inode)) { 745 if (IS_ERR(inode)) {
716 ret = PTR_ERR(inode); 746 ret = PTR_ERR(inode);
@@ -1502,11 +1532,12 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1502 sizestr = vol_args->name; 1532 sizestr = vol_args->name;
1503 devstr = strchr(sizestr, ':'); 1533 devstr = strchr(sizestr, ':');
1504 if (devstr) { 1534 if (devstr) {
1505 char *end;
1506 sizestr = devstr + 1; 1535 sizestr = devstr + 1;
1507 *devstr = '\0'; 1536 *devstr = '\0';
1508 devstr = vol_args->name; 1537 devstr = vol_args->name;
1509 devid = simple_strtoull(devstr, &end, 10); 1538 ret = kstrtoull(devstr, 10, &devid);
1539 if (ret)
1540 goto out_free;
1510 if (!devid) { 1541 if (!devid) {
1511 ret = -EINVAL; 1542 ret = -EINVAL;
1512 goto out_free; 1543 goto out_free;
@@ -1562,7 +1593,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1562 new_size = old_size - new_size; 1593 new_size = old_size - new_size;
1563 } else if (mod > 0) { 1594 } else if (mod > 0) {
1564 if (new_size > ULLONG_MAX - old_size) { 1595 if (new_size > ULLONG_MAX - old_size) {
1565 ret = -EINVAL; 1596 ret = -ERANGE;
1566 goto out_free; 1597 goto out_free;
1567 } 1598 }
1568 new_size = old_size + new_size; 1599 new_size = old_size + new_size;
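
For reference, the string parsed here reaches the kernel through
BTRFS_IOC_RESIZE in btrfs_ioctl_vol_args.name, in the form
"<devid>:<size>"; with kstrtoull() a malformed devid now fails the whole
ioctl instead of being partially parsed. A hedged sketch of the calling
side (the mount point and size are placeholders):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
        struct btrfs_ioctl_vol_args args = {0};
        int fd = open("/mnt/btrfs", O_RDONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* Grow device 1 by 1GiB; "max" and absolute sizes also parse. */
        strncpy(args.name, "1:+1G", sizeof(args.name) - 1);
        if (ioctl(fd, BTRFS_IOC_RESIZE, &args) < 0)
            perror("BTRFS_IOC_RESIZE");
        close(fd);
        return 0;
    }
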
@@ -2219,6 +2250,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2219 struct btrfs_ioctl_vol_args *vol_args; 2250 struct btrfs_ioctl_vol_args *vol_args;
2220 struct btrfs_trans_handle *trans; 2251 struct btrfs_trans_handle *trans;
2221 struct btrfs_block_rsv block_rsv; 2252 struct btrfs_block_rsv block_rsv;
2253 u64 root_flags;
2222 u64 qgroup_reserved; 2254 u64 qgroup_reserved;
2223 int namelen; 2255 int namelen;
2224 int ret; 2256 int ret;
@@ -2240,6 +2272,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2240 if (err) 2272 if (err)
2241 goto out; 2273 goto out;
2242 2274
2275
2243 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); 2276 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
2244 if (err == -EINTR) 2277 if (err == -EINTR)
2245 goto out_drop_write; 2278 goto out_drop_write;
@@ -2301,6 +2334,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2301 } 2334 }
2302 2335
2303 mutex_lock(&inode->i_mutex); 2336 mutex_lock(&inode->i_mutex);
2337
2338 /*
2339 * Don't allow to delete a subvolume with send in progress. This is
2340 * inside the i_mutex so the error handling that has to drop the bit
2341 * again is not run concurrently.
2342 */
2343 spin_lock(&dest->root_item_lock);
2344 root_flags = btrfs_root_flags(&dest->root_item);
2345 if (dest->send_in_progress == 0) {
2346 btrfs_set_root_flags(&dest->root_item,
2347 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
2348 spin_unlock(&dest->root_item_lock);
2349 } else {
2350 spin_unlock(&dest->root_item_lock);
2351 btrfs_warn(root->fs_info,
2352 "Attempt to delete subvolume %llu during send",
2353 dest->root_key.objectid);
2354 err = -EPERM;
2355 goto out_dput;
2356 }
2357
2304 err = d_invalidate(dentry); 2358 err = d_invalidate(dentry);
2305 if (err) 2359 if (err)
2306 goto out_unlock; 2360 goto out_unlock;
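
From userspace the new guard surfaces as EPERM from the snapshot-destroy
ioctl while a send still references the subvolume. A minimal sketch
using the existing UAPI; the mount point and subvolume name are
placeholders:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
        struct btrfs_ioctl_vol_args args = {0};
        int fd = open("/mnt/btrfs", O_RDONLY);  /* parent directory */

        if (fd < 0) {
            perror("open");
            return 1;
        }
        strncpy(args.name, "snap-being-sent", sizeof(args.name) - 1);
        if (ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args) < 0 &&
            errno == EPERM)
            fprintf(stderr, "subvolume busy (send in progress?)\n");
        close(fd);
        return 0;
    }
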
@@ -2346,7 +2400,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2346 dest->root_item.drop_level = 0; 2400 dest->root_item.drop_level = 0;
2347 btrfs_set_root_refs(&dest->root_item, 0); 2401 btrfs_set_root_refs(&dest->root_item, 0);
2348 2402
2349 if (!xchg(&dest->orphan_item_inserted, 1)) { 2403 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
2350 ret = btrfs_insert_orphan_item(trans, 2404 ret = btrfs_insert_orphan_item(trans,
2351 root->fs_info->tree_root, 2405 root->fs_info->tree_root,
2352 dest->root_key.objectid); 2406 dest->root_key.objectid);
@@ -2389,11 +2443,19 @@ out_release:
2389out_up_write: 2443out_up_write:
2390 up_write(&root->fs_info->subvol_sem); 2444 up_write(&root->fs_info->subvol_sem);
2391out_unlock: 2445out_unlock:
2446 if (err) {
2447 spin_lock(&dest->root_item_lock);
2448 root_flags = btrfs_root_flags(&dest->root_item);
2449 btrfs_set_root_flags(&dest->root_item,
2450 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
2451 spin_unlock(&dest->root_item_lock);
2452 }
2392 mutex_unlock(&inode->i_mutex); 2453 mutex_unlock(&inode->i_mutex);
2393 if (!err) { 2454 if (!err) {
2394 shrink_dcache_sb(root->fs_info->sb); 2455 shrink_dcache_sb(root->fs_info->sb);
2395 btrfs_invalidate_inodes(dest); 2456 btrfs_invalidate_inodes(dest);
2396 d_delete(dentry); 2457 d_delete(dentry);
2458 ASSERT(dest->send_in_progress == 0);
2397 2459
2398 /* the last ref */ 2460 /* the last ref */
2399 if (dest->cache_inode) { 2461 if (dest->cache_inode) {
@@ -2557,9 +2619,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2557 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2619 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2558 int ret = 0; 2620 int ret = 0;
2559 2621
2560 if (!capable(CAP_SYS_ADMIN))
2561 return -EPERM;
2562
2563 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); 2622 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
2564 if (!fi_args) 2623 if (!fi_args)
2565 return -ENOMEM; 2624 return -ENOMEM;
@@ -2574,6 +2633,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2574 } 2633 }
2575 mutex_unlock(&fs_devices->device_list_mutex); 2634 mutex_unlock(&fs_devices->device_list_mutex);
2576 2635
2636 fi_args->nodesize = root->fs_info->super_copy->nodesize;
2637 fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
2638 fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
2639
2577 if (copy_to_user(arg, fi_args, sizeof(*fi_args))) 2640 if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
2578 ret = -EFAULT; 2641 ret = -EFAULT;
2579 2642
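
With the CAP_SYS_ADMIN check dropped and the three geometry fields
populated, an unprivileged process can now query them. A short sketch,
assuming a linux/btrfs.h new enough to carry the fields added by this
series; the mount point is a placeholder:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
        struct btrfs_ioctl_fs_info_args fi = {0};
        int fd = open("/mnt/btrfs", O_RDONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (ioctl(fd, BTRFS_IOC_FS_INFO, &fi) < 0) {
            perror("BTRFS_IOC_FS_INFO");
            return 1;
        }
        printf("devices=%llu nodesize=%u sectorsize=%u clone_align=%u\n",
               (unsigned long long)fi.num_devices, fi.nodesize,
               fi.sectorsize, fi.clone_alignment);
        close(fd);
        return 0;
    }
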
@@ -2589,9 +2652,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2589 int ret = 0; 2652 int ret = 0;
2590 char *s_uuid = NULL; 2653 char *s_uuid = NULL;
2591 2654
2592 if (!capable(CAP_SYS_ADMIN))
2593 return -EPERM;
2594
2595 di_args = memdup_user(arg, sizeof(*di_args)); 2655 di_args = memdup_user(arg, sizeof(*di_args));
2596 if (IS_ERR(di_args)) 2656 if (IS_ERR(di_args))
2597 return PTR_ERR(di_args); 2657 return PTR_ERR(di_args);
@@ -2669,10 +2729,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
2669 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2729 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
2670 ordered = btrfs_lookup_first_ordered_extent(inode, 2730 ordered = btrfs_lookup_first_ordered_extent(inode,
2671 off + len - 1); 2731 off + len - 1);
2672 if (!ordered && 2732 if ((!ordered ||
2733 ordered->file_offset + ordered->len <= off ||
2734 ordered->file_offset >= off + len) &&
2673 !test_range_bit(&BTRFS_I(inode)->io_tree, off, 2735 !test_range_bit(&BTRFS_I(inode)->io_tree, off,
2674 off + len - 1, EXTENT_DELALLOC, 0, NULL)) 2736 off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
2737 if (ordered)
2738 btrfs_put_ordered_extent(ordered);
2675 break; 2739 break;
2740 }
2676 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2741 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
2677 if (ordered) 2742 if (ordered)
2678 btrfs_put_ordered_extent(ordered); 2743 btrfs_put_ordered_extent(ordered);
@@ -2912,6 +2977,126 @@ out:
2912 return ret; 2977 return ret;
2913} 2978}
2914 2979
 2980/* Helper to check whether this root currently has a ref on the given disk
2981 * bytenr. If it does then we need to update the quota for this root. This
2982 * doesn't do anything if quotas aren't enabled.
2983 */
2984static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2985 u64 disko)
2986{
2987 struct seq_list tree_mod_seq_elem = {};
2988 struct ulist *roots;
2989 struct ulist_iterator uiter;
2990 struct ulist_node *root_node = NULL;
2991 int ret;
2992
2993 if (!root->fs_info->quota_enabled)
2994 return 1;
2995
2996 btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
2997 ret = btrfs_find_all_roots(trans, root->fs_info, disko,
2998 tree_mod_seq_elem.seq, &roots);
2999 if (ret < 0)
3000 goto out;
3001 ret = 0;
3002 ULIST_ITER_INIT(&uiter);
3003 while ((root_node = ulist_next(roots, &uiter))) {
3004 if (root_node->val == root->objectid) {
3005 ret = 1;
3006 break;
3007 }
3008 }
3009 ulist_free(roots);
3010out:
3011 btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
3012 return ret;
3013}
3014
3015static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3016 struct inode *inode,
3017 u64 endoff,
3018 const u64 destoff,
3019 const u64 olen)
3020{
3021 struct btrfs_root *root = BTRFS_I(inode)->root;
3022 int ret;
3023
3024 inode_inc_iversion(inode);
3025 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3026 /*
3027 * We round up to the block size at eof when determining which
3028 * extents to clone above, but shouldn't round up the file size.
3029 */
3030 if (endoff > destoff + olen)
3031 endoff = destoff + olen;
3032 if (endoff > inode->i_size)
3033 btrfs_i_size_write(inode, endoff);
3034
3035 ret = btrfs_update_inode(trans, root, inode);
3036 if (ret) {
3037 btrfs_abort_transaction(trans, root, ret);
3038 btrfs_end_transaction(trans, root);
3039 goto out;
3040 }
3041 ret = btrfs_end_transaction(trans, root);
3042out:
3043 return ret;
3044}
3045
3046static void clone_update_extent_map(struct inode *inode,
3047 const struct btrfs_trans_handle *trans,
3048 const struct btrfs_path *path,
3049 struct btrfs_file_extent_item *fi,
3050 const u64 hole_offset,
3051 const u64 hole_len)
3052{
3053 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3054 struct extent_map *em;
3055 int ret;
3056
3057 em = alloc_extent_map();
3058 if (!em) {
3059 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3060 &BTRFS_I(inode)->runtime_flags);
3061 return;
3062 }
3063
3064 if (fi) {
3065 btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
3066 em->generation = -1;
3067 if (btrfs_file_extent_type(path->nodes[0], fi) ==
3068 BTRFS_FILE_EXTENT_INLINE)
3069 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3070 &BTRFS_I(inode)->runtime_flags);
3071 } else {
3072 em->start = hole_offset;
3073 em->len = hole_len;
3074 em->ram_bytes = em->len;
3075 em->orig_start = hole_offset;
3076 em->block_start = EXTENT_MAP_HOLE;
3077 em->block_len = 0;
3078 em->orig_block_len = 0;
3079 em->compress_type = BTRFS_COMPRESS_NONE;
3080 em->generation = trans->transid;
3081 }
3082
3083 while (1) {
3084 write_lock(&em_tree->lock);
3085 ret = add_extent_mapping(em_tree, em, 1);
3086 write_unlock(&em_tree->lock);
3087 if (ret != -EEXIST) {
3088 free_extent_map(em);
3089 break;
3090 }
3091 btrfs_drop_extent_cache(inode, em->start,
3092 em->start + em->len - 1, 0);
3093 }
3094
3095 if (unlikely(ret))
3096 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3097 &BTRFS_I(inode)->runtime_flags);
3098}
3099
2915/** 3100/**
2916 * btrfs_clone() - clone a range from inode file to another 3101 * btrfs_clone() - clone a range from inode file to another
2917 * 3102 *
@@ -2924,7 +3109,8 @@ out:
2924 * @destoff: Offset within @inode to start clone 3109 * @destoff: Offset within @inode to start clone
2925 */ 3110 */
2926static int btrfs_clone(struct inode *src, struct inode *inode, 3111static int btrfs_clone(struct inode *src, struct inode *inode,
2927 u64 off, u64 olen, u64 olen_aligned, u64 destoff) 3112 const u64 off, const u64 olen, const u64 olen_aligned,
3113 const u64 destoff)
2928{ 3114{
2929 struct btrfs_root *root = BTRFS_I(inode)->root; 3115 struct btrfs_root *root = BTRFS_I(inode)->root;
2930 struct btrfs_path *path = NULL; 3116 struct btrfs_path *path = NULL;
@@ -2935,7 +3121,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2935 u32 nritems; 3121 u32 nritems;
2936 int slot; 3122 int slot;
2937 int ret; 3123 int ret;
2938 u64 len = olen_aligned; 3124 int no_quota;
3125 const u64 len = olen_aligned;
3126 u64 last_disko = 0;
3127 u64 last_dest_end = destoff;
2939 3128
2940 ret = -ENOMEM; 3129 ret = -ENOMEM;
2941 buf = vmalloc(btrfs_level_size(root, 0)); 3130 buf = vmalloc(btrfs_level_size(root, 0));
@@ -2952,7 +3141,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2952 /* clone data */ 3141 /* clone data */
2953 key.objectid = btrfs_ino(src); 3142 key.objectid = btrfs_ino(src);
2954 key.type = BTRFS_EXTENT_DATA_KEY; 3143 key.type = BTRFS_EXTENT_DATA_KEY;
2955 key.offset = 0; 3144 key.offset = off;
2956 3145
2957 while (1) { 3146 while (1) {
2958 /* 3147 /*
@@ -2964,9 +3153,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2964 0, 0); 3153 0, 0);
2965 if (ret < 0) 3154 if (ret < 0)
2966 goto out; 3155 goto out;
3156 /*
 3157 * On the first search, if no extent item that starts at offset
 3158 * off was found but the previous item is an extent item, it
 3159 * might overlap our target range, so process it.
3160 */
3161 if (key.offset == off && ret > 0 && path->slots[0] > 0) {
3162 btrfs_item_key_to_cpu(path->nodes[0], &key,
3163 path->slots[0] - 1);
3164 if (key.type == BTRFS_EXTENT_DATA_KEY)
3165 path->slots[0]--;
3166 }
2967 3167
2968 nritems = btrfs_header_nritems(path->nodes[0]); 3168 nritems = btrfs_header_nritems(path->nodes[0]);
2969process_slot: 3169process_slot:
3170 no_quota = 1;
2970 if (path->slots[0] >= nritems) { 3171 if (path->slots[0] >= nritems) {
2971 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 3172 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2972 if (ret < 0) 3173 if (ret < 0)
@@ -2991,7 +3192,7 @@ process_slot:
2991 u64 disko = 0, diskl = 0; 3192 u64 disko = 0, diskl = 0;
2992 u64 datao = 0, datal = 0; 3193 u64 datao = 0, datal = 0;
2993 u8 comp; 3194 u8 comp;
2994 u64 endoff; 3195 u64 drop_start;
2995 3196
2996 extent = btrfs_item_ptr(leaf, slot, 3197 extent = btrfs_item_ptr(leaf, slot,
2997 struct btrfs_file_extent_item); 3198 struct btrfs_file_extent_item);
@@ -3012,10 +3213,16 @@ process_slot:
3012 extent); 3213 extent);
3013 } 3214 }
3014 3215
3015 if (key.offset + datal <= off || 3216 /*
 3016 key.offset >= off + len - 1) { 3217 * item that ends before our target range's start; this can
 3218 * happen if we have holes and the NO_HOLES feature enabled.
3219 * happen if we have holes and NO_HOLES feature enabled.
3220 */
3221 if (key.offset + datal <= off) {
3017 path->slots[0]++; 3222 path->slots[0]++;
3018 goto process_slot; 3223 goto process_slot;
3224 } else if (key.offset >= off + len) {
3225 break;
3019 } 3226 }
3020 3227
3021 size = btrfs_item_size_nr(leaf, slot); 3228 size = btrfs_item_size_nr(leaf, slot);
@@ -3034,6 +3241,18 @@ process_slot:
3034 new_key.offset = destoff; 3241 new_key.offset = destoff;
3035 3242
3036 /* 3243 /*
3244 * Deal with a hole that doesn't have an extent item
3245 * that represents it (NO_HOLES feature enabled).
3246 * This hole is either in the middle of the cloning
3247 * range or at the beginning (fully overlaps it or
3248 * partially overlaps it).
3249 */
3250 if (new_key.offset != last_dest_end)
3251 drop_start = last_dest_end;
3252 else
3253 drop_start = new_key.offset;
3254
3255 /*
3037 * 1 - adjusting old extent (we may have to split it) 3256 * 1 - adjusting old extent (we may have to split it)
3038 * 1 - add new extent 3257 * 1 - add new extent
3039 * 1 - inode update 3258 * 1 - inode update
@@ -3051,18 +3270,18 @@ process_slot:
3051 * | ------------- extent ------------- | 3270 * | ------------- extent ------------- |
3052 */ 3271 */
3053 3272
3054 /* substract range b */ 3273 /* subtract range b */
3055 if (key.offset + datal > off + len) 3274 if (key.offset + datal > off + len)
3056 datal = off + len - key.offset; 3275 datal = off + len - key.offset;
3057 3276
3058 /* substract range a */ 3277 /* subtract range a */
3059 if (off > key.offset) { 3278 if (off > key.offset) {
3060 datao += off - key.offset; 3279 datao += off - key.offset;
3061 datal -= off - key.offset; 3280 datal -= off - key.offset;
3062 } 3281 }
3063 3282
3064 ret = btrfs_drop_extents(trans, root, inode, 3283 ret = btrfs_drop_extents(trans, root, inode,
3065 new_key.offset, 3284 drop_start,
3066 new_key.offset + datal, 3285 new_key.offset + datal,
3067 1); 3286 1);
3068 if (ret) { 3287 if (ret) {
@@ -3099,6 +3318,28 @@ process_slot:
3099 datao); 3318 datao);
3100 btrfs_set_file_extent_num_bytes(leaf, extent, 3319 btrfs_set_file_extent_num_bytes(leaf, extent,
3101 datal); 3320 datal);
3321
3322 /*
 3323 * We need to look up the roots that point at
 3324 * this bytenr and see if the new root is one
 3325 * of them. If it is not, make sure we update
 3326 * quotas appropriately.
3327 */
3328 if (disko && root != BTRFS_I(src)->root &&
3329 disko != last_disko) {
3330 no_quota = check_ref(trans, root,
3331 disko);
3332 if (no_quota < 0) {
3333 btrfs_abort_transaction(trans,
3334 root,
3335 ret);
3336 btrfs_end_transaction(trans,
3337 root);
3338 ret = no_quota;
3339 goto out;
3340 }
3341 }
3342
3102 if (disko) { 3343 if (disko) {
3103 inode_add_bytes(inode, datal); 3344 inode_add_bytes(inode, datal);
3104 ret = btrfs_inc_extent_ref(trans, root, 3345 ret = btrfs_inc_extent_ref(trans, root,
@@ -3106,7 +3347,7 @@ process_slot:
3106 root->root_key.objectid, 3347 root->root_key.objectid,
3107 btrfs_ino(inode), 3348 btrfs_ino(inode),
3108 new_key.offset - datao, 3349 new_key.offset - datao,
3109 0); 3350 no_quota);
3110 if (ret) { 3351 if (ret) {
3111 btrfs_abort_transaction(trans, 3352 btrfs_abort_transaction(trans,
3112 root, 3353 root,
@@ -3141,7 +3382,7 @@ process_slot:
3141 aligned_end = ALIGN(new_key.offset + datal, 3382 aligned_end = ALIGN(new_key.offset + datal,
3142 root->sectorsize); 3383 root->sectorsize);
3143 ret = btrfs_drop_extents(trans, root, inode, 3384 ret = btrfs_drop_extents(trans, root, inode,
3144 new_key.offset, 3385 drop_start,
3145 aligned_end, 3386 aligned_end,
3146 1); 3387 1);
3147 if (ret) { 3388 if (ret) {
@@ -3174,40 +3415,69 @@ process_slot:
3174 btrfs_item_ptr_offset(leaf, slot), 3415 btrfs_item_ptr_offset(leaf, slot),
3175 size); 3416 size);
3176 inode_add_bytes(inode, datal); 3417 inode_add_bytes(inode, datal);
3418 extent = btrfs_item_ptr(leaf, slot,
3419 struct btrfs_file_extent_item);
3177 } 3420 }
3178 3421
3422 /* If we have an implicit hole (NO_HOLES feature). */
3423 if (drop_start < new_key.offset)
3424 clone_update_extent_map(inode, trans,
3425 path, NULL, drop_start,
3426 new_key.offset - drop_start);
3427
3428 clone_update_extent_map(inode, trans, path,
3429 extent, 0, 0);
3430
3179 btrfs_mark_buffer_dirty(leaf); 3431 btrfs_mark_buffer_dirty(leaf);
3180 btrfs_release_path(path); 3432 btrfs_release_path(path);
3181 3433
3182 inode_inc_iversion(inode); 3434 last_dest_end = new_key.offset + datal;
3183 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3435 ret = clone_finish_inode_update(trans, inode,
3184 3436 last_dest_end,
3185 /* 3437 destoff, olen);
3186 * we round up to the block size at eof when 3438 if (ret)
3187 * determining which extents to clone above,
3188 * but shouldn't round up the file size
3189 */
3190 endoff = new_key.offset + datal;
3191 if (endoff > destoff+olen)
3192 endoff = destoff+olen;
3193 if (endoff > inode->i_size)
3194 btrfs_i_size_write(inode, endoff);
3195
3196 ret = btrfs_update_inode(trans, root, inode);
3197 if (ret) {
3198 btrfs_abort_transaction(trans, root, ret);
3199 btrfs_end_transaction(trans, root);
3200 goto out; 3439 goto out;
3201 } 3440 if (new_key.offset + datal >= destoff + len)
3202 ret = btrfs_end_transaction(trans, root); 3441 break;
3203 } 3442 }
3204 btrfs_release_path(path); 3443 btrfs_release_path(path);
3205 key.offset++; 3444 key.offset++;
3206 } 3445 }
3207 ret = 0; 3446 ret = 0;
3208 3447
3448 if (last_dest_end < destoff + len) {
3449 /*
3450 * We have an implicit hole (NO_HOLES feature is enabled) that
3451 * fully or partially overlaps our cloning range at its end.
3452 */
3453 btrfs_release_path(path);
3454
3455 /*
3456 * 1 - remove extent(s)
3457 * 1 - inode update
3458 */
3459 trans = btrfs_start_transaction(root, 2);
3460 if (IS_ERR(trans)) {
3461 ret = PTR_ERR(trans);
3462 goto out;
3463 }
3464 ret = btrfs_drop_extents(trans, root, inode,
3465 last_dest_end, destoff + len, 1);
3466 if (ret) {
3467 if (ret != -EOPNOTSUPP)
3468 btrfs_abort_transaction(trans, root, ret);
3469 btrfs_end_transaction(trans, root);
3470 goto out;
3471 }
3472 ret = clone_finish_inode_update(trans, inode, destoff + len,
3473 destoff, olen);
3474 if (ret)
3475 goto out;
3476 clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
3477 destoff + len - last_dest_end);
3478 }
3479
3209out: 3480out:
3210 btrfs_release_path(path);
3211 btrfs_free_path(path); 3481 btrfs_free_path(path);
3212 vfree(buf); 3482 vfree(buf);
3213 return ret; 3483 return ret;
@@ -3319,15 +3589,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3319 goto out_unlock; 3589 goto out_unlock;
3320 } 3590 }
3321 3591
3322 /* truncate page cache pages from target inode range */ 3592 /*
3323 truncate_inode_pages_range(&inode->i_data, destoff, 3593 * Lock the target range too. Right after we replace the file extent
3324 PAGE_CACHE_ALIGN(destoff + len) - 1); 3594 * items in the fs tree (which now point to the cloned data), we might
3595 * have a worker replace them with extent items relative to a write
 3596 * operation that was issued before this clone operation (see
 3597 * inode.c:btrfs_finish_ordered_io).
3598 */
3599 if (same_inode) {
3600 u64 lock_start = min_t(u64, off, destoff);
3601 u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
3325 3602
3326 lock_extent_range(src, off, len); 3603 lock_extent_range(src, lock_start, lock_len);
3604 } else {
3605 lock_extent_range(src, off, len);
3606 lock_extent_range(inode, destoff, len);
3607 }
3327 3608
3328 ret = btrfs_clone(src, inode, off, olen, len, destoff); 3609 ret = btrfs_clone(src, inode, off, olen, len, destoff);
3329 3610
3330 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3611 if (same_inode) {
3612 u64 lock_start = min_t(u64, off, destoff);
3613 u64 lock_end = max_t(u64, off, destoff) + len - 1;
3614
3615 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
3616 } else {
3617 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
3618 unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
3619 destoff + len - 1);
3620 }
3621 /*
3622 * Truncate page cache pages so that future reads will see the cloned
3623 * data immediately and not the previous data.
3624 */
3625 truncate_inode_pages_range(&inode->i_data, destoff,
3626 PAGE_CACHE_ALIGN(destoff + len) - 1);
3331out_unlock: 3627out_unlock:
3332 if (!same_inode) { 3628 if (!same_inode) {
3333 if (inode < src) { 3629 if (inode < src) {
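
The wider locking above is exercised through the range-clone ioctl. For
orientation, the userspace entry point looks like this (long-standing
btrfs UAPI; the paths are placeholders, and src_length == 0 means clone
to the end of the source file):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
        struct btrfs_ioctl_clone_range_args args = {0};
        int src = open("/mnt/btrfs/src", O_RDONLY);
        int dst = open("/mnt/btrfs/dst", O_WRONLY | O_CREAT, 0644);

        if (src < 0 || dst < 0) {
            perror("open");
            return 1;
        }
        args.src_fd = src;
        args.src_offset = 0;
        args.src_length = 0;    /* to EOF; ranges must be block aligned */
        args.dest_offset = 0;

        /* The kernel now locks both extent ranges for the duration. */
        if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0)
            perror("BTRFS_IOC_CLONE_RANGE");
        return 0;
    }
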
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b47f669aca75..dfad8514f0da 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws,
143 if (ret != LZO_E_OK) { 143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 144 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
145 ret); 145 ret);
146 ret = -1; 146 ret = -EIO;
147 goto out; 147 goto out;
148 } 148 }
149 149
@@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,
189 kunmap(out_page); 189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) { 190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL; 191 out_page = NULL;
192 ret = -1; 192 ret = -E2BIG;
193 goto out; 193 goto out;
194 } 194 }
195 195
@@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws,
208 208
209 /* we're making it bigger, give up */ 209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out) { 210 if (tot_in > 8192 && tot_in < tot_out) {
211 ret = -1; 211 ret = -E2BIG;
212 goto out; 212 goto out;
213 } 213 }
214 214
@@ -335,7 +335,7 @@ cont:
335 break; 335 break;
336 336
337 if (page_in_index + 1 >= total_pages_in) { 337 if (page_in_index + 1 >= total_pages_in) {
338 ret = -1; 338 ret = -EIO;
339 goto done; 339 goto done;
340 } 340 }
341 341
@@ -358,7 +358,7 @@ cont:
358 kunmap(pages_in[page_in_index - 1]); 358 kunmap(pages_in[page_in_index - 1]);
359 if (ret != LZO_E_OK) { 359 if (ret != LZO_E_OK) {
360 printk(KERN_WARNING "BTRFS: decompress failed\n"); 360 printk(KERN_WARNING "BTRFS: decompress failed\n");
361 ret = -1; 361 ret = -EIO;
362 break; 362 break;
363 } 363 }
364 364
@@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); 402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
403 if (ret != LZO_E_OK) { 403 if (ret != LZO_E_OK) {
404 printk(KERN_WARNING "BTRFS: decompress failed!\n"); 404 printk(KERN_WARNING "BTRFS: decompress failed!\n");
405 ret = -1; 405 ret = -EIO;
406 goto out; 406 goto out;
407 } 407 }
408 408
409 if (out_len < start_byte) { 409 if (out_len < start_byte) {
410 ret = -1; 410 ret = -EIO;
411 goto out; 411 goto out;
412 } 412 }
413 413
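
The hunks above only change how failures are reported (-EIO for corrupt
or truncated streams, -E2BIG when the data will not shrink); the
"output grew past the input" bail-out itself is unchanged. A userspace
illustration of that rule with liblzo2 (link with -llzo2; the kernel's
workspace handling differs, this only shows the shape):

    #include <stdio.h>
    #include <lzo/lzo1x.h>

    int main(void)
    {
        static unsigned char in[4096];
        static unsigned char out[4096 + 4096 / 16 + 64 + 3]; /* worst case */
        static lzo_align_t wrkmem[(LZO1X_1_MEM_COMPRESS +
                sizeof(lzo_align_t) - 1) / sizeof(lzo_align_t)];
        lzo_uint out_len = sizeof(out);
        unsigned int i;

        if (lzo_init() != LZO_E_OK)
            return 1;
        /* Hash-scrambled bytes are effectively incompressible. */
        for (i = 0; i < sizeof(in); i++)
            in[i] = (unsigned char)((i * 2654435761u) >> 24);
        if (lzo1x_1_compress(in, sizeof(in), out, &out_len, wrkmem) != LZO_E_OK)
            return 1;
        if (out_len >= sizeof(in))
            fprintf(stderr, "grew from %lu to %lu: the kernel would "
                    "return -E2BIG\n", (unsigned long)sizeof(in),
                    (unsigned long)out_len);
        return 0;
    }
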
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a94b05f72869..e12441c7cf1d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,
67{ 67{
68 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 68 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
69 btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " 69 btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
70 "%llu\n", offset); 70 "%llu", offset);
71} 71}
72 72
73/* 73/*
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2cf905877aaf..cf5aead95a7f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -32,6 +32,7 @@
32#include "ulist.h" 32#include "ulist.h"
33#include "backref.h" 33#include "backref.h"
34#include "extent_io.h" 34#include "extent_io.h"
35#include "qgroup.h"
35 36
36/* TODO XXX FIXME 37/* TODO XXX FIXME
37 * - subvol delete -> delete when ref goes to 0? delete limits also? 38 * - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -84,8 +85,8 @@ struct btrfs_qgroup {
84 /* 85 /*
85 * temp variables for accounting operations 86 * temp variables for accounting operations
86 */ 87 */
87 u64 tag; 88 u64 old_refcnt;
88 u64 refcnt; 89 u64 new_refcnt;
89}; 90};
90 91
91/* 92/*
@@ -98,6 +99,9 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 99 struct btrfs_qgroup *member;
99}; 100};
100 101
102#define ptr_to_u64(x) ((u64)(uintptr_t)x)
103#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
104
101static int 105static int
102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 106qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 int init_flags); 107 int init_flags);
@@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
242 return -ENOENT; 246 return -ENOENT;
243} 247}
244 248
249#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
250int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
251 u64 rfer, u64 excl)
252{
253 struct btrfs_qgroup *qgroup;
254
255 qgroup = find_qgroup_rb(fs_info, qgroupid);
256 if (!qgroup)
257 return -EINVAL;
258 if (qgroup->rfer != rfer || qgroup->excl != excl)
259 return -EINVAL;
260 return 0;
261}
262#endif
263
245/* 264/*
246 * The full config is read in one go, only called from open_ctree() 265 * The full config is read in one go, only called from open_ctree()
247 * It doesn't use any locking, as at this point we're still single-threaded 266 * It doesn't use any locking, as at this point we're still single-threaded
@@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
520 struct extent_buffer *leaf; 539 struct extent_buffer *leaf;
521 struct btrfs_key key; 540 struct btrfs_key key;
522 541
542#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
543 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
544 return 0;
545#endif
523 path = btrfs_alloc_path(); 546 path = btrfs_alloc_path();
524 if (!path) 547 if (!path)
525 return -ENOMEM; 548 return -ENOMEM;
@@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
669 int ret; 692 int ret;
670 int slot; 693 int slot;
671 694
695#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
696 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
697 return 0;
698#endif
672 key.objectid = 0; 699 key.objectid = 0;
673 key.type = BTRFS_QGROUP_INFO_KEY; 700 key.type = BTRFS_QGROUP_INFO_KEY;
674 key.offset = qgroup->qgroupid; 701 key.offset = qgroup->qgroupid;
@@ -1174,33 +1201,198 @@ out:
1174 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1201 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1175 return ret; 1202 return ret;
1176} 1203}
1204static int comp_oper(struct btrfs_qgroup_operation *oper1,
1205 struct btrfs_qgroup_operation *oper2)
1206{
1207 if (oper1->bytenr < oper2->bytenr)
1208 return -1;
1209 if (oper1->bytenr > oper2->bytenr)
1210 return 1;
1211 if (oper1->seq < oper2->seq)
1212 return -1;
1213 if (oper1->seq > oper2->seq)
 1214 return 1;
1215 if (oper1->ref_root < oper2->ref_root)
1216 return -1;
1217 if (oper1->ref_root > oper2->ref_root)
1218 return 1;
1219 if (oper1->type < oper2->type)
1220 return -1;
1221 if (oper1->type > oper2->type)
1222 return 1;
1223 return 0;
1224}
1225
1226static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
1227 struct btrfs_qgroup_operation *oper)
1228{
1229 struct rb_node **p;
1230 struct rb_node *parent = NULL;
1231 struct btrfs_qgroup_operation *cur;
1232 int cmp;
1233
1234 spin_lock(&fs_info->qgroup_op_lock);
1235 p = &fs_info->qgroup_op_tree.rb_node;
1236 while (*p) {
1237 parent = *p;
1238 cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
1239 cmp = comp_oper(cur, oper);
1240 if (cmp < 0) {
1241 p = &(*p)->rb_right;
1242 } else if (cmp) {
1243 p = &(*p)->rb_left;
1244 } else {
1245 spin_unlock(&fs_info->qgroup_op_lock);
1246 return -EEXIST;
1247 }
1248 }
1249 rb_link_node(&oper->n, parent, p);
1250 rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
1251 spin_unlock(&fs_info->qgroup_op_lock);
1252 return 0;
1253}
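
comp_oper() defines a total order over the tuple (bytenr, seq, ref_root, type), and insert_qgroup_oper() walks the rb-tree with it, rejecting exact duplicates with -EEXIST. A compilable user-space model of the comparator (struct and names are hypothetical):

    #include <stdint.h>

    struct op_key { uint64_t bytenr, seq, ref_root; int type; };

    /* Lexicographic order over (bytenr, seq, ref_root, type); note that a
     * greater field must yield 1, not -1, or the tree order breaks down. */
    static int comp_key(const struct op_key *a, const struct op_key *b)
    {
            if (a->bytenr != b->bytenr)
                    return a->bytenr < b->bytenr ? -1 : 1;
            if (a->seq != b->seq)
                    return a->seq < b->seq ? -1 : 1;
            if (a->ref_root != b->ref_root)
                    return a->ref_root < b->ref_root ? -1 : 1;
            if (a->type != b->type)
                    return a->type < b->type ? -1 : 1;
            return 0; /* equal keys are what insert_qgroup_oper rejects */
    }
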
1177 1254
1178/* 1255/*
1179 * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts 1256 * Record a quota operation for processing later on.
1180 * the modification into a list that's later used by btrfs_end_transaction to 1257 * @trans: the transaction we are adding the delayed op to.
1181 * pass the recorded modifications on to btrfs_qgroup_account_ref. 1258 * @fs_info: the fs_info for this fs.
 1259 * @ref_root: the root of the reference we are acting on.
1260 * @bytenr: the bytenr we are acting on.
1261 * @num_bytes: the number of bytes in the reference.
1262 * @type: the type of operation this is.
 1263 * @mod_seq: whether we need to get a sequence number for looking up roots.
1264 *
1265 * We just add it to our trans qgroup_ref_list and carry on and process these
1266 * operations in order at some later point. If the reference root isn't a fs
1267 * root then we don't bother with doing anything.
1268 *
1269 * MUST BE HOLDING THE REF LOCK.
1182 */ 1270 */
1183int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, 1271int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1184 struct btrfs_delayed_ref_node *node, 1272 struct btrfs_fs_info *fs_info, u64 ref_root,
1185 struct btrfs_delayed_extent_op *extent_op) 1273 u64 bytenr, u64 num_bytes,
1274 enum btrfs_qgroup_operation_type type, int mod_seq)
1186{ 1275{
1187 struct qgroup_update *u; 1276 struct btrfs_qgroup_operation *oper;
1277 int ret;
1278
1279 if (!is_fstree(ref_root) || !fs_info->quota_enabled)
1280 return 0;
1188 1281
1189 BUG_ON(!trans->delayed_ref_elem.seq); 1282 oper = kmalloc(sizeof(*oper), GFP_NOFS);
1190 u = kmalloc(sizeof(*u), GFP_NOFS); 1283 if (!oper)
1191 if (!u)
1192 return -ENOMEM; 1284 return -ENOMEM;
1193 1285
1194 u->node = node; 1286 oper->ref_root = ref_root;
1195 u->extent_op = extent_op; 1287 oper->bytenr = bytenr;
1196 list_add_tail(&u->list, &trans->qgroup_ref_list); 1288 oper->num_bytes = num_bytes;
1289 oper->type = type;
1290 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1291 INIT_LIST_HEAD(&oper->elem.list);
1292 oper->elem.seq = 0;
1293 ret = insert_qgroup_oper(fs_info, oper);
1294 if (ret) {
1295 /* Shouldn't happen so have an assert for developers */
1296 ASSERT(0);
1297 kfree(oper);
1298 return ret;
1299 }
1300 list_add_tail(&oper->list, &trans->qgroup_ref_list);
1301
1302 if (mod_seq)
1303 btrfs_get_tree_mod_seq(fs_info, &oper->elem);
1197 1304
1198 return 0; 1305 return 0;
1199} 1306}
1200 1307
1201static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, 1308/*
1202 struct ulist *roots, struct ulist *tmp, 1309 * The easy accounting, if we are adding/removing the only ref for an extent
 1203 u64 seq) 1310 * then this qgroup and all of the parent qgroups get their reference and
1311 * exclusive counts adjusted.
1312 */
1313static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1314 struct btrfs_qgroup_operation *oper)
1315{
1316 struct btrfs_qgroup *qgroup;
1317 struct ulist *tmp;
1318 struct btrfs_qgroup_list *glist;
1319 struct ulist_node *unode;
1320 struct ulist_iterator uiter;
1321 int sign = 0;
1322 int ret = 0;
1323
1324 tmp = ulist_alloc(GFP_NOFS);
1325 if (!tmp)
1326 return -ENOMEM;
1327
1328 spin_lock(&fs_info->qgroup_lock);
1329 if (!fs_info->quota_root)
1330 goto out;
1331 qgroup = find_qgroup_rb(fs_info, oper->ref_root);
1332 if (!qgroup)
1333 goto out;
1334 switch (oper->type) {
1335 case BTRFS_QGROUP_OPER_ADD_EXCL:
1336 sign = 1;
1337 break;
1338 case BTRFS_QGROUP_OPER_SUB_EXCL:
1339 sign = -1;
1340 break;
1341 default:
1342 ASSERT(0);
1343 }
1344 qgroup->rfer += sign * oper->num_bytes;
1345 qgroup->rfer_cmpr += sign * oper->num_bytes;
1346
1347 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1348 qgroup->excl += sign * oper->num_bytes;
1349 qgroup->excl_cmpr += sign * oper->num_bytes;
1350
1351 qgroup_dirty(fs_info, qgroup);
1352
1353 /* Get all of the parent groups that contain this qgroup */
1354 list_for_each_entry(glist, &qgroup->groups, next_group) {
1355 ret = ulist_add(tmp, glist->group->qgroupid,
1356 ptr_to_u64(glist->group), GFP_ATOMIC);
1357 if (ret < 0)
1358 goto out;
1359 }
1360
1361 /* Iterate all of the parents and adjust their reference counts */
1362 ULIST_ITER_INIT(&uiter);
1363 while ((unode = ulist_next(tmp, &uiter))) {
1364 qgroup = u64_to_ptr(unode->aux);
1365 qgroup->rfer += sign * oper->num_bytes;
1366 qgroup->rfer_cmpr += sign * oper->num_bytes;
1367 qgroup->excl += sign * oper->num_bytes;
1368 if (sign < 0)
1369 WARN_ON(qgroup->excl < oper->num_bytes);
1370 qgroup->excl_cmpr += sign * oper->num_bytes;
1371 qgroup_dirty(fs_info, qgroup);
1372
1373 /* Add any parents of the parents */
1374 list_for_each_entry(glist, &qgroup->groups, next_group) {
1375 ret = ulist_add(tmp, glist->group->qgroupid,
1376 ptr_to_u64(glist->group), GFP_ATOMIC);
1377 if (ret < 0)
1378 goto out;
1379 }
1380 }
1381 ret = 0;
1382out:
1383 spin_unlock(&fs_info->qgroup_lock);
1384 ulist_free(tmp);
1385 return ret;
1386}
1387
1388/*
 1389 * Walk all of the roots that pointed to our bytenr and adjust their refcnts
1390 * properly.
1391 */
1392static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
1393 u64 root_to_skip, struct ulist *tmp,
1394 struct ulist *roots, struct ulist *qgroups,
1395 u64 seq, int *old_roots, int rescan)
1204{ 1396{
1205 struct ulist_node *unode; 1397 struct ulist_node *unode;
1206 struct ulist_iterator uiter; 1398 struct ulist_iterator uiter;
@@ -1211,256 +1403,549 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
1211 1403
1212 ULIST_ITER_INIT(&uiter); 1404 ULIST_ITER_INIT(&uiter);
1213 while ((unode = ulist_next(roots, &uiter))) { 1405 while ((unode = ulist_next(roots, &uiter))) {
1406 /* We don't count our current root here */
1407 if (unode->val == root_to_skip)
1408 continue;
1214 qg = find_qgroup_rb(fs_info, unode->val); 1409 qg = find_qgroup_rb(fs_info, unode->val);
1215 if (!qg) 1410 if (!qg)
1216 continue; 1411 continue;
1412 /*
1413 * We could have a pending removal of this same ref so we may
1414 * not have actually found our ref root when doing
1415 * btrfs_find_all_roots, so we need to keep track of how many
1416 * old roots we find in case we removed ours and added a
1417 * different one at the same time. I don't think this could
1418 * happen in practice but that sort of thinking leads to pain
1419 * and suffering and to the dark side.
1420 */
1421 (*old_roots)++;
1217 1422
1218 ulist_reinit(tmp); 1423 ulist_reinit(tmp);
1219 /* XXX id not needed */ 1424 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1220 ret = ulist_add(tmp, qg->qgroupid, 1425 GFP_ATOMIC);
1221 (u64)(uintptr_t)qg, GFP_ATOMIC); 1426 if (ret < 0)
1427 return ret;
1428 ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
1222 if (ret < 0) 1429 if (ret < 0)
1223 return ret; 1430 return ret;
1224 ULIST_ITER_INIT(&tmp_uiter); 1431 ULIST_ITER_INIT(&tmp_uiter);
1225 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1432 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1226 struct btrfs_qgroup_list *glist; 1433 struct btrfs_qgroup_list *glist;
1227 1434
1228 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; 1435 qg = u64_to_ptr(tmp_unode->aux);
1229 if (qg->refcnt < seq) 1436 /*
1230 qg->refcnt = seq + 1; 1437 * We use this sequence number to keep from having to
1438 * run the whole list and 0 out the refcnt every time.
1439 * We basically use sequnce as the known 0 count and
1440 * then add 1 everytime we see a qgroup. This is how we
1441 * get how many of the roots actually point up to the
1442 * upper level qgroups in order to determine exclusive
1443 * counts.
1444 *
1445 * For rescan we want to set old_refcnt to seq so our
1446 * exclusive calculations end up correct.
1447 */
1448 if (rescan)
1449 qg->old_refcnt = seq;
1450 else if (qg->old_refcnt < seq)
1451 qg->old_refcnt = seq + 1;
1231 else 1452 else
1232 ++qg->refcnt; 1453 qg->old_refcnt++;
1233 1454
1455 if (qg->new_refcnt < seq)
1456 qg->new_refcnt = seq + 1;
1457 else
1458 qg->new_refcnt++;
1234 list_for_each_entry(glist, &qg->groups, next_group) { 1459 list_for_each_entry(glist, &qg->groups, next_group) {
1460 ret = ulist_add(qgroups, glist->group->qgroupid,
1461 ptr_to_u64(glist->group),
1462 GFP_ATOMIC);
1463 if (ret < 0)
1464 return ret;
1235 ret = ulist_add(tmp, glist->group->qgroupid, 1465 ret = ulist_add(tmp, glist->group->qgroupid,
1236 (u64)(uintptr_t)glist->group, 1466 ptr_to_u64(glist->group),
1237 GFP_ATOMIC); 1467 GFP_ATOMIC);
1238 if (ret < 0) 1468 if (ret < 0)
1239 return ret; 1469 return ret;
1240 } 1470 }
1241 } 1471 }
1242 } 1472 }
1473 return 0;
1474}
1475
1476/*
1477 * We need to walk forward in our operation tree and account for any roots that
1478 * were deleted after we made this operation.
1479 */
1480static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
1481 struct btrfs_qgroup_operation *oper,
1482 struct ulist *tmp,
1483 struct ulist *qgroups, u64 seq,
1484 int *old_roots)
1485{
1486 struct ulist_node *unode;
1487 struct ulist_iterator uiter;
1488 struct btrfs_qgroup *qg;
1489 struct btrfs_qgroup_operation *tmp_oper;
1490 struct rb_node *n;
1491 int ret;
1492
1493 ulist_reinit(tmp);
1243 1494
1495 /*
1496 * We only walk forward in the tree since we're only interested in
1497 * removals that happened _after_ our operation.
1498 */
1499 spin_lock(&fs_info->qgroup_op_lock);
1500 n = rb_next(&oper->n);
1501 spin_unlock(&fs_info->qgroup_op_lock);
1502 if (!n)
1503 return 0;
1504 tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1505 while (tmp_oper->bytenr == oper->bytenr) {
1506 /*
1507 * If it's not a removal we don't care, additions work out
1508 * properly with our refcnt tracking.
1509 */
1510 if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
1511 tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
1512 goto next;
1513 qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
1514 if (!qg)
1515 goto next;
1516 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1517 GFP_ATOMIC);
1518 if (ret) {
1519 if (ret < 0)
1520 return ret;
1521 /*
1522 * We only want to increase old_roots if this qgroup is
1523 * not already in the list of qgroups. If it is already
1524 * there then that means it must have been re-added or
1525 * the delete will be discarded because we had an
1526 * existing ref that we haven't looked up yet. In this
1527 * case we don't want to increase old_roots. So if ret
1528 * == 1 then we know that this is the first time we've
1529 * seen this qgroup and we can bump the old_roots.
1530 */
1531 (*old_roots)++;
1532 ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
1533 GFP_ATOMIC);
1534 if (ret < 0)
1535 return ret;
1536 }
1537next:
1538 spin_lock(&fs_info->qgroup_op_lock);
1539 n = rb_next(&tmp_oper->n);
1540 spin_unlock(&fs_info->qgroup_op_lock);
1541 if (!n)
1542 break;
1543 tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1544 }
1545
1546 /* Ok now process the qgroups we found */
1547 ULIST_ITER_INIT(&uiter);
1548 while ((unode = ulist_next(tmp, &uiter))) {
1549 struct btrfs_qgroup_list *glist;
1550
1551 qg = u64_to_ptr(unode->aux);
1552 if (qg->old_refcnt < seq)
1553 qg->old_refcnt = seq + 1;
1554 else
1555 qg->old_refcnt++;
1556 if (qg->new_refcnt < seq)
1557 qg->new_refcnt = seq + 1;
1558 else
1559 qg->new_refcnt++;
1560 list_for_each_entry(glist, &qg->groups, next_group) {
1561 ret = ulist_add(qgroups, glist->group->qgroupid,
1562 ptr_to_u64(glist->group), GFP_ATOMIC);
1563 if (ret < 0)
1564 return ret;
1565 ret = ulist_add(tmp, glist->group->qgroupid,
1566 ptr_to_u64(glist->group), GFP_ATOMIC);
1567 if (ret < 0)
1568 return ret;
1569 }
1570 }
1244 return 0; 1571 return 0;
1245} 1572}
1246 1573
1247static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, 1574/* Add refcnt for the newly added reference. */
1248 struct ulist *roots, struct ulist *tmp, 1575static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
1249 u64 seq, int sgn, u64 num_bytes, 1576 struct btrfs_qgroup_operation *oper,
1250 struct btrfs_qgroup *qgroup) 1577 struct btrfs_qgroup *qgroup,
1578 struct ulist *tmp, struct ulist *qgroups,
1579 u64 seq)
1251{ 1580{
1252 struct ulist_node *unode; 1581 struct ulist_node *unode;
1253 struct ulist_iterator uiter; 1582 struct ulist_iterator uiter;
1254 struct btrfs_qgroup *qg; 1583 struct btrfs_qgroup *qg;
1255 struct btrfs_qgroup_list *glist;
1256 int ret; 1584 int ret;
1257 1585
1258 ulist_reinit(tmp); 1586 ulist_reinit(tmp);
1259 ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); 1587 ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
1588 GFP_ATOMIC);
1589 if (ret < 0)
1590 return ret;
1591 ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
1592 GFP_ATOMIC);
1260 if (ret < 0) 1593 if (ret < 0)
1261 return ret; 1594 return ret;
1262
1263 ULIST_ITER_INIT(&uiter); 1595 ULIST_ITER_INIT(&uiter);
1264 while ((unode = ulist_next(tmp, &uiter))) { 1596 while ((unode = ulist_next(tmp, &uiter))) {
1265 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1597 struct btrfs_qgroup_list *glist;
1266 if (qg->refcnt < seq) {
1267 /* not visited by step 1 */
1268 qg->rfer += sgn * num_bytes;
1269 qg->rfer_cmpr += sgn * num_bytes;
1270 if (roots->nnodes == 0) {
1271 qg->excl += sgn * num_bytes;
1272 qg->excl_cmpr += sgn * num_bytes;
1273 }
1274 qgroup_dirty(fs_info, qg);
1275 }
1276 WARN_ON(qg->tag >= seq);
1277 qg->tag = seq;
1278 1598
1599 qg = u64_to_ptr(unode->aux);
1600 if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
1601 if (qg->new_refcnt < seq)
1602 qg->new_refcnt = seq + 1;
1603 else
1604 qg->new_refcnt++;
1605 } else {
1606 if (qg->old_refcnt < seq)
1607 qg->old_refcnt = seq + 1;
1608 else
1609 qg->old_refcnt++;
1610 }
1279 list_for_each_entry(glist, &qg->groups, next_group) { 1611 list_for_each_entry(glist, &qg->groups, next_group) {
1280 ret = ulist_add(tmp, glist->group->qgroupid, 1612 ret = ulist_add(tmp, glist->group->qgroupid,
1281 (uintptr_t)glist->group, GFP_ATOMIC); 1613 ptr_to_u64(glist->group), GFP_ATOMIC);
1614 if (ret < 0)
1615 return ret;
1616 ret = ulist_add(qgroups, glist->group->qgroupid,
1617 ptr_to_u64(glist->group), GFP_ATOMIC);
1282 if (ret < 0) 1618 if (ret < 0)
1283 return ret; 1619 return ret;
1284 } 1620 }
1285 } 1621 }
1286
1287 return 0; 1622 return 0;
1288} 1623}
1289 1624
1290static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, 1625/*
1291 struct ulist *roots, struct ulist *tmp, 1626 * This adjusts the counters for all referenced qgroups if need be.
1292 u64 seq, int sgn, u64 num_bytes) 1627 */
1628static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
1629 u64 root_to_skip, u64 num_bytes,
1630 struct ulist *qgroups, u64 seq,
1631 int old_roots, int new_roots, int rescan)
1293{ 1632{
1294 struct ulist_node *unode; 1633 struct ulist_node *unode;
1295 struct ulist_iterator uiter; 1634 struct ulist_iterator uiter;
1296 struct btrfs_qgroup *qg; 1635 struct btrfs_qgroup *qg;
1297 struct ulist_node *tmp_unode; 1636 u64 cur_new_count, cur_old_count;
1298 struct ulist_iterator tmp_uiter;
1299 int ret;
1300 1637
1301 ULIST_ITER_INIT(&uiter); 1638 ULIST_ITER_INIT(&uiter);
1302 while ((unode = ulist_next(roots, &uiter))) { 1639 while ((unode = ulist_next(qgroups, &uiter))) {
1303 qg = find_qgroup_rb(fs_info, unode->val); 1640 bool dirty = false;
1304 if (!qg)
1305 continue;
1306 1641
1307 ulist_reinit(tmp); 1642 qg = u64_to_ptr(unode->aux);
1308 ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); 1643 /*
1309 if (ret < 0) 1644 * Wasn't referenced before but is now, add to the reference
1310 return ret; 1645 * counters.
1646 */
1647 if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
1648 qg->rfer += num_bytes;
1649 qg->rfer_cmpr += num_bytes;
1650 dirty = true;
1651 }
1311 1652
1312 ULIST_ITER_INIT(&tmp_uiter); 1653 /*
1313 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1654 * Was referenced before but isn't now, subtract from the
1314 struct btrfs_qgroup_list *glist; 1655 * reference counters.
1656 */
1657 if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
1658 qg->rfer -= num_bytes;
1659 qg->rfer_cmpr -= num_bytes;
1660 dirty = true;
1661 }
1315 1662
1316 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; 1663 if (qg->old_refcnt < seq)
1317 if (qg->tag == seq) 1664 cur_old_count = 0;
1318 continue; 1665 else
1666 cur_old_count = qg->old_refcnt - seq;
1667 if (qg->new_refcnt < seq)
1668 cur_new_count = 0;
1669 else
1670 cur_new_count = qg->new_refcnt - seq;
1319 1671
1320 if (qg->refcnt - seq == roots->nnodes) { 1672 /*
1321 qg->excl -= sgn * num_bytes; 1673 * If our refcount was the same as the roots previously but our
1322 qg->excl_cmpr -= sgn * num_bytes; 1674 * new count isn't the same as the number of roots now then we
 1323 qgroup_dirty(fs_info, qg); 1675 * went from having an exclusive reference on this range to not.
1324 } 1676 */
1677 if (old_roots && cur_old_count == old_roots &&
1678 (cur_new_count != new_roots || new_roots == 0)) {
1679 WARN_ON(cur_new_count != new_roots && new_roots == 0);
1680 qg->excl -= num_bytes;
1681 qg->excl_cmpr -= num_bytes;
1682 dirty = true;
1683 }
1325 1684
1326 list_for_each_entry(glist, &qg->groups, next_group) { 1685 /*
1327 ret = ulist_add(tmp, glist->group->qgroupid, 1686 * If we didn't reference all the roots before but now we do we
1328 (uintptr_t)glist->group, 1687 * have an exclusive reference to this range.
1329 GFP_ATOMIC); 1688 */
1330 if (ret < 0) 1689 if ((!old_roots || (old_roots && cur_old_count != old_roots))
1331 return ret; 1690 && cur_new_count == new_roots) {
1332 } 1691 qg->excl += num_bytes;
1692 qg->excl_cmpr += num_bytes;
1693 dirty = true;
1333 } 1694 }
1334 }
1335 1695
1696 if (dirty)
1697 qgroup_dirty(fs_info, qg);
1698 }
1336 return 0; 1699 return 0;
1337} 1700}
1338 1701
1339/* 1702/*
1340 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 1703 * If we removed a data extent and there were other references for that bytenr
1341 * from the fs. First, all roots referencing the extent are searched, and 1704 * then we need to lookup all referenced roots to make sure we still don't
1342 * then the space is accounted accordingly to the different roots. The 1705 * reference this bytenr. If we do then we can just discard this operation.
1343 * accounting algorithm works in 3 steps documented inline.
1344 */ 1706 */
1345int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, 1707static int check_existing_refs(struct btrfs_trans_handle *trans,
1346 struct btrfs_fs_info *fs_info, 1708 struct btrfs_fs_info *fs_info,
1347 struct btrfs_delayed_ref_node *node, 1709 struct btrfs_qgroup_operation *oper)
1348 struct btrfs_delayed_extent_op *extent_op)
1349{ 1710{
1350 struct btrfs_root *quota_root;
1351 u64 ref_root;
1352 struct btrfs_qgroup *qgroup;
1353 struct ulist *roots = NULL; 1711 struct ulist *roots = NULL;
1354 u64 seq; 1712 struct ulist_node *unode;
1713 struct ulist_iterator uiter;
1355 int ret = 0; 1714 int ret = 0;
1356 int sgn;
1357 1715
1358 if (!fs_info->quota_enabled) 1716 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1359 return 0; 1717 oper->elem.seq, &roots);
1360 1718 if (ret < 0)
1361 BUG_ON(!fs_info->quota_root); 1719 return ret;
1720 ret = 0;
1362 1721
1363 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1722 ULIST_ITER_INIT(&uiter);
1364 node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 1723 while ((unode = ulist_next(roots, &uiter))) {
1365 struct btrfs_delayed_tree_ref *ref; 1724 if (unode->val == oper->ref_root) {
1366 ref = btrfs_delayed_node_to_tree_ref(node); 1725 ret = 1;
1367 ref_root = ref->root; 1726 break;
1368 } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1727 }
1369 node->type == BTRFS_SHARED_DATA_REF_KEY) {
1370 struct btrfs_delayed_data_ref *ref;
1371 ref = btrfs_delayed_node_to_data_ref(node);
1372 ref_root = ref->root;
1373 } else {
1374 BUG();
1375 } 1728 }
1729 ulist_free(roots);
1730 btrfs_put_tree_mod_seq(fs_info, &oper->elem);
1376 1731
1377 if (!is_fstree(ref_root)) { 1732 return ret;
1378 /* 1733}
1379 * non-fs-trees are not being accounted
1380 */
1381 return 0;
1382 }
1383 1734
1384 switch (node->action) { 1735/*
1385 case BTRFS_ADD_DELAYED_REF: 1736 * If we share a reference across multiple roots then we may need to adjust
1386 case BTRFS_ADD_DELAYED_EXTENT: 1737 * various qgroups referenced and exclusive counters. The basic premise is this
1387 sgn = 1; 1738 *
1388 seq = btrfs_tree_mod_seq_prev(node->seq); 1739 * 1) We have seq to represent a 0 count. Instead of looping through all of the
1389 break; 1740 * qgroups and resetting their refcount to 0 we just constantly bump this
1390 case BTRFS_DROP_DELAYED_REF: 1741 * sequence number to act as the base reference count. This means that if
1391 sgn = -1; 1742 * anybody is equal to or below this sequence they were never referenced. We
1392 seq = node->seq; 1743 * jack this sequence up by the number of roots we found each time in order to
1393 break; 1744 * make sure we don't have any overlap.
1394 case BTRFS_UPDATE_DELAYED_HEAD: 1745 *
1395 return 0; 1746 * 2) We first search all the roots that reference the area _except_ the root
1396 default: 1747 * we're acting on currently. This makes up the old_refcnt of all the qgroups
1397 BUG(); 1748 * before.
1398 } 1749 *
1750 * 3) We walk all of the qgroups referenced by the root we are currently acting
1751 * on, and will either adjust old_refcnt in the case of a removal or the
1752 * new_refcnt in the case of an addition.
1753 *
1754 * 4) Finally we walk all the qgroups that are referenced by this range
1755 * including the root we are acting on currently. We will adjust the counters
1756 * based on the number of roots we had and will have after this operation.
1757 *
1758 * Take this example as an illustration
1759 *
1760 * [qgroup 1/0]
1761 * / | \
1762 * [qg 0/0] [qg 0/1] [qg 0/2]
1763 * \ | /
1764 * [ extent ]
1765 *
1766 * Say we are adding a reference that is covered by qg 0/0. The first step
1767 * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
1768 * old_roots being 2. Because it is adding new_roots will be 1. We then go
1769 * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
1770 * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
1771 * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
1772 * reference and thus must add the size to the referenced bytes. Everything
1773 * else is the same so nothing else changes.
1774 */
1775static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
1776 struct btrfs_fs_info *fs_info,
1777 struct btrfs_qgroup_operation *oper)
1778{
1779 struct ulist *roots = NULL;
1780 struct ulist *qgroups, *tmp;
1781 struct btrfs_qgroup *qgroup;
1782 struct seq_list elem = {};
1783 u64 seq;
1784 int old_roots = 0;
1785 int new_roots = 0;
1786 int ret = 0;
1399 1787
1400 mutex_lock(&fs_info->qgroup_rescan_lock); 1788 if (oper->elem.seq) {
1401 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 1789 ret = check_existing_refs(trans, fs_info, oper);
1402 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { 1790 if (ret < 0)
1403 mutex_unlock(&fs_info->qgroup_rescan_lock); 1791 return ret;
1792 if (ret)
1404 return 0; 1793 return 0;
1405 }
1406 } 1794 }
1407 mutex_unlock(&fs_info->qgroup_rescan_lock);
1408 1795
1409 /* 1796 qgroups = ulist_alloc(GFP_NOFS);
1410 * the delayed ref sequence number we pass depends on the direction of 1797 if (!qgroups)
1411 * the operation. for add operations, we pass 1798 return -ENOMEM;
1412 * tree_mod_log_prev_seq(node->seq) to skip
1413 * the delayed ref's current sequence number, because we need the state
1414 * of the tree before the add operation. for delete operations, we pass
1415 * (node->seq) to include the delayed ref's current sequence number,
1416 * because we need the state of the tree after the delete operation.
1417 */
1418 ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
1419 if (ret < 0)
1420 return ret;
1421
1422 spin_lock(&fs_info->qgroup_lock);
1423 1799
1424 quota_root = fs_info->quota_root; 1800 tmp = ulist_alloc(GFP_NOFS);
1425 if (!quota_root) 1801 if (!tmp)
1426 goto unlock; 1802 return -ENOMEM;
1427 1803
1428 qgroup = find_qgroup_rb(fs_info, ref_root); 1804 btrfs_get_tree_mod_seq(fs_info, &elem);
1805 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
1806 &roots);
1807 btrfs_put_tree_mod_seq(fs_info, &elem);
1808 if (ret < 0) {
1809 ulist_free(qgroups);
1810 ulist_free(tmp);
1811 return ret;
1812 }
1813 spin_lock(&fs_info->qgroup_lock);
1814 qgroup = find_qgroup_rb(fs_info, oper->ref_root);
1429 if (!qgroup) 1815 if (!qgroup)
1430 goto unlock; 1816 goto out;
1817 seq = fs_info->qgroup_seq;
1431 1818
1432 /* 1819 /*
1433 * step 1: for each old ref, visit all nodes once and inc refcnt 1820 * So roots is the list of all the roots currently pointing at the
1821 * bytenr, including the ref we are adding if we are adding, or not if
1822 * we are removing a ref. So we pass in the ref_root to skip that root
 1823 * in our calculations. We set old_refcnt and new_refcnt because who the
1824 * hell knows what everything looked like before, and it doesn't matter
1825 * except...
1434 */ 1826 */
1435 ulist_reinit(fs_info->qgroup_ulist); 1827 ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
1436 seq = fs_info->qgroup_seq; 1828 seq, &old_roots, 0);
1437 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1829 if (ret < 0)
1830 goto out;
1438 1831
1439 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, 1832 /*
1440 seq); 1833 * Now adjust the refcounts of the qgroups that care about this
1441 if (ret) 1834 * reference, either the old_count in the case of removal or new_count
1442 goto unlock; 1835 * in the case of an addition.
1836 */
1837 ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
1838 seq);
1839 if (ret < 0)
1840 goto out;
1443 1841
1444 /* 1842 /*
1445 * step 2: walk from the new root 1843 * ...in the case of removals. If we had a removal before we got around
1844 * to processing this operation then we need to find that guy and count
1845 * his references as if they really existed so we don't end up screwing
1846 * up the exclusive counts. Then whenever we go to process the delete
1847 * everything will be grand and we can account for whatever exclusive
1848 * changes need to be made there. We also have to pass in old_roots so
1849 * we have an accurate count of the roots as it pertains to this
1850 * operations view of the world.
1446 */ 1851 */
1447 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, 1852 ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
1448 seq, sgn, node->num_bytes, qgroup); 1853 &old_roots);
1449 if (ret) 1854 if (ret < 0)
1450 goto unlock; 1855 goto out;
1451 1856
1452 /* 1857 /*
1453 * step 3: walk again from old refs 1858 * We are adding our root, need to adjust up the number of roots,
1859 * otherwise old_roots is the number of roots we want.
1454 */ 1860 */
1455 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, 1861 if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
1456 seq, sgn, node->num_bytes); 1862 new_roots = old_roots + 1;
1457 if (ret) 1863 } else {
1458 goto unlock; 1864 new_roots = old_roots;
1865 old_roots++;
1866 }
1867 fs_info->qgroup_seq += old_roots + 1;
1459 1868
1460unlock: 1869
1870 /*
1871 * And now the magic happens, bless Arne for having a pretty elegant
1872 * solution for this.
1873 */
1874 qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
1875 qgroups, seq, old_roots, new_roots, 0);
1876out:
1461 spin_unlock(&fs_info->qgroup_lock); 1877 spin_unlock(&fs_info->qgroup_lock);
1878 ulist_free(qgroups);
1462 ulist_free(roots); 1879 ulist_free(roots);
1880 ulist_free(tmp);
1881 return ret;
1882}
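
The refcnt arithmetic above relies on the "seq as zero" trick described in the long comment: rather than clearing every qgroup's counters between operations, anything at or below the current sequence is treated as zero, and the sequence is then advanced past the largest count the round could have produced. A compilable user-space model of just that arithmetic (names hypothetical):

    #include <stdint.h>

    static uint64_t seq; /* models fs_info->qgroup_seq */

    /* One "reference seen" increment, as done while walking roots. */
    static void bump(uint64_t *refcnt)
    {
            if (*refcnt < seq)
                    *refcnt = seq + 1; /* first hit this round */
            else
                    (*refcnt)++;       /* another hit this round */
    }

    /* Recover the effective count for this round; at or below seq is zero. */
    static uint64_t count(uint64_t refcnt)
    {
            return refcnt < seq ? 0 : refcnt - seq;
    }

    /* After a round touching n roots, advancing seq by n + 1 makes every
     * refcnt written this round read back as zero in the next one. */
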
1883
1884/*
1885 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1886 * from the fs. First, all roots referencing the extent are searched, and
1887 * then the space is accounted accordingly to the different roots. The
1888 * accounting algorithm works in 3 steps documented inline.
1889 */
1890static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1891 struct btrfs_fs_info *fs_info,
1892 struct btrfs_qgroup_operation *oper)
1893{
1894 int ret = 0;
1895
1896 if (!fs_info->quota_enabled)
1897 return 0;
1898
1899 BUG_ON(!fs_info->quota_root);
1900
1901 mutex_lock(&fs_info->qgroup_rescan_lock);
1902 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1903 if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
1904 mutex_unlock(&fs_info->qgroup_rescan_lock);
1905 return 0;
1906 }
1907 }
1908 mutex_unlock(&fs_info->qgroup_rescan_lock);
1909
1910 ASSERT(is_fstree(oper->ref_root));
1911
1912 switch (oper->type) {
1913 case BTRFS_QGROUP_OPER_ADD_EXCL:
1914 case BTRFS_QGROUP_OPER_SUB_EXCL:
1915 ret = qgroup_excl_accounting(fs_info, oper);
1916 break;
1917 case BTRFS_QGROUP_OPER_ADD_SHARED:
1918 case BTRFS_QGROUP_OPER_SUB_SHARED:
1919 ret = qgroup_shared_accounting(trans, fs_info, oper);
1920 break;
1921 default:
1922 ASSERT(0);
1923 }
1924 return ret;
1925}
1926
1927/*
 1928 * Needs to be called every time we run delayed refs, even if there is an error,
 1929 * in order to clean up outstanding operations.
1930 */
1931int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
1932 struct btrfs_fs_info *fs_info)
1933{
1934 struct btrfs_qgroup_operation *oper;
1935 int ret = 0;
1463 1936
1937 while (!list_empty(&trans->qgroup_ref_list)) {
1938 oper = list_first_entry(&trans->qgroup_ref_list,
1939 struct btrfs_qgroup_operation, list);
1940 list_del_init(&oper->list);
 1941 if (!ret && !trans->aborted)
1942 ret = btrfs_qgroup_account(trans, fs_info, oper);
1943 spin_lock(&fs_info->qgroup_op_lock);
1944 rb_erase(&oper->n, &fs_info->qgroup_op_tree);
1945 spin_unlock(&fs_info->qgroup_op_lock);
1946 btrfs_put_tree_mod_seq(fs_info, &oper->elem);
1947 kfree(oper);
1948 }
1464 return ret; 1949 return ret;
1465} 1950}
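
btrfs_delayed_qgroup_accounting() drains the per-transaction list unconditionally — every operation is unlinked, erased from the rb-tree, and freed — while the accounting work itself is skipped once an error has been latched. A user-space model of that drain-but-latch-first-error shape (locking and the transaction-abort check omitted):

    #include <stdlib.h>

    struct node { struct node *next; };

    static int drain(struct node **head, int (*process)(struct node *))
    {
            int ret = 0;

            while (*head) {
                    struct node *n = *head;

                    *head = n->next;
                    if (!ret)
                            ret = process(n); /* no-op after first error */
                    free(n);                  /* cleanup always happens */
            }
            return ret;
    }
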
1466 1951
@@ -1629,8 +2114,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1629 srcgroup = find_qgroup_rb(fs_info, srcid); 2114 srcgroup = find_qgroup_rb(fs_info, srcid);
1630 if (!srcgroup) 2115 if (!srcgroup)
1631 goto unlock; 2116 goto unlock;
1632 dstgroup->rfer = srcgroup->rfer - level_size; 2117
1633 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; 2118 /*
2119 * We call inherit after we clone the root in order to make sure
2120 * our counts don't go crazy, so at this point the only
2121 * difference between the two roots should be the root node.
2122 */
2123 dstgroup->rfer = srcgroup->rfer;
2124 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2125 dstgroup->excl = level_size;
2126 dstgroup->excl_cmpr = level_size;
1634 srcgroup->excl = level_size; 2127 srcgroup->excl = level_size;
1635 srcgroup->excl_cmpr = level_size; 2128 srcgroup->excl_cmpr = level_size;
1636 qgroup_dirty(fs_info, dstgroup); 2129 qgroup_dirty(fs_info, dstgroup);
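
As a worked example of the new inherit math (figures illustrative): snapshotting a source qgroup with rfer = 1 GiB when the tree root node is 16 KiB leaves

    dstgroup->rfer = 1 GiB     /* all data shared with the source */
    dstgroup->excl = 16 KiB    /* its cloned root node only */
    srcgroup->excl = 16 KiB    /* likewise reduced to its root node */

since after the clone the only difference between the two trees is the root node itself. The previous code subtracted level_size from the copied rfer as well, undercounting the destination's referenced space.
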
@@ -1734,7 +2227,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1734 struct btrfs_qgroup *qg; 2227 struct btrfs_qgroup *qg;
1735 struct btrfs_qgroup_list *glist; 2228 struct btrfs_qgroup_list *glist;
1736 2229
1737 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 2230 qg = u64_to_ptr(unode->aux);
1738 2231
1739 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 2232 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1740 qg->reserved + (s64)qg->rfer + num_bytes > 2233 qg->reserved + (s64)qg->rfer + num_bytes >
@@ -1766,7 +2259,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1766 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 2259 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1767 struct btrfs_qgroup *qg; 2260 struct btrfs_qgroup *qg;
1768 2261
1769 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 2262 qg = u64_to_ptr(unode->aux);
1770 2263
1771 qg->reserved += num_bytes; 2264 qg->reserved += num_bytes;
1772 } 2265 }
@@ -1812,7 +2305,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1812 struct btrfs_qgroup *qg; 2305 struct btrfs_qgroup *qg;
1813 struct btrfs_qgroup_list *glist; 2306 struct btrfs_qgroup_list *glist;
1814 2307
1815 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 2308 qg = u64_to_ptr(unode->aux);
1816 2309
1817 qg->reserved -= num_bytes; 2310 qg->reserved -= num_bytes;
1818 2311
@@ -1848,15 +2341,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1848 */ 2341 */
1849static int 2342static int
1850qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2343qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1851 struct btrfs_trans_handle *trans, struct ulist *tmp, 2344 struct btrfs_trans_handle *trans, struct ulist *qgroups,
1852 struct extent_buffer *scratch_leaf) 2345 struct ulist *tmp, struct extent_buffer *scratch_leaf)
1853{ 2346{
1854 struct btrfs_key found; 2347 struct btrfs_key found;
1855 struct ulist *roots = NULL; 2348 struct ulist *roots = NULL;
1856 struct ulist_node *unode;
1857 struct ulist_iterator uiter;
1858 struct seq_list tree_mod_seq_elem = {}; 2349 struct seq_list tree_mod_seq_elem = {};
2350 u64 num_bytes;
1859 u64 seq; 2351 u64 seq;
2352 int new_roots;
1860 int slot; 2353 int slot;
1861 int ret; 2354 int ret;
1862 2355
@@ -1897,8 +2390,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1897 mutex_unlock(&fs_info->qgroup_rescan_lock); 2390 mutex_unlock(&fs_info->qgroup_rescan_lock);
1898 2391
1899 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 2392 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
1900 u64 num_bytes;
1901
1902 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 2393 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
1903 if (found.type != BTRFS_EXTENT_ITEM_KEY && 2394 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
1904 found.type != BTRFS_METADATA_ITEM_KEY) 2395 found.type != BTRFS_METADATA_ITEM_KEY)
@@ -1908,76 +2399,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1908 else 2399 else
1909 num_bytes = found.offset; 2400 num_bytes = found.offset;
1910 2401
1911 ret = btrfs_find_all_roots(trans, fs_info, found.objectid, 2402 ulist_reinit(qgroups);
1912 tree_mod_seq_elem.seq, &roots); 2403 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
2404 &roots);
1913 if (ret < 0) 2405 if (ret < 0)
1914 goto out; 2406 goto out;
1915 spin_lock(&fs_info->qgroup_lock); 2407 spin_lock(&fs_info->qgroup_lock);
1916 seq = fs_info->qgroup_seq; 2408 seq = fs_info->qgroup_seq;
1917 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 2409 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1918 2410
1919 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 2411 new_roots = 0;
1920 if (ret) { 2412 ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
2413 seq, &new_roots, 1);
2414 if (ret < 0) {
1921 spin_unlock(&fs_info->qgroup_lock); 2415 spin_unlock(&fs_info->qgroup_lock);
1922 ulist_free(roots); 2416 ulist_free(roots);
1923 goto out; 2417 goto out;
1924 } 2418 }
1925 2419
1926 /* 2420 ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
1927 * step2 of btrfs_qgroup_account_ref works from a single root, 2421 seq, 0, new_roots, 1);
1928 * we're doing all at once here. 2422 if (ret < 0) {
1929 */ 2423 spin_unlock(&fs_info->qgroup_lock);
1930 ulist_reinit(tmp); 2424 ulist_free(roots);
1931 ULIST_ITER_INIT(&uiter); 2425 goto out;
1932 while ((unode = ulist_next(roots, &uiter))) {
1933 struct btrfs_qgroup *qg;
1934
1935 qg = find_qgroup_rb(fs_info, unode->val);
1936 if (!qg)
1937 continue;
1938
1939 ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
1940 GFP_ATOMIC);
1941 if (ret < 0) {
1942 spin_unlock(&fs_info->qgroup_lock);
1943 ulist_free(roots);
1944 goto out;
1945 }
1946 }
1947
1948 /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
1949 ULIST_ITER_INIT(&uiter);
1950 while ((unode = ulist_next(tmp, &uiter))) {
1951 struct btrfs_qgroup *qg;
1952 struct btrfs_qgroup_list *glist;
1953
1954 qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
1955 qg->rfer += num_bytes;
1956 qg->rfer_cmpr += num_bytes;
1957 WARN_ON(qg->tag >= seq);
1958 if (qg->refcnt - seq == roots->nnodes) {
1959 qg->excl += num_bytes;
1960 qg->excl_cmpr += num_bytes;
1961 }
1962 qgroup_dirty(fs_info, qg);
1963
1964 list_for_each_entry(glist, &qg->groups, next_group) {
1965 ret = ulist_add(tmp, glist->group->qgroupid,
1966 (uintptr_t)glist->group,
1967 GFP_ATOMIC);
1968 if (ret < 0) {
1969 spin_unlock(&fs_info->qgroup_lock);
1970 ulist_free(roots);
1971 goto out;
1972 }
1973 }
1974 } 2426 }
1975
1976 spin_unlock(&fs_info->qgroup_lock); 2427 spin_unlock(&fs_info->qgroup_lock);
1977 ulist_free(roots); 2428 ulist_free(roots);
1978 ret = 0;
1979 } 2429 }
1980
1981out: 2430out:
1982 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 2431 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1983 2432
@@ -1990,13 +2439,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
1990 qgroup_rescan_work); 2439 qgroup_rescan_work);
1991 struct btrfs_path *path; 2440 struct btrfs_path *path;
1992 struct btrfs_trans_handle *trans = NULL; 2441 struct btrfs_trans_handle *trans = NULL;
1993 struct ulist *tmp = NULL; 2442 struct ulist *tmp = NULL, *qgroups = NULL;
1994 struct extent_buffer *scratch_leaf = NULL; 2443 struct extent_buffer *scratch_leaf = NULL;
1995 int err = -ENOMEM; 2444 int err = -ENOMEM;
1996 2445
1997 path = btrfs_alloc_path(); 2446 path = btrfs_alloc_path();
1998 if (!path) 2447 if (!path)
1999 goto out; 2448 goto out;
2449 qgroups = ulist_alloc(GFP_NOFS);
2450 if (!qgroups)
2451 goto out;
2000 tmp = ulist_alloc(GFP_NOFS); 2452 tmp = ulist_alloc(GFP_NOFS);
2001 if (!tmp) 2453 if (!tmp)
2002 goto out; 2454 goto out;
@@ -2015,7 +2467,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2015 err = -EINTR; 2467 err = -EINTR;
2016 } else { 2468 } else {
2017 err = qgroup_rescan_leaf(fs_info, path, trans, 2469 err = qgroup_rescan_leaf(fs_info, path, trans,
2018 tmp, scratch_leaf); 2470 qgroups, tmp, scratch_leaf);
2019 } 2471 }
2020 if (err > 0) 2472 if (err > 0)
2021 btrfs_commit_transaction(trans, fs_info->fs_root); 2473 btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2025,6 +2477,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2025 2477
2026out: 2478out:
2027 kfree(scratch_leaf); 2479 kfree(scratch_leaf);
2480 ulist_free(qgroups);
2028 ulist_free(tmp); 2481 ulist_free(tmp);
2029 btrfs_free_path(path); 2482 btrfs_free_path(path);
2030 2483
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 000000000000..5952ff1fbd7a
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
1/*
2 * Copyright (C) 2014 Facebook. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_QGROUP__
20#define __BTRFS_QGROUP__
21
22/*
23 * A description of the operations, all of these operations only happen when we
24 * are adding the 1st reference for that subvolume in the case of adding space
25 * or on the last reference delete in the case of subtraction. The only
26 * exception is the last one, which is added for confusion.
27 *
28 * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
29 * one pointing at the bytes we are adding. This is called on the first
30 * allocation.
31 *
32 * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
33 * shared between subvols. This is called on the creation of a ref that already
34 * has refs from a different subvolume, so basically reflink.
35 *
36 * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
37 * one referencing the range.
38 *
 39 * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares
40 * refs with other subvolumes.
41 */
42enum btrfs_qgroup_operation_type {
43 BTRFS_QGROUP_OPER_ADD_EXCL,
44 BTRFS_QGROUP_OPER_ADD_SHARED,
45 BTRFS_QGROUP_OPER_SUB_EXCL,
46 BTRFS_QGROUP_OPER_SUB_SHARED,
47};
48
49struct btrfs_qgroup_operation {
50 u64 ref_root;
51 u64 bytenr;
52 u64 num_bytes;
53 u64 seq;
54 enum btrfs_qgroup_operation_type type;
55 struct seq_list elem;
56 struct rb_node n;
57 struct list_head list;
58};
59
60int btrfs_quota_enable(struct btrfs_trans_handle *trans,
61 struct btrfs_fs_info *fs_info);
62int btrfs_quota_disable(struct btrfs_trans_handle *trans,
63 struct btrfs_fs_info *fs_info);
64int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
65void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
66int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
67int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
68 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
69int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
70 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
71int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
72 struct btrfs_fs_info *fs_info, u64 qgroupid,
73 char *name);
74int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
75 struct btrfs_fs_info *fs_info, u64 qgroupid);
76int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
77 struct btrfs_fs_info *fs_info, u64 qgroupid,
78 struct btrfs_qgroup_limit *limit);
79int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
80void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
81struct btrfs_delayed_extent_op;
82int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info, u64 ref_root,
84 u64 bytenr, u64 num_bytes,
85 enum btrfs_qgroup_operation_type type,
86 int mod_seq);
87int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
88 struct btrfs_fs_info *fs_info);
89void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
90 struct btrfs_fs_info *fs_info,
91 struct btrfs_qgroup_operation *oper);
92int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
93 struct btrfs_fs_info *fs_info);
94int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
95 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
96 struct btrfs_qgroup_inherit *inherit);
97int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
98void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
99
100void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
101
102#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
103int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
104 u64 rfer, u64 excl);
105#endif
106
107#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7f92ab1daa87..65245a07275b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
337 if (bnode->root) 337 if (bnode->root)
338 fs_info = bnode->root->fs_info; 338 fs_info = bnode->root->fs_info;
339 btrfs_panic(fs_info, errno, "Inconsistency in backref cache " 339 btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
340 "found at offset %llu\n", bytenr); 340 "found at offset %llu", bytenr);
341} 341}
342 342
343/* 343/*
@@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root)
528{ 528{
529 struct btrfs_root *reloc_root; 529 struct btrfs_root *reloc_root;
530 530
531 if (!root->ref_cows) 531 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
532 return 0; 532 return 0;
533 533
534 reloc_root = root->reloc_root; 534 reloc_root = root->reloc_root;
@@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,
610 root = read_fs_root(rc->extent_root->fs_info, root_objectid); 610 root = read_fs_root(rc->extent_root->fs_info, root_objectid);
611 BUG_ON(IS_ERR(root)); 611 BUG_ON(IS_ERR(root));
612 612
613 if (root->ref_cows && 613 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
614 generation != btrfs_root_generation(&root->root_item)) 614 generation != btrfs_root_generation(&root->root_item))
615 return NULL; 615 return NULL;
616 616
@@ -887,7 +887,7 @@ again:
887 goto out; 887 goto out;
888 } 888 }
889 889
890 if (!root->ref_cows) 890 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
891 cur->cowonly = 1; 891 cur->cowonly = 1;
892 892
893 if (btrfs_root_level(&root->root_item) == cur->level) { 893 if (btrfs_root_level(&root->root_item) == cur->level) {
@@ -954,7 +954,8 @@ again:
954 upper->bytenr = eb->start; 954 upper->bytenr = eb->start;
955 upper->owner = btrfs_header_owner(eb); 955 upper->owner = btrfs_header_owner(eb);
956 upper->level = lower->level + 1; 956 upper->level = lower->level + 1;
957 if (!root->ref_cows) 957 if (!test_bit(BTRFS_ROOT_REF_COWS,
958 &root->state))
958 upper->cowonly = 1; 959 upper->cowonly = 1;
959 960
960 /* 961 /*
@@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
1258 if (rb_node) { 1259 if (rb_node) {
1259 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " 1260 btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1260 "for start=%llu while inserting into relocation " 1261 "for start=%llu while inserting into relocation "
1261 "tree\n", node->bytenr); 1262 "tree", node->bytenr);
1262 kfree(node); 1263 kfree(node);
1263 return -EEXIST; 1264 return -EEXIST;
1264 } 1265 }
@@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2441 next = walk_up_backref(next, edges, &index); 2442 next = walk_up_backref(next, edges, &index);
2442 root = next->root; 2443 root = next->root;
2443 BUG_ON(!root); 2444 BUG_ON(!root);
2444 BUG_ON(!root->ref_cows); 2445 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2445 2446
2446 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2447 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2447 record_reloc_root_in_trans(trans, root); 2448 record_reloc_root_in_trans(trans, root);
@@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2506 BUG_ON(!root); 2507 BUG_ON(!root);
2507 2508
2508 /* no other choice for non-references counted tree */ 2509 /* no other choice for non-references counted tree */
2509 if (!root->ref_cows) 2510 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2510 return root; 2511 return root;
2511 2512
2512 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2513 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2893 goto out; 2894 goto out;
2894 } 2895 }
2895 2896
2896 if (!root || root->ref_cows) { 2897 if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2897 ret = reserve_metadata_space(trans, rc, node); 2898 ret = reserve_metadata_space(trans, rc, node);
2898 if (ret) 2899 if (ret)
2899 goto out; 2900 goto out;
2900 } 2901 }
2901 2902
2902 if (root) { 2903 if (root) {
2903 if (root->ref_cows) { 2904 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2904 BUG_ON(node->new_bytenr); 2905 BUG_ON(node->new_bytenr);
2905 BUG_ON(!list_empty(&node->list)); 2906 BUG_ON(!list_empty(&node->list));
2906 btrfs_record_root_in_trans(trans, root); 2907 btrfs_record_root_in_trans(trans, root);
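
The relocation.c changes are part of converting ad-hoc int fields such as root->ref_cows into bits of a single root->state word manipulated with set_bit()/test_bit(). A user-space model of the pattern; the kernel helpers are atomic, the ones here are plain, and all names are hypothetical:

    enum { MODEL_ROOT_REF_COWS, MODEL_ROOT_ORPHAN_ITEM_INSERTED };

    struct model_root { unsigned long state; };

    static void model_set_bit(int nr, unsigned long *addr)
    {
            *addr |= 1UL << nr;
    }

    static int model_test_bit(int nr, const unsigned long *addr)
    {
            return (*addr >> nr) & 1;
    }

A check like should_ignore_root()'s then reads: if (!model_test_bit(MODEL_ROOT_REF_COWS, &root->state)) return 0;
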
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 38bb47e7d6b1..360a728a639f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
306 break; 306 break;
307 } 307 }
308 308
309 root->orphan_item_inserted = 1; 309 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
310 310
311 err = btrfs_insert_fs_root(root->fs_info, root); 311 err = btrfs_insert_fs_root(root->fs_info, root);
312 if (err) { 312 if (err) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0be77993378e..ac80188eec88 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
588 588
589 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 589 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
590 do { 590 do {
591 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 591 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
592 &ref_root, &ref_level); 592 item_size, &ref_root,
593 &ref_level);
593 printk_in_rcu(KERN_WARNING 594 printk_in_rcu(KERN_WARNING
594 "BTRFS: %s at logical %llu on dev %s, " 595 "BTRFS: %s at logical %llu on dev %s, "
595 "sector %llu: metadata %s (level %d) in tree " 596 "sector %llu: metadata %s (level %d) in tree "
@@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
717out: 718out:
718 if (page) 719 if (page)
719 put_page(page); 720 put_page(page);
720 if (inode) 721
721 iput(inode); 722 iput(inode);
722 723
723 if (ret < 0) 724 if (ret < 0)
724 return ret; 725 return ret;
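
Dropping the NULL check before iput() works because iput() already returns immediately for a NULL inode, which lets a shared error path release unconditionally. A user-space sketch of a NULL-tolerant put (types hypothetical):

    #include <stdlib.h>

    struct obj { int refs; };

    static void put_obj(struct obj *o)
    {
            if (!o)
                    return; /* tolerate NULL so callers need no guard */
            if (--o->refs == 0)
                    free(o);
    }
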
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fd38b5053479..6528aa662181 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -360,10 +360,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
360 /* 360 /*
361 * First time the inline_buf does not suffice 361 * First time the inline_buf does not suffice
362 */ 362 */
363 if (p->buf == p->inline_buf) 363 if (p->buf == p->inline_buf) {
364 tmp_buf = kmalloc(len, GFP_NOFS); 364 tmp_buf = kmalloc(len, GFP_NOFS);
365 else 365 if (tmp_buf)
366 memcpy(tmp_buf, p->buf, old_buf_len);
367 } else {
366 tmp_buf = krealloc(p->buf, len, GFP_NOFS); 368 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
369 }
367 if (!tmp_buf) 370 if (!tmp_buf)
368 return -ENOMEM; 371 return -ENOMEM;
369 p->buf = tmp_buf; 372 p->buf = tmp_buf;
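
The bug fixed in fs_path_ensure_buf() is that krealloc() must not be applied to the inline buffer, which was never heap-allocated; growing out of it requires a fresh allocation plus a copy of the old contents. A user-space model of the small-buffer pattern (struct and sizes hypothetical):

    #include <stdlib.h>
    #include <string.h>

    struct sbuf {
            char *buf;
            size_t len;
            char inline_buf[32];
    };

    static int sbuf_ensure(struct sbuf *p, size_t len)
    {
            char *tmp;

            if (len <= p->len)
                    return 0;
            if (p->buf == p->inline_buf) {
                    tmp = malloc(len);      /* inline storage: never realloc */
                    if (tmp)
                            memcpy(tmp, p->buf, p->len); /* keep contents */
            } else {
                    tmp = realloc(p->buf, len); /* heap buffer: realloc is fine */
            }
            if (!tmp)
                    return -1;
            p->buf = tmp;
            p->len = len;
            return 0;
    }
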
@@ -972,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
972 struct btrfs_dir_item *di; 975 struct btrfs_dir_item *di;
973 struct btrfs_key di_key; 976 struct btrfs_key di_key;
974 char *buf = NULL; 977 char *buf = NULL;
975 const int buf_len = PATH_MAX; 978 int buf_len;
976 u32 name_len; 979 u32 name_len;
977 u32 data_len; 980 u32 data_len;
978 u32 cur; 981 u32 cur;
@@ -982,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
982 int num; 985 int num;
983 u8 type; 986 u8 type;
984 987
988 if (found_key->type == BTRFS_XATTR_ITEM_KEY)
989 buf_len = BTRFS_MAX_XATTR_SIZE(root);
990 else
991 buf_len = PATH_MAX;
992
985 buf = kmalloc(buf_len, GFP_NOFS); 993 buf = kmalloc(buf_len, GFP_NOFS);
986 if (!buf) { 994 if (!buf) {
987 ret = -ENOMEM; 995 ret = -ENOMEM;
@@ -1003,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1003 type = btrfs_dir_type(eb, di); 1011 type = btrfs_dir_type(eb, di);
1004 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1012 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1005 1013
1006 /* 1014 if (type == BTRFS_FT_XATTR) {
1007 * Path too long 1015 if (name_len > XATTR_NAME_MAX) {
1008 */ 1016 ret = -ENAMETOOLONG;
1009 if (name_len + data_len > buf_len) { 1017 goto out;
1010 ret = -ENAMETOOLONG; 1018 }
1011 goto out; 1019 if (name_len + data_len > buf_len) {
1020 ret = -E2BIG;
1021 goto out;
1022 }
1023 } else {
1024 /*
1025 * Path too long
1026 */
1027 if (name_len + data_len > buf_len) {
1028 ret = -ENAMETOOLONG;
1029 goto out;
1030 }
1012 } 1031 }
1013 1032
1014 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1033 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
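
The hunk above sizes the scratch buffer by item type — xattr values can legitimately exceed PATH_MAX, so xattr items get BTRFS_MAX_XATTR_SIZE(root) and their own error codes, while path-like entries keep the single ENAMETOOLONG check. A user-space model of the validation split; MODEL_XATTR_MAX and MODEL_XATTR_NAME_MAX are stand-ins for the kernel limits:

    #include <errno.h>
    #include <limits.h>
    #include <stddef.h>

    #define MODEL_XATTR_NAME_MAX 255
    #define MODEL_XATTR_MAX 3900

    static int check_item(int is_xattr, size_t name_len, size_t data_len)
    {
            if (is_xattr) {
                    if (name_len > MODEL_XATTR_NAME_MAX)
                            return -ENAMETOOLONG; /* name itself too long */
                    if (name_len + data_len > MODEL_XATTR_MAX)
                            return -E2BIG;        /* value does not fit */
            } else {
                    if (name_len + data_len > PATH_MAX)
                            return -ENAMETOOLONG; /* path too long */
            }
            return 0;
    }
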
@@ -1346,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1346 ret = -EIO; 1365 ret = -EIO;
1347 btrfs_err(sctx->send_root->fs_info, "did not find backref in " 1366 btrfs_err(sctx->send_root->fs_info, "did not find backref in "
1348 "send_root. inode=%llu, offset=%llu, " 1367 "send_root. inode=%llu, offset=%llu, "
1349 "disk_byte=%llu found extent=%llu\n", 1368 "disk_byte=%llu found extent=%llu",
1350 ino, data_offset, disk_byte, found_key.objectid); 1369 ino, data_offset, disk_byte, found_key.objectid);
1351 goto out; 1370 goto out;
1352 } 1371 }
@@ -1625,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
1625 goto out; 1644 goto out;
1626 } 1645 }
1627 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1646 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1647 if (key.type == BTRFS_ROOT_ITEM_KEY) {
1648 ret = -ENOENT;
1649 goto out;
1650 }
1628 *found_inode = key.objectid; 1651 *found_inode = key.objectid;
1629 *found_type = btrfs_dir_type(path->nodes[0], di); 1652 *found_type = btrfs_dir_type(path->nodes[0], di);
1630 1653
@@ -1690,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
1690 goto out; 1713 goto out;
1691 btrfs_release_path(path); 1714 btrfs_release_path(path);
1692 1715
1693 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, 1716 if (dir_gen) {
1694 NULL, NULL); 1717 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
1695 if (ret < 0) 1718 NULL, NULL, NULL);
1696 goto out; 1719 if (ret < 0)
1720 goto out;
1721 }
1697 1722
1698 *dir = parent_dir; 1723 *dir = parent_dir;
1699 1724
@@ -1709,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,
1709 int ret; 1734 int ret;
1710 struct fs_path *tmp_name; 1735 struct fs_path *tmp_name;
1711 u64 tmp_dir; 1736 u64 tmp_dir;
1712 u64 tmp_dir_gen;
1713 1737
1714 tmp_name = fs_path_alloc(); 1738 tmp_name = fs_path_alloc();
1715 if (!tmp_name) 1739 if (!tmp_name)
1716 return -ENOMEM; 1740 return -ENOMEM;
1717 1741
1718 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1742 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
1719 if (ret < 0) 1743 if (ret < 0)
1720 goto out; 1744 goto out;
1721 1745
@@ -2026,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2026{ 2050{
2027 int ret; 2051 int ret;
2028 int nce_ret; 2052 int nce_ret;
2029 struct btrfs_path *path = NULL;
2030 struct name_cache_entry *nce = NULL; 2053 struct name_cache_entry *nce = NULL;
2031 2054
2032 /* 2055 /*
@@ -2052,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2052 } 2075 }
2053 } 2076 }
2054 2077
2055 path = alloc_path_for_send();
2056 if (!path)
2057 return -ENOMEM;
2058
2059 /* 2078 /*
2060 * If the inode is not existent yet, add the orphan name and return 1. 2079 * If the inode is not existent yet, add the orphan name and return 1.
2061 * This should only happen for the parent dir that we determine in 2080 * This should only happen for the parent dir that we determine in
@@ -2131,7 +2150,6 @@ out_cache:
2131 name_cache_clean_unused(sctx); 2150 name_cache_clean_unused(sctx);
2132 2151
2133out: 2152out:
2134 btrfs_free_path(path);
2135 return ret; 2153 return ret;
2136} 2154}
2137 2155
@@ -2942,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx,
2942static int add_pending_dir_move(struct send_ctx *sctx, 2960static int add_pending_dir_move(struct send_ctx *sctx,
2943 u64 ino, 2961 u64 ino,
2944 u64 ino_gen, 2962 u64 ino_gen,
2945 u64 parent_ino) 2963 u64 parent_ino,
2964 struct list_head *new_refs,
2965 struct list_head *deleted_refs)
2946{ 2966{
2947 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2967 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2948 struct rb_node *parent = NULL; 2968 struct rb_node *parent = NULL;
@@ -2974,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx,
2974 } 2994 }
2975 } 2995 }
2976 2996
2977 list_for_each_entry(cur, &sctx->deleted_refs, list) { 2997 list_for_each_entry(cur, deleted_refs, list) {
2978 ret = dup_ref(cur, &pm->update_refs); 2998 ret = dup_ref(cur, &pm->update_refs);
2979 if (ret < 0) 2999 if (ret < 0)
2980 goto out; 3000 goto out;
2981 } 3001 }
2982 list_for_each_entry(cur, &sctx->new_refs, list) { 3002 list_for_each_entry(cur, new_refs, list) {
2983 ret = dup_ref(cur, &pm->update_refs); 3003 ret = dup_ref(cur, &pm->update_refs);
2984 if (ret < 0) 3004 if (ret < 0)
2985 goto out; 3005 goto out;
@@ -3022,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3022 return NULL; 3042 return NULL;
3023} 3043}
3024 3044
3045static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3046 u64 ino, u64 gen, u64 *ancestor_ino)
3047{
3048 int ret = 0;
3049 u64 parent_inode = 0;
3050 u64 parent_gen = 0;
3051 u64 start_ino = ino;
3052
3053 *ancestor_ino = 0;
3054 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3055 fs_path_reset(name);
3056
3057 if (is_waiting_for_rm(sctx, ino))
3058 break;
3059 if (is_waiting_for_move(sctx, ino)) {
3060 if (*ancestor_ino == 0)
3061 *ancestor_ino = ino;
3062 ret = get_first_ref(sctx->parent_root, ino,
3063 &parent_inode, &parent_gen, name);
3064 } else {
3065 ret = __get_cur_name_and_parent(sctx, ino, gen,
3066 &parent_inode,
3067 &parent_gen, name);
3068 if (ret > 0) {
3069 ret = 0;
3070 break;
3071 }
3072 }
3073 if (ret < 0)
3074 break;
3075 if (parent_inode == start_ino) {
3076 ret = 1;
3077 if (*ancestor_ino == 0)
3078 *ancestor_ino = ino;
3079 break;
3080 }
3081 ino = parent_inode;
3082 gen = parent_gen;
3083 }
3084 return ret;
3085}
3086
3025static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3087static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3026{ 3088{
3027 struct fs_path *from_path = NULL; 3089 struct fs_path *from_path = NULL;
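
path_loop() above is the new cycle detector for pending renames: starting from the inode about to be moved, it climbs first references toward the subvolume root and returns 1 if it arrives back at the starting inode, in which case apply_dir_move() (next hunk) re-queues the move behind the offending ancestor instead of emitting a rename that would create a directory loop. A toy userspace model of the check, using a flat parent[] array as a hypothetical stand-in for get_first_ref()/__get_cur_name_and_parent():

#include <stdio.h>

#define ROOT_INO 0U

static int would_loop(const unsigned int *parent, unsigned int start)
{
	unsigned int ino = parent[start];

	while (ino != ROOT_INO) {
		if (ino == start)
			return 1;	/* start is its own ancestor: loop */
		ino = parent[ino];
	}
	return 0;
}

int main(void)
{
	/* dir 1 now sits under dir 2 while dir 2 still sits under dir 1
	 * in the old layout: neither rename can be applied first */
	unsigned int parent[] = { ROOT_INO, 2, 1 };

	printf("loop: %d\n", would_loop(parent, 1));	/* prints loop: 1 */
	return 0;
}
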
@@ -3033,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3033 struct waiting_dir_move *dm = NULL; 3095 struct waiting_dir_move *dm = NULL;
3034 u64 rmdir_ino = 0; 3096 u64 rmdir_ino = 0;
3035 int ret; 3097 int ret;
3098 u64 ancestor = 0;
3036 3099
3037 name = fs_path_alloc(); 3100 name = fs_path_alloc();
3038 from_path = fs_path_alloc(); 3101 from_path = fs_path_alloc();
@@ -3051,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3051 if (ret < 0) 3114 if (ret < 0)
3052 goto out; 3115 goto out;
3053 3116
3054 if (parent_ino == sctx->cur_ino) { 3117 ret = get_cur_path(sctx, parent_ino, parent_gen,
3055 /* child only renamed, not moved */ 3118 from_path);
3056 ASSERT(parent_gen == sctx->cur_inode_gen); 3119 if (ret < 0)
3057 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, 3120 goto out;
3058 from_path); 3121 ret = fs_path_add_path(from_path, name);
3059 if (ret < 0) 3122 if (ret < 0)
3060 goto out; 3123 goto out;
3061 ret = fs_path_add_path(from_path, name); 3124
3062 if (ret < 0) 3125 sctx->send_progress = sctx->cur_ino + 1;
3063 goto out; 3126 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3064 } else { 3127 if (ret) {
3065 /* child moved and maybe renamed too */ 3128 LIST_HEAD(deleted_refs);
3066 sctx->send_progress = pm->ino; 3129 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3067 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3130 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3131 &pm->update_refs, &deleted_refs);
3068 if (ret < 0) 3132 if (ret < 0)
3069 goto out; 3133 goto out;
3070 } 3134 if (rmdir_ino) {
3071 3135 dm = get_waiting_dir_move(sctx, pm->ino);
3072 fs_path_free(name); 3136 ASSERT(dm);
3073 name = NULL; 3137 dm->rmdir_ino = rmdir_ino;
3074 3138 }
3075 to_path = fs_path_alloc();
3076 if (!to_path) {
3077 ret = -ENOMEM;
3078 goto out; 3139 goto out;
3079 } 3140 }
3080 3141 fs_path_reset(name);
3081 sctx->send_progress = sctx->cur_ino + 1; 3142 to_path = name;
3143 name = NULL;
3082 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3144 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
3083 if (ret < 0) 3145 if (ret < 0)
3084 goto out; 3146 goto out;
@@ -3202,127 +3264,74 @@ out:
3202static int wait_for_parent_move(struct send_ctx *sctx, 3264static int wait_for_parent_move(struct send_ctx *sctx,
3203 struct recorded_ref *parent_ref) 3265 struct recorded_ref *parent_ref)
3204{ 3266{
3205 int ret; 3267 int ret = 0;
3206 u64 ino = parent_ref->dir; 3268 u64 ino = parent_ref->dir;
3207 u64 parent_ino_before, parent_ino_after; 3269 u64 parent_ino_before, parent_ino_after;
3208 u64 old_gen;
3209 struct fs_path *path_before = NULL; 3270 struct fs_path *path_before = NULL;
3210 struct fs_path *path_after = NULL; 3271 struct fs_path *path_after = NULL;
3211 int len1, len2; 3272 int len1, len2;
3212 int register_upper_dirs;
3213 u64 gen;
3214
3215 if (is_waiting_for_move(sctx, ino))
3216 return 1;
3217
3218 if (parent_ref->dir <= sctx->cur_ino)
3219 return 0;
3220
3221 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3222 NULL, NULL, NULL, NULL);
3223 if (ret == -ENOENT)
3224 return 0;
3225 else if (ret < 0)
3226 return ret;
3227
3228 if (parent_ref->dir_gen != old_gen)
3229 return 0;
3230
3231 path_before = fs_path_alloc();
3232 if (!path_before)
3233 return -ENOMEM;
3234
3235 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3236 NULL, path_before);
3237 if (ret == -ENOENT) {
3238 ret = 0;
3239 goto out;
3240 } else if (ret < 0) {
3241 goto out;
3242 }
3243 3273
3244 path_after = fs_path_alloc(); 3274 path_after = fs_path_alloc();
3245 if (!path_after) { 3275 path_before = fs_path_alloc();
3276 if (!path_after || !path_before) {
3246 ret = -ENOMEM; 3277 ret = -ENOMEM;
3247 goto out; 3278 goto out;
3248 } 3279 }
3249 3280
3250 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3251 &gen, path_after);
3252 if (ret == -ENOENT) {
3253 ret = 0;
3254 goto out;
3255 } else if (ret < 0) {
3256 goto out;
3257 }
3258
3259 len1 = fs_path_len(path_before);
3260 len2 = fs_path_len(path_after);
3261 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3262 memcmp(path_before->start, path_after->start, len1)) {
3263 ret = 1;
3264 goto out;
3265 }
3266 ret = 0;
3267
3268 /* 3281 /*
3269 * Ok, our new most direct ancestor has a higher inode number but 3282 * Our current directory inode may not yet be renamed/moved because some
3270 * wasn't moved/renamed. So maybe some of the new ancestors higher in 3283 * ancestor (immediate or not) has to be renamed/moved first. So find if
3271 * the hierarchy have an higher inode number too *and* were renamed 3284 * such ancestor exists and make sure our own rename/move happens after
3272 * or moved - in this case we need to wait for the ancestor's rename 3285 * that ancestor is processed.
3273 * or move operation before we can do the move/rename for the current
3274 * inode.
3275 */ 3286 */
3276 register_upper_dirs = 0; 3287 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3277 ino = parent_ino_after; 3288 if (is_waiting_for_move(sctx, ino)) {
3278again: 3289 ret = 1;
3279 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { 3290 break;
3280 u64 parent_gen; 3291 }
3281 3292
3282 fs_path_reset(path_before); 3293 fs_path_reset(path_before);
3283 fs_path_reset(path_after); 3294 fs_path_reset(path_after);
3284 3295
3285 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3296 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3286 &parent_gen, path_after); 3297 NULL, path_after);
3287 if (ret < 0) 3298 if (ret < 0)
3288 goto out; 3299 goto out;
3289 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, 3300 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3290 NULL, path_before); 3301 NULL, path_before);
3291 if (ret == -ENOENT) { 3302 if (ret < 0 && ret != -ENOENT) {
3292 ret = 0;
3293 break;
3294 } else if (ret < 0) {
3295 goto out; 3303 goto out;
3304 } else if (ret == -ENOENT) {
3305 ret = 1;
3306 break;
3296 } 3307 }
3297 3308
3298 len1 = fs_path_len(path_before); 3309 len1 = fs_path_len(path_before);
3299 len2 = fs_path_len(path_after); 3310 len2 = fs_path_len(path_after);
3300 if (parent_ino_before != parent_ino_after || len1 != len2 || 3311 if (ino > sctx->cur_ino &&
3301 memcmp(path_before->start, path_after->start, len1)) { 3312 (parent_ino_before != parent_ino_after || len1 != len2 ||
3313 memcmp(path_before->start, path_after->start, len1))) {
3302 ret = 1; 3314 ret = 1;
3303 if (register_upper_dirs) { 3315 break;
3304 break;
3305 } else {
3306 register_upper_dirs = 1;
3307 ino = parent_ref->dir;
3308 gen = parent_ref->dir_gen;
3309 goto again;
3310 }
3311 } else if (register_upper_dirs) {
3312 ret = add_pending_dir_move(sctx, ino, gen,
3313 parent_ino_after);
3314 if (ret < 0 && ret != -EEXIST)
3315 goto out;
3316 } 3316 }
3317
3318 ino = parent_ino_after; 3317 ino = parent_ino_after;
3319 gen = parent_gen;
3320 } 3318 }
3321 3319
3322out: 3320out:
3323 fs_path_free(path_before); 3321 fs_path_free(path_before);
3324 fs_path_free(path_after); 3322 fs_path_free(path_after);
3325 3323
3324 if (ret == 1) {
3325 ret = add_pending_dir_move(sctx,
3326 sctx->cur_ino,
3327 sctx->cur_inode_gen,
3328 ino,
3329 &sctx->new_refs,
3330 &sctx->deleted_refs);
3331 if (!ret)
3332 ret = 1;
3333 }
3334
3326 return ret; 3335 return ret;
3327} 3336}
3328 3337
@@ -3483,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3483 if (ret < 0) 3492 if (ret < 0)
3484 goto out; 3493 goto out;
3485 if (ret) { 3494 if (ret) {
3486 ret = add_pending_dir_move(sctx,
3487 sctx->cur_ino,
3488 sctx->cur_inode_gen,
3489 cur->dir);
3490 *pending_move = 1; 3495 *pending_move = 1;
3491 } else { 3496 } else {
3492 ret = send_rename(sctx, valid_path, 3497 ret = send_rename(sctx, valid_path,
@@ -5487,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
5487 */ 5492 */
5488 if (root->send_in_progress < 0) 5493 if (root->send_in_progress < 0)
5489 btrfs_err(root->fs_info, 5494 btrfs_err(root->fs_info,
5490 "send_in_progres unbalanced %d root %llu\n", 5495 "send_in_progres unbalanced %d root %llu",
5491 root->send_in_progress, root->root_key.objectid); 5496 root->send_in_progress, root->root_key.objectid);
5492 spin_unlock(&root->root_item_lock); 5497 spin_unlock(&root->root_item_lock);
5493} 5498}
@@ -5515,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5515 5520
5516 /* 5521 /*
5517 * The subvolume must remain read-only during send, protect against 5522 * The subvolume must remain read-only during send, protect against
5518 * making it RW. 5523 * making it RW. This also protects against deletion.
5519 */ 5524 */
5520 spin_lock(&send_root->root_item_lock); 5525 spin_lock(&send_root->root_item_lock);
5521 send_root->send_in_progress++; 5526 send_root->send_in_progress++;
@@ -5575,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5575 } 5580 }
5576 5581
5577 sctx->send_root = send_root; 5582 sctx->send_root = send_root;
5583 /*
5584 * Unlikely but possible, if the subvolume is marked for deletion but
5585 * is slow to remove the directory entry, send can still be started
5586 */
5587 if (btrfs_root_dead(sctx->send_root)) {
5588 ret = -EPERM;
5589 goto out;
5590 }
5591
5578 sctx->clone_roots_cnt = arg->clone_sources_count; 5592 sctx->clone_roots_cnt = arg->clone_sources_count;
5579 5593
5580 sctx->send_max_size = BTRFS_SEND_BUF_SIZE; 5594 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
@@ -5664,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5664 5678
5665 spin_lock(&sctx->parent_root->root_item_lock); 5679 spin_lock(&sctx->parent_root->root_item_lock);
5666 sctx->parent_root->send_in_progress++; 5680 sctx->parent_root->send_in_progress++;
5667 if (!btrfs_root_readonly(sctx->parent_root)) { 5681 if (!btrfs_root_readonly(sctx->parent_root) ||
5682 btrfs_root_dead(sctx->parent_root)) {
5668 spin_unlock(&sctx->parent_root->root_item_lock); 5683 spin_unlock(&sctx->parent_root->root_item_lock);
5669 srcu_read_unlock(&fs_info->subvol_srcu, index); 5684 srcu_read_unlock(&fs_info->subvol_srcu, index);
5670 ret = -EPERM; 5685 ret = -EPERM;
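
Both new -EPERM bail-outs rely on a btrfs_root_dead() predicate whose definition is not part of this excerpt; judging by how it is used alongside the read-only test, it is presumably a thin flag check on the root item, roughly along these lines (a sketch, not the verbatim definition):

static inline int btrfs_root_dead(struct btrfs_root *root)
{
	/* assumed flag: set once subvolume deletion has been committed
	 * to the root item but cleanup has not finished yet */
	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}

For the parent root the check sits under root_item_lock together with the send_in_progress increment, so a concurrent deletion either observes the send in progress or the send observes the root already dead; the earlier send_root check covers the window where the subvolume was marked dead before send bumped its counter.
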
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9601d25a4607..4662d92a4b73 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -511,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
511 } else if (compress) { 511 } else if (compress) {
512 if (!btrfs_test_opt(root, COMPRESS)) 512 if (!btrfs_test_opt(root, COMPRESS))
513 btrfs_info(root->fs_info, 513 btrfs_info(root->fs_info,
514 "btrfs: use %s compression\n", 514 "btrfs: use %s compression",
515 compress_type); 515 compress_type);
516 } 516 }
517 break; 517 break;
@@ -580,8 +580,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
580 } 580 }
581 break; 581 break;
582 case Opt_acl: 582 case Opt_acl:
583#ifdef CONFIG_BTRFS_FS_POSIX_ACL
583 root->fs_info->sb->s_flags |= MS_POSIXACL; 584 root->fs_info->sb->s_flags |= MS_POSIXACL;
584 break; 585 break;
586#else
587 btrfs_err(root->fs_info,
588 "support for ACL not compiled in!");
589 ret = -EINVAL;
590 goto out;
591#endif
585 case Opt_noacl: 592 case Opt_noacl:
586 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 593 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
587 break; 594 break;
@@ -1413,6 +1420,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1413 * this also happens on 'umount -rf' or on shutdown, when 1420 * this also happens on 'umount -rf' or on shutdown, when
1414 * the filesystem is busy. 1421 * the filesystem is busy.
1415 */ 1422 */
1423 cancel_work_sync(&fs_info->async_reclaim_work);
1416 1424
1417 /* wait for the uuid_scan task to finish */ 1425 /* wait for the uuid_scan task to finish */
1418 down(&fs_info->uuid_tree_rescan_sem); 1426 down(&fs_info->uuid_tree_rescan_sem);
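
The single added line closes a remount race: the async reclaim worker can allocate chunks and start transactions, which must not happen while the filesystem is flipping to read-only. cancel_work_sync() both removes a queued instance and waits for a running one to finish, so after it returns the worker cannot touch the filesystem until something requeues it. Condensed, with an illustrative wrapper name:

#include <linux/workqueue.h>

/* quiesce_async_reclaim is a made-up name for illustration; the remount
 * path above simply calls cancel_work_sync() inline */
static void quiesce_async_reclaim(struct btrfs_fs_info *fs_info)
{
	cancel_work_sync(&fs_info->async_reclaim_work);
}
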
@@ -1894,6 +1902,9 @@ static int btrfs_run_sanity_tests(void)
1894 if (ret) 1902 if (ret)
1895 goto out; 1903 goto out;
1896 ret = btrfs_test_inodes(); 1904 ret = btrfs_test_inodes();
1905 if (ret)
1906 goto out;
1907 ret = btrfs_test_qgroups();
1897out: 1908out:
1898 btrfs_destroy_test_fs(); 1909 btrfs_destroy_test_fs();
1899 return ret; 1910 return ret;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c5eb2143dc66..df39458f1487 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); 254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
255 255
256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
257 258
258static ssize_t raid_bytes_show(struct kobject *kobj, 259static ssize_t raid_bytes_show(struct kobject *kobj,
259 struct kobj_attribute *attr, char *buf); 260 struct kobj_attribute *attr, char *buf);
@@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
266{ 267{
267 struct btrfs_space_info *sinfo = to_space_info(kobj->parent); 268 struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
268 struct btrfs_block_group_cache *block_group; 269 struct btrfs_block_group_cache *block_group;
269 int index = kobj - sinfo->block_group_kobjs; 270 int index = to_raid_kobj(kobj)->raid_type;
270 u64 val = 0; 271 u64 val = 0;
271 272
272 down_read(&sinfo->groups_sem); 273 down_read(&sinfo->groups_sem);
@@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = {
288 289
289static void release_raid_kobj(struct kobject *kobj) 290static void release_raid_kobj(struct kobject *kobj)
290{ 291{
291 kobject_put(kobj->parent); 292 kfree(to_raid_kobj(kobj));
292} 293}
293 294
294struct kobj_type btrfs_raid_ktype = { 295struct kobj_type btrfs_raid_ktype = {
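
The two sysfs fixes above are a matched container_of pair: raid_bytes_show() used to derive an index from kobject pointer arithmetic against sinfo->block_group_kobjs, which became meaningless once each raid kobject moved into its own separately allocated struct raid_kobject, and release_raid_kobj() must correspondingly free that wrapper rather than drop a reference on the parent. A self-contained sketch of the recovery idiom:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobject { int dummy; };

/* stand-in for the kernel's struct raid_kobject */
struct raid_kobject {
	int raid_type;
	struct kobject kobj;
};

int main(void)
{
	struct raid_kobject rk = { .raid_type = 3 };
	struct kobject *kobj = &rk.kobj;	/* what sysfs hands back */

	/* step back from the member to the enclosing object */
	printf("%d\n", container_of(kobj, struct raid_kobject, kobj)->raid_type);
	return 0;
}
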
@@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_root *root = fs_info->fs_root; 375 struct btrfs_root *root = fs_info->fs_root;
375 int ret; 376 int ret;
376 377
377 if (len >= BTRFS_LABEL_SIZE) { 378 if (len >= BTRFS_LABEL_SIZE)
378 pr_err("BTRFS: unable to set label with more than %d bytes\n",
379 BTRFS_LABEL_SIZE - 1);
380 return -EINVAL; 379 return -EINVAL;
381 }
382 380
383 trans = btrfs_start_transaction(root, 0); 381 trans = btrfs_start_transaction(root, 0);
384 if (IS_ERR(trans)) 382 if (IS_ERR(trans))
@@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
396} 394}
397BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); 395BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
398 396
397static ssize_t btrfs_no_store(struct kobject *kobj,
398 struct kobj_attribute *a,
399 const char *buf, size_t len)
400{
401 return -EPERM;
402}
403
404static ssize_t btrfs_nodesize_show(struct kobject *kobj,
405 struct kobj_attribute *a, char *buf)
406{
407 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
408
409 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
410}
411
412BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
413
414static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
415 struct kobj_attribute *a, char *buf)
416{
417 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
418
419 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
420}
421
422BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
423
424static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
425 struct kobj_attribute *a, char *buf)
426{
427 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
428
429 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
430}
431
432BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
433
399static struct attribute *btrfs_attrs[] = { 434static struct attribute *btrfs_attrs[] = {
400 BTRFS_ATTR_PTR(label), 435 BTRFS_ATTR_PTR(label),
436 BTRFS_ATTR_PTR(nodesize),
437 BTRFS_ATTR_PTR(sectorsize),
438 BTRFS_ATTR_PTR(clone_alignment),
401 NULL, 439 NULL,
402}; 440};
403 441
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 757ef00a75a4..a5dcacb5df9c 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include "btrfs-tests.h" 22#include "btrfs-tests.h"
23#include "../ctree.h" 23#include "../ctree.h"
24#include "../volumes.h"
25#include "../disk-io.h"
26#include "../qgroup.h"
24 27
25static struct vfsmount *test_mnt = NULL; 28static struct vfsmount *test_mnt = NULL;
26 29
@@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void)
72 kern_unmount(test_mnt); 75 kern_unmount(test_mnt);
73 unregister_filesystem(&test_type); 76 unregister_filesystem(&test_type);
74} 77}
78
79struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
80{
81 struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
82 GFP_NOFS);
83
84 if (!fs_info)
85 return fs_info;
86 fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
87 GFP_NOFS);
88 if (!fs_info->fs_devices) {
89 kfree(fs_info);
90 return NULL;
91 }
92 fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
93 GFP_NOFS);
94 if (!fs_info->super_copy) {
95 kfree(fs_info->fs_devices);
96 kfree(fs_info);
97 return NULL;
98 }
99
100 if (init_srcu_struct(&fs_info->subvol_srcu)) {
101 kfree(fs_info->fs_devices);
102 kfree(fs_info->super_copy);
103 kfree(fs_info);
104 return NULL;
105 }
106
107 spin_lock_init(&fs_info->buffer_lock);
108 spin_lock_init(&fs_info->qgroup_lock);
109 spin_lock_init(&fs_info->qgroup_op_lock);
110 spin_lock_init(&fs_info->super_lock);
111 spin_lock_init(&fs_info->fs_roots_radix_lock);
112 spin_lock_init(&fs_info->tree_mod_seq_lock);
113 mutex_init(&fs_info->qgroup_ioctl_lock);
114 mutex_init(&fs_info->qgroup_rescan_lock);
115 rwlock_init(&fs_info->tree_mod_log_lock);
116 fs_info->running_transaction = NULL;
117 fs_info->qgroup_tree = RB_ROOT;
118 fs_info->qgroup_ulist = NULL;
119 atomic64_set(&fs_info->tree_mod_seq, 0);
120 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
121 INIT_LIST_HEAD(&fs_info->dead_roots);
122 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
123 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
124 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
125 return fs_info;
126}
127
128static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
129{
130 struct radix_tree_iter iter;
131 void **slot;
132
133 spin_lock(&fs_info->buffer_lock);
134restart:
135 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
136 struct extent_buffer *eb;
137
138 eb = radix_tree_deref_slot(slot);
139 if (!eb)
140 continue;
141 /* Shouldn't happen but that kind of thinking creates CVEs */
142 if (radix_tree_exception(eb)) {
143 if (radix_tree_deref_retry(eb))
144 goto restart;
145 continue;
146 }
147 spin_unlock(&fs_info->buffer_lock);
148 free_extent_buffer_stale(eb);
149 spin_lock(&fs_info->buffer_lock);
150 }
151 spin_unlock(&fs_info->buffer_lock);
152
153 btrfs_free_qgroup_config(fs_info);
154 btrfs_free_fs_roots(fs_info);
155 cleanup_srcu_struct(&fs_info->subvol_srcu);
156 kfree(fs_info->super_copy);
157 kfree(fs_info->fs_devices);
158 kfree(fs_info);
159}
160
161void btrfs_free_dummy_root(struct btrfs_root *root)
162{
163 if (!root)
164 return;
165 if (root->node)
166 free_extent_buffer(root->node);
167 if (root->fs_info)
168 btrfs_free_dummy_fs_info(root->fs_info);
169 kfree(root);
170}
171
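
One detail of btrfs_free_dummy_fs_info() above is worth calling out: buffer_lock is dropped around free_extent_buffer_stale(), presumably because the free path itself takes buffer_lock to delete the buffer from the radix tree, and the walk restarts whenever a slot holds a transient retry entry. That is the standard lockless radix-tree read idiom; condensed, using the same kernel API as the listing:

restart:
	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
		struct extent_buffer *eb = radix_tree_deref_slot(slot);

		if (!eb)
			continue;
		if (radix_tree_exception(eb)) {
			if (radix_tree_deref_retry(eb))
				goto restart;	/* tree moved under us */
			continue;		/* other exceptional entry */
		}
		spin_unlock(&fs_info->buffer_lock);	/* avoid self-deadlock */
		free_extent_buffer_stale(eb);
		spin_lock(&fs_info->buffer_lock);
	}
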
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 312560a9123d..fd3954224480 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,13 +23,18 @@
23 23
24#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) 24#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
25 25
26struct btrfs_root;
27
26int btrfs_test_free_space_cache(void); 28int btrfs_test_free_space_cache(void);
27int btrfs_test_extent_buffer_operations(void); 29int btrfs_test_extent_buffer_operations(void);
28int btrfs_test_extent_io(void); 30int btrfs_test_extent_io(void);
29int btrfs_test_inodes(void); 31int btrfs_test_inodes(void);
32int btrfs_test_qgroups(void);
30int btrfs_init_test_fs(void); 33int btrfs_init_test_fs(void);
31void btrfs_destroy_test_fs(void); 34void btrfs_destroy_test_fs(void);
32struct inode *btrfs_new_test_inode(void); 35struct inode *btrfs_new_test_inode(void);
36struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
37void btrfs_free_dummy_root(struct btrfs_root *root);
33#else 38#else
34static inline int btrfs_test_free_space_cache(void) 39static inline int btrfs_test_free_space_cache(void)
35{ 40{
@@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void)
54{ 59{
55 return 0; 60 return 0;
56} 61}
62static inline int btrfs_test_qgroups(void)
63{
64 return 0;
65}
57#endif 66#endif
58 67
59#endif 68#endif
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 397d1f99a8eb..3ae0f5b8bb80 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -23,33 +23,6 @@
23#include "../extent_io.h" 23#include "../extent_io.h"
24#include "../volumes.h" 24#include "../volumes.h"
25 25
26static struct btrfs_fs_info *alloc_dummy_fs_info(void)
27{
28 struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
29 GFP_NOFS);
30 if (!fs_info)
31 return fs_info;
32 fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
33 GFP_NOFS);
34 if (!fs_info->fs_devices) {
35 kfree(fs_info);
36 return NULL;
37 }
38 return fs_info;
39}
40static void free_dummy_root(struct btrfs_root *root)
41{
42 if (!root)
43 return;
44 if (root->fs_info) {
45 kfree(root->fs_info->fs_devices);
46 kfree(root->fs_info);
47 }
48 if (root->node)
49 free_extent_buffer(root->node);
50 kfree(root);
51}
52
53static void insert_extent(struct btrfs_root *root, u64 start, u64 len, 26static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
54 u64 ram_bytes, u64 offset, u64 disk_bytenr, 27 u64 ram_bytes, u64 offset, u64 disk_bytenr,
55 u64 disk_len, u32 type, u8 compression, int slot) 28 u64 disk_len, u32 type, u8 compression, int slot)
@@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void)
276 * We do this since btrfs_get_extent wants to assign em->bdev to 249 * We do this since btrfs_get_extent wants to assign em->bdev to
277 * root->fs_info->fs_devices->latest_bdev. 250 * root->fs_info->fs_devices->latest_bdev.
278 */ 251 */
279 root->fs_info = alloc_dummy_fs_info(); 252 root->fs_info = btrfs_alloc_dummy_fs_info();
280 if (!root->fs_info) { 253 if (!root->fs_info) {
281 test_msg("Couldn't allocate dummy fs info\n"); 254 test_msg("Couldn't allocate dummy fs info\n");
282 goto out; 255 goto out;
@@ -837,7 +810,7 @@ out:
837 if (!IS_ERR(em)) 810 if (!IS_ERR(em))
838 free_extent_map(em); 811 free_extent_map(em);
839 iput(inode); 812 iput(inode);
840 free_dummy_root(root); 813 btrfs_free_dummy_root(root);
841 return ret; 814 return ret;
842} 815}
843 816
@@ -864,7 +837,7 @@ static int test_hole_first(void)
864 goto out; 837 goto out;
865 } 838 }
866 839
867 root->fs_info = alloc_dummy_fs_info(); 840 root->fs_info = btrfs_alloc_dummy_fs_info();
868 if (!root->fs_info) { 841 if (!root->fs_info) {
869 test_msg("Couldn't allocate dummy fs info\n"); 842 test_msg("Couldn't allocate dummy fs info\n");
870 goto out; 843 goto out;
@@ -934,7 +907,7 @@ out:
934 if (!IS_ERR(em)) 907 if (!IS_ERR(em))
935 free_extent_map(em); 908 free_extent_map(em);
936 iput(inode); 909 iput(inode);
937 free_dummy_root(root); 910 btrfs_free_dummy_root(root);
938 return ret; 911 return ret;
939} 912}
940 913
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 000000000000..fa691b754aaf
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (C) 2013 Facebook. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "btrfs-tests.h"
20#include "../ctree.h"
21#include "../transaction.h"
22#include "../disk-io.h"
23#include "../qgroup.h"
24
25static void init_dummy_trans(struct btrfs_trans_handle *trans)
26{
27 memset(trans, 0, sizeof(*trans));
28 trans->transid = 1;
29 INIT_LIST_HEAD(&trans->qgroup_ref_list);
30 trans->type = __TRANS_DUMMY;
31}
32
33static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
34 u64 num_bytes, u64 parent, u64 root_objectid)
35{
36 struct btrfs_trans_handle trans;
37 struct btrfs_extent_item *item;
38 struct btrfs_extent_inline_ref *iref;
39 struct btrfs_tree_block_info *block_info;
40 struct btrfs_path *path;
41 struct extent_buffer *leaf;
42 struct btrfs_key ins;
43 u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
44 int ret;
45
46 init_dummy_trans(&trans);
47
48 ins.objectid = bytenr;
49 ins.type = BTRFS_EXTENT_ITEM_KEY;
50 ins.offset = num_bytes;
51
52 path = btrfs_alloc_path();
53 if (!path) {
54 test_msg("Couldn't allocate path\n");
55 return -ENOMEM;
56 }
57
58 path->leave_spinning = 1;
59 ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
60 if (ret) {
61 test_msg("Couldn't insert ref %d\n", ret);
62 btrfs_free_path(path);
63 return ret;
64 }
65
66 leaf = path->nodes[0];
67 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
68 btrfs_set_extent_refs(leaf, item, 1);
69 btrfs_set_extent_generation(leaf, item, 1);
70 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
71 block_info = (struct btrfs_tree_block_info *)(item + 1);
72 btrfs_set_tree_block_level(leaf, block_info, 1);
73 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
74 if (parent > 0) {
75 btrfs_set_extent_inline_ref_type(leaf, iref,
76 BTRFS_SHARED_BLOCK_REF_KEY);
77 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
78 } else {
79 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
80 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
81 }
82 btrfs_free_path(path);
83 return 0;
84}
85
86static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
87 u64 parent, u64 root_objectid)
88{
89 struct btrfs_trans_handle trans;
90 struct btrfs_extent_item *item;
91 struct btrfs_path *path;
92 struct btrfs_key key;
93 u64 refs;
94 int ret;
95
96 init_dummy_trans(&trans);
97
98 key.objectid = bytenr;
99 key.type = BTRFS_EXTENT_ITEM_KEY;
100 key.offset = num_bytes;
101
102 path = btrfs_alloc_path();
103 if (!path) {
104 test_msg("Couldn't allocate path\n");
105 return -ENOMEM;
106 }
107
108 path->leave_spinning = 1;
109 ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
110 if (ret) {
111 test_msg("Couldn't find extent ref\n");
112 btrfs_free_path(path);
113 return ret;
114 }
115
116 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
117 struct btrfs_extent_item);
118 refs = btrfs_extent_refs(path->nodes[0], item);
119 btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
120 btrfs_release_path(path);
121
122 key.objectid = bytenr;
123 if (parent) {
124 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
125 key.offset = parent;
126 } else {
127 key.type = BTRFS_TREE_BLOCK_REF_KEY;
128 key.offset = root_objectid;
129 }
130
131 ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
132 if (ret)
133 test_msg("Failed to insert backref\n");
134 btrfs_free_path(path);
135 return ret;
136}
137
138static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
139 u64 num_bytes)
140{
141 struct btrfs_trans_handle trans;
142 struct btrfs_key key;
143 struct btrfs_path *path;
144 int ret;
145
146 init_dummy_trans(&trans);
147
148 key.objectid = bytenr;
149 key.type = BTRFS_EXTENT_ITEM_KEY;
150 key.offset = num_bytes;
151
152 path = btrfs_alloc_path();
153 if (!path) {
154 test_msg("Couldn't allocate path\n");
155 return -ENOMEM;
156 }
157 path->leave_spinning = 1;
158
159 ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
160 if (ret) {
161 test_msg("Didn't find our key %d\n", ret);
162 btrfs_free_path(path);
163 return ret;
164 }
165 btrfs_del_item(&trans, root, path);
166 btrfs_free_path(path);
167 return 0;
168}
169
170static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
171 u64 num_bytes, u64 parent, u64 root_objectid)
172{
173 struct btrfs_trans_handle trans;
174 struct btrfs_extent_item *item;
175 struct btrfs_path *path;
176 struct btrfs_key key;
177 u64 refs;
178 int ret;
179
180 init_dummy_trans(&trans);
181
182 key.objectid = bytenr;
183 key.type = BTRFS_EXTENT_ITEM_KEY;
184 key.offset = num_bytes;
185
186 path = btrfs_alloc_path();
187 if (!path) {
188 test_msg("Couldn't allocate path\n");
189 return -ENOMEM;
190 }
191
192 path->leave_spinning = 1;
193 ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
194 if (ret) {
195 test_msg("Couldn't find extent ref\n");
196 btrfs_free_path(path);
197 return ret;
198 }
199
200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
201 struct btrfs_extent_item);
202 refs = btrfs_extent_refs(path->nodes[0], item);
203 btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
204 btrfs_release_path(path);
205
206 key.objectid = bytenr;
207 if (parent) {
208 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
209 key.offset = parent;
210 } else {
211 key.type = BTRFS_TREE_BLOCK_REF_KEY;
212 key.offset = root_objectid;
213 }
214
215 ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
216 if (ret) {
217 test_msg("Couldn't find backref %d\n", ret);
218 btrfs_free_path(path);
219 return ret;
220 }
221 btrfs_del_item(&trans, root, path);
222 btrfs_free_path(path);
223 return ret;
224}
225
226static int test_no_shared_qgroup(struct btrfs_root *root)
227{
228 struct btrfs_trans_handle trans;
229 struct btrfs_fs_info *fs_info = root->fs_info;
230 int ret;
231
232 init_dummy_trans(&trans);
233
234 test_msg("Qgroup basic add\n");
235 ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
236 if (ret) {
237 test_msg("Couldn't create a qgroup %d\n", ret);
238 return ret;
239 }
240
241 ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
242 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
243 if (ret) {
244 test_msg("Couldn't add space to a qgroup %d\n", ret);
245 return ret;
246 }
247
248 ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
249 if (ret)
250 return ret;
251
252 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
253 if (ret) {
254 test_msg("Delayed qgroup accounting failed %d\n", ret);
255 return ret;
256 }
257
258 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
259 test_msg("Qgroup counts didn't match expected values\n");
260 return -EINVAL;
261 }
262
263 ret = remove_extent_item(root, 4096, 4096);
264 if (ret)
265 return -EINVAL;
266
267 ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
268 BTRFS_QGROUP_OPER_SUB_EXCL, 0);
269 if (ret) {
270 test_msg("Couldn't remove space from the qgroup %d\n", ret);
271 return -EINVAL;
272 }
273
274 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
275 if (ret) {
276 test_msg("Qgroup accounting failed %d\n", ret);
277 return -EINVAL;
278 }
279
280 if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
281 test_msg("Qgroup counts didn't match expected values\n");
282 return -EINVAL;
283 }
284
285 return 0;
286}
287
288/*
289 * Add a ref for two different roots to make sure the shared value comes out
290 * right, also remove one of the roots and make sure the exclusive count is
291 * adjusted properly.
292 */
293static int test_multiple_refs(struct btrfs_root *root)
294{
295 struct btrfs_trans_handle trans;
296 struct btrfs_fs_info *fs_info = root->fs_info;
297 int ret;
298
299 init_dummy_trans(&trans);
300
301 test_msg("Qgroup multiple refs test\n");
302
303 /* We have 5 created already from the previous test */
304 ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
305 if (ret) {
306 test_msg("Couldn't create a qgroup %d\n", ret);
307 return ret;
308 }
309
310 ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
311 if (ret)
312 return ret;
313
314 ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
315 BTRFS_QGROUP_OPER_ADD_EXCL, 0);
316 if (ret) {
317 test_msg("Couldn't add space to a qgroup %d\n", ret);
318 return ret;
319 }
320
321 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
322 if (ret) {
323 test_msg("Delayed qgroup accounting failed %d\n", ret);
324 return ret;
325 }
326
327 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
328 test_msg("Qgroup counts didn't match expected values\n");
329 return -EINVAL;
330 }
331
332 ret = add_tree_ref(root, 4096, 4096, 0, 256);
333 if (ret)
334 return ret;
335
336 ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
337 BTRFS_QGROUP_OPER_ADD_SHARED, 0);
338 if (ret) {
339 test_msg("Qgroup record ref failed %d\n", ret);
340 return ret;
341 }
342
343 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
344 if (ret) {
345 test_msg("Qgroup accounting failed %d\n", ret);
346 return ret;
347 }
348
349 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
350 test_msg("Qgroup counts didn't match expected values\n");
351 return -EINVAL;
352 }
353
354 if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
355 test_msg("Qgroup counts didn't match expected values\n");
356 return -EINVAL;
357 }
358
359 ret = remove_extent_ref(root, 4096, 4096, 0, 256);
360 if (ret)
361 return ret;
362
363 ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
364 BTRFS_QGROUP_OPER_SUB_SHARED, 0);
365 if (ret) {
366 test_msg("Qgroup record ref failed %d\n", ret);
367 return ret;
368 }
369
370 ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
371 if (ret) {
372 test_msg("Qgroup accounting failed %d\n", ret);
373 return ret;
374 }
375
376 if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
377 test_msg("Qgroup counts didn't match expected values\n");
378 return -EINVAL;
379 }
380
381 if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
382 test_msg("Qgroup counts didn't match expected values\n");
383 return -EINVAL;
384 }
385
386 return 0;
387}
388
389int btrfs_test_qgroups(void)
390{
391 struct btrfs_root *root;
392 struct btrfs_root *tmp_root;
393 int ret = 0;
394
395 root = btrfs_alloc_dummy_root();
396 if (IS_ERR(root)) {
397 test_msg("Couldn't allocate root\n");
398 return PTR_ERR(root);
399 }
400
401 root->fs_info = btrfs_alloc_dummy_fs_info();
402 if (!root->fs_info) {
403 test_msg("Couldn't allocate dummy fs info\n");
404 ret = -ENOMEM;
405 goto out;
406 }
407
408 /*
409 * Can't use bytenr 0, some things freak out
410 * *cough*backref walking code*cough*
411 */
412 root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
413 if (!root->node) {
414 test_msg("Couldn't allocate dummy buffer\n");
415 ret = -ENOMEM;
416 goto out;
417 }
418 root->alloc_bytenr += 8192;
419
420 tmp_root = btrfs_alloc_dummy_root();
421 if (IS_ERR(tmp_root)) {
422 test_msg("Couldn't allocate a fs root\n");
423 ret = PTR_ERR(tmp_root);
424 goto out;
425 }
426
427 tmp_root->root_key.objectid = 5;
428 root->fs_info->fs_root = tmp_root;
429 ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
430 if (ret) {
431 test_msg("Couldn't insert fs root %d\n", ret);
432 goto out;
433 }
434
435 tmp_root = btrfs_alloc_dummy_root();
436 if (IS_ERR(tmp_root)) {
437 test_msg("Couldn't allocate a fs root\n");
438 ret = PTR_ERR(tmp_root);
439 goto out;
440 }
441
442 tmp_root->root_key.objectid = 256;
443 ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
444 if (ret) {
445 test_msg("Couldn't insert fs root %d\n", ret);
446 goto out;
447 }
448
449 /* We are using this root as our extent root */
450 root->fs_info->extent_root = root;
451
452 /*
453 * Some of the paths we test assume we have a filled out fs_info, so we
454 * just need to add the root in there so we don't panic.
455 */
456 root->fs_info->tree_root = root;
457 root->fs_info->quota_root = root;
458 root->fs_info->quota_enabled = 1;
459
460 test_msg("Running qgroup tests\n");
461 ret = test_no_shared_qgroup(root);
462 if (ret)
463 goto out;
464 ret = test_multiple_refs(root);
465out:
466 btrfs_free_dummy_root(root);
467 return ret;
468}
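
The arithmetic btrfs_verify_qgroup_counts() is asserting in test_multiple_refs() deserves an explicit trace (a worked reading of the listing above, with 4096 standing for one test block):

/*
 * step                                  qgroup 5       qgroup 256
 *                                      rfer / excl     rfer / excl
 * insert ref + ADD_EXCL for root 5    4096 / 4096         - / -
 * add_tree_ref + ADD_SHARED for 256   4096 /    0      4096 / 0
 * remove ref + SUB_SHARED for 256     4096 / 4096         0 / 0
 *
 * "referenced" counts every block a subvolume can reach; "exclusive"
 * counts blocks no other subvolume shares, so it collapses to zero the
 * moment a second root takes a reference and is restored once that
 * reference is dropped again.
 */
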
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7579f6d0b854..9630f10f8e1e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,6 +31,7 @@
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h" 33#include "dev-replace.h"
34#include "qgroup.h"
34 35
35#define BTRFS_ROOT_TRANS_TAG 0 36#define BTRFS_ROOT_TRANS_TAG 0
36 37
@@ -241,18 +242,19 @@ loop:
241static int record_root_in_trans(struct btrfs_trans_handle *trans, 242static int record_root_in_trans(struct btrfs_trans_handle *trans,
242 struct btrfs_root *root) 243 struct btrfs_root *root)
243{ 244{
244 if (root->ref_cows && root->last_trans < trans->transid) { 245 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
246 root->last_trans < trans->transid) {
245 WARN_ON(root == root->fs_info->extent_root); 247 WARN_ON(root == root->fs_info->extent_root);
246 WARN_ON(root->commit_root != root->node); 248 WARN_ON(root->commit_root != root->node);
247 249
248 /* 250 /*
249 * see below for in_trans_setup usage rules 251 * see below for IN_TRANS_SETUP usage rules
250 * we have the reloc mutex held now, so there 252 * we have the reloc mutex held now, so there
251 * is only one writer in this function 253 * is only one writer in this function
252 */ 254 */
253 root->in_trans_setup = 1; 255 set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
254 256
255 /* make sure readers find in_trans_setup before 257 /* make sure readers find IN_TRANS_SETUP before
256 * they find our root->last_trans update 258 * they find our root->last_trans update
257 */ 259 */
258 smp_wmb(); 260 smp_wmb();
@@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
279 * But, we have to set root->last_trans before we 281 * But, we have to set root->last_trans before we
280 * init the relocation root, otherwise, we trip over warnings 282 * init the relocation root, otherwise, we trip over warnings
281 * in ctree.c. The solution used here is to flag ourselves 283 * in ctree.c. The solution used here is to flag ourselves
282 * with root->in_trans_setup. When this is 1, we're still 284 * with root IN_TRANS_SETUP. When this is 1, we're still
283 * fixing up the reloc trees and everyone must wait. 285 * fixing up the reloc trees and everyone must wait.
284 * 286 *
285 * When this is zero, they can trust root->last_trans and fly 287 * When this is zero, they can trust root->last_trans and fly
@@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
288 * done before we pop in the zero below 290 * done before we pop in the zero below
289 */ 291 */
290 btrfs_init_reloc_root(trans, root); 292 btrfs_init_reloc_root(trans, root);
291 smp_wmb(); 293 smp_mb__before_atomic();
292 root->in_trans_setup = 0; 294 clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
293 } 295 }
294 return 0; 296 return 0;
295} 297}
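
This hunk is the first of several in the surrounding files converting ad-hoc int flags on struct btrfs_root (ref_cows, in_trans_setup, force_cow, defrag_running, log_multiple_pids) into bits of a single root->state word driven by atomic bitops, with the memory-ordering idiom adjusted to match. A condensed sketch of the publish/consume pairing used here, assuming the BTRFS_ROOT_* bit names this series introduces in ctree.h:

	/* writer: publish the setup flag before the last_trans update */
	set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
	smp_wmb();
	/* ... update root->last_trans, set up reloc roots ... */
	smp_mb__before_atomic();	/* order prior stores before the clear */
	clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);

	/* reader: pairs with the barriers above */
	smp_rmb();
	if (root->last_trans == trans->transid &&
	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
		return 0;	/* root already recorded in this transaction */

The same conversion also buys small simplifications, e.g. xchg(&root->defrag_running, 1) later becomes test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state).
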
@@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
298int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 300int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
299 struct btrfs_root *root) 301 struct btrfs_root *root)
300{ 302{
301 if (!root->ref_cows) 303 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
302 return 0; 304 return 0;
303 305
304 /* 306 /*
305 * see record_root_in_trans for comments about in_trans_setup usage 307 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
306 * and barriers 308 * and barriers
307 */ 309 */
308 smp_rmb(); 310 smp_rmb();
309 if (root->last_trans == trans->transid && 311 if (root->last_trans == trans->transid &&
310 !root->in_trans_setup) 312 !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
311 return 0; 313 return 0;
312 314
313 mutex_lock(&root->fs_info->reloc_mutex); 315 mutex_lock(&root->fs_info->reloc_mutex);
@@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
365static inline bool need_reserve_reloc_root(struct btrfs_root *root) 367static inline bool need_reserve_reloc_root(struct btrfs_root *root)
366{ 368{
367 if (!root->fs_info->reloc_ctl || 369 if (!root->fs_info->reloc_ctl ||
368 !root->ref_cows || 370 !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
369 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 371 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
370 root->reloc_root) 372 root->reloc_root)
371 return false; 373 return false;
@@ -695,6 +697,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
695 unsigned long cur = trans->delayed_ref_updates; 697 unsigned long cur = trans->delayed_ref_updates;
696 int lock = (trans->type != TRANS_JOIN_NOLOCK); 698 int lock = (trans->type != TRANS_JOIN_NOLOCK);
697 int err = 0; 699 int err = 0;
700 int must_run_delayed_refs = 0;
698 701
699 if (trans->use_count > 1) { 702 if (trans->use_count > 1) {
700 trans->use_count--; 703 trans->use_count--;
@@ -702,14 +705,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
702 return 0; 705 return 0;
703 } 706 }
704 707
705 /*
706 * do the qgroup accounting as early as possible
707 */
708 err = btrfs_delayed_refs_qgroup_accounting(trans, info);
709
710 btrfs_trans_release_metadata(trans, root); 708 btrfs_trans_release_metadata(trans, root);
711 trans->block_rsv = NULL; 709 trans->block_rsv = NULL;
712 710
711 if (!list_empty(&trans->new_bgs))
712 btrfs_create_pending_block_groups(trans, root);
713
714 trans->delayed_ref_updates = 0;
715 if (!trans->sync) {
716 must_run_delayed_refs =
717 btrfs_should_throttle_delayed_refs(trans, root);
718 cur = max_t(unsigned long, cur, 32);
719
720 /*
721 * don't make the caller wait if they are from a NOLOCK
722 * or ATTACH transaction, it will deadlock with commit
723 */
724 if (must_run_delayed_refs == 1 &&
725 (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
726 must_run_delayed_refs = 2;
727 }
728
713 if (trans->qgroup_reserved) { 729 if (trans->qgroup_reserved) {
714 /* 730 /*
715 * the same root has to be passed here between start_transaction 731 * the same root has to be passed here between start_transaction
@@ -719,16 +735,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
719 trans->qgroup_reserved = 0; 735 trans->qgroup_reserved = 0;
720 } 736 }
721 737
722 if (!list_empty(&trans->new_bgs))
723 btrfs_create_pending_block_groups(trans, root);
724
725 trans->delayed_ref_updates = 0;
726 if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
727 cur = max_t(unsigned long, cur, 32);
728 trans->delayed_ref_updates = 0;
729 btrfs_run_delayed_refs(trans, root, cur);
730 }
731
732 btrfs_trans_release_metadata(trans, root); 738 btrfs_trans_release_metadata(trans, root);
733 trans->block_rsv = NULL; 739 trans->block_rsv = NULL;
734 740
@@ -778,6 +784,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
778 assert_qgroups_uptodate(trans); 784 assert_qgroups_uptodate(trans);
779 785
780 kmem_cache_free(btrfs_trans_handle_cachep, trans); 786 kmem_cache_free(btrfs_trans_handle_cachep, trans);
787 if (must_run_delayed_refs) {
788 btrfs_async_run_delayed_refs(root, cur,
789 must_run_delayed_refs == 1);
790 }
781 return err; 791 return err;
782} 792}
783 793
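
The reshuffled control flow above is easy to misread, so condensed: rather than running delayed refs synchronously inside every transaction end, the handle now only records whether throttling is needed, frees itself, and then kicks the work off last, asynchronously for callers that must not block. Names are from the listing; the async helper's signature is inferred from its call site:

	must_run = btrfs_should_throttle_delayed_refs(trans, root);
	cur = max_t(unsigned long, cur, 32);

	/* JOIN_NOLOCK/ATTACH handles would deadlock against a commit if
	 * they waited, so they get the fire-and-forget variant */
	if (must_run == 1 &&
	    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
		must_run = 2;

	/* ... release reservations, free the handle ... */
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (must_run)
		btrfs_async_run_delayed_refs(root, cur, must_run == 1 /* wait */);
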
@@ -1049,8 +1059,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
1049 btrfs_save_ino_cache(root, trans); 1059 btrfs_save_ino_cache(root, trans);
1050 1060
1051 /* see comments in should_cow_block() */ 1061 /* see comments in should_cow_block() */
1052 root->force_cow = 0; 1062 clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
1053 smp_wmb(); 1063 smp_mb__after_atomic();
1054 1064
1055 if (root->commit_root != root->node) { 1065 if (root->commit_root != root->node) {
1056 list_add_tail(&root->dirty_list, 1066 list_add_tail(&root->dirty_list,
@@ -1081,7 +1091,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
1081 struct btrfs_trans_handle *trans; 1091 struct btrfs_trans_handle *trans;
1082 int ret; 1092 int ret;
1083 1093
1084 if (xchg(&root->defrag_running, 1)) 1094 if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
1085 return 0; 1095 return 0;
1086 1096
1087 while (1) { 1097 while (1) {
@@ -1104,7 +1114,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
1104 break; 1114 break;
1105 } 1115 }
1106 } 1116 }
1107 root->defrag_running = 0; 1117 clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
1108 return ret; 1118 return ret;
1109} 1119}
1110 1120
@@ -1168,12 +1178,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1168 goto no_free_objectid; 1178 goto no_free_objectid;
1169 } 1179 }
1170 1180
1171 pending->error = btrfs_qgroup_inherit(trans, fs_info,
1172 root->root_key.objectid,
1173 objectid, pending->inherit);
1174 if (pending->error)
1175 goto no_free_objectid;
1176
1177 key.objectid = objectid; 1181 key.objectid = objectid;
1178 key.offset = (u64)-1; 1182 key.offset = (u64)-1;
1179 key.type = BTRFS_ROOT_ITEM_KEY; 1183 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1270,8 +1274,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1270 goto fail; 1274 goto fail;
1271 } 1275 }
1272 1276
1277 /*
1278 * We need to flush delayed refs in order to make sure all of our quota
1279 * operations have been done before we call btrfs_qgroup_inherit.
1280 */
1281 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1282 if (ret) {
1283 btrfs_abort_transaction(trans, root, ret);
1284 goto fail;
1285 }
1286
1287 pending->error = btrfs_qgroup_inherit(trans, fs_info,
1288 root->root_key.objectid,
1289 objectid, pending->inherit);
1290 if (pending->error)
1291 goto no_free_objectid;
1292
1273 /* see comments in should_cow_block() */ 1293 /* see comments in should_cow_block() */
1274 root->force_cow = 1; 1294 set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
1275 smp_wmb(); 1295 smp_wmb();
1276 1296
1277 btrfs_set_root_node(new_root_item, tmp); 1297 btrfs_set_root_node(new_root_item, tmp);
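
The move of btrfs_qgroup_inherit() fixes an accounting ordering problem: qgroup counters are only brought up to date when delayed refs are processed, so inheriting them before that point (as the removed code at the top of the function did) could seed the new snapshot from stale numbers. The required ordering, condensed from the hunk:

	/* 1. settle all pending accounting for the source root */
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	/* 2. only now copy qgroup limits/counts into the new snapshot */
	pending->error = btrfs_qgroup_inherit(trans, fs_info,
					      root->root_key.objectid,
					      objectid, pending->inherit);
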
@@ -1598,12 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1598 * them now so that they hinder processing of more delayed refs 1618 * them now so that they hinder processing of more delayed refs
1599 * as little as possible. 1619 * as little as possible.
1600 */ 1620 */
1601 if (ret) {
1602 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1603 return ret;
1604 }
1605
1606 ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1607 if (ret) 1621 if (ret)
1608 return ret; 1622 return ret;
1609 1623
@@ -1984,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1984 } 1998 }
1985 root = list_first_entry(&fs_info->dead_roots, 1999 root = list_first_entry(&fs_info->dead_roots,
1986 struct btrfs_root, root_list); 2000 struct btrfs_root, root_list);
1987 /*
1988 * Make sure root is not involved in send,
1989 * if we fail with first root, we return
1990 * directly rather than continue.
1991 */
1992 spin_lock(&root->root_item_lock);
1993 if (root->send_in_progress) {
1994 spin_unlock(&fs_info->trans_lock);
1995 spin_unlock(&root->root_item_lock);
1996 return 0;
1997 }
1998 spin_unlock(&root->root_item_lock);
1999
2000 list_del_init(&root->root_list); 2001 list_del_init(&root->root_list);
2001 spin_unlock(&fs_info->trans_lock); 2002 spin_unlock(&fs_info->trans_lock);
2002 2003
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b57b924e8e03..7dd558ed0716 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -69,6 +69,7 @@ struct btrfs_transaction {
69#define __TRANS_ATTACH (1U << 10) 69#define __TRANS_ATTACH (1U << 10)
70#define __TRANS_JOIN (1U << 11) 70#define __TRANS_JOIN (1U << 11)
71#define __TRANS_JOIN_NOLOCK (1U << 12) 71#define __TRANS_JOIN_NOLOCK (1U << 12)
72#define __TRANS_DUMMY (1U << 13)
72 73
73#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) 74#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
74#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) 75#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 76928ca97741..a63719cc9578 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
49 goto out; 49 goto out;
50 } 50 }
51 51
52 if (root->ref_cows == 0) 52 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
53 goto out; 53 goto out;
54 54
55 if (btrfs_test_opt(root, SSD)) 55 if (btrfs_test_opt(root, SSD))
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e2f45fc02610..9e1f2cd5e67a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,13 +20,11 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/list_sort.h> 22#include <linux/list_sort.h>
23#include "ctree.h" 23#include "tree-log.h"
24#include "transaction.h"
25#include "disk-io.h" 24#include "disk-io.h"
26#include "locking.h" 25#include "locking.h"
27#include "print-tree.h" 26#include "print-tree.h"
28#include "backref.h" 27#include "backref.h"
29#include "tree-log.h"
30#include "hash.h" 28#include "hash.h"
31 29
32/* magic values for the inode_only field in btrfs_log_inode: 30/* magic values for the inode_only field in btrfs_log_inode:
@@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
144 142
145 mutex_lock(&root->log_mutex); 143 mutex_lock(&root->log_mutex);
146 if (root->log_root) { 144 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == 145 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
148 trans->transid) {
149 ret = -EAGAIN; 146 ret = -EAGAIN;
150 goto out; 147 goto out;
151 } 148 }
152
153 if (!root->log_start_pid) { 149 if (!root->log_start_pid) {
154 root->log_start_pid = current->pid; 150 root->log_start_pid = current->pid;
155 root->log_multiple_pids = false; 151 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
156 } else if (root->log_start_pid != current->pid) { 152 } else if (root->log_start_pid != current->pid) {
157 root->log_multiple_pids = true; 153 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
158 } 154 }
159 155
160 atomic_inc(&root->log_batch); 156 atomic_inc(&root->log_batch);
@@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
181 if (ret) 177 if (ret)
182 goto out; 178 goto out;
183 } 179 }
184 root->log_multiple_pids = false; 180 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
185 root->log_start_pid = current->pid; 181 root->log_start_pid = current->pid;
186 atomic_inc(&root->log_batch); 182 atomic_inc(&root->log_batch);
187 atomic_inc(&root->log_writers); 183 atomic_inc(&root->log_writers);
@@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2500 while (1) { 2496 while (1) {
2501 int batch = atomic_read(&root->log_batch); 2497 int batch = atomic_read(&root->log_batch);
2502 /* when we're on an ssd, just kick the log commit out */ 2498 /* when we're on an ssd, just kick the log commit out */
2503 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2499 if (!btrfs_test_opt(root, SSD) &&
2500 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2504 mutex_unlock(&root->log_mutex); 2501 mutex_unlock(&root->log_mutex);
2505 schedule_timeout_uninterruptible(1); 2502 schedule_timeout_uninterruptible(1);
2506 mutex_lock(&root->log_mutex); 2503 mutex_lock(&root->log_mutex);
@@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2511 } 2508 }
2512 2509
2513 /* bail out if we need to do a full commit */ 2510 /* bail out if we need to do a full commit */
2514 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == 2511 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2515 trans->transid) {
2516 ret = -EAGAIN; 2512 ret = -EAGAIN;
2517 btrfs_free_logged_extents(log, log_transid); 2513 btrfs_free_logged_extents(log, log_transid);
2518 mutex_unlock(&root->log_mutex); 2514 mutex_unlock(&root->log_mutex);
@@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2533 blk_finish_plug(&plug); 2529 blk_finish_plug(&plug);
2534 btrfs_abort_transaction(trans, root, ret); 2530 btrfs_abort_transaction(trans, root, ret);
2535 btrfs_free_logged_extents(log, log_transid); 2531 btrfs_free_logged_extents(log, log_transid);
2536 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2532 btrfs_set_log_full_commit(root->fs_info, trans);
2537 trans->transid;
2538 mutex_unlock(&root->log_mutex); 2533 mutex_unlock(&root->log_mutex);
2539 goto out; 2534 goto out;
2540 } 2535 }
@@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2577 list_del_init(&root_log_ctx.list); 2572 list_del_init(&root_log_ctx.list);
2578 2573
2579 blk_finish_plug(&plug); 2574 blk_finish_plug(&plug);
2580 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2575 btrfs_set_log_full_commit(root->fs_info, trans);
2581 trans->transid; 2576
2582 if (ret != -ENOSPC) { 2577 if (ret != -ENOSPC) {
2583 btrfs_abort_transaction(trans, root, ret); 2578 btrfs_abort_transaction(trans, root, ret);
2584 mutex_unlock(&log_root_tree->log_mutex); 2579 mutex_unlock(&log_root_tree->log_mutex);
@@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2622 * now that we've moved on to the tree of log tree roots, 2617 * now that we've moved on to the tree of log tree roots,
2623 * check the full commit flag again 2618 * check the full commit flag again
2624 */ 2619 */
2625 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == 2620 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2626 trans->transid) {
2627 blk_finish_plug(&plug); 2621 blk_finish_plug(&plug);
2628 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2622 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2629 btrfs_free_logged_extents(log, log_transid); 2623 btrfs_free_logged_extents(log, log_transid);
@@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2637 EXTENT_DIRTY | EXTENT_NEW); 2631 EXTENT_DIRTY | EXTENT_NEW);
2638 blk_finish_plug(&plug); 2632 blk_finish_plug(&plug);
2639 if (ret) { 2633 if (ret) {
2640 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2634 btrfs_set_log_full_commit(root->fs_info, trans);
2641 trans->transid;
2642 btrfs_abort_transaction(trans, root, ret); 2635 btrfs_abort_transaction(trans, root, ret);
2643 btrfs_free_logged_extents(log, log_transid); 2636 btrfs_free_logged_extents(log, log_transid);
2644 mutex_unlock(&log_root_tree->log_mutex); 2637 mutex_unlock(&log_root_tree->log_mutex);
@@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2667 */ 2660 */
2668 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2661 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2669 if (ret) { 2662 if (ret) {
2670 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = 2663 btrfs_set_log_full_commit(root->fs_info, trans);
2671 trans->transid;
2672 btrfs_abort_transaction(trans, root, ret); 2664 btrfs_abort_transaction(trans, root, ret);
2673 goto out_wake_log_root; 2665 goto out_wake_log_root;
2674 } 2666 }
@@ -2886,7 +2878,7 @@ fail:
2886out_unlock: 2878out_unlock:
2887 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2879 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2888 if (ret == -ENOSPC) { 2880 if (ret == -ENOSPC) {
2889 root->fs_info->last_trans_log_full_commit = trans->transid; 2881 btrfs_set_log_full_commit(root->fs_info, trans);
2890 ret = 0; 2882 ret = 0;
2891 } else if (ret < 0) 2883 } else if (ret < 0)
2892 btrfs_abort_transaction(trans, root, ret); 2884 btrfs_abort_transaction(trans, root, ret);
@@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2919 dirid, &index); 2911 dirid, &index);
2920 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2912 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2921 if (ret == -ENOSPC) { 2913 if (ret == -ENOSPC) {
2922 root->fs_info->last_trans_log_full_commit = trans->transid; 2914 btrfs_set_log_full_commit(root->fs_info, trans);
2923 ret = 0; 2915 ret = 0;
2924 } else if (ret < 0 && ret != -ENOENT) 2916 } else if (ret < 0 && ret != -ENOENT)
2925 btrfs_abort_transaction(trans, root, ret); 2917 btrfs_abort_transaction(trans, root, ret);
@@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4130 * make sure any commits to the log are forced 4122 * make sure any commits to the log are forced
4131 * to be full commits 4123 * to be full commits
4132 */ 4124 */
4133 root->fs_info->last_trans_log_full_commit = 4125 btrfs_set_log_full_commit(root->fs_info, trans);
4134 trans->transid;
4135 ret = 1; 4126 ret = 1;
4136 break; 4127 break;
4137 } 4128 }
@@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4177 goto end_no_trans; 4168 goto end_no_trans;
4178 } 4169 }
4179 4170
4171 /*
4172 * The previous transaction commit didn't complete; we need to do
4173 * a full commit ourselves.
4174 */
4180 if (root->fs_info->last_trans_log_full_commit > 4175 if (root->fs_info->last_trans_log_full_commit >
4181 root->fs_info->last_trans_committed) { 4176 root->fs_info->last_trans_committed) {
4182 ret = 1; 4177 ret = 1;
@@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4246end_trans: 4241end_trans:
4247 dput(old_parent); 4242 dput(old_parent);
4248 if (ret < 0) { 4243 if (ret < 0) {
4249 root->fs_info->last_trans_log_full_commit = trans->transid; 4244 btrfs_set_log_full_commit(root->fs_info, trans);
4250 ret = 1; 4245 ret = 1;
4251 } 4246 }
4252 4247
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 91b145fce333..7f5b41bd5373 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
19#ifndef __TREE_LOG_ 19#ifndef __TREE_LOG_
20#define __TREE_LOG_ 20#define __TREE_LOG_
21 21
22#include "ctree.h"
23#include "transaction.h"
24
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ 25/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256 26#define BTRFS_NO_LOG_SYNC 256
24 27
@@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
35 INIT_LIST_HEAD(&ctx->list); 38 INIT_LIST_HEAD(&ctx->list);
36} 39}
37 40
41static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
42 struct btrfs_trans_handle *trans)
43{
44 ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
45}
46
47static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
48 struct btrfs_trans_handle *trans)
49{
50 return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
51 trans->transid;
52}
53
38int btrfs_sync_log(struct btrfs_trans_handle *trans, 54int btrfs_sync_log(struct btrfs_trans_handle *trans,
39 struct btrfs_root *root, struct btrfs_log_ctx *ctx); 55 struct btrfs_root *root, struct btrfs_log_ctx *ctx);
40int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 56int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
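The two new helpers centralize a pattern tree-log.c previously open-coded, and also fix the call sites (visible above) that wrote last_trans_log_full_commit with a plain assignment. The field is read and written without the log mutex held, so every access goes through ACCESS_ONCE(), which in this era of the kernel is simply a volatile cast: it forces the compiler to emit exactly one load or store and prevents the value from being cached in a register across a loop:

    /* include/linux/compiler.h */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    /* so this read is re-fetched from memory on every evaluation */
    if (ACCESS_ONCE(fs_info->last_trans_log_full_commit) == trans->transid)
    	return -EAGAIN;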
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 49d7fab73360..ffeed6d6326f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1452,6 +1452,22 @@ out:
1452 return ret; 1452 return ret;
1453} 1453}
1454 1454
1455/*
1456 * Update ctime/mtime for a given device path.
1457 * Mainly used by ctime/mtime-based probes such as libblkid.
1458 */
1459static void update_dev_time(char *path_name)
1460{
1461 struct file *filp;
1462
1463 filp = filp_open(path_name, O_RDWR, 0);
1464 if (IS_ERR(filp))
1465 return;
1466 file_update_time(filp);
1467 filp_close(filp, NULL);
1468 return;
1469}
1470
1455static int btrfs_rm_dev_item(struct btrfs_root *root, 1471static int btrfs_rm_dev_item(struct btrfs_root *root,
1456 struct btrfs_device *device) 1472 struct btrfs_device *device)
1457{ 1473{
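One caveat in update_dev_time() as posted: filp_open() never returns NULL on failure, it returns an ERR_PTR-encoded errno, so the result must be checked with IS_ERR() (corrected above). The checked pattern looks like this (the pr_warn() is illustrative, not part of the patch):

    filp = filp_open(path_name, O_RDWR, 0);
    if (IS_ERR(filp)) {
    	pr_warn("btrfs: cannot open %s: %ld\n",
    		path_name, PTR_ERR(filp));
    	return;
    }
    file_update_time(filp);	/* bump ctime/mtime so libblkid re-probes */
    filp_close(filp, NULL);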
@@ -1674,11 +1690,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1674 struct btrfs_fs_devices *fs_devices; 1690 struct btrfs_fs_devices *fs_devices;
1675 fs_devices = root->fs_info->fs_devices; 1691 fs_devices = root->fs_info->fs_devices;
1676 while (fs_devices) { 1692 while (fs_devices) {
1677 if (fs_devices->seed == cur_devices) 1693 if (fs_devices->seed == cur_devices) {
1694 fs_devices->seed = cur_devices->seed;
1678 break; 1695 break;
1696 }
1679 fs_devices = fs_devices->seed; 1697 fs_devices = fs_devices->seed;
1680 } 1698 }
1681 fs_devices->seed = cur_devices->seed;
1682 cur_devices->seed = NULL; 1699 cur_devices->seed = NULL;
1683 lock_chunks(root); 1700 lock_chunks(root);
1684 __btrfs_close_devices(cur_devices); 1701 __btrfs_close_devices(cur_devices);
@@ -1694,20 +1711,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1694 * remove it from the devices list and zero out the old super 1711 * remove it from the devices list and zero out the old super
1695 */ 1712 */
1696 if (clear_super && disk_super) { 1713 if (clear_super && disk_super) {
1714 u64 bytenr;
1715 int i;
1716
1697 /* make sure this device isn't detected as part of 1717 /* make sure this device isn't detected as part of
1698 * the FS anymore 1718 * the FS anymore
1699 */ 1719 */
1700 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1720 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1701 set_buffer_dirty(bh); 1721 set_buffer_dirty(bh);
1702 sync_dirty_buffer(bh); 1722 sync_dirty_buffer(bh);
1723
1724 /* Clear the mirror copies of the super block on the disk
1725 * being removed; the 0th copy was handled above and the loop
1726 * below takes care of the rest.
1727 */
1728 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1729 bytenr = btrfs_sb_offset(i);
1730 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1731 i_size_read(bdev->bd_inode))
1732 break;
1733
1734 brelse(bh);
1735 bh = __bread(bdev, bytenr / 4096,
1736 BTRFS_SUPER_INFO_SIZE);
1737 if (!bh)
1738 continue;
1739
1740 disk_super = (struct btrfs_super_block *)bh->b_data;
1741
1742 if (btrfs_super_bytenr(disk_super) != bytenr ||
1743 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1744 continue;
1745 }
1746 memset(&disk_super->magic, 0,
1747 sizeof(disk_super->magic));
1748 set_buffer_dirty(bh);
1749 sync_dirty_buffer(bh);
1750 }
1703 } 1751 }
1704 1752
1705 ret = 0; 1753 ret = 0;
1706 1754
1707 /* Notify udev that device has changed */ 1755 if (bdev) {
1708 if (bdev) 1756 /* Notify udev that device has changed */
1709 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1757 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1710 1758
1759 /* Update ctime/mtime of the device path for libblkid */
1760 update_dev_time(device_path);
1761 }
1762
1711error_brelse: 1763error_brelse:
1712 brelse(bh); 1764 brelse(bh);
1713 if (bdev) 1765 if (bdev)
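For reference, the mirror offsets the loop probes come from btrfs_sb_offset(), which spaces the copies exponentially; the i_size_read() guard skips mirrors that would lie past the end of a small device. A sketch matching the usual on-disk layout (primary at 64KiB, mirrors at 64MiB and 256GiB; treat the constants as assumptions rather than quotes from this patch):

    #define BTRFS_SUPER_INFO_OFFSET   (64 * 1024)
    #define BTRFS_SUPER_MIRROR_MAX    3
    #define BTRFS_SUPER_MIRROR_SHIFT  12

    static inline u64 btrfs_sb_offset(int mirror)
    {
    	u64 start = 16 * 1024;

    	if (mirror)	/* 16KiB << 12 = 64MiB, 16KiB << 24 = 256GiB */
    		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
    	return BTRFS_SUPER_INFO_OFFSET;	/* primary copy */
    }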
@@ -1883,7 +1935,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1883 fs_devices->seeding = 0; 1935 fs_devices->seeding = 0;
1884 fs_devices->num_devices = 0; 1936 fs_devices->num_devices = 0;
1885 fs_devices->open_devices = 0; 1937 fs_devices->open_devices = 0;
1886 fs_devices->total_devices = 0;
1887 fs_devices->seed = seed_devices; 1938 fs_devices->seed = seed_devices;
1888 1939
1889 generate_random_uuid(fs_devices->fsid); 1940 generate_random_uuid(fs_devices->fsid);
@@ -2146,6 +2197,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2146 ret = btrfs_commit_transaction(trans, root); 2197 ret = btrfs_commit_transaction(trans, root);
2147 } 2198 }
2148 2199
2200 /* Update ctime/mtime for libblkid */
2201 update_dev_time(device_path);
2149 return ret; 2202 return ret;
2150 2203
2151error_trans: 2204error_trans:
@@ -2922,6 +2975,16 @@ static int should_balance_chunk(struct btrfs_root *root,
2922 return 0; 2975 return 0;
2923 } 2976 }
2924 2977
2978 /*
2979 * limited by chunk count; it decrements bargs->limit, so it must be the last filter
2980 */
2981 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
2982 if (bargs->limit == 0)
2983 return 0;
2984 else
2985 bargs->limit--;
2986 }
2987
2925 return 1; 2988 return 1;
2926} 2989}
2927 2990
@@ -2944,6 +3007,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2944 int ret; 3007 int ret;
2945 int enospc_errors = 0; 3008 int enospc_errors = 0;
2946 bool counting = true; 3009 bool counting = true;
3010 u64 limit_data = bctl->data.limit;
3011 u64 limit_meta = bctl->meta.limit;
3012 u64 limit_sys = bctl->sys.limit;
2947 3013
2948 /* step one make some room on all the devices */ 3014 /* step one make some room on all the devices */
2949 devices = &fs_info->fs_devices->devices; 3015 devices = &fs_info->fs_devices->devices;
@@ -2982,6 +3048,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2982 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3048 memset(&bctl->stat, 0, sizeof(bctl->stat));
2983 spin_unlock(&fs_info->balance_lock); 3049 spin_unlock(&fs_info->balance_lock);
2984again: 3050again:
3051 if (!counting) {
3052 bctl->data.limit = limit_data;
3053 bctl->meta.limit = limit_meta;
3054 bctl->sys.limit = limit_sys;
3055 }
2985 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3056 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2986 key.offset = (u64)-1; 3057 key.offset = (u64)-1;
2987 key.type = BTRFS_CHUNK_ITEM_KEY; 3058 key.type = BTRFS_CHUNK_ITEM_KEY;
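__btrfs_balance() walks the chunk tree twice: a counting pass (counting == true) that only fills bctl->stat, then the relocation pass. Since should_balance_chunk() decrements bargs->limit on every accepted chunk, without the save/restore added here the counting pass would drain the budget before anything moved. Condensed control flow, for illustration only:

    u64 limit_data = bctl->data.limit;	/* stash the user-supplied budget */
    bool counting = true;
    again:
    	if (!counting)
    		bctl->data.limit = limit_data;	/* refill for the real pass */
    	/* ... iterate chunks; should_balance_chunk() consumes the limit ... */
    	if (counting) {
    		counting = false;
    		goto again;			/* second pass relocates */
    	}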
@@ -3881,7 +3952,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3881 u8 *ptr; 3952 u8 *ptr;
3882 3953
3883 array_size = btrfs_super_sys_array_size(super_copy); 3954 array_size = btrfs_super_sys_array_size(super_copy);
3884 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3955 if (array_size + item_size + sizeof(disk_key)
3956 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3885 return -EFBIG; 3957 return -EFBIG;
3886 3958
3887 ptr = super_copy->sys_chunk_array + array_size; 3959 ptr = super_copy->sys_chunk_array + array_size;
@@ -3986,6 +4058,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3986 btrfs_set_fs_incompat(info, RAID56); 4058 btrfs_set_fs_incompat(info, RAID56);
3987} 4059}
3988 4060
4061#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
4062 - sizeof(struct btrfs_item) \
4063 - sizeof(struct btrfs_chunk)) \
4064 / sizeof(struct btrfs_stripe) + 1)
4065
4066#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
4067 - 2 * sizeof(struct btrfs_disk_key) \
4068 - 2 * sizeof(struct btrfs_chunk)) \
4069 / sizeof(struct btrfs_stripe) + 1)
4070
3989static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4071static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, u64 start, 4072 struct btrfs_root *extent_root, u64 start,
3991 u64 type) 4073 u64 type)
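Both macros compute "space left after fixed headers, divided by the stripe size, plus the one stripe already embedded at the end of struct btrfs_chunk"; the 2x terms in the system-chunk bound leave header room for two sys_chunk_array entries. Plugging in the usual on-disk sizes (disk_key 17 bytes, chunk 48, stripe 32, system array 2048; assumed here, not quoted from this hunk) gives:

    /* BTRFS_MAX_DEVS_SYS_CHUNK, with the assumed sizes above:
     *   (2048 - 2*17 - 2*48) / 32 + 1
     * = (2048 - 34 - 96) / 32 + 1
     * = 1918 / 32 + 1		(integer division -> 59)
     * = 60, i.e. at most 60 stripes/devices in one system chunk
     */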
@@ -4035,6 +4117,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4035 if (type & BTRFS_BLOCK_GROUP_DATA) { 4117 if (type & BTRFS_BLOCK_GROUP_DATA) {
4036 max_stripe_size = 1024 * 1024 * 1024; 4118 max_stripe_size = 1024 * 1024 * 1024;
4037 max_chunk_size = 10 * max_stripe_size; 4119 max_chunk_size = 10 * max_stripe_size;
4120 if (!devs_max)
4121 devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4038 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4122 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4039 /* for larger filesystems, use larger metadata chunks */ 4123 /* for larger filesystems, use larger metadata chunks */
4040 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 4124 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
@@ -4042,11 +4126,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4042 else 4126 else
4043 max_stripe_size = 256 * 1024 * 1024; 4127 max_stripe_size = 256 * 1024 * 1024;
4044 max_chunk_size = max_stripe_size; 4128 max_chunk_size = max_stripe_size;
4129 if (!devs_max)
4130 devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4045 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4131 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4046 max_stripe_size = 32 * 1024 * 1024; 4132 max_stripe_size = 32 * 1024 * 1024;
4047 max_chunk_size = 2 * max_stripe_size; 4133 max_chunk_size = 2 * max_stripe_size;
4134 if (!devs_max)
4135 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4048 } else { 4136 } else {
4049 btrfs_err(info, "invalid chunk type 0x%llx requested\n", 4137 btrfs_err(info, "invalid chunk type 0x%llx requested",
4050 type); 4138 type);
4051 BUG_ON(1); 4139 BUG_ON(1);
4052 } 4140 }
@@ -4294,7 +4382,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4294 4382
4295 if (em->start != chunk_offset || em->len != chunk_size) { 4383 if (em->start != chunk_offset || em->len != chunk_size) {
4296 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" 4384 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
4297 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, 4385 " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
4298 chunk_size, em->start, em->len); 4386 chunk_size, em->start, em->len);
4299 free_extent_map(em); 4387 free_extent_map(em);
4300 return -EINVAL; 4388 return -EINVAL;
@@ -4496,14 +4584,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4496 * and exit, so return 1 so the callers don't try to use other copies. 4584 * and exit, so return 1 so the callers don't try to use other copies.
4497 */ 4585 */
4498 if (!em) { 4586 if (!em) {
4499 btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, 4587 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
4500 logical+len); 4588 logical+len);
4501 return 1; 4589 return 1;
4502 } 4590 }
4503 4591
4504 if (em->start > logical || em->start + em->len < logical) { 4592 if (em->start > logical || em->start + em->len < logical) {
4505 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " 4593 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4506 "%Lu-%Lu\n", logical, logical+len, em->start, 4594 "%Lu-%Lu", logical, logical+len, em->start,
4507 em->start + em->len); 4595 em->start + em->len);
4508 free_extent_map(em); 4596 free_extent_map(em);
4509 return 1; 4597 return 1;
@@ -4684,7 +4772,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4684 4772
4685 if (em->start > logical || em->start + em->len < logical) { 4773 if (em->start > logical || em->start + em->len < logical) {
4686 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " 4774 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
4687 "found %Lu-%Lu\n", logical, em->start, 4775 "found %Lu-%Lu", logical, em->start,
4688 em->start + em->len); 4776 em->start + em->len);
4689 free_extent_map(em); 4777 free_extent_map(em);
4690 return -EINVAL; 4778 return -EINVAL;
@@ -6058,10 +6146,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6058 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6146 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6059 struct btrfs_device *device; 6147 struct btrfs_device *device;
6060 6148
6061 mutex_lock(&fs_devices->device_list_mutex); 6149 while (fs_devices) {
6062 list_for_each_entry(device, &fs_devices->devices, dev_list) 6150 mutex_lock(&fs_devices->device_list_mutex);
6063 device->dev_root = fs_info->dev_root; 6151 list_for_each_entry(device, &fs_devices->devices, dev_list)
6064 mutex_unlock(&fs_devices->device_list_mutex); 6152 device->dev_root = fs_info->dev_root;
6153 mutex_unlock(&fs_devices->device_list_mutex);
6154
6155 fs_devices = fs_devices->seed;
6156 }
6065} 6157}
6066 6158
6067static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6159static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 80754f9dd3df..1a15bbeb65e2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -255,6 +255,7 @@ struct map_lookup {
255#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) 255#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
256#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) 256#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
257#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) 257#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
258#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
258 259
259/* 260/*
260 * Profile changing flags. When SOFT is set we won't relocate chunk if 261 * Profile changing flags. When SOFT is set we won't relocate chunk if
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8e57191950cb..4f196314c0c1 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws,
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "BTRFS: deflateInit failed\n"); 100 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -1; 101 ret = -EIO;
102 goto out; 102 goto out;
103 } 103 }
104 104
@@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,
110 110
111 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 111 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
112 if (out_page == NULL) { 112 if (out_page == NULL) {
113 ret = -1; 113 ret = -ENOMEM;
114 goto out; 114 goto out;
115 } 115 }
116 cpage_out = kmap(out_page); 116 cpage_out = kmap(out_page);
@@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws,
128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -EIO;
132 goto out; 132 goto out;
133 } 133 }
134 134
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
136 if (workspace->def_strm.total_in > 8192 && 136 if (workspace->def_strm.total_in > 8192 &&
137 workspace->def_strm.total_in < 137 workspace->def_strm.total_in <
138 workspace->def_strm.total_out) { 138 workspace->def_strm.total_out) {
139 ret = -1; 139 ret = -EIO;
140 goto out; 140 goto out;
141 } 141 }
142 /* we need another page for writing out. Test this 142 /* we need another page for writing out. Test this
@@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,
147 kunmap(out_page); 147 kunmap(out_page);
148 if (nr_pages == nr_dest_pages) { 148 if (nr_pages == nr_dest_pages) {
149 out_page = NULL; 149 out_page = NULL;
150 ret = -1; 150 ret = -E2BIG;
151 goto out; 151 goto out;
152 } 152 }
153 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 153 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
154 if (out_page == NULL) { 154 if (out_page == NULL) {
155 ret = -1; 155 ret = -ENOMEM;
156 goto out; 156 goto out;
157 } 157 }
158 cpage_out = kmap(out_page); 158 cpage_out = kmap(out_page);
@@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,
188 zlib_deflateEnd(&workspace->def_strm); 188 zlib_deflateEnd(&workspace->def_strm);
189 189
190 if (ret != Z_STREAM_END) { 190 if (ret != Z_STREAM_END) {
191 ret = -1; 191 ret = -EIO;
192 goto out; 192 goto out;
193 } 193 }
194 194
195 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { 195 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
196 ret = -1; 196 ret = -E2BIG;
197 goto out; 197 goto out;
198 } 198 }
199 199
@@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 255 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -1; 256 return -EIO;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
295 } 295 }
296 } 296 }
297 if (ret != Z_STREAM_END) 297 if (ret != Z_STREAM_END)
298 ret = -1; 298 ret = -EIO;
299 else 299 else
300 ret = 0; 300 ret = 0;
301done: 301done:
@@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 339 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -1; 340 return -EIO;
341 } 341 }
342 342
343 while (bytes_left > 0) { 343 while (bytes_left > 0) {
@@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
354 total_out = workspace->inf_strm.total_out; 354 total_out = workspace->inf_strm.total_out;
355 355
356 if (total_out == buf_start) { 356 if (total_out == buf_start) {
357 ret = -1; 357 ret = -EIO;
358 break; 358 break;
359 } 359 }
360 360
@@ -382,7 +382,7 @@ next:
382 } 382 }
383 383
384 if (ret != Z_STREAM_END && bytes_left != 0) 384 if (ret != Z_STREAM_END && bytes_left != 0)
385 ret = -1; 385 ret = -EIO;
386 else 386 else
387 ret = 0; 387 ret = 0;
388 388
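Net effect of the zlib changes: callers now get a meaningful errno instead of a blanket -1. -ENOMEM marks a failed alloc_page(), -E2BIG marks "the data did not shrink" (or ran out of destination pages), which the compression path can treat as "store this extent uncompressed", and -EIO marks a genuine deflate/inflate stream error. A sketch of the convention as a helper, not actual kernel code:

    /* sketch: classify a compression attempt the way this file now does */
    static int zlib_errno(int z_ret, bool alloc_failed, bool did_not_shrink)
    {
    	if (alloc_failed)
    		return -ENOMEM;		/* page allocation failed */
    	if (did_not_shrink)
    		return -E2BIG;		/* fall back to uncompressed data */
    	if (z_ret != Z_STREAM_END)
    		return -EIO;		/* corrupt or failed zlib stream */
    	return 0;
    }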