author	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-14 16:35:29 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-14 16:35:29 -0500
commit	5cea7647e64657138138a3794ae172ee0fc175da (patch)
tree	38adc54cba508db574e190e9d9aa601c36a8fd7c
parent	808eb24e0e0939b487bf90e3888a9636f1c83acb (diff)
parent	d28e649a5c58b779b303c252c66ee84a0f2c3b32 (diff)
Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "There are some new user features and the usual load of invisible
  enhancements or cleanups.

  New features:

   - extend mount options to specify zlib compression level, -o
     compress=zlib:9

   - v2 of ioctl "extent to inode mapping", addressing a usecase where
     we want to retrieve more but inaccurate results and do the
     postprocessing in userspace, aiding defragmentation or
     deduplication tools

   - populate compression heuristics logic, do data sampling and try to
     guess compressibility by: looking for repeated patterns, counting
     unique byte values and distribution, calculating Shannon entropy;
     this will need more benchmarking and possibly fine tuning, but the
     base should be good enough

   - enable indexing for btrfs as lower filesystem in overlayfs

   - speedup page cache readahead during send on large files

  Internal enhancements:

   - more sanity checks of b-tree items when reading them from disk

   - more EINVAL/EUCLEAN fixups, missing BLK_STS_* conversion, other
     errno or error handling fixes

   - remove some homegrown IO-related logic, that's been obsoleted by
     core block layer changes (batching, plug/unplug, own counters)

   - add ref-verify, optional debugging feature to verify extent
     reference accounting

   - simplify code handling outstanding extents, make it more clear
     where and how the accounting is done

   - make delalloc reservations per-inode, simplify the code and make
     the logic more straightforward

   - extensive cleanup of delayed refs code

  Notable fixes:

   - fix send ioctl on 32bit with 64bit kernel"

* 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (102 commits)
  btrfs: Fix bug for misused dev_t when lookup in dev state hash table.
  Btrfs: heuristic: add Shannon entropy calculation
  Btrfs: heuristic: add byte core set calculation
  Btrfs: heuristic: add byte set calculation
  Btrfs: heuristic: add detection of repeated data patterns
  Btrfs: heuristic: implement sampling logic
  Btrfs: heuristic: add bucket and sample counters and other defines
  Btrfs: compression: separate heuristic/compression workspaces
  btrfs: move btrfs_truncate_block out of trans handle
  btrfs: don't call btrfs_start_delalloc_roots in flushoncommit
  btrfs: track refs in a rb_tree instead of a list
  btrfs: add a comp_refs() helper
  btrfs: switch args for comp_*_refs
  btrfs: make the delalloc block rsv per inode
  btrfs: add tracepoints for outstanding extents mods
  Btrfs: rework outstanding_extents
  btrfs: increase output size for LOGICAL_INO_V2 ioctl
  btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
  btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
  btrfs: send: remove unused code
  ...
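The new mount option accepts the form compress=zlib:N with N from 1 to 9; a bare "zlib" falls back to the algorithm default. A small userspace sketch of the exact parse rule implemented by btrfs_compress_str2level() further down in this diff (the harness around it is illustrative, not part of the series):

#include <stdio.h>
#include <string.h>

/* Mirrors btrfs_compress_str2level() from fs/btrfs/compression.c below:
 * "zlib" alone -> 0 (algorithm default), "zlib:1".."zlib:9" -> 1..9,
 * anything else -> 0. */
static unsigned int compress_str2level(const char *str)
{
	if (strncmp(str, "zlib", 4) != 0)
		return 0;

	/* Accepted form: zlib:1 up to zlib:9, nothing after the digit */
	if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
		return str[5] - '0';

	return 0;
}

int main(void)
{
	/* prints "0 9 0": bare name, max level, and a rejected two-digit level */
	printf("%u %u %u\n", compress_str2level("zlib"),
	       compress_str2level("zlib:9"), compress_str2level("zlib:10"));
	return 0;
}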
-rw-r--r--	fs/btrfs/Kconfig	11
-rw-r--r--	fs/btrfs/Makefile	3
-rw-r--r--	fs/btrfs/async-thread.c	2
-rw-r--r--	fs/btrfs/backref.c	72
-rw-r--r--	fs/btrfs/backref.h	8
-rw-r--r--	fs/btrfs/btrfs_inode.h	29
-rw-r--r--	fs/btrfs/check-integrity.c	8
-rw-r--r--	fs/btrfs/compression.c	493
-rw-r--r--	fs/btrfs/compression.h	6
-rw-r--r--	fs/btrfs/ctree.c	17
-rw-r--r--	fs/btrfs/ctree.h	30
-rw-r--r--	fs/btrfs/delayed-inode.c	46
-rw-r--r--	fs/btrfs/delayed-ref.c	296
-rw-r--r--	fs/btrfs/delayed-ref.h	54
-rw-r--r--	fs/btrfs/disk-io.c	227
-rw-r--r--	fs/btrfs/extent-tree.c	829
-rw-r--r--	fs/btrfs/extent_io.c	44
-rw-r--r--	fs/btrfs/extent_io.h	1
-rw-r--r--	fs/btrfs/file.c	50
-rw-r--r--	fs/btrfs/free-space-tree.c	4
-rw-r--r--	fs/btrfs/inode-map.c	3
-rw-r--r--	fs/btrfs/inode.c	327
-rw-r--r--	fs/btrfs/ioctl.c	156
-rw-r--r--	fs/btrfs/lzo.c	5
-rw-r--r--	fs/btrfs/ordered-data.c	21
-rw-r--r--	fs/btrfs/qgroup.c	8
-rw-r--r--	fs/btrfs/raid56.c	30
-rw-r--r--	fs/btrfs/ref-verify.c	1031
-rw-r--r--	fs/btrfs/ref-verify.h	62
-rw-r--r--	fs/btrfs/relocation.c	17
-rw-r--r--	fs/btrfs/root-tree.c	4
-rw-r--r--	fs/btrfs/scrub.c	22
-rw-r--r--	fs/btrfs/send.c	74
-rw-r--r--	fs/btrfs/send.h	2
-rw-r--r--	fs/btrfs/super.c	37
-rw-r--r--	fs/btrfs/sysfs.c	63
-rw-r--r--	fs/btrfs/sysfs.h	26
-rw-r--r--	fs/btrfs/tests/free-space-tree-tests.c	3
-rw-r--r--	fs/btrfs/tests/inode-tests.c	20
-rw-r--r--	fs/btrfs/tests/qgroup-tests.c	30
-rw-r--r--	fs/btrfs/transaction.c	16
-rw-r--r--	fs/btrfs/tree-checker.c	425
-rw-r--r--	fs/btrfs/tree-checker.h	26
-rw-r--r--	fs/btrfs/tree-log.c	34
-rw-r--r--	fs/btrfs/volumes.c	168
-rw-r--r--	fs/btrfs/volumes.h	2
-rw-r--r--	fs/btrfs/zlib.c	15
-rw-r--r--	fs/btrfs/zstd.c	5
-rw-r--r--	include/trace/events/btrfs.h	41
-rw-r--r--	include/uapi/linux/btrfs.h	8
-rw-r--r--	include/uapi/linux/btrfs_tree.h	1
51 files changed, 3356 insertions, 1556 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a26c63b4ad68..2e558227931a 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -91,3 +91,14 @@ config BTRFS_ASSERT
 	  any of the assertions trip. This is meant for btrfs developers only.
 
 	  If unsure, say N.
+
+config BTRFS_FS_REF_VERIFY
+	bool "Btrfs with the ref verify tool compiled in"
+	depends on BTRFS_FS
+	default n
+	help
+	  Enable run-time extent reference verification instrumentation. This
+	  is meant to be used by btrfs developers for tracking down extent
+	  reference problems or verifying they didn't break something.
+
+	  If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f2cd9dedb037..6fe881d5cb38 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,10 +10,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-	   uuid-tree.o props.o hash.o free-space-tree.o
+	   uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index e00c8a9fd5bb..d5540749f0e5 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -67,7 +67,7 @@ struct btrfs_workqueue {
 static void normal_work_helper(struct btrfs_work *work);
 
 #define BTRFS_WORK_HELPER(name)					\
-void btrfs_##name(struct work_struct *arg)				\
+noinline_for_stack void btrfs_##name(struct work_struct *arg)		\
 {									\
 	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
 					       normal_work);		\
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b517ef1477ea..7d0dc100a09a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -40,12 +40,14 @@ static int check_extent_in_eb(const struct btrfs_key *key,
 			      const struct extent_buffer *eb,
 			      const struct btrfs_file_extent_item *fi,
 			      u64 extent_item_pos,
-			      struct extent_inode_elem **eie)
+			      struct extent_inode_elem **eie,
+			      bool ignore_offset)
 {
 	u64 offset = 0;
 	struct extent_inode_elem *e;
 
-	if (!btrfs_file_extent_compression(eb, fi) &&
+	if (!ignore_offset &&
+	    !btrfs_file_extent_compression(eb, fi) &&
 	    !btrfs_file_extent_encryption(eb, fi) &&
 	    !btrfs_file_extent_other_encoding(eb, fi)) {
 		u64 data_offset;
@@ -84,7 +86,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie)
 
 static int find_extent_in_eb(const struct extent_buffer *eb,
 			     u64 wanted_disk_byte, u64 extent_item_pos,
-			     struct extent_inode_elem **eie)
+			     struct extent_inode_elem **eie,
+			     bool ignore_offset)
 {
 	u64 disk_byte;
 	struct btrfs_key key;
@@ -113,7 +116,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
 		if (disk_byte != wanted_disk_byte)
 			continue;
 
-		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset);
 		if (ret < 0)
 			return ret;
 	}
@@ -419,7 +422,7 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct prelim_ref *ref,
 			   int level, u64 time_seq, const u64 *extent_item_pos,
-			   u64 total_refs)
+			   u64 total_refs, bool ignore_offset)
 {
 	int ret = 0;
 	int slot;
@@ -472,7 +475,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			if (extent_item_pos) {
 				ret = check_extent_in_eb(&key, eb, fi,
 						*extent_item_pos,
-						&eie);
+						&eie, ignore_offset);
 				if (ret < 0)
 					break;
 			}
@@ -510,7 +513,8 @@ next:
 static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				struct btrfs_path *path, u64 time_seq,
 				struct prelim_ref *ref, struct ulist *parents,
-				const u64 *extent_item_pos, u64 total_refs)
+				const u64 *extent_item_pos, u64 total_refs,
+				bool ignore_offset)
 {
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
@@ -581,7 +585,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = add_all_parents(root, path, parents, ref, level, time_seq,
-			      extent_item_pos, total_refs);
+			      extent_item_pos, total_refs, ignore_offset);
 out:
 	path->lowest_level = 0;
 	btrfs_release_path(path);
@@ -616,7 +620,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				 struct btrfs_path *path, u64 time_seq,
 				 struct preftrees *preftrees,
 				 const u64 *extent_item_pos, u64 total_refs,
-				 struct share_check *sc)
+				 struct share_check *sc, bool ignore_offset)
 {
 	int err;
 	int ret = 0;
@@ -661,7 +665,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		}
 		err = resolve_indirect_ref(fs_info, path, time_seq, ref,
 					   parents, extent_item_pos,
-					   total_refs);
+					   total_refs, ignore_offset);
 		/*
 		 * we can only tolerate ENOENT,otherwise,we should catch error
 		 * and return directly.
@@ -769,6 +773,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 	struct btrfs_key key;
 	struct btrfs_key tmp_op_key;
 	struct btrfs_key *op_key = NULL;
+	struct rb_node *n;
 	int count;
 	int ret = 0;
 
@@ -778,7 +783,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 	}
 
 	spin_lock(&head->lock);
-	list_for_each_entry(node, &head->ref_list, list) {
+	for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+		node = rb_entry(n, struct btrfs_delayed_ref_node,
+				ref_node);
 		if (node->seq > seq)
 			continue;
 
@@ -1107,13 +1114,17 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
  *
  * Otherwise this returns 0 for success and <0 for an error.
  *
+ * If ignore_offset is set to false, only extent refs whose offsets match
+ * extent_item_pos are returned.  If true, every extent ref is returned
+ * and extent_item_pos is ignored.
+ *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
 			     struct ulist *roots, const u64 *extent_item_pos,
-			     struct share_check *sc)
+			     struct share_check *sc, bool ignore_offset)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -1178,7 +1189,7 @@ again:
 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 	if (head) {
 		if (!mutex_trylock(&head->mutex)) {
-			refcount_inc(&head->node.refs);
+			refcount_inc(&head->refs);
 			spin_unlock(&delayed_refs->lock);
 
 			btrfs_release_path(path);
@@ -1189,7 +1200,7 @@ again:
 			 */
 			mutex_lock(&head->mutex);
 			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(&head->node);
+			btrfs_put_delayed_ref_head(head);
 			goto again;
 		}
 		spin_unlock(&delayed_refs->lock);
@@ -1235,7 +1246,7 @@ again:
 	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
 
 	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
-				    extent_item_pos, total_refs, sc);
+				    extent_item_pos, total_refs, sc, ignore_offset);
 	if (ret)
 		goto out;
 
@@ -1282,7 +1293,7 @@ again:
 				btrfs_tree_read_lock(eb);
 				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 				ret = find_extent_in_eb(eb, bytenr,
-							*extent_item_pos, &eie);
+							*extent_item_pos, &eie, ignore_offset);
 				btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
@@ -1350,7 +1361,7 @@ static void free_leaf_list(struct ulist *blocks)
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
 				u64 time_seq, struct ulist **leafs,
-				const u64 *extent_item_pos)
+				const u64 *extent_item_pos, bool ignore_offset)
 {
 	int ret;
 
@@ -1359,7 +1370,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
-				*leafs, NULL, extent_item_pos, NULL);
+				*leafs, NULL, extent_item_pos, NULL, ignore_offset);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1383,7 +1394,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 				     struct btrfs_fs_info *fs_info, u64 bytenr,
-				     u64 time_seq, struct ulist **roots)
+				     u64 time_seq, struct ulist **roots,
+				     bool ignore_offset)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
@@ -1402,7 +1414,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
-					tmp, *roots, NULL, NULL);
+					tmp, *roots, NULL, NULL, ignore_offset);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1421,14 +1433,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 time_seq, struct ulist **roots)
+			 u64 time_seq, struct ulist **roots,
+			 bool ignore_offset)
 {
 	int ret;
 
 	if (!trans)
 		down_read(&fs_info->commit_root_sem);
 	ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
-					time_seq, roots);
+					time_seq, roots, ignore_offset);
 	if (!trans)
 		up_read(&fs_info->commit_root_sem);
 	return ret;
@@ -1483,7 +1496,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
-					roots, NULL, &shared);
+					roots, NULL, &shared, false);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;
@@ -1877,7 +1890,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				u64 extent_item_objectid, u64 extent_item_pos,
 				int search_commit_root,
-				iterate_extent_inodes_t *iterate, void *ctx)
+				iterate_extent_inodes_t *iterate, void *ctx,
+				bool ignore_offset)
 {
 	int ret;
 	struct btrfs_trans_handle *trans = NULL;
@@ -1903,14 +1917,15 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
 				   tree_mod_seq_elem.seq, &refs,
-				   &extent_item_pos);
+				   &extent_item_pos, ignore_offset);
 	if (ret)
 		goto out;
 
 	ULIST_ITER_INIT(&ref_uiter);
 	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
 		ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
-						tree_mod_seq_elem.seq, &roots);
+						tree_mod_seq_elem.seq, &roots,
+						ignore_offset);
 		if (ret)
 			break;
 		ULIST_ITER_INIT(&root_uiter);
@@ -1943,7 +1958,8 @@ out:
 
 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 				struct btrfs_path *path,
-				iterate_extent_inodes_t *iterate, void *ctx)
+				iterate_extent_inodes_t *iterate, void *ctx,
+				bool ignore_offset)
 {
 	int ret;
 	u64 extent_item_pos;
@@ -1961,7 +1977,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(fs_info, found_key.objectid,
 					extent_item_pos, search_commit_root,
-					iterate, ctx);
+					iterate, ctx, ignore_offset);
 
 	return ret;
 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e410335841aa..0c2fab8514ff 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -43,17 +43,19 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				u64 extent_item_objectid,
 				u64 extent_offset, int search_commit_root,
-				iterate_extent_inodes_t *iterate, void *ctx);
+				iterate_extent_inodes_t *iterate, void *ctx,
+				bool ignore_offset);
 
 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 				struct btrfs_path *path,
-				iterate_extent_inodes_t *iterate, void *ctx);
+				iterate_extent_inodes_t *iterate, void *ctx,
+				bool ignore_offset);
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 time_seq, struct ulist **roots);
+			 u64 time_seq, struct ulist **roots, bool ignore_offset);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,
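For context on how the threaded-through flag is meant to be used: with ignore_offset == true (the LOGICAL_INO_V2 behaviour), every extent ref is reported regardless of extent_item_pos; with false, the old exact-offset semantics apply. A minimal kernel-side sketch of a caller, using the signatures declared above and assuming the iterate_extent_inodes_t callback shape (inum, offset, root, ctx); the callback and wrapper names are illustrative, not hunks from this series:

/* Illustrative only; not part of this patch series. */
static int print_ref(u64 inum, u64 offset, u64 root, void *ctx)
{
	pr_info("extent ref: ino %llu offset %llu root %llu\n",
		inum, offset, root);
	return 0;
}

static int example_list_refs(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path, u64 logical, bool all_refs)
{
	/* all_refs maps directly onto the new ignore_offset parameter */
	return iterate_inodes_from_logical(logical, fs_info, path,
					   print_ref, NULL, all_refs);
}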
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index eccadb5f62a5..63f0ccc92a71 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,14 +36,13 @@
 #define BTRFS_INODE_ORPHAN_META_RESERVED	1
 #define BTRFS_INODE_DUMMY			2
 #define BTRFS_INODE_IN_DEFRAG			3
-#define BTRFS_INODE_DELALLOC_META_RESERVED	4
-#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
-#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
-#define BTRFS_INODE_NEEDS_FULL_SYNC		7
-#define BTRFS_INODE_COPY_EVERYTHING		8
-#define BTRFS_INODE_IN_DELALLOC_LIST		9
-#define BTRFS_INODE_READDIO_NEED_LOCK		10
-#define BTRFS_INODE_HAS_PROPS			11
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
+#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
+#define BTRFS_INODE_NEEDS_FULL_SYNC		6
+#define BTRFS_INODE_COPY_EVERYTHING		7
+#define BTRFS_INODE_IN_DELALLOC_LIST		8
+#define BTRFS_INODE_READDIO_NEED_LOCK		9
+#define BTRFS_INODE_HAS_PROPS			10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -176,7 +175,8 @@ struct btrfs_inode {
 	 * of extent items we've reserved metadata for.
 	 */
 	unsigned outstanding_extents;
-	unsigned reserved_extents;
+
+	struct btrfs_block_rsv block_rsv;
 
 	/*
 	 * Cached values of inode properties
@@ -267,6 +267,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
 		return false;
 }
 
+static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
+						 int mod)
+{
+	lockdep_assert_held(&inode->lock);
+	inode->outstanding_extents += mod;
+	if (btrfs_is_free_space_inode(inode))
+		return;
+	trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
+						  mod);
+}
+
 static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 {
 	int ret = 0;
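The lockdep assertion in btrfs_mod_outstanding_extents() pins down the calling convention; a minimal sketch of the expected pattern (illustrative, not a hunk from this series):

/* Illustrative only: callers must hold the per-inode spinlock. */
static void example_add_one_extent(struct btrfs_inode *inode)
{
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);	/* one more reserved extent item */
	spin_unlock(&inode->lock);
}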
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d5a9b51f0d7..7d51b5a5b505 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add(
 		struct btrfsic_dev_state_hashtable *h)
 {
 	const unsigned int hashval =
-	    (((unsigned int)((uintptr_t)ds->bdev)) &
+	    (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
 	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
 
 	list_add(&ds->collision_resolving_node, h->table + hashval);
@@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 	mutex_lock(&btrfsic_mutex);
 	/* since btrfsic_submit_bio() is also called before
 	 * btrfsic_mount(), this might return NULL */
-	dev_state = btrfsic_dev_state_lookup(bio_dev(bio));
+	dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
 	if (NULL != dev_state &&
 	    (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
 		unsigned int i = 0;
@@ -2913,7 +2913,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
 	state = kvzalloc(sizeof(*state), GFP_KERNEL);
 	if (!state) {
 		pr_info("btrfs check-integrity: allocation failed!\n");
-		return -1;
+		return -ENOMEM;
 	}
 
 	if (!btrfsic_is_initialized) {
@@ -2945,7 +2945,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
 		if (NULL == ds) {
 			pr_info("btrfs check-integrity: kmalloc() failed!\n");
 			mutex_unlock(&btrfsic_mutex);
-			return -1;
+			return -ENOMEM;
 		}
 		ds->bdev = device->bdev;
 		ds->state = state;
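The two dev_state hunks above are halves of one fix: the table is now keyed by the device number on both the insert side (ds->bdev->bd_dev) and the lookup side (bio_dev(bio) + bio->bi_partno); previously the insert hashed the block_device pointer, so lookups by dev_t landed in the wrong bucket. A tiny userspace sketch of the invariant (names and values are illustrative):

#include <stdint.h>
#include <stdio.h>

#define DEV2STATE_HASHTABLE_SIZE 256	/* a power of two, so masking works */

/* Insert and lookup must hash the same key; here that key is a dev_t-like
 * device number, never a pointer. */
static unsigned int hash_devt(uint32_t devt)
{
	return devt & (DEV2STATE_HASHTABLE_SIZE - 1);
}

int main(void)
{
	uint32_t devt = (8 << 8) | 1;	/* roughly MKDEV(8, 1) */

	printf("dev 0x%x -> bucket %u\n", devt, hash_devt(devt));
	return 0;
}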
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 280384bf34f1..b35ce16b3df3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,6 +33,8 @@
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
+#include <linux/sort.h>
+#include <linux/log2.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -255,7 +257,8 @@ static void end_compressed_bio_write(struct bio *bio)
 						 cb->start,
 						 cb->start + cb->len - 1,
 						 NULL,
-						 bio->bi_status ? 0 : 1);
+						 bio->bi_status ?
+						 BLK_STS_OK : BLK_STS_NOTSUPP);
 	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb);
@@ -706,7 +709,86 @@ out:
 	return ret;
 }
 
-static struct {
+/*
+ * Heuristic uses systematic sampling to collect data from the input data
+ * range, the logic can be tuned by the following constants:
+ *
+ * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
+ * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
+ */
+#define SAMPLING_READ_SIZE	(16)
+#define SAMPLING_INTERVAL	(256)
+
+/*
+ * For statistical analysis of the input data we consider bytes that form a
+ * Galois Field of 256 objects. Each object has an attribute count, ie. how
+ * many times the object appeared in the sample.
+ */
+#define BUCKET_SIZE		(256)
+
+/*
+ * The size of the sample is based on a statistical sampling rule of thumb.
+ * The common way is to perform sampling tests as long as the number of
+ * elements in each cell is at least 5.
+ *
+ * Instead of 5, we choose 32 to obtain more accurate results.
+ * If the data contain the maximum number of symbols, which is 256, we obtain a
+ * sample size bound by 8192.
+ *
+ * For a sample of at most 8KB of data per data range: 16 consecutive bytes
+ * from up to 512 locations.
+ */
+#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED * \
+				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
+
+struct bucket_item {
+	u32 count;
+};
+
+struct heuristic_ws {
+	/* Partial copy of input data */
+	u8 *sample;
+	u32 sample_size;
+	/* Buckets store counters for each byte value */
+	struct bucket_item *bucket;
+	struct list_head list;
+};
+
+static void free_heuristic_ws(struct list_head *ws)
+{
+	struct heuristic_ws *workspace;
+
+	workspace = list_entry(ws, struct heuristic_ws, list);
+
+	kvfree(workspace->sample);
+	kfree(workspace->bucket);
+	kfree(workspace);
+}
+
+static struct list_head *alloc_heuristic_ws(void)
+{
+	struct heuristic_ws *ws;
+
+	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
+	if (!ws)
+		return ERR_PTR(-ENOMEM);
+
+	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
+	if (!ws->sample)
+		goto fail;
+
+	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
+	if (!ws->bucket)
+		goto fail;
+
+	INIT_LIST_HEAD(&ws->list);
+	return &ws->list;
+fail:
+	free_heuristic_ws(&ws->list);
+	return ERR_PTR(-ENOMEM);
+}
+
+struct workspaces_list {
 	struct list_head idle_ws;
 	spinlock_t ws_lock;
 	/* Number of free workspaces */
@@ -715,7 +797,11 @@ static struct {
 	atomic_t total_ws;
 	/* Waiters for a free workspace */
 	wait_queue_head_t ws_wait;
-} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
+};
+
+static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
+
+static struct workspaces_list btrfs_heuristic_ws;
 
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
@@ -725,11 +811,25 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 
 void __init btrfs_init_compress(void)
 {
+	struct list_head *workspace;
 	int i;
 
-	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		struct list_head *workspace;
+	INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
+	spin_lock_init(&btrfs_heuristic_ws.ws_lock);
+	atomic_set(&btrfs_heuristic_ws.total_ws, 0);
+	init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+
+	workspace = alloc_heuristic_ws();
+	if (IS_ERR(workspace)) {
+		pr_warn(
+	"BTRFS: cannot preallocate heuristic workspace, will try later\n");
+	} else {
+		atomic_set(&btrfs_heuristic_ws.total_ws, 1);
+		btrfs_heuristic_ws.free_ws = 1;
+		list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+	}
 
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
 		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
 		atomic_set(&btrfs_comp_ws[i].total_ws, 0);
@@ -756,18 +856,32 @@ void __init btrfs_init_compress(void)
  * Preallocation makes a forward progress guarantees and we do not return
  * errors.
  */
-static struct list_head *find_workspace(int type)
+static struct list_head *__find_workspace(int type, bool heuristic)
 {
 	struct list_head *workspace;
 	int cpus = num_online_cpus();
 	int idx = type - 1;
 	unsigned nofs_flag;
+	struct list_head *idle_ws;
+	spinlock_t *ws_lock;
+	atomic_t *total_ws;
+	wait_queue_head_t *ws_wait;
+	int *free_ws;
+
+	if (heuristic) {
+		idle_ws = &btrfs_heuristic_ws.idle_ws;
+		ws_lock = &btrfs_heuristic_ws.ws_lock;
+		total_ws = &btrfs_heuristic_ws.total_ws;
+		ws_wait = &btrfs_heuristic_ws.ws_wait;
+		free_ws = &btrfs_heuristic_ws.free_ws;
+	} else {
+		idle_ws = &btrfs_comp_ws[idx].idle_ws;
+		ws_lock = &btrfs_comp_ws[idx].ws_lock;
+		total_ws = &btrfs_comp_ws[idx].total_ws;
+		ws_wait = &btrfs_comp_ws[idx].ws_wait;
+		free_ws = &btrfs_comp_ws[idx].free_ws;
+	}
 
-	struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
-	spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
-	atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
-	wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
-	int *free_ws = &btrfs_comp_ws[idx].free_ws;
 again:
 	spin_lock(ws_lock);
 	if (!list_empty(idle_ws)) {
@@ -797,7 +911,10 @@ again:
 	 * context of btrfs_compress_bio/btrfs_compress_pages
 	 */
 	nofs_flag = memalloc_nofs_save();
-	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (heuristic)
+		workspace = alloc_heuristic_ws();
+	else
+		workspace = btrfs_compress_op[idx]->alloc_workspace();
 	memalloc_nofs_restore(nofs_flag);
 
 	if (IS_ERR(workspace)) {
@@ -828,18 +945,38 @@ again:
 	return workspace;
 }
 
+static struct list_head *find_workspace(int type)
+{
+	return __find_workspace(type, false);
+}
+
 /*
  * put a workspace struct back on the list or free it if we have enough
  * idle ones sitting around
  */
-static void free_workspace(int type, struct list_head *workspace)
+static void __free_workspace(int type, struct list_head *workspace,
+			     bool heuristic)
 {
 	int idx = type - 1;
-	struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
-	spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
-	atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
-	wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
-	int *free_ws = &btrfs_comp_ws[idx].free_ws;
+	struct list_head *idle_ws;
+	spinlock_t *ws_lock;
+	atomic_t *total_ws;
+	wait_queue_head_t *ws_wait;
+	int *free_ws;
+
+	if (heuristic) {
+		idle_ws = &btrfs_heuristic_ws.idle_ws;
+		ws_lock = &btrfs_heuristic_ws.ws_lock;
+		total_ws = &btrfs_heuristic_ws.total_ws;
+		ws_wait = &btrfs_heuristic_ws.ws_wait;
+		free_ws = &btrfs_heuristic_ws.free_ws;
+	} else {
+		idle_ws = &btrfs_comp_ws[idx].idle_ws;
+		ws_lock = &btrfs_comp_ws[idx].ws_lock;
+		total_ws = &btrfs_comp_ws[idx].total_ws;
+		ws_wait = &btrfs_comp_ws[idx].ws_wait;
+		free_ws = &btrfs_comp_ws[idx].free_ws;
+	}
 
 	spin_lock(ws_lock);
 	if (*free_ws <= num_online_cpus()) {
@@ -850,7 +987,10 @@ static void free_workspace(int type, struct list_head *workspace)
 	}
 	spin_unlock(ws_lock);
 
-	btrfs_compress_op[idx]->free_workspace(workspace);
+	if (heuristic)
+		free_heuristic_ws(workspace);
+	else
+		btrfs_compress_op[idx]->free_workspace(workspace);
 	atomic_dec(total_ws);
 wake:
 	/*
@@ -861,6 +1001,11 @@ wake:
 	wake_up(ws_wait);
 }
 
+static void free_workspace(int type, struct list_head *ws)
+{
+	return __free_workspace(type, ws, false);
+}
+
 /*
  * cleanup function for module exit
  */
@@ -869,6 +1014,13 @@ static void free_workspaces(void)
 	struct list_head *workspace;
 	int i;
 
+	while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
+		workspace = btrfs_heuristic_ws.idle_ws.next;
+		list_del(workspace);
+		free_heuristic_ws(workspace);
+		atomic_dec(&btrfs_heuristic_ws.total_ws);
+	}
+
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 		while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
 			workspace = btrfs_comp_ws[i].idle_ws.next;
@@ -883,6 +1035,11 @@ static void free_workspaces(void)
  * Given an address space and start and length, compress the bytes into @pages
  * that are allocated on demand.
  *
+ * @type_level is encoded algorithm and level, where level 0 means whatever
+ * default the algorithm chooses and is opaque here;
+ * - compression algo are 0-3
+ * - the level are bits 4-7
+ *
  * @out_pages is an in/out parameter, holds maximum number of pages to allocate
  *   and returns number of actually allocated pages
  *
@@ -897,7 +1054,7 @@ static void free_workspaces(void)
  * @max_out tells us the max number of bytes that we're allowed to
  * stuff into pages
  */
-int btrfs_compress_pages(int type, struct address_space *mapping,
+int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
 			 u64 start, struct page **pages,
 			 unsigned long *out_pages,
 			 unsigned long *total_in,
@@ -905,9 +1062,11 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
 {
 	struct list_head *workspace;
 	int ret;
+	int type = type_level & 0xF;
 
 	workspace = find_workspace(type);
 
+	btrfs_compress_op[type - 1]->set_level(workspace, type_level);
 	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 						      start, pages,
 						      out_pages,
@@ -1066,6 +1225,211 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
 }
 
 /*
+ * Shannon Entropy calculation
+ *
+ * Pure byte distribution analysis fails to determine compressiability of data.
+ * Try calculating entropy to estimate the average minimum number of bits
+ * needed to encode the sampled data.
+ *
+ * For convenience, return the percentage of needed bits, instead of amount of
+ * bits directly.
+ *
+ * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
+ *			    and can be compressible with high probability
+ *
+ * @ENTROPY_LVL_HIGH - data are not compressible with high probability
+ *
+ * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
+ */
+#define ENTROPY_LVL_ACEPTABLE		(65)
+#define ENTROPY_LVL_HIGH		(80)
+
+/*
+ * For increasead precision in shannon_entropy calculation,
+ * let's do pow(n, M) to save more digits after comma:
+ *
+ * - maximum int bit length is 64
+ * - ilog2(MAX_SAMPLE_SIZE)	-> 13
+ * - 13 * 4 = 52 < 64		-> M = 4
+ *
+ * So use pow(n, 4).
+ */
+static inline u32 ilog2_w(u64 n)
+{
+	return ilog2(n * n * n * n);
+}
+
+static u32 shannon_entropy(struct heuristic_ws *ws)
+{
+	const u32 entropy_max = 8 * ilog2_w(2);
+	u32 entropy_sum = 0;
+	u32 p, p_base, sz_base;
+	u32 i;
+
+	sz_base = ilog2_w(ws->sample_size);
+	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
+		p = ws->bucket[i].count;
+		p_base = ilog2_w(p);
+		entropy_sum += p * (sz_base - p_base);
+	}
+
+	entropy_sum /= ws->sample_size;
+	return entropy_sum * 100 / entropy_max;
+}
+
+/* Compare buckets by size, ascending */
+static int bucket_comp_rev(const void *lv, const void *rv)
+{
+	const struct bucket_item *l = (const struct bucket_item *)lv;
+	const struct bucket_item *r = (const struct bucket_item *)rv;
+
+	return r->count - l->count;
+}
+
+/*
+ * Size of the core byte set - how many bytes cover 90% of the sample
+ *
+ * There are several types of structured binary data that use nearly all byte
+ * values. The distribution can be uniform and counts in all buckets will be
+ * nearly the same (eg. encrypted data). Unlikely to be compressible.
+ *
+ * Other possibility is normal (Gaussian) distribution, where the data could
+ * be potentially compressible, but we have to take a few more steps to decide
+ * how much.
+ *
+ * @BYTE_CORE_SET_LOW  - main part of byte values repeated frequently,
+ *                       compression algo can easy fix that
+ * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
+ *                       probability is not compressible
+ */
+#define BYTE_CORE_SET_LOW		(64)
+#define BYTE_CORE_SET_HIGH		(200)
+
+static int byte_core_set_size(struct heuristic_ws *ws)
+{
+	u32 i;
+	u32 coreset_sum = 0;
+	const u32 core_set_threshold = ws->sample_size * 90 / 100;
+	struct bucket_item *bucket = ws->bucket;
+
+	/* Sort in reverse order */
+	sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
+
+	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
+		coreset_sum += bucket[i].count;
+
+	if (coreset_sum > core_set_threshold)
+		return i;
+
+	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
+		coreset_sum += bucket[i].count;
+		if (coreset_sum > core_set_threshold)
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Count byte values in buckets.
+ * This heuristic can detect textual data (configs, xml, json, html, etc).
+ * Because in most text-like data byte set is restricted to limited number of
+ * possible characters, and that restriction in most cases makes data easy to
+ * compress.
+ *
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
+ *	less - compressible
+ *	more - need additional analysis
+ */
+#define BYTE_SET_THRESHOLD		(64)
+
+static u32 byte_set_size(const struct heuristic_ws *ws)
+{
+	u32 i;
+	u32 byte_set_size = 0;
+
+	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
+		if (ws->bucket[i].count > 0)
+			byte_set_size++;
+	}
+
+	/*
+	 * Continue collecting count of byte values in buckets.  If the byte
+	 * set size is bigger then the threshold, it's pointless to continue,
+	 * the detection technique would fail for this type of data.
+	 */
+	for (; i < BUCKET_SIZE; i++) {
+		if (ws->bucket[i].count > 0) {
+			byte_set_size++;
+			if (byte_set_size > BYTE_SET_THRESHOLD)
+				return byte_set_size;
+		}
+	}
+
+	return byte_set_size;
+}
+
+static bool sample_repeated_patterns(struct heuristic_ws *ws)
+{
+	const u32 half_of_sample = ws->sample_size / 2;
+	const u8 *data = ws->sample;
+
+	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
+}
+
+static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
+				     struct heuristic_ws *ws)
+{
+	struct page *page;
+	u64 index, index_end;
+	u32 i, curr_sample_pos;
+	u8 *in_data;
+
+	/*
+	 * Compression handles the input data by chunks of 128KiB
+	 * (defined by BTRFS_MAX_UNCOMPRESSED)
+	 *
+	 * We do the same for the heuristic and loop over the whole range.
+	 *
+	 * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
+	 * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
+	 */
+	if (end - start > BTRFS_MAX_UNCOMPRESSED)
+		end = start + BTRFS_MAX_UNCOMPRESSED;
+
+	index = start >> PAGE_SHIFT;
+	index_end = end >> PAGE_SHIFT;
+
+	/* Don't miss unaligned end */
+	if (!IS_ALIGNED(end, PAGE_SIZE))
+		index_end++;
+
+	curr_sample_pos = 0;
+	while (index < index_end) {
+		page = find_get_page(inode->i_mapping, index);
+		in_data = kmap(page);
+		/* Handle case where the start is not aligned to PAGE_SIZE */
+		i = start % PAGE_SIZE;
+		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
+			/* Don't sample any garbage from the last page */
+			if (start > end - SAMPLING_READ_SIZE)
+				break;
+			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
+			       SAMPLING_READ_SIZE);
+			i += SAMPLING_INTERVAL;
+			start += SAMPLING_INTERVAL;
+			curr_sample_pos += SAMPLING_READ_SIZE;
+		}
+		kunmap(page);
+		put_page(page);
+
+		index++;
+	}
+
+	ws->sample_size = curr_sample_pos;
+}
+
+/*
  * Compression heuristic.
  *
  * For now is's a naive and optimistic 'return true', we'll extend the logic to
@@ -1082,18 +1446,87 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
  */
 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
 {
-	u64 index = start >> PAGE_SHIFT;
-	u64 end_index = end >> PAGE_SHIFT;
-	struct page *page;
-	int ret = 1;
+	struct list_head *ws_list = __find_workspace(0, true);
+	struct heuristic_ws *ws;
+	u32 i;
+	u8 byte;
+	int ret = 0;
 
-	while (index <= end_index) {
-		page = find_get_page(inode->i_mapping, index);
-		kmap(page);
-		kunmap(page);
-		put_page(page);
-		index++;
+	ws = list_entry(ws_list, struct heuristic_ws, list);
+
+	heuristic_collect_sample(inode, start, end, ws);
+
+	if (sample_repeated_patterns(ws)) {
+		ret = 1;
+		goto out;
+	}
+
+	memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
+
+	for (i = 0; i < ws->sample_size; i++) {
+		byte = ws->sample[i];
+		ws->bucket[byte].count++;
+	}
+
+	i = byte_set_size(ws);
+	if (i < BYTE_SET_THRESHOLD) {
+		ret = 2;
+		goto out;
+	}
+
+	i = byte_core_set_size(ws);
+	if (i <= BYTE_CORE_SET_LOW) {
+		ret = 3;
+		goto out;
 	}
 
+	if (i >= BYTE_CORE_SET_HIGH) {
+		ret = 0;
+		goto out;
+	}
+
+	i = shannon_entropy(ws);
+	if (i <= ENTROPY_LVL_ACEPTABLE) {
+		ret = 4;
+		goto out;
+	}
+
+	/*
+	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
+	 * needed to give green light to compression.
+	 *
+	 * For now just assume that compression at that level is not worth the
+	 * resources because:
+	 *
+	 * 1. it is possible to defrag the data later
+	 *
+	 * 2. the data would turn out to be hardly compressible, eg. 150 byte
+	 *    values, every bucket has counter at level ~54. The heuristic would
+	 *    be confused. This can happen when data have some internal repeated
+	 *    patterns like "abbacbbc...". This can be detected by analyzing
+	 *    pairs of bytes, which is too costly.
+	 */
+	if (i < ENTROPY_LVL_HIGH) {
+		ret = 5;
+		goto out;
+	} else {
+		ret = 0;
+		goto out;
+	}
+
+out:
+	__free_workspace(0, ws_list, true);
 	return ret;
 }
+
+unsigned int btrfs_compress_str2level(const char *str)
+{
+	if (strncmp(str, "zlib", 4) != 0)
+		return 0;
+
+	/* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
+	if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
+		return str[5] - '0';
+
+	return 0;
+}
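The decision pipeline above (repeated pattern -> byte set -> core set -> entropy) is self-contained enough to experiment with outside the kernel. A hedged userspace sketch of just the integer entropy estimate, reusing the pow(n, 4) trick from ilog2_w(); the test buffers and the main() harness are assumptions, not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* floor(log2(n^4)): four extra bits of precision, mirroring ilog2_w() */
static uint32_t ilog2_w(uint64_t n)
{
	uint64_t v = n * n * n * n;
	uint32_t r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* Estimated percentage (0-100) of bits needed per byte, as in shannon_entropy() */
static uint32_t entropy_percent(const uint8_t *buf, uint32_t len)
{
	const uint32_t entropy_max = 8 * ilog2_w(2);	/* 8 * 4 = 32 */
	uint32_t bucket[256] = { 0 };
	uint32_t sum = 0, sz_base = ilog2_w(len);
	uint32_t i;

	for (i = 0; i < len; i++)
		bucket[buf[i]]++;
	for (i = 0; i < 256; i++)
		if (bucket[i])
			sum += bucket[i] * (sz_base - ilog2_w(bucket[i]));
	return sum / len * 100 / entropy_max;
}

int main(void)
{
	uint8_t constant[8192], mixed[8192];
	uint32_t i;

	memset(constant, 'a', sizeof(constant));	/* trivially compressible */
	for (i = 0; i < sizeof(mixed); i++)
		mixed[i] = (uint8_t)((i * 2654435761u) >> 24);	/* spread over all byte values */

	/* constant data scores ~0%, well-mixed data close to 100% */
	printf("constant: %u%%  mixed: %u%%\n",
	       entropy_percent(constant, sizeof(constant)),
	       entropy_percent(mixed, sizeof(mixed)));
	return 0;
}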
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d2781ff8f994..da20755ebf21 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -76,7 +76,7 @@ struct compressed_bio {
 void btrfs_init_compress(void);
 void btrfs_exit_compress(void);
 
-int btrfs_compress_pages(int type, struct address_space *mapping,
+int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
 			 u64 start, struct page **pages,
 			 unsigned long *out_pages,
 			 unsigned long *total_in,
@@ -95,6 +95,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
 
+unsigned btrfs_compress_str2level(const char *str);
+
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE = 0,
 	BTRFS_COMPRESS_ZLIB = 1,
@@ -124,6 +126,8 @@ struct btrfs_compress_op {
 			  struct page *dest_page,
 			  unsigned long start_byte,
 			  size_t srclen, size_t destlen);
+
+	void (*set_level)(struct list_head *ws, unsigned int type);
 };
 
 extern const struct btrfs_compress_op btrfs_zlib_compress;
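The new set_level hook receives the packed type_level documented above for btrfs_compress_pages() (algorithm in bits 0-3, level in bits 4-7). A sketch of what an implementation plausibly does with it; the workspace layout and the fallback default are assumptions, not quoted from this series:

/* Hypothetical set_level implementation: unpack bits 4-7 of type_level. */
struct example_workspace {
	struct list_head list;
	unsigned int level;
};

static void example_set_level(struct list_head *ws, unsigned int type)
{
	struct example_workspace *workspace =
		container_of(ws, struct example_workspace, list);
	unsigned int level = (type & 0xF0) >> 4;	/* bits 4-7 carry the level */

	workspace->level = level ? level : 3;	/* 0 means per-algorithm default */
}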
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d49db7d86be..531e0a8645b0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
  * tree until you end up with a lock on the root. A locked buffer
  * is returned, with a reference held.
  */
-static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
 
@@ -5496,8 +5496,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 				goto out;
 		} else if (left_end_reached) {
 			if (right_level == 0) {
-				ret = changed_cb(left_root, right_root,
-						left_path, right_path,
+				ret = changed_cb(left_path, right_path,
 						&right_key,
 						BTRFS_COMPARE_TREE_DELETED,
 						ctx);
@@ -5508,8 +5507,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 				continue;
 		} else if (right_end_reached) {
 			if (left_level == 0) {
-				ret = changed_cb(left_root, right_root,
-						left_path, right_path,
+				ret = changed_cb(left_path, right_path,
 						&left_key,
 						BTRFS_COMPARE_TREE_NEW,
 						ctx);
@@ -5523,8 +5521,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 		if (left_level == 0 && right_level == 0) {
 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
 			if (cmp < 0) {
-				ret = changed_cb(left_root, right_root,
-						left_path, right_path,
+				ret = changed_cb(left_path, right_path,
 						&left_key,
 						BTRFS_COMPARE_TREE_NEW,
 						ctx);
@@ -5532,8 +5529,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 					goto out;
 				advance_left = ADVANCE;
 			} else if (cmp > 0) {
-				ret = changed_cb(left_root, right_root,
-						left_path, right_path,
+				ret = changed_cb(left_path, right_path,
 						&right_key,
 						BTRFS_COMPARE_TREE_DELETED,
 						ctx);
@@ -5550,8 +5546,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 					result = BTRFS_COMPARE_TREE_CHANGED;
 				else
 					result = BTRFS_COMPARE_TREE_SAME;
-				ret = changed_cb(left_root, right_root,
-						left_path, right_path,
+				ret = changed_cb(left_path, right_path,
 						&left_key, result, ctx);
 				if (ret < 0)
 					goto out;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fc690384c58..f7df5536ab61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -523,7 +523,7 @@ struct btrfs_caching_control {
 };
 
 /* Once caching_thread() finds this much free space, it will wake up waiters. */
-#define CACHING_CTL_WAKE_UP	(1024 * 1024 * 2)
+#define CACHING_CTL_WAKE_UP	SZ_2M
 
 struct btrfs_io_ctl {
 	void *cur, *orig;
@@ -763,8 +763,6 @@ struct btrfs_fs_info {
 	 * delayed dir index item
 	 */
 	struct btrfs_block_rsv global_block_rsv;
-	/* block reservation for delay allocation */
-	struct btrfs_block_rsv delalloc_block_rsv;
 	/* block reservation for metadata operations */
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
@@ -790,6 +788,7 @@ struct btrfs_fs_info {
790 */ 788 */
791 unsigned long pending_changes; 789 unsigned long pending_changes;
792 unsigned long compress_type:4; 790 unsigned long compress_type:4;
791 unsigned int compress_level;
793 int commit_interval; 792 int commit_interval;
794 /* 793 /*
 795 * It is a suggestive number, the read side is safe even if it gets a 794 * It is a suggestive number, the read side is safe even if it gets a
@@ -878,9 +877,6 @@ struct btrfs_fs_info {
878 rwlock_t tree_mod_log_lock; 877 rwlock_t tree_mod_log_lock;
879 struct rb_root tree_mod_log; 878 struct rb_root tree_mod_log;
880 879
881 atomic_t nr_async_submits;
882 atomic_t async_submit_draining;
883 atomic_t nr_async_bios;
884 atomic_t async_delalloc_pages; 880 atomic_t async_delalloc_pages;
885 atomic_t open_ioctl_trans; 881 atomic_t open_ioctl_trans;
886 882
@@ -1100,6 +1096,11 @@ struct btrfs_fs_info {
1100 u32 nodesize; 1096 u32 nodesize;
1101 u32 sectorsize; 1097 u32 sectorsize;
1102 u32 stripesize; 1098 u32 stripesize;
1099
1100#ifdef CONFIG_BTRFS_FS_REF_VERIFY
1101 spinlock_t ref_verify_lock;
1102 struct rb_root block_tree;
1103#endif
1103}; 1104};
1104 1105
1105static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 1106static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -1338,6 +1339,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
1338#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) 1339#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
1339#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) 1340#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
1340#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) 1341#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
1342#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
1341 1343
1342#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 1344#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
1343#define BTRFS_DEFAULT_MAX_INLINE (2048) 1345#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -2639,7 +2641,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2639 struct extent_buffer *buf, 2641 struct extent_buffer *buf,
2640 u64 parent, int last_ref); 2642 u64 parent, int last_ref);
2641int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 2643int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
2642 u64 root_objectid, u64 owner, 2644 struct btrfs_root *root, u64 owner,
2643 u64 offset, u64 ram_bytes, 2645 u64 offset, u64 ram_bytes,
2644 struct btrfs_key *ins); 2646 struct btrfs_key *ins);
2645int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 2647int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
@@ -2658,7 +2660,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2658 u64 bytenr, u64 num_bytes, u64 flags, 2660 u64 bytenr, u64 num_bytes, u64 flags,
2659 int level, int is_data); 2661 int level, int is_data);
2660int btrfs_free_extent(struct btrfs_trans_handle *trans, 2662int btrfs_free_extent(struct btrfs_trans_handle *trans,
2661 struct btrfs_fs_info *fs_info, 2663 struct btrfs_root *root,
2662 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 2664 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2663 u64 owner, u64 offset); 2665 u64 owner, u64 offset);
2664 2666
@@ -2670,7 +2672,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
2670int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2672int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2671 struct btrfs_fs_info *fs_info); 2673 struct btrfs_fs_info *fs_info);
2672int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2674int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2673 struct btrfs_fs_info *fs_info, 2675 struct btrfs_root *root,
2674 u64 bytenr, u64 num_bytes, u64 parent, 2676 u64 bytenr, u64 num_bytes, u64 parent,
2675 u64 root_objectid, u64 owner, u64 offset); 2677 u64 root_objectid, u64 owner, u64 offset);
2676 2678
@@ -2744,6 +2746,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
2744 u64 *qgroup_reserved, bool use_global_rsv); 2746 u64 *qgroup_reserved, bool use_global_rsv);
2745void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 2747void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
2746 struct btrfs_block_rsv *rsv); 2748 struct btrfs_block_rsv *rsv);
2749void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
2750
2747int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); 2751int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
2748void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); 2752void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
2749int btrfs_delalloc_reserve_space(struct inode *inode, 2753int btrfs_delalloc_reserve_space(struct inode *inode,
@@ -2751,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
2751void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); 2755void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2752struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 2756struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
2753 unsigned short type); 2757 unsigned short type);
2758void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
2759 struct btrfs_block_rsv *rsv,
2760 unsigned short type);
2754void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 2761void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
2755 struct btrfs_block_rsv *rsv); 2762 struct btrfs_block_rsv *rsv);
2756void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); 2763void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
@@ -2809,6 +2816,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2809 const struct btrfs_key *new_key); 2816 const struct btrfs_key *new_key);
2810struct extent_buffer *btrfs_root_node(struct btrfs_root *root); 2817struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
2811struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); 2818struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
2819struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
2812int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 2820int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
2813 struct btrfs_key *key, int lowest_level, 2821 struct btrfs_key *key, int lowest_level,
2814 u64 min_trans); 2822 u64 min_trans);
@@ -2821,9 +2829,7 @@ enum btrfs_compare_tree_result {
2821 BTRFS_COMPARE_TREE_CHANGED, 2829 BTRFS_COMPARE_TREE_CHANGED,
2822 BTRFS_COMPARE_TREE_SAME, 2830 BTRFS_COMPARE_TREE_SAME,
2823}; 2831};
2824typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, 2832typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
2825 struct btrfs_root *right_root,
2826 struct btrfs_path *left_path,
2827 struct btrfs_path *right_path, 2833 struct btrfs_path *right_path,
2828 struct btrfs_key *key, 2834 struct btrfs_key *key,
2829 enum btrfs_compare_tree_result result, 2835 enum btrfs_compare_tree_result result,
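
BTRFS_MOUNT_REF_VERIFY above claims bit 28 of the per-filesystem mount-option flag word, tested elsewhere via btrfs_test_opt(). As a self-contained sketch of the one-bit-per-option pattern these defines follow (the macro names here are hypothetical, not the kernel's helpers):

#include <stdio.h>

#define OPT_NOLOGREPLAY  (1UL << 27)
#define OPT_REF_VERIFY   (1UL << 28)

#define opt_set(flags, opt)   ((flags) |= (opt))
#define opt_clear(flags, opt) ((flags) &= ~(opt))
#define opt_test(flags, opt)  (!!((flags) & (opt)))

int main(void)
{
    unsigned long mount_opt = 0;

    opt_set(mount_opt, OPT_REF_VERIFY);
    printf("ref-verify enabled: %d\n", opt_test(mount_opt, OPT_REF_VERIFY));
    opt_clear(mount_opt, OPT_REF_VERIFY);
    printf("ref-verify enabled: %d\n", opt_test(mount_opt, OPT_REF_VERIFY));
    return 0;
}
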
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 19e4ad2f3f2e..5d73f79ded8b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
581 struct btrfs_block_rsv *dst_rsv; 581 struct btrfs_block_rsv *dst_rsv;
582 u64 num_bytes; 582 u64 num_bytes;
583 int ret; 583 int ret;
584 bool release = false;
585 584
586 src_rsv = trans->block_rsv; 585 src_rsv = trans->block_rsv;
587 dst_rsv = &fs_info->delayed_block_rsv; 586 dst_rsv = &fs_info->delayed_block_rsv;
@@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
589 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 588 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
590 589
591 /* 590 /*
592 * If our block_rsv is the delalloc block reserve then check and see if
593 * we have our extra reservation for updating the inode. If not fall
594 * through and try to reserve space quickly.
595 *
596 * We used to try and steal from the delalloc block rsv or the global
597 * reserve, but we'd steal a full reservation, which isn't kind. We are
598 * here through delalloc which means we've likely just cowed down close
599 * to the leaf that contains the inode, so we would steal less just
600 * doing the fallback inode update, so if we do end up having to steal
601 * from the global block rsv we hopefully only steal one or two blocks
602 * worth which is less likely to hurt us.
603 */
604 if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
605 spin_lock(&inode->lock);
606 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
607 &inode->runtime_flags))
608 release = true;
609 else
610 src_rsv = NULL;
611 spin_unlock(&inode->lock);
612 }
613
614 /*
615 * btrfs_dirty_inode will update the inode under btrfs_join_transaction 591 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
616 * which doesn't reserve space for speed. This is a problem since we 592 * which doesn't reserve space for speed. This is a problem since we
617 * still need to reserve space for this update, so try to reserve the 593 * still need to reserve space for this update, so try to reserve the
618 * space. 594 * space.
619 * 595 *
620 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 596 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
621 * we're accounted for. 597 * we always reserve enough to update the inode item.
622 */ 598 */
623 if (!src_rsv || (!trans->bytes_reserved && 599 if (!src_rsv || (!trans->bytes_reserved &&
624 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 600 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
@@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata(
643 } 619 }
644 620
645 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 621 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
646
647 /*
648 * Migrate only takes a reservation, it doesn't touch the size of the
649 * block_rsv. This is to simplify people who don't normally have things
650 * migrated from their block rsv. If they go to release their
651 * reservation, that will decrease the size as well, so if migrate
652 * reduced size we'd end up with a negative size. But for the
653 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
654 * but we could in fact do this reserve/migrate dance several times
655 * between the time we did the original reservation and we'd clean it
656 * up. So to take care of this, release the space for the meta
657 * reservation here. I think it may be time for a documentation page on
658 * how block rsvs. work.
659 */
660 if (!ret) { 622 if (!ret) {
661 trace_btrfs_space_reservation(fs_info, "delayed_inode", 623 trace_btrfs_space_reservation(fs_info, "delayed_inode",
662 btrfs_ino(inode), num_bytes, 1); 624 btrfs_ino(inode), num_bytes, 1);
663 node->bytes_reserved = num_bytes; 625 node->bytes_reserved = num_bytes;
664 } 626 }
665 627
666 if (release) {
667 trace_btrfs_space_reservation(fs_info, "delalloc",
668 btrfs_ino(inode), num_bytes, 0);
669 btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
670 }
671
672 return ret; 628 return ret;
673} 629}
674 630
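
With the stealing logic deleted, btrfs_delayed_inode_reserve_metadata() either reserves fresh space or moves an existing reservation with btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1). A standalone sketch of that migrate idea — deliberately simplified, with hypothetical names; the real helper also takes the update-size flag seen in the call above and has richer failure semantics:

#include <stdio.h>

struct rsv {
    const char *name;
    unsigned long long reserved;  /* bytes currently held */
};

static int rsv_migrate(struct rsv *src, struct rsv *dst,
                       unsigned long long bytes)
{
    if (src->reserved < bytes)
        return -1;                /* caller must reserve elsewhere */
    src->reserved -= bytes;
    dst->reserved += bytes;
    return 0;
}

int main(void)
{
    struct rsv trans   = { "trans",   1 << 20 };
    struct rsv delayed = { "delayed", 0 };

    if (rsv_migrate(&trans, &delayed, 16384) == 0)
        printf("%s now holds %llu bytes\n", delayed.name, delayed.reserved);
    return 0;
}
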
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 93ffa898df6d..83be8f9fd906 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -40,10 +40,10 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
40/* 40/*
41 * compare two delayed tree backrefs with same bytenr and type 41 * compare two delayed tree backrefs with same bytenr and type
42 */ 42 */
43static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, 43static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
44 struct btrfs_delayed_tree_ref *ref1, int type) 44 struct btrfs_delayed_tree_ref *ref2)
45{ 45{
46 if (type == BTRFS_TREE_BLOCK_REF_KEY) { 46 if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
47 if (ref1->root < ref2->root) 47 if (ref1->root < ref2->root)
48 return -1; 48 return -1;
49 if (ref1->root > ref2->root) 49 if (ref1->root > ref2->root)
@@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
60/* 60/*
61 * compare two delayed data backrefs with same bytenr and type 61 * compare two delayed data backrefs with same bytenr and type
62 */ 62 */
63static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, 63static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
64 struct btrfs_delayed_data_ref *ref1) 64 struct btrfs_delayed_data_ref *ref2)
65{ 65{
66 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { 66 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
67 if (ref1->root < ref2->root) 67 if (ref1->root < ref2->root)
@@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
85 return 0; 85 return 0;
86} 86}
87 87
88static int comp_refs(struct btrfs_delayed_ref_node *ref1,
89 struct btrfs_delayed_ref_node *ref2,
90 bool check_seq)
91{
92 int ret = 0;
93
94 if (ref1->type < ref2->type)
95 return -1;
96 if (ref1->type > ref2->type)
97 return 1;
98 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
99 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
100 ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
101 btrfs_delayed_node_to_tree_ref(ref2));
102 else
103 ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
104 btrfs_delayed_node_to_data_ref(ref2));
105 if (ret)
106 return ret;
107 if (check_seq) {
108 if (ref1->seq < ref2->seq)
109 return -1;
110 if (ref1->seq > ref2->seq)
111 return 1;
112 }
113 return 0;
114}
115
88/* insert a new ref to head ref rbtree */ 116/* insert a new ref to head ref rbtree */
89static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, 117static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
90 struct rb_node *node) 118 struct rb_node *node)
@@ -96,15 +124,43 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
96 u64 bytenr; 124 u64 bytenr;
97 125
98 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); 126 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
99 bytenr = ins->node.bytenr; 127 bytenr = ins->bytenr;
100 while (*p) { 128 while (*p) {
101 parent_node = *p; 129 parent_node = *p;
102 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, 130 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
103 href_node); 131 href_node);
104 132
105 if (bytenr < entry->node.bytenr) 133 if (bytenr < entry->bytenr)
134 p = &(*p)->rb_left;
135 else if (bytenr > entry->bytenr)
136 p = &(*p)->rb_right;
137 else
138 return entry;
139 }
140
141 rb_link_node(node, parent_node, p);
142 rb_insert_color(node, root);
143 return NULL;
144}
145
146static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
147 struct btrfs_delayed_ref_node *ins)
148{
149 struct rb_node **p = &root->rb_node;
150 struct rb_node *node = &ins->ref_node;
151 struct rb_node *parent_node = NULL;
152 struct btrfs_delayed_ref_node *entry;
153
154 while (*p) {
155 int comp;
156
157 parent_node = *p;
158 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
159 ref_node);
160 comp = comp_refs(ins, entry, true);
161 if (comp < 0)
106 p = &(*p)->rb_left; 162 p = &(*p)->rb_left;
107 else if (bytenr > entry->node.bytenr) 163 else if (comp > 0)
108 p = &(*p)->rb_right; 164 p = &(*p)->rb_right;
109 else 165 else
110 return entry; 166 return entry;
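
comp_refs() above stacks three orderings — node type, then the per-type payload comparison, then (optionally) seq as a tiebreaker — and tree_insert() returns the node already in the tree on a tie so the caller can merge instead of inserting a duplicate. The same shape as a self-contained userspace sketch, using a plain binary search tree in place of an rb-tree (illustrative names, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

struct ref {
    int type;
    long long root;
    unsigned long long seq;
    struct ref *left, *right;
};

static int comp_refs(const struct ref *a, const struct ref *b, int check_seq)
{
    if (a->type != b->type)
        return a->type < b->type ? -1 : 1;
    if (a->root != b->root)
        return a->root < b->root ? -1 : 1;
    if (check_seq && a->seq != b->seq)
        return a->seq < b->seq ? -1 : 1;
    return 0;
}

static struct ref *tree_insert(struct ref **rootp, struct ref *ins)
{
    while (*rootp) {
        int cmp = comp_refs(ins, *rootp, 1);

        if (cmp < 0)
            rootp = &(*rootp)->left;
        else if (cmp > 0)
            rootp = &(*rootp)->right;
        else
            return *rootp;        /* duplicate: let the caller merge */
    }
    *rootp = ins;
    return NULL;
}

int main(void)
{
    struct ref a = { 1, 5, 100, NULL, NULL };
    struct ref b = { 1, 5, 100, NULL, NULL };
    struct ref *root = NULL;

    printf("first insert, existing = %p\n", (void *)tree_insert(&root, &a));
    printf("second insert, existing = %p\n", (void *)tree_insert(&root, &b));
    return 0;
}
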
@@ -133,15 +189,15 @@ find_ref_head(struct rb_root *root, u64 bytenr,
133 while (n) { 189 while (n) {
134 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 190 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
135 191
136 if (bytenr < entry->node.bytenr) 192 if (bytenr < entry->bytenr)
137 n = n->rb_left; 193 n = n->rb_left;
138 else if (bytenr > entry->node.bytenr) 194 else if (bytenr > entry->bytenr)
139 n = n->rb_right; 195 n = n->rb_right;
140 else 196 else
141 return entry; 197 return entry;
142 } 198 }
143 if (entry && return_bigger) { 199 if (entry && return_bigger) {
144 if (bytenr > entry->node.bytenr) { 200 if (bytenr > entry->bytenr) {
145 n = rb_next(&entry->href_node); 201 n = rb_next(&entry->href_node);
146 if (!n) 202 if (!n)
147 n = rb_first(root); 203 n = rb_first(root);
@@ -164,17 +220,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
164 if (mutex_trylock(&head->mutex)) 220 if (mutex_trylock(&head->mutex))
165 return 0; 221 return 0;
166 222
167 refcount_inc(&head->node.refs); 223 refcount_inc(&head->refs);
168 spin_unlock(&delayed_refs->lock); 224 spin_unlock(&delayed_refs->lock);
169 225
170 mutex_lock(&head->mutex); 226 mutex_lock(&head->mutex);
171 spin_lock(&delayed_refs->lock); 227 spin_lock(&delayed_refs->lock);
172 if (!head->node.in_tree) { 228 if (RB_EMPTY_NODE(&head->href_node)) {
173 mutex_unlock(&head->mutex); 229 mutex_unlock(&head->mutex);
174 btrfs_put_delayed_ref(&head->node); 230 btrfs_put_delayed_ref_head(head);
175 return -EAGAIN; 231 return -EAGAIN;
176 } 232 }
177 btrfs_put_delayed_ref(&head->node); 233 btrfs_put_delayed_ref_head(head);
178 return 0; 234 return 0;
179} 235}
180 236
@@ -183,15 +239,11 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
183 struct btrfs_delayed_ref_head *head, 239 struct btrfs_delayed_ref_head *head,
184 struct btrfs_delayed_ref_node *ref) 240 struct btrfs_delayed_ref_node *ref)
185{ 241{
186 if (btrfs_delayed_ref_is_head(ref)) { 242 assert_spin_locked(&head->lock);
187 head = btrfs_delayed_node_to_head(ref); 243 rb_erase(&ref->ref_node, &head->ref_tree);
188 rb_erase(&head->href_node, &delayed_refs->href_root); 244 RB_CLEAR_NODE(&ref->ref_node);
189 } else { 245 if (!list_empty(&ref->add_list))
190 assert_spin_locked(&head->lock); 246 list_del(&ref->add_list);
191 list_del(&ref->list);
192 if (!list_empty(&ref->add_list))
193 list_del(&ref->add_list);
194 }
195 ref->in_tree = 0; 247 ref->in_tree = 0;
196 btrfs_put_delayed_ref(ref); 248 btrfs_put_delayed_ref(ref);
197 atomic_dec(&delayed_refs->num_entries); 249 atomic_dec(&delayed_refs->num_entries);
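
The switch from the in_tree flag to RB_EMPTY_NODE()/RB_CLEAR_NODE() in the two hunks above works because the kernel marks an unlinked rb_node by pointing its parent at itself, so membership needs no separate flag. A tiny userspace mimic of just that convention (not the kernel's rbtree implementation):

#include <stdio.h>

struct rbn { struct rbn *parent; };

static void clear_node(struct rbn *n) { n->parent = n; }
static int  empty_node(const struct rbn *n) { return n->parent == n; }

int main(void)
{
    struct rbn a;

    clear_node(&a);
    printf("unlinked: %d\n", empty_node(&a));

    a.parent = NULL;            /* pretend it was linked in at the root */
    printf("unlinked: %d\n", empty_node(&a));
    return 0;
}
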
@@ -206,36 +258,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
206 u64 seq) 258 u64 seq)
207{ 259{
208 struct btrfs_delayed_ref_node *next; 260 struct btrfs_delayed_ref_node *next;
261 struct rb_node *node = rb_next(&ref->ref_node);
209 bool done = false; 262 bool done = false;
210 263
211 next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 264 while (!done && node) {
212 list);
213 while (!done && &next->list != &head->ref_list) {
214 int mod; 265 int mod;
215 struct btrfs_delayed_ref_node *next2;
216
217 next2 = list_next_entry(next, list);
218
219 if (next == ref)
220 goto next;
221 266
267 next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
268 node = rb_next(node);
222 if (seq && next->seq >= seq) 269 if (seq && next->seq >= seq)
223 goto next; 270 break;
224 271 if (comp_refs(ref, next, false))
225 if (next->type != ref->type) 272 break;
226 goto next;
227
228 if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
229 ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
230 comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
231 btrfs_delayed_node_to_tree_ref(next),
232 ref->type))
233 goto next;
234 if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
235 ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
236 comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
237 btrfs_delayed_node_to_data_ref(next)))
238 goto next;
239 273
240 if (ref->action == next->action) { 274 if (ref->action == next->action) {
241 mod = next->ref_mod; 275 mod = next->ref_mod;
@@ -259,8 +293,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
259 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 293 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
260 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 294 ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
261 } 295 }
262next:
263 next = next2;
264 } 296 }
265 297
266 return done; 298 return done;
@@ -272,11 +304,12 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
272 struct btrfs_delayed_ref_head *head) 304 struct btrfs_delayed_ref_head *head)
273{ 305{
274 struct btrfs_delayed_ref_node *ref; 306 struct btrfs_delayed_ref_node *ref;
307 struct rb_node *node;
275 u64 seq = 0; 308 u64 seq = 0;
276 309
277 assert_spin_locked(&head->lock); 310 assert_spin_locked(&head->lock);
278 311
279 if (list_empty(&head->ref_list)) 312 if (RB_EMPTY_ROOT(&head->ref_tree))
280 return; 313 return;
281 314
282 /* We don't have too many refs to merge for data. */ 315 /* We don't have too many refs to merge for data. */
@@ -293,22 +326,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
293 } 326 }
294 spin_unlock(&fs_info->tree_mod_seq_lock); 327 spin_unlock(&fs_info->tree_mod_seq_lock);
295 328
296 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 329again:
297 list); 330 for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
298 while (&ref->list != &head->ref_list) { 331 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
299 if (seq && ref->seq >= seq) 332 if (seq && ref->seq >= seq)
300 goto next;
301
302 if (merge_ref(trans, delayed_refs, head, ref, seq)) {
303 if (list_empty(&head->ref_list))
304 break;
305 ref = list_first_entry(&head->ref_list,
306 struct btrfs_delayed_ref_node,
307 list);
308 continue; 333 continue;
309 } 334 if (merge_ref(trans, delayed_refs, head, ref, seq))
310next: 335 goto again;
311 ref = list_next_entry(ref, list);
312 } 336 }
313} 337}
314 338
@@ -380,8 +404,8 @@ again:
380 head->processing = 1; 404 head->processing = 1;
381 WARN_ON(delayed_refs->num_heads_ready == 0); 405 WARN_ON(delayed_refs->num_heads_ready == 0);
382 delayed_refs->num_heads_ready--; 406 delayed_refs->num_heads_ready--;
383 delayed_refs->run_delayed_start = head->node.bytenr + 407 delayed_refs->run_delayed_start = head->bytenr +
384 head->node.num_bytes; 408 head->num_bytes;
385 return head; 409 return head;
386} 410}
387 411
@@ -391,37 +415,19 @@ again:
391 * Return 0 for insert. 415 * Return 0 for insert.
392 * Return >0 for merge. 416 * Return >0 for merge.
393 */ 417 */
394static int 418static int insert_delayed_ref(struct btrfs_trans_handle *trans,
395add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, 419 struct btrfs_delayed_ref_root *root,
396 struct btrfs_delayed_ref_root *root, 420 struct btrfs_delayed_ref_head *href,
397 struct btrfs_delayed_ref_head *href, 421 struct btrfs_delayed_ref_node *ref)
398 struct btrfs_delayed_ref_node *ref)
399{ 422{
400 struct btrfs_delayed_ref_node *exist; 423 struct btrfs_delayed_ref_node *exist;
401 int mod; 424 int mod;
402 int ret = 0; 425 int ret = 0;
403 426
404 spin_lock(&href->lock); 427 spin_lock(&href->lock);
405 /* Check whether we can merge the tail node with ref */ 428 exist = tree_insert(&href->ref_tree, ref);
406 if (list_empty(&href->ref_list)) 429 if (!exist)
407 goto add_tail; 430 goto inserted;
408 exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
409 list);
410 /* No need to compare bytenr nor is_head */
411 if (exist->type != ref->type || exist->seq != ref->seq)
412 goto add_tail;
413
414 if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
415 exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
416 comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
417 btrfs_delayed_node_to_tree_ref(ref),
418 ref->type))
419 goto add_tail;
420 if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
421 exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
422 comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
423 btrfs_delayed_node_to_data_ref(ref)))
424 goto add_tail;
425 431
426 /* Now we are sure we can merge */ 432 /* Now we are sure we can merge */
427 ret = 1; 433 ret = 1;
@@ -452,9 +458,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
452 drop_delayed_ref(trans, root, href, exist); 458 drop_delayed_ref(trans, root, href, exist);
453 spin_unlock(&href->lock); 459 spin_unlock(&href->lock);
454 return ret; 460 return ret;
455 461inserted:
456add_tail:
457 list_add_tail(&ref->list, &href->ref_list);
458 if (ref->action == BTRFS_ADD_DELAYED_REF) 462 if (ref->action == BTRFS_ADD_DELAYED_REF)
459 list_add_tail(&ref->add_list, &href->ref_add_list); 463 list_add_tail(&ref->add_list, &href->ref_add_list);
460 atomic_inc(&root->num_entries); 464 atomic_inc(&root->num_entries);
@@ -469,20 +473,16 @@ add_tail:
469 */ 473 */
470static noinline void 474static noinline void
471update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, 475update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
472 struct btrfs_delayed_ref_node *existing, 476 struct btrfs_delayed_ref_head *existing,
473 struct btrfs_delayed_ref_node *update, 477 struct btrfs_delayed_ref_head *update,
474 int *old_ref_mod_ret) 478 int *old_ref_mod_ret)
475{ 479{
476 struct btrfs_delayed_ref_head *existing_ref;
477 struct btrfs_delayed_ref_head *ref;
478 int old_ref_mod; 480 int old_ref_mod;
479 481
480 existing_ref = btrfs_delayed_node_to_head(existing); 482 BUG_ON(existing->is_data != update->is_data);
481 ref = btrfs_delayed_node_to_head(update);
482 BUG_ON(existing_ref->is_data != ref->is_data);
483 483
484 spin_lock(&existing_ref->lock); 484 spin_lock(&existing->lock);
485 if (ref->must_insert_reserved) { 485 if (update->must_insert_reserved) {
486 /* if the extent was freed and then 486 /* if the extent was freed and then
487 * reallocated before the delayed ref 487 * reallocated before the delayed ref
488 * entries were processed, we can end up 488 * entries were processed, we can end up
@@ -490,7 +490,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
490 * the must_insert_reserved flag set. 490 * the must_insert_reserved flag set.
491 * Set it again here 491 * Set it again here
492 */ 492 */
493 existing_ref->must_insert_reserved = ref->must_insert_reserved; 493 existing->must_insert_reserved = update->must_insert_reserved;
494 494
495 /* 495 /*
496 * update the num_bytes so we make sure the accounting 496 * update the num_bytes so we make sure the accounting
@@ -500,22 +500,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
500 500
501 } 501 }
502 502
503 if (ref->extent_op) { 503 if (update->extent_op) {
504 if (!existing_ref->extent_op) { 504 if (!existing->extent_op) {
505 existing_ref->extent_op = ref->extent_op; 505 existing->extent_op = update->extent_op;
506 } else { 506 } else {
507 if (ref->extent_op->update_key) { 507 if (update->extent_op->update_key) {
508 memcpy(&existing_ref->extent_op->key, 508 memcpy(&existing->extent_op->key,
509 &ref->extent_op->key, 509 &update->extent_op->key,
510 sizeof(ref->extent_op->key)); 510 sizeof(update->extent_op->key));
511 existing_ref->extent_op->update_key = true; 511 existing->extent_op->update_key = true;
512 } 512 }
513 if (ref->extent_op->update_flags) { 513 if (update->extent_op->update_flags) {
514 existing_ref->extent_op->flags_to_set |= 514 existing->extent_op->flags_to_set |=
515 ref->extent_op->flags_to_set; 515 update->extent_op->flags_to_set;
516 existing_ref->extent_op->update_flags = true; 516 existing->extent_op->update_flags = true;
517 } 517 }
518 btrfs_free_delayed_extent_op(ref->extent_op); 518 btrfs_free_delayed_extent_op(update->extent_op);
519 } 519 }
520 } 520 }
521 /* 521 /*
@@ -523,23 +523,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
523 * only need the lock for this case cause we could be processing it 523 * only need the lock for this case cause we could be processing it
524 * currently, for refs we just added we know we're a-ok. 524 * currently, for refs we just added we know we're a-ok.
525 */ 525 */
526 old_ref_mod = existing_ref->total_ref_mod; 526 old_ref_mod = existing->total_ref_mod;
527 if (old_ref_mod_ret) 527 if (old_ref_mod_ret)
528 *old_ref_mod_ret = old_ref_mod; 528 *old_ref_mod_ret = old_ref_mod;
529 existing->ref_mod += update->ref_mod; 529 existing->ref_mod += update->ref_mod;
530 existing_ref->total_ref_mod += update->ref_mod; 530 existing->total_ref_mod += update->ref_mod;
531 531
532 /* 532 /*
 533 * If we are going from a positive ref mod to a negative or vice 533 * If we are going from a positive ref mod to a negative or vice
534 * versa we need to make sure to adjust pending_csums accordingly. 534 * versa we need to make sure to adjust pending_csums accordingly.
535 */ 535 */
536 if (existing_ref->is_data) { 536 if (existing->is_data) {
537 if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) 537 if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
538 delayed_refs->pending_csums -= existing->num_bytes; 538 delayed_refs->pending_csums -= existing->num_bytes;
539 if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) 539 if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
540 delayed_refs->pending_csums += existing->num_bytes; 540 delayed_refs->pending_csums += existing->num_bytes;
541 } 541 }
542 spin_unlock(&existing_ref->lock); 542 spin_unlock(&existing->lock);
543} 543}
544 544
545/* 545/*
@@ -550,14 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
550static noinline struct btrfs_delayed_ref_head * 550static noinline struct btrfs_delayed_ref_head *
551add_delayed_ref_head(struct btrfs_fs_info *fs_info, 551add_delayed_ref_head(struct btrfs_fs_info *fs_info,
552 struct btrfs_trans_handle *trans, 552 struct btrfs_trans_handle *trans,
553 struct btrfs_delayed_ref_node *ref, 553 struct btrfs_delayed_ref_head *head_ref,
554 struct btrfs_qgroup_extent_record *qrecord, 554 struct btrfs_qgroup_extent_record *qrecord,
555 u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, 555 u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
556 int action, int is_data, int *qrecord_inserted_ret, 556 int action, int is_data, int *qrecord_inserted_ret,
557 int *old_ref_mod, int *new_ref_mod) 557 int *old_ref_mod, int *new_ref_mod)
558{ 558{
559 struct btrfs_delayed_ref_head *existing; 559 struct btrfs_delayed_ref_head *existing;
560 struct btrfs_delayed_ref_head *head_ref = NULL;
561 struct btrfs_delayed_ref_root *delayed_refs; 560 struct btrfs_delayed_ref_root *delayed_refs;
562 int count_mod = 1; 561 int count_mod = 1;
563 int must_insert_reserved = 0; 562 int must_insert_reserved = 0;
@@ -593,26 +592,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
593 592
594 delayed_refs = &trans->transaction->delayed_refs; 593 delayed_refs = &trans->transaction->delayed_refs;
595 594
596 /* first set the basic ref node struct up */ 595 refcount_set(&head_ref->refs, 1);
597 refcount_set(&ref->refs, 1); 596 head_ref->bytenr = bytenr;
598 ref->bytenr = bytenr; 597 head_ref->num_bytes = num_bytes;
599 ref->num_bytes = num_bytes; 598 head_ref->ref_mod = count_mod;
600 ref->ref_mod = count_mod;
601 ref->type = 0;
602 ref->action = 0;
603 ref->is_head = 1;
604 ref->in_tree = 1;
605 ref->seq = 0;
606
607 head_ref = btrfs_delayed_node_to_head(ref);
608 head_ref->must_insert_reserved = must_insert_reserved; 599 head_ref->must_insert_reserved = must_insert_reserved;
609 head_ref->is_data = is_data; 600 head_ref->is_data = is_data;
610 INIT_LIST_HEAD(&head_ref->ref_list); 601 head_ref->ref_tree = RB_ROOT;
611 INIT_LIST_HEAD(&head_ref->ref_add_list); 602 INIT_LIST_HEAD(&head_ref->ref_add_list);
603 RB_CLEAR_NODE(&head_ref->href_node);
612 head_ref->processing = 0; 604 head_ref->processing = 0;
613 head_ref->total_ref_mod = count_mod; 605 head_ref->total_ref_mod = count_mod;
614 head_ref->qgroup_reserved = 0; 606 head_ref->qgroup_reserved = 0;
615 head_ref->qgroup_ref_root = 0; 607 head_ref->qgroup_ref_root = 0;
608 spin_lock_init(&head_ref->lock);
609 mutex_init(&head_ref->mutex);
616 610
617 /* Record qgroup extent info if provided */ 611 /* Record qgroup extent info if provided */
618 if (qrecord) { 612 if (qrecord) {
@@ -632,17 +626,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
632 qrecord_inserted = 1; 626 qrecord_inserted = 1;
633 } 627 }
634 628
635 spin_lock_init(&head_ref->lock); 629 trace_add_delayed_ref_head(fs_info, head_ref, action);
636 mutex_init(&head_ref->mutex);
637
638 trace_add_delayed_ref_head(fs_info, ref, head_ref, action);
639 630
640 existing = htree_insert(&delayed_refs->href_root, 631 existing = htree_insert(&delayed_refs->href_root,
641 &head_ref->href_node); 632 &head_ref->href_node);
642 if (existing) { 633 if (existing) {
643 WARN_ON(ref_root && reserved && existing->qgroup_ref_root 634 WARN_ON(ref_root && reserved && existing->qgroup_ref_root
644 && existing->qgroup_reserved); 635 && existing->qgroup_reserved);
645 update_existing_head_ref(delayed_refs, &existing->node, ref, 636 update_existing_head_ref(delayed_refs, existing, head_ref,
646 old_ref_mod); 637 old_ref_mod);
647 /* 638 /*
648 * we've updated the existing ref, free the newly 639 * we've updated the existing ref, free the newly
@@ -699,7 +690,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
699 ref->is_head = 0; 690 ref->is_head = 0;
700 ref->in_tree = 1; 691 ref->in_tree = 1;
701 ref->seq = seq; 692 ref->seq = seq;
702 INIT_LIST_HEAD(&ref->list); 693 RB_CLEAR_NODE(&ref->ref_node);
703 INIT_LIST_HEAD(&ref->add_list); 694 INIT_LIST_HEAD(&ref->add_list);
704 695
705 full_ref = btrfs_delayed_node_to_tree_ref(ref); 696 full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -713,7 +704,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
713 704
714 trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); 705 trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
715 706
716 ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 707 ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
717 708
718 /* 709 /*
719 * XXX: memory should be freed at the same level allocated. 710 * XXX: memory should be freed at the same level allocated.
@@ -756,7 +747,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
756 ref->is_head = 0; 747 ref->is_head = 0;
757 ref->in_tree = 1; 748 ref->in_tree = 1;
758 ref->seq = seq; 749 ref->seq = seq;
759 INIT_LIST_HEAD(&ref->list); 750 RB_CLEAR_NODE(&ref->ref_node);
760 INIT_LIST_HEAD(&ref->add_list); 751 INIT_LIST_HEAD(&ref->add_list);
761 752
762 full_ref = btrfs_delayed_node_to_data_ref(ref); 753 full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -772,8 +763,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
772 763
773 trace_add_delayed_data_ref(fs_info, ref, full_ref, action); 764 trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
774 765
775 ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 766 ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
776
777 if (ret > 0) 767 if (ret > 0)
778 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 768 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
779} 769}
@@ -821,7 +811,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
821 * insert both the head node and the new ref without dropping 811 * insert both the head node and the new ref without dropping
822 * the spin lock 812 * the spin lock
823 */ 813 */
824 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 814 head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
825 bytenr, num_bytes, 0, 0, action, 0, 815 bytenr, num_bytes, 0, 0, action, 0,
826 &qrecord_inserted, old_ref_mod, 816 &qrecord_inserted, old_ref_mod,
827 new_ref_mod); 817 new_ref_mod);
@@ -888,7 +878,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
888 * insert both the head node and the new ref without dropping 878 * insert both the head node and the new ref without dropping
889 * the spin lock 879 * the spin lock
890 */ 880 */
891 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 881 head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
892 bytenr, num_bytes, ref_root, reserved, 882 bytenr, num_bytes, ref_root, reserved,
893 action, 1, &qrecord_inserted, 883 action, 1, &qrecord_inserted,
894 old_ref_mod, new_ref_mod); 884 old_ref_mod, new_ref_mod);
@@ -920,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
920 delayed_refs = &trans->transaction->delayed_refs; 910 delayed_refs = &trans->transaction->delayed_refs;
921 spin_lock(&delayed_refs->lock); 911 spin_lock(&delayed_refs->lock);
922 912
923 add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, 913 add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
924 num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, 914 num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
925 extent_op->is_data, NULL, NULL, NULL); 915 extent_op->is_data, NULL, NULL, NULL);
926 916
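
Since add_delayed_ref_head() now receives the head itself rather than &head_ref->node, the old container_of()-based up-cast (btrfs_delayed_node_to_head(), removed in the header diff below) is no longer needed. For readers unfamiliar with that embedded-member pattern, a self-contained sketch of what the old layout required:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct node     { unsigned long long bytenr; };
struct head_old { struct node node; int processing; };

int main(void)
{
    struct head_old h = { { 4096 }, 0 };
    struct node *n = &h.node;

    /* old style: recover the head from the embedded member */
    struct head_old *back = container_of(n, struct head_old, node);

    printf("recovered head, bytenr %llu\n", back->node.bytenr);
    return 0;
}

With bytenr, num_bytes and refs moved into the head directly, this cast — and the duplicated fields the removed comment complained about — disappear.
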
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ce88e4ac5276..a43af432f859 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -26,18 +26,8 @@
26#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ 26#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
27#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ 27#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
28 28
29/*
30 * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the
31 * same ref_node structure.
32 * Ref_head is in a higher logic level than tree/data ref, and duplicated
33 * bytenr/num_bytes in ref_node is really a waste or memory, they should be
34 * referred from ref_head.
35 * This gets more disgusting after we use list to store tree/data ref in
36 * ref_head. Must clean this mess up later.
37 */
38struct btrfs_delayed_ref_node { 29struct btrfs_delayed_ref_node {
39 /*data/tree ref use list, stored in ref_head->ref_list. */ 30 struct rb_node ref_node;
40 struct list_head list;
41 /* 31 /*
42 * If action is BTRFS_ADD_DELAYED_REF, also link this node to 32 * If action is BTRFS_ADD_DELAYED_REF, also link this node to
43 * ref_head->ref_add_list, then we do not need to iterate the 33 * ref_head->ref_add_list, then we do not need to iterate the
@@ -91,8 +81,9 @@ struct btrfs_delayed_extent_op {
91 * reference count modifications we've queued up. 81 * reference count modifications we've queued up.
92 */ 82 */
93struct btrfs_delayed_ref_head { 83struct btrfs_delayed_ref_head {
94 struct btrfs_delayed_ref_node node; 84 u64 bytenr;
95 85 u64 num_bytes;
86 refcount_t refs;
96 /* 87 /*
97 * the mutex is held while running the refs, and it is also 88 * the mutex is held while running the refs, and it is also
98 * held when checking the sum of reference modifications. 89 * held when checking the sum of reference modifications.
@@ -100,7 +91,7 @@ struct btrfs_delayed_ref_head {
100 struct mutex mutex; 91 struct mutex mutex;
101 92
102 spinlock_t lock; 93 spinlock_t lock;
103 struct list_head ref_list; 94 struct rb_root ref_tree;
104 /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ 95 /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
105 struct list_head ref_add_list; 96 struct list_head ref_add_list;
106 97
@@ -116,6 +107,14 @@ struct btrfs_delayed_ref_head {
116 int total_ref_mod; 107 int total_ref_mod;
117 108
118 /* 109 /*
110 * This is the current outstanding mod references for this bytenr. This
111 * is used with lookup_extent_info to get an accurate reference count
112 * for a bytenr, so it is adjusted as delayed refs are run so that any
113 * on disk reference count + ref_mod is accurate.
114 */
115 int ref_mod;
116
117 /*
119 * For qgroup reserved space freeing. 118 * For qgroup reserved space freeing.
120 * 119 *
121 * ref_root and reserved will be recorded after 120 * ref_root and reserved will be recorded after
@@ -234,15 +233,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
234 case BTRFS_SHARED_DATA_REF_KEY: 233 case BTRFS_SHARED_DATA_REF_KEY:
235 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 234 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
236 break; 235 break;
237 case 0:
238 kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
239 break;
240 default: 236 default:
241 BUG(); 237 BUG();
242 } 238 }
243 } 239 }
244} 240}
245 241
242static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
243{
244 if (refcount_dec_and_test(&head->refs))
245 kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
246}
247
246int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, 248int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
247 struct btrfs_trans_handle *trans, 249 struct btrfs_trans_handle *trans,
248 u64 bytenr, u64 num_bytes, u64 parent, 250 u64 bytenr, u64 num_bytes, u64 parent,
@@ -283,35 +285,17 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
283 u64 seq); 285 u64 seq);
284 286
285/* 287/*
286 * a node might live in a head or a regular ref, this lets you
287 * test for the proper type to use.
288 */
289static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
290{
291 return node->is_head;
292}
293
294/*
295 * helper functions to cast a node into its container 288 * helper functions to cast a node into its container
296 */ 289 */
297static inline struct btrfs_delayed_tree_ref * 290static inline struct btrfs_delayed_tree_ref *
298btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) 291btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
299{ 292{
300 WARN_ON(btrfs_delayed_ref_is_head(node));
301 return container_of(node, struct btrfs_delayed_tree_ref, node); 293 return container_of(node, struct btrfs_delayed_tree_ref, node);
302} 294}
303 295
304static inline struct btrfs_delayed_data_ref * 296static inline struct btrfs_delayed_data_ref *
305btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) 297btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
306{ 298{
307 WARN_ON(btrfs_delayed_ref_is_head(node));
308 return container_of(node, struct btrfs_delayed_data_ref, node); 299 return container_of(node, struct btrfs_delayed_data_ref, node);
309} 300}
310
311static inline struct btrfs_delayed_ref_head *
312btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
313{
314 WARN_ON(!btrfs_delayed_ref_is_head(node));
315 return container_of(node, struct btrfs_delayed_ref_head, node);
316}
317#endif 301#endif
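
btrfs_put_delayed_ref_head() above is the standard refcount idiom: whichever caller drops the count to zero frees the object. A userspace analogue using C11 atomics (names illustrative; the kernel uses refcount_t rather than raw atomics):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct head {
    atomic_int refs;
    unsigned long long bytenr;
};

static struct head *head_alloc(unsigned long long bytenr)
{
    struct head *h = malloc(sizeof(*h));

    if (h) {
        atomic_init(&h->refs, 1);
        h->bytenr = bytenr;
    }
    return h;
}

static void head_get(struct head *h)
{
    atomic_fetch_add_explicit(&h->refs, 1, memory_order_relaxed);
}

static void head_put(struct head *h)
{
    /* fetch_sub returns the old value; 1 means we held the last ref */
    if (atomic_fetch_sub_explicit(&h->refs, 1, memory_order_acq_rel) == 1) {
        printf("freeing head for %llu\n", h->bytenr);
        free(h);
    }
}

int main(void)
{
    struct head *h = head_alloc(4096);

    head_get(h);   /* second holder, e.g. held across a lock drop */
    head_put(h);
    head_put(h);   /* last put frees */
    return 0;
}
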
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab849037b..efce9a2fa9be 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,8 @@
50#include "sysfs.h" 50#include "sysfs.h"
51#include "qgroup.h" 51#include "qgroup.h"
52#include "compression.h" 52#include "compression.h"
53#include "tree-checker.h"
54#include "ref-verify.h"
53 55
54#ifdef CONFIG_X86 56#ifdef CONFIG_X86
55#include <asm/cpufeature.h> 57#include <asm/cpufeature.h>
@@ -543,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
543 return ret; 545 return ret;
544} 546}
545 547
546#define CORRUPT(reason, eb, root, slot) \
547 btrfs_crit(root->fs_info, \
548 "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \
549 btrfs_header_level(eb) == 0 ? "leaf" : "node", \
550 reason, btrfs_header_bytenr(eb), root->objectid, slot)
551
552static noinline int check_leaf(struct btrfs_root *root,
553 struct extent_buffer *leaf)
554{
555 struct btrfs_fs_info *fs_info = root->fs_info;
556 struct btrfs_key key;
557 struct btrfs_key leaf_key;
558 u32 nritems = btrfs_header_nritems(leaf);
559 int slot;
560
561 /*
562 * Extent buffers from a relocation tree have a owner field that
563 * corresponds to the subvolume tree they are based on. So just from an
564 * extent buffer alone we can not find out what is the id of the
565 * corresponding subvolume tree, so we can not figure out if the extent
566 * buffer corresponds to the root of the relocation tree or not. So skip
567 * this check for relocation trees.
568 */
569 if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
570 struct btrfs_root *check_root;
571
572 key.objectid = btrfs_header_owner(leaf);
573 key.type = BTRFS_ROOT_ITEM_KEY;
574 key.offset = (u64)-1;
575
576 check_root = btrfs_get_fs_root(fs_info, &key, false);
577 /*
578 * The only reason we also check NULL here is that during
579 * open_ctree() some roots has not yet been set up.
580 */
581 if (!IS_ERR_OR_NULL(check_root)) {
582 struct extent_buffer *eb;
583
584 eb = btrfs_root_node(check_root);
585 /* if leaf is the root, then it's fine */
586 if (leaf != eb) {
587 CORRUPT("non-root leaf's nritems is 0",
588 leaf, check_root, 0);
589 free_extent_buffer(eb);
590 return -EIO;
591 }
592 free_extent_buffer(eb);
593 }
594 return 0;
595 }
596
597 if (nritems == 0)
598 return 0;
599
600 /* Check the 0 item */
601 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
602 BTRFS_LEAF_DATA_SIZE(fs_info)) {
603 CORRUPT("invalid item offset size pair", leaf, root, 0);
604 return -EIO;
605 }
606
607 /*
608 * Check to make sure each items keys are in the correct order and their
609 * offsets make sense. We only have to loop through nritems-1 because
610 * we check the current slot against the next slot, which verifies the
611 * next slot's offset+size makes sense and that the current's slot
612 * offset is correct.
613 */
614 for (slot = 0; slot < nritems - 1; slot++) {
615 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
616 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
617
618 /* Make sure the keys are in the right order */
619 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
620 CORRUPT("bad key order", leaf, root, slot);
621 return -EIO;
622 }
623
624 /*
625 * Make sure the offset and ends are right, remember that the
626 * item data starts at the end of the leaf and grows towards the
627 * front.
628 */
629 if (btrfs_item_offset_nr(leaf, slot) !=
630 btrfs_item_end_nr(leaf, slot + 1)) {
631 CORRUPT("slot offset bad", leaf, root, slot);
632 return -EIO;
633 }
634
635 /*
636 * Check to make sure that we don't point outside of the leaf,
637 * just in case all the items are consistent to each other, but
638 * all point outside of the leaf.
639 */
640 if (btrfs_item_end_nr(leaf, slot) >
641 BTRFS_LEAF_DATA_SIZE(fs_info)) {
642 CORRUPT("slot end outside of leaf", leaf, root, slot);
643 return -EIO;
644 }
645 }
646
647 return 0;
648}
649
650static int check_node(struct btrfs_root *root, struct extent_buffer *node)
651{
652 unsigned long nr = btrfs_header_nritems(node);
653 struct btrfs_key key, next_key;
654 int slot;
655 u64 bytenr;
656 int ret = 0;
657
658 if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
659 btrfs_crit(root->fs_info,
660 "corrupt node: block %llu root %llu nritems %lu",
661 node->start, root->objectid, nr);
662 return -EIO;
663 }
664
665 for (slot = 0; slot < nr - 1; slot++) {
666 bytenr = btrfs_node_blockptr(node, slot);
667 btrfs_node_key_to_cpu(node, &key, slot);
668 btrfs_node_key_to_cpu(node, &next_key, slot + 1);
669
670 if (!bytenr) {
671 CORRUPT("invalid item slot", node, root, slot);
672 ret = -EIO;
673 goto out;
674 }
675
676 if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
677 CORRUPT("bad key order", node, root, slot);
678 ret = -EIO;
679 goto out;
680 }
681 }
682out:
683 return ret;
684}
685
686static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 548static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
687 u64 phy_offset, struct page *page, 549 u64 phy_offset, struct page *page,
688 u64 start, u64 end, int mirror) 550 u64 start, u64 end, int mirror)
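
The block removed above is the old in-place leaf/node verification, now reached through btrfs_check_leaf() and btrfs_check_node() from the tree-checker code included at the top of this file. Its two core leaf invariants — keys strictly increasing, and item data packed back-to-front so each item ends exactly where the previous one begins — can be shown in a simplified standalone form (an editor's sketch with flattened types, not the kernel checker):

#include <stdio.h>

struct leaf_item { int key; unsigned int offset; unsigned int size; };

static int check_leaf(const struct leaf_item *it, int nritems,
                      unsigned int data_size)
{
    if (nritems == 0)
        return 0;

    /* item 0 must end flush with the end of the data area */
    if (it[0].offset + it[0].size != data_size)
        return -1;

    for (int slot = 0; slot + 1 < nritems; slot++) {
        if (it[slot].key >= it[slot + 1].key)
            return -1;                       /* bad key order */
        if (it[slot + 1].offset + it[slot + 1].size != it[slot].offset)
            return -1;                       /* gap or overlap in data */
    }
    return 0;
}

int main(void)
{
    /* 100-byte data area, filled back to front: bytes 80..100, 50..80 */
    struct leaf_item ok[]  = { {1, 80, 20}, {2, 50, 30} };
    struct leaf_item bad[] = { {2, 80, 20}, {1, 50, 30} };

    printf("ok leaf:  %d\n", check_leaf(ok, 2, 100));
    printf("bad leaf: %d\n", check_leaf(bad, 2, 100));
    return 0;
}
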
@@ -748,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
748 * that we don't try and read the other copies of this block, just 610 * that we don't try and read the other copies of this block, just
749 * return -EIO. 611 * return -EIO.
750 */ 612 */
751 if (found_level == 0 && check_leaf(root, eb)) { 613 if (found_level == 0 && btrfs_check_leaf(root, eb)) {
752 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 614 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
753 ret = -EIO; 615 ret = -EIO;
754 } 616 }
755 617
756 if (found_level > 0 && check_node(root, eb)) 618 if (found_level > 0 && btrfs_check_node(root, eb))
757 ret = -EIO; 619 ret = -EIO;
758 620
759 if (!ret) 621 if (!ret)
@@ -879,22 +741,9 @@ static void run_one_async_start(struct btrfs_work *work)
879 741
880static void run_one_async_done(struct btrfs_work *work) 742static void run_one_async_done(struct btrfs_work *work)
881{ 743{
882 struct btrfs_fs_info *fs_info;
883 struct async_submit_bio *async; 744 struct async_submit_bio *async;
884 int limit;
885 745
886 async = container_of(work, struct async_submit_bio, work); 746 async = container_of(work, struct async_submit_bio, work);
887 fs_info = async->fs_info;
888
889 limit = btrfs_async_submit_limit(fs_info);
890 limit = limit * 2 / 3;
891
892 /*
893 * atomic_dec_return implies a barrier for waitqueue_active
894 */
895 if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
896 waitqueue_active(&fs_info->async_submit_wait))
897 wake_up(&fs_info->async_submit_wait);
898 747
899 /* If an error occurred we just want to clean up the bio and move on */ 748 /* If an error occurred we just want to clean up the bio and move on */
900 if (async->status) { 749 if (async->status) {
@@ -942,19 +791,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
942 791
943 async->status = 0; 792 async->status = 0;
944 793
945 atomic_inc(&fs_info->nr_async_submits);
946
947 if (op_is_sync(bio->bi_opf)) 794 if (op_is_sync(bio->bi_opf))
948 btrfs_set_work_high_priority(&async->work); 795 btrfs_set_work_high_priority(&async->work);
949 796
950 btrfs_queue_work(fs_info->workers, &async->work); 797 btrfs_queue_work(fs_info->workers, &async->work);
951
952 while (atomic_read(&fs_info->async_submit_draining) &&
953 atomic_read(&fs_info->nr_async_submits)) {
954 wait_event(fs_info->async_submit_wait,
955 (atomic_read(&fs_info->nr_async_submits) == 0));
956 }
957
958 return 0; 798 return 0;
959} 799}
960 800
@@ -1005,9 +845,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
1005 return ret; 845 return ret;
1006} 846}
1007 847
1008static int check_async_write(unsigned long bio_flags) 848static int check_async_write(struct btrfs_inode *bi)
1009{ 849{
1010 if (bio_flags & EXTENT_BIO_TREE_LOG) 850 if (atomic_read(&bi->sync_writers))
1011 return 0; 851 return 0;
1012#ifdef CONFIG_X86 852#ifdef CONFIG_X86
1013 if (static_cpu_has(X86_FEATURE_XMM4_2)) 853 if (static_cpu_has(X86_FEATURE_XMM4_2))
@@ -1022,7 +862,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
1022{ 862{
1023 struct inode *inode = private_data; 863 struct inode *inode = private_data;
1024 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 864 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1025 int async = check_async_write(bio_flags); 865 int async = check_async_write(BTRFS_I(inode));
1026 blk_status_t ret; 866 blk_status_t ret;
1027 867
1028 if (bio_op(bio) != REQ_OP_WRITE) { 868 if (bio_op(bio) != REQ_OP_WRITE) {
@@ -2607,14 +2447,6 @@ int open_ctree(struct super_block *sb,
2607 goto fail_delalloc_bytes; 2447 goto fail_delalloc_bytes;
2608 } 2448 }
2609 2449
2610 fs_info->btree_inode = new_inode(sb);
2611 if (!fs_info->btree_inode) {
2612 err = -ENOMEM;
2613 goto fail_bio_counter;
2614 }
2615
2616 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2617
2618 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 2450 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2619 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); 2451 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2620 INIT_LIST_HEAD(&fs_info->trans_list); 2452 INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2647,17 +2479,12 @@ int open_ctree(struct super_block *sb,
2647 btrfs_mapping_init(&fs_info->mapping_tree); 2479 btrfs_mapping_init(&fs_info->mapping_tree);
2648 btrfs_init_block_rsv(&fs_info->global_block_rsv, 2480 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2649 BTRFS_BLOCK_RSV_GLOBAL); 2481 BTRFS_BLOCK_RSV_GLOBAL);
2650 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2651 BTRFS_BLOCK_RSV_DELALLOC);
2652 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); 2482 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2653 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); 2483 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2654 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); 2484 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2655 btrfs_init_block_rsv(&fs_info->delayed_block_rsv, 2485 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2656 BTRFS_BLOCK_RSV_DELOPS); 2486 BTRFS_BLOCK_RSV_DELOPS);
2657 atomic_set(&fs_info->nr_async_submits, 0);
2658 atomic_set(&fs_info->async_delalloc_pages, 0); 2487 atomic_set(&fs_info->async_delalloc_pages, 0);
2659 atomic_set(&fs_info->async_submit_draining, 0);
2660 atomic_set(&fs_info->nr_async_bios, 0);
2661 atomic_set(&fs_info->defrag_running, 0); 2488 atomic_set(&fs_info->defrag_running, 0);
2662 atomic_set(&fs_info->qgroup_op_seq, 0); 2489 atomic_set(&fs_info->qgroup_op_seq, 0);
2663 atomic_set(&fs_info->reada_works_cnt, 0); 2490 atomic_set(&fs_info->reada_works_cnt, 0);
@@ -2673,12 +2500,21 @@ int open_ctree(struct super_block *sb,
2673 /* readahead state */ 2500 /* readahead state */
2674 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 2501 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
2675 spin_lock_init(&fs_info->reada_lock); 2502 spin_lock_init(&fs_info->reada_lock);
2503 btrfs_init_ref_verify(fs_info);
2676 2504
2677 fs_info->thread_pool_size = min_t(unsigned long, 2505 fs_info->thread_pool_size = min_t(unsigned long,
2678 num_online_cpus() + 2, 8); 2506 num_online_cpus() + 2, 8);
2679 2507
2680 INIT_LIST_HEAD(&fs_info->ordered_roots); 2508 INIT_LIST_HEAD(&fs_info->ordered_roots);
2681 spin_lock_init(&fs_info->ordered_root_lock); 2509 spin_lock_init(&fs_info->ordered_root_lock);
2510
2511 fs_info->btree_inode = new_inode(sb);
2512 if (!fs_info->btree_inode) {
2513 err = -ENOMEM;
2514 goto fail_bio_counter;
2515 }
2516 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2517
2682 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2518 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2683 GFP_KERNEL); 2519 GFP_KERNEL);
2684 if (!fs_info->delayed_root) { 2520 if (!fs_info->delayed_root) {
@@ -2895,12 +2731,13 @@ int open_ctree(struct super_block *sb,
2895 sb->s_bdi->congested_fn = btrfs_congested_fn; 2731 sb->s_bdi->congested_fn = btrfs_congested_fn;
2896 sb->s_bdi->congested_data = fs_info; 2732 sb->s_bdi->congested_data = fs_info;
2897 sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; 2733 sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
2898 sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; 2734 sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
2899 sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); 2735 sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
2900 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); 2736 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
2901 2737
2902 sb->s_blocksize = sectorsize; 2738 sb->s_blocksize = sectorsize;
2903 sb->s_blocksize_bits = blksize_bits(sectorsize); 2739 sb->s_blocksize_bits = blksize_bits(sectorsize);
2740 memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
2904 2741
2905 mutex_lock(&fs_info->chunk_mutex); 2742 mutex_lock(&fs_info->chunk_mutex);
2906 ret = btrfs_read_sys_array(fs_info); 2743 ret = btrfs_read_sys_array(fs_info);
@@ -3083,6 +2920,9 @@ retry_root_backup:
3083 if (ret) 2920 if (ret)
3084 goto fail_trans_kthread; 2921 goto fail_trans_kthread;
3085 2922
2923 if (btrfs_build_ref_tree(fs_info))
2924 btrfs_err(fs_info, "couldn't build ref tree");
2925
3086 /* do not make disk changes in broken FS or nologreplay is given */ 2926 /* do not make disk changes in broken FS or nologreplay is given */
3087 if (btrfs_super_log_root(disk_super) != 0 && 2927 if (btrfs_super_log_root(disk_super) != 0 &&
3088 !btrfs_test_opt(fs_info, NOLOGREPLAY)) { 2928 !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
@@ -3948,6 +3788,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
3948 cleanup_srcu_struct(&fs_info->subvol_srcu); 3788 cleanup_srcu_struct(&fs_info->subvol_srcu);
3949 3789
3950 btrfs_free_stripe_hash_table(fs_info); 3790 btrfs_free_stripe_hash_table(fs_info);
3791 btrfs_free_ref_cache(fs_info);
3951 3792
3952 __btrfs_free_block_rsv(root->orphan_block_rsv); 3793 __btrfs_free_block_rsv(root->orphan_block_rsv);
3953 root->orphan_block_rsv = NULL; 3794 root->orphan_block_rsv = NULL;
@@ -4007,7 +3848,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4007 buf->len, 3848 buf->len,
4008 fs_info->dirty_metadata_batch); 3849 fs_info->dirty_metadata_batch);
4009#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3850#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4010 if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { 3851 if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) {
4011 btrfs_print_leaf(buf); 3852 btrfs_print_leaf(buf);
4012 ASSERT(0); 3853 ASSERT(0);
4013 } 3854 }
@@ -4272,26 +4113,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4272 4113
4273 while ((node = rb_first(&delayed_refs->href_root)) != NULL) { 4114 while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
4274 struct btrfs_delayed_ref_head *head; 4115 struct btrfs_delayed_ref_head *head;
4275 struct btrfs_delayed_ref_node *tmp; 4116 struct rb_node *n;
4276 bool pin_bytes = false; 4117 bool pin_bytes = false;
4277 4118
4278 head = rb_entry(node, struct btrfs_delayed_ref_head, 4119 head = rb_entry(node, struct btrfs_delayed_ref_head,
4279 href_node); 4120 href_node);
4280 if (!mutex_trylock(&head->mutex)) { 4121 if (!mutex_trylock(&head->mutex)) {
4281 refcount_inc(&head->node.refs); 4122 refcount_inc(&head->refs);
4282 spin_unlock(&delayed_refs->lock); 4123 spin_unlock(&delayed_refs->lock);
4283 4124
4284 mutex_lock(&head->mutex); 4125 mutex_lock(&head->mutex);
4285 mutex_unlock(&head->mutex); 4126 mutex_unlock(&head->mutex);
4286 btrfs_put_delayed_ref(&head->node); 4127 btrfs_put_delayed_ref_head(head);
4287 spin_lock(&delayed_refs->lock); 4128 spin_lock(&delayed_refs->lock);
4288 continue; 4129 continue;
4289 } 4130 }
4290 spin_lock(&head->lock); 4131 spin_lock(&head->lock);
4291 list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, 4132 while ((n = rb_first(&head->ref_tree)) != NULL) {
4292 list) { 4133 ref = rb_entry(n, struct btrfs_delayed_ref_node,
4134 ref_node);
4293 ref->in_tree = 0; 4135 ref->in_tree = 0;
4294 list_del(&ref->list); 4136 rb_erase(&ref->ref_node, &head->ref_tree);
4137 RB_CLEAR_NODE(&ref->ref_node);
4295 if (!list_empty(&ref->add_list)) 4138 if (!list_empty(&ref->add_list))
4296 list_del(&ref->add_list); 4139 list_del(&ref->add_list);
4297 atomic_dec(&delayed_refs->num_entries); 4140 atomic_dec(&delayed_refs->num_entries);
@@ -4304,16 +4147,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4304 if (head->processing == 0) 4147 if (head->processing == 0)
4305 delayed_refs->num_heads_ready--; 4148 delayed_refs->num_heads_ready--;
4306 atomic_dec(&delayed_refs->num_entries); 4149 atomic_dec(&delayed_refs->num_entries);
4307 head->node.in_tree = 0;
4308 rb_erase(&head->href_node, &delayed_refs->href_root); 4150 rb_erase(&head->href_node, &delayed_refs->href_root);
4151 RB_CLEAR_NODE(&head->href_node);
4309 spin_unlock(&head->lock); 4152 spin_unlock(&head->lock);
4310 spin_unlock(&delayed_refs->lock); 4153 spin_unlock(&delayed_refs->lock);
4311 mutex_unlock(&head->mutex); 4154 mutex_unlock(&head->mutex);
4312 4155
4313 if (pin_bytes) 4156 if (pin_bytes)
4314 btrfs_pin_extent(fs_info, head->node.bytenr, 4157 btrfs_pin_extent(fs_info, head->bytenr,
4315 head->node.num_bytes, 1); 4158 head->num_bytes, 1);
4316 btrfs_put_delayed_ref(&head->node); 4159 btrfs_put_delayed_ref_head(head);
4317 cond_resched(); 4160 cond_resched();
4318 spin_lock(&delayed_refs->lock); 4161 spin_lock(&delayed_refs->lock);
4319 } 4162 }
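With the per-head list replaced by an rb_tree, tearing down a head reduces to the standard rbtree drain idiom. Condensed, using the kernel rbtree API (a kernel-context sketch, not a standalone program):

    struct rb_node *n;

    while ((n = rb_first(&head->ref_tree)) != NULL) {
            struct btrfs_delayed_ref_node *ref;

            ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node);
            rb_erase(&ref->ref_node, &head->ref_tree); /* unlink from tree */
            RB_CLEAR_NODE(&ref->ref_node);             /* mark as not inserted */
            /* drop auxiliary lists, counters and the node's refcount here */
    }

RB_CLEAR_NODE matters because it lets RB_EMPTY_NODE-style checks elsewhere see the ref as unlinked rather than still queued.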
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2d7e86b51d1..673ac4e01dd0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/ratelimit.h> 27#include <linux/ratelimit.h>
28#include <linux/percpu_counter.h> 28#include <linux/percpu_counter.h>
29#include <linux/lockdep.h>
29#include "hash.h" 30#include "hash.h"
30#include "tree-log.h" 31#include "tree-log.h"
31#include "disk-io.h" 32#include "disk-io.h"
@@ -38,6 +39,7 @@
38#include "math.h" 39#include "math.h"
39#include "sysfs.h" 40#include "sysfs.h"
40#include "qgroup.h" 41#include "qgroup.h"
42#include "ref-verify.h"
41 43
42#undef SCRAMBLE_DELAYED_REFS 44#undef SCRAMBLE_DELAYED_REFS
43 45
@@ -61,9 +63,6 @@ enum {
61 CHUNK_ALLOC_FORCE = 2, 63 CHUNK_ALLOC_FORCE = 2,
62}; 64};
63 65
64static int update_block_group(struct btrfs_trans_handle *trans,
65 struct btrfs_fs_info *fs_info, u64 bytenr,
66 u64 num_bytes, int alloc);
67static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 66static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
68 struct btrfs_fs_info *fs_info, 67 struct btrfs_fs_info *fs_info,
69 struct btrfs_delayed_ref_node *node, u64 parent, 68 struct btrfs_delayed_ref_node *node, u64 parent,
@@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level,
91static void dump_space_info(struct btrfs_fs_info *fs_info, 90static void dump_space_info(struct btrfs_fs_info *fs_info,
92 struct btrfs_space_info *info, u64 bytes, 91 struct btrfs_space_info *info, u64 bytes,
93 int dump_block_groups); 92 int dump_block_groups);
94static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
95 u64 ram_bytes, u64 num_bytes, int delalloc);
96static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
97 u64 num_bytes, int delalloc);
98static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 93static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
99 u64 num_bytes); 94 u64 num_bytes);
100static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
101 struct btrfs_space_info *space_info,
102 u64 orig_bytes,
103 enum btrfs_reserve_flush_enum flush,
104 bool system_chunk);
105static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 95static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
106 struct btrfs_space_info *space_info, 96 struct btrfs_space_info *space_info,
107 u64 num_bytes); 97 u64 num_bytes);
@@ -652,7 +642,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
652 cache->cached = BTRFS_CACHE_FAST; 642 cache->cached = BTRFS_CACHE_FAST;
653 spin_unlock(&cache->lock); 643 spin_unlock(&cache->lock);
654 644
655 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 645 if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
656 mutex_lock(&caching_ctl->mutex); 646 mutex_lock(&caching_ctl->mutex);
657 ret = load_free_space_cache(fs_info, cache); 647 ret = load_free_space_cache(fs_info, cache);
658 648
@@ -923,7 +913,7 @@ search_again:
923 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 913 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
924 if (head) { 914 if (head) {
925 if (!mutex_trylock(&head->mutex)) { 915 if (!mutex_trylock(&head->mutex)) {
926 refcount_inc(&head->node.refs); 916 refcount_inc(&head->refs);
927 spin_unlock(&delayed_refs->lock); 917 spin_unlock(&delayed_refs->lock);
928 918
929 btrfs_release_path(path); 919 btrfs_release_path(path);
@@ -934,7 +924,7 @@ search_again:
934 */ 924 */
935 mutex_lock(&head->mutex); 925 mutex_lock(&head->mutex);
936 mutex_unlock(&head->mutex); 926 mutex_unlock(&head->mutex);
937 btrfs_put_delayed_ref(&head->node); 927 btrfs_put_delayed_ref_head(head);
938 goto search_again; 928 goto search_again;
939 } 929 }
940 spin_lock(&head->lock); 930 spin_lock(&head->lock);
@@ -943,7 +933,7 @@ search_again:
943 else 933 else
944 BUG_ON(num_refs == 0); 934 BUG_ON(num_refs == 0);
945 935
946 num_refs += head->node.ref_mod; 936 num_refs += head->ref_mod;
947 spin_unlock(&head->lock); 937 spin_unlock(&head->lock);
948 mutex_unlock(&head->mutex); 938 mutex_unlock(&head->mutex);
949 } 939 }
@@ -2189,16 +2179,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
2189 2179
2190/* Can return -ENOMEM */ 2180/* Can return -ENOMEM */
2191int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2181int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2192 struct btrfs_fs_info *fs_info, 2182 struct btrfs_root *root,
2193 u64 bytenr, u64 num_bytes, u64 parent, 2183 u64 bytenr, u64 num_bytes, u64 parent,
2194 u64 root_objectid, u64 owner, u64 offset) 2184 u64 root_objectid, u64 owner, u64 offset)
2195{ 2185{
2186 struct btrfs_fs_info *fs_info = root->fs_info;
2196 int old_ref_mod, new_ref_mod; 2187 int old_ref_mod, new_ref_mod;
2197 int ret; 2188 int ret;
2198 2189
2199 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2190 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2200 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2191 root_objectid == BTRFS_TREE_LOG_OBJECTID);
2201 2192
2193 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2194 owner, offset, BTRFS_ADD_DELAYED_REF);
2195
2202 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2196 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2203 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2197 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2204 num_bytes, parent, 2198 num_bytes, parent,
@@ -2344,7 +2338,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2344 2338
2345static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2339static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2346 struct btrfs_fs_info *fs_info, 2340 struct btrfs_fs_info *fs_info,
2347 struct btrfs_delayed_ref_node *node, 2341 struct btrfs_delayed_ref_head *head,
2348 struct btrfs_delayed_extent_op *extent_op) 2342 struct btrfs_delayed_extent_op *extent_op)
2349{ 2343{
2350 struct btrfs_key key; 2344 struct btrfs_key key;
@@ -2366,14 +2360,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2366 if (!path) 2360 if (!path)
2367 return -ENOMEM; 2361 return -ENOMEM;
2368 2362
2369 key.objectid = node->bytenr; 2363 key.objectid = head->bytenr;
2370 2364
2371 if (metadata) { 2365 if (metadata) {
2372 key.type = BTRFS_METADATA_ITEM_KEY; 2366 key.type = BTRFS_METADATA_ITEM_KEY;
2373 key.offset = extent_op->level; 2367 key.offset = extent_op->level;
2374 } else { 2368 } else {
2375 key.type = BTRFS_EXTENT_ITEM_KEY; 2369 key.type = BTRFS_EXTENT_ITEM_KEY;
2376 key.offset = node->num_bytes; 2370 key.offset = head->num_bytes;
2377 } 2371 }
2378 2372
2379again: 2373again:
@@ -2390,17 +2384,17 @@ again:
2390 path->slots[0]--; 2384 path->slots[0]--;
2391 btrfs_item_key_to_cpu(path->nodes[0], &key, 2385 btrfs_item_key_to_cpu(path->nodes[0], &key,
2392 path->slots[0]); 2386 path->slots[0]);
2393 if (key.objectid == node->bytenr && 2387 if (key.objectid == head->bytenr &&
2394 key.type == BTRFS_EXTENT_ITEM_KEY && 2388 key.type == BTRFS_EXTENT_ITEM_KEY &&
2395 key.offset == node->num_bytes) 2389 key.offset == head->num_bytes)
2396 ret = 0; 2390 ret = 0;
2397 } 2391 }
2398 if (ret > 0) { 2392 if (ret > 0) {
2399 btrfs_release_path(path); 2393 btrfs_release_path(path);
2400 metadata = 0; 2394 metadata = 0;
2401 2395
2402 key.objectid = node->bytenr; 2396 key.objectid = head->bytenr;
2403 key.offset = node->num_bytes; 2397 key.offset = head->num_bytes;
2404 key.type = BTRFS_EXTENT_ITEM_KEY; 2398 key.type = BTRFS_EXTENT_ITEM_KEY;
2405 goto again; 2399 goto again;
2406 } 2400 }
@@ -2507,44 +2501,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2507 return 0; 2501 return 0;
2508 } 2502 }
2509 2503
2510 if (btrfs_delayed_ref_is_head(node)) {
2511 struct btrfs_delayed_ref_head *head;
2512 /*
2513 * we've hit the end of the chain and we were supposed
2514 * to insert this extent into the tree. But, it got
2515 * deleted before we ever needed to insert it, so all
2516 * we have to do is clean up the accounting
2517 */
2518 BUG_ON(extent_op);
2519 head = btrfs_delayed_node_to_head(node);
2520 trace_run_delayed_ref_head(fs_info, node, head, node->action);
2521
2522 if (head->total_ref_mod < 0) {
2523 struct btrfs_block_group_cache *cache;
2524
2525 cache = btrfs_lookup_block_group(fs_info, node->bytenr);
2526 ASSERT(cache);
2527 percpu_counter_add(&cache->space_info->total_bytes_pinned,
2528 -node->num_bytes);
2529 btrfs_put_block_group(cache);
2530 }
2531
2532 if (insert_reserved) {
2533 btrfs_pin_extent(fs_info, node->bytenr,
2534 node->num_bytes, 1);
2535 if (head->is_data) {
2536 ret = btrfs_del_csums(trans, fs_info,
2537 node->bytenr,
2538 node->num_bytes);
2539 }
2540 }
2541
2542 /* Also free its reserved qgroup space */
2543 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2544 head->qgroup_reserved);
2545 return ret;
2546 }
2547
2548 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2504 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2549 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2505 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2550 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, 2506 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
@@ -2563,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
2563{ 2519{
2564 struct btrfs_delayed_ref_node *ref; 2520 struct btrfs_delayed_ref_node *ref;
2565 2521
2566 if (list_empty(&head->ref_list)) 2522 if (RB_EMPTY_ROOT(&head->ref_tree))
2567 return NULL; 2523 return NULL;
2568 2524
2569 /* 2525 /*
@@ -2576,12 +2532,114 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
2576 return list_first_entry(&head->ref_add_list, 2532 return list_first_entry(&head->ref_add_list,
2577 struct btrfs_delayed_ref_node, add_list); 2533 struct btrfs_delayed_ref_node, add_list);
2578 2534
2579 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 2535 ref = rb_entry(rb_first(&head->ref_tree),
2580 list); 2536 struct btrfs_delayed_ref_node, ref_node);
2581 ASSERT(list_empty(&ref->add_list)); 2537 ASSERT(list_empty(&ref->add_list));
2582 return ref; 2538 return ref;
2583} 2539}
2584 2540
2541static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2542 struct btrfs_delayed_ref_head *head)
2543{
2544 spin_lock(&delayed_refs->lock);
2545 head->processing = 0;
2546 delayed_refs->num_heads_ready++;
2547 spin_unlock(&delayed_refs->lock);
2548 btrfs_delayed_ref_unlock(head);
2549}
2550
2551static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2552 struct btrfs_fs_info *fs_info,
2553 struct btrfs_delayed_ref_head *head)
2554{
2555 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2556 int ret;
2557
2558 if (!extent_op)
2559 return 0;
2560 head->extent_op = NULL;
2561 if (head->must_insert_reserved) {
2562 btrfs_free_delayed_extent_op(extent_op);
2563 return 0;
2564 }
2565 spin_unlock(&head->lock);
2566 ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
2567 btrfs_free_delayed_extent_op(extent_op);
2568 return ret ? ret : 1;
2569}
2570
2571static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2572 struct btrfs_fs_info *fs_info,
2573 struct btrfs_delayed_ref_head *head)
2574{
2575 struct btrfs_delayed_ref_root *delayed_refs;
2576 int ret;
2577
2578 delayed_refs = &trans->transaction->delayed_refs;
2579
2580 ret = cleanup_extent_op(trans, fs_info, head);
2581 if (ret < 0) {
2582 unselect_delayed_ref_head(delayed_refs, head);
2583 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2584 return ret;
2585 } else if (ret) {
2586 return ret;
2587 }
2588
2589 /*
2590 * Need to drop our head ref lock and re-acquire the delayed ref lock
2591 * and then re-check to make sure nobody got added.
2592 */
2593 spin_unlock(&head->lock);
2594 spin_lock(&delayed_refs->lock);
2595 spin_lock(&head->lock);
2596 if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
2597 spin_unlock(&head->lock);
2598 spin_unlock(&delayed_refs->lock);
2599 return 1;
2600 }
2601 delayed_refs->num_heads--;
2602 rb_erase(&head->href_node, &delayed_refs->href_root);
2603 RB_CLEAR_NODE(&head->href_node);
2604 spin_unlock(&delayed_refs->lock);
2605 spin_unlock(&head->lock);
2606 atomic_dec(&delayed_refs->num_entries);
2607
2608 trace_run_delayed_ref_head(fs_info, head, 0);
2609
2610 if (head->total_ref_mod < 0) {
2611 struct btrfs_block_group_cache *cache;
2612
2613 cache = btrfs_lookup_block_group(fs_info, head->bytenr);
2614 ASSERT(cache);
2615 percpu_counter_add(&cache->space_info->total_bytes_pinned,
2616 -head->num_bytes);
2617 btrfs_put_block_group(cache);
2618
2619 if (head->is_data) {
2620 spin_lock(&delayed_refs->lock);
2621 delayed_refs->pending_csums -= head->num_bytes;
2622 spin_unlock(&delayed_refs->lock);
2623 }
2624 }
2625
2626 if (head->must_insert_reserved) {
2627 btrfs_pin_extent(fs_info, head->bytenr,
2628 head->num_bytes, 1);
2629 if (head->is_data) {
2630 ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2631 head->num_bytes);
2632 }
2633 }
2634
2635 /* Also free its reserved qgroup space */
2636 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2637 head->qgroup_reserved);
2638 btrfs_delayed_ref_unlock(head);
2639 btrfs_put_delayed_ref_head(head);
2640 return 0;
2641}
2642
2585/* 2643/*
2586 * Returns 0 on success or if called with an already aborted transaction. 2644 * Returns 0 on success or if called with an already aborted transaction.
2587 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2645 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
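The two helpers introduced here centralize the head-retirement logic that __btrfs_run_delayed_refs used to open-code. Their return contract hinges on whether head->lock survived the call; the summary below is a reading of the code above, written as a comment block:

    /*
     * cleanup_extent_op():
     *   0  - no extent_op, or it was discarded (must_insert_reserved);
     *        head->lock is still held
     *   1  - the extent op ran successfully; head->lock was dropped
     *  <0  - run_delayed_extent_op() failed; head->lock was dropped
     *
     * cleanup_ref_head():
     *   0  - head fully retired and its reference dropped
     *   1  - a new ref or extent_op raced in; caller must re-pick the head
     *  <0  - error; the head was unselected via unselect_delayed_ref_head()
     */

Because every nonzero path has already released head->lock, the caller can only loop or bail, and it must not touch the head again on the 0 path.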
@@ -2655,11 +2713,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2655 if (ref && ref->seq && 2713 if (ref && ref->seq &&
2656 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2714 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2657 spin_unlock(&locked_ref->lock); 2715 spin_unlock(&locked_ref->lock);
2658 spin_lock(&delayed_refs->lock); 2716 unselect_delayed_ref_head(delayed_refs, locked_ref);
2659 locked_ref->processing = 0;
2660 delayed_refs->num_heads_ready++;
2661 spin_unlock(&delayed_refs->lock);
2662 btrfs_delayed_ref_unlock(locked_ref);
2663 locked_ref = NULL; 2717 locked_ref = NULL;
2664 cond_resched(); 2718 cond_resched();
2665 count++; 2719 count++;
@@ -2667,102 +2721,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2667 } 2721 }
2668 2722
2669 /* 2723 /*
2670 * record the must insert reserved flag before we 2724 * We're done processing refs in this ref_head, clean everything
2671 * drop the spin lock. 2725 * up and move on to the next ref_head.
2672 */ 2726 */
2673 must_insert_reserved = locked_ref->must_insert_reserved;
2674 locked_ref->must_insert_reserved = 0;
2675
2676 extent_op = locked_ref->extent_op;
2677 locked_ref->extent_op = NULL;
2678
2679 if (!ref) { 2727 if (!ref) {
2680 2728 ret = cleanup_ref_head(trans, fs_info, locked_ref);
2681 2729 if (ret > 0 ) {
2682 /* All delayed refs have been processed, Go ahead 2730 /* We dropped our lock, we need to loop. */
2683 * and send the head node to run_one_delayed_ref, 2731 ret = 0;
2684 * so that any accounting fixes can happen
2685 */
2686 ref = &locked_ref->node;
2687
2688 if (extent_op && must_insert_reserved) {
2689 btrfs_free_delayed_extent_op(extent_op);
2690 extent_op = NULL;
2691 }
2692
2693 if (extent_op) {
2694 spin_unlock(&locked_ref->lock);
2695 ret = run_delayed_extent_op(trans, fs_info,
2696 ref, extent_op);
2697 btrfs_free_delayed_extent_op(extent_op);
2698
2699 if (ret) {
2700 /*
2701 * Need to reset must_insert_reserved if
2702 * there was an error so the abort stuff
2703 * can cleanup the reserved space
2704 * properly.
2705 */
2706 if (must_insert_reserved)
2707 locked_ref->must_insert_reserved = 1;
2708 spin_lock(&delayed_refs->lock);
2709 locked_ref->processing = 0;
2710 delayed_refs->num_heads_ready++;
2711 spin_unlock(&delayed_refs->lock);
2712 btrfs_debug(fs_info,
2713 "run_delayed_extent_op returned %d",
2714 ret);
2715 btrfs_delayed_ref_unlock(locked_ref);
2716 return ret;
2717 }
2718 continue; 2732 continue;
2733 } else if (ret) {
2734 return ret;
2719 } 2735 }
2736 locked_ref = NULL;
2737 count++;
2738 continue;
2739 }
2720 2740
2721 /* 2741 actual_count++;
2722 * Need to drop our head ref lock and re-acquire the 2742 ref->in_tree = 0;
2723 * delayed ref lock and then re-check to make sure 2743 rb_erase(&ref->ref_node, &locked_ref->ref_tree);
2724 * nobody got added. 2744 RB_CLEAR_NODE(&ref->ref_node);
2725 */ 2745 if (!list_empty(&ref->add_list))
2726 spin_unlock(&locked_ref->lock); 2746 list_del(&ref->add_list);
2727 spin_lock(&delayed_refs->lock); 2747 /*
2728 spin_lock(&locked_ref->lock); 2748 * When we play the delayed ref, also correct the ref_mod on
2729 if (!list_empty(&locked_ref->ref_list) || 2749 * head
2730 locked_ref->extent_op) { 2750 */
2731 spin_unlock(&locked_ref->lock); 2751 switch (ref->action) {
2732 spin_unlock(&delayed_refs->lock); 2752 case BTRFS_ADD_DELAYED_REF:
2733 continue; 2753 case BTRFS_ADD_DELAYED_EXTENT:
2734 } 2754 locked_ref->ref_mod -= ref->ref_mod;
2735 ref->in_tree = 0; 2755 break;
2736 delayed_refs->num_heads--; 2756 case BTRFS_DROP_DELAYED_REF:
2737 rb_erase(&locked_ref->href_node, 2757 locked_ref->ref_mod += ref->ref_mod;
2738 &delayed_refs->href_root); 2758 break;
2739 spin_unlock(&delayed_refs->lock); 2759 default:
2740 } else { 2760 WARN_ON(1);
2741 actual_count++;
2742 ref->in_tree = 0;
2743 list_del(&ref->list);
2744 if (!list_empty(&ref->add_list))
2745 list_del(&ref->add_list);
2746 } 2761 }
2747 atomic_dec(&delayed_refs->num_entries); 2762 atomic_dec(&delayed_refs->num_entries);
2748 2763
2749 if (!btrfs_delayed_ref_is_head(ref)) { 2764 /*
2750 /* 2765 * Record the must-insert_reserved flag before we drop the spin
2751 * when we play the delayed ref, also correct the 2766 * lock.
2752 * ref_mod on head 2767 */
2753 */ 2768 must_insert_reserved = locked_ref->must_insert_reserved;
2754 switch (ref->action) { 2769 locked_ref->must_insert_reserved = 0;
2755 case BTRFS_ADD_DELAYED_REF: 2770
2756 case BTRFS_ADD_DELAYED_EXTENT: 2771 extent_op = locked_ref->extent_op;
2757 locked_ref->node.ref_mod -= ref->ref_mod; 2772 locked_ref->extent_op = NULL;
2758 break;
2759 case BTRFS_DROP_DELAYED_REF:
2760 locked_ref->node.ref_mod += ref->ref_mod;
2761 break;
2762 default:
2763 WARN_ON(1);
2764 }
2765 }
2766 spin_unlock(&locked_ref->lock); 2773 spin_unlock(&locked_ref->lock);
2767 2774
2768 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, 2775 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
@@ -2770,33 +2777,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2770 2777
2771 btrfs_free_delayed_extent_op(extent_op); 2778 btrfs_free_delayed_extent_op(extent_op);
2772 if (ret) { 2779 if (ret) {
2773 spin_lock(&delayed_refs->lock); 2780 unselect_delayed_ref_head(delayed_refs, locked_ref);
2774 locked_ref->processing = 0;
2775 delayed_refs->num_heads_ready++;
2776 spin_unlock(&delayed_refs->lock);
2777 btrfs_delayed_ref_unlock(locked_ref);
2778 btrfs_put_delayed_ref(ref); 2781 btrfs_put_delayed_ref(ref);
2779 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2782 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2780 ret); 2783 ret);
2781 return ret; 2784 return ret;
2782 } 2785 }
2783 2786
2784 /*
2785 * If this node is a head, that means all the refs in this head
2786 * have been dealt with, and we will pick the next head to deal
2787 * with, so we must unlock the head and drop it from the cluster
2788 * list before we release it.
2789 */
2790 if (btrfs_delayed_ref_is_head(ref)) {
2791 if (locked_ref->is_data &&
2792 locked_ref->total_ref_mod < 0) {
2793 spin_lock(&delayed_refs->lock);
2794 delayed_refs->pending_csums -= ref->num_bytes;
2795 spin_unlock(&delayed_refs->lock);
2796 }
2797 btrfs_delayed_ref_unlock(locked_ref);
2798 locked_ref = NULL;
2799 }
2800 btrfs_put_delayed_ref(ref); 2787 btrfs_put_delayed_ref(ref);
2801 count++; 2788 count++;
2802 cond_resched(); 2789 cond_resched();
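Since the head now carries its own ref_mod (there is no embedded node any more), playing a ref must remove that ref's contribution from the head so the cached value keeps matching what is still queued. A self-contained C model of the bookkeeping; the enum names and values are illustrative stand-ins, not the kernel's:

    #include <assert.h>

    enum ref_action { ADD_DELAYED_REF, DROP_DELAYED_REF };

    struct ref {
            enum ref_action action;
            int ref_mod;
    };

    int main(void)
    {
            /* queued refs: one ADD worth +2, one DROP worth -1; net = +1 */
            struct ref queued[] = {
                    { ADD_DELAYED_REF, 2 },
                    { DROP_DELAYED_REF, 1 },
            };
            int head_ref_mod = 2 - 1;   /* head caches the net total */
            int i;

            for (i = 0; i < 2; i++) {
                    /* playing a ref removes its contribution from the head */
                    if (queued[i].action == ADD_DELAYED_REF)
                            head_ref_mod -= queued[i].ref_mod;
                    else
                            head_ref_mod += queued[i].ref_mod;
            }
            assert(head_ref_mod == 0);  /* nothing left queued on the head */
            return 0;
    }

In this simplified model the invariant is visible directly: once every queued ref has been played, the head's cached ref_mod drops to zero.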
@@ -3100,33 +3087,16 @@ again:
3100 spin_unlock(&delayed_refs->lock); 3087 spin_unlock(&delayed_refs->lock);
3101 goto out; 3088 goto out;
3102 } 3089 }
3090 head = rb_entry(node, struct btrfs_delayed_ref_head,
3091 href_node);
3092 refcount_inc(&head->refs);
3093 spin_unlock(&delayed_refs->lock);
3103 3094
3104 while (node) { 3095 /* Mutex was contended, block until it's released and retry. */
3105 head = rb_entry(node, struct btrfs_delayed_ref_head, 3096 mutex_lock(&head->mutex);
3106 href_node); 3097 mutex_unlock(&head->mutex);
3107 if (btrfs_delayed_ref_is_head(&head->node)) {
3108 struct btrfs_delayed_ref_node *ref;
3109
3110 ref = &head->node;
3111 refcount_inc(&ref->refs);
3112
3113 spin_unlock(&delayed_refs->lock);
3114 /*
3115 * Mutex was contended, block until it's
3116 * released and try again
3117 */
3118 mutex_lock(&head->mutex);
3119 mutex_unlock(&head->mutex);
3120 3098
3121 btrfs_put_delayed_ref(ref); 3099 btrfs_put_delayed_ref_head(head);
3122 cond_resched();
3123 goto again;
3124 } else {
3125 WARN_ON(1);
3126 }
3127 node = rb_next(node);
3128 }
3129 spin_unlock(&delayed_refs->lock);
3130 cond_resched(); 3100 cond_resched();
3131 goto again; 3101 goto again;
3132 } 3102 }
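btrfs_run_delayed_refs now waits on a contended head directly instead of walking siblings under the spinlock. The idiom it uses appears several times in this series: pin the object, drop the spinlock, then use a lock/unlock pair on the mutex purely as a wait:

    refcount_inc(&head->refs);        /* keep the head alive across the wait */
    spin_unlock(&delayed_refs->lock); /* never sleep under a spinlock */

    mutex_lock(&head->mutex);         /* blocks until the holder finishes */
    mutex_unlock(&head->mutex);       /* we only needed to wait, not to own it */

    btrfs_put_delayed_ref_head(head); /* may be the last reference */
    /* restart the lookup: the head may have been freed or re-queued */

The same shape shows up in check_delayed_ref() and btrfs_destroy_delayed_refs() above.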
@@ -3169,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
3169 struct btrfs_delayed_data_ref *data_ref; 3139 struct btrfs_delayed_data_ref *data_ref;
3170 struct btrfs_delayed_ref_root *delayed_refs; 3140 struct btrfs_delayed_ref_root *delayed_refs;
3171 struct btrfs_transaction *cur_trans; 3141 struct btrfs_transaction *cur_trans;
3142 struct rb_node *node;
3172 int ret = 0; 3143 int ret = 0;
3173 3144
3174 cur_trans = root->fs_info->running_transaction; 3145 cur_trans = root->fs_info->running_transaction;
@@ -3184,7 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
3184 } 3155 }
3185 3156
3186 if (!mutex_trylock(&head->mutex)) { 3157 if (!mutex_trylock(&head->mutex)) {
3187 refcount_inc(&head->node.refs); 3158 refcount_inc(&head->refs);
3188 spin_unlock(&delayed_refs->lock); 3159 spin_unlock(&delayed_refs->lock);
3189 3160
3190 btrfs_release_path(path); 3161 btrfs_release_path(path);
@@ -3195,13 +3166,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
3195 */ 3166 */
3196 mutex_lock(&head->mutex); 3167 mutex_lock(&head->mutex);
3197 mutex_unlock(&head->mutex); 3168 mutex_unlock(&head->mutex);
3198 btrfs_put_delayed_ref(&head->node); 3169 btrfs_put_delayed_ref_head(head);
3199 return -EAGAIN; 3170 return -EAGAIN;
3200 } 3171 }
3201 spin_unlock(&delayed_refs->lock); 3172 spin_unlock(&delayed_refs->lock);
3202 3173
3203 spin_lock(&head->lock); 3174 spin_lock(&head->lock);
3204 list_for_each_entry(ref, &head->ref_list, list) { 3175 /*
3176 * XXX: We should replace this with a proper search function in the
3177 * future.
3178 */
3179 for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
3180 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3205 /* If it's a shared ref we know a cross reference exists */ 3181 /* If it's a shared ref we know a cross reference exists */
3206 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3182 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3207 ret = 1; 3183 ret = 1;
@@ -3351,7 +3327,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3351 int level; 3327 int level;
3352 int ret = 0; 3328 int ret = 0;
3353 int (*process_func)(struct btrfs_trans_handle *, 3329 int (*process_func)(struct btrfs_trans_handle *,
3354 struct btrfs_fs_info *, 3330 struct btrfs_root *,
3355 u64, u64, u64, u64, u64, u64); 3331 u64, u64, u64, u64, u64, u64);
3356 3332
3357 3333
@@ -3391,7 +3367,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3391 3367
3392 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3368 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3393 key.offset -= btrfs_file_extent_offset(buf, fi); 3369 key.offset -= btrfs_file_extent_offset(buf, fi);
3394 ret = process_func(trans, fs_info, bytenr, num_bytes, 3370 ret = process_func(trans, root, bytenr, num_bytes,
3395 parent, ref_root, key.objectid, 3371 parent, ref_root, key.objectid,
3396 key.offset); 3372 key.offset);
3397 if (ret) 3373 if (ret)
@@ -3399,7 +3375,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3399 } else { 3375 } else {
3400 bytenr = btrfs_node_blockptr(buf, i); 3376 bytenr = btrfs_node_blockptr(buf, i);
3401 num_bytes = fs_info->nodesize; 3377 num_bytes = fs_info->nodesize;
3402 ret = process_func(trans, fs_info, bytenr, num_bytes, 3378 ret = process_func(trans, root, bytenr, num_bytes,
3403 parent, ref_root, level - 1, 0); 3379 parent, ref_root, level - 1, 0);
3404 if (ret) 3380 if (ret)
3405 goto fail; 3381 goto fail;
@@ -4843,7 +4819,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4843static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4819static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4844 u64 orig, bool wait_ordered) 4820 u64 orig, bool wait_ordered)
4845{ 4821{
4846 struct btrfs_block_rsv *block_rsv;
4847 struct btrfs_space_info *space_info; 4822 struct btrfs_space_info *space_info;
4848 struct btrfs_trans_handle *trans; 4823 struct btrfs_trans_handle *trans;
4849 u64 delalloc_bytes; 4824 u64 delalloc_bytes;
@@ -4859,8 +4834,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4859 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4834 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4860 4835
4861 trans = (struct btrfs_trans_handle *)current->journal_info; 4836 trans = (struct btrfs_trans_handle *)current->journal_info;
4862 block_rsv = &fs_info->delalloc_block_rsv; 4837 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4863 space_info = block_rsv->space_info;
4864 4838
4865 delalloc_bytes = percpu_counter_sum_positive( 4839 delalloc_bytes = percpu_counter_sum_positive(
4866 &fs_info->delalloc_bytes); 4840 &fs_info->delalloc_bytes);
@@ -4919,6 +4893,13 @@ skip_async:
4919 } 4893 }
4920} 4894}
4921 4895
4896struct reserve_ticket {
4897 u64 bytes;
4898 int error;
4899 struct list_head list;
4900 wait_queue_head_t wait;
4901};
4902
4922/** 4903/**
4923 * maybe_commit_transaction - possibly commit the transaction if it's ok to 4904 * maybe_commit_transaction - possibly commit the transaction if it's ok to
4924 * @root - the root we're allocating for 4905 * @root - the root we're allocating for
@@ -4930,18 +4911,29 @@ skip_async:
4930 * will return -ENOSPC. 4911 * will return -ENOSPC.
4931 */ 4912 */
4932static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4913static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4933 struct btrfs_space_info *space_info, 4914 struct btrfs_space_info *space_info)
4934 u64 bytes, int force)
4935{ 4915{
4916 struct reserve_ticket *ticket = NULL;
4936 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4917 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4937 struct btrfs_trans_handle *trans; 4918 struct btrfs_trans_handle *trans;
4919 u64 bytes;
4938 4920
4939 trans = (struct btrfs_trans_handle *)current->journal_info; 4921 trans = (struct btrfs_trans_handle *)current->journal_info;
4940 if (trans) 4922 if (trans)
4941 return -EAGAIN; 4923 return -EAGAIN;
4942 4924
4943 if (force) 4925 spin_lock(&space_info->lock);
4944 goto commit; 4926 if (!list_empty(&space_info->priority_tickets))
4927 ticket = list_first_entry(&space_info->priority_tickets,
4928 struct reserve_ticket, list);
4929 else if (!list_empty(&space_info->tickets))
4930 ticket = list_first_entry(&space_info->tickets,
4931 struct reserve_ticket, list);
4932 bytes = (ticket) ? ticket->bytes : 0;
4933 spin_unlock(&space_info->lock);
4934
4935 if (!bytes)
4936 return 0;
4945 4937
4946 /* See if there is enough pinned space to make this reservation */ 4938 /* See if there is enough pinned space to make this reservation */
4947 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4939 if (percpu_counter_compare(&space_info->total_bytes_pinned,
@@ -4956,8 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4956 return -ENOSPC; 4948 return -ENOSPC;
4957 4949
4958 spin_lock(&delayed_rsv->lock); 4950 spin_lock(&delayed_rsv->lock);
4951 if (delayed_rsv->size > bytes)
4952 bytes = 0;
4953 else
4954 bytes -= delayed_rsv->size;
4959 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4955 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4960 bytes - delayed_rsv->size) < 0) { 4956 bytes) < 0) {
4961 spin_unlock(&delayed_rsv->lock); 4957 spin_unlock(&delayed_rsv->lock);
4962 return -ENOSPC; 4958 return -ENOSPC;
4963 } 4959 }
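may_commit_transaction() no longer trusts a caller-supplied byte count; it sizes the decision by the first waiting reserve ticket and credits the delayed-refs rsv (which a commit will also drain) before comparing against pinned space. A self-contained model of that arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long ticket_bytes = 1ULL << 20;  /* head of queue */
            unsigned long long delayed_rsv_size = 256ULL << 10;
            unsigned long long total_bytes_pinned = 900ULL << 10;
            unsigned long long bytes;

            /* a commit frees the delayed rsv too, so it offsets the ticket */
            bytes = delayed_rsv_size > ticket_bytes ?
                    0 : ticket_bytes - delayed_rsv_size;

            if (total_bytes_pinned >= bytes)
                    printf("commit: pinned space covers the ticket\n");
            else
                    printf("-ENOSPC: a commit would not free enough\n");
            return 0;
    }

With these numbers, 900 KiB of pinned space covers the remaining 768 KiB, so committing is worthwhile; with no tickets queued, bytes is zero and the function returns early instead of forcing a commit.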
@@ -4971,13 +4967,6 @@ commit:
4971 return btrfs_commit_transaction(trans); 4967 return btrfs_commit_transaction(trans);
4972} 4968}
4973 4969
4974struct reserve_ticket {
4975 u64 bytes;
4976 int error;
4977 struct list_head list;
4978 wait_queue_head_t wait;
4979};
4980
4981/* 4970/*
4982 * Try to flush some data based on policy set by @state. This is only advisory 4971 * Try to flush some data based on policy set by @state. This is only advisory
4983 * and may fail for various reasons. The caller is supposed to examine the 4972 * and may fail for various reasons. The caller is supposed to examine the
@@ -5027,8 +5016,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
5027 ret = 0; 5016 ret = 0;
5028 break; 5017 break;
5029 case COMMIT_TRANS: 5018 case COMMIT_TRANS:
5030 ret = may_commit_transaction(fs_info, space_info, 5019 ret = may_commit_transaction(fs_info, space_info);
5031 num_bytes, 0);
5032 break; 5020 break;
5033 default: 5021 default:
5034 ret = -ENOSPC; 5022 ret = -ENOSPC;
@@ -5582,11 +5570,12 @@ again:
5582 } 5570 }
5583} 5571}
5584 5572
5585static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5573static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5586 struct btrfs_block_rsv *block_rsv, 5574 struct btrfs_block_rsv *block_rsv,
5587 struct btrfs_block_rsv *dest, u64 num_bytes) 5575 struct btrfs_block_rsv *dest, u64 num_bytes)
5588{ 5576{
5589 struct btrfs_space_info *space_info = block_rsv->space_info; 5577 struct btrfs_space_info *space_info = block_rsv->space_info;
5578 u64 ret;
5590 5579
5591 spin_lock(&block_rsv->lock); 5580 spin_lock(&block_rsv->lock);
5592 if (num_bytes == (u64)-1) 5581 if (num_bytes == (u64)-1)
@@ -5601,6 +5590,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5601 } 5590 }
5602 spin_unlock(&block_rsv->lock); 5591 spin_unlock(&block_rsv->lock);
5603 5592
5593 ret = num_bytes;
5604 if (num_bytes > 0) { 5594 if (num_bytes > 0) {
5605 if (dest) { 5595 if (dest) {
5606 spin_lock(&dest->lock); 5596 spin_lock(&dest->lock);
@@ -5620,6 +5610,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5620 space_info_add_old_bytes(fs_info, space_info, 5610 space_info_add_old_bytes(fs_info, space_info,
5621 num_bytes); 5611 num_bytes);
5622 } 5612 }
5613 return ret;
5623} 5614}
5624 5615
5625int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5616int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
@@ -5643,6 +5634,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5643 rsv->type = type; 5634 rsv->type = type;
5644} 5635}
5645 5636
5637void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5638 struct btrfs_block_rsv *rsv,
5639 unsigned short type)
5640{
5641 btrfs_init_block_rsv(rsv, type);
5642 rsv->space_info = __find_space_info(fs_info,
5643 BTRFS_BLOCK_GROUP_METADATA);
5644}
5645
5646struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5646struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5647 unsigned short type) 5647 unsigned short type)
5648{ 5648{
@@ -5652,9 +5652,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5652 if (!block_rsv) 5652 if (!block_rsv)
5653 return NULL; 5653 return NULL;
5654 5654
5655 btrfs_init_block_rsv(block_rsv, type); 5655 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5656 block_rsv->space_info = __find_space_info(fs_info,
5657 BTRFS_BLOCK_GROUP_METADATA);
5658 return block_rsv; 5656 return block_rsv;
5659} 5657}
5660 5658
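The new initializer exists so an rsv embedded in a larger structure can get its space_info without going through the allocating helper. A hedged usage sketch for the per-inode rsv this series introduces (the enclosing function is illustrative, and the type constant is written from memory of the series rather than quoted from it):

    /* sketch: wiring up an embedded metadata rsv at inode allocation */
    static void example_setup_inode_rsv(struct btrfs_fs_info *fs_info,
                                        struct btrfs_inode *inode)
    {
            btrfs_init_metadata_block_rsv(fs_info, &inode->block_rsv,
                                          BTRFS_BLOCK_RSV_DELALLOC);
    }

btrfs_alloc_block_rsv() above is now just a kmalloc plus this helper.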
@@ -5737,6 +5735,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
5737 return ret; 5735 return ret;
5738} 5736}
5739 5737
5738/**
5739 * btrfs_inode_rsv_refill - refill the inode block rsv.
5740 * @inode - the inode we are refilling.
5741 * @flush - the flushing restriction.
5742 *
5743 * Essentially the same as btrfs_block_rsv_refill, except it uses the
5744 * block_rsv->size as the minimum size. We'll either refill the missing amount
5745 * or return if we already have enough space. This will also handle the reserve
5746 * tracepoint for the reserved amount.
5747 */
5748int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5749 enum btrfs_reserve_flush_enum flush)
5750{
5751 struct btrfs_root *root = inode->root;
5752 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5753 u64 num_bytes = 0;
5754 int ret = -ENOSPC;
5755
5756 spin_lock(&block_rsv->lock);
5757 if (block_rsv->reserved < block_rsv->size)
5758 num_bytes = block_rsv->size - block_rsv->reserved;
5759 spin_unlock(&block_rsv->lock);
5760
5761 if (num_bytes == 0)
5762 return 0;
5763
5764 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5765 if (!ret) {
5766 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5767 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5768 btrfs_ino(inode), num_bytes, 1);
5769 }
5770 return ret;
5771}
5772
5773/**
5774 * btrfs_inode_rsv_release - release any excessive reservation.
5775 * @inode - the inode we need to release from.
5776 *
5777 * This is the same as btrfs_block_rsv_release, except that it handles the
5778 * tracepoint for the reservation.
5779 */
5780void btrfs_inode_rsv_release(struct btrfs_inode *inode)
5781{
5782 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5783 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5784 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5785 u64 released = 0;
5786
5787 /*
5788 * Since we statically set the block_rsv->size we just want to say we
5789 * are releasing 0 bytes, and then we'll just get the reservation over
5790 * the size freed.
5791 */
5792 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
5793 if (released > 0)
5794 trace_btrfs_space_reservation(fs_info, "delalloc",
5795 btrfs_ino(inode), released, 0);
5796}
5797
5740void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5798void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5741 struct btrfs_block_rsv *block_rsv, 5799 struct btrfs_block_rsv *block_rsv,
5742 u64 num_bytes) 5800 u64 num_bytes)
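Together these helpers make block_rsv->size the single source of truth for an inode's delalloc metadata: refill reserves only the shortfall below size, release returns only the excess above it, and both emit the matching "delalloc" tracepoint. The intended ordering, sketched in kernel style (error paths and locking context elided):

    spin_lock(&inode->lock);
    btrfs_calculate_inode_block_rsv_size(fs_info, inode); /* recompute ->size */
    spin_unlock(&inode->lock);

    ret = btrfs_inode_rsv_refill(inode, BTRFS_RESERVE_FLUSH_ALL);
    if (ret)                /* could not cover size - reserved */
            return ret;

    /* ... extents written, outstanding_extents drop, ->size shrinks ... */

    btrfs_inode_rsv_release(inode); /* excess over ->size goes to global rsv */

Because release passes 0 to block_rsv_release_bytes(), it frees exactly reserved - size rather than a caller-computed amount, which is why the earlier change making that function return the released byte count is needed for the tracepoint.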
@@ -5808,7 +5866,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5808 5866
5809 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5867 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5810 fs_info->global_block_rsv.space_info = space_info; 5868 fs_info->global_block_rsv.space_info = space_info;
5811 fs_info->delalloc_block_rsv.space_info = space_info;
5812 fs_info->trans_block_rsv.space_info = space_info; 5869 fs_info->trans_block_rsv.space_info = space_info;
5813 fs_info->empty_block_rsv.space_info = space_info; 5870 fs_info->empty_block_rsv.space_info = space_info;
5814 fs_info->delayed_block_rsv.space_info = space_info; 5871 fs_info->delayed_block_rsv.space_info = space_info;
@@ -5828,8 +5885,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5828{ 5885{
5829 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5886 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5830 (u64)-1); 5887 (u64)-1);
5831 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5832 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5833 WARN_ON(fs_info->trans_block_rsv.size > 0); 5888 WARN_ON(fs_info->trans_block_rsv.size > 0);
5834 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5889 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5835 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5890 WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5841,12 +5896,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5841void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5896void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5842 struct btrfs_fs_info *fs_info) 5897 struct btrfs_fs_info *fs_info)
5843{ 5898{
5844 if (!trans->block_rsv) 5899 if (!trans->block_rsv) {
5900 ASSERT(!trans->bytes_reserved);
5845 return; 5901 return;
5902 }
5846 5903
5847 if (!trans->bytes_reserved) 5904 if (!trans->bytes_reserved)
5848 return; 5905 return;
5849 5906
5907 ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
5850 trace_btrfs_space_reservation(fs_info, "transaction", 5908 trace_btrfs_space_reservation(fs_info, "transaction",
5851 trans->transid, trans->bytes_reserved, 0); 5909 trans->transid, trans->bytes_reserved, 0);
5852 btrfs_block_rsv_release(fs_info, trans->block_rsv, 5910 btrfs_block_rsv_release(fs_info, trans->block_rsv,
@@ -5968,104 +6026,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5968 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 6026 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5969} 6027}
5970 6028
5971/** 6029static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5972 * drop_outstanding_extent - drop an outstanding extent 6030 struct btrfs_inode *inode)
5973 * @inode: the inode we're dropping the extent for
5974 * @num_bytes: the number of bytes we're releasing.
5975 *
5976 * This is called when we are freeing up an outstanding extent, either called
5977 * after an error or after an extent is written. This will return the number of
5978 * reserved extents that need to be freed. This must be called with
5979 * BTRFS_I(inode)->lock held.
5980 */
5981static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
5982 u64 num_bytes)
5983{
5984 unsigned drop_inode_space = 0;
5985 unsigned dropped_extents = 0;
5986 unsigned num_extents;
5987
5988 num_extents = count_max_extents(num_bytes);
5989 ASSERT(num_extents);
5990 ASSERT(inode->outstanding_extents >= num_extents);
5991 inode->outstanding_extents -= num_extents;
5992
5993 if (inode->outstanding_extents == 0 &&
5994 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5995 &inode->runtime_flags))
5996 drop_inode_space = 1;
5997
5998 /*
5999 * If we have more or the same amount of outstanding extents than we have
6000 * reserved then we need to leave the reserved extents count alone.
6001 */
6002 if (inode->outstanding_extents >= inode->reserved_extents)
6003 return drop_inode_space;
6004
6005 dropped_extents = inode->reserved_extents - inode->outstanding_extents;
6006 inode->reserved_extents -= dropped_extents;
6007 return dropped_extents + drop_inode_space;
6008}
6009
6010/**
6011 * calc_csum_metadata_size - return the amount of metadata space that must be
6012 * reserved/freed for the given bytes.
6013 * @inode: the inode we're manipulating
6014 * @num_bytes: the number of bytes in question
6015 * @reserve: 1 if we are reserving space, 0 if we are freeing space
6016 *
6017 * This adjusts the number of csum_bytes in the inode and then returns the
6018 * correct amount of metadata that must either be reserved or freed. We
6019 * calculate how many checksums we can fit into one leaf and then divide the
6020 * number of bytes that will need to be checksumed by this value to figure out
6021 * how many checksums will be required. If we are adding bytes then the number
6022 * may go up and we will return the number of additional bytes that must be
6023 * reserved. If it is going down we will return the number of bytes that must
6024 * be freed.
6025 *
6026 * This must be called with BTRFS_I(inode)->lock held.
6027 */
6028static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
6029 int reserve)
6030{ 6031{
6031 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6032 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6032 u64 old_csums, num_csums; 6033 u64 reserve_size = 0;
6033 6034 u64 csum_leaves;
6034 if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) 6035 unsigned outstanding_extents;
6035 return 0;
6036
6037 old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
6038 if (reserve)
6039 inode->csum_bytes += num_bytes;
6040 else
6041 inode->csum_bytes -= num_bytes;
6042 num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
6043
6044 /* No change, no need to reserve more */
6045 if (old_csums == num_csums)
6046 return 0;
6047 6036
6048 if (reserve) 6037 lockdep_assert_held(&inode->lock);
6049 return btrfs_calc_trans_metadata_size(fs_info, 6038 outstanding_extents = inode->outstanding_extents;
6050 num_csums - old_csums); 6039 if (outstanding_extents)
6040 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6041 outstanding_extents + 1);
6042 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6043 inode->csum_bytes);
6044 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6045 csum_leaves);
6051 6046
6052 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 6047 spin_lock(&block_rsv->lock);
6048 block_rsv->size = reserve_size;
6049 spin_unlock(&block_rsv->lock);
6053} 6050}
6054 6051
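The rsv size is now a pure function of two inputs: metadata for outstanding_extents + 1 items (the + 1 covers the inode item update) plus metadata for the leaves the pending checksums will need. A self-contained model; the per-item cost formula and the csum capacity below are assumptions standing in for btrfs_calc_trans_metadata_size() and btrfs_csum_bytes_to_leaves():

    #include <stdio.h>

    #define NODESIZE       16384ULL
    #define MAX_LEVEL      8ULL     /* BTRFS_MAX_LEVEL */
    #define SECTORSIZE     4096ULL
    #define CSUMS_PER_LEAF 500ULL   /* illustrative capacity */

    /* model of btrfs_calc_trans_metadata_size(): CoW both tree paths */
    static unsigned long long meta_size(unsigned long long items)
    {
            return NODESIZE * 2 * MAX_LEVEL * items;
    }

    int main(void)
    {
            unsigned long long outstanding_extents = 3;
            unsigned long long csum_bytes = 1ULL << 20; /* 1 MiB dirty data */
            unsigned long long csums = csum_bytes / SECTORSIZE;
            unsigned long long csum_leaves =
                    (csums + CSUMS_PER_LEAF - 1) / CSUMS_PER_LEAF;
            unsigned long long reserve_size = 0;

            if (outstanding_extents)
                    reserve_size = meta_size(outstanding_extents + 1);
            reserve_size += meta_size(csum_leaves);

            printf("rsv size = %llu bytes\n", reserve_size);
            return 0;
    }

The point of the refactor is visible here: there is no incremental reserve/free arithmetic left to race with, only a recompute under inode->lock followed by a refill or a release.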
6055int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 6052int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6056{ 6053{
6057 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6054 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6058 struct btrfs_root *root = inode->root; 6055 struct btrfs_root *root = inode->root;
6059 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
6060 u64 to_reserve = 0;
6061 u64 csum_bytes;
6062 unsigned nr_extents; 6056 unsigned nr_extents;
6063 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 6057 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6064 int ret = 0; 6058 int ret = 0;
6065 bool delalloc_lock = true; 6059 bool delalloc_lock = true;
6066 u64 to_free = 0;
6067 unsigned dropped;
6068 bool release_extra = false;
6069 6060
6070 /* If we are a free space inode we need to not flush since we will be in 6061 /* If we are a free space inode we need to not flush since we will be in
6071 * the middle of a transaction commit. We also don't need the delalloc 6062 * the middle of a transaction commit. We also don't need the delalloc
@@ -6091,19 +6082,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6091 6082
6092 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6083 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6093 6084
6085 /* Add our new extents and calculate the new rsv size. */
6094 spin_lock(&inode->lock); 6086 spin_lock(&inode->lock);
6095 nr_extents = count_max_extents(num_bytes); 6087 nr_extents = count_max_extents(num_bytes);
6096 inode->outstanding_extents += nr_extents; 6088 btrfs_mod_outstanding_extents(inode, nr_extents);
6097 6089 inode->csum_bytes += num_bytes;
6098 nr_extents = 0; 6090 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6099 if (inode->outstanding_extents > inode->reserved_extents)
6100 nr_extents += inode->outstanding_extents -
6101 inode->reserved_extents;
6102
6103 /* We always want to reserve a slot for updating the inode. */
6104 to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
6105 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
6106 csum_bytes = inode->csum_bytes;
6107 spin_unlock(&inode->lock); 6091 spin_unlock(&inode->lock);
6108 6092
6109 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 6093 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
@@ -6113,92 +6097,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6113 goto out_fail; 6097 goto out_fail;
6114 } 6098 }
6115 6099
6116 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 6100 ret = btrfs_inode_rsv_refill(inode, flush);
6117 if (unlikely(ret)) { 6101 if (unlikely(ret)) {
6118 btrfs_qgroup_free_meta(root, 6102 btrfs_qgroup_free_meta(root,
6119 nr_extents * fs_info->nodesize); 6103 nr_extents * fs_info->nodesize);
6120 goto out_fail; 6104 goto out_fail;
6121 } 6105 }
6122 6106
6123 spin_lock(&inode->lock);
6124 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
6125 &inode->runtime_flags)) {
6126 to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
6127 release_extra = true;
6128 }
6129 inode->reserved_extents += nr_extents;
6130 spin_unlock(&inode->lock);
6131
6132 if (delalloc_lock) 6107 if (delalloc_lock)
6133 mutex_unlock(&inode->delalloc_mutex); 6108 mutex_unlock(&inode->delalloc_mutex);
6134
6135 if (to_reserve)
6136 trace_btrfs_space_reservation(fs_info, "delalloc",
6137 btrfs_ino(inode), to_reserve, 1);
6138 if (release_extra)
6139 btrfs_block_rsv_release(fs_info, block_rsv,
6140 btrfs_calc_trans_metadata_size(fs_info, 1));
6141 return 0; 6109 return 0;
6142 6110
6143out_fail: 6111out_fail:
6144 spin_lock(&inode->lock); 6112 spin_lock(&inode->lock);
6145 dropped = drop_outstanding_extent(inode, num_bytes); 6113 nr_extents = count_max_extents(num_bytes);
6146 /* 6114 btrfs_mod_outstanding_extents(inode, -nr_extents);
6147 * If the inodes csum_bytes is the same as the original 6115 inode->csum_bytes -= num_bytes;
6148 * csum_bytes then we know we haven't raced with any free()ers 6116 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6149 * so we can just reduce our inodes csum bytes and carry on.
6150 */
6151 if (inode->csum_bytes == csum_bytes) {
6152 calc_csum_metadata_size(inode, num_bytes, 0);
6153 } else {
6154 u64 orig_csum_bytes = inode->csum_bytes;
6155 u64 bytes;
6156
6157 /*
6158 * This is tricky, but first we need to figure out how much we
6159 * freed from any free-ers that occurred during this
6160 * reservation, so we reset ->csum_bytes to the csum_bytes
6161 * before we dropped our lock, and then call the free for the
6162 * number of bytes that were freed while we were trying our
6163 * reservation.
6164 */
6165 bytes = csum_bytes - inode->csum_bytes;
6166 inode->csum_bytes = csum_bytes;
6167 to_free = calc_csum_metadata_size(inode, bytes, 0);
6168
6169
6170 /*
6171 * Now we need to see how much we would have freed had we not
6172 * been making this reservation and our ->csum_bytes were not
6173 * artificially inflated.
6174 */
6175 inode->csum_bytes = csum_bytes - num_bytes;
6176 bytes = csum_bytes - orig_csum_bytes;
6177 bytes = calc_csum_metadata_size(inode, bytes, 0);
6178
6179 /*
6180 * Now reset ->csum_bytes to what it should be. If bytes is
6181 * more than to_free then we would have freed more space had we
6182 * not had an artificially high ->csum_bytes, so we need to free
6183 * the remainder. If bytes is the same or less then we don't
6184 * need to do anything, the other free-ers did the correct
6185 * thing.
6186 */
6187 inode->csum_bytes = orig_csum_bytes - num_bytes;
6188 if (bytes > to_free)
6189 to_free = bytes - to_free;
6190 else
6191 to_free = 0;
6192 }
6193 spin_unlock(&inode->lock); 6117 spin_unlock(&inode->lock);
6194 if (dropped)
6195 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6196 6118
6197 if (to_free) { 6119 btrfs_inode_rsv_release(inode);
6198 btrfs_block_rsv_release(fs_info, block_rsv, to_free);
6199 trace_btrfs_space_reservation(fs_info, "delalloc",
6200 btrfs_ino(inode), to_free, 0);
6201 }
6202 if (delalloc_lock) 6120 if (delalloc_lock)
6203 mutex_unlock(&inode->delalloc_mutex); 6121 mutex_unlock(&inode->delalloc_mutex);
6204 return ret; 6122 return ret;
@@ -6206,36 +6124,55 @@ out_fail:
6206 6124
6207/** 6125/**
6208 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6126 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6209 * @inode: the inode to release the reservation for 6127 * @inode: the inode to release the reservation for.
6210 * @num_bytes: the number of bytes we're releasing 6128 * @num_bytes: the number of bytes we are releasing.
6211 * 6129 *
6212 * This will release the metadata reservation for an inode. This can be called 6130 * This will release the metadata reservation for an inode. This can be called
6213 * once we complete IO for a given set of bytes to release their metadata 6131 * once we complete IO for a given set of bytes to release their metadata
6214 * reservations. 6132 * reservations, or on error for the same reason.
6215 */ 6133 */
6216void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) 6134void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6217{ 6135{
6218 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6136 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6219 u64 to_free = 0;
6220 unsigned dropped;
6221 6137
6222 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6138 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6223 spin_lock(&inode->lock); 6139 spin_lock(&inode->lock);
6224 dropped = drop_outstanding_extent(inode, num_bytes); 6140 inode->csum_bytes -= num_bytes;
6225 6141 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6226 if (num_bytes)
6227 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6228 spin_unlock(&inode->lock); 6142 spin_unlock(&inode->lock);
6229 if (dropped > 0)
6230 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6231 6143
6232 if (btrfs_is_testing(fs_info)) 6144 if (btrfs_is_testing(fs_info))
6233 return; 6145 return;
6234 6146
6235 trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), 6147 btrfs_inode_rsv_release(inode);
6236 to_free, 0); 6148}
6149
6150/**
6151 * btrfs_delalloc_release_extents - release our outstanding_extents
6152 * @inode: the inode to balance the reservation for.
6153 * @num_bytes: the number of bytes we originally reserved with btrfs_delalloc_reserve_metadata
6154 *
6155 * When we reserve space we increase outstanding_extents for the extents we may
6156 * add. Once we've set the range as delalloc or created our ordered extents we
6157 * have outstanding_extents to track the real usage, so we use this to free our
6158 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
6159 * with btrfs_delalloc_reserve_metadata.
6160 */
6161void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
6162{
6163 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6164 unsigned num_extents;
6165
6166 spin_lock(&inode->lock);
6167 num_extents = count_max_extents(num_bytes);
6168 btrfs_mod_outstanding_extents(inode, -num_extents);
6169 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6170 spin_unlock(&inode->lock);
6171
6172 if (btrfs_is_testing(fs_info))
6173 return;
6237 6174
6238 btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); 6175 btrfs_inode_rsv_release(inode);
6239} 6176}
6240 6177
6241/** 6178/**
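Splitting the temporary extent accounting into btrfs_delalloc_release_extents() gives every reservation a fixed shape: reserve bumps outstanding_extents for what the write might create, the write path converts that into real delalloc/ordered accounting, and release_extents always drops the temporary count afterwards, success or failure. A hedged sketch of the pairing; do_the_write() is a placeholder for the real delalloc setup, not a kernel function:

    ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
    if (ret)
            return ret;

    ret = do_the_write(inode, pos, num_bytes); /* sets delalloc bits etc. */

    btrfs_delalloc_release_extents(inode, num_bytes); /* always, both paths */
    if (ret) /* nothing was written: give the metadata back too */
            btrfs_delalloc_release_metadata(inode, num_bytes);

The _must_ in the comment above is load-bearing: skipping the release leaves outstanding_extents permanently inflated and the inode rsv pinned too large.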
@@ -6282,10 +6219,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
6282 * @inode: inode we're releasing space for 6219 * @inode: inode we're releasing space for
6283 * @start: start position of the space already reserved 6220 * @start: start position of the space already reserved
6284 * @len: the len of the space already reserved 6221 * @len: the len of the space already reserved
6285 * 6222 * @release_bytes: the len of the space we consumed or didn't use
6286 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
6287 * called in the case that we don't need the metadata AND data reservations
6288 * anymore. So if there is an error or we insert an inline extent.
6289 * 6223 *
6290 * This function will release the metadata space that was not used and will 6224 * This function will release the metadata space that was not used and will
6291 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6225 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
@@ -6293,7 +6227,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
6293 * Also it will handle the qgroup reserved space. 6227 * Also it will handle the qgroup reserved space.
6294 */ 6228 */
6295void btrfs_delalloc_release_space(struct inode *inode, 6229void btrfs_delalloc_release_space(struct inode *inode,
6296 struct extent_changeset *reserved, u64 start, u64 len) 6230 struct extent_changeset *reserved,
6231 u64 start, u64 len)
6297{ 6232{
6298 btrfs_delalloc_release_metadata(BTRFS_I(inode), len); 6233 btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
6299 btrfs_free_reserved_data_space(inode, reserved, start, len); 6234 btrfs_free_reserved_data_space(inode, reserved, start, len);
@@ -6958,7 +6893,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6958 BUG_ON(!is_data && refs_to_drop != 1); 6893 BUG_ON(!is_data && refs_to_drop != 1);
6959 6894
6960 if (is_data) 6895 if (is_data)
6961 skinny_metadata = 0; 6896 skinny_metadata = false;
6962 6897
6963 ret = lookup_extent_backref(trans, info, path, &iref, 6898 ret = lookup_extent_backref(trans, info, path, &iref,
6964 bytenr, num_bytes, parent, 6899 bytenr, num_bytes, parent,
@@ -7213,7 +7148,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7213 goto out_delayed_unlock; 7148 goto out_delayed_unlock;
7214 7149
7215 spin_lock(&head->lock); 7150 spin_lock(&head->lock);
7216 if (!list_empty(&head->ref_list)) 7151 if (!RB_EMPTY_ROOT(&head->ref_tree))
7217 goto out; 7152 goto out;
7218 7153
7219 if (head->extent_op) { 7154 if (head->extent_op) {
@@ -7234,9 +7169,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7234 * at this point we have a head with no other entries. Go 7169 * at this point we have a head with no other entries. Go
7235 * ahead and process it. 7170 * ahead and process it.
7236 */ 7171 */
7237 head->node.in_tree = 0;
7238 rb_erase(&head->href_node, &delayed_refs->href_root); 7172 rb_erase(&head->href_node, &delayed_refs->href_root);
7239 7173 RB_CLEAR_NODE(&head->href_node);
7240 atomic_dec(&delayed_refs->num_entries); 7174 atomic_dec(&delayed_refs->num_entries);
7241 7175
7242 /* 7176 /*
@@ -7255,7 +7189,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7255 ret = 1; 7189 ret = 1;
7256 7190
7257 mutex_unlock(&head->mutex); 7191 mutex_unlock(&head->mutex);
7258 btrfs_put_delayed_ref(&head->node); 7192 btrfs_put_delayed_ref_head(head);
7259 return ret; 7193 return ret;
7260out: 7194out:
7261 spin_unlock(&head->lock); 7195 spin_unlock(&head->lock);
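
Delayed refs now live in an rb_tree, so the emptiness check becomes RB_EMPTY_ROOT() and an erase is followed by RB_CLEAR_NODE() so the node no longer looks linked. The idiom from <linux/rbtree.h>, as used in this hunk:

	/* old: list-based */
	if (!list_empty(&head->ref_list))
		goto out;

	/* new: rb_tree-based */
	if (!RB_EMPTY_ROOT(&head->ref_tree))
		goto out;

	rb_erase(&head->href_node, &delayed_refs->href_root);
	RB_CLEAR_NODE(&head->href_node);	/* mark as unlinked */
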
@@ -7277,6 +7211,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7277 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7211 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7278 int old_ref_mod, new_ref_mod; 7212 int old_ref_mod, new_ref_mod;
7279 7213
7214 btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7215 root->root_key.objectid,
7216 btrfs_header_level(buf), 0,
7217 BTRFS_DROP_DELAYED_REF);
7280 ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, 7218 ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
7281 buf->len, parent, 7219 buf->len, parent,
7282 root->root_key.objectid, 7220 root->root_key.objectid,
@@ -7329,16 +7267,21 @@ out:
7329 7267
7330/* Can return -ENOMEM */ 7268/* Can return -ENOMEM */
7331int btrfs_free_extent(struct btrfs_trans_handle *trans, 7269int btrfs_free_extent(struct btrfs_trans_handle *trans,
7332 struct btrfs_fs_info *fs_info, 7270 struct btrfs_root *root,
7333 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7271 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7334 u64 owner, u64 offset) 7272 u64 owner, u64 offset)
7335{ 7273{
7274 struct btrfs_fs_info *fs_info = root->fs_info;
7336 int old_ref_mod, new_ref_mod; 7275 int old_ref_mod, new_ref_mod;
7337 int ret; 7276 int ret;
7338 7277
7339 if (btrfs_is_testing(fs_info)) 7278 if (btrfs_is_testing(fs_info))
7340 return 0; 7279 return 0;
7341 7280
7281 if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7282 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7283 root_objectid, owner, offset,
7284 BTRFS_DROP_DELAYED_REF);
7342 7285
7343 /* 7286 /*
7344 * tree log blocks never actually go into the extent allocation 7287 * tree log blocks never actually go into the extent allocation
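
The shape repeated across these hunks is the ref-verify hook from this series: mirror every extent reference modification into the optional debugging tree immediately before queuing the matching delayed ref. Sketched with the argument order shown above (the call is a no-op unless the ref-verify debugging option is enabled):

	/* record the modification for later verification */
	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
			   root_objectid, owner, offset,
			   BTRFS_DROP_DELAYED_REF);

	/* ...then queue the real delayed ref exactly as before */
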
@@ -8306,17 +8249,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8306} 8249}
8307 8250
8308int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8251int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8309 u64 root_objectid, u64 owner, 8252 struct btrfs_root *root, u64 owner,
8310 u64 offset, u64 ram_bytes, 8253 u64 offset, u64 ram_bytes,
8311 struct btrfs_key *ins) 8254 struct btrfs_key *ins)
8312{ 8255{
8313 struct btrfs_fs_info *fs_info = trans->fs_info; 8256 struct btrfs_fs_info *fs_info = root->fs_info;
8314 int ret; 8257 int ret;
8315 8258
8316 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8259 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8260
8261 btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8262 root->root_key.objectid, owner, offset,
8263 BTRFS_ADD_DELAYED_EXTENT);
8317 8264
8318 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, 8265 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
8319 ins->offset, 0, root_objectid, owner, 8266 ins->offset, 0,
8267 root->root_key.objectid, owner,
8320 offset, ram_bytes, 8268 offset, ram_bytes,
8321 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); 8269 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8322 return ret; 8270 return ret;
@@ -8538,6 +8486,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8538 extent_op->is_data = false; 8486 extent_op->is_data = false;
8539 extent_op->level = level; 8487 extent_op->level = level;
8540 8488
8489 btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8490 root_objectid, level, 0,
8491 BTRFS_ADD_DELAYED_EXTENT);
8541 ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, 8492 ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
8542 ins.offset, parent, 8493 ins.offset, parent,
8543 root_objectid, level, 8494 root_objectid, level,
@@ -8894,7 +8845,7 @@ skip:
8894 ret); 8845 ret);
8895 } 8846 }
8896 } 8847 }
8897 ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, 8848 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8898 parent, root->root_key.objectid, 8849 parent, root->root_key.objectid,
8899 level - 1, 0); 8850 level - 1, 0);
8900 if (ret) 8851 if (ret)
@@ -9311,7 +9262,7 @@ out:
9311 * don't have it in the radix (like when we recover after a power fail 9262 * don't have it in the radix (like when we recover after a power fail
9312 * or unmount) so we don't leak memory. 9263 * or unmount) so we don't leak memory.
9313 */ 9264 */
9314 if (!for_reloc && root_dropped == false) 9265 if (!for_reloc && !root_dropped)
9315 btrfs_add_dead_root(root); 9266 btrfs_add_dead_root(root);
9316 if (err && err != -EAGAIN) 9267 if (err && err != -EAGAIN)
9317 btrfs_handle_fs_error(fs_info, err, NULL); 9268 btrfs_handle_fs_error(fs_info, err, NULL);
@@ -9968,9 +9919,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
9968 return 0; 9919 return 0;
9969} 9920}
9970 9921
9971static void __link_block_group(struct btrfs_space_info *space_info, 9922static void link_block_group(struct btrfs_block_group_cache *cache)
9972 struct btrfs_block_group_cache *cache)
9973{ 9923{
9924 struct btrfs_space_info *space_info = cache->space_info;
9974 int index = get_block_group_index(cache); 9925 int index = get_block_group_index(cache);
9975 bool first = false; 9926 bool first = false;
9976 9927
@@ -10178,7 +10129,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
10178 10129
10179 cache->space_info = space_info; 10130 cache->space_info = space_info;
10180 10131
10181 __link_block_group(space_info, cache); 10132 link_block_group(cache);
10182 10133
10183 set_avail_alloc_bits(info, cache->flags); 10134 set_avail_alloc_bits(info, cache->flags);
10184 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10135 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
@@ -10337,7 +10288,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10337 cache->bytes_super, &cache->space_info); 10288 cache->bytes_super, &cache->space_info);
10338 update_global_block_rsv(fs_info); 10289 update_global_block_rsv(fs_info);
10339 10290
10340 __link_block_group(cache->space_info, cache); 10291 link_block_group(cache);
10341 10292
10342 list_add_tail(&cache->bg_list, &trans->new_bgs); 10293 list_add_tail(&cache->bg_list, &trans->new_bgs);
10343 10294
@@ -10387,6 +10338,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10387 * remove it. 10338 * remove it.
10388 */ 10339 */
10389 free_excluded_extents(fs_info, block_group); 10340 free_excluded_extents(fs_info, block_group);
10341 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10342 block_group->key.offset);
10390 10343
10391 memcpy(&key, &block_group->key, sizeof(key)); 10344 memcpy(&key, &block_group->key, sizeof(key));
10392 index = get_block_group_index(block_group); 10345 index = get_block_group_index(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7fa50e12f18e..adbbc017191c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -110,7 +110,6 @@ struct extent_page_data {
110 struct bio *bio; 110 struct bio *bio;
111 struct extent_io_tree *tree; 111 struct extent_io_tree *tree;
112 get_extent_t *get_extent; 112 get_extent_t *get_extent;
113 unsigned long bio_flags;
114 113
115 /* tells writepage not to lock the state bits for this range 114 /* tells writepage not to lock the state bits for this range
116 * it still does the unlocking 115 * it still does the unlocking
@@ -2762,8 +2761,8 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
2762 */ 2761 */
2763static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, 2762static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2764 struct writeback_control *wbc, 2763 struct writeback_control *wbc,
2765 struct page *page, sector_t sector, 2764 struct page *page, u64 offset,
2766 size_t size, unsigned long offset, 2765 size_t size, unsigned long pg_offset,
2767 struct block_device *bdev, 2766 struct block_device *bdev,
2768 struct bio **bio_ret, 2767 struct bio **bio_ret,
2769 bio_end_io_t end_io_func, 2768 bio_end_io_t end_io_func,
@@ -2777,6 +2776,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2777 int contig = 0; 2776 int contig = 0;
2778 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2777 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2779 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2778 size_t page_size = min_t(size_t, size, PAGE_SIZE);
2779 sector_t sector = offset >> 9;
2780 2780
2781 if (bio_ret && *bio_ret) { 2781 if (bio_ret && *bio_ret) {
2782 bio = *bio_ret; 2782 bio = *bio_ret;
@@ -2787,8 +2787,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2787 2787
2788 if (prev_bio_flags != bio_flags || !contig || 2788 if (prev_bio_flags != bio_flags || !contig ||
2789 force_bio_submit || 2789 force_bio_submit ||
2790 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2790 merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) ||
2791 bio_add_page(bio, page, page_size, offset) < page_size) { 2791 bio_add_page(bio, page, page_size, pg_offset) < page_size) {
2792 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2792 ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
2793 if (ret < 0) { 2793 if (ret < 0) {
2794 *bio_ret = NULL; 2794 *bio_ret = NULL;
@@ -2802,8 +2802,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2802 } 2802 }
2803 } 2803 }
2804 2804
2805 bio = btrfs_bio_alloc(bdev, (u64)sector << 9); 2805 bio = btrfs_bio_alloc(bdev, offset);
2806 bio_add_page(bio, page, page_size, offset); 2806 bio_add_page(bio, page, page_size, pg_offset);
2807 bio->bi_end_io = end_io_func; 2807 bio->bi_end_io = end_io_func;
2808 bio->bi_private = tree; 2808 bio->bi_private = tree;
2809 bio->bi_write_hint = page->mapping->host->i_write_hint; 2809 bio->bi_write_hint = page->mapping->host->i_write_hint;
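
submit_extent_page() now takes the byte offset on disk and derives the sector number itself (offset >> 9), so callers stop open-coding the shift at every call site. The conversion is plain division by the 512-byte sector size; a tiny standalone check of the arithmetic:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t offset = (1ULL << 20) + 4096;	/* byte offset, 512-aligned */
		uint64_t sector = offset >> 9;		/* 512-byte sectors */

		assert(sector == offset / 512);
		assert((sector << 9) == offset);	/* round-trips when aligned */
		return 0;
	}
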
@@ -2893,7 +2893,6 @@ static int __do_readpage(struct extent_io_tree *tree,
2893 u64 last_byte = i_size_read(inode); 2893 u64 last_byte = i_size_read(inode);
2894 u64 block_start; 2894 u64 block_start;
2895 u64 cur_end; 2895 u64 cur_end;
2896 sector_t sector;
2897 struct extent_map *em; 2896 struct extent_map *em;
2898 struct block_device *bdev; 2897 struct block_device *bdev;
2899 int ret = 0; 2898 int ret = 0;
@@ -2929,6 +2928,7 @@ static int __do_readpage(struct extent_io_tree *tree,
2929 } 2928 }
2930 while (cur <= end) { 2929 while (cur <= end) {
2931 bool force_bio_submit = false; 2930 bool force_bio_submit = false;
2931 u64 offset;
2932 2932
2933 if (cur >= last_byte) { 2933 if (cur >= last_byte) {
2934 char *userpage; 2934 char *userpage;
@@ -2968,9 +2968,9 @@ static int __do_readpage(struct extent_io_tree *tree,
2968 iosize = ALIGN(iosize, blocksize); 2968 iosize = ALIGN(iosize, blocksize);
2969 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2969 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2970 disk_io_size = em->block_len; 2970 disk_io_size = em->block_len;
2971 sector = em->block_start >> 9; 2971 offset = em->block_start;
2972 } else { 2972 } else {
2973 sector = (em->block_start + extent_offset) >> 9; 2973 offset = em->block_start + extent_offset;
2974 disk_io_size = iosize; 2974 disk_io_size = iosize;
2975 } 2975 }
2976 bdev = em->bdev; 2976 bdev = em->bdev;
@@ -3063,8 +3063,8 @@ static int __do_readpage(struct extent_io_tree *tree,
3063 } 3063 }
3064 3064
3065 ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, 3065 ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
3066 page, sector, disk_io_size, pg_offset, 3066 page, offset, disk_io_size,
3067 bdev, bio, 3067 pg_offset, bdev, bio,
3068 end_bio_extent_readpage, mirror_num, 3068 end_bio_extent_readpage, mirror_num,
3069 *bio_flags, 3069 *bio_flags,
3070 this_bio_flag, 3070 this_bio_flag,
@@ -3325,7 +3325,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3325 u64 extent_offset; 3325 u64 extent_offset;
3326 u64 block_start; 3326 u64 block_start;
3327 u64 iosize; 3327 u64 iosize;
3328 sector_t sector;
3329 struct extent_map *em; 3328 struct extent_map *em;
3330 struct block_device *bdev; 3329 struct block_device *bdev;
3331 size_t pg_offset = 0; 3330 size_t pg_offset = 0;
@@ -3368,6 +3367,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3368 3367
3369 while (cur <= end) { 3368 while (cur <= end) {
3370 u64 em_end; 3369 u64 em_end;
3370 u64 offset;
3371 3371
3372 if (cur >= i_size) { 3372 if (cur >= i_size) {
3373 if (tree->ops && tree->ops->writepage_end_io_hook) 3373 if (tree->ops && tree->ops->writepage_end_io_hook)
@@ -3389,7 +3389,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3389 BUG_ON(end < cur); 3389 BUG_ON(end < cur);
3390 iosize = min(em_end - cur, end - cur + 1); 3390 iosize = min(em_end - cur, end - cur + 1);
3391 iosize = ALIGN(iosize, blocksize); 3391 iosize = ALIGN(iosize, blocksize);
3392 sector = (em->block_start + extent_offset) >> 9; 3392 offset = em->block_start + extent_offset;
3393 bdev = em->bdev; 3393 bdev = em->bdev;
3394 block_start = em->block_start; 3394 block_start = em->block_start;
3395 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3395 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -3432,7 +3432,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3432 } 3432 }
3433 3433
3434 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3434 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3435 page, sector, iosize, pg_offset, 3435 page, offset, iosize, pg_offset,
3436 bdev, &epd->bio, 3436 bdev, &epd->bio,
3437 end_bio_extent_writepage, 3437 end_bio_extent_writepage,
3438 0, 0, 0, false); 3438 0, 0, 0, false);
@@ -3716,7 +3716,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3716 u64 offset = eb->start; 3716 u64 offset = eb->start;
3717 u32 nritems; 3717 u32 nritems;
3718 unsigned long i, num_pages; 3718 unsigned long i, num_pages;
3719 unsigned long bio_flags = 0;
3720 unsigned long start, end; 3719 unsigned long start, end;
3721 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; 3720 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
3722 int ret = 0; 3721 int ret = 0;
@@ -3724,8 +3723,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3724 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3723 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3725 num_pages = num_extent_pages(eb->start, eb->len); 3724 num_pages = num_extent_pages(eb->start, eb->len);
3726 atomic_set(&eb->io_pages, num_pages); 3725 atomic_set(&eb->io_pages, num_pages);
3727 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3728 bio_flags = EXTENT_BIO_TREE_LOG;
3729 3726
3730 /* set btree blocks beyond nritems with 0 to avoid stale content. */ 3727 /* set btree blocks beyond nritems with 0 to avoid stale content. */
3731 nritems = btrfs_header_nritems(eb); 3728 nritems = btrfs_header_nritems(eb);
@@ -3749,11 +3746,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3749 clear_page_dirty_for_io(p); 3746 clear_page_dirty_for_io(p);
3750 set_page_writeback(p); 3747 set_page_writeback(p);
3751 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3748 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3752 p, offset >> 9, PAGE_SIZE, 0, bdev, 3749 p, offset, PAGE_SIZE, 0, bdev,
3753 &epd->bio, 3750 &epd->bio,
3754 end_bio_extent_buffer_writepage, 3751 end_bio_extent_buffer_writepage,
3755 0, epd->bio_flags, bio_flags, false); 3752 0, 0, 0, false);
3756 epd->bio_flags = bio_flags;
3757 if (ret) { 3753 if (ret) {
3758 set_btree_ioerr(p); 3754 set_btree_ioerr(p);
3759 if (PageWriteback(p)) 3755 if (PageWriteback(p))
@@ -3790,7 +3786,6 @@ int btree_write_cache_pages(struct address_space *mapping,
3790 .tree = tree, 3786 .tree = tree,
3791 .extent_locked = 0, 3787 .extent_locked = 0,
3792 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3788 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3793 .bio_flags = 0,
3794 }; 3789 };
3795 int ret = 0; 3790 int ret = 0;
3796 int done = 0; 3791 int done = 0;
@@ -4063,7 +4058,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
4063 if (epd->bio) { 4058 if (epd->bio) {
4064 int ret; 4059 int ret;
4065 4060
4066 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4061 ret = submit_one_bio(epd->bio, 0, 0);
4067 BUG_ON(ret < 0); /* -ENOMEM */ 4062 BUG_ON(ret < 0); /* -ENOMEM */
4068 epd->bio = NULL; 4063 epd->bio = NULL;
4069 } 4064 }
@@ -4086,7 +4081,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
4086 .get_extent = get_extent, 4081 .get_extent = get_extent,
4087 .extent_locked = 0, 4082 .extent_locked = 0,
4088 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4083 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4089 .bio_flags = 0,
4090 }; 4084 };
4091 4085
4092 ret = __extent_writepage(page, wbc, &epd); 4086 ret = __extent_writepage(page, wbc, &epd);
@@ -4111,7 +4105,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
4111 .get_extent = get_extent, 4105 .get_extent = get_extent,
4112 .extent_locked = 1, 4106 .extent_locked = 1,
4113 .sync_io = mode == WB_SYNC_ALL, 4107 .sync_io = mode == WB_SYNC_ALL,
4114 .bio_flags = 0,
4115 }; 4108 };
4116 struct writeback_control wbc_writepages = { 4109 struct writeback_control wbc_writepages = {
4117 .sync_mode = mode, 4110 .sync_mode = mode,
@@ -4151,7 +4144,6 @@ int extent_writepages(struct extent_io_tree *tree,
4151 .get_extent = get_extent, 4144 .get_extent = get_extent,
4152 .extent_locked = 0, 4145 .extent_locked = 0,
4153 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4146 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4154 .bio_flags = 0,
4155 }; 4147 };
4156 4148
4157 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, 4149 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e5535bbe6953..4a8861379d3e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -34,7 +34,6 @@
34 * type for this bio 34 * type for this bio
35 */ 35 */
36#define EXTENT_BIO_COMPRESSED 1 36#define EXTENT_BIO_COMPRESSED 1
37#define EXTENT_BIO_TREE_LOG 2
38#define EXTENT_BIO_FLAG_SHIFT 16 37#define EXTENT_BIO_FLAG_SHIFT 16
39 38
40/* these are bit numbers for test/set bit */ 39/* these are bit numbers for test/set bit */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..f80254d82f40 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -856,7 +856,7 @@ next_slot:
856 btrfs_mark_buffer_dirty(leaf); 856 btrfs_mark_buffer_dirty(leaf);
857 857
858 if (update_refs && disk_bytenr > 0) { 858 if (update_refs && disk_bytenr > 0) {
859 ret = btrfs_inc_extent_ref(trans, fs_info, 859 ret = btrfs_inc_extent_ref(trans, root,
860 disk_bytenr, num_bytes, 0, 860 disk_bytenr, num_bytes, 0,
861 root->root_key.objectid, 861 root->root_key.objectid,
862 new_key.objectid, 862 new_key.objectid,
@@ -940,7 +940,7 @@ delete_extent_item:
940 extent_end = ALIGN(extent_end, 940 extent_end = ALIGN(extent_end,
941 fs_info->sectorsize); 941 fs_info->sectorsize);
942 } else if (update_refs && disk_bytenr > 0) { 942 } else if (update_refs && disk_bytenr > 0) {
943 ret = btrfs_free_extent(trans, fs_info, 943 ret = btrfs_free_extent(trans, root,
944 disk_bytenr, num_bytes, 0, 944 disk_bytenr, num_bytes, 0,
945 root->root_key.objectid, 945 root->root_key.objectid,
946 key.objectid, key.offset - 946 key.objectid, key.offset -
@@ -1234,7 +1234,7 @@ again:
1234 extent_end - split); 1234 extent_end - split);
1235 btrfs_mark_buffer_dirty(leaf); 1235 btrfs_mark_buffer_dirty(leaf);
1236 1236
1237 ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, 1237 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
1238 0, root->root_key.objectid, 1238 0, root->root_key.objectid,
1239 ino, orig_offset); 1239 ino, orig_offset);
1240 if (ret) { 1240 if (ret) {
@@ -1268,7 +1268,7 @@ again:
1268 extent_end = other_end; 1268 extent_end = other_end;
1269 del_slot = path->slots[0] + 1; 1269 del_slot = path->slots[0] + 1;
1270 del_nr++; 1270 del_nr++;
1271 ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 1271 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1272 0, root->root_key.objectid, 1272 0, root->root_key.objectid,
1273 ino, orig_offset); 1273 ino, orig_offset);
1274 if (ret) { 1274 if (ret) {
@@ -1288,7 +1288,7 @@ again:
1288 key.offset = other_start; 1288 key.offset = other_start;
1289 del_slot = path->slots[0]; 1289 del_slot = path->slots[0];
1290 del_nr++; 1290 del_nr++;
1291 ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 1291 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1292 0, root->root_key.objectid, 1292 0, root->root_key.objectid,
1293 ino, orig_offset); 1293 ino, orig_offset);
1294 if (ret) { 1294 if (ret) {
@@ -1590,7 +1590,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1590 int ret = 0; 1590 int ret = 0;
1591 bool only_release_metadata = false; 1591 bool only_release_metadata = false;
1592 bool force_page_uptodate = false; 1592 bool force_page_uptodate = false;
1593 bool need_unlock;
1594 1593
1595 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), 1594 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1596 PAGE_SIZE / (sizeof(struct page *))); 1595 PAGE_SIZE / (sizeof(struct page *)));
@@ -1613,6 +1612,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1613 size_t copied; 1612 size_t copied;
1614 size_t dirty_sectors; 1613 size_t dirty_sectors;
1615 size_t num_sectors; 1614 size_t num_sectors;
1615 int extents_locked;
1616 1616
1617 WARN_ON(num_pages > nrptrs); 1617 WARN_ON(num_pages > nrptrs);
1618 1618
@@ -1656,6 +1656,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1656 } 1656 }
1657 } 1657 }
1658 1658
1659 WARN_ON(reserve_bytes == 0);
1659 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 1660 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1660 reserve_bytes); 1661 reserve_bytes);
1661 if (ret) { 1662 if (ret) {
@@ -1669,7 +1670,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1669 } 1670 }
1670 1671
1671 release_bytes = reserve_bytes; 1672 release_bytes = reserve_bytes;
1672 need_unlock = false;
1673again: 1673again:
1674 /* 1674 /*
1675 * This is going to setup the pages array with the number of 1675 * This is going to setup the pages array with the number of
@@ -1679,19 +1679,23 @@ again:
1679 ret = prepare_pages(inode, pages, num_pages, 1679 ret = prepare_pages(inode, pages, num_pages,
1680 pos, write_bytes, 1680 pos, write_bytes,
1681 force_page_uptodate); 1681 force_page_uptodate);
1682 if (ret) 1682 if (ret) {
1683 btrfs_delalloc_release_extents(BTRFS_I(inode),
1684 reserve_bytes);
1683 break; 1685 break;
1686 }
1684 1687
1685 ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, 1688 extents_locked = lock_and_cleanup_extent_if_need(
1689 BTRFS_I(inode), pages,
1686 num_pages, pos, write_bytes, &lockstart, 1690 num_pages, pos, write_bytes, &lockstart,
1687 &lockend, &cached_state); 1691 &lockend, &cached_state);
1688 if (ret < 0) { 1692 if (extents_locked < 0) {
1689 if (ret == -EAGAIN) 1693 if (extents_locked == -EAGAIN)
1690 goto again; 1694 goto again;
1695 btrfs_delalloc_release_extents(BTRFS_I(inode),
1696 reserve_bytes);
1697 ret = extents_locked;
1691 break; 1698 break;
1692 } else if (ret > 0) {
1693 need_unlock = true;
1694 ret = 0;
1695 } 1699 }
1696 1700
1697 copied = btrfs_copy_from_user(pos, write_bytes, pages, i); 1701 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
@@ -1718,23 +1722,10 @@ again:
1718 PAGE_SIZE); 1722 PAGE_SIZE);
1719 } 1723 }
1720 1724
1721 /*
1722 * If we had a short copy we need to release the excess delalloc
1723 * bytes we reserved. We need to increment outstanding_extents
1724 * because btrfs_delalloc_release_space and
1725 * btrfs_delalloc_release_metadata will decrement it, but
1726 * we still have an outstanding extent for the chunk we actually
1727 * managed to copy.
1728 */
1729 if (num_sectors > dirty_sectors) { 1725 if (num_sectors > dirty_sectors) {
1730 /* release everything except the sectors we dirtied */ 1726 /* release everything except the sectors we dirtied */
1731 release_bytes -= dirty_sectors << 1727 release_bytes -= dirty_sectors <<
1732 fs_info->sb->s_blocksize_bits; 1728 fs_info->sb->s_blocksize_bits;
1733 if (copied > 0) {
1734 spin_lock(&BTRFS_I(inode)->lock);
1735 BTRFS_I(inode)->outstanding_extents++;
1736 spin_unlock(&BTRFS_I(inode)->lock);
1737 }
1738 if (only_release_metadata) { 1729 if (only_release_metadata) {
1739 btrfs_delalloc_release_metadata(BTRFS_I(inode), 1730 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1740 release_bytes); 1731 release_bytes);
@@ -1756,10 +1747,11 @@ again:
1756 if (copied > 0) 1747 if (copied > 0)
1757 ret = btrfs_dirty_pages(inode, pages, dirty_pages, 1748 ret = btrfs_dirty_pages(inode, pages, dirty_pages,
1758 pos, copied, NULL); 1749 pos, copied, NULL);
1759 if (need_unlock) 1750 if (extents_locked)
1760 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1751 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1761 lockstart, lockend, &cached_state, 1752 lockstart, lockend, &cached_state,
1762 GFP_NOFS); 1753 GFP_NOFS);
1754 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1763 if (ret) { 1755 if (ret) {
1764 btrfs_drop_pages(pages, num_pages); 1756 btrfs_drop_pages(pages, num_pages);
1765 break; 1757 break;
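
With need_unlock gone, the loop body follows a single rule: once btrfs_delalloc_reserve_metadata() has succeeded, btrfs_delalloc_release_extents() runs exactly once for reserve_bytes on every exit from the iteration, the early-error paths included. A condensed view of the hunks above (not verbatim):

	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes);
	if (ret)
		break;			/* nothing reserved, nothing to drop */

	ret = prepare_pages(inode, pages, num_pages, pos, write_bytes,
			    force_page_uptodate);
	if (ret) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
		break;			/* released on the error path */
	}

	/* ... copy from userspace, dirty the pages ... */

	if (extents_locked)
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, &cached_state, GFP_NOFS);
	btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
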
@@ -2046,7 +2038,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2046 struct btrfs_trans_handle *trans; 2038 struct btrfs_trans_handle *trans;
2047 struct btrfs_log_ctx ctx; 2039 struct btrfs_log_ctx ctx;
2048 int ret = 0, err; 2040 int ret = 0, err;
2049 bool full_sync = 0; 2041 bool full_sync = false;
2050 u64 len; 2042 u64 len;
2051 2043
2052 /* 2044 /*
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 684f12247db7..fe5e0324dca9 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1286,12 +1286,8 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
1286 struct btrfs_block_group_cache *block_group, 1286 struct btrfs_block_group_cache *block_group,
1287 struct btrfs_path *path) 1287 struct btrfs_path *path)
1288{ 1288{
1289 u64 start, end;
1290 int ret; 1289 int ret;
1291 1290
1292 start = block_group->key.objectid;
1293 end = block_group->key.objectid + block_group->key.offset;
1294
1295 block_group->needs_free_space = 0; 1291 block_group->needs_free_space = 0;
1296 1292
1297 ret = add_new_free_space_info(trans, fs_info, block_group, path); 1293 ret = add_new_free_space_info(trans, fs_info, block_group, path);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d02019747d00..022b19336fee 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -500,11 +500,12 @@ again:
500 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 500 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
501 prealloc, prealloc, &alloc_hint); 501 prealloc, prealloc, &alloc_hint);
502 if (ret) { 502 if (ret) {
503 btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); 503 btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
504 goto out_put; 504 goto out_put;
505 } 505 }
506 506
507 ret = btrfs_write_out_ino_cache(root, trans, path, inode); 507 ret = btrfs_write_out_ino_cache(root, trans, path, inode);
508 btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
508out_put: 509out_put:
509 iput(inode); 510 iput(inode);
510out_release: 511out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d94e3f68b9b1..b93fe05a39c7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
42#include <linux/blkdev.h> 42#include <linux/blkdev.h>
43#include <linux/posix_acl_xattr.h> 43#include <linux/posix_acl_xattr.h>
44#include <linux/uio.h> 44#include <linux/uio.h>
45#include <linux/magic.h>
45#include "ctree.h" 46#include "ctree.h"
46#include "disk-io.h" 47#include "disk-io.h"
47#include "transaction.h" 48#include "transaction.h"
@@ -67,7 +68,6 @@ struct btrfs_iget_args {
67}; 68};
68 69
69struct btrfs_dio_data { 70struct btrfs_dio_data {
70 u64 outstanding_extents;
71 u64 reserve; 71 u64 reserve;
72 u64 unsubmitted_oe_range_start; 72 u64 unsubmitted_oe_range_start;
73 u64 unsubmitted_oe_range_end; 73 u64 unsubmitted_oe_range_end;
@@ -316,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
316 btrfs_free_path(path); 316 btrfs_free_path(path);
317 return PTR_ERR(trans); 317 return PTR_ERR(trans);
318 } 318 }
319 trans->block_rsv = &fs_info->delalloc_block_rsv; 319 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
320 320
321 if (compressed_size && compressed_pages) 321 if (compressed_size && compressed_pages)
322 extent_item_size = btrfs_file_extent_calc_inline_size( 322 extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -348,7 +348,6 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
348 } 348 }
349 349
350 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 350 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
351 btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start);
352 btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); 351 btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
353out: 352out:
354 /* 353 /*
@@ -458,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode,
458{ 457{
459 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 458 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
460 struct btrfs_root *root = BTRFS_I(inode)->root; 459 struct btrfs_root *root = BTRFS_I(inode)->root;
461 u64 num_bytes;
462 u64 blocksize = fs_info->sectorsize; 460 u64 blocksize = fs_info->sectorsize;
463 u64 actual_end; 461 u64 actual_end;
464 u64 isize = i_size_read(inode); 462 u64 isize = i_size_read(inode);
@@ -508,8 +506,6 @@ again:
508 506
509 total_compressed = min_t(unsigned long, total_compressed, 507 total_compressed = min_t(unsigned long, total_compressed,
510 BTRFS_MAX_UNCOMPRESSED); 508 BTRFS_MAX_UNCOMPRESSED);
511 num_bytes = ALIGN(end - start + 1, blocksize);
512 num_bytes = max(blocksize, num_bytes);
513 total_in = 0; 509 total_in = 0;
514 ret = 0; 510 ret = 0;
515 511
@@ -542,7 +538,10 @@ again:
542 */ 538 */
543 extent_range_clear_dirty_for_io(inode, start, end); 539 extent_range_clear_dirty_for_io(inode, start, end);
544 redirty = 1; 540 redirty = 1;
545 ret = btrfs_compress_pages(compress_type, 541
542 /* Compression level is applied here and only here */
543 ret = btrfs_compress_pages(
544 compress_type | (fs_info->compress_level << 4),
546 inode->i_mapping, start, 545 inode->i_mapping, start,
547 pages, 546 pages,
548 &nr_pages, 547 &nr_pages,
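
The new zlib level support travels in-band: the low 4 bits of the argument keep the BTRFS_COMPRESS_* type and the mount-configured level is shifted in above them, to be split apart again inside btrfs_compress_pages(). A standalone check of the packing used here (the constant value is taken from the btrfs headers):

	#include <assert.h>

	#define BTRFS_COMPRESS_ZLIB 1	/* as in the btrfs headers */

	int main(void)
	{
		unsigned int type = BTRFS_COMPRESS_ZLIB;
		unsigned int level = 9;			/* -o compress=zlib:9 */
		unsigned int packed = type | (level << 4);

		assert((packed & 0xF) == BTRFS_COMPRESS_ZLIB);	/* low nibble: type */
		assert((packed >> 4) == 9);			/* upper bits: level */
		return 0;
	}
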
@@ -570,7 +569,7 @@ again:
570cont: 569cont:
571 if (start == 0) { 570 if (start == 0) {
572 /* let's try to make an inline extent */ 571 /* let's try to make an inline extent */
573 if (ret || total_in < (actual_end - start)) { 572 if (ret || total_in < actual_end) {
574 /* we didn't compress the entire range, try 573 /* we didn't compress the entire range, try
575 * to make an uncompressed inline extent. 574 * to make an uncompressed inline extent.
576 */ 575 */
@@ -584,16 +583,21 @@ cont:
584 } 583 }
585 if (ret <= 0) { 584 if (ret <= 0) {
586 unsigned long clear_flags = EXTENT_DELALLOC | 585 unsigned long clear_flags = EXTENT_DELALLOC |
587 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; 586 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
587 EXTENT_DO_ACCOUNTING;
588 unsigned long page_error_op; 588 unsigned long page_error_op;
589 589
590 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
591 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; 590 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
592 591
593 /* 592 /*
594 * inline extent creation worked or returned error, 593 * inline extent creation worked or returned error,
595 * we don't need to create any more async work items. 594 * we don't need to create any more async work items.
596 * Unlock and free up our temp pages. 595 * Unlock and free up our temp pages.
596 *
597 * We use DO_ACCOUNTING here because we need the
598 * delalloc_release_metadata to be done _after_ we drop
599 * our outstanding extent for clearing delalloc for this
600 * range.
597 */ 601 */
598 extent_clear_unlock_delalloc(inode, start, end, end, 602 extent_clear_unlock_delalloc(inode, start, end, end,
599 NULL, clear_flags, 603 NULL, clear_flags,
@@ -602,10 +606,6 @@ cont:
602 PAGE_SET_WRITEBACK | 606 PAGE_SET_WRITEBACK |
603 page_error_op | 607 page_error_op |
604 PAGE_END_WRITEBACK); 608 PAGE_END_WRITEBACK);
605 if (ret == 0)
606 btrfs_free_reserved_data_space_noquota(inode,
607 start,
608 end - start + 1);
609 goto free_pages_out; 609 goto free_pages_out;
610 } 610 }
611 } 611 }
@@ -625,7 +625,6 @@ cont:
625 */ 625 */
626 total_in = ALIGN(total_in, PAGE_SIZE); 626 total_in = ALIGN(total_in, PAGE_SIZE);
627 if (total_compressed + blocksize <= total_in) { 627 if (total_compressed + blocksize <= total_in) {
628 num_bytes = total_in;
629 *num_added += 1; 628 *num_added += 1;
630 629
631 /* 630 /*
@@ -633,12 +632,12 @@ cont:
633 * allocation on disk for these compressed pages, and 632 * allocation on disk for these compressed pages, and
634 * will submit them to the elevator. 633 * will submit them to the elevator.
635 */ 634 */
636 add_async_extent(async_cow, start, num_bytes, 635 add_async_extent(async_cow, start, total_in,
637 total_compressed, pages, nr_pages, 636 total_compressed, pages, nr_pages,
638 compress_type); 637 compress_type);
639 638
640 if (start + num_bytes < end) { 639 if (start + total_in < end) {
641 start += num_bytes; 640 start += total_in;
642 pages = NULL; 641 pages = NULL;
643 cond_resched(); 642 cond_resched();
644 goto again; 643 goto again;
@@ -982,15 +981,19 @@ static noinline int cow_file_range(struct inode *inode,
982 ret = cow_file_range_inline(root, inode, start, end, 0, 981 ret = cow_file_range_inline(root, inode, start, end, 0,
983 BTRFS_COMPRESS_NONE, NULL); 982 BTRFS_COMPRESS_NONE, NULL);
984 if (ret == 0) { 983 if (ret == 0) {
984 /*
985 * We use DO_ACCOUNTING here because we need the
986 * delalloc_release_metadata to be run _after_ we drop
987 * our outstanding extent for clearing delalloc for this
988 * range.
989 */
985 extent_clear_unlock_delalloc(inode, start, end, 990 extent_clear_unlock_delalloc(inode, start, end,
986 delalloc_end, NULL, 991 delalloc_end, NULL,
987 EXTENT_LOCKED | EXTENT_DELALLOC | 992 EXTENT_LOCKED | EXTENT_DELALLOC |
988 EXTENT_DELALLOC_NEW | 993 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
989 EXTENT_DEFRAG, PAGE_UNLOCK | 994 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
990 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 995 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
991 PAGE_END_WRITEBACK); 996 PAGE_END_WRITEBACK);
992 btrfs_free_reserved_data_space_noquota(inode, start,
993 end - start + 1);
994 *nr_written = *nr_written + 997 *nr_written = *nr_written +
995 (end - start + PAGE_SIZE) / PAGE_SIZE; 998 (end - start + PAGE_SIZE) / PAGE_SIZE;
996 *page_started = 1; 999 *page_started = 1;
@@ -1226,13 +1229,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1226 1229
1227 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); 1230 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1228 1231
1229 while (atomic_read(&fs_info->async_submit_draining) &&
1230 atomic_read(&fs_info->async_delalloc_pages)) {
1231 wait_event(fs_info->async_submit_wait,
1232 (atomic_read(&fs_info->async_delalloc_pages) ==
1233 0));
1234 }
1235
1236 *nr_written += nr_pages; 1232 *nr_written += nr_pages;
1237 start = cur_end + 1; 1233 start = cur_end + 1;
1238 } 1234 }
@@ -1635,7 +1631,7 @@ static void btrfs_split_extent_hook(void *private_data,
1635 } 1631 }
1636 1632
1637 spin_lock(&BTRFS_I(inode)->lock); 1633 spin_lock(&BTRFS_I(inode)->lock);
1638 BTRFS_I(inode)->outstanding_extents++; 1634 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1639 spin_unlock(&BTRFS_I(inode)->lock); 1635 spin_unlock(&BTRFS_I(inode)->lock);
1640} 1636}
1641 1637
@@ -1665,7 +1661,7 @@ static void btrfs_merge_extent_hook(void *private_data,
1665 /* we're not bigger than the max, unreserve the space and go */ 1661 /* we're not bigger than the max, unreserve the space and go */
1666 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 1662 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1667 spin_lock(&BTRFS_I(inode)->lock); 1663 spin_lock(&BTRFS_I(inode)->lock);
1668 BTRFS_I(inode)->outstanding_extents--; 1664 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1669 spin_unlock(&BTRFS_I(inode)->lock); 1665 spin_unlock(&BTRFS_I(inode)->lock);
1670 return; 1666 return;
1671 } 1667 }
@@ -1696,7 +1692,7 @@ static void btrfs_merge_extent_hook(void *private_data,
1696 return; 1692 return;
1697 1693
1698 spin_lock(&BTRFS_I(inode)->lock); 1694 spin_lock(&BTRFS_I(inode)->lock);
1699 BTRFS_I(inode)->outstanding_extents--; 1695 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1700 spin_unlock(&BTRFS_I(inode)->lock); 1696 spin_unlock(&BTRFS_I(inode)->lock);
1701} 1697}
1702 1698
@@ -1766,15 +1762,12 @@ static void btrfs_set_bit_hook(void *private_data,
1766 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1762 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1767 struct btrfs_root *root = BTRFS_I(inode)->root; 1763 struct btrfs_root *root = BTRFS_I(inode)->root;
1768 u64 len = state->end + 1 - state->start; 1764 u64 len = state->end + 1 - state->start;
1765 u32 num_extents = count_max_extents(len);
1769 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); 1766 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1770 1767
1771 if (*bits & EXTENT_FIRST_DELALLOC) { 1768 spin_lock(&BTRFS_I(inode)->lock);
1772 *bits &= ~EXTENT_FIRST_DELALLOC; 1769 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1773 } else { 1770 spin_unlock(&BTRFS_I(inode)->lock);
1774 spin_lock(&BTRFS_I(inode)->lock);
1775 BTRFS_I(inode)->outstanding_extents++;
1776 spin_unlock(&BTRFS_I(inode)->lock);
1777 }
1778 1771
1779 /* For sanity tests */ 1772 /* For sanity tests */
1780 if (btrfs_is_testing(fs_info)) 1773 if (btrfs_is_testing(fs_info))
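
Both hooks now funnel through the same two helpers: count_max_extents() converts a byte length into the worst-case number of delalloc extents, and btrfs_mod_outstanding_extents() applies a signed delta under inode->lock. The rounding mirrors the kernel helper, which divides by BTRFS_MAX_EXTENT_SIZE (128 MiB) rounding up; a standalone check:

	#include <assert.h>
	#include <stdint.h>

	#define BTRFS_MAX_EXTENT_SIZE (128ULL * 1024 * 1024)	/* 128 MiB */

	/* worst-case extent count for a delalloc range, rounded up */
	static uint32_t count_max_extents(uint64_t size)
	{
		return (size + BTRFS_MAX_EXTENT_SIZE - 1) / BTRFS_MAX_EXTENT_SIZE;
	}

	int main(void)
	{
		assert(count_max_extents(1) == 1);
		assert(count_max_extents(BTRFS_MAX_EXTENT_SIZE) == 1);
		assert(count_max_extents(BTRFS_MAX_EXTENT_SIZE + 1) == 2);
		return 0;
	}
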
@@ -1828,13 +1821,9 @@ static void btrfs_clear_bit_hook(void *private_data,
1828 struct btrfs_root *root = inode->root; 1821 struct btrfs_root *root = inode->root;
1829 bool do_list = !btrfs_is_free_space_inode(inode); 1822 bool do_list = !btrfs_is_free_space_inode(inode);
1830 1823
1831 if (*bits & EXTENT_FIRST_DELALLOC) { 1824 spin_lock(&inode->lock);
1832 *bits &= ~EXTENT_FIRST_DELALLOC; 1825 btrfs_mod_outstanding_extents(inode, -num_extents);
1833 } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { 1826 spin_unlock(&inode->lock);
1834 spin_lock(&inode->lock);
1835 inode->outstanding_extents -= num_extents;
1836 spin_unlock(&inode->lock);
1837 }
1838 1827
1839 /* 1828 /*
1840 * We don't reserve metadata space for space cache inodes so we 1829 * We don't reserve metadata space for space cache inodes so we
@@ -2105,6 +2094,7 @@ again:
2105 0); 2094 0);
2106 ClearPageChecked(page); 2095 ClearPageChecked(page);
2107 set_page_dirty(page); 2096 set_page_dirty(page);
2097 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2108out: 2098out:
2109 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 2099 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2110 &cached_state, GFP_NOFS); 2100 &cached_state, GFP_NOFS);
@@ -2229,8 +2219,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2229 if (ret < 0) 2219 if (ret < 0)
2230 goto out; 2220 goto out;
2231 qg_released = ret; 2221 qg_released = ret;
2232 ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, 2222 ret = btrfs_alloc_reserved_file_extent(trans, root,
2233 btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); 2223 btrfs_ino(BTRFS_I(inode)),
2224 file_pos, qg_released, &ins);
2234out: 2225out:
2235 btrfs_free_path(path); 2226 btrfs_free_path(path);
2236 2227
@@ -2464,7 +2455,7 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
2464 ret = iterate_inodes_from_logical(old->bytenr + 2455 ret = iterate_inodes_from_logical(old->bytenr +
2465 old->extent_offset, fs_info, 2456 old->extent_offset, fs_info,
2466 path, record_one_backref, 2457 path, record_one_backref,
2467 old); 2458 old, false);
2468 if (ret < 0 && ret != -ENOENT) 2459 if (ret < 0 && ret != -ENOENT)
2469 return false; 2460 return false;
2470 2461
@@ -2682,7 +2673,7 @@ again:
2682 inode_add_bytes(inode, len); 2673 inode_add_bytes(inode, len);
2683 btrfs_release_path(path); 2674 btrfs_release_path(path);
2684 2675
2685 ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, 2676 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2686 new->disk_len, 0, 2677 new->disk_len, 0,
2687 backref->root_id, backref->inum, 2678 backref->root_id, backref->inum,
2688 new->file_pos); /* start - extent_offset */ 2679 new->file_pos); /* start - extent_offset */
@@ -2964,7 +2955,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2964 trans = NULL; 2955 trans = NULL;
2965 goto out; 2956 goto out;
2966 } 2957 }
2967 trans->block_rsv = &fs_info->delalloc_block_rsv; 2958 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2968 ret = btrfs_update_inode_fallback(trans, root, inode); 2959 ret = btrfs_update_inode_fallback(trans, root, inode);
2969 if (ret) /* -ENOMEM or corruption */ 2960 if (ret) /* -ENOMEM or corruption */
2970 btrfs_abort_transaction(trans, ret); 2961 btrfs_abort_transaction(trans, ret);
@@ -3000,7 +2991,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
3000 goto out; 2991 goto out;
3001 } 2992 }
3002 2993
3003 trans->block_rsv = &fs_info->delalloc_block_rsv; 2994 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3004 2995
3005 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2996 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3006 compress_type = ordered_extent->compress_type; 2997 compress_type = ordered_extent->compress_type;
@@ -3058,9 +3049,6 @@ out:
3058 0, &cached_state, GFP_NOFS); 3049 0, &cached_state, GFP_NOFS);
3059 } 3050 }
3060 3051
3061 if (root != fs_info->tree_root)
3062 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3063 ordered_extent->len);
3064 if (trans) 3052 if (trans)
3065 btrfs_end_transaction(trans); 3053 btrfs_end_transaction(trans);
3066 3054
@@ -4372,47 +4360,11 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
4372 4360
4373} 4361}
4374 4362
4375static int truncate_inline_extent(struct inode *inode, 4363/*
4376 struct btrfs_path *path, 4364 * Return this if we need to call truncate_block for the last bit of the
4377 struct btrfs_key *found_key, 4365 * truncate.
4378 const u64 item_end, 4366 */
4379 const u64 new_size) 4367#define NEED_TRUNCATE_BLOCK 1
4380{
4381 struct extent_buffer *leaf = path->nodes[0];
4382 int slot = path->slots[0];
4383 struct btrfs_file_extent_item *fi;
4384 u32 size = (u32)(new_size - found_key->offset);
4385 struct btrfs_root *root = BTRFS_I(inode)->root;
4386
4387 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4388
4389 if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4390 loff_t offset = new_size;
4391 loff_t page_end = ALIGN(offset, PAGE_SIZE);
4392
4393 /*
4394 * Zero out the remaining of the last page of our inline extent,
4395 * instead of directly truncating our inline extent here - that
4396 * would be much more complex (decompressing all the data, then
4397 * compressing the truncated data, which might be bigger than
4398 * the size of the inline extent, resize the extent, etc).
4399 * We release the path because to get the page we might need to
4400 * read the extent item from disk (data not in the page cache).
4401 */
4402 btrfs_release_path(path);
4403 return btrfs_truncate_block(inode, offset, page_end - offset,
4404 0);
4405 }
4406
4407 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4408 size = btrfs_file_extent_calc_inline_size(size);
4409 btrfs_truncate_item(root->fs_info, path, size, 1);
4410
4411 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4412 inode_sub_bytes(inode, item_end + 1 - new_size);
4413
4414 return 0;
4415}
4416 4368
4417/* 4369/*
4418 * this can truncate away extent items, csum items and directory items. 4370 * this can truncate away extent items, csum items and directory items.
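
Rather than truncating a compressed inline extent in place, btrfs_truncate_inode_items() now returns NEED_TRUNCATE_BLOCK and leaves the tail zeroing to the caller, outside any transaction handle; the matching caller-side hunk in btrfs_truncate() appears near the end of this file's changes. The resulting flow, condensed from that hunk:

	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
					 BTRFS_EXTENT_DATA_KEY);
	if (ret == NEED_TRUNCATE_BLOCK) {
		/* must not call btrfs_truncate_block inside a trans handle,
		 * or we could deadlock with freeze */
		btrfs_end_transaction(trans);
		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
		/* then start a fresh transaction and update disk_i_size */
	}
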
@@ -4451,9 +4403,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4451 int err = 0; 4403 int err = 0;
4452 u64 ino = btrfs_ino(BTRFS_I(inode)); 4404 u64 ino = btrfs_ino(BTRFS_I(inode));
4453 u64 bytes_deleted = 0; 4405 u64 bytes_deleted = 0;
4454 bool be_nice = 0; 4406 bool be_nice = false;
4455 bool should_throttle = 0; 4407 bool should_throttle = false;
4456 bool should_end = 0; 4408 bool should_end = false;
4457 4409
4458 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4410 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4459 4411
@@ -4463,7 +4415,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4463 */ 4415 */
4464 if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && 4416 if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
4465 test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4417 test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4466 be_nice = 1; 4418 be_nice = true;
4467 4419
4468 path = btrfs_alloc_path(); 4420 path = btrfs_alloc_path();
4469 if (!path) 4421 if (!path)
@@ -4573,11 +4525,6 @@ search_again:
4573 if (found_type != BTRFS_EXTENT_DATA_KEY) 4525 if (found_type != BTRFS_EXTENT_DATA_KEY)
4574 goto delete; 4526 goto delete;
4575 4527
4576 if (del_item)
4577 last_size = found_key.offset;
4578 else
4579 last_size = new_size;
4580
4581 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4528 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4582 u64 num_dec; 4529 u64 num_dec;
4583 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4530 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -4619,40 +4566,30 @@ search_again:
4619 */ 4566 */
4620 if (!del_item && 4567 if (!del_item &&
4621 btrfs_file_extent_encryption(leaf, fi) == 0 && 4568 btrfs_file_extent_encryption(leaf, fi) == 0 &&
4622 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4569 btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
4623 4570 btrfs_file_extent_compression(leaf, fi) == 0) {
4571 u32 size = (u32)(new_size - found_key.offset);
4572
4573 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4574 size = btrfs_file_extent_calc_inline_size(size);
4575 btrfs_truncate_item(root->fs_info, path, size, 1);
4576 } else if (!del_item) {
4624 /* 4577 /*
4625 * Need to release path in order to truncate a 4578 * We have to bail so the last_size is set to
4626 * compressed extent. So delete any accumulated 4579 * just before this extent.
4627 * extent items so far.
4628 */ 4580 */
4629 if (btrfs_file_extent_compression(leaf, fi) != 4581 err = NEED_TRUNCATE_BLOCK;
4630 BTRFS_COMPRESS_NONE && pending_del_nr) { 4582 break;
4631 err = btrfs_del_items(trans, root, path, 4583 }
4632 pending_del_slot,
4633 pending_del_nr);
4634 if (err) {
4635 btrfs_abort_transaction(trans,
4636 err);
4637 goto error;
4638 }
4639 pending_del_nr = 0;
4640 }
4641 4584
4642 err = truncate_inline_extent(inode, path, 4585 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4643 &found_key,
4644 item_end,
4645 new_size);
4646 if (err) {
4647 btrfs_abort_transaction(trans, err);
4648 goto error;
4649 }
4650 } else if (test_bit(BTRFS_ROOT_REF_COWS,
4651 &root->state)) {
4652 inode_sub_bytes(inode, item_end + 1 - new_size); 4586 inode_sub_bytes(inode, item_end + 1 - new_size);
4653 }
4654 } 4587 }
4655delete: 4588delete:
4589 if (del_item)
4590 last_size = found_key.offset;
4591 else
4592 last_size = new_size;
4656 if (del_item) { 4593 if (del_item) {
4657 if (!pending_del_nr) { 4594 if (!pending_del_nr) {
4658 /* no pending yet, add ourselves */ 4595 /* no pending yet, add ourselves */
@@ -4669,14 +4606,14 @@ delete:
4669 } else { 4606 } else {
4670 break; 4607 break;
4671 } 4608 }
4672 should_throttle = 0; 4609 should_throttle = false;
4673 4610
4674 if (found_extent && 4611 if (found_extent &&
4675 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4612 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4676 root == fs_info->tree_root)) { 4613 root == fs_info->tree_root)) {
4677 btrfs_set_path_blocking(path); 4614 btrfs_set_path_blocking(path);
4678 bytes_deleted += extent_num_bytes; 4615 bytes_deleted += extent_num_bytes;
4679 ret = btrfs_free_extent(trans, fs_info, extent_start, 4616 ret = btrfs_free_extent(trans, root, extent_start,
4680 extent_num_bytes, 0, 4617 extent_num_bytes, 0,
4681 btrfs_header_owner(leaf), 4618 btrfs_header_owner(leaf),
4682 ino, extent_offset); 4619 ino, extent_offset);
@@ -4688,11 +4625,11 @@ delete:
4688 if (be_nice) { 4625 if (be_nice) {
4689 if (truncate_space_check(trans, root, 4626 if (truncate_space_check(trans, root,
4690 extent_num_bytes)) { 4627 extent_num_bytes)) {
4691 should_end = 1; 4628 should_end = true;
4692 } 4629 }
4693 if (btrfs_should_throttle_delayed_refs(trans, 4630 if (btrfs_should_throttle_delayed_refs(trans,
4694 fs_info)) 4631 fs_info))
4695 should_throttle = 1; 4632 should_throttle = true;
4696 } 4633 }
4697 } 4634 }
4698 4635
@@ -4801,8 +4738,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4801 (!len || ((len & (blocksize - 1)) == 0))) 4738 (!len || ((len & (blocksize - 1)) == 0)))
4802 goto out; 4739 goto out;
4803 4740
4741 block_start = round_down(from, blocksize);
4742 block_end = block_start + blocksize - 1;
4743
4804 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 4744 ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
4805 round_down(from, blocksize), blocksize); 4745 block_start, blocksize);
4806 if (ret) 4746 if (ret)
4807 goto out; 4747 goto out;
4808 4748
@@ -4810,15 +4750,12 @@ again:
4810 page = find_or_create_page(mapping, index, mask); 4750 page = find_or_create_page(mapping, index, mask);
4811 if (!page) { 4751 if (!page) {
4812 btrfs_delalloc_release_space(inode, data_reserved, 4752 btrfs_delalloc_release_space(inode, data_reserved,
4813 round_down(from, blocksize), 4753 block_start, blocksize);
4814 blocksize); 4754 btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
4815 ret = -ENOMEM; 4755 ret = -ENOMEM;
4816 goto out; 4756 goto out;
4817 } 4757 }
4818 4758
4819 block_start = round_down(from, blocksize);
4820 block_end = block_start + blocksize - 1;
4821
4822 if (!PageUptodate(page)) { 4759 if (!PageUptodate(page)) {
4823 ret = btrfs_readpage(NULL, page); 4760 ret = btrfs_readpage(NULL, page);
4824 lock_page(page); 4761 lock_page(page);
@@ -4883,6 +4820,7 @@ out_unlock:
4883 if (ret) 4820 if (ret)
4884 btrfs_delalloc_release_space(inode, data_reserved, block_start, 4821 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4885 blocksize); 4822 blocksize);
4823 btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
4886 unlock_page(page); 4824 unlock_page(page);
4887 put_page(page); 4825 put_page(page);
4888out: 4826out:
@@ -7797,33 +7735,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7797 return em; 7735 return em;
7798} 7736}
7799 7737
7800static void adjust_dio_outstanding_extents(struct inode *inode,
7801 struct btrfs_dio_data *dio_data,
7802 const u64 len)
7803{
7804 unsigned num_extents = count_max_extents(len);
7805
7806 /*
7807 * If we have an outstanding_extents count still set then we're
7808 * within our reservation, otherwise we need to adjust our inode
7809 * counter appropriately.
7810 */
7811 if (dio_data->outstanding_extents >= num_extents) {
7812 dio_data->outstanding_extents -= num_extents;
7813 } else {
7814 /*
7815 * If dio write length has been split due to no large enough
7816 * contiguous space, we need to compensate our inode counter
7817 * appropriately.
7818 */
7819 u64 num_needed = num_extents - dio_data->outstanding_extents;
7820
7821 spin_lock(&BTRFS_I(inode)->lock);
7822 BTRFS_I(inode)->outstanding_extents += num_needed;
7823 spin_unlock(&BTRFS_I(inode)->lock);
7824 }
7825}
7826
7827static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 7738static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7828 struct buffer_head *bh_result, int create) 7739 struct buffer_head *bh_result, int create)
7829{ 7740{
@@ -7985,7 +7896,6 @@ unlock:
7985 if (!dio_data->overwrite && start + len > i_size_read(inode)) 7896 if (!dio_data->overwrite && start + len > i_size_read(inode))
7986 i_size_write(inode, start + len); 7897 i_size_write(inode, start + len);
7987 7898
7988 adjust_dio_outstanding_extents(inode, dio_data, len);
7989 WARN_ON(dio_data->reserve < len); 7899 WARN_ON(dio_data->reserve < len);
7990 dio_data->reserve -= len; 7900 dio_data->reserve -= len;
7991 dio_data->unsubmitted_oe_range_end = start + len; 7901 dio_data->unsubmitted_oe_range_end = start + len;
@@ -8015,14 +7925,6 @@ unlock_err:
8015err: 7925err:
8016 if (dio_data) 7926 if (dio_data)
8017 current->journal_info = dio_data; 7927 current->journal_info = dio_data;
8018 /*
8019 * Compensate the delalloc release we do in btrfs_direct_IO() when we
8020 * write less data then expected, so that we don't underflow our inode's
8021 * outstanding extents counter.
8022 */
8023 if (create && dio_data)
8024 adjust_dio_outstanding_extents(inode, dio_data, len);
8025
8026 return ret; 7928 return ret;
8027} 7929}
8028 7930
@@ -8495,7 +8397,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
8495 if (dip->errors) { 8397 if (dip->errors) {
8496 bio_io_error(dip->orig_bio); 8398 bio_io_error(dip->orig_bio);
8497 } else { 8399 } else {
8498 dip->dio_bio->bi_status = 0; 8400 dip->dio_bio->bi_status = BLK_STS_OK;
8499 bio_endio(dip->orig_bio); 8401 bio_endio(dip->orig_bio);
8500 } 8402 }
8501out: 8403out:
@@ -8577,7 +8479,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
8577 goto err; 8479 goto err;
8578 } 8480 }
8579map: 8481map:
8580 ret = btrfs_map_bio(fs_info, bio, 0, async_submit); 8482 ret = btrfs_map_bio(fs_info, bio, 0, 0);
8581err: 8483err:
8582 bio_put(bio); 8484 bio_put(bio);
8583 return ret; 8485 return ret;
@@ -8786,7 +8688,6 @@ free_ordered:
8786} 8688}
8787 8689
8788static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, 8690static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
8789 struct kiocb *iocb,
8790 const struct iov_iter *iter, loff_t offset) 8691 const struct iov_iter *iter, loff_t offset)
8791{ 8692{
8792 int seg; 8693 int seg;
@@ -8833,7 +8734,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8833 bool relock = false; 8734 bool relock = false;
8834 ssize_t ret; 8735 ssize_t ret;
8835 8736
8836 if (check_direct_IO(fs_info, iocb, iter, offset)) 8737 if (check_direct_IO(fs_info, iter, offset))
8837 return 0; 8738 return 0;
8838 8739
8839 inode_dio_begin(inode); 8740 inode_dio_begin(inode);
@@ -8868,7 +8769,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8868 offset, count); 8769 offset, count);
8869 if (ret) 8770 if (ret)
8870 goto out; 8771 goto out;
8871 dio_data.outstanding_extents = count_max_extents(count);
8872 8772
8873 /* 8773 /*
8874 * We need to know how many extents we reserved so that we can 8774 * We need to know how many extents we reserved so that we can
@@ -8915,6 +8815,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8915 } else if (ret >= 0 && (size_t)ret < count) 8815 } else if (ret >= 0 && (size_t)ret < count)
8916 btrfs_delalloc_release_space(inode, data_reserved, 8816 btrfs_delalloc_release_space(inode, data_reserved,
8917 offset, count - (size_t)ret); 8817 offset, count - (size_t)ret);
8818 btrfs_delalloc_release_extents(BTRFS_I(inode), count);
8918 } 8819 }
8919out: 8820out:
8920 if (wakeup) 8821 if (wakeup)
@@ -9232,9 +9133,6 @@ again:
9232 fs_info->sectorsize); 9133 fs_info->sectorsize);
9233 if (reserved_space < PAGE_SIZE) { 9134 if (reserved_space < PAGE_SIZE) {
9234 end = page_start + reserved_space - 1; 9135 end = page_start + reserved_space - 1;
9235 spin_lock(&BTRFS_I(inode)->lock);
9236 BTRFS_I(inode)->outstanding_extents++;
9237 spin_unlock(&BTRFS_I(inode)->lock);
9238 btrfs_delalloc_release_space(inode, data_reserved, 9136 btrfs_delalloc_release_space(inode, data_reserved,
9239 page_start, PAGE_SIZE - reserved_space); 9137 page_start, PAGE_SIZE - reserved_space);
9240 } 9138 }
@@ -9286,12 +9184,14 @@ again:
9286 9184
9287out_unlock: 9185out_unlock:
9288 if (!ret) { 9186 if (!ret) {
9187 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9289 sb_end_pagefault(inode->i_sb); 9188 sb_end_pagefault(inode->i_sb);
9290 extent_changeset_free(data_reserved); 9189 extent_changeset_free(data_reserved);
9291 return VM_FAULT_LOCKED; 9190 return VM_FAULT_LOCKED;
9292 } 9191 }
9293 unlock_page(page); 9192 unlock_page(page);
9294out: 9193out:
9194 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9295 btrfs_delalloc_release_space(inode, data_reserved, page_start, 9195 btrfs_delalloc_release_space(inode, data_reserved, page_start,
9296 reserved_space); 9196 reserved_space);
9297out_noreserve: 9197out_noreserve:
@@ -9387,12 +9287,12 @@ static int btrfs_truncate(struct inode *inode)
9387 ret = btrfs_truncate_inode_items(trans, root, inode, 9287 ret = btrfs_truncate_inode_items(trans, root, inode,
9388 inode->i_size, 9288 inode->i_size,
9389 BTRFS_EXTENT_DATA_KEY); 9289 BTRFS_EXTENT_DATA_KEY);
9290 trans->block_rsv = &fs_info->trans_block_rsv;
9390 if (ret != -ENOSPC && ret != -EAGAIN) { 9291 if (ret != -ENOSPC && ret != -EAGAIN) {
9391 err = ret; 9292 err = ret;
9392 break; 9293 break;
9393 } 9294 }
9394 9295
9395 trans->block_rsv = &fs_info->trans_block_rsv;
9396 ret = btrfs_update_inode(trans, root, inode); 9296 ret = btrfs_update_inode(trans, root, inode);
9397 if (ret) { 9297 if (ret) {
9398 err = ret; 9298 err = ret;
@@ -9416,6 +9316,27 @@ static int btrfs_truncate(struct inode *inode)
9416 trans->block_rsv = rsv; 9316 trans->block_rsv = rsv;
9417 } 9317 }
9418 9318
9319 /*
9320 * We can't call btrfs_truncate_block inside a trans handle as we could
9321	 * deadlock with freeze. If we got NEED_TRUNCATE_BLOCK then we know
9322 * we've truncated everything except the last little bit, and can do
9323 * btrfs_truncate_block and then update the disk_i_size.
9324 */
9325 if (ret == NEED_TRUNCATE_BLOCK) {
9326 btrfs_end_transaction(trans);
9327 btrfs_btree_balance_dirty(fs_info);
9328
9329 ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
9330 if (ret)
9331 goto out;
9332 trans = btrfs_start_transaction(root, 1);
9333 if (IS_ERR(trans)) {
9334 ret = PTR_ERR(trans);
9335 goto out;
9336 }
9337 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
9338 }
9339
9419 if (ret == 0 && inode->i_nlink > 0) { 9340 if (ret == 0 && inode->i_nlink > 0) {
9420 trans->block_rsv = root->orphan_block_rsv; 9341 trans->block_rsv = root->orphan_block_rsv;
9421 ret = btrfs_orphan_del(trans, BTRFS_I(inode)); 9342 ret = btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -9480,6 +9401,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9480 9401
9481struct inode *btrfs_alloc_inode(struct super_block *sb) 9402struct inode *btrfs_alloc_inode(struct super_block *sb)
9482{ 9403{
9404 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
9483 struct btrfs_inode *ei; 9405 struct btrfs_inode *ei;
9484 struct inode *inode; 9406 struct inode *inode;
9485 9407
@@ -9506,8 +9428,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
9506 9428
9507 spin_lock_init(&ei->lock); 9429 spin_lock_init(&ei->lock);
9508 ei->outstanding_extents = 0; 9430 ei->outstanding_extents = 0;
9509 ei->reserved_extents = 0; 9431 if (sb->s_magic != BTRFS_TEST_MAGIC)
9510 9432 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
9433 BTRFS_BLOCK_RSV_DELALLOC);
9511 ei->runtime_flags = 0; 9434 ei->runtime_flags = 0;
9512 ei->prop_compress = BTRFS_COMPRESS_NONE; 9435 ei->prop_compress = BTRFS_COMPRESS_NONE;
9513 ei->defrag_compress = BTRFS_COMPRESS_NONE; 9436 ei->defrag_compress = BTRFS_COMPRESS_NONE;
@@ -9557,8 +9480,9 @@ void btrfs_destroy_inode(struct inode *inode)
9557 9480
9558 WARN_ON(!hlist_empty(&inode->i_dentry)); 9481 WARN_ON(!hlist_empty(&inode->i_dentry));
9559 WARN_ON(inode->i_data.nrpages); 9482 WARN_ON(inode->i_data.nrpages);
9483 WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
9484 WARN_ON(BTRFS_I(inode)->block_rsv.size);
9560 WARN_ON(BTRFS_I(inode)->outstanding_extents); 9485 WARN_ON(BTRFS_I(inode)->outstanding_extents);
9561 WARN_ON(BTRFS_I(inode)->reserved_extents);
9562 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 9486 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9563 WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); 9487 WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9564 WARN_ON(BTRFS_I(inode)->csum_bytes); 9488 WARN_ON(BTRFS_I(inode)->csum_bytes);
@@ -10337,19 +10261,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
10337 ret = __start_delalloc_inodes(root, delay_iput, -1); 10261 ret = __start_delalloc_inodes(root, delay_iput, -1);
10338 if (ret > 0) 10262 if (ret > 0)
10339 ret = 0; 10263 ret = 0;
10340 /*
10341 * the filemap_flush will queue IO into the worker threads, but
10342 * we have to make sure the IO is actually started and that
10343 * ordered extents get created before we return
10344 */
10345 atomic_inc(&fs_info->async_submit_draining);
10346 while (atomic_read(&fs_info->nr_async_submits) ||
10347 atomic_read(&fs_info->async_delalloc_pages)) {
10348 wait_event(fs_info->async_submit_wait,
10349 (atomic_read(&fs_info->nr_async_submits) == 0 &&
10350 atomic_read(&fs_info->async_delalloc_pages) == 0));
10351 }
10352 atomic_dec(&fs_info->async_submit_draining);
10353 return ret; 10264 return ret;
10354} 10265}
10355 10266
@@ -10391,14 +10302,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
10391 spin_unlock(&fs_info->delalloc_root_lock); 10302 spin_unlock(&fs_info->delalloc_root_lock);
10392 10303
10393 ret = 0; 10304 ret = 0;
10394 atomic_inc(&fs_info->async_submit_draining);
10395 while (atomic_read(&fs_info->nr_async_submits) ||
10396 atomic_read(&fs_info->async_delalloc_pages)) {
10397 wait_event(fs_info->async_submit_wait,
10398 (atomic_read(&fs_info->nr_async_submits) == 0 &&
10399 atomic_read(&fs_info->async_delalloc_pages) == 0));
10400 }
10401 atomic_dec(&fs_info->async_submit_draining);
10402out: 10305out:
10403 if (!list_empty_careful(&splice)) { 10306 if (!list_empty_careful(&splice)) {
10404 spin_lock(&fs_info->delalloc_root_lock); 10307 spin_lock(&fs_info->delalloc_root_lock);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6c7a49faf4e0..fd172a93d11a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -86,6 +86,19 @@ struct btrfs_ioctl_received_subvol_args_32 {
86 struct btrfs_ioctl_received_subvol_args_32) 86 struct btrfs_ioctl_received_subvol_args_32)
87#endif 87#endif
88 88
89#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
90struct btrfs_ioctl_send_args_32 {
91 __s64 send_fd; /* in */
92 __u64 clone_sources_count; /* in */
93 compat_uptr_t clone_sources; /* in */
94 __u64 parent_root; /* in */
95 __u64 flags; /* in */
96 __u64 reserved[4]; /* in */
97} __attribute__ ((__packed__));
98
99#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
100 struct btrfs_ioctl_send_args_32)
101#endif
89 102
90static int btrfs_clone(struct inode *src, struct inode *inode, 103static int btrfs_clone(struct inode *src, struct inode *inode,
91 u64 off, u64 olen, u64 olen_aligned, u64 destoff, 104 u64 off, u64 olen, u64 olen_aligned, u64 destoff,
@@ -609,23 +622,6 @@ fail_free:
609 return ret; 622 return ret;
610} 623}
611 624
612static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root)
613{
614 s64 writers;
615 DEFINE_WAIT(wait);
616
617 do {
618 prepare_to_wait(&root->subv_writers->wait, &wait,
619 TASK_UNINTERRUPTIBLE);
620
621 writers = percpu_counter_sum(&root->subv_writers->counter);
622 if (writers)
623 schedule();
624
625 finish_wait(&root->subv_writers->wait, &wait);
626 } while (writers);
627}
628
629static int create_snapshot(struct btrfs_root *root, struct inode *dir, 625static int create_snapshot(struct btrfs_root *root, struct inode *dir,
630 struct dentry *dentry, 626 struct dentry *dentry,
631 u64 *async_transid, bool readonly, 627 u64 *async_transid, bool readonly,
@@ -654,7 +650,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
654 650
655 atomic_inc(&root->will_be_snapshotted); 651 atomic_inc(&root->will_be_snapshotted);
656 smp_mb__after_atomic(); 652 smp_mb__after_atomic();
657 btrfs_wait_for_no_snapshotting_writes(root); 653 /* wait for no snapshot writes */
654 wait_event(root->subv_writers->wait,
655 percpu_counter_sum(&root->subv_writers->counter) == 0);
658 656
659 ret = btrfs_start_delalloc_inodes(root, 0); 657 ret = btrfs_start_delalloc_inodes(root, 0);
660 if (ret) 658 if (ret)
@@ -1219,6 +1217,7 @@ again:
1219 unlock_page(pages[i]); 1217 unlock_page(pages[i]);
1220 put_page(pages[i]); 1218 put_page(pages[i]);
1221 } 1219 }
1220 btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
1222 extent_changeset_free(data_reserved); 1221 extent_changeset_free(data_reserved);
1223 return i_done; 1222 return i_done;
1224out: 1223out:
@@ -1229,6 +1228,7 @@ out:
1229 btrfs_delalloc_release_space(inode, data_reserved, 1228 btrfs_delalloc_release_space(inode, data_reserved,
1230 start_index << PAGE_SHIFT, 1229 start_index << PAGE_SHIFT,
1231 page_cnt << PAGE_SHIFT); 1230 page_cnt << PAGE_SHIFT);
1231 btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
1232 extent_changeset_free(data_reserved); 1232 extent_changeset_free(data_reserved);
1233 return ret; 1233 return ret;
1234 1234
@@ -1420,21 +1420,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1420 filemap_flush(inode->i_mapping); 1420 filemap_flush(inode->i_mapping);
1421 } 1421 }
1422 1422
1423 if (do_compress) {
1424 /* the filemap_flush will queue IO into the worker threads, but
1425 * we have to make sure the IO is actually started and that
1426 * ordered extents get created before we return
1427 */
1428 atomic_inc(&fs_info->async_submit_draining);
1429 while (atomic_read(&fs_info->nr_async_submits) ||
1430 atomic_read(&fs_info->async_delalloc_pages)) {
1431 wait_event(fs_info->async_submit_wait,
1432 (atomic_read(&fs_info->nr_async_submits) == 0 &&
1433 atomic_read(&fs_info->async_delalloc_pages) == 0));
1434 }
1435 atomic_dec(&fs_info->async_submit_draining);
1436 }
1437
1438 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1423 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1439 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); 1424 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1440 } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { 1425 } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
@@ -1842,8 +1827,13 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1842 1827
1843 ret = btrfs_update_root(trans, fs_info->tree_root, 1828 ret = btrfs_update_root(trans, fs_info->tree_root,
1844 &root->root_key, &root->root_item); 1829 &root->root_key, &root->root_item);
1830 if (ret < 0) {
1831 btrfs_end_transaction(trans);
1832 goto out_reset;
1833 }
1834
1835 ret = btrfs_commit_transaction(trans);
1845 1836
1846 btrfs_commit_transaction(trans);
1847out_reset: 1837out_reset:
1848 if (ret) 1838 if (ret)
1849 btrfs_set_root_flags(&root->root_item, root_flags); 1839 btrfs_set_root_flags(&root->root_item, root_flags);
@@ -2179,7 +2169,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
2179 2169
2180 inode = file_inode(file); 2170 inode = file_inode(file);
2181 ret = search_ioctl(inode, &args.key, &buf_size, 2171 ret = search_ioctl(inode, &args.key, &buf_size,
2182 (char *)(&uarg->buf[0])); 2172 (char __user *)(&uarg->buf[0]));
2183 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) 2173 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
2184 ret = -EFAULT; 2174 ret = -EFAULT;
2185 else if (ret == -EOVERFLOW && 2175 else if (ret == -EOVERFLOW &&
@@ -3706,7 +3696,7 @@ process_slot:
3706 if (disko) { 3696 if (disko) {
3707 inode_add_bytes(inode, datal); 3697 inode_add_bytes(inode, datal);
3708 ret = btrfs_inc_extent_ref(trans, 3698 ret = btrfs_inc_extent_ref(trans,
3709 fs_info, 3699 root,
3710 disko, diskl, 0, 3700 disko, diskl, 0,
3711 root->root_key.objectid, 3701 root->root_key.objectid,
3712 btrfs_ino(BTRFS_I(inode)), 3702 btrfs_ino(BTRFS_I(inode)),
@@ -4129,10 +4119,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
4129 struct btrfs_ioctl_space_info *dest_orig; 4119 struct btrfs_ioctl_space_info *dest_orig;
4130 struct btrfs_ioctl_space_info __user *user_dest; 4120 struct btrfs_ioctl_space_info __user *user_dest;
4131 struct btrfs_space_info *info; 4121 struct btrfs_space_info *info;
4132 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 4122 static const u64 types[] = {
4133 BTRFS_BLOCK_GROUP_SYSTEM, 4123 BTRFS_BLOCK_GROUP_DATA,
4134 BTRFS_BLOCK_GROUP_METADATA, 4124 BTRFS_BLOCK_GROUP_SYSTEM,
4135 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; 4125 BTRFS_BLOCK_GROUP_METADATA,
4126 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
4127 };
4136 int num_types = 4; 4128 int num_types = 4;
4137 int alloc_size; 4129 int alloc_size;
4138 int ret = 0; 4130 int ret = 0;
@@ -4504,8 +4496,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
4504 ipath->fspath->val[i] = rel_ptr; 4496 ipath->fspath->val[i] = rel_ptr;
4505 } 4497 }
4506 4498
4507 ret = copy_to_user((void *)(unsigned long)ipa->fspath, 4499 ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
4508 (void *)(unsigned long)ipath->fspath, size); 4500 ipath->fspath, size);
4509 if (ret) { 4501 if (ret) {
4510 ret = -EFAULT; 4502 ret = -EFAULT;
4511 goto out; 4503 goto out;
@@ -4540,13 +4532,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
4540} 4532}
4541 4533
4542static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, 4534static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
4543 void __user *arg) 4535 void __user *arg, int version)
4544{ 4536{
4545 int ret = 0; 4537 int ret = 0;
4546 int size; 4538 int size;
4547 struct btrfs_ioctl_logical_ino_args *loi; 4539 struct btrfs_ioctl_logical_ino_args *loi;
4548 struct btrfs_data_container *inodes = NULL; 4540 struct btrfs_data_container *inodes = NULL;
4549 struct btrfs_path *path = NULL; 4541 struct btrfs_path *path = NULL;
4542 bool ignore_offset;
4550 4543
4551 if (!capable(CAP_SYS_ADMIN)) 4544 if (!capable(CAP_SYS_ADMIN))
4552 return -EPERM; 4545 return -EPERM;
@@ -4555,13 +4548,30 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
4555 if (IS_ERR(loi)) 4548 if (IS_ERR(loi))
4556 return PTR_ERR(loi); 4549 return PTR_ERR(loi);
4557 4550
4551 if (version == 1) {
4552 ignore_offset = false;
4553 size = min_t(u32, loi->size, SZ_64K);
4554 } else {
4555 /* All reserved bits must be 0 for now */
4556 if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
4557 ret = -EINVAL;
4558 goto out_loi;
4559 }
4560 /* Only accept flags we have defined so far */
4561 if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
4562 ret = -EINVAL;
4563 goto out_loi;
4564 }
4565 ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
4566 size = min_t(u32, loi->size, SZ_16M);
4567 }
4568
4558 path = btrfs_alloc_path(); 4569 path = btrfs_alloc_path();
4559 if (!path) { 4570 if (!path) {
4560 ret = -ENOMEM; 4571 ret = -ENOMEM;
4561 goto out; 4572 goto out;
4562 } 4573 }
4563 4574
4564 size = min_t(u32, loi->size, SZ_64K);
4565 inodes = init_data_container(size); 4575 inodes = init_data_container(size);
4566 if (IS_ERR(inodes)) { 4576 if (IS_ERR(inodes)) {
4567 ret = PTR_ERR(inodes); 4577 ret = PTR_ERR(inodes);
@@ -4570,20 +4580,21 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
4570 } 4580 }
4571 4581
4572 ret = iterate_inodes_from_logical(loi->logical, fs_info, path, 4582 ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
4573 build_ino_list, inodes); 4583 build_ino_list, inodes, ignore_offset);
4574 if (ret == -EINVAL) 4584 if (ret == -EINVAL)
4575 ret = -ENOENT; 4585 ret = -ENOENT;
4576 if (ret < 0) 4586 if (ret < 0)
4577 goto out; 4587 goto out;
4578 4588
4579 ret = copy_to_user((void *)(unsigned long)loi->inodes, 4589 ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
4580 (void *)(unsigned long)inodes, size); 4590 size);
4581 if (ret) 4591 if (ret)
4582 ret = -EFAULT; 4592 ret = -EFAULT;
4583 4593
4584out: 4594out:
4585 btrfs_free_path(path); 4595 btrfs_free_path(path);
4586 kvfree(inodes); 4596 kvfree(inodes);
4597out_loi:
4587 kfree(loi); 4598 kfree(loi);
4588 4599
4589 return ret; 4600 return ret;
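With the validation above in place, a hypothetical userspace sketch of the v2 call (field names follow the uapi header of this series; the 16M buffer mirrors the SZ_16M cap enforced above, and CAP_SYS_ADMIN is required):

	#include <stdint.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	/* Resolve a logical address to the inodes referencing it, asking for
	 * every extent ref regardless of file offset. */
	static int resolve_logical(int fs_fd, __u64 logical)
	{
		const __u64 bufsize = 16 * 1024 * 1024;
		struct btrfs_ioctl_logical_ino_args loi = {
			.logical = logical,
			.size = bufsize,
			.flags = BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET,
		};
		void *buf = calloc(1, bufsize);
		int ret;

		if (!buf)
			return -1;
		loi.inodes = (uintptr_t)buf;
		ret = ioctl(fs_fd, BTRFS_IOC_LOGICAL_INO_V2, &loi);
		free(buf);
		return ret;
	}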
@@ -5160,15 +5171,11 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
5160 root->root_key.objectid); 5171 root->root_key.objectid);
5161 if (ret < 0 && ret != -EEXIST) { 5172 if (ret < 0 && ret != -EEXIST) {
5162 btrfs_abort_transaction(trans, ret); 5173 btrfs_abort_transaction(trans, ret);
5174 btrfs_end_transaction(trans);
5163 goto out; 5175 goto out;
5164 } 5176 }
5165 } 5177 }
5166 ret = btrfs_commit_transaction(trans); 5178 ret = btrfs_commit_transaction(trans);
5167 if (ret < 0) {
5168 btrfs_abort_transaction(trans, ret);
5169 goto out;
5170 }
5171
5172out: 5179out:
5173 up_write(&fs_info->subvol_sem); 5180 up_write(&fs_info->subvol_sem);
5174 mnt_drop_write_file(file); 5181 mnt_drop_write_file(file);
@@ -5490,6 +5497,41 @@ out_drop_write:
5490 return ret; 5497 return ret;
5491} 5498}
5492 5499
5500static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
5501{
5502 struct btrfs_ioctl_send_args *arg;
5503 int ret;
5504
5505 if (compat) {
5506#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5507 struct btrfs_ioctl_send_args_32 args32;
5508
5509 ret = copy_from_user(&args32, argp, sizeof(args32));
5510 if (ret)
5511 return -EFAULT;
5512 arg = kzalloc(sizeof(*arg), GFP_KERNEL);
5513 if (!arg)
5514 return -ENOMEM;
5515 arg->send_fd = args32.send_fd;
5516 arg->clone_sources_count = args32.clone_sources_count;
5517 arg->clone_sources = compat_ptr(args32.clone_sources);
5518 arg->parent_root = args32.parent_root;
5519 arg->flags = args32.flags;
5520 memcpy(arg->reserved, args32.reserved,
5521 sizeof(args32.reserved));
5522#else
5523 return -ENOTTY;
5524#endif
5525 } else {
5526 arg = memdup_user(argp, sizeof(*arg));
5527 if (IS_ERR(arg))
5528 return PTR_ERR(arg);
5529 }
5530 ret = btrfs_ioctl_send(file, arg);
5531 kfree(arg);
5532 return ret;
5533}
5534
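_btrfs_ioctl_send() unpacks the packed 32-bit layout into the native struct before calling btrfs_ioctl_send(), so the same send request now works from 32-bit programs on a 64-bit kernel. A hypothetical userspace sketch (not part of this diff; field names follow the uapi header):

	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	/* Issue a full send of the subvolume behind subvol_fd, writing the
	 * stream to stream_fd.  A 32-bit build of this code now reaches the
	 * kernel via BTRFS_IOC_SEND_32 and is converted as above. */
	static int full_send(int subvol_fd, int stream_fd)
	{
		struct btrfs_ioctl_send_args args = {
			.send_fd = stream_fd,
			.parent_root = 0,	/* no parent: full send */
		};

		return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
	}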
5493long btrfs_ioctl(struct file *file, unsigned int 5535long btrfs_ioctl(struct file *file, unsigned int
5494 cmd, unsigned long arg) 5536 cmd, unsigned long arg)
5495{ 5537{
@@ -5554,7 +5596,9 @@ long btrfs_ioctl(struct file *file, unsigned int
5554 case BTRFS_IOC_INO_PATHS: 5596 case BTRFS_IOC_INO_PATHS:
5555 return btrfs_ioctl_ino_to_path(root, argp); 5597 return btrfs_ioctl_ino_to_path(root, argp);
5556 case BTRFS_IOC_LOGICAL_INO: 5598 case BTRFS_IOC_LOGICAL_INO:
5557 return btrfs_ioctl_logical_to_ino(fs_info, argp); 5599 return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
5600 case BTRFS_IOC_LOGICAL_INO_V2:
5601 return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
5558 case BTRFS_IOC_SPACE_INFO: 5602 case BTRFS_IOC_SPACE_INFO:
5559 return btrfs_ioctl_space_info(fs_info, argp); 5603 return btrfs_ioctl_space_info(fs_info, argp);
5560 case BTRFS_IOC_SYNC: { 5604 case BTRFS_IOC_SYNC: {
@@ -5595,7 +5639,11 @@ long btrfs_ioctl(struct file *file, unsigned int
5595 return btrfs_ioctl_set_received_subvol_32(file, argp); 5639 return btrfs_ioctl_set_received_subvol_32(file, argp);
5596#endif 5640#endif
5597 case BTRFS_IOC_SEND: 5641 case BTRFS_IOC_SEND:
5598 return btrfs_ioctl_send(file, argp); 5642 return _btrfs_ioctl_send(file, argp, false);
5643#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5644 case BTRFS_IOC_SEND_32:
5645 return _btrfs_ioctl_send(file, argp, true);
5646#endif
5599 case BTRFS_IOC_GET_DEV_STATS: 5647 case BTRFS_IOC_GET_DEV_STATS:
5600 return btrfs_ioctl_get_dev_stats(fs_info, argp); 5648 return btrfs_ioctl_get_dev_stats(fs_info, argp);
5601 case BTRFS_IOC_QUOTA_CTL: 5649 case BTRFS_IOC_QUOTA_CTL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d433e75d489a..6c7f18cd3b61 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -430,10 +430,15 @@ out:
430 return ret; 430 return ret;
431} 431}
432 432
433static void lzo_set_level(struct list_head *ws, unsigned int type)
434{
435}
436
433const struct btrfs_compress_op btrfs_lzo_compress = { 437const struct btrfs_compress_op btrfs_lzo_compress = {
434 .alloc_workspace = lzo_alloc_workspace, 438 .alloc_workspace = lzo_alloc_workspace,
435 .free_workspace = lzo_free_workspace, 439 .free_workspace = lzo_free_workspace,
436 .compress_pages = lzo_compress_pages, 440 .compress_pages = lzo_compress_pages,
437 .decompress_bio = lzo_decompress_bio, 441 .decompress_bio = lzo_decompress_bio,
438 .decompress = lzo_decompress, 442 .decompress = lzo_decompress,
443 .set_level = lzo_set_level,
439}; 444};
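The empty hook exists because zlib compression levels (-o compress=zlib:9) are plumbed through a new set_level() callback on btrfs_compress_op, and lzo has no levels to set. A sketch of what the zlib counterpart might look like; the encoding of the level in the type word is an assumption for illustration:

	/*
	 * Sketch (assumed form): store the requested level in the zlib
	 * workspace, clamped to 9 and defaulting to 3 when the mount
	 * option names no level.
	 */
	static void zlib_set_level(struct list_head *ws, unsigned int type)
	{
		struct workspace *workspace = list_entry(ws, struct workspace, list);
		unsigned int level = (type & 0xF0) >> 4;	/* assumed encoding */

		if (level > 9)
			level = 9;
		workspace->level = level > 0 ? level : 3;
	}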
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a3aca495e33e..5b311aeddcc8 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -242,6 +242,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
242 } 242 }
243 spin_unlock(&root->ordered_extent_lock); 243 spin_unlock(&root->ordered_extent_lock);
244 244
245 /*
246	 * We don't need count_max_extents() here; we can assume that all of
247 * that work has been done at higher layers, so this is truly the
248 * smallest the extent is going to get.
249 */
250 spin_lock(&BTRFS_I(inode)->lock);
251 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
252 spin_unlock(&BTRFS_I(inode)->lock);
253
245 return 0; 254 return 0;
246} 255}
247 256
@@ -591,11 +600,19 @@ void btrfs_remove_ordered_extent(struct inode *inode,
591{ 600{
592 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 601 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
593 struct btrfs_ordered_inode_tree *tree; 602 struct btrfs_ordered_inode_tree *tree;
594 struct btrfs_root *root = BTRFS_I(inode)->root; 603 struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
604 struct btrfs_root *root = btrfs_inode->root;
595 struct rb_node *node; 605 struct rb_node *node;
596 bool dec_pending_ordered = false; 606 bool dec_pending_ordered = false;
597 607
598 tree = &BTRFS_I(inode)->ordered_tree; 608 /* This is paired with btrfs_add_ordered_extent. */
609 spin_lock(&btrfs_inode->lock);
610 btrfs_mod_outstanding_extents(btrfs_inode, -1);
611 spin_unlock(&btrfs_inode->lock);
612 if (root != fs_info->tree_root)
613 btrfs_delalloc_release_metadata(btrfs_inode, entry->len);
614
615 tree = &btrfs_inode->ordered_tree;
599 spin_lock_irq(&tree->lock); 616 spin_lock_irq(&tree->lock);
600 node = &entry->rb_node; 617 node = &entry->rb_node;
601 rb_erase(node, &tree->tree); 618 rb_erase(node, &tree->tree);
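The +1/-1 pair above keeps exactly one outstanding extent alive per ordered extent, so its metadata reservation survives until the extent is safely on disk. A sketch of btrfs_mod_outstanding_extents() as introduced by this series, an assumption of its minimal form (the real helper lives in btrfs_inode.h):

	/*
	 * Sketch (assumed minimal form): adjust the per-inode outstanding
	 * extent count; both call sites above hold BTRFS_I(inode)->lock.
	 */
	static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
							 int mod)
	{
		lockdep_assert_held(&inode->lock);
		inode->outstanding_extents += mod;
		/* the real helper also fires a tracepoint here */
	}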
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e172d4843eae..168fd03ca3ac 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1441,7 +1441,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
1441 u64 bytenr = qrecord->bytenr; 1441 u64 bytenr = qrecord->bytenr;
1442 int ret; 1442 int ret;
1443 1443
1444 ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); 1444 ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
1445 if (ret < 0) 1445 if (ret < 0)
1446 return ret; 1446 return ret;
1447 1447
@@ -2031,7 +2031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
2031 /* Search commit root to find old_roots */ 2031 /* Search commit root to find old_roots */
2032 ret = btrfs_find_all_roots(NULL, fs_info, 2032 ret = btrfs_find_all_roots(NULL, fs_info,
2033 record->bytenr, 0, 2033 record->bytenr, 0,
2034 &record->old_roots); 2034 &record->old_roots, false);
2035 if (ret < 0) 2035 if (ret < 0)
2036 goto cleanup; 2036 goto cleanup;
2037 } 2037 }
@@ -2042,7 +2042,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
2042 * root. It's safe inside commit_transaction(). 2042 * root. It's safe inside commit_transaction().
2043 */ 2043 */
2044 ret = btrfs_find_all_roots(trans, fs_info, 2044 ret = btrfs_find_all_roots(trans, fs_info,
2045 record->bytenr, SEQ_LAST, &new_roots); 2045 record->bytenr, SEQ_LAST, &new_roots, false);
2046 if (ret < 0) 2046 if (ret < 0)
2047 goto cleanup; 2047 goto cleanup;
2048 if (qgroup_to_skip) { 2048 if (qgroup_to_skip) {
@@ -2570,7 +2570,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2570 num_bytes = found.offset; 2570 num_bytes = found.offset;
2571 2571
2572 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 2572 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
2573 &roots); 2573 &roots, false);
2574 if (ret < 0) 2574 if (ret < 0)
2575 goto out; 2575 goto out;
2576 /* For rescan, just pass old_roots as NULL */ 2576 /* For rescan, just pass old_roots as NULL */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 24a62224b24b..a7f79254ecca 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1326,6 +1326,9 @@ write_data:
1326 1326
1327cleanup: 1327cleanup:
1328 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1328 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1329
1330 while ((bio = bio_list_pop(&bio_list)))
1331 bio_put(bio);
1329} 1332}
1330 1333
1331/* 1334/*
@@ -1582,6 +1585,10 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1582 1585
1583cleanup: 1586cleanup:
1584 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1587 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1588
1589 while ((bio = bio_list_pop(&bio_list)))
1590 bio_put(bio);
1591
1585 return -EIO; 1592 return -EIO;
1586 1593
1587finish: 1594finish:
@@ -2107,6 +2114,10 @@ cleanup:
2107 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2114 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2108 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2115 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2109 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2116 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2117
2118 while ((bio = bio_list_pop(&bio_list)))
2119 bio_put(bio);
2120
2110 return -EIO; 2121 return -EIO;
2111} 2122}
2112 2123
@@ -2231,12 +2242,18 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2231 ASSERT(!bio->bi_iter.bi_size); 2242 ASSERT(!bio->bi_iter.bi_size);
2232 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2243 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2233 2244
2234 for (i = 0; i < rbio->real_stripes; i++) { 2245 /*
2246 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2247 * to the end position, so this search can start from the first parity
2248 * stripe.
2249 */
2250 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2235 if (bbio->stripes[i].dev == scrub_dev) { 2251 if (bbio->stripes[i].dev == scrub_dev) {
2236 rbio->scrubp = i; 2252 rbio->scrubp = i;
2237 break; 2253 break;
2238 } 2254 }
2239 } 2255 }
2256 ASSERT(i < rbio->real_stripes);
2240 2257
2241 /* Now we just support the sectorsize equals to page size */ 2258 /* Now we just support the sectorsize equals to page size */
2242 ASSERT(fs_info->sectorsize == PAGE_SIZE); 2259 ASSERT(fs_info->sectorsize == PAGE_SIZE);
@@ -2454,6 +2471,9 @@ submit_write:
2454 2471
2455cleanup: 2472cleanup:
2456 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2473 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2474
2475 while ((bio = bio_list_pop(&bio_list)))
2476 bio_put(bio);
2457} 2477}
2458 2478
2459static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 2479static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2563,12 +2583,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2563 int stripe; 2583 int stripe;
2564 struct bio *bio; 2584 struct bio *bio;
2565 2585
2586 bio_list_init(&bio_list);
2587
2566 ret = alloc_rbio_essential_pages(rbio); 2588 ret = alloc_rbio_essential_pages(rbio);
2567 if (ret) 2589 if (ret)
2568 goto cleanup; 2590 goto cleanup;
2569 2591
2570 bio_list_init(&bio_list);
2571
2572 atomic_set(&rbio->error, 0); 2592 atomic_set(&rbio->error, 0);
2573 /* 2593 /*
2574 * build a list of bios to read all the missing parts of this 2594 * build a list of bios to read all the missing parts of this
@@ -2636,6 +2656,10 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2636 2656
2637cleanup: 2657cleanup:
2638 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2658 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2659
2660 while ((bio = bio_list_pop(&bio_list)))
2661 bio_put(bio);
2662
2639 return; 2663 return;
2640 2664
2641finish: 2665finish:
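The raid56 hunks above all add the same error-path idiom: rbio_orig_end_io() fails the original request, but any bios still queued on the on-stack bio_list were never submitted, so their references must be dropped or they leak. Distilled into a hypothetical helper for clarity:

	/*
	 * Hypothetical helper (for illustration; the diff open-codes this):
	 * fail the rbio, then release every bio built but never submitted.
	 */
	static void rbio_fail_and_drain(struct btrfs_raid_bio *rbio,
					struct bio_list *bio_list,
					blk_status_t err)
	{
		struct bio *bio;

		rbio_orig_end_io(rbio, err);
		while ((bio = bio_list_pop(bio_list)))
			bio_put(bio);
	}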
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
new file mode 100644
index 000000000000..34878699d363
--- /dev/null
+++ b/fs/btrfs/ref-verify.c
@@ -0,0 +1,1031 @@
1/*
2 * Copyright (C) 2014 Facebook. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/stacktrace.h>
21#include "ctree.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "delayed-ref.h"
25#include "ref-verify.h"
26
27/*
28 * Used to keep track of the roots and the number of refs each root has for a
29 * given bytenr. This tracks only direct references, not shared
30 * references.
31 */
32struct root_entry {
33 u64 root_objectid;
34 u64 num_refs;
35 struct rb_node node;
36};
37
38/*
39 * These represent what should exist in the extent tree; they can be used to
40 * verify that the extent tree is consistent, as they should all match
41 * what the extent tree says.
42 */
43struct ref_entry {
44 u64 root_objectid;
45 u64 parent;
46 u64 owner;
47 u64 offset;
48 u64 num_refs;
49 struct rb_node node;
50};
51
52#define MAX_TRACE 16
53
54/*
55 * Whenever we add/remove a reference we record the action. The action maps
56 * back to the delayed ref action. We hold the ref we are changing in the
57 * action so we can account for the history properly, and we record the root we
58 * were called with since it could be different from ref_root. We also store
59 * stack traces because that's how I roll.
60 */
61struct ref_action {
62 int action;
63 u64 root;
64 struct ref_entry ref;
65 struct list_head list;
66 unsigned long trace[MAX_TRACE];
67 unsigned int trace_len;
68};
69
70/*
71 * One of these exists for every block we reference; it holds the roots and
72 * references to it as well as all of the ref actions that have occurred to it.
73 * We never free it until we unmount the file system, in order to make sure
74 * re-allocations are happening properly.
75 */
76struct block_entry {
77 u64 bytenr;
78 u64 len;
79 u64 num_refs;
80 int metadata;
81 int from_disk;
82 struct rb_root roots;
83 struct rb_root refs;
84 struct rb_node node;
85 struct list_head actions;
86};
87
88static struct block_entry *insert_block_entry(struct rb_root *root,
89 struct block_entry *be)
90{
91 struct rb_node **p = &root->rb_node;
92 struct rb_node *parent_node = NULL;
93 struct block_entry *entry;
94
95 while (*p) {
96 parent_node = *p;
97 entry = rb_entry(parent_node, struct block_entry, node);
98 if (entry->bytenr > be->bytenr)
99 p = &(*p)->rb_left;
100 else if (entry->bytenr < be->bytenr)
101 p = &(*p)->rb_right;
102 else
103 return entry;
104 }
105
106 rb_link_node(&be->node, parent_node, p);
107 rb_insert_color(&be->node, root);
108 return NULL;
109}
110
111static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
112{
113 struct rb_node *n;
114 struct block_entry *entry = NULL;
115
116 n = root->rb_node;
117 while (n) {
118 entry = rb_entry(n, struct block_entry, node);
119 if (entry->bytenr < bytenr)
120 n = n->rb_right;
121 else if (entry->bytenr > bytenr)
122 n = n->rb_left;
123 else
124 return entry;
125 }
126 return NULL;
127}
128
129static struct root_entry *insert_root_entry(struct rb_root *root,
130 struct root_entry *re)
131{
132 struct rb_node **p = &root->rb_node;
133 struct rb_node *parent_node = NULL;
134 struct root_entry *entry;
135
136 while (*p) {
137 parent_node = *p;
138 entry = rb_entry(parent_node, struct root_entry, node);
139 if (entry->root_objectid > re->root_objectid)
140 p = &(*p)->rb_left;
141 else if (entry->root_objectid < re->root_objectid)
142 p = &(*p)->rb_right;
143 else
144 return entry;
145 }
146
147 rb_link_node(&re->node, parent_node, p);
148 rb_insert_color(&re->node, root);
149 return NULL;
150
151}
152
153static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
154{
155 if (ref1->root_objectid < ref2->root_objectid)
156 return -1;
157 if (ref1->root_objectid > ref2->root_objectid)
158 return 1;
159 if (ref1->parent < ref2->parent)
160 return -1;
161 if (ref1->parent > ref2->parent)
162 return 1;
163 if (ref1->owner < ref2->owner)
164 return -1;
165 if (ref1->owner > ref2->owner)
166 return 1;
167 if (ref1->offset < ref2->offset)
168 return -1;
169 if (ref1->offset > ref2->offset)
170 return 1;
171 return 0;
172}
173
174static struct ref_entry *insert_ref_entry(struct rb_root *root,
175 struct ref_entry *ref)
176{
177 struct rb_node **p = &root->rb_node;
178 struct rb_node *parent_node = NULL;
179 struct ref_entry *entry;
180 int cmp;
181
182 while (*p) {
183 parent_node = *p;
184 entry = rb_entry(parent_node, struct ref_entry, node);
185 cmp = comp_refs(entry, ref);
186 if (cmp > 0)
187 p = &(*p)->rb_left;
188 else if (cmp < 0)
189 p = &(*p)->rb_right;
190 else
191 return entry;
192 }
193
194 rb_link_node(&ref->node, parent_node, p);
195 rb_insert_color(&ref->node, root);
196 return NULL;
197
198}
199
200static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
201{
202 struct rb_node *n;
203 struct root_entry *entry = NULL;
204
205 n = root->rb_node;
206 while (n) {
207 entry = rb_entry(n, struct root_entry, node);
208 if (entry->root_objectid < objectid)
209 n = n->rb_right;
210 else if (entry->root_objectid > objectid)
211 n = n->rb_left;
212 else
213 return entry;
214 }
215 return NULL;
216}
217
218#ifdef CONFIG_STACKTRACE
219static void __save_stack_trace(struct ref_action *ra)
220{
221 struct stack_trace stack_trace;
222
223 stack_trace.max_entries = MAX_TRACE;
224 stack_trace.nr_entries = 0;
225 stack_trace.entries = ra->trace;
226 stack_trace.skip = 2;
227 save_stack_trace(&stack_trace);
228 ra->trace_len = stack_trace.nr_entries;
229}
230
231static void __print_stack_trace(struct btrfs_fs_info *fs_info,
232 struct ref_action *ra)
233{
234 struct stack_trace trace;
235
236 if (ra->trace_len == 0) {
237 btrfs_err(fs_info, " ref-verify: no stacktrace");
238 return;
239 }
240 trace.nr_entries = ra->trace_len;
241 trace.entries = ra->trace;
242 print_stack_trace(&trace, 2);
243}
244#else
245static inline void __save_stack_trace(struct ref_action *ra)
246{
247}
248
249static inline void __print_stack_trace(struct btrfs_fs_info *fs_info,
250 struct ref_action *ra)
251{
252 btrfs_err(fs_info, " ref-verify: no stacktrace support");
253}
254#endif
255
256static void free_block_entry(struct block_entry *be)
257{
258 struct root_entry *re;
259 struct ref_entry *ref;
260 struct ref_action *ra;
261 struct rb_node *n;
262
263 while ((n = rb_first(&be->roots))) {
264 re = rb_entry(n, struct root_entry, node);
265 rb_erase(&re->node, &be->roots);
266 kfree(re);
267 }
268
269	while ((n = rb_first(&be->refs))) {
270 ref = rb_entry(n, struct ref_entry, node);
271 rb_erase(&ref->node, &be->refs);
272 kfree(ref);
273 }
274
275 while (!list_empty(&be->actions)) {
276 ra = list_first_entry(&be->actions, struct ref_action,
277 list);
278 list_del(&ra->list);
279 kfree(ra);
280 }
281 kfree(be);
282}
283
284static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
285 u64 bytenr, u64 len,
286 u64 root_objectid)
287{
288 struct block_entry *be = NULL, *exist;
289 struct root_entry *re = NULL;
290
291 re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
292 be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
293 if (!be || !re) {
294 kfree(re);
295 kfree(be);
296 return ERR_PTR(-ENOMEM);
297 }
298 be->bytenr = bytenr;
299 be->len = len;
300
301 re->root_objectid = root_objectid;
302 re->num_refs = 0;
303
304 spin_lock(&fs_info->ref_verify_lock);
305 exist = insert_block_entry(&fs_info->block_tree, be);
306 if (exist) {
307 if (root_objectid) {
308 struct root_entry *exist_re;
309
310 exist_re = insert_root_entry(&exist->roots, re);
311 if (exist_re)
312 kfree(re);
313 }
314 kfree(be);
315 return exist;
316 }
317
318 be->num_refs = 0;
319 be->metadata = 0;
320 be->from_disk = 0;
321 be->roots = RB_ROOT;
322 be->refs = RB_ROOT;
323 INIT_LIST_HEAD(&be->actions);
324 if (root_objectid)
325 insert_root_entry(&be->roots, re);
326 else
327 kfree(re);
328 return be;
329}
330
331static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
332 u64 parent, u64 bytenr, int level)
333{
334 struct block_entry *be;
335 struct root_entry *re;
336 struct ref_entry *ref = NULL, *exist;
337
338 ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
339 if (!ref)
340 return -ENOMEM;
341
342 if (parent)
343 ref->root_objectid = 0;
344 else
345 ref->root_objectid = ref_root;
346 ref->parent = parent;
347 ref->owner = level;
348 ref->offset = 0;
349 ref->num_refs = 1;
350
351 be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root);
352 if (IS_ERR(be)) {
353 kfree(ref);
354 return PTR_ERR(be);
355 }
356 be->num_refs++;
357 be->from_disk = 1;
358 be->metadata = 1;
359
360 if (!parent) {
361 ASSERT(ref_root);
362 re = lookup_root_entry(&be->roots, ref_root);
363 ASSERT(re);
364 re->num_refs++;
365 }
366 exist = insert_ref_entry(&be->refs, ref);
367 if (exist) {
368 exist->num_refs++;
369 kfree(ref);
370 }
371 spin_unlock(&fs_info->ref_verify_lock);
372
373 return 0;
374}
375
376static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
377 u64 parent, u32 num_refs, u64 bytenr,
378 u64 num_bytes)
379{
380 struct block_entry *be;
381 struct ref_entry *ref;
382
383 ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
384 if (!ref)
385 return -ENOMEM;
386 be = add_block_entry(fs_info, bytenr, num_bytes, 0);
387 if (IS_ERR(be)) {
388 kfree(ref);
389 return PTR_ERR(be);
390 }
391 be->num_refs += num_refs;
392
393 ref->parent = parent;
394 ref->num_refs = num_refs;
395 if (insert_ref_entry(&be->refs, ref)) {
396 spin_unlock(&fs_info->ref_verify_lock);
397 btrfs_err(fs_info, "existing shared ref when reading from disk?");
398 kfree(ref);
399 return -EINVAL;
400 }
401 spin_unlock(&fs_info->ref_verify_lock);
402 return 0;
403}
404
405static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
406 struct extent_buffer *leaf,
407 struct btrfs_extent_data_ref *dref,
408 u64 bytenr, u64 num_bytes)
409{
410 struct block_entry *be;
411 struct ref_entry *ref;
412 struct root_entry *re;
413 u64 ref_root = btrfs_extent_data_ref_root(leaf, dref);
414 u64 owner = btrfs_extent_data_ref_objectid(leaf, dref);
415 u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
416 u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
417
418 ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
419 if (!ref)
420 return -ENOMEM;
421 be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
422 if (IS_ERR(be)) {
423 kfree(ref);
424 return PTR_ERR(be);
425 }
426 be->num_refs += num_refs;
427
428 ref->parent = 0;
429 ref->owner = owner;
430 ref->root_objectid = ref_root;
431 ref->offset = offset;
432 ref->num_refs = num_refs;
433 if (insert_ref_entry(&be->refs, ref)) {
434 spin_unlock(&fs_info->ref_verify_lock);
435 btrfs_err(fs_info, "existing ref when reading from disk?");
436 kfree(ref);
437 return -EINVAL;
438 }
439
440 re = lookup_root_entry(&be->roots, ref_root);
441 if (!re) {
442 spin_unlock(&fs_info->ref_verify_lock);
443 btrfs_err(fs_info, "missing root in new block entry?");
444 return -EINVAL;
445 }
446 re->num_refs += num_refs;
447 spin_unlock(&fs_info->ref_verify_lock);
448 return 0;
449}
450
451static int process_extent_item(struct btrfs_fs_info *fs_info,
452 struct btrfs_path *path, struct btrfs_key *key,
453 int slot, int *tree_block_level)
454{
455 struct btrfs_extent_item *ei;
456 struct btrfs_extent_inline_ref *iref;
457 struct btrfs_extent_data_ref *dref;
458 struct btrfs_shared_data_ref *sref;
459 struct extent_buffer *leaf = path->nodes[0];
460 u32 item_size = btrfs_item_size_nr(leaf, slot);
461 unsigned long end, ptr;
462 u64 offset, flags, count;
463	int type, ret = 0;
464
465 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
466 flags = btrfs_extent_flags(leaf, ei);
467
468 if ((key->type == BTRFS_EXTENT_ITEM_KEY) &&
469 flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
470 struct btrfs_tree_block_info *info;
471
472 info = (struct btrfs_tree_block_info *)(ei + 1);
473 *tree_block_level = btrfs_tree_block_level(leaf, info);
474 iref = (struct btrfs_extent_inline_ref *)(info + 1);
475 } else {
476 if (key->type == BTRFS_METADATA_ITEM_KEY)
477 *tree_block_level = key->offset;
478 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
479 }
480
481 ptr = (unsigned long)iref;
482 end = (unsigned long)ei + item_size;
483 while (ptr < end) {
484 iref = (struct btrfs_extent_inline_ref *)ptr;
485 type = btrfs_extent_inline_ref_type(leaf, iref);
486 offset = btrfs_extent_inline_ref_offset(leaf, iref);
487 switch (type) {
488 case BTRFS_TREE_BLOCK_REF_KEY:
489 ret = add_tree_block(fs_info, offset, 0, key->objectid,
490 *tree_block_level);
491 break;
492 case BTRFS_SHARED_BLOCK_REF_KEY:
493 ret = add_tree_block(fs_info, 0, offset, key->objectid,
494 *tree_block_level);
495 break;
496 case BTRFS_EXTENT_DATA_REF_KEY:
497 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
498 ret = add_extent_data_ref(fs_info, leaf, dref,
499 key->objectid, key->offset);
500 break;
501 case BTRFS_SHARED_DATA_REF_KEY:
502 sref = (struct btrfs_shared_data_ref *)(iref + 1);
503 count = btrfs_shared_data_ref_count(leaf, sref);
504 ret = add_shared_data_ref(fs_info, offset, count,
505 key->objectid, key->offset);
506 break;
507 default:
508 btrfs_err(fs_info, "invalid key type in iref");
509 ret = -EINVAL;
510 break;
511 }
512 if (ret)
513 break;
514 ptr += btrfs_extent_inline_ref_size(type);
515 }
516 return ret;
517}
518
519static int process_leaf(struct btrfs_root *root,
520 struct btrfs_path *path, u64 *bytenr, u64 *num_bytes)
521{
522 struct btrfs_fs_info *fs_info = root->fs_info;
523 struct extent_buffer *leaf = path->nodes[0];
524 struct btrfs_extent_data_ref *dref;
525 struct btrfs_shared_data_ref *sref;
526 u32 count;
527	int i = 0, tree_block_level = 0, ret = 0;
528 struct btrfs_key key;
529 int nritems = btrfs_header_nritems(leaf);
530
531 for (i = 0; i < nritems; i++) {
532 btrfs_item_key_to_cpu(leaf, &key, i);
533 switch (key.type) {
534 case BTRFS_EXTENT_ITEM_KEY:
535 *num_bytes = key.offset;
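			/* fall through */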
536 case BTRFS_METADATA_ITEM_KEY:
537 *bytenr = key.objectid;
538 ret = process_extent_item(fs_info, path, &key, i,
539 &tree_block_level);
540 break;
541 case BTRFS_TREE_BLOCK_REF_KEY:
542 ret = add_tree_block(fs_info, key.offset, 0,
543 key.objectid, tree_block_level);
544 break;
545 case BTRFS_SHARED_BLOCK_REF_KEY:
546 ret = add_tree_block(fs_info, 0, key.offset,
547 key.objectid, tree_block_level);
548 break;
549 case BTRFS_EXTENT_DATA_REF_KEY:
550 dref = btrfs_item_ptr(leaf, i,
551 struct btrfs_extent_data_ref);
552 ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr,
553 *num_bytes);
554 break;
555 case BTRFS_SHARED_DATA_REF_KEY:
556 sref = btrfs_item_ptr(leaf, i,
557 struct btrfs_shared_data_ref);
558 count = btrfs_shared_data_ref_count(leaf, sref);
559 ret = add_shared_data_ref(fs_info, key.offset, count,
560 *bytenr, *num_bytes);
561 break;
562 default:
563 break;
564 }
565 if (ret)
566 break;
567 }
568 return ret;
569}
570
571/* Walk down to the leaf from the given level */
572static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
573 int level, u64 *bytenr, u64 *num_bytes)
574{
575 struct btrfs_fs_info *fs_info = root->fs_info;
576 struct extent_buffer *eb;
577 u64 block_bytenr, gen;
578 int ret = 0;
579
580 while (level >= 0) {
581 if (level) {
582 block_bytenr = btrfs_node_blockptr(path->nodes[level],
583 path->slots[level]);
584 gen = btrfs_node_ptr_generation(path->nodes[level],
585 path->slots[level]);
586 eb = read_tree_block(fs_info, block_bytenr, gen);
587 if (IS_ERR(eb))
588 return PTR_ERR(eb);
589 if (!extent_buffer_uptodate(eb)) {
590 free_extent_buffer(eb);
591 return -EIO;
592 }
593 btrfs_tree_read_lock(eb);
594 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
595 path->nodes[level-1] = eb;
596 path->slots[level-1] = 0;
597 path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
598 } else {
599 ret = process_leaf(root, path, bytenr, num_bytes);
600 if (ret)
601 break;
602 }
603 level--;
604 }
605 return ret;
606}
607
608/* Walk up to the next node that needs to be processed */
609static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
610 int *level)
611{
612 int l;
613
614 for (l = 0; l < BTRFS_MAX_LEVEL; l++) {
615 if (!path->nodes[l])
616 continue;
617 if (l) {
618 path->slots[l]++;
619 if (path->slots[l] <
620 btrfs_header_nritems(path->nodes[l])) {
621 *level = l;
622 return 0;
623 }
624 }
625 btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]);
626 free_extent_buffer(path->nodes[l]);
627 path->nodes[l] = NULL;
628 path->slots[l] = 0;
629 path->locks[l] = 0;
630 }
631
632 return 1;
633}
634
635static void dump_ref_action(struct btrfs_fs_info *fs_info,
636 struct ref_action *ra)
637{
638 btrfs_err(fs_info,
639" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
640 ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent,
641 ra->ref.owner, ra->ref.offset, ra->ref.num_refs);
642 __print_stack_trace(fs_info, ra);
643}
644
645/*
646 * Dumps all the information from the block entry to printk; it's going to be
647 * awesome.
648 */
649static void dump_block_entry(struct btrfs_fs_info *fs_info,
650 struct block_entry *be)
651{
652 struct ref_entry *ref;
653 struct root_entry *re;
654 struct ref_action *ra;
655 struct rb_node *n;
656
657 btrfs_err(fs_info,
658"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d",
659 be->bytenr, be->len, be->num_refs, be->metadata,
660 be->from_disk);
661
662 for (n = rb_first(&be->refs); n; n = rb_next(n)) {
663 ref = rb_entry(n, struct ref_entry, node);
664 btrfs_err(fs_info,
665" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
666 ref->root_objectid, ref->parent, ref->owner,
667 ref->offset, ref->num_refs);
668 }
669
670 for (n = rb_first(&be->roots); n; n = rb_next(n)) {
671 re = rb_entry(n, struct root_entry, node);
672 btrfs_err(fs_info, " root entry %llu, num_refs %llu",
673 re->root_objectid, re->num_refs);
674 }
675
676 list_for_each_entry(ra, &be->actions, list)
677 dump_ref_action(fs_info, ra);
678}
679
680/*
681 * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
682 * @root: the root we are making this modification from.
683 * @bytenr: the bytenr we are modifying.
684 * @num_bytes: number of bytes.
685 * @parent: the parent bytenr.
686 * @ref_root: the original root owner of the bytenr.
687 * @owner: level in the case of metadata, inode in the case of data.
688 * @offset: 0 for metadata, file offset for data.
689 * @action: the action that we are doing, this is the same as the delayed ref
690 * action.
691 *
692 * This will add an action item to the given bytenr and do sanity checks to make
693 * sure we haven't messed something up. If we are making a new allocation and
694 * this block entry has history, we will delete all previous actions as long
695 * as our sanity checks pass, since they are no longer needed.
696 */
697int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
698 u64 parent, u64 ref_root, u64 owner, u64 offset,
699 int action)
700{
701 struct btrfs_fs_info *fs_info = root->fs_info;
702 struct ref_entry *ref = NULL, *exist;
703 struct ref_action *ra = NULL;
704 struct block_entry *be = NULL;
705 struct root_entry *re = NULL;
706 int ret = 0;
707 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
708
709 if (!btrfs_test_opt(root->fs_info, REF_VERIFY))
710 return 0;
711
712 ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
713 ra = kmalloc(sizeof(struct ref_action), GFP_NOFS);
714 if (!ra || !ref) {
715 kfree(ref);
716 kfree(ra);
717 ret = -ENOMEM;
718 goto out;
719 }
720
721 if (parent) {
722 ref->parent = parent;
723 } else {
724 ref->root_objectid = ref_root;
725 ref->owner = owner;
726 ref->offset = offset;
727 }
728 ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;
729
730 memcpy(&ra->ref, ref, sizeof(struct ref_entry));
731 /*
732 * Save the extra info from the delayed ref in the ref action to make it
733	 * easier to figure out what is happening. The real refs we add to the
734 * ref tree need to reflect what we save on disk so it matches any
735 * on-disk refs we pre-loaded.
736 */
737 ra->ref.owner = owner;
738 ra->ref.offset = offset;
739 ra->ref.root_objectid = ref_root;
740 __save_stack_trace(ra);
741
742 INIT_LIST_HEAD(&ra->list);
743 ra->action = action;
744 ra->root = root->objectid;
745
746 /*
747 * This is an allocation, preallocate the block_entry in case we haven't
748 * used it before.
749 */
750 ret = -EINVAL;
751 if (action == BTRFS_ADD_DELAYED_EXTENT) {
752 /*
753 * For subvol_create we'll just pass in whatever the parent root
754 * is and the new root objectid, so let's not treat the passed
755 * in root as if it really has a ref for this bytenr.
756 */
757 be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root);
758 if (IS_ERR(be)) {
759 kfree(ra);
760 ret = PTR_ERR(be);
761 goto out;
762 }
763 be->num_refs++;
764 if (metadata)
765 be->metadata = 1;
766
767 if (be->num_refs != 1) {
768 btrfs_err(fs_info,
769 "re-allocated a block that still has references to it!");
770 dump_block_entry(fs_info, be);
771 dump_ref_action(fs_info, ra);
772 goto out_unlock;
773 }
774
775 while (!list_empty(&be->actions)) {
776 struct ref_action *tmp;
777
778 tmp = list_first_entry(&be->actions, struct ref_action,
779 list);
780 list_del(&tmp->list);
781 kfree(tmp);
782 }
783 } else {
784 struct root_entry *tmp;
785
786 if (!parent) {
787 re = kmalloc(sizeof(struct root_entry), GFP_NOFS);
788 if (!re) {
789 kfree(ref);
790 kfree(ra);
791 ret = -ENOMEM;
792 goto out;
793 }
794 /*
795 * This is the root that is modifying us, so it's the
796 * one we want to lookup below when we modify the
797 * re->num_refs.
798 */
799 ref_root = root->objectid;
800 re->root_objectid = root->objectid;
801 re->num_refs = 0;
802 }
803
804 spin_lock(&root->fs_info->ref_verify_lock);
805 be = lookup_block_entry(&root->fs_info->block_tree, bytenr);
806 if (!be) {
807 btrfs_err(fs_info,
808"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
809 action, (unsigned long long)bytenr,
810 (unsigned long long)num_bytes);
811 dump_ref_action(fs_info, ra);
812 kfree(ref);
813 kfree(ra);
814 goto out_unlock;
815 }
816
817 if (!parent) {
818 tmp = insert_root_entry(&be->roots, re);
819 if (tmp) {
820 kfree(re);
821 re = tmp;
822 }
823 }
824 }
825
826 exist = insert_ref_entry(&be->refs, ref);
827 if (exist) {
828 if (action == BTRFS_DROP_DELAYED_REF) {
829 if (exist->num_refs == 0) {
830 btrfs_err(fs_info,
831"dropping a ref for a existing root that doesn't have a ref on the block");
832 dump_block_entry(fs_info, be);
833 dump_ref_action(fs_info, ra);
834 kfree(ra);
835 goto out_unlock;
836 }
837 exist->num_refs--;
838 if (exist->num_refs == 0) {
839 rb_erase(&exist->node, &be->refs);
840 kfree(exist);
841 }
842 } else if (!be->metadata) {
843 exist->num_refs++;
844 } else {
845 btrfs_err(fs_info,
846"attempting to add another ref for an existing ref on a tree block");
847 dump_block_entry(fs_info, be);
848 dump_ref_action(fs_info, ra);
849 kfree(ra);
850 goto out_unlock;
851 }
852 kfree(ref);
853 } else {
854 if (action == BTRFS_DROP_DELAYED_REF) {
855 btrfs_err(fs_info,
856"dropping a ref for a root that doesn't have a ref on the block");
857 dump_block_entry(fs_info, be);
858 dump_ref_action(fs_info, ra);
859 kfree(ra);
860 goto out_unlock;
861 }
862 }
863
864 if (!parent && !re) {
865 re = lookup_root_entry(&be->roots, ref_root);
866 if (!re) {
867 /*
868 * This shouldn't happen because we will add our re
869			 * above when we look up the be with !parent, but just
870			 * in case, catch it here so we don't panic because I
871			 * didn't think of some other corner case.
872 */
873 btrfs_err(fs_info, "failed to find root %llu for %llu",
874 root->objectid, be->bytenr);
875 dump_block_entry(fs_info, be);
876 dump_ref_action(fs_info, ra);
877 kfree(ra);
878 goto out_unlock;
879 }
880 }
881 if (action == BTRFS_DROP_DELAYED_REF) {
882 if (re)
883 re->num_refs--;
884 be->num_refs--;
885 } else if (action == BTRFS_ADD_DELAYED_REF) {
886 be->num_refs++;
887 if (re)
888 re->num_refs++;
889 }
890 list_add_tail(&ra->list, &be->actions);
891 ret = 0;
892out_unlock:
893 spin_unlock(&root->fs_info->ref_verify_lock);
894out:
895 if (ret)
896 btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
897 return ret;
898}
899
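A hypothetical call site, to show how the hook is meant to be used (the real callers sit in extent-tree.c, next to where delayed refs are queued; everything is gated on the REF_VERIFY mount option, so ordinary mounts pay nothing):

	/*
	 * Hypothetical caller: record the ref change with the same action
	 * code the delayed ref will carry, before actually queueing it.
	 */
	static int example_add_ref(struct btrfs_root *root, u64 bytenr,
				   u64 num_bytes, u64 parent, u64 ref_root,
				   u64 owner, u64 offset)
	{
		int ret;

		ret = btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
					 ref_root, owner, offset,
					 BTRFS_ADD_DELAYED_REF);
		if (ret)
			return ret;
		/* ... then queue the real delayed ref as usual ... */
		return 0;
	}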
900/* Free up the ref cache */
901void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
902{
903 struct block_entry *be;
904 struct rb_node *n;
905
906 if (!btrfs_test_opt(fs_info, REF_VERIFY))
907 return;
908
909 spin_lock(&fs_info->ref_verify_lock);
910 while ((n = rb_first(&fs_info->block_tree))) {
911 be = rb_entry(n, struct block_entry, node);
912 rb_erase(&be->node, &fs_info->block_tree);
913 free_block_entry(be);
914 cond_resched_lock(&fs_info->ref_verify_lock);
915 }
916 spin_unlock(&fs_info->ref_verify_lock);
917}
918
919void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
920 u64 len)
921{
922 struct block_entry *be = NULL, *entry;
923 struct rb_node *n;
924
925 if (!btrfs_test_opt(fs_info, REF_VERIFY))
926 return;
927
928 spin_lock(&fs_info->ref_verify_lock);
929 n = fs_info->block_tree.rb_node;
930 while (n) {
931 entry = rb_entry(n, struct block_entry, node);
932 if (entry->bytenr < start) {
933 n = n->rb_right;
934 } else if (entry->bytenr > start) {
935 n = n->rb_left;
936 } else {
937 be = entry;
938 break;
939 }
940 /* We want to get as close to start as possible */
941 if (be == NULL ||
942 (entry->bytenr < start && be->bytenr > start) ||
943 (entry->bytenr < start && entry->bytenr > be->bytenr))
944 be = entry;
945 }
946
947 /*
948	 * We could have an empty block group; maybe add a check for this case
949	 * to verify we were actually empty?
950 */
951 if (!be) {
952 spin_unlock(&fs_info->ref_verify_lock);
953 return;
954 }
955
956 n = &be->node;
957 while (n) {
958 be = rb_entry(n, struct block_entry, node);
959 n = rb_next(n);
960 if (be->bytenr < start && be->bytenr + be->len > start) {
961 btrfs_err(fs_info,
962 "block entry overlaps a block group [%llu,%llu]!",
963 start, len);
964 dump_block_entry(fs_info, be);
965 continue;
966 }
967 if (be->bytenr < start)
968 continue;
969 if (be->bytenr >= start + len)
970 break;
971 if (be->bytenr + be->len > start + len) {
972 btrfs_err(fs_info,
973 "block entry overlaps a block group [%llu,%llu]!",
974 start, len);
975 dump_block_entry(fs_info, be);
976 }
977 rb_erase(&be->node, &fs_info->block_tree);
978 free_block_entry(be);
979 }
980 spin_unlock(&fs_info->ref_verify_lock);
981}
982
983/* Walk down all roots and build the ref tree, meant to be called at mount */
984int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
985{
986 struct btrfs_path *path;
987	struct btrfs_root *root = fs_info->extent_root;
988 struct extent_buffer *eb;
989 u64 bytenr = 0, num_bytes = 0;
990 int ret, level;
991
992 if (!btrfs_test_opt(fs_info, REF_VERIFY))
993 return 0;
994
995 path = btrfs_alloc_path();
996 if (!path)
997 return -ENOMEM;
998
999 eb = btrfs_read_lock_root_node(fs_info->extent_root);
1000 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1001 level = btrfs_header_level(eb);
1002 path->nodes[level] = eb;
1003 path->slots[level] = 0;
1004 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
1005
1006 while (1) {
1007 /*
1008 * We have to keep track of the bytenr/num_bytes we last hit
1009 * because we could have run out of space for an inline ref, and
1010 * would have had to add a ref key item which may appear on a
1011 * different leaf from the original extent item.
1012 */
1013 ret = walk_down_tree(fs_info->extent_root, path, level,
1014 &bytenr, &num_bytes);
1015 if (ret)
1016 break;
1017 ret = walk_up_tree(root, path, &level);
1018 if (ret < 0)
1019 break;
1020 if (ret > 0) {
1021 ret = 0;
1022 break;
1023 }
1024 }
1025 if (ret) {
1026 btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
1027 btrfs_free_ref_cache(fs_info);
1028 }
1029 btrfs_free_path(path);
1030 return ret;
1031}
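
btrfs_free_ref_tree_range() above biases the rb-tree search toward the entry
nearest to start, then walks forward with rb_next(), dropping every block
entry inside [start, start + len) and complaining about entries that straddle
either boundary. A minimal userspace sketch of that walk over a sorted array;
the names here are illustrative, not btrfs APIs:

#include <stdio.h>

struct entry { unsigned long long bytenr, len; int live; };

/* Drop all entries inside [start, start + len); warn about stragglers. */
static void free_range(struct entry *e, int n,
                       unsigned long long start, unsigned long long len)
{
        for (int i = 0; i < n; i++) {
                struct entry *be = &e[i];

                if (be->bytenr < start && be->bytenr + be->len > start) {
                        printf("[%llu,%llu] overlaps the range start\n",
                               be->bytenr, be->len);
                        continue;       /* reported but kept, as in the kernel */
                }
                if (be->bytenr < start)
                        continue;       /* entirely before the range */
                if (be->bytenr >= start + len)
                        break;          /* sorted order: nothing further */
                if (be->bytenr + be->len > start + len)
                        printf("[%llu,%llu] overlaps the range end\n",
                               be->bytenr, be->len);
                be->live = 0;           /* analog of rb_erase() + free */
        }
}

int main(void)
{
        struct entry e[] = {
                { 8, 36, 1 }, { 40, 8, 1 }, { 48, 16, 1 }, { 80, 8, 1 },
        };

        free_range(e, 4, 40, 16);       /* clears [40, 56) */
        for (int i = 0; i < 4; i++)
                if (e[i].live)
                        printf("kept [%llu,%llu]\n", e[i].bytenr, e[i].len);
        return 0;
}

As in the kernel function, an entry that merely straddles the start of the
range is only reported, never erased.
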
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
new file mode 100644
index 000000000000..3bf02ce0e1e2
--- /dev/null
+++ b/fs/btrfs/ref-verify.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2014 Facebook. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REF_VERIFY__
19#define __REF_VERIFY__
20
21#ifdef CONFIG_BTRFS_FS_REF_VERIFY
22int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
23void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
24int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
25 u64 parent, u64 ref_root, u64 owner, u64 offset,
26 int action);
27void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
28 u64 len);
29
30static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
31{
32 spin_lock_init(&fs_info->ref_verify_lock);
33 fs_info->block_tree = RB_ROOT;
34}
35#else
36static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
37{
38 return 0;
39}
40
41static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
42{
43}
44
45static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
46 u64 num_bytes, u64 parent, u64 ref_root,
47 u64 owner, u64 offset, int action)
48{
49 return 0;
50}
51
52static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info,
53 u64 start, u64 len)
54{
55}
56
57static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
58{
59}
60
61#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
62#endif /* __REF_VERIFY__ */
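
The header uses the usual config-stub idiom: with CONFIG_BTRFS_FS_REF_VERIFY
enabled the real functions are declared, otherwise static inline no-ops take
their place so call sites never need an #ifdef of their own. A tiny compilable
sketch of the same idiom (the feature name is made up for illustration):

#include <stdio.h>

/* #define CONFIG_FEATURE_X 1 */       /* flip to enable the feature */

#ifdef CONFIG_FEATURE_X
static int feature_x_track(int value)
{
        printf("tracking %d\n", value);
        return 0;
}
#else
/* The stub compiles away entirely; callers stay free of #ifdefs. */
static inline int feature_x_track(int value)
{
        return 0;
}
#endif

int main(void)
{
        return feature_x_track(42);     /* same call site either way */
}
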
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9841faef08ea..4cf2eb67eba6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1742,7 +1742,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1742 dirty = 1; 1742 dirty = 1;
1743 1743
1744 key.offset -= btrfs_file_extent_offset(leaf, fi); 1744 key.offset -= btrfs_file_extent_offset(leaf, fi);
1745 ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, 1745 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1746 num_bytes, parent, 1746 num_bytes, parent,
1747 btrfs_header_owner(leaf), 1747 btrfs_header_owner(leaf),
1748 key.objectid, key.offset); 1748 key.objectid, key.offset);
@@ -1751,7 +1751,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1751 break; 1751 break;
1752 } 1752 }
1753 1753
1754 ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 1754 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1755 parent, btrfs_header_owner(leaf), 1755 parent, btrfs_header_owner(leaf),
1756 key.objectid, key.offset); 1756 key.objectid, key.offset);
1757 if (ret) { 1757 if (ret) {
@@ -1952,21 +1952,21 @@ again:
1952 path->slots[level], old_ptr_gen); 1952 path->slots[level], old_ptr_gen);
1953 btrfs_mark_buffer_dirty(path->nodes[level]); 1953 btrfs_mark_buffer_dirty(path->nodes[level]);
1954 1954
1955 ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, 1955 ret = btrfs_inc_extent_ref(trans, src, old_bytenr,
1956 blocksize, path->nodes[level]->start, 1956 blocksize, path->nodes[level]->start,
1957 src->root_key.objectid, level - 1, 0); 1957 src->root_key.objectid, level - 1, 0);
1958 BUG_ON(ret); 1958 BUG_ON(ret);
1959 ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, 1959 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr,
1960 blocksize, 0, dest->root_key.objectid, 1960 blocksize, 0, dest->root_key.objectid,
1961 level - 1, 0); 1961 level - 1, 0);
1962 BUG_ON(ret); 1962 BUG_ON(ret);
1963 1963
1964 ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, 1964 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1965 path->nodes[level]->start, 1965 path->nodes[level]->start,
1966 src->root_key.objectid, level - 1, 0); 1966 src->root_key.objectid, level - 1, 0);
1967 BUG_ON(ret); 1967 BUG_ON(ret);
1968 1968
1969 ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, 1969 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1970 0, dest->root_key.objectid, level - 1, 1970 0, dest->root_key.objectid, level - 1,
1971 0); 1971 0);
1972 BUG_ON(ret); 1972 BUG_ON(ret);
@@ -2808,7 +2808,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2808 trans->transid); 2808 trans->transid);
2809 btrfs_mark_buffer_dirty(upper->eb); 2809 btrfs_mark_buffer_dirty(upper->eb);
2810 2810
2811 ret = btrfs_inc_extent_ref(trans, root->fs_info, 2811 ret = btrfs_inc_extent_ref(trans, root,
2812 node->eb->start, blocksize, 2812 node->eb->start, blocksize,
2813 upper->eb->start, 2813 upper->eb->start,
2814 btrfs_header_owner(upper->eb), 2814 btrfs_header_owner(upper->eb),
@@ -3246,6 +3246,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
3246 put_page(page); 3246 put_page(page);
3247 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3247 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3248 PAGE_SIZE); 3248 PAGE_SIZE);
3249 btrfs_delalloc_release_extents(BTRFS_I(inode),
3250 PAGE_SIZE);
3249 ret = -EIO; 3251 ret = -EIO;
3250 goto out; 3252 goto out;
3251 } 3253 }
@@ -3275,6 +3277,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
3275 put_page(page); 3277 put_page(page);
3276 3278
3277 index++; 3279 index++;
3280 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
3278 balance_dirty_pages_ratelimited(inode->i_mapping); 3281 balance_dirty_pages_ratelimited(inode->i_mapping);
3279 btrfs_throttle(fs_info); 3282 btrfs_throttle(fs_info);
3280 } 3283 }
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 95bcc3cce78f..3338407ef0f0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -226,10 +226,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
226 struct btrfs_root *root; 226 struct btrfs_root *root;
227 int err = 0; 227 int err = 0;
228 int ret; 228 int ret;
229 bool can_recover = true;
230
231 if (sb_rdonly(fs_info->sb))
232 can_recover = false;
233 229
234 path = btrfs_alloc_path(); 230 path = btrfs_alloc_path();
235 if (!path) 231 if (!path)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e3f6c49e5c4d..b2f871d80982 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -231,7 +231,7 @@ struct scrub_warning {
231 struct btrfs_path *path; 231 struct btrfs_path *path;
232 u64 extent_item_size; 232 u64 extent_item_size;
233 const char *errstr; 233 const char *errstr;
234 sector_t sector; 234 u64 physical;
235 u64 logical; 235 u64 logical;
236 struct btrfs_device *dev; 236 struct btrfs_device *dev;
237}; 237};
@@ -797,10 +797,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
797 */ 797 */
798 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 798 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
799 btrfs_warn_in_rcu(fs_info, 799 btrfs_warn_in_rcu(fs_info,
800 "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", 800"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
801 swarn->errstr, swarn->logical, 801 swarn->errstr, swarn->logical,
802 rcu_str_deref(swarn->dev->name), 802 rcu_str_deref(swarn->dev->name),
803 (unsigned long long)swarn->sector, 803 swarn->physical,
804 root, inum, offset, 804 root, inum, offset,
805 min(isize - offset, (u64)PAGE_SIZE), nlink, 805 min(isize - offset, (u64)PAGE_SIZE), nlink,
806 (char *)(unsigned long)ipath->fspath->val[i]); 806 (char *)(unsigned long)ipath->fspath->val[i]);
@@ -810,10 +810,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
810 810
811err: 811err:
812 btrfs_warn_in_rcu(fs_info, 812 btrfs_warn_in_rcu(fs_info,
813 "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", 813 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
814 swarn->errstr, swarn->logical, 814 swarn->errstr, swarn->logical,
815 rcu_str_deref(swarn->dev->name), 815 rcu_str_deref(swarn->dev->name),
816 (unsigned long long)swarn->sector, 816 swarn->physical,
817 root, inum, offset, ret); 817 root, inum, offset, ret);
818 818
819 free_ipath(ipath); 819 free_ipath(ipath);
@@ -845,7 +845,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
845 if (!path) 845 if (!path)
846 return; 846 return;
847 847
848 swarn.sector = (sblock->pagev[0]->physical) >> 9; 848 swarn.physical = sblock->pagev[0]->physical;
849 swarn.logical = sblock->pagev[0]->logical; 849 swarn.logical = sblock->pagev[0]->logical;
850 swarn.errstr = errstr; 850 swarn.errstr = errstr;
851 swarn.dev = NULL; 851 swarn.dev = NULL;
@@ -868,10 +868,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
868 item_size, &ref_root, 868 item_size, &ref_root,
869 &ref_level); 869 &ref_level);
870 btrfs_warn_in_rcu(fs_info, 870 btrfs_warn_in_rcu(fs_info,
871 "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu", 871"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
872 errstr, swarn.logical, 872 errstr, swarn.logical,
873 rcu_str_deref(dev->name), 873 rcu_str_deref(dev->name),
874 (unsigned long long)swarn.sector, 874 swarn.physical,
875 ref_level ? "node" : "leaf", 875 ref_level ? "node" : "leaf",
876 ret < 0 ? -1 : ref_level, 876 ret < 0 ? -1 : ref_level,
877 ret < 0 ? -1 : ref_root); 877 ret < 0 ? -1 : ref_root);
@@ -883,7 +883,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
883 swarn.dev = dev; 883 swarn.dev = dev;
884 iterate_extent_inodes(fs_info, found_key.objectid, 884 iterate_extent_inodes(fs_info, found_key.objectid,
885 extent_item_pos, 1, 885 extent_item_pos, 1,
886 scrub_print_warning_inode, &swarn); 886 scrub_print_warning_inode, &swarn, false);
887 } 887 }
888 888
889out: 889out:
@@ -1047,7 +1047,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
1047 * can be found. 1047 * can be found.
1048 */ 1048 */
1049 ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, 1049 ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1050 scrub_fixup_readpage, fixup); 1050 scrub_fixup_readpage, fixup, false);
1051 if (ret < 0) { 1051 if (ret < 0) {
1052 uncorrectable = 1; 1052 uncorrectable = 1;
1053 goto out; 1053 goto out;
@@ -4390,7 +4390,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
4390 } 4390 }
4391 4391
4392 ret = iterate_inodes_from_logical(logical, fs_info, path, 4392 ret = iterate_inodes_from_logical(logical, fs_info, path,
4393 record_inode_for_nocow, nocow_ctx); 4393 record_inode_for_nocow, nocow_ctx, false);
4394 if (ret != 0 && ret != -ENOENT) { 4394 if (ret != 0 && ret != -ENOENT) {
4395 btrfs_warn(fs_info, 4395 btrfs_warn(fs_info,
4396 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", 4396 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8fd195cfe81b..c10e4c70f02d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -26,6 +26,7 @@
26#include <linux/radix-tree.h> 26#include <linux/radix-tree.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/compat.h>
29 30
30#include "send.h" 31#include "send.h"
31#include "backref.h" 32#include "backref.h"
@@ -992,7 +993,6 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
992 * path must point to the dir item when called. 993 * path must point to the dir item when called.
993 */ 994 */
994static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, 995static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 struct btrfs_key *found_key,
996 iterate_dir_item_t iterate, void *ctx) 996 iterate_dir_item_t iterate, void *ctx)
997{ 997{
998 int ret = 0; 998 int ret = 0;
@@ -1271,12 +1271,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1271 */ 1271 */
1272 if (ino >= bctx->cur_objectid) 1272 if (ino >= bctx->cur_objectid)
1273 return 0; 1273 return 0;
1274#if 0
1275 if (ino > bctx->cur_objectid)
1276 return 0;
1277 if (offset + bctx->extent_len > bctx->cur_offset)
1278 return 0;
1279#endif
1280 } 1274 }
1281 1275
1282 bctx->found++; 1276 bctx->found++;
@@ -1429,7 +1423,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1429 extent_item_pos = 0; 1423 extent_item_pos = 0;
1430 ret = iterate_extent_inodes(fs_info, found_key.objectid, 1424 ret = iterate_extent_inodes(fs_info, found_key.objectid,
1431 extent_item_pos, 1, __iterate_backrefs, 1425 extent_item_pos, 1, __iterate_backrefs,
1432 backref_ctx); 1426 backref_ctx, false);
1433 1427
1434 if (ret < 0) 1428 if (ret < 0)
1435 goto out; 1429 goto out;
@@ -4106,8 +4100,8 @@ out:
4106 return ret; 4100 return ret;
4107} 4101}
4108 4102
4109static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, 4103static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
4110 struct fs_path *name, void *ctx, struct list_head *refs) 4104 void *ctx, struct list_head *refs)
4111{ 4105{
4112 int ret = 0; 4106 int ret = 0;
4113 struct send_ctx *sctx = ctx; 4107 struct send_ctx *sctx = ctx;
@@ -4143,8 +4137,7 @@ static int __record_new_ref(int num, u64 dir, int index,
4143 void *ctx) 4137 void *ctx)
4144{ 4138{
4145 struct send_ctx *sctx = ctx; 4139 struct send_ctx *sctx = ctx;
4146 return record_ref(sctx->send_root, num, dir, index, name, 4140 return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
4147 ctx, &sctx->new_refs);
4148} 4141}
4149 4142
4150 4143
@@ -4153,8 +4146,8 @@ static int __record_deleted_ref(int num, u64 dir, int index,
4153 void *ctx) 4146 void *ctx)
4154{ 4147{
4155 struct send_ctx *sctx = ctx; 4148 struct send_ctx *sctx = ctx;
4156 return record_ref(sctx->parent_root, num, dir, index, name, 4149 return record_ref(sctx->parent_root, dir, name, ctx,
4157 ctx, &sctx->deleted_refs); 4150 &sctx->deleted_refs);
4158} 4151}
4159 4152
4160static int record_new_ref(struct send_ctx *sctx) 4153static int record_new_ref(struct send_ctx *sctx)
@@ -4498,7 +4491,7 @@ static int process_new_xattr(struct send_ctx *sctx)
4498 int ret = 0; 4491 int ret = 0;
4499 4492
4500 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4493 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4501 sctx->cmp_key, __process_new_xattr, sctx); 4494 __process_new_xattr, sctx);
4502 4495
4503 return ret; 4496 return ret;
4504} 4497}
@@ -4506,7 +4499,7 @@ static int process_new_xattr(struct send_ctx *sctx)
4506static int process_deleted_xattr(struct send_ctx *sctx) 4499static int process_deleted_xattr(struct send_ctx *sctx)
4507{ 4500{
4508 return iterate_dir_item(sctx->parent_root, sctx->right_path, 4501 return iterate_dir_item(sctx->parent_root, sctx->right_path,
4509 sctx->cmp_key, __process_deleted_xattr, sctx); 4502 __process_deleted_xattr, sctx);
4510} 4503}
4511 4504
4512struct find_xattr_ctx { 4505struct find_xattr_ctx {
@@ -4551,7 +4544,7 @@ static int find_xattr(struct btrfs_root *root,
4551 ctx.found_data = NULL; 4544 ctx.found_data = NULL;
4552 ctx.found_data_len = 0; 4545 ctx.found_data_len = 0;
4553 4546
4554 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); 4547 ret = iterate_dir_item(root, path, __find_xattr, &ctx);
4555 if (ret < 0) 4548 if (ret < 0)
4556 return ret; 4549 return ret;
4557 4550
@@ -4621,11 +4614,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
4621 int ret = 0; 4614 int ret = 0;
4622 4615
4623 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4616 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4624 sctx->cmp_key, __process_changed_new_xattr, sctx); 4617 __process_changed_new_xattr, sctx);
4625 if (ret < 0) 4618 if (ret < 0)
4626 goto out; 4619 goto out;
4627 ret = iterate_dir_item(sctx->parent_root, sctx->right_path, 4620 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
4628 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 4621 __process_changed_deleted_xattr, sctx);
4629 4622
4630out: 4623out:
4631 return ret; 4624 return ret;
@@ -4675,8 +4668,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
4675 goto out; 4668 goto out;
4676 } 4669 }
4677 4670
4678 ret = iterate_dir_item(root, path, &found_key, 4671 ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
4679 __process_new_xattr, sctx);
4680 if (ret < 0) 4672 if (ret < 0)
4681 goto out; 4673 goto out;
4682 4674
@@ -4723,16 +4715,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
4723 /* initial readahead */ 4715 /* initial readahead */
4724 memset(&sctx->ra, 0, sizeof(struct file_ra_state)); 4716 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4725 file_ra_state_init(&sctx->ra, inode->i_mapping); 4717 file_ra_state_init(&sctx->ra, inode->i_mapping);
4726 page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index,
4727 last_index - index + 1);
4728 4718
4729 while (index <= last_index) { 4719 while (index <= last_index) {
4730 unsigned cur_len = min_t(unsigned, len, 4720 unsigned cur_len = min_t(unsigned, len,
4731 PAGE_SIZE - pg_offset); 4721 PAGE_SIZE - pg_offset);
4732 page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); 4722
4723 page = find_lock_page(inode->i_mapping, index);
4733 if (!page) { 4724 if (!page) {
4734 ret = -ENOMEM; 4725 page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
4735 break; 4726 NULL, index, last_index + 1 - index);
4727
4728 page = find_or_create_page(inode->i_mapping, index,
4729 GFP_KERNEL);
4730 if (!page) {
4731 ret = -ENOMEM;
4732 break;
4733 }
4734 }
4735
4736 if (PageReadahead(page)) {
4737 page_cache_async_readahead(inode->i_mapping, &sctx->ra,
4738 NULL, page, index, last_index + 1 - index);
4736 } 4739 }
4737 4740
4738 if (!PageUptodate(page)) { 4741 if (!PageUptodate(page)) {
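
The rework above stops issuing one big readahead up front: fill_read_buf()
now tries find_lock_page() first, falls back to synchronous readahead plus
find_or_create_page() on a miss, and kicks off asynchronous readahead when it
crosses a PageReadahead-marked page. A rough userspace analogy of that
demand-driven pattern, with a toy cache standing in for the page cache (all
helpers here are invented for illustration):

#include <stdio.h>

#define NPAGES 16
static int cached[NPAGES];      /* toy stand-in for the page cache */
static int marker[NPAGES];      /* toy stand-in for PageReadahead */

/* Pretend to read pages [from, to) ahead of time and mark the window. */
static void prefetch(int from, int to)
{
        int mid = from + (to - from) / 2;

        for (int i = from; i < to && i < NPAGES; i++)
                cached[i] = 1;
        if (mid < NPAGES)
                marker[mid] = 1;        /* like the kernel's readahead flag */
        printf("prefetch [%d,%d)\n", from, to);
}

int main(void)
{
        int last = 11;

        for (int index = 3; index <= last; index++) {
                if (!cached[index]) {
                        /* miss: synchronous readahead, then use the page */
                        prefetch(index, last + 1);
                } else if (marker[index]) {
                        /* marker hit: start the next window early */
                        marker[index] = 0;
                        prefetch(index, last + 1);
                }
                printf("use page %d\n", index);
        }
        return 0;
}
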
@@ -6162,9 +6165,7 @@ out:
6162 * Updates compare related fields in sctx and simply forwards to the actual 6165 * Updates compare related fields in sctx and simply forwards to the actual
6163 * changed_xxx functions. 6166 * changed_xxx functions.
6164 */ 6167 */
6165static int changed_cb(struct btrfs_root *left_root, 6168static int changed_cb(struct btrfs_path *left_path,
6166 struct btrfs_root *right_root,
6167 struct btrfs_path *left_path,
6168 struct btrfs_path *right_path, 6169 struct btrfs_path *right_path,
6169 struct btrfs_key *key, 6170 struct btrfs_key *key,
6170 enum btrfs_compare_tree_result result, 6171 enum btrfs_compare_tree_result result,
@@ -6246,8 +6247,8 @@ static int full_send_tree(struct send_ctx *sctx)
6246 slot = path->slots[0]; 6247 slot = path->slots[0];
6247 btrfs_item_key_to_cpu(eb, &found_key, slot); 6248 btrfs_item_key_to_cpu(eb, &found_key, slot);
6248 6249
6249 ret = changed_cb(send_root, NULL, path, NULL, 6250 ret = changed_cb(path, NULL, &found_key,
6250 &found_key, BTRFS_COMPARE_TREE_NEW, sctx); 6251 BTRFS_COMPARE_TREE_NEW, sctx);
6251 if (ret < 0) 6252 if (ret < 0)
6252 goto out; 6253 goto out;
6253 6254
@@ -6365,13 +6366,12 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
6365 spin_unlock(&root->root_item_lock); 6366 spin_unlock(&root->root_item_lock);
6366} 6367}
6367 6368
6368long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) 6369long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
6369{ 6370{
6370 int ret = 0; 6371 int ret = 0;
6371 struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; 6372 struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
6372 struct btrfs_fs_info *fs_info = send_root->fs_info; 6373 struct btrfs_fs_info *fs_info = send_root->fs_info;
6373 struct btrfs_root *clone_root; 6374 struct btrfs_root *clone_root;
6374 struct btrfs_ioctl_send_args *arg = NULL;
6375 struct btrfs_key key; 6375 struct btrfs_key key;
6376 struct send_ctx *sctx = NULL; 6376 struct send_ctx *sctx = NULL;
6377 u32 i; 6377 u32 i;
@@ -6407,13 +6407,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
6407 goto out; 6407 goto out;
6408 } 6408 }
6409 6409
6410 arg = memdup_user(arg_, sizeof(*arg));
6411 if (IS_ERR(arg)) {
6412 ret = PTR_ERR(arg);
6413 arg = NULL;
6414 goto out;
6415 }
6416
6417 /* 6410 /*
6418 * Check that we don't overflow at later allocations, we request 6411 * Check that we don't overflow at later allocations, we request
6419 * clone_sources_count + 1 items, and compare to unsigned long inside 6412 * clone_sources_count + 1 items, and compare to unsigned long inside
@@ -6654,7 +6647,6 @@ out:
6654 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) 6647 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
6655 btrfs_root_dec_send_in_progress(sctx->parent_root); 6648 btrfs_root_dec_send_in_progress(sctx->parent_root);
6656 6649
6657 kfree(arg);
6658 kvfree(clone_sources_tmp); 6650 kvfree(clone_sources_tmp);
6659 6651
6660 if (sctx) { 6652 if (sctx) {
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 02e00166c4da..3aa4bc55754f 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,5 +130,5 @@ enum {
130#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) 130#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
131 131
132#ifdef __KERNEL__ 132#ifdef __KERNEL__
133long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 133long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
134#endif 134#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 161694b66038..65af029559b5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,7 +202,6 @@ static struct ratelimit_state printk_limits[] = {
202 202
203void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 203void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
204{ 204{
205 struct super_block *sb = fs_info->sb;
206 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; 205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
207 struct va_format vaf; 206 struct va_format vaf;
208 va_list args; 207 va_list args;
@@ -228,7 +227,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
228 vaf.va = &args; 227 vaf.va = &args;
229 228
230 if (__ratelimit(ratelimit)) 229 if (__ratelimit(ratelimit))
231 printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); 230 printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
231 fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);
232 232
233 va_end(args); 233 va_end(args);
234} 234}
@@ -292,7 +292,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
292 vaf.va = &args; 292 vaf.va = &args;
293 293
294 errstr = btrfs_decode_error(errno); 294 errstr = btrfs_decode_error(errno);
295 if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) 295 if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
296 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", 296 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
297 s_id, function, line, &vaf, errno, errstr); 297 s_id, function, line, &vaf, errno, errstr);
298 298
@@ -326,6 +326,9 @@ enum {
326#ifdef CONFIG_BTRFS_DEBUG 326#ifdef CONFIG_BTRFS_DEBUG
327 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, 327 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
328#endif 328#endif
329#ifdef CONFIG_BTRFS_FS_REF_VERIFY
330 Opt_ref_verify,
331#endif
329 Opt_err, 332 Opt_err,
330}; 333};
331 334
@@ -387,6 +390,9 @@ static const match_table_t tokens = {
387 {Opt_fragment_metadata, "fragment=metadata"}, 390 {Opt_fragment_metadata, "fragment=metadata"},
388 {Opt_fragment_all, "fragment=all"}, 391 {Opt_fragment_all, "fragment=all"},
389#endif 392#endif
393#ifdef CONFIG_BTRFS_FS_REF_VERIFY
394 {Opt_ref_verify, "ref_verify"},
395#endif
390 {Opt_err, NULL}, 396 {Opt_err, NULL},
391}; 397};
392 398
@@ -502,6 +508,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
502 strncmp(args[0].from, "zlib", 4) == 0) { 508 strncmp(args[0].from, "zlib", 4) == 0) {
503 compress_type = "zlib"; 509 compress_type = "zlib";
504 info->compress_type = BTRFS_COMPRESS_ZLIB; 510 info->compress_type = BTRFS_COMPRESS_ZLIB;
511 info->compress_level =
512 btrfs_compress_str2level(args[0].from);
505 btrfs_set_opt(info->mount_opt, COMPRESS); 513 btrfs_set_opt(info->mount_opt, COMPRESS);
506 btrfs_clear_opt(info->mount_opt, NODATACOW); 514 btrfs_clear_opt(info->mount_opt, NODATACOW);
507 btrfs_clear_opt(info->mount_opt, NODATASUM); 515 btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -549,9 +557,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
549 compress_force != saved_compress_force)) || 557 compress_force != saved_compress_force)) ||
550 (!btrfs_test_opt(info, COMPRESS) && 558 (!btrfs_test_opt(info, COMPRESS) &&
551 no_compress == 1)) { 559 no_compress == 1)) {
552 btrfs_info(info, "%s %s compression", 560 btrfs_info(info, "%s %s compression, level %d",
553 (compress_force) ? "force" : "use", 561 (compress_force) ? "force" : "use",
554 compress_type); 562 compress_type, info->compress_level);
555 } 563 }
556 compress_force = false; 564 compress_force = false;
557 break; 565 break;
@@ -825,6 +833,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
825 btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); 833 btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
826 break; 834 break;
827#endif 835#endif
836#ifdef CONFIG_BTRFS_FS_REF_VERIFY
837 case Opt_ref_verify:
838 btrfs_info(info, "doing ref verification");
839 btrfs_set_opt(info->mount_opt, REF_VERIFY);
840 break;
841#endif
828 case Opt_err: 842 case Opt_err:
829 btrfs_info(info, "unrecognized mount option '%s'", p); 843 btrfs_info(info, "unrecognized mount option '%s'", p);
830 ret = -EINVAL; 844 ret = -EINVAL;
@@ -1205,8 +1219,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
1205 * happens. The pending operations are delayed to the 1219 * happens. The pending operations are delayed to the
1206 * next commit after thawing. 1220 * next commit after thawing.
1207 */ 1221 */
1208 if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) 1222 if (sb_start_write_trylock(sb))
1209 __sb_end_write(sb, SB_FREEZE_WRITE); 1223 sb_end_write(sb);
1210 else 1224 else
1211 return 0; 1225 return 0;
1212 trans = btrfs_start_transaction(root, 0); 1226 trans = btrfs_start_transaction(root, 0);
@@ -1246,6 +1260,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1246 seq_printf(seq, ",compress-force=%s", compress_type); 1260 seq_printf(seq, ",compress-force=%s", compress_type);
1247 else 1261 else
1248 seq_printf(seq, ",compress=%s", compress_type); 1262 seq_printf(seq, ",compress=%s", compress_type);
1263 if (info->compress_level)
1264 seq_printf(seq, ":%d", info->compress_level);
1249 } 1265 }
1250 if (btrfs_test_opt(info, NOSSD)) 1266 if (btrfs_test_opt(info, NOSSD))
1251 seq_puts(seq, ",nossd"); 1267 seq_puts(seq, ",nossd");
@@ -1305,6 +1321,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1305 if (btrfs_test_opt(info, FRAGMENT_METADATA)) 1321 if (btrfs_test_opt(info, FRAGMENT_METADATA))
1306 seq_puts(seq, ",fragment=metadata"); 1322 seq_puts(seq, ",fragment=metadata");
1307#endif 1323#endif
1324 if (btrfs_test_opt(info, REF_VERIFY))
1325 seq_puts(seq, ",ref_verify");
1308 seq_printf(seq, ",subvolid=%llu", 1326 seq_printf(seq, ",subvolid=%llu",
1309 BTRFS_I(d_inode(dentry))->root->root_key.objectid); 1327 BTRFS_I(d_inode(dentry))->root->root_key.objectid);
1310 seq_puts(seq, ",subvol="); 1328 seq_puts(seq, ",subvol=");
@@ -2112,7 +2130,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2112 * succeed even if the Avail is zero. But this is better than the other 2130 * succeed even if the Avail is zero. But this is better than the other
2113 * way around. 2131 * way around.
2114 */ 2132 */
2115 thresh = 4 * 1024 * 1024; 2133 thresh = SZ_4M;
2116 2134
2117 if (!mixed && total_free_meta - thresh < block_rsv->size) 2135 if (!mixed && total_free_meta - thresh < block_rsv->size)
2118 buf->f_bavail = 0; 2136 buf->f_bavail = 0;
@@ -2319,6 +2337,9 @@ static void btrfs_print_mod_info(void)
2319#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2337#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2320 ", integrity-checker=on" 2338 ", integrity-checker=on"
2321#endif 2339#endif
2340#ifdef CONFIG_BTRFS_FS_REF_VERIFY
2341 ", ref-verify=on"
2342#endif
2322 "\n", 2343 "\n",
2323 btrfs_crc32c_impl()); 2344 btrfs_crc32c_impl());
2324} 2345}
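
The zlib level arrives as a suffix on the mount option value
("compress=zlib:9"); btrfs_compress_str2level() extracts it, and
btrfs_show_options() prints it back as ":<level>". A hypothetical userspace
stand-in for the parsing side, assuming the same "type:level" syntax (this is
not the kernel implementation):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pull an optional ":<level>" suffix out of a value like "zlib:9". */
static int compress_str2level(const char *str)
{
        const char *colon = strchr(str, ':');

        if (!colon)
                return 0;       /* no explicit level: fall back to default */
        return atoi(colon + 1);
}

int main(void)
{
        printf("zlib   -> level %d\n", compress_str2level("zlib"));
        printf("zlib:9 -> level %d\n", compress_str2level("zlib:9"));
        return 0;
}
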
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 883881b16c86..a28bba801264 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -247,7 +247,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
247 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 247 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
248 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); 248 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
249} 249}
250BTRFS_ATTR(global_rsv_size, global_rsv_size_show); 250BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show);
251 251
252static ssize_t global_rsv_reserved_show(struct kobject *kobj, 252static ssize_t global_rsv_reserved_show(struct kobject *kobj,
253 struct kobj_attribute *a, char *buf) 253 struct kobj_attribute *a, char *buf)
@@ -256,15 +256,15 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
256 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 256 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
257 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); 257 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
258} 258}
259BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); 259BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show);
260 260
261#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 261#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
262#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) 262#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
263 263
264static ssize_t raid_bytes_show(struct kobject *kobj, 264static ssize_t raid_bytes_show(struct kobject *kobj,
265 struct kobj_attribute *attr, char *buf); 265 struct kobj_attribute *attr, char *buf);
266BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); 266BTRFS_ATTR(raid, total_bytes, raid_bytes_show);
267BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); 267BTRFS_ATTR(raid, used_bytes, raid_bytes_show);
268 268
269static ssize_t raid_bytes_show(struct kobject *kobj, 269static ssize_t raid_bytes_show(struct kobject *kobj,
270 struct kobj_attribute *attr, char *buf) 270 struct kobj_attribute *attr, char *buf)
@@ -277,7 +277,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
277 277
278 down_read(&sinfo->groups_sem); 278 down_read(&sinfo->groups_sem);
279 list_for_each_entry(block_group, &sinfo->block_groups[index], list) { 279 list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
280 if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) 280 if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
281 val += block_group->key.offset; 281 val += block_group->key.offset;
282 else 282 else
283 val += btrfs_block_group_used(&block_group->item); 283 val += btrfs_block_group_used(&block_group->item);
@@ -287,8 +287,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
287} 287}
288 288
289static struct attribute *raid_attributes[] = { 289static struct attribute *raid_attributes[] = {
290 BTRFS_RAID_ATTR_PTR(total_bytes), 290 BTRFS_ATTR_PTR(raid, total_bytes),
291 BTRFS_RAID_ATTR_PTR(used_bytes), 291 BTRFS_ATTR_PTR(raid, used_bytes),
292 NULL 292 NULL
293}; 293};
294 294
@@ -311,7 +311,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
311 struct btrfs_space_info *sinfo = to_space_info(kobj); \ 311 struct btrfs_space_info *sinfo = to_space_info(kobj); \
312 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ 312 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
313} \ 313} \
314BTRFS_ATTR(field, btrfs_space_info_show_##field) 314BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
315 315
316static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 316static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
317 struct kobj_attribute *a, 317 struct kobj_attribute *a,
@@ -331,19 +331,20 @@ SPACE_INFO_ATTR(bytes_may_use);
331SPACE_INFO_ATTR(bytes_readonly); 331SPACE_INFO_ATTR(bytes_readonly);
332SPACE_INFO_ATTR(disk_used); 332SPACE_INFO_ATTR(disk_used);
333SPACE_INFO_ATTR(disk_total); 333SPACE_INFO_ATTR(disk_total);
334BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); 334BTRFS_ATTR(space_info, total_bytes_pinned,
335 btrfs_space_info_show_total_bytes_pinned);
335 336
336static struct attribute *space_info_attrs[] = { 337static struct attribute *space_info_attrs[] = {
337 BTRFS_ATTR_PTR(flags), 338 BTRFS_ATTR_PTR(space_info, flags),
338 BTRFS_ATTR_PTR(total_bytes), 339 BTRFS_ATTR_PTR(space_info, total_bytes),
339 BTRFS_ATTR_PTR(bytes_used), 340 BTRFS_ATTR_PTR(space_info, bytes_used),
340 BTRFS_ATTR_PTR(bytes_pinned), 341 BTRFS_ATTR_PTR(space_info, bytes_pinned),
341 BTRFS_ATTR_PTR(bytes_reserved), 342 BTRFS_ATTR_PTR(space_info, bytes_reserved),
342 BTRFS_ATTR_PTR(bytes_may_use), 343 BTRFS_ATTR_PTR(space_info, bytes_may_use),
343 BTRFS_ATTR_PTR(bytes_readonly), 344 BTRFS_ATTR_PTR(space_info, bytes_readonly),
344 BTRFS_ATTR_PTR(disk_used), 345 BTRFS_ATTR_PTR(space_info, disk_used),
345 BTRFS_ATTR_PTR(disk_total), 346 BTRFS_ATTR_PTR(space_info, disk_total),
346 BTRFS_ATTR_PTR(total_bytes_pinned), 347 BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
347 NULL, 348 NULL,
348}; 349};
349 350
@@ -361,8 +362,8 @@ struct kobj_type space_info_ktype = {
361}; 362};
362 363
363static const struct attribute *allocation_attrs[] = { 364static const struct attribute *allocation_attrs[] = {
364 BTRFS_ATTR_PTR(global_rsv_reserved), 365 BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
365 BTRFS_ATTR_PTR(global_rsv_size), 366 BTRFS_ATTR_PTR(allocation, global_rsv_size),
366 NULL, 367 NULL,
367}; 368};
368 369
@@ -415,7 +416,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
415 416
416 return len; 417 return len;
417} 418}
418BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); 419BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store);
419 420
420static ssize_t btrfs_nodesize_show(struct kobject *kobj, 421static ssize_t btrfs_nodesize_show(struct kobject *kobj,
421 struct kobj_attribute *a, char *buf) 422 struct kobj_attribute *a, char *buf)
@@ -425,7 +426,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
425 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); 426 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
426} 427}
427 428
428BTRFS_ATTR(nodesize, btrfs_nodesize_show); 429BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
429 430
430static ssize_t btrfs_sectorsize_show(struct kobject *kobj, 431static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
431 struct kobj_attribute *a, char *buf) 432 struct kobj_attribute *a, char *buf)
@@ -436,7 +437,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
436 fs_info->super_copy->sectorsize); 437 fs_info->super_copy->sectorsize);
437} 438}
438 439
439BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); 440BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
440 441
441static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, 442static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
442 struct kobj_attribute *a, char *buf) 443 struct kobj_attribute *a, char *buf)
@@ -447,7 +448,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
447 fs_info->super_copy->sectorsize); 448 fs_info->super_copy->sectorsize);
448} 449}
449 450
450BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); 451BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
451 452
452static ssize_t quota_override_show(struct kobject *kobj, 453static ssize_t quota_override_show(struct kobject *kobj,
453 struct kobj_attribute *a, char *buf) 454 struct kobj_attribute *a, char *buf)
@@ -487,14 +488,14 @@ static ssize_t quota_override_store(struct kobject *kobj,
487 return len; 488 return len;
488} 489}
489 490
490BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); 491BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store);
491 492
492static const struct attribute *btrfs_attrs[] = { 493static const struct attribute *btrfs_attrs[] = {
493 BTRFS_ATTR_PTR(label), 494 BTRFS_ATTR_PTR(, label),
494 BTRFS_ATTR_PTR(nodesize), 495 BTRFS_ATTR_PTR(, nodesize),
495 BTRFS_ATTR_PTR(sectorsize), 496 BTRFS_ATTR_PTR(, sectorsize),
496 BTRFS_ATTR_PTR(clone_alignment), 497 BTRFS_ATTR_PTR(, clone_alignment),
497 BTRFS_ATTR_PTR(quota_override), 498 BTRFS_ATTR_PTR(, quota_override),
498 NULL, 499 NULL,
499}; 500};
500 501
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 4cb908305e5d..80457f31c29f 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -21,21 +21,16 @@ enum btrfs_feature_set {
21 .store = _store, \ 21 .store = _store, \
22} 22}
23 23
24#define BTRFS_ATTR_RW(_name, _show, _store) \ 24#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
25 static struct kobj_attribute btrfs_attr_##_name = \ 25 static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
26 __INIT_KOBJ_ATTR(_name, 0644, _show, _store) 26 __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
27 27
28#define BTRFS_ATTR(_name, _show) \ 28#define BTRFS_ATTR(_prefix, _name, _show) \
29 static struct kobj_attribute btrfs_attr_##_name = \ 29 static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
30 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 30 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
31 31
32#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) 32#define BTRFS_ATTR_PTR(_prefix, _name) \
33 33 (&btrfs_attr_##_prefix##_##_name.attr)
34#define BTRFS_RAID_ATTR(_name, _show) \
35 static struct kobj_attribute btrfs_raid_attr_##_name = \
36 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
37
38#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
39 34
40 35
41struct btrfs_feature_attr { 36struct btrfs_feature_attr {
@@ -44,15 +39,16 @@ struct btrfs_feature_attr {
44 u64 feature_bit; 39 u64 feature_bit;
45}; 40};
46 41
47#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ 42#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \
48static struct btrfs_feature_attr btrfs_attr_##_name = { \ 43static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
49 .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ 44 .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
50 btrfs_feature_attr_show, \ 45 btrfs_feature_attr_show, \
51 btrfs_feature_attr_store), \ 46 btrfs_feature_attr_store), \
52 .feature_set = _feature_set, \ 47 .feature_set = _feature_set, \
53 .feature_bit = _prefix ##_## _feature_bit, \ 48 .feature_bit = _feature_prefix ##_## _feature_bit, \
54} 49}
55#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) 50#define BTRFS_FEAT_ATTR_PTR(_name) \
51 (&btrfs_attr_features_##_name.kobj_attr.attr)
56 52
57#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ 53#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
58 BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) 54 BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
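
The reworked BTRFS_ATTR*() macros paste _prefix and _name together, so
attributes from different groups can reuse a short name without the generated
identifiers colliding; that is what lets the separate BTRFS_RAID_ATTR variants
go away. A simplified, compilable illustration of the token pasting (toy
types, not the kernel macros):

#include <stdio.h>

#define MY_ATTR(_prefix, _name, _value) \
        static const int attr_##_prefix##_##_name = (_value)

#define MY_ATTR_PTR(_prefix, _name) (&attr_##_prefix##_##_name)

MY_ATTR(raid, total_bytes, 1);
MY_ATTR(space_info, total_bytes, 2);    /* same _name, different _prefix */

int main(void)
{
        printf("%d %d\n", *MY_ATTR_PTR(raid, total_bytes),
               *MY_ATTR_PTR(space_info, total_bytes));
        return 0;
}
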
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 1458bb0ea124..8444a018cca2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -500,7 +500,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
500 path = btrfs_alloc_path(); 500 path = btrfs_alloc_path();
501 if (!path) { 501 if (!path) {
502 test_msg("Couldn't allocate path\n"); 502 test_msg("Couldn't allocate path\n");
503 return -ENOMEM; 503 ret = -ENOMEM;
504 goto out;
504 } 505 }
505 506
506 ret = add_block_group_free_space(&trans, root->fs_info, cache); 507 ret = add_block_group_free_space(&trans, root->fs_info, cache);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 8c91d03cc82d..f797642c013d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
770 offset = em->start + em->len; 770 offset = em->start + em->len;
771 free_extent_map(em); 771 free_extent_map(em);
772 772
773 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); 773 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
774 if (IS_ERR(em)) { 774 if (IS_ERR(em)) {
775 test_msg("Got an error when we shouldn't have\n"); 775 test_msg("Got an error when we shouldn't have\n");
776 goto out; 776 goto out;
@@ -968,7 +968,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
968 btrfs_test_inode_set_ops(inode); 968 btrfs_test_inode_set_ops(inode);
969 969
970 /* [BTRFS_MAX_EXTENT_SIZE] */ 970 /* [BTRFS_MAX_EXTENT_SIZE] */
971 BTRFS_I(inode)->outstanding_extents++;
972 ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 971 ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
973 NULL, 0); 972 NULL, 0);
974 if (ret) { 973 if (ret) {
@@ -983,7 +982,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
983 } 982 }
984 983
985 /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ 984 /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
986 BTRFS_I(inode)->outstanding_extents++;
987 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, 985 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
988 BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, 986 BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
989 NULL, 0); 987 NULL, 0);
@@ -1003,7 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1003 BTRFS_MAX_EXTENT_SIZE >> 1, 1001 BTRFS_MAX_EXTENT_SIZE >> 1,
1004 (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, 1002 (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
1005 EXTENT_DELALLOC | EXTENT_DIRTY | 1003 EXTENT_DELALLOC | EXTENT_DIRTY |
1006 EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, 1004 EXTENT_UPTODATE, 0, 0,
1007 NULL, GFP_KERNEL); 1005 NULL, GFP_KERNEL);
1008 if (ret) { 1006 if (ret) {
1009 test_msg("clear_extent_bit returned %d\n", ret); 1007 test_msg("clear_extent_bit returned %d\n", ret);
@@ -1017,7 +1015,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1017 } 1015 }
1018 1016
1019 /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ 1017 /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
1020 BTRFS_I(inode)->outstanding_extents++;
1021 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, 1018 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
1022 (BTRFS_MAX_EXTENT_SIZE >> 1) 1019 (BTRFS_MAX_EXTENT_SIZE >> 1)
1023 + sectorsize - 1, 1020 + sectorsize - 1,
@@ -1035,12 +1032,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1035 1032
1036 /* 1033 /*
1037 * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] 1034 * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
1038 *
1039 * I'm artificially adding 2 to outstanding_extents because in the
1040 * buffered IO case we'd add things up as we go, but I don't feel like
1041 * doing that here, this isn't the interesting case we want to test.
1042 */ 1035 */
1043 BTRFS_I(inode)->outstanding_extents += 2;
1044 ret = btrfs_set_extent_delalloc(inode, 1036 ret = btrfs_set_extent_delalloc(inode,
1045 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, 1037 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
1046 (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, 1038 (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
@@ -1059,7 +1051,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1059 /* 1051 /*
1060 * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] 1052 * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
1061 */ 1053 */
1062 BTRFS_I(inode)->outstanding_extents++;
1063 ret = btrfs_set_extent_delalloc(inode, 1054 ret = btrfs_set_extent_delalloc(inode,
1064 BTRFS_MAX_EXTENT_SIZE + sectorsize, 1055 BTRFS_MAX_EXTENT_SIZE + sectorsize,
1065 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); 1056 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0);
@@ -1079,7 +1070,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1079 BTRFS_MAX_EXTENT_SIZE + sectorsize, 1070 BTRFS_MAX_EXTENT_SIZE + sectorsize,
1080 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 1071 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
1081 EXTENT_DIRTY | EXTENT_DELALLOC | 1072 EXTENT_DIRTY | EXTENT_DELALLOC |
1082 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, 1073 EXTENT_UPTODATE, 0, 0,
1083 NULL, GFP_KERNEL); 1074 NULL, GFP_KERNEL);
1084 if (ret) { 1075 if (ret) {
1085 test_msg("clear_extent_bit returned %d\n", ret); 1076 test_msg("clear_extent_bit returned %d\n", ret);
@@ -1096,7 +1087,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1096 * Refill the hole again just for good measure, because I thought it 1087 * Refill the hole again just for good measure, because I thought it
1097 * might fail and I'd rather satisfy my paranoia at this point. 1088 * might fail and I'd rather satisfy my paranoia at this point.
1098 */ 1089 */
1099 BTRFS_I(inode)->outstanding_extents++;
1100 ret = btrfs_set_extent_delalloc(inode, 1090 ret = btrfs_set_extent_delalloc(inode,
1101 BTRFS_MAX_EXTENT_SIZE + sectorsize, 1091 BTRFS_MAX_EXTENT_SIZE + sectorsize,
1102 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); 1092 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0);
@@ -1114,7 +1104,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
1114 /* Empty */ 1104 /* Empty */
1115 ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, 1105 ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
1116 EXTENT_DIRTY | EXTENT_DELALLOC | 1106 EXTENT_DIRTY | EXTENT_DELALLOC |
1117 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, 1107 EXTENT_UPTODATE, 0, 0,
1118 NULL, GFP_KERNEL); 1108 NULL, GFP_KERNEL);
1119 if (ret) { 1109 if (ret) {
1120 test_msg("clear_extent_bit returned %d\n", ret); 1110 test_msg("clear_extent_bit returned %d\n", ret);
@@ -1131,7 +1121,7 @@ out:
1131 if (ret) 1121 if (ret)
1132 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, 1122 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
1133 EXTENT_DIRTY | EXTENT_DELALLOC | 1123 EXTENT_DIRTY | EXTENT_DELALLOC |
1134 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, 1124 EXTENT_UPTODATE, 0, 0,
1135 NULL, GFP_KERNEL); 1125 NULL, GFP_KERNEL);
1136 iput(inode); 1126 iput(inode);
1137 btrfs_free_dummy_root(root); 1127 btrfs_free_dummy_root(root);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 0f4ce970d195..90204b166643 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -240,7 +240,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
240 * we can only call btrfs_qgroup_account_extent() directly to test 240 * we can only call btrfs_qgroup_account_extent() directly to test
241 * quota. 241 * quota.
242 */ 242 */
243 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 243 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
244 false);
244 if (ret) { 245 if (ret) {
245 ulist_free(old_roots); 246 ulist_free(old_roots);
246 test_msg("Couldn't find old roots: %d\n", ret); 247 test_msg("Couldn't find old roots: %d\n", ret);
@@ -252,7 +253,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
252 if (ret) 253 if (ret)
253 return ret; 254 return ret;
254 255
255 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 256 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
257 false);
256 if (ret) { 258 if (ret) {
257 ulist_free(old_roots); 259 ulist_free(old_roots);
258 ulist_free(new_roots); 260 ulist_free(new_roots);
@@ -275,7 +277,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
275 old_roots = NULL; 277 old_roots = NULL;
276 new_roots = NULL; 278 new_roots = NULL;
277 279
278 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 280 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
281 false);
279 if (ret) { 282 if (ret) {
280 ulist_free(old_roots); 283 ulist_free(old_roots);
281 test_msg("Couldn't find old roots: %d\n", ret); 284 test_msg("Couldn't find old roots: %d\n", ret);
@@ -286,7 +289,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
286 if (ret) 289 if (ret)
287 return -EINVAL; 290 return -EINVAL;
288 291
289 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 292 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
293 false);
290 if (ret) { 294 if (ret) {
291 ulist_free(old_roots); 295 ulist_free(old_roots);
292 ulist_free(new_roots); 296 ulist_free(new_roots);
@@ -337,7 +341,8 @@ static int test_multiple_refs(struct btrfs_root *root,
337 return ret; 341 return ret;
338 } 342 }
339 343
340 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 344 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
345 false);
341 if (ret) { 346 if (ret) {
342 ulist_free(old_roots); 347 ulist_free(old_roots);
343 test_msg("Couldn't find old roots: %d\n", ret); 348 test_msg("Couldn't find old roots: %d\n", ret);
@@ -349,7 +354,8 @@ static int test_multiple_refs(struct btrfs_root *root,
349 if (ret) 354 if (ret)
350 return ret; 355 return ret;
351 356
352 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 357 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
358 false);
353 if (ret) { 359 if (ret) {
354 ulist_free(old_roots); 360 ulist_free(old_roots);
355 ulist_free(new_roots); 361 ulist_free(new_roots);
@@ -370,7 +376,8 @@ static int test_multiple_refs(struct btrfs_root *root,
370 return -EINVAL; 376 return -EINVAL;
371 } 377 }
372 378
373 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 379 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
380 false);
374 if (ret) { 381 if (ret) {
375 ulist_free(old_roots); 382 ulist_free(old_roots);
376 test_msg("Couldn't find old roots: %d\n", ret); 383 test_msg("Couldn't find old roots: %d\n", ret);
@@ -382,7 +389,8 @@ static int test_multiple_refs(struct btrfs_root *root,
382 if (ret) 389 if (ret)
383 return ret; 390 return ret;
384 391
385 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 392 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
393 false);
386 if (ret) { 394 if (ret) {
387 ulist_free(old_roots); 395 ulist_free(old_roots);
388 ulist_free(new_roots); 396 ulist_free(new_roots);
@@ -409,7 +417,8 @@ static int test_multiple_refs(struct btrfs_root *root,
409 return -EINVAL; 417 return -EINVAL;
410 } 418 }
411 419
412 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 420 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
421 false);
413 if (ret) { 422 if (ret) {
414 ulist_free(old_roots); 423 ulist_free(old_roots);
415 test_msg("Couldn't find old roots: %d\n", ret); 424 test_msg("Couldn't find old roots: %d\n", ret);
@@ -421,7 +430,8 @@ static int test_multiple_refs(struct btrfs_root *root,
421 if (ret) 430 if (ret)
422 return ret; 431 return ret;
423 432
424 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 433 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
434 false);
425 if (ret) { 435 if (ret) {
426 ulist_free(old_roots); 436 ulist_free(old_roots);
427 ulist_free(new_roots); 437 ulist_free(new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f615d59b0489..5a8c2649af2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -797,8 +797,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
797{ 797{
798 struct btrfs_fs_info *fs_info = trans->fs_info; 798 struct btrfs_fs_info *fs_info = trans->fs_info;
799 799
800 if (fs_info->global_block_rsv.space_info->full && 800 if (btrfs_check_space_for_delayed_refs(trans, fs_info))
801 btrfs_check_space_for_delayed_refs(trans, fs_info))
802 return 1; 801 return 1;
803 802
804 return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); 803 return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -950,6 +949,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
950 u64 start = 0; 949 u64 start = 0;
951 u64 end; 950 u64 end;
952 951
952 atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
953 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 953 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
954 mark, &cached_state)) { 954 mark, &cached_state)) {
955 bool wait_writeback = false; 955 bool wait_writeback = false;
@@ -985,6 +985,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
985 cond_resched(); 985 cond_resched();
986 start = end + 1; 986 start = end + 1;
987 } 987 }
988 atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
988 return werr; 989 return werr;
989} 990}
990 991
@@ -1915,8 +1916,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1915 1916
1916static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1917static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1917{ 1918{
1919 /*
1920 * We use writeback_inodes_sb here because if we used
1921 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
1923 * We are currently holding the fs freeze lock; if we do an async flush
1924 * we'll do btrfs_join_transaction() and deadlock because we need to
1925 * wait for the fs freeze lock. With direct flushing we benefit
1925 * from already being in a transaction and our join_transaction doesn't
1926 * have to re-take the fs freeze lock.
1927 */
1918 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1928 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
1919 return btrfs_start_delalloc_roots(fs_info, 1, -1); 1929 writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
1920 return 0; 1930 return 0;
1921} 1931}
1922 1932
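
The new comment in btrfs_start_delalloc_flush() describes a plain
self-deadlock on a non-recursive lock: the committer already holds the fs
freeze lock, and the async flush path would try to take it again via
btrfs_join_transaction(). A userspace analogy using an error-checking pthread
mutex, which reports EDEADLK instead of hanging (the names are invented; this
is not the btrfs locking):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freeze_lock;

/* Analog of the async-flush path: it wants the freeze lock itself. */
static int flush_via_new_transaction(void)
{
        int err = pthread_mutex_lock(&freeze_lock);

        if (err)
                return err;     /* same thread already holds it: EDEADLK */
        pthread_mutex_unlock(&freeze_lock);
        return 0;
}

/* Analog of writeback_inodes_sb(): relies on the lock we already hold. */
static int flush_directly(void)
{
        return 0;               /* no extra locking needed */
}

int main(void)
{
        pthread_mutexattr_t attr;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
        pthread_mutex_init(&freeze_lock, &attr);

        pthread_mutex_lock(&freeze_lock);       /* "we hold the freeze lock" */
        printf("async-style flush: err=%d\n", flush_via_new_transaction());
        printf("direct flush:      err=%d\n", flush_directly());
        pthread_mutex_unlock(&freeze_lock);
        return 0;
}
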
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
new file mode 100644
index 000000000000..114fc5f0ecc5
--- /dev/null
+++ b/fs/btrfs/tree-checker.c
@@ -0,0 +1,425 @@
1/*
2 * Copyright (C) Qu Wenruo 2017. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program.
15 */
16
17/*
18 * The module is used to catch unexpected/corrupted tree block data.
19 * Such behavior can be caused either by a fuzzed image or bugs.
20 *
21 * The objective is to do leaf/node validation checks when tree block is read
22 * from disk, and check *every* possible member, so other code won't
 23 * need to check them again.
24 *
 25 * Due to the potential for unwanted damage, every checker needs to be
 26 * carefully reviewed; otherwise it could prevent valid images from being mounted.
27 */
28
29#include "ctree.h"
30#include "tree-checker.h"
31#include "disk-io.h"
32#include "compression.h"
33
34/*
35 * Error message should follow the following format:
36 * corrupt <type>: <identifier>, <reason>[, <bad_value>]
37 *
38 * @type: leaf or node
39 * @identifier: the necessary info to locate the leaf/node.
 40 * It's recommended to decode key.objectid/offset if it's
41 * meaningful.
42 * @reason: describe the error
 43 * @bad_value: optional, it's recommended to output the bad value and its
44 * expected value (range).
45 *
46 * Since comma is used to separate the components, only space is allowed
47 * inside each component.
48 */
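
/*
 * Worked example of the format above (values illustrative, not from the
 * patch): an unaligned csum item in the checksum tree would be reported as
 *
 *   corrupt leaf: root=7 block=29360128 slot=10, unaligned key offset
 *   for csum item, have 4097 should be aligned to 4096
 */
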
49
50/*
 51 * Prepend the generic "corrupt leaf/node root=%llu block=%llu slot=%d, " to @fmt.
52 * Allows callers to customize the output.
53 */
54__printf(4, 5)
55static void generic_err(const struct btrfs_root *root,
56 const struct extent_buffer *eb, int slot,
57 const char *fmt, ...)
58{
59 struct va_format vaf;
60 va_list args;
61
62 va_start(args, fmt);
63
64 vaf.fmt = fmt;
65 vaf.va = &args;
66
67 btrfs_crit(root->fs_info,
68 "corrupt %s: root=%llu block=%llu slot=%d, %pV",
69 btrfs_header_level(eb) == 0 ? "leaf" : "node",
70 root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
71 va_end(args);
72}
73
74/*
75 * Customized reporter for extent data item, since its key objectid and
 76 * offset have their own meaning.
77 */
78__printf(4, 5)
79static void file_extent_err(const struct btrfs_root *root,
80 const struct extent_buffer *eb, int slot,
81 const char *fmt, ...)
82{
83 struct btrfs_key key;
84 struct va_format vaf;
85 va_list args;
86
87 btrfs_item_key_to_cpu(eb, &key, slot);
88 va_start(args, fmt);
89
90 vaf.fmt = fmt;
91 vaf.va = &args;
92
93 btrfs_crit(root->fs_info,
94 "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
95 btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
96 btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf);
97 va_end(args);
98}
99
100/*
 101 * Return 0 if the btrfs_file_extent_##name is aligned to @alignment;
 102 * else report the error and return 1.
103 */
104#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \
105({ \
106 if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
107 file_extent_err((root), (leaf), (slot), \
108 "invalid %s for file extent, have %llu, should be aligned to %u", \
109 (#name), btrfs_file_extent_##name((leaf), (fi)), \
110 (alignment)); \
111 (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
112})
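
/*
 * For illustration (not part of the patch), a single use such as
 * CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize)
 * expands to a statement expression equivalent to:
 *
 *	({
 *		if (!IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi),
 *				sectorsize))
 *			file_extent_err(root, leaf, slot,
 *	"invalid %s for file extent, have %llu, should be aligned to %u",
 *				"disk_bytenr",
 *				btrfs_file_extent_disk_bytenr(leaf, fi),
 *				sectorsize);
 *		(!IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi),
 *			     sectorsize));
 *	})
 *
 * It logs once and evaluates to 1 when the field is unaligned, 0
 * otherwise, which is what lets the callers below chain checks with ||.
 */
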
113
114static int check_extent_data_item(struct btrfs_root *root,
115 struct extent_buffer *leaf,
116 struct btrfs_key *key, int slot)
117{
118 struct btrfs_file_extent_item *fi;
119 u32 sectorsize = root->fs_info->sectorsize;
120 u32 item_size = btrfs_item_size_nr(leaf, slot);
121
122 if (!IS_ALIGNED(key->offset, sectorsize)) {
123 file_extent_err(root, leaf, slot,
124"unaligned file_offset for file extent, have %llu should be aligned to %u",
125 key->offset, sectorsize);
126 return -EUCLEAN;
127 }
128
129 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
130
131 if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
132 file_extent_err(root, leaf, slot,
133 "invalid type for file extent, have %u expect range [0, %u]",
134 btrfs_file_extent_type(leaf, fi),
135 BTRFS_FILE_EXTENT_TYPES);
136 return -EUCLEAN;
137 }
138
139 /*
 140 * Support for new compression/encryption must introduce an incompat flag,
141 * and must be caught in open_ctree().
142 */
143 if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
144 file_extent_err(root, leaf, slot,
145 "invalid compression for file extent, have %u expect range [0, %u]",
146 btrfs_file_extent_compression(leaf, fi),
147 BTRFS_COMPRESS_TYPES);
148 return -EUCLEAN;
149 }
150 if (btrfs_file_extent_encryption(leaf, fi)) {
151 file_extent_err(root, leaf, slot,
152 "invalid encryption for file extent, have %u expect 0",
153 btrfs_file_extent_encryption(leaf, fi));
154 return -EUCLEAN;
155 }
156 if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
157 /* Inline extent must have 0 as key offset */
158 if (key->offset) {
159 file_extent_err(root, leaf, slot,
160 "invalid file_offset for inline file extent, have %llu expect 0",
161 key->offset);
162 return -EUCLEAN;
163 }
164
165 /* Compressed inline extent has no on-disk size, skip it */
166 if (btrfs_file_extent_compression(leaf, fi) !=
167 BTRFS_COMPRESS_NONE)
168 return 0;
169
170 /* Uncompressed inline extent size must match item size */
171 if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
172 btrfs_file_extent_ram_bytes(leaf, fi)) {
173 file_extent_err(root, leaf, slot,
174 "invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
175 item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
176 btrfs_file_extent_ram_bytes(leaf, fi));
177 return -EUCLEAN;
178 }
179 return 0;
180 }
181
182 /* Regular or preallocated extent has fixed item size */
183 if (item_size != sizeof(*fi)) {
184 file_extent_err(root, leaf, slot,
185 "invalid item size for reg/prealloc file extent, have %u expect %zu",
186 item_size, sizeof(*fi));
187 return -EUCLEAN;
188 }
189 if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) ||
190 CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) ||
191 CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) ||
192 CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) ||
193 CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize))
194 return -EUCLEAN;
195 return 0;
196}
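
/*
 * Worked example for the uncompressed inline check above (exact offsets
 * assume the usual struct btrfs_file_extent_item layout):
 * BTRFS_FILE_EXTENT_INLINE_DATA_START is the offset of disk_bytenr, i.e.
 * 8 (generation) + 8 (ram_bytes) + 1 (compression) + 1 (encryption) +
 * 2 (other_encoding) + 1 (type) = 21 bytes, so an uncompressed inline
 * extent carrying 100 bytes of file data must have item_size == 121.
 */
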
197
198static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
199 struct btrfs_key *key, int slot)
200{
201 u32 sectorsize = root->fs_info->sectorsize;
202 u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy);
203
204 if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
205 generic_err(root, leaf, slot,
206 "invalid key objectid for csum item, have %llu expect %llu",
207 key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
208 return -EUCLEAN;
209 }
210 if (!IS_ALIGNED(key->offset, sectorsize)) {
211 generic_err(root, leaf, slot,
212 "unaligned key offset for csum item, have %llu should be aligned to %u",
213 key->offset, sectorsize);
214 return -EUCLEAN;
215 }
216 if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
217 generic_err(root, leaf, slot,
218 "unaligned item size for csum item, have %u should be aligned to %u",
219 btrfs_item_size_nr(leaf, slot), csumsize);
220 return -EUCLEAN;
221 }
222 return 0;
223}
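
/*
 * Worked example: with a 4096 byte sectorsize and the default crc32c
 * checksums (csumsize == 4), a csum item covering 16 blocks has an item
 * size of 16 * 4 == 64 bytes; a key offset of 4097 or an item size of 65
 * would each be rejected above with -EUCLEAN.
 */
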
224
225/*
226 * Common point to switch the item-specific validation.
227 */
228static int check_leaf_item(struct btrfs_root *root,
229 struct extent_buffer *leaf,
230 struct btrfs_key *key, int slot)
231{
232 int ret = 0;
233
234 switch (key->type) {
235 case BTRFS_EXTENT_DATA_KEY:
236 ret = check_extent_data_item(root, leaf, key, slot);
237 break;
238 case BTRFS_EXTENT_CSUM_KEY:
239 ret = check_csum_item(root, leaf, key, slot);
240 break;
241 }
242 return ret;
243}
244
245int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf)
246{
247 struct btrfs_fs_info *fs_info = root->fs_info;
 248 /* No valid key type is 0, so all keys should be larger than this key */
249 struct btrfs_key prev_key = {0, 0, 0};
250 struct btrfs_key key;
251 u32 nritems = btrfs_header_nritems(leaf);
252 int slot;
253
254 /*
 255 * Extent buffers from a relocation tree have an owner field that
 256 * corresponds to the subvolume tree they are based on. So just from an
 257 * extent buffer alone we cannot find out the id of the
 258 * corresponding subvolume tree, and thus cannot figure out whether the
 259 * extent buffer corresponds to the root of the relocation tree. So
260 * skip this check for relocation trees.
261 */
262 if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
263 struct btrfs_root *check_root;
264
265 key.objectid = btrfs_header_owner(leaf);
266 key.type = BTRFS_ROOT_ITEM_KEY;
267 key.offset = (u64)-1;
268
269 check_root = btrfs_get_fs_root(fs_info, &key, false);
270 /*
271 * The only reason we also check NULL here is that during
 272 * open_ctree() some roots have not yet been set up.
273 */
274 if (!IS_ERR_OR_NULL(check_root)) {
275 struct extent_buffer *eb;
276
277 eb = btrfs_root_node(check_root);
278 /* if leaf is the root, then it's fine */
279 if (leaf != eb) {
280 generic_err(check_root, leaf, 0,
281 "invalid nritems, have %u should not be 0 for non-root leaf",
282 nritems);
283 free_extent_buffer(eb);
284 return -EUCLEAN;
285 }
286 free_extent_buffer(eb);
287 }
288 return 0;
289 }
290
291 if (nritems == 0)
292 return 0;
293
294 /*
295 * Check the following things to make sure this is a good leaf, and
296 * leaf users won't need to bother with similar sanity checks:
297 *
298 * 1) key ordering
299 * 2) item offset and size
300 * No overlap, no hole, all inside the leaf.
301 * 3) item content
302 * If possible, do comprehensive sanity check.
303 * NOTE: All checks must only rely on the item data itself.
304 */
305 for (slot = 0; slot < nritems; slot++) {
306 u32 item_end_expected;
307 int ret;
308
309 btrfs_item_key_to_cpu(leaf, &key, slot);
310
311 /* Make sure the keys are in the right order */
312 if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
313 generic_err(root, leaf, slot,
314 "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
315 prev_key.objectid, prev_key.type,
316 prev_key.offset, key.objectid, key.type,
317 key.offset);
318 return -EUCLEAN;
319 }
320
321 /*
 322 * Make sure the offsets and ends are right; remember that the
323 * item data starts at the end of the leaf and grows towards the
324 * front.
325 */
326 if (slot == 0)
327 item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
328 else
329 item_end_expected = btrfs_item_offset_nr(leaf,
330 slot - 1);
331 if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
332 generic_err(root, leaf, slot,
333 "unexpected item end, have %u expect %u",
334 btrfs_item_end_nr(leaf, slot),
335 item_end_expected);
336 return -EUCLEAN;
337 }
338
339 /*
340 * Check to make sure that we don't point outside of the leaf,
 341 * just in case all the items are consistent with each other, but
342 * all point outside of the leaf.
343 */
344 if (btrfs_item_end_nr(leaf, slot) >
345 BTRFS_LEAF_DATA_SIZE(fs_info)) {
346 generic_err(root, leaf, slot,
347 "slot end outside of leaf, have %u expect range [0, %u]",
348 btrfs_item_end_nr(leaf, slot),
349 BTRFS_LEAF_DATA_SIZE(fs_info));
350 return -EUCLEAN;
351 }
352
 353 /* Also check that the item data does not overlap its btrfs_item header. */
354 if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
355 btrfs_item_ptr_offset(leaf, slot)) {
356 generic_err(root, leaf, slot,
357 "slot overlaps with its data, item end %lu data start %lu",
358 btrfs_item_nr_offset(slot) +
359 sizeof(struct btrfs_item),
360 btrfs_item_ptr_offset(leaf, slot));
361 return -EUCLEAN;
362 }
363
364 /* Check if the item size and content meet other criteria */
365 ret = check_leaf_item(root, leaf, &key, slot);
366 if (ret < 0)
367 return ret;
368
369 prev_key.objectid = key.objectid;
370 prev_key.type = key.type;
371 prev_key.offset = key.offset;
372 }
373
374 return 0;
375}
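
/*
 * Sketch of the leaf layout the offset checks above rely on (not from
 * the patch): item headers grow forward from the leaf header while item
 * data grows backward from the end of the block, so the data regions
 * must tile the tail of the leaf exactly:
 *
 *	[hdr][item 0][item 1]..[item N] -> free <- [data N]..[data 1][data 0]
 *
 *	item_end(0) == BTRFS_LEAF_DATA_SIZE(fs_info)
 *	item_end(n) == item_offset(n - 1)	(no holes, no overlaps)
 */
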
376
377int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
378{
379 unsigned long nr = btrfs_header_nritems(node);
380 struct btrfs_key key, next_key;
381 int slot;
382 u64 bytenr;
383 int ret = 0;
384
385 if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
386 btrfs_crit(root->fs_info,
387"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
388 root->objectid, node->start,
389 nr == 0 ? "small" : "large", nr,
390 BTRFS_NODEPTRS_PER_BLOCK(root->fs_info));
391 return -EUCLEAN;
392 }
393
394 for (slot = 0; slot < nr - 1; slot++) {
395 bytenr = btrfs_node_blockptr(node, slot);
396 btrfs_node_key_to_cpu(node, &key, slot);
397 btrfs_node_key_to_cpu(node, &next_key, slot + 1);
398
399 if (!bytenr) {
400 generic_err(root, node, slot,
401 "invalid NULL node pointer");
402 ret = -EUCLEAN;
403 goto out;
404 }
405 if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) {
406 generic_err(root, node, slot,
407 "unaligned pointer, have %llu should be aligned to %u",
408 bytenr, root->fs_info->sectorsize);
409 ret = -EUCLEAN;
410 goto out;
411 }
412
413 if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
414 generic_err(root, node, slot,
415 "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
416 key.objectid, key.type, key.offset,
417 next_key.objectid, next_key.type,
418 next_key.offset);
419 ret = -EUCLEAN;
420 goto out;
421 }
422 }
423out:
424 return ret;
425}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000000..96c486e95d70
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) Qu Wenruo 2017. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program.
15 */
16
17#ifndef __BTRFS_TREE_CHECKER__
18#define __BTRFS_TREE_CHECKER__
19
20#include "ctree.h"
21#include "extent_io.h"
22
23int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf);
24int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
25
26#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c800d067fcbf..aa7c71cff575 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -717,7 +717,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
717 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 717 ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
718 ins.offset); 718 ins.offset);
719 if (ret == 0) { 719 if (ret == 0) {
720 ret = btrfs_inc_extent_ref(trans, fs_info, 720 ret = btrfs_inc_extent_ref(trans, root,
721 ins.objectid, ins.offset, 721 ins.objectid, ins.offset,
722 0, root->root_key.objectid, 722 0, root->root_key.objectid,
723 key->objectid, offset); 723 key->objectid, offset);
@@ -2699,34 +2699,36 @@ static void wait_log_commit(struct btrfs_root *root, int transid)
2699 * so we know that if ours is more than 2 older than the 2699 * so we know that if ours is more than 2 older than the
2700 * current transaction, we're done 2700 * current transaction, we're done
2701 */ 2701 */
2702 do { 2702 for (;;) {
2703 prepare_to_wait(&root->log_commit_wait[index], 2703 prepare_to_wait(&root->log_commit_wait[index],
2704 &wait, TASK_UNINTERRUPTIBLE); 2704 &wait, TASK_UNINTERRUPTIBLE);
2705 mutex_unlock(&root->log_mutex);
2706 2705
2707 if (root->log_transid_committed < transid && 2706 if (!(root->log_transid_committed < transid &&
2708 atomic_read(&root->log_commit[index])) 2707 atomic_read(&root->log_commit[index])))
2709 schedule(); 2708 break;
2710 2709
2711 finish_wait(&root->log_commit_wait[index], &wait); 2710 mutex_unlock(&root->log_mutex);
2711 schedule();
2712 mutex_lock(&root->log_mutex); 2712 mutex_lock(&root->log_mutex);
2713 } while (root->log_transid_committed < transid && 2713 }
2714 atomic_read(&root->log_commit[index])); 2714 finish_wait(&root->log_commit_wait[index], &wait);
2715} 2715}
2716 2716
2717static void wait_for_writer(struct btrfs_root *root) 2717static void wait_for_writer(struct btrfs_root *root)
2718{ 2718{
2719 DEFINE_WAIT(wait); 2719 DEFINE_WAIT(wait);
2720 2720
2721 while (atomic_read(&root->log_writers)) { 2721 for (;;) {
2722 prepare_to_wait(&root->log_writer_wait, 2722 prepare_to_wait(&root->log_writer_wait, &wait,
2723 &wait, TASK_UNINTERRUPTIBLE); 2723 TASK_UNINTERRUPTIBLE);
2724 if (!atomic_read(&root->log_writers))
2725 break;
2726
2724 mutex_unlock(&root->log_mutex); 2727 mutex_unlock(&root->log_mutex);
2725 if (atomic_read(&root->log_writers)) 2728 schedule();
2726 schedule();
2727 finish_wait(&root->log_writer_wait, &wait);
2728 mutex_lock(&root->log_mutex); 2729 mutex_lock(&root->log_mutex);
2729 } 2730 }
2731 finish_wait(&root->log_writer_wait, &wait);
2730} 2732}
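
/*
 * The interleaved columns above are hard to follow; reconstructed from
 * the new (right-hand) side, wait_for_writer() now re-checks the
 * condition after prepare_to_wait() and before sleeping, closing the
 * missed-wakeup window of the old loop:
 */
static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&root->log_writers))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}
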
2731 2733
2732static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2734static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
@@ -4645,7 +4647,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4645 struct btrfs_key min_key; 4647 struct btrfs_key min_key;
4646 struct btrfs_key max_key; 4648 struct btrfs_key max_key;
4647 struct btrfs_root *log = root->log_root; 4649 struct btrfs_root *log = root->log_root;
4648 struct extent_buffer *src = NULL;
4649 LIST_HEAD(logged_list); 4650 LIST_HEAD(logged_list);
4650 u64 last_extent = 0; 4651 u64 last_extent = 0;
4651 int err = 0; 4652 int err = 0;
@@ -4888,7 +4889,6 @@ again:
4888 goto next_slot; 4889 goto next_slot;
4889 } 4890 }
4890 4891
4891 src = path->nodes[0];
4892 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4892 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
4893 ins_nr++; 4893 ins_nr++;
4894 goto next_slot; 4894 goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b39737568c22..f1ecb938ba4d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -360,7 +360,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
360 int again = 0; 360 int again = 0;
361 unsigned long num_run; 361 unsigned long num_run;
362 unsigned long batch_run = 0; 362 unsigned long batch_run = 0;
363 unsigned long limit;
364 unsigned long last_waited = 0; 363 unsigned long last_waited = 0;
365 int force_reg = 0; 364 int force_reg = 0;
366 int sync_pending = 0; 365 int sync_pending = 0;
@@ -375,8 +374,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
375 blk_start_plug(&plug); 374 blk_start_plug(&plug);
376 375
377 bdi = device->bdev->bd_bdi; 376 bdi = device->bdev->bd_bdi;
378 limit = btrfs_async_submit_limit(fs_info);
379 limit = limit * 2 / 3;
380 377
381loop: 378loop:
382 spin_lock(&device->io_lock); 379 spin_lock(&device->io_lock);
@@ -443,13 +440,6 @@ loop_lock:
443 pending = pending->bi_next; 440 pending = pending->bi_next;
444 cur->bi_next = NULL; 441 cur->bi_next = NULL;
445 442
446 /*
447 * atomic_dec_return implies a barrier for waitqueue_active
448 */
449 if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
450 waitqueue_active(&fs_info->async_submit_wait))
451 wake_up(&fs_info->async_submit_wait);
452
453 BUG_ON(atomic_read(&cur->__bi_cnt) == 0); 443 BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
454 444
455 /* 445 /*
@@ -517,12 +507,6 @@ loop_lock:
517 &device->work); 507 &device->work);
518 goto done; 508 goto done;
519 } 509 }
520 /* unplug every 64 requests just for good measure */
521 if (batch_run % 64 == 0) {
522 blk_finish_plug(&plug);
523 blk_start_plug(&plug);
524 sync_pending = 0;
525 }
526 } 510 }
527 511
528 cond_resched(); 512 cond_resched();
@@ -547,7 +531,7 @@ static void pending_bios_fn(struct btrfs_work *work)
547} 531}
548 532
549 533
550void btrfs_free_stale_device(struct btrfs_device *cur_dev) 534static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
551{ 535{
552 struct btrfs_fs_devices *fs_devs; 536 struct btrfs_fs_devices *fs_devs;
553 struct btrfs_device *dev; 537 struct btrfs_device *dev;
@@ -1068,14 +1052,15 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1068 return ret; 1052 return ret;
1069} 1053}
1070 1054
1071void btrfs_release_disk_super(struct page *page) 1055static void btrfs_release_disk_super(struct page *page)
1072{ 1056{
1073 kunmap(page); 1057 kunmap(page);
1074 put_page(page); 1058 put_page(page);
1075} 1059}
1076 1060
1077int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1061static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1078 struct page **page, struct btrfs_super_block **disk_super) 1062 struct page **page,
1063 struct btrfs_super_block **disk_super)
1079{ 1064{
1080 void *p; 1065 void *p;
1081 pgoff_t index; 1066 pgoff_t index;
@@ -1817,8 +1802,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1817 return 0; 1802 return 0;
1818} 1803}
1819 1804
1820struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, 1805static struct btrfs_device * btrfs_find_next_active_device(
1821 struct btrfs_device *device) 1806 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1822{ 1807{
1823 struct btrfs_device *next_device; 1808 struct btrfs_device *next_device;
1824 1809
@@ -2031,19 +2016,20 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2031 } 2016 }
2032 2017
2033 btrfs_close_bdev(srcdev); 2018 btrfs_close_bdev(srcdev);
2034
2035 call_rcu(&srcdev->rcu, free_device); 2019 call_rcu(&srcdev->rcu, free_device);
2036 2020
2037 /*
2038 * unless fs_devices is seed fs, num_devices shouldn't go
2039 * zero
2040 */
2041 BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
2042
 2043 /* if there are no devs we'd rather delete the fs_devices */ 2021
2044 if (!fs_devices->num_devices) { 2022 if (!fs_devices->num_devices) {
2045 struct btrfs_fs_devices *tmp_fs_devices; 2023 struct btrfs_fs_devices *tmp_fs_devices;
2046 2024
2025 /*
2026 * On a mounted FS, num_devices can't be zero unless it's a
2027 * seed. In case of a seed device being replaced, the replace
 2028 * target is added to the sprout FS, so there will be no more
 2029 * devices left under the seed FS.
2030 */
2031 ASSERT(fs_devices->seeding);
2032
2047 tmp_fs_devices = fs_info->fs_devices; 2033 tmp_fs_devices = fs_info->fs_devices;
2048 while (tmp_fs_devices) { 2034 while (tmp_fs_devices) {
2049 if (tmp_fs_devices->seed == fs_devices) { 2035 if (tmp_fs_devices->seed == fs_devices) {
@@ -2323,6 +2309,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2323 u64 tmp; 2309 u64 tmp;
2324 int seeding_dev = 0; 2310 int seeding_dev = 0;
2325 int ret = 0; 2311 int ret = 0;
2312 bool unlocked = false;
2326 2313
2327 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) 2314 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2328 return -EROFS; 2315 return -EROFS;
@@ -2399,7 +2386,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2399 if (seeding_dev) { 2386 if (seeding_dev) {
2400 sb->s_flags &= ~MS_RDONLY; 2387 sb->s_flags &= ~MS_RDONLY;
2401 ret = btrfs_prepare_sprout(fs_info); 2388 ret = btrfs_prepare_sprout(fs_info);
2402 BUG_ON(ret); /* -ENOMEM */ 2389 if (ret) {
2390 btrfs_abort_transaction(trans, ret);
2391 goto error_trans;
2392 }
2403 } 2393 }
2404 2394
2405 device->fs_devices = fs_info->fs_devices; 2395 device->fs_devices = fs_info->fs_devices;
@@ -2445,14 +2435,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2445 mutex_unlock(&fs_info->chunk_mutex); 2435 mutex_unlock(&fs_info->chunk_mutex);
2446 if (ret) { 2436 if (ret) {
2447 btrfs_abort_transaction(trans, ret); 2437 btrfs_abort_transaction(trans, ret);
2448 goto error_trans; 2438 goto error_sysfs;
2449 } 2439 }
2450 } 2440 }
2451 2441
2452 ret = btrfs_add_device(trans, fs_info, device); 2442 ret = btrfs_add_device(trans, fs_info, device);
2453 if (ret) { 2443 if (ret) {
2454 btrfs_abort_transaction(trans, ret); 2444 btrfs_abort_transaction(trans, ret);
2455 goto error_trans; 2445 goto error_sysfs;
2456 } 2446 }
2457 2447
2458 if (seeding_dev) { 2448 if (seeding_dev) {
@@ -2461,7 +2451,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2461 ret = btrfs_finish_sprout(trans, fs_info); 2451 ret = btrfs_finish_sprout(trans, fs_info);
2462 if (ret) { 2452 if (ret) {
2463 btrfs_abort_transaction(trans, ret); 2453 btrfs_abort_transaction(trans, ret);
2464 goto error_trans; 2454 goto error_sysfs;
2465 } 2455 }
2466 2456
2467 /* Sprouting would change fsid of the mounted root, 2457 /* Sprouting would change fsid of the mounted root,
@@ -2479,6 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2479 if (seeding_dev) { 2469 if (seeding_dev) {
2480 mutex_unlock(&uuid_mutex); 2470 mutex_unlock(&uuid_mutex);
2481 up_write(&sb->s_umount); 2471 up_write(&sb->s_umount);
2472 unlocked = true;
2482 2473
2483 if (ret) /* transaction commit */ 2474 if (ret) /* transaction commit */
2484 return ret; 2475 return ret;
@@ -2491,7 +2482,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2491 if (IS_ERR(trans)) { 2482 if (IS_ERR(trans)) {
2492 if (PTR_ERR(trans) == -ENOENT) 2483 if (PTR_ERR(trans) == -ENOENT)
2493 return 0; 2484 return 0;
2494 return PTR_ERR(trans); 2485 ret = PTR_ERR(trans);
2486 trans = NULL;
2487 goto error_sysfs;
2495 } 2488 }
2496 ret = btrfs_commit_transaction(trans); 2489 ret = btrfs_commit_transaction(trans);
2497 } 2490 }
@@ -2500,14 +2493,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2500 update_dev_time(device_path); 2493 update_dev_time(device_path);
2501 return ret; 2494 return ret;
2502 2495
2496error_sysfs:
2497 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2503error_trans: 2498error_trans:
2504 btrfs_end_transaction(trans); 2499 if (seeding_dev)
2500 sb->s_flags |= MS_RDONLY;
2501 if (trans)
2502 btrfs_end_transaction(trans);
2505 rcu_string_free(device->name); 2503 rcu_string_free(device->name);
2506 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2507 kfree(device); 2504 kfree(device);
2508error: 2505error:
2509 blkdev_put(bdev, FMODE_EXCL); 2506 blkdev_put(bdev, FMODE_EXCL);
2510 if (seeding_dev) { 2507 if (seeding_dev && !unlocked) {
2511 mutex_unlock(&uuid_mutex); 2508 mutex_unlock(&uuid_mutex);
2512 up_write(&sb->s_umount); 2509 up_write(&sb->s_umount);
2513 } 2510 }
@@ -4813,16 +4810,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4813 em_tree = &info->mapping_tree.map_tree; 4810 em_tree = &info->mapping_tree.map_tree;
4814 write_lock(&em_tree->lock); 4811 write_lock(&em_tree->lock);
4815 ret = add_extent_mapping(em_tree, em, 0); 4812 ret = add_extent_mapping(em_tree, em, 0);
4816 if (!ret) {
4817 list_add_tail(&em->list, &trans->transaction->pending_chunks);
4818 refcount_inc(&em->refs);
4819 }
4820 write_unlock(&em_tree->lock);
4821 if (ret) { 4813 if (ret) {
4814 write_unlock(&em_tree->lock);
4822 free_extent_map(em); 4815 free_extent_map(em);
4823 goto error; 4816 goto error;
4824 } 4817 }
4825 4818
4819 list_add_tail(&em->list, &trans->transaction->pending_chunks);
4820 refcount_inc(&em->refs);
4821 write_unlock(&em_tree->lock);
4822
4826 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); 4823 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
4827 if (ret) 4824 if (ret)
4828 goto error_del_extent; 4825 goto error_del_extent;
@@ -5695,10 +5692,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5695 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5692 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5696 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5693 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5697 &stripe_index); 5694 &stripe_index);
5698 if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) 5695 if (!need_full_stripe(op))
5699 mirror_num = 1; 5696 mirror_num = 1;
5700 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5697 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5701 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) 5698 if (need_full_stripe(op))
5702 num_stripes = map->num_stripes; 5699 num_stripes = map->num_stripes;
5703 else if (mirror_num) 5700 else if (mirror_num)
5704 stripe_index = mirror_num - 1; 5701 stripe_index = mirror_num - 1;
@@ -5711,7 +5708,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5711 } 5708 }
5712 5709
5713 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5710 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5714 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { 5711 if (need_full_stripe(op)) {
5715 num_stripes = map->num_stripes; 5712 num_stripes = map->num_stripes;
5716 } else if (mirror_num) { 5713 } else if (mirror_num) {
5717 stripe_index = mirror_num - 1; 5714 stripe_index = mirror_num - 1;
@@ -5725,7 +5722,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5725 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5722 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5726 stripe_index *= map->sub_stripes; 5723 stripe_index *= map->sub_stripes;
5727 5724
5728 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) 5725 if (need_full_stripe(op))
5729 num_stripes = map->sub_stripes; 5726 num_stripes = map->sub_stripes;
5730 else if (mirror_num) 5727 else if (mirror_num)
5731 stripe_index += mirror_num - 1; 5728 stripe_index += mirror_num - 1;
@@ -5740,9 +5737,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5740 } 5737 }
5741 5738
5742 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5739 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5743 if (need_raid_map && 5740 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
5744 (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
5745 mirror_num > 1)) {
5746 /* push stripe_nr back to the start of the full stripe */ 5741 /* push stripe_nr back to the start of the full stripe */
5747 stripe_nr = div64_u64(raid56_full_stripe_start, 5742 stripe_nr = div64_u64(raid56_full_stripe_start,
5748 stripe_len * nr_data_stripes(map)); 5743 stripe_len * nr_data_stripes(map));
@@ -5769,9 +5764,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5769 /* We distribute the parity blocks across stripes */ 5764 /* We distribute the parity blocks across stripes */
5770 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5765 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
5771 &stripe_index); 5766 &stripe_index);
5772 if ((op != BTRFS_MAP_WRITE && 5767 if (!need_full_stripe(op) && mirror_num <= 1)
5773 op != BTRFS_MAP_GET_READ_MIRRORS) &&
5774 mirror_num <= 1)
5775 mirror_num = 1; 5768 mirror_num = 1;
5776 } 5769 }
5777 } else { 5770 } else {
@@ -6033,7 +6026,7 @@ static void btrfs_end_bio(struct bio *bio)
6033 * this bio is actually up to date, we didn't 6026 * this bio is actually up to date, we didn't
6034 * go over the max number of errors 6027 * go over the max number of errors
6035 */ 6028 */
6036 bio->bi_status = 0; 6029 bio->bi_status = BLK_STS_OK;
6037 } 6030 }
6038 6031
6039 btrfs_end_bbio(bbio, bio); 6032 btrfs_end_bbio(bbio, bio);
@@ -6069,13 +6062,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6069 return; 6062 return;
6070 } 6063 }
6071 6064
6072 /*
6073 * nr_async_bios allows us to reliably return congestion to the
6074 * higher layers. Otherwise, the async bio makes it appear we have
6075 * made progress against dirty pages when we've really just put it
6076 * on a queue for later
6077 */
6078 atomic_inc(&fs_info->nr_async_bios);
6079 WARN_ON(bio->bi_next); 6065 WARN_ON(bio->bi_next);
6080 bio->bi_next = NULL; 6066 bio->bi_next = NULL;
6081 6067
@@ -6144,7 +6130,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6144 6130
6145 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6131 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6146 bio->bi_iter.bi_sector = logical >> 9; 6132 bio->bi_iter.bi_sector = logical >> 9;
6147 bio->bi_status = BLK_STS_IOERR; 6133 if (atomic_read(&bbio->error) > bbio->max_errors)
6134 bio->bi_status = BLK_STS_IOERR;
6135 else
6136 bio->bi_status = BLK_STS_OK;
6148 btrfs_end_bbio(bbio, bio); 6137 btrfs_end_bbio(bbio, bio);
6149 } 6138 }
6150} 6139}
@@ -6249,7 +6238,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6249 6238
6250 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6239 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6251 if (IS_ERR(device)) 6240 if (IS_ERR(device))
6252 return NULL; 6241 return device;
6253 6242
6254 list_add(&device->dev_list, &fs_devices->devices); 6243 list_add(&device->dev_list, &fs_devices->devices);
6255 device->fs_devices = fs_devices; 6244 device->fs_devices = fs_devices;
@@ -6377,6 +6366,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
6377 return 0; 6366 return 0;
6378} 6367}
6379 6368
6369static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6370 u64 devid, u8 *uuid, bool error)
6371{
6372 if (error)
6373 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6374 devid, uuid);
6375 else
6376 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6377 devid, uuid);
6378}
6379
6380static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6380static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6381 struct extent_buffer *leaf, 6381 struct extent_buffer *leaf,
6382 struct btrfs_chunk *chunk) 6382 struct btrfs_chunk *chunk)
@@ -6447,18 +6447,21 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6447 if (!map->stripes[i].dev && 6447 if (!map->stripes[i].dev &&
6448 !btrfs_test_opt(fs_info, DEGRADED)) { 6448 !btrfs_test_opt(fs_info, DEGRADED)) {
6449 free_extent_map(em); 6449 free_extent_map(em);
6450 btrfs_report_missing_device(fs_info, devid, uuid); 6450 btrfs_report_missing_device(fs_info, devid, uuid, true);
6451 return -EIO; 6451 return -ENOENT;
6452 } 6452 }
6453 if (!map->stripes[i].dev) { 6453 if (!map->stripes[i].dev) {
6454 map->stripes[i].dev = 6454 map->stripes[i].dev =
6455 add_missing_dev(fs_info->fs_devices, devid, 6455 add_missing_dev(fs_info->fs_devices, devid,
6456 uuid); 6456 uuid);
6457 if (!map->stripes[i].dev) { 6457 if (IS_ERR(map->stripes[i].dev)) {
6458 free_extent_map(em); 6458 free_extent_map(em);
6459 return -EIO; 6459 btrfs_err(fs_info,
6460 "failed to init missing dev %llu: %ld",
6461 devid, PTR_ERR(map->stripes[i].dev));
6462 return PTR_ERR(map->stripes[i].dev);
6460 } 6463 }
6461 btrfs_report_missing_device(fs_info, devid, uuid); 6464 btrfs_report_missing_device(fs_info, devid, uuid, false);
6462 } 6465 }
6463 map->stripes[i].dev->in_fs_metadata = 1; 6466 map->stripes[i].dev->in_fs_metadata = 1;
6464 } 6467 }
@@ -6577,19 +6580,28 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
6577 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6580 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
6578 if (!device) { 6581 if (!device) {
6579 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6582 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6580 btrfs_report_missing_device(fs_info, devid, dev_uuid); 6583 btrfs_report_missing_device(fs_info, devid,
6581 return -EIO; 6584 dev_uuid, true);
6585 return -ENOENT;
6582 } 6586 }
6583 6587
6584 device = add_missing_dev(fs_devices, devid, dev_uuid); 6588 device = add_missing_dev(fs_devices, devid, dev_uuid);
6585 if (!device) 6589 if (IS_ERR(device)) {
6586 return -ENOMEM; 6590 btrfs_err(fs_info,
6587 btrfs_report_missing_device(fs_info, devid, dev_uuid); 6591 "failed to add missing dev %llu: %ld",
6592 devid, PTR_ERR(device));
6593 return PTR_ERR(device);
6594 }
6595 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6588 } else { 6596 } else {
6589 if (!device->bdev) { 6597 if (!device->bdev) {
6590 btrfs_report_missing_device(fs_info, devid, dev_uuid); 6598 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6591 if (!btrfs_test_opt(fs_info, DEGRADED)) 6599 btrfs_report_missing_device(fs_info,
6592 return -EIO; 6600 devid, dev_uuid, true);
6601 return -ENOENT;
6602 }
6603 btrfs_report_missing_device(fs_info, devid,
6604 dev_uuid, false);
6593 } 6605 }
6594 6606
6595 if(!device->bdev && !device->missing) { 6607 if(!device->bdev && !device->missing) {
@@ -6756,12 +6768,6 @@ out_short_read:
6756 return -EIO; 6768 return -EIO;
6757} 6769}
6758 6770
6759void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
6760 u8 *uuid)
6761{
6762 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid);
6763}
6764
6765/* 6771/*
6766 * Check if all chunks in the fs are OK for read-write degraded mount 6772 * Check if all chunks in the fs are OK for read-write degraded mount
6767 * 6773 *
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6108fdfec67f..ff15208344a7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -542,7 +542,5 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
542void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); 542void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
543 543
544bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); 544bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
545void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
546 u8 *uuid);
547 545
548#endif 546#endif
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c248f9286366..2b52950dc2c6 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -37,6 +37,7 @@ struct workspace {
37 z_stream strm; 37 z_stream strm;
38 char *buf; 38 char *buf;
39 struct list_head list; 39 struct list_head list;
40 int level;
40}; 41};
41 42
42static void zlib_free_workspace(struct list_head *ws) 43static void zlib_free_workspace(struct list_head *ws)
@@ -96,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
96 *total_out = 0; 97 *total_out = 0;
97 *total_in = 0; 98 *total_in = 0;
98 99
99 if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { 100 if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) {
100 pr_warn("BTRFS: deflateInit failed\n"); 101 pr_warn("BTRFS: deflateInit failed\n");
101 ret = -EIO; 102 ret = -EIO;
102 goto out; 103 goto out;
@@ -402,10 +403,22 @@ next:
402 return ret; 403 return ret;
403} 404}
404 405
406static void zlib_set_level(struct list_head *ws, unsigned int type)
407{
408 struct workspace *workspace = list_entry(ws, struct workspace, list);
409 unsigned level = (type & 0xF0) >> 4;
410
411 if (level > 9)
412 level = 9;
413
414 workspace->level = level > 0 ? level : 3;
415}
416
405const struct btrfs_compress_op btrfs_zlib_compress = { 417const struct btrfs_compress_op btrfs_zlib_compress = {
406 .alloc_workspace = zlib_alloc_workspace, 418 .alloc_workspace = zlib_alloc_workspace,
407 .free_workspace = zlib_free_workspace, 419 .free_workspace = zlib_free_workspace,
408 .compress_pages = zlib_compress_pages, 420 .compress_pages = zlib_compress_pages,
409 .decompress_bio = zlib_decompress_bio, 421 .decompress_bio = zlib_decompress_bio,
410 .decompress = zlib_decompress, 422 .decompress = zlib_decompress,
423 .set_level = zlib_set_level,
411}; 424};
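
The level reaches zlib_set_level() packed into the upper nibble of the compression type byte (the encoding side is outside this hunk, so treat that packing detail as an assumption inferred from the extraction above). A minimal sketch of the decoding for -o compress=zlib:9:

        unsigned int type  = 0x91;              /* illustrative: zlib in the low nibble, level 9 in the high */
        unsigned int level = (type & 0xF0) >> 4;        /* -> 9 */

        if (level > 9)
                level = 9;      /* clamp to zlib's maximum */
        if (level == 0)
                level = 3;      /* no level requested: keep the old default of 3 */
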
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 607ce47b483a..17f2dd8fddb8 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -423,10 +423,15 @@ finish:
423 return ret; 423 return ret;
424} 424}
425 425
426static void zstd_set_level(struct list_head *ws, unsigned int type)
427{
428}
429
426const struct btrfs_compress_op btrfs_zstd_compress = { 430const struct btrfs_compress_op btrfs_zstd_compress = {
427 .alloc_workspace = zstd_alloc_workspace, 431 .alloc_workspace = zstd_alloc_workspace,
428 .free_workspace = zstd_free_workspace, 432 .free_workspace = zstd_free_workspace,
429 .compress_pages = zstd_compress_pages, 433 .compress_pages = zstd_compress_pages,
430 .decompress_bio = zstd_decompress_bio, 434 .decompress_bio = zstd_decompress_bio,
431 .decompress = zstd_decompress, 435 .decompress = zstd_decompress,
436 .set_level = zstd_set_level,
432}; 437};
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 32d0c1fe2bfa..4342a329821f 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -29,6 +29,13 @@ struct btrfs_qgroup_extent_record;
29struct btrfs_qgroup; 29struct btrfs_qgroup;
30struct prelim_ref; 30struct prelim_ref;
31 31
32TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR);
33TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS);
34TRACE_DEFINE_ENUM(FLUSH_DELALLOC);
35TRACE_DEFINE_ENUM(FLUSH_DELALLOC_WAIT);
36TRACE_DEFINE_ENUM(ALLOC_CHUNK);
37TRACE_DEFINE_ENUM(COMMIT_TRANS);
38
32#define show_ref_type(type) \ 39#define show_ref_type(type) \
33 __print_symbolic(type, \ 40 __print_symbolic(type, \
34 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ 41 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \
@@ -792,11 +799,10 @@ DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref,
792DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, 799DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
793 800
794 TP_PROTO(const struct btrfs_fs_info *fs_info, 801 TP_PROTO(const struct btrfs_fs_info *fs_info,
795 const struct btrfs_delayed_ref_node *ref,
796 const struct btrfs_delayed_ref_head *head_ref, 802 const struct btrfs_delayed_ref_head *head_ref,
797 int action), 803 int action),
798 804
799 TP_ARGS(fs_info, ref, head_ref, action), 805 TP_ARGS(fs_info, head_ref, action),
800 806
801 TP_STRUCT__entry_btrfs( 807 TP_STRUCT__entry_btrfs(
802 __field( u64, bytenr ) 808 __field( u64, bytenr )
@@ -806,8 +812,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
806 ), 812 ),
807 813
808 TP_fast_assign_btrfs(fs_info, 814 TP_fast_assign_btrfs(fs_info,
809 __entry->bytenr = ref->bytenr; 815 __entry->bytenr = head_ref->bytenr;
810 __entry->num_bytes = ref->num_bytes; 816 __entry->num_bytes = head_ref->num_bytes;
811 __entry->action = action; 817 __entry->action = action;
812 __entry->is_data = head_ref->is_data; 818 __entry->is_data = head_ref->is_data;
813 ), 819 ),
@@ -822,21 +828,19 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
822DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, 828DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head,
823 829
824 TP_PROTO(const struct btrfs_fs_info *fs_info, 830 TP_PROTO(const struct btrfs_fs_info *fs_info,
825 const struct btrfs_delayed_ref_node *ref,
826 const struct btrfs_delayed_ref_head *head_ref, 831 const struct btrfs_delayed_ref_head *head_ref,
827 int action), 832 int action),
828 833
829 TP_ARGS(fs_info, ref, head_ref, action) 834 TP_ARGS(fs_info, head_ref, action)
830); 835);
831 836
832DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, 837DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head,
833 838
834 TP_PROTO(const struct btrfs_fs_info *fs_info, 839 TP_PROTO(const struct btrfs_fs_info *fs_info,
835 const struct btrfs_delayed_ref_node *ref,
836 const struct btrfs_delayed_ref_head *head_ref, 840 const struct btrfs_delayed_ref_head *head_ref,
837 int action), 841 int action),
838 842
839 TP_ARGS(fs_info, ref, head_ref, action) 843 TP_ARGS(fs_info, head_ref, action)
840); 844);
841 845
842#define show_chunk_type(type) \ 846#define show_chunk_type(type) \
@@ -1692,6 +1696,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert,
1692 TP_ARGS(fs_info, oldref, newref, tree_size) 1696 TP_ARGS(fs_info, oldref, newref, tree_size)
1693); 1697);
1694 1698
1699TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
1700 TP_PROTO(struct btrfs_root *root, u64 ino, int mod),
1701
1702 TP_ARGS(root, ino, mod),
1703
1704 TP_STRUCT__entry_btrfs(
1705 __field( u64, root_objectid )
1706 __field( u64, ino )
1707 __field( int, mod )
1708 ),
1709
1710 TP_fast_assign_btrfs(root->fs_info,
1711 __entry->root_objectid = root->objectid;
1712 __entry->ino = ino;
1713 __entry->mod = mod;
1714 ),
1715
1716 TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d",
1717 show_root_type(__entry->root_objectid),
1718 (unsigned long long)__entry->ino, __entry->mod)
1719);
1695#endif /* _TRACE_BTRFS_H */ 1720#endif /* _TRACE_BTRFS_H */
1696 1721
1697/* This part must be outside protection */ 1722/* This part must be outside protection */
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 6cdfd12cd14c..ce615b75e855 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -609,10 +609,14 @@ struct btrfs_ioctl_ino_path_args {
609struct btrfs_ioctl_logical_ino_args { 609struct btrfs_ioctl_logical_ino_args {
610 __u64 logical; /* in */ 610 __u64 logical; /* in */
611 __u64 size; /* in */ 611 __u64 size; /* in */
612 __u64 reserved[4]; 612 __u64 reserved[3]; /* must be 0 for now */
613 __u64 flags; /* in, v2 only */
613 /* struct btrfs_data_container *inodes; out */ 614 /* struct btrfs_data_container *inodes; out */
614 __u64 inodes; 615 __u64 inodes;
615}; 616};
617/* Return every ref to the extent, not just those containing logical block.
618 * Requires logical == extent bytenr. */
619#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0)
616 620
617enum btrfs_dev_stat_values { 621enum btrfs_dev_stat_values {
618 /* disk I/O failure stats */ 622 /* disk I/O failure stats */
@@ -836,5 +840,7 @@ enum btrfs_err_code {
836 struct btrfs_ioctl_feature_flags[3]) 840 struct btrfs_ioctl_feature_flags[3])
837#define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ 841#define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \
838 struct btrfs_ioctl_vol_args_v2) 842 struct btrfs_ioctl_vol_args_v2)
843#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \
844 struct btrfs_ioctl_logical_ino_args)
839 845
840#endif /* _UAPI_LINUX_BTRFS_H */ 846#endif /* _UAPI_LINUX_BTRFS_H */
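
A minimal userspace sketch of the new ioctl (the helper name and buffer size are illustrative; struct btrfs_data_container comes from the same uapi header):

        #include <stdint.h>
        #include <stdlib.h>
        #include <string.h>
        #include <sys/ioctl.h>
        #include <linux/btrfs.h>

        /*
         * Resolve every inode referencing the extent at @logical.
         * @fd is any open file descriptor on the filesystem.
         */
        static int logical_ino_v2(int fd, uint64_t logical)
        {
                struct btrfs_ioctl_logical_ino_args args;
                uint64_t bufsize = 16 * 1024 * 1024;
                struct btrfs_data_container *inodes = calloc(1, bufsize);
                int ret;

                if (!inodes)
                        return -1;

                memset(&args, 0, sizeof(args));
                args.logical = logical; /* must be the extent bytenr with this flag */
                args.size = bufsize;    /* v2 honours this as the output buffer size */
                args.flags = BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
                args.inodes = (uintptr_t)inodes;

                ret = ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, &args);
                /* on success, inodes->val[] holds (inum, offset, root) triples */
                free(inodes);
                return ret;
        }
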
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 8f659bb7badc..6d6e5da51527 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -733,6 +733,7 @@ struct btrfs_balance_item {
733#define BTRFS_FILE_EXTENT_INLINE 0 733#define BTRFS_FILE_EXTENT_INLINE 0
734#define BTRFS_FILE_EXTENT_REG 1 734#define BTRFS_FILE_EXTENT_REG 1
735#define BTRFS_FILE_EXTENT_PREALLOC 2 735#define BTRFS_FILE_EXTENT_PREALLOC 2
736#define BTRFS_FILE_EXTENT_TYPES 2
736 737
737struct btrfs_file_extent_item { 738struct btrfs_file_extent_item {
738 /* 739 /*