-rw-r--r--  fs/btrfs/async-thread.c | 11
-rw-r--r--  fs/btrfs/async-thread.h | 1
-rw-r--r--  fs/btrfs/backref.c | 123
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 27
-rw-r--r--  fs/btrfs/check-integrity.c | 18
-rw-r--r--  fs/btrfs/compression.c | 21
-rw-r--r--  fs/btrfs/ctree.c | 106
-rw-r--r--  fs/btrfs/ctree.h | 93
-rw-r--r--  fs/btrfs/delayed-inode.c | 8
-rw-r--r--  fs/btrfs/dev-replace.c | 82
-rw-r--r--  fs/btrfs/dir-item.c | 12
-rw-r--r--  fs/btrfs/disk-io.c | 284
-rw-r--r--  fs/btrfs/disk-io.h | 16
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 265
-rw-r--r--  fs/btrfs/extent_io.c | 483
-rw-r--r--  fs/btrfs/extent_io.h | 60
-rw-r--r--  fs/btrfs/file-item.c | 30
-rw-r--r--  fs/btrfs/file.c | 151
-rw-r--r--  fs/btrfs/free-space-cache.c | 157
-rw-r--r--  fs/btrfs/hash.c | 4
-rw-r--r--  fs/btrfs/inode-item.c | 12
-rw-r--r--  fs/btrfs/inode-map.c | 68
-rw-r--r--  fs/btrfs/inode.c | 648
-rw-r--r--  fs/btrfs/ioctl.c | 60
-rw-r--r--  fs/btrfs/lzo.c | 3
-rw-r--r--  fs/btrfs/orphan.c | 4
-rw-r--r--  fs/btrfs/print-tree.c | 3
-rw-r--r--  fs/btrfs/qgroup.c | 30
-rw-r--r--  fs/btrfs/raid56.c | 8
-rw-r--r--  fs/btrfs/reada.c | 2
-rw-r--r--  fs/btrfs/relocation.c | 142
-rw-r--r--  fs/btrfs/scrub.c | 67
-rw-r--r--  fs/btrfs/send.c | 47
-rw-r--r--  fs/btrfs/super.c | 137
-rw-r--r--  fs/btrfs/sysfs.c | 41
-rw-r--r--  fs/btrfs/sysfs.h | 16
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 516
-rw-r--r--  fs/btrfs/transaction.c | 52
-rw-r--r--  fs/btrfs/transaction.h | 2
-rw-r--r--  fs/btrfs/tree-log.c | 259
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/uuid-tree.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 676
-rw-r--r--  fs/btrfs/volumes.h | 166
-rw-r--r--  fs/btrfs/xattr.c | 4
-rw-r--r--  fs/btrfs/zlib.c | 141
-rw-r--r--  include/trace/events/btrfs.h | 85
49 files changed, 3617 insertions(+), 1534 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index fbd76ded9a34..4dabeb893b7c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper);
 BTRFS_WORK_HELPER(endio_meta_helper);
 BTRFS_WORK_HELPER(endio_meta_write_helper);
 BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
 BTRFS_WORK_HELPER(rmw_helper);
 BTRFS_WORK_HELPER(endio_write_helper);
 BTRFS_WORK_HELPER(freespace_write_helper);
@@ -91,7 +92,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active,
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->max_active = max_active;
@@ -115,7 +116,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active,
 	ret->normal_wq = alloc_workqueue("%s-%s", flags,
 					 ret->max_active, "btrfs",
 					 name);
-	if (unlikely(!ret->normal_wq)) {
+	if (!ret->normal_wq) {
 		kfree(ret);
 		return NULL;
 	}
@@ -137,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 {
 	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
 					      max_active, thresh);
-	if (unlikely(!ret->normal)) {
+	if (!ret->normal) {
 		kfree(ret);
 		return NULL;
 	}
@@ -150,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 	if (flags & WQ_HIGHPRI) {
 		ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
 						    thresh);
-		if (unlikely(!ret->high)) {
+		if (!ret->high) {
 			__btrfs_destroy_workqueue(ret->normal);
 			kfree(ret);
 			return NULL;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e9e31c94758f..e386c29ef1f6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper);
 BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
 BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
 BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
 BTRFS_WORK_HELPER_PROTO(rmw_helper);
 BTRFS_WORK_HELPER_PROTO(endio_write_helper);
 BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 54a201dac7f9..2d3e32ebfd15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
 struct extent_inode_elem {
 	u64 inum;
 	u64 offset;
@@ -377,7 +380,8 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos, u64 total_refs)
+				   const u64 *extent_item_pos, u64 total_refs,
+				   u64 root_objectid)
 {
 	int err;
 	int ret = 0;
@@ -402,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
+		if (root_objectid && ref->root_id != root_objectid) {
+			ret = BACKREF_FOUND_SHARED;
+			goto out;
+		}
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
 					     parents, extent_item_pos,
 					     total_refs);
@@ -482,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 			continue;
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
-				     fs_info->tree_root->leafsize, 0);
+				     0);
 		if (!eb || !extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
@@ -561,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs, u64 *total_refs)
+			      struct list_head *prefs, u64 *total_refs,
+			      u64 inum)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -625,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			key.objectid = ref->objectid;
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = ref->offset;
+
+			/*
+			 * Found a inum that doesn't match our known inum, we
+			 * know it's shared.
+			 */
+			if (inum && ref->objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
@@ -659,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
 			     int *info_level, struct list_head *prefs,
-			     u64 *total_refs)
+			     u64 *total_refs, u64 inum)
 {
 	int ret = 0;
 	int slot;
@@ -744,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 							      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -765,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    int info_level, struct list_head *prefs)
+			    int info_level, struct list_head *prefs, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -827,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 							      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -854,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
-			     struct ulist *roots, const u64 *extent_item_pos)
+			     struct ulist *roots, const u64 *extent_item_pos,
+			     u64 root_objectid, u64 inum)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -929,7 +961,8 @@ again:
 		}
 		spin_unlock(&delayed_refs->lock);
 		ret = __add_delayed_refs(head, time_seq,
-					 &prefs_delayed, &total_refs);
+					 &prefs_delayed, &total_refs,
+					 inum);
 		mutex_unlock(&head->mutex);
 		if (ret)
 			goto out;
@@ -951,11 +984,11 @@ again:
 	     key.type == BTRFS_METADATA_ITEM_KEY)) {
 		ret = __add_inline_refs(fs_info, path, bytenr,
 					&info_level, &prefs,
-					&total_refs);
+					&total_refs, inum);
 		if (ret)
 			goto out;
 		ret = __add_keyed_refs(fs_info, path, bytenr,
-				       info_level, &prefs);
+				       info_level, &prefs, inum);
 		if (ret)
 			goto out;
 	}
@@ -971,7 +1004,8 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos, total_refs);
+				      extent_item_pos, total_refs,
+				      root_objectid);
 	if (ret)
 		goto out;
 
@@ -981,6 +1015,11 @@ again:
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
+			if (root_objectid && ref->root_id != root_objectid) {
+				ret = BACKREF_FOUND_SHARED;
+				goto out;
+			}
+
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -989,12 +1028,10 @@ again:
 		if (ref->count && ref->parent) {
 			if (extent_item_pos && !ref->inode_list &&
 			    ref->level == 0) {
-				u32 bsz;
 				struct extent_buffer *eb;
-				bsz = btrfs_level_size(fs_info->extent_root,
-						       ref->level);
+
 				eb = read_tree_block(fs_info->extent_root,
-						     ref->parent, bsz, 0);
+						     ref->parent, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
@@ -1087,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, NULL, extent_item_pos);
+				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1130,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr,
-					time_seq, tmp, *roots, NULL);
+					time_seq, tmp, *roots, NULL, 0, 0);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1161,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr)
+{
+	struct ulist *tmp = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_iterator uiter;
+	struct ulist_node *node;
+	struct seq_list elem = {};
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	roots = ulist_alloc(GFP_NOFS);
+	if (!tmp || !roots) {
+		ulist_free(tmp);
+		ulist_free(roots);
+		return -ENOMEM;
+	}
+
+	if (trans)
+		btrfs_get_tree_mod_seq(fs_info, &elem);
+	else
+		down_read(&fs_info->commit_root_sem);
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+					roots, NULL, root_objectid, inum);
+		if (ret == BACKREF_FOUND_SHARED) {
+			ret = 1;
+			break;
+		}
+		if (ret < 0 && ret != -ENOENT)
+			break;
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+		cond_resched();
+	}
+	if (trans)
+		btrfs_put_tree_mod_seq(fs_info, &elem);
+	else
+		up_read(&fs_info->commit_root_sem);
+	ulist_free(tmp);
+	ulist_free(roots);
+	return ret;
+}
+
 /*
  * this makes the path point to (inum INODE_ITEM ioff)
  */
@@ -1193,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	unsigned long ptr;
 
 	key.objectid = inode_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = start_off;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1233,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 		ret = -ENOENT;
 		if (found_key.objectid != inode_objectid)
 			break;
-		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
 			break;
 
 		ret = 0;
@@ -1366,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	}
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
-		size = fs_info->extent_root->leafsize;
+		size = fs_info->extent_root->nodesize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
 		size = found_key->offset;
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20fec282..2a1ac6bfc724 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
 void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 56b8522d5767..4aadadcfab20 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS			11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR			12
+#define BTRFS_INODE_BTREE_LOG1_ERR		13
+#define BTRFS_INODE_BTREE_LOG2_ERR		14
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -121,6 +132,12 @@ struct btrfs_inode {
 	u64 delalloc_bytes;
 
 	/*
+	 * total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW.
+	 */
+	u64 defrag_bytes;
+
+	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
 	 * because not all the blocks are written yet.
@@ -248,8 +265,11 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
+
 struct btrfs_dio_private {
 	struct inode *inode;
+	unsigned long flags;
 	u64 logical_offset;
 	u64 disk_bytenr;
 	u64 bytes;
@@ -266,7 +286,12 @@ struct btrfs_dio_private {
 
 	/* dio_bio came from fs/direct-io.c */
 	struct bio *dio_bio;
-	u8 csum[0];
+
+	/*
+	 * The original bio may be splited to several sub-bios, this is
+	 * done during endio of sub-bios
+	 */
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae30250f..cb7f3fe9c9f6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
 	/* super block bytenr is always the unmapped device bytenr */
 	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
 		return -1;
 	bh = __bread(superblock_bdev, dev_bytenr / 4096,
 		     BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
 	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
-	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
 	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
 		brelse(bh);
 		return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
 
 	while (len > 0) {
 		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
-		BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
-		       PAGE_CACHE_SHIFT);
+		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
 		kaddr = block_ctx->datav[i];
 		memcpy(dst, kaddr + offset_in_page, cur);
 
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
 
-	if (root->nodesize != root->leafsize) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
-		       root->nodesize, root->leafsize);
-		return -1;
-	}
 	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
 		       root->nodesize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->leafsize, PAGE_CACHE_SIZE);
-		return -1;
-	}
 	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b47187..d3220d31d3cb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	return sizeof(struct compressed_bio) +
-		((disk_size + root->sectorsize - 1) / root->sectorsize) *
-		csum_size;
+		(DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
 }
 
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	 * freed before we're done setting it up
 	 */
 	atomic_inc(&cb->pending_bios);
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	}
 	bio_get(bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
-	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
-		   PAGE_CACHE_SIZE;
+	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
 	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		    PAGE_CACHE_SIZE) {
 			bio_get(comp_bio);
 
-			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
 
 			/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 							comp_bio, sums);
 				BUG_ON(ret); /* -ENOMEM */
 			}
-			sums += (comp_bio->bi_iter.bi_size +
-				 root->sectorsize - 1) / root->sectorsize;
+			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+					     root->sectorsize);
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	bio_get(comp_bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 44ee5d2e52a4..19bc6162fb8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(buf, &disk_key, 0);
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
-				     new_root_objectid, &disk_key, level,
-				     buf->start, 0);
+	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+			&disk_key, level, buf->start, 0);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else
 		parent_start = 0;
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
-				     root->root_key.objectid, &disk_key,
-				     level, search_start, empty_size);
+	cow = btrfs_alloc_tree_block(trans, root, parent_start,
+			root->root_key.objectid, &disk_key, level,
+			search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	struct tree_mod_root *old_root = NULL;
 	u64 old_generation = 0;
 	u64 logical;
-	u32 blocksize;
 
 	eb_root = btrfs_read_lock_root_node(root);
 	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
-		blocksize = btrfs_level_size(root, old_root->level);
-		old = read_tree_block(root, logical, blocksize, 0);
+		old = read_tree_block(root, logical, 0);
 		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
 			free_extent_buffer(old);
 			btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+	if (btrfs_test_is_dummy_root(root))
 		return 0;
-#endif
+
 	/* ensure we can see the force_cow */
 	smp_rmb();
 
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	WARN_ON(trans->transid != root->fs_info->generation);
 
 	parent_nritems = btrfs_header_nritems(parent);
-	blocksize = btrfs_level_size(root, parent_level - 1);
+	blocksize = root->nodesize;
 	end_slot = parent_nritems;
 
 	if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
-		cur = btrfs_find_tree_block(root, blocknr, blocksize);
+		cur = btrfs_find_tree_block(root, blocknr);
 		if (cur)
 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
 			if (!cur) {
-				cur = read_tree_block(root, blocknr,
-						      blocksize, gen);
+				cur = read_tree_block(root, blocknr, gen);
 				if (!cur || !extent_buffer_uptodate(cur)) {
 					free_extent_buffer(cur);
 					return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 	BUG_ON(level == 0);
 
 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
-			     btrfs_level_size(root, level - 1),
 			     btrfs_node_ptr_generation(parent, slot));
 	if (eb && !extent_buffer_uptodate(eb)) {
 		free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
 	node = path->nodes[level];
 
 	search = btrfs_node_blockptr(node, slot);
-	blocksize = btrfs_level_size(root, level - 1);
-	eb = btrfs_find_tree_block(root, search, blocksize);
+	blocksize = root->nodesize;
+	eb = btrfs_find_tree_block(root, search);
 	if (eb) {
 		free_extent_buffer(eb);
 		return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
 		if ((search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
 			gen = btrfs_node_ptr_generation(node, nr);
-			readahead_tree_block(root, search, blocksize, gen);
+			readahead_tree_block(root, search, blocksize);
 			nread += blocksize;
 		}
 		nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(parent);
 	slot = path->slots[level + 1];
-	blocksize = btrfs_level_size(root, level);
+	blocksize = root->nodesize;
 
 	if (slot > 0) {
 		block1 = btrfs_node_blockptr(parent, slot - 1);
 		gen = btrfs_node_ptr_generation(parent, slot - 1);
-		eb = btrfs_find_tree_block(root, block1, blocksize);
+		eb = btrfs_find_tree_block(root, block1);
 		/*
 		 * if we get -eagain from btrfs_buffer_uptodate, we
 		 * don't want to return eagain here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
-		eb = btrfs_find_tree_block(root, block2, blocksize);
+		eb = btrfs_find_tree_block(root, block2);
 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
 			block2 = 0;
 		free_extent_buffer(eb);
 	}
 
 	if (block1)
-		readahead_tree_block(root, block1, blocksize, 0);
+		readahead_tree_block(root, block1, blocksize);
 	if (block2)
-		readahead_tree_block(root, block2, blocksize, 0);
+		readahead_tree_block(root, block2, blocksize);
 }
 
 
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 {
 	u64 blocknr;
 	u64 gen;
-	u32 blocksize;
 	struct extent_buffer *b = *eb_ret;
 	struct extent_buffer *tmp;
 	int ret;
 
 	blocknr = btrfs_node_blockptr(b, slot);
 	gen = btrfs_node_ptr_generation(b, slot);
-	blocksize = btrfs_level_size(root, level - 1);
 
-	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+	tmp = btrfs_find_tree_block(root, blocknr);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_release_path(p);
 
 	ret = -EAGAIN;
-	tmp = read_tree_block(root, blocknr, blocksize, 0);
+	tmp = read_tree_block(root, blocknr, 0);
 	if (tmp) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
 			if (!should_cow_block(trans, root, b))
 				goto cow_done;
 
-			btrfs_set_path_blocking(p);
-
 			/*
 			 * must have write locks on this node and the
 			 * parent
@@ -2807,6 +2797,7 @@ again:
 				goto again;
 			}
 
+			btrfs_set_path_blocking(p);
 			err = btrfs_cow_block(trans, root, b,
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-				   root->root_key.objectid, &lower_key,
-				   level, root->node->start, 0);
+	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				   &lower_key, level, root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	mid = (c_nritems + 1) / 2;
 	btrfs_node_key(c, &disk_key, mid);
 
-	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-					root->root_key.objectid,
-					&disk_key, level, c->start, 0);
+	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+			&disk_key, level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -4282,13 +4271,12 @@ again:
 	else
 		btrfs_item_key(l, &disk_key, mid);
 
-	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-					root->root_key.objectid,
-					&disk_key, 0, l->start, 0);
+	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+			&disk_key, 0, l->start, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
-	root_add_used(root, root->leafsize);
+	root_add_used(root, root->nodesize);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 			ptr = btrfs_item_ptr_offset(leaf, slot);
 			memmove_extent_buffer(leaf, ptr,
 			      (unsigned long)fi,
-			      offsetof(struct btrfs_file_extent_item,
-				       disk_bytenr));
+			      BTRFS_FILE_EXTENT_INLINE_DATA_START);
 		}
 	}
 
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	int slot;
 	struct btrfs_map_token token;
 
+	if (path->slots[0] == 0) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+		fixup_low_keys(root, path, &disk_key, 1);
+	}
+	btrfs_unlock_up_safe(path, 1);
+
 	btrfs_init_map_token(&token);
 
 	leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	}
 
 	btrfs_set_header_nritems(leaf, nritems + nr);
-
-	if (slot == 0) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(root, path, &disk_key, 1);
-	}
-	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	u32 nritems;
 	int level;
 	int ret = 1;
+	int keep_locks = path->keep_locks;
 
-	WARN_ON(!path->keep_locks);
+	path->keep_locks = 1;
 again:
 	cur = btrfs_read_lock_root_node(root);
 	level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
 		path->slots[level] = slot;
 		if (level == path->lowest_level) {
 			ret = 0;
-			unlock_up(path, level, 1, 0, NULL);
 			goto out;
 		}
 		btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
 		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
-	if (ret == 0)
+	path->keep_locks = keep_locks;
+	if (ret == 0) {
+		btrfs_unlock_up_safe(path, path->lowest_level + 1);
+		btrfs_set_path_blocking(path);
 		memcpy(min_key, &found_key, sizeof(found_key));
-	btrfs_set_path_blocking(path);
+	}
 	return ret;
 }
 
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 		goto out;
 	}
 
-	tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+	tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
 	if (!tmp_buf) {
 		ret = -ENOMEM;
 		goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 				goto out;
 			advance_right = ADVANCE;
 		} else {
-			enum btrfs_compare_tree_result cmp;
+			enum btrfs_compare_tree_result result;
 
 			WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
 			ret = tree_compare_item(left_root, left_path,
 						right_path, tmp_buf);
 			if (ret)
-				cmp = BTRFS_COMPARE_TREE_CHANGED;
+				result = BTRFS_COMPARE_TREE_CHANGED;
 			else
-				cmp = BTRFS_COMPARE_TREE_SAME;
+				result = BTRFS_COMPARE_TREE_SAME;
 			ret = changed_cb(left_root, right_root,
 					 left_path, right_path,
-					 &left_key, cmp, ctx);
+					 &left_key, result, ctx);
 			if (ret < 0)
 				goto out;
 			advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e29b614fe93..d557264ee974 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
 #include <linux/pagemap.h>
 #include <linux/btrfs.h>
 #include <linux/workqueue.h>
+#include <linux/security.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_COMPAT_EXTENT_TREE_V0
 
-/*
- * files bigger than this get some pre-flushing when they are added
- * to the ordered operations list. That way we limit the total
- * work done by the commit
- */
-#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
-
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 
@@ -391,10 +385,12 @@ struct btrfs_header {
 				sizeof(struct btrfs_header)) / \
 				sizeof(struct btrfs_key_ptr))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START	\
+		(offsetof(struct btrfs_file_extent_item, disk_bytenr))
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
 				sizeof(struct btrfs_item) - \
-				sizeof(struct btrfs_file_extent_item))
+				BTRFS_FILE_EXTENT_INLINE_DATA_START)
 #define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
 				 sizeof(struct btrfs_item) -\
 				 sizeof(struct btrfs_dir_item))
@@ -474,7 +470,7 @@ struct btrfs_super_block {
 	__le64 num_devices;
 	__le32 sectorsize;
 	__le32 nodesize;
-	__le32 leafsize;
+	__le32 __unused_leafsize;
 	__le32 stripesize;
 	__le32 sys_chunk_array_size;
 	__le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
+	 *
+	 * At this offset in the structure, the inline extent data start.
 	 */
 	__le64 disk_bytenr;
 	__le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
 	 */
 	struct list_head cluster_list;
 
-	/* For delayed block group creation */
-	struct list_head new_bg_list;
+	/* For delayed block group creation or deletion of empty block groups */
+	struct list_head bg_list;
 };
 
1312/* delayed seq elem */ 1310/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *endio_workers;
 	struct btrfs_workqueue *endio_meta_workers;
 	struct btrfs_workqueue *endio_raid56_workers;
+	struct btrfs_workqueue *endio_repair_workers;
 	struct btrfs_workqueue *rmw_workers;
 	struct btrfs_workqueue *endio_meta_write_workers;
 	struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	int open;
 
 	u64 total_pinned;
 
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
 
 	/* Used to reclaim the metadata space in the background. */
 	struct work_struct async_reclaim_work;
+
+	spinlock_t unused_bgs_lock;
+	struct list_head unused_bgs;
+
+	/* For btrfs to record security options */
+	struct security_mnt_opts security_opts;
 };
 
 struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
 
 	/* free ino cache stuff */
 	struct btrfs_free_space_ctl *free_ino_ctl;
-	enum btrfs_caching_type cached;
-	spinlock_t cache_lock;
-	wait_queue_head_t cache_wait;
+	enum btrfs_caching_type ino_cache_state;
+	spinlock_t ino_cache_lock;
+	wait_queue_head_t ino_cache_wait;
 	struct btrfs_free_space_ctl *free_ino_pinned;
-	u64 cache_progress;
-	struct inode *cache_inode;
+	u64 ino_cache_progress;
+	struct inode *ino_cache_inode;
 
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
@@ -1806,18 +1812,14 @@ struct btrfs_root {
 	/* node allocations are done in nodesize units */
 	u32 nodesize;
 
-	/* leaf allocations are done in leafsize units */
-	u32 leafsize;
-
 	u32 stripesize;
 
 	u32 type;
 
 	u64 highest_objectid;
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	/* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
 	u64 alloc_bytenr;
-#endif
 
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHANGE_INODE_CACHE	(1 << 24)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
+#define BTRFS_DEFAULT_MAX_INLINE	(8192)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
 			 sectorsize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
 			 nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
-			 leafsize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
 			 stripesize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
 static inline unsigned long
 btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
 {
-	unsigned long offset = (unsigned long)e;
-	offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return offset;
+	return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
 
 static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 {
-	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
+	return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
 }
 
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
 static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
 						    struct btrfs_item *e)
 {
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
+	return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
 
 /* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
-{
-	if (level == 0)
-		return root->leafsize;
-	return root->nodesize;
-}
-
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
 {
-	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+	return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
 		2 * num_items;
 }
 
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
 {
-	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
-		num_items;
+	return root->nodesize * BTRFS_MAX_LEVEL * num_items;
 }
 
 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
3305 u64 bytenr); 3294 u64 bytenr);
3306void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3295void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3307int get_block_group_index(struct btrfs_block_group_cache *cache); 3296int get_block_group_index(struct btrfs_block_group_cache *cache);
3308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3297struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root, u32 blocksize, 3298 struct btrfs_root *root, u64 parent,
3310 u64 parent, u64 root_objectid, 3299 u64 root_objectid,
3311 struct btrfs_disk_key *key, int level, 3300 struct btrfs_disk_key *key, int level,
3312 u64 hint, u64 empty_size); 3301 u64 hint, u64 empty_size);
3313void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3302void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3363 u64 size); 3352 u64 size);
3364int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3353int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3365 struct btrfs_root *root, u64 group_start); 3354 struct btrfs_root *root, u64 group_start);
3355void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3366void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3356void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3367 struct btrfs_root *root); 3357 struct btrfs_root *root);
3368u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3358u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3604 kfree(fs_info->uuid_root); 3594 kfree(fs_info->uuid_root);
3605 kfree(fs_info->super_copy); 3595 kfree(fs_info->super_copy);
3606 kfree(fs_info->super_for_commit); 3596 kfree(fs_info->super_for_commit);
3597 security_free_mnt_opts(&fs_info->security_opts);
3607 kfree(fs_info); 3598 kfree(fs_info);
3608} 3599}
3609 3600
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
3739int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 3730int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
3740 struct bio *bio, u32 *dst); 3731 struct bio *bio, u32 *dst);
3741int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 3732int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
3742 struct btrfs_dio_private *dip, struct bio *bio, 3733 struct bio *bio, u64 logical_offset);
3743 u64 logical_offset);
3744int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 3734int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root, 3735 struct btrfs_root *root,
3746 u64 objectid, u64 pos, 3736 u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
4141/* Sanity test specific functions */ 4131/* Sanity test specific functions */
4142#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4132#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4143void btrfs_test_destroy_inode(struct inode *inode); 4133void btrfs_test_destroy_inode(struct inode *inode);
4144int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
4145 u64 rfer, u64 excl);
4146#endif 4134#endif
4147 4135
4136static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
4137{
4138#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4139 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
4140 return 1;
4141#endif
4142 return 0;
4143}
4144
4148#endif 4145#endif
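btrfs_test_is_dummy_root() moves the sanity-test #ifdef out of the call sites; a sketch of a guarded caller (the disk-io.c hunk later in this diff uses exactly this shape):

/* With CONFIG_BTRFS_FS_RUN_SANITY_TESTS off, the helper is a constant 0
 * and the branch below compiles away. */
if (btrfs_test_is_dummy_root(root))
	return alloc_test_extent_buffer(root->fs_info, bytenr, blocksize);
return alloc_extent_buffer(root->fs_info, bytenr, blocksize);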
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2e90f855d7d..054577bddaf2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1042 int ret; 1042 int ret;
1043 1043
1044 key.objectid = node->inode_id; 1044 key.objectid = node->inode_id;
1045 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1045 key.type = BTRFS_INODE_ITEM_KEY;
1046 key.offset = 0; 1046 key.offset = 0;
1047 1047
1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
1099search: 1099search:
1100 btrfs_release_path(path); 1100 btrfs_release_path(path);
1101 1101
1102 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 1102 key.type = BTRFS_INODE_EXTREF_KEY;
1103 key.offset = -1; 1103 key.offset = -1;
1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1105 if (ret < 0) 1105 if (ret < 0)
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1473 } 1473 }
1474 1474
1475 delayed_item->key.objectid = btrfs_ino(dir); 1475 delayed_item->key.objectid = btrfs_ino(dir);
1476 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1476 delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
1477 delayed_item->key.offset = index; 1477 delayed_item->key.offset = index;
1478 1478
1479 dir_item = (struct btrfs_dir_item *)delayed_item->data; 1479 dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1542 return PTR_ERR(node); 1542 return PTR_ERR(node);
1543 1543
1544 item_key.objectid = btrfs_ino(dir); 1544 item_key.objectid = btrfs_ino(dir);
1545 btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); 1545 item_key.type = BTRFS_DIR_INDEX_KEY;
1546 item_key.offset = index; 1546 item_key.offset = index;
1547 1547
1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); 1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
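All of the btrfs_set_key_type() conversions in this file follow one pattern: struct btrfs_key is the CPU-order key, so its type byte can be assigned directly and the wrapper bought nothing. A sketch, assuming the usual objectid/type/offset layout:

struct btrfs_key {
	u64 objectid;
	u8 type;
	u64 offset;
};

static void init_dir_index_key_sketch(struct btrfs_key *key, u64 dir_ino,
				      u64 index)
{
	key->objectid = dir_ino;
	key->type = BTRFS_DIR_INDEX_KEY;	/* was btrfs_set_key_type() */
	key->offset = index;
}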
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..6f662b34ba0e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
168 dev_replace->srcdev->total_bytes; 168 dev_replace->srcdev->total_bytes;
169 dev_replace->tgtdev->disk_total_bytes = 169 dev_replace->tgtdev->disk_total_bytes =
170 dev_replace->srcdev->disk_total_bytes; 170 dev_replace->srcdev->disk_total_bytes;
171 dev_replace->tgtdev->commit_total_bytes =
172 dev_replace->srcdev->commit_total_bytes;
171 dev_replace->tgtdev->bytes_used = 173 dev_replace->tgtdev->bytes_used =
172 dev_replace->srcdev->bytes_used; 174 dev_replace->srcdev->bytes_used;
175 dev_replace->tgtdev->commit_bytes_used =
176 dev_replace->srcdev->commit_bytes_used;
173 } 177 }
174 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; 178 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
175 btrfs_init_dev_replace_tgtdev_for_resume(fs_info, 179 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
329 args->start.tgtdev_name[0] == '\0') 333 args->start.tgtdev_name[0] == '\0')
330 return -EINVAL; 334 return -EINVAL;
331 335
332 mutex_lock(&fs_info->volume_mutex); 336 /*
333 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 337 * Here we commit the transaction to make sure commit_total_bytes
334 &tgt_device); 338 * of all the devices are updated.
335 if (ret) { 339 */
336 btrfs_err(fs_info, "target device %s is invalid!", 340 trans = btrfs_attach_transaction(root);
337 args->start.tgtdev_name); 341 if (!IS_ERR(trans)) {
338 mutex_unlock(&fs_info->volume_mutex); 342 ret = btrfs_commit_transaction(trans, root);
339 return -EINVAL; 343 if (ret)
344 return ret;
345 } else if (PTR_ERR(trans) != -ENOENT) {
346 return PTR_ERR(trans);
340 } 347 }
341 348
349 /* the disk copy procedure reuses the scrub code */
350 mutex_lock(&fs_info->volume_mutex);
342 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 351 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
343 args->start.srcdev_name, 352 args->start.srcdev_name,
344 &src_device); 353 &src_device);
345 mutex_unlock(&fs_info->volume_mutex);
346 if (ret) { 354 if (ret) {
347 ret = -EINVAL; 355 mutex_unlock(&fs_info->volume_mutex);
348 goto leave_no_lock; 356 return ret;
349 } 357 }
350 358
351 if (tgt_device->total_bytes < src_device->total_bytes) { 359 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
352 btrfs_err(fs_info, "target device is smaller than source device!"); 360 src_device, &tgt_device);
353 ret = -EINVAL; 361 mutex_unlock(&fs_info->volume_mutex);
354 goto leave_no_lock; 362 if (ret)
355 } 363 return ret;
356 364
357 btrfs_dev_replace_lock(dev_replace); 365 btrfs_dev_replace_lock(dev_replace);
358 switch (dev_replace->replace_state) { 366 switch (dev_replace->replace_state) {
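The new preamble commits any running transaction before the replace starts, so every device's commit_total_bytes is current. A minimal sketch of the attach-then-commit idiom, where -ENOENT simply means no transaction was running:

static int flush_running_transaction_sketch(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		/* no transaction to flush is fine; anything else is not */
		return PTR_ERR(trans) == -ENOENT ? 0 : PTR_ERR(trans);
	}
	return btrfs_commit_transaction(trans, root);
}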
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
380 src_device->devid, 388 src_device->devid,
381 rcu_str_deref(tgt_device->name)); 389 rcu_str_deref(tgt_device->name));
382 390
383 tgt_device->total_bytes = src_device->total_bytes;
384 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
385 tgt_device->bytes_used = src_device->bytes_used;
386
387 /* 391 /*
388 * from now on, the writes to the srcdev are all duplicated to 392 * from now on, the writes to the srcdev are all duplicated to
389 * go to the tgtdev as well (refer to btrfs_map_block()). 393 * go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
414 418
415 /* the disk copy procedure reuses the scrub code */ 419 /* the disk copy procedure reuses the scrub code */
416 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 420 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
417 src_device->total_bytes, 421 btrfs_device_get_total_bytes(src_device),
418 &dev_replace->scrub_progress, 0, 1); 422 &dev_replace->scrub_progress, 0, 1);
419 423
420 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 424 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
426 dev_replace->srcdev = NULL; 430 dev_replace->srcdev = NULL;
427 dev_replace->tgtdev = NULL; 431 dev_replace->tgtdev = NULL;
428 btrfs_dev_replace_unlock(dev_replace); 432 btrfs_dev_replace_unlock(dev_replace);
429leave_no_lock: 433 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
430 if (tgt_device)
431 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
432 return ret; 434 return ret;
433} 435}
434 436
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 ret = btrfs_commit_transaction(trans, root); 509 ret = btrfs_commit_transaction(trans, root);
508 WARN_ON(ret); 510 WARN_ON(ret);
509 511
512 mutex_lock(&uuid_mutex);
510 /* keep away write_all_supers() during the finishing procedure */ 513 /* keep away write_all_supers() during the finishing procedure */
511 mutex_lock(&root->fs_info->chunk_mutex);
512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 514 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
515 mutex_lock(&root->fs_info->chunk_mutex);
513 btrfs_dev_replace_lock(dev_replace); 516 btrfs_dev_replace_lock(dev_replace);
514 dev_replace->replace_state = 517 dev_replace->replace_state =
515 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 518 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 src_device->devid, 535 src_device->devid,
533 rcu_str_deref(tgt_device->name), scrub_ret); 536 rcu_str_deref(tgt_device->name), scrub_ret);
534 btrfs_dev_replace_unlock(dev_replace); 537 btrfs_dev_replace_unlock(dev_replace);
535 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
536 mutex_unlock(&root->fs_info->chunk_mutex); 538 mutex_unlock(&root->fs_info->chunk_mutex);
539 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
540 mutex_unlock(&uuid_mutex);
537 if (tgt_device) 541 if (tgt_device)
538 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 542 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
539 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 543 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
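The finishing path now nests its locks in a fixed order, with uuid_mutex outermost so superblock writes cannot race with the device swap, and releases them in reverse. A condensed sketch of the acquisition order as rearranged above:

mutex_lock(&uuid_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
mutex_lock(&root->fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace);
/* ... swap src/tgt device state ... */
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->chunk_mutex);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);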
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
542 } 546 }
543 547
544 printk_in_rcu(KERN_INFO 548 printk_in_rcu(KERN_INFO
545 "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", 549 "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
546 src_device->missing ? "<missing disk>" : 550 src_device->missing ? "<missing disk>" :
547 rcu_str_deref(src_device->name), 551 rcu_str_deref(src_device->name),
548 src_device->devid, 552 src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
550 tgt_device->is_tgtdev_for_dev_replace = 0; 554 tgt_device->is_tgtdev_for_dev_replace = 0;
551 tgt_device->devid = src_device->devid; 555 tgt_device->devid = src_device->devid;
552 src_device->devid = BTRFS_DEV_REPLACE_DEVID; 556 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
553 tgt_device->bytes_used = src_device->bytes_used;
554 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 557 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
555 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 558 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
556 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 559 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
557 tgt_device->total_bytes = src_device->total_bytes; 560 btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
558 tgt_device->disk_total_bytes = src_device->disk_total_bytes; 561 btrfs_device_set_disk_total_bytes(tgt_device,
559 tgt_device->bytes_used = src_device->bytes_used; 562 src_device->disk_total_bytes);
563 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
564 ASSERT(list_empty(&src_device->resized_list));
565 tgt_device->commit_total_bytes = src_device->commit_total_bytes;
566 tgt_device->commit_bytes_used = src_device->bytes_used;
560 if (fs_info->sb->s_bdev == src_device->bdev) 567 if (fs_info->sb->s_bdev == src_device->bdev)
561 fs_info->sb->s_bdev = tgt_device->bdev; 568 fs_info->sb->s_bdev = tgt_device->bdev;
562 if (fs_info->fs_devices->latest_bdev == src_device->bdev) 569 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
563 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 570 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
564 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 571 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
572 fs_info->fs_devices->rw_devices++;
565 573
566 /* replace the sysfs entry */ 574 /* replace the sysfs entry */
567 btrfs_kobj_rm_device(fs_info, src_device); 575 btrfs_kobj_rm_device(fs_info, src_device);
568 btrfs_kobj_add_device(fs_info, tgt_device); 576 btrfs_kobj_add_device(fs_info, tgt_device);
569 577
578 btrfs_dev_replace_unlock(dev_replace);
579
570 btrfs_rm_dev_replace_blocked(fs_info); 580 btrfs_rm_dev_replace_blocked(fs_info);
571 581
572 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 582 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
580 * superblock is scratched out so that it is no longer marked to 590 * superblock is scratched out so that it is no longer marked to
581 * belong to this filesystem. 591 * belong to this filesystem.
582 */ 592 */
583 btrfs_dev_replace_unlock(dev_replace);
584 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
585 mutex_unlock(&root->fs_info->chunk_mutex); 593 mutex_unlock(&root->fs_info->chunk_mutex);
594 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
595 mutex_unlock(&uuid_mutex);
586 596
587 /* write back the superblocks */ 597 /* write back the superblocks */
588 trans = btrfs_start_transaction(root, 0); 598 trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
643 struct btrfs_ioctl_dev_replace_args *args) 653 struct btrfs_ioctl_dev_replace_args *args)
644{ 654{
645 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 655 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
656 struct btrfs_device *srcdev;
646 657
647 btrfs_dev_replace_lock(dev_replace); 658 btrfs_dev_replace_lock(dev_replace);
648 /* even if !dev_replace_is_valid, the values are good enough for 659 /* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
665 break; 676 break;
666 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 677 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
667 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 678 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
679 srcdev = dev_replace->srcdev;
668 args->status.progress_1000 = div64_u64(dev_replace->cursor_left, 680 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
669 div64_u64(dev_replace->srcdev->total_bytes, 1000)); 681 div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
670 break; 682 break;
671 } 683 }
672 btrfs_dev_replace_unlock(dev_replace); 684 btrfs_dev_replace_unlock(dev_replace);
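progress_1000 reports completion in per-mille: the bytes already copied divided by one thousandth of the source device size, both now read through the accessor. A sketch with illustrative numbers:

/* e.g. cursor_left a quarter of the way through the device yields 250 */
u64 total = btrfs_device_get_total_bytes(srcdev);
u64 progress = div64_u64(dev_replace->cursor_left,
			 div64_u64(total, 1000));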
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
825 837
826 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 838 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
827 dev_replace->committed_cursor_left, 839 dev_replace->committed_cursor_left,
828 dev_replace->srcdev->total_bytes, 840 btrfs_device_get_total_bytes(dev_replace->srcdev),
829 &dev_replace->scrub_progress, 0, 1); 841 &dev_replace->scrub_progress, 0, 1);
830 ret = btrfs_dev_replace_finishing(fs_info, ret); 842 ret = btrfs_dev_replace_finishing(fs_info, ret);
831 WARN_ON(ret); 843 WARN_ON(ret);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df5dcea..fc8df866e919 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); 86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
87 87
88 key.objectid = objectid; 88 key.objectid = objectid;
89 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 89 key.type = BTRFS_XATTR_ITEM_KEY;
90 key.offset = btrfs_name_hash(name, name_len); 90 key.offset = btrfs_name_hash(name, name_len);
91 91
92 data_size = sizeof(*dir_item) + name_len + data_len; 92 data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
137 u32 data_size; 137 u32 data_size;
138 138
139 key.objectid = btrfs_ino(dir); 139 key.objectid = btrfs_ino(dir);
140 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 140 key.type = BTRFS_DIR_ITEM_KEY;
141 key.offset = btrfs_name_hash(name, name_len); 141 key.offset = btrfs_name_hash(name, name_len);
142 142
143 path = btrfs_alloc_path(); 143 path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
204 int cow = mod != 0; 204 int cow = mod != 0;
205 205
206 key.objectid = dir; 206 key.objectid = dir;
207 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 207 key.type = BTRFS_DIR_ITEM_KEY;
208 208
209 key.offset = btrfs_name_hash(name, name_len); 209 key.offset = btrfs_name_hash(name, name_len);
210 210
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
234 return -ENOMEM; 234 return -ENOMEM;
235 235
236 key.objectid = dir; 236 key.objectid = dir;
237 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 237 key.type = BTRFS_DIR_ITEM_KEY;
238 key.offset = btrfs_name_hash(name, name_len); 238 key.offset = btrfs_name_hash(name, name_len);
239 239
240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
297 int cow = mod != 0; 297 int cow = mod != 0;
298 298
299 key.objectid = dir; 299 key.objectid = dir;
300 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 300 key.type = BTRFS_DIR_INDEX_KEY;
301 key.offset = objectid; 301 key.offset = objectid;
302 302
303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
367 int cow = mod != 0; 367 int cow = mod != 0;
368 368
369 key.objectid = dir; 369 key.objectid = dir;
370 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 370 key.type = BTRFS_XATTR_ITEM_KEY;
371 key.offset = btrfs_name_hash(name, name_len); 371 key.offset = btrfs_name_hash(name, name_len);
372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
373 if (ret < 0) 373 if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d0d78dc07792..fa45e3cae40d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -72,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
72static void btrfs_error_commit_super(struct btrfs_root *root); 72static void btrfs_error_commit_super(struct btrfs_root *root);
73 73
74/* 74/*
75 * end_io_wq structs are used to do processing in task context when an IO is 75 * btrfs_end_io_wq structs are used to do processing in task context when an IO
76 * complete. This is used during reads to verify checksums, and it is used 76 * is complete. This is used during reads to verify checksums, and it is used
77 * by writes to insert metadata for new file extents after IO is complete. 77 * by writes to insert metadata for new file extents after IO is complete.
78 */ 78 */
79struct end_io_wq { 79struct btrfs_end_io_wq {
80 struct bio *bio; 80 struct bio *bio;
81 bio_end_io_t *end_io; 81 bio_end_io_t *end_io;
82 void *private; 82 void *private;
83 struct btrfs_fs_info *info; 83 struct btrfs_fs_info *info;
84 int error; 84 int error;
85 int metadata; 85 enum btrfs_wq_endio_type metadata;
86 struct list_head list; 86 struct list_head list;
87 struct btrfs_work work; 87 struct btrfs_work work;
88}; 88};
89 89
90static struct kmem_cache *btrfs_end_io_wq_cache;
91
92int __init btrfs_end_io_wq_init(void)
93{
94 btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
95 sizeof(struct btrfs_end_io_wq),
96 0,
97 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
98 NULL);
99 if (!btrfs_end_io_wq_cache)
100 return -ENOMEM;
101 return 0;
102}
103
104void btrfs_end_io_wq_exit(void)
105{
106 if (btrfs_end_io_wq_cache)
107 kmem_cache_destroy(btrfs_end_io_wq_cache);
108}
109
90/* 110/*
91 * async submit bios are used to offload expensive checksumming 111 * async submit bios are used to offload expensive checksumming
92 * onto the worker threads. They checksum file and metadata bios 112 * onto the worker threads. They checksum file and metadata bios
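The end-io contexts move from plain kmalloc to a dedicated slab cache. A sketch of how the init/exit pair is presumably wired into the module lifecycle; the call sites are outside this hunk, so the function names below are assumptions:

static int __init init_btrfs_fs_sketch(void)
{
	int err;

	err = btrfs_end_io_wq_init();	/* create btrfs_end_io_wq_cache */
	if (err)
		return err;
	/* ... register the filesystem, destroying the cache on failure ... */
	return 0;
}

static void __exit exit_btrfs_fs_sketch(void)
{
	btrfs_end_io_wq_exit();		/* NULL-safe kmem_cache_destroy */
}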
@@ -327,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
327{ 347{
328 struct extent_state *cached_state = NULL; 348 struct extent_state *cached_state = NULL;
329 int ret; 349 int ret;
330 bool need_lock = (current->journal_info == 350 bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
331 (void *)BTRFS_SEND_TRANS_STUB);
332 351
333 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 352 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
334 return 0; 353 return 0;
@@ -348,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
348 ret = 0; 367 ret = 0;
349 goto out; 368 goto out;
350 } 369 }
351 printk_ratelimited("parent transid verify failed on %llu wanted %llu " 370 printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
352 "found %llu\n", 371 eb->fs_info->sb->s_id, eb->start,
353 eb->start, parent_transid, btrfs_header_generation(eb)); 372 parent_transid, btrfs_header_generation(eb));
354 ret = 1; 373 ret = 1;
355 374
356 /* 375 /*
@@ -607,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
607 goto err; 626 goto err;
608 627
609 eb->read_mirror = mirror; 628 eb->read_mirror = mirror;
610 if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 629 if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
611 ret = -EIO; 630 ret = -EIO;
612 goto err; 631 goto err;
613 } 632 }
614 633
615 found_start = btrfs_header_bytenr(eb); 634 found_start = btrfs_header_bytenr(eb);
616 if (found_start != eb->start) { 635 if (found_start != eb->start) {
617 printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " 636 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
618 "%llu %llu\n", 637 "%llu %llu\n",
619 found_start, eb->start); 638 eb->fs_info->sb->s_id, found_start, eb->start);
620 ret = -EIO; 639 ret = -EIO;
621 goto err; 640 goto err;
622 } 641 }
623 if (check_tree_block_fsid(root, eb)) { 642 if (check_tree_block_fsid(root, eb)) {
624 printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", 643 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
625 eb->start); 644 eb->fs_info->sb->s_id, eb->start);
626 ret = -EIO; 645 ret = -EIO;
627 goto err; 646 goto err;
628 } 647 }
@@ -680,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
680 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 699 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
681 700
682 eb = (struct extent_buffer *)page->private; 701 eb = (struct extent_buffer *)page->private;
683 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 702 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
684 eb->read_mirror = failed_mirror; 703 eb->read_mirror = failed_mirror;
685 atomic_dec(&eb->io_pages); 704 atomic_dec(&eb->io_pages);
686 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 705 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -690,7 +709,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
690 709
691static void end_workqueue_bio(struct bio *bio, int err) 710static void end_workqueue_bio(struct bio *bio, int err)
692{ 711{
693 struct end_io_wq *end_io_wq = bio->bi_private; 712 struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
694 struct btrfs_fs_info *fs_info; 713 struct btrfs_fs_info *fs_info;
695 struct btrfs_workqueue *wq; 714 struct btrfs_workqueue *wq;
696 btrfs_work_func_t func; 715 btrfs_work_func_t func;
@@ -713,7 +732,11 @@ static void end_workqueue_bio(struct bio *bio, int err)
713 func = btrfs_endio_write_helper; 732 func = btrfs_endio_write_helper;
714 } 733 }
715 } else { 734 } else {
716 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { 735 if (unlikely(end_io_wq->metadata ==
736 BTRFS_WQ_ENDIO_DIO_REPAIR)) {
737 wq = fs_info->endio_repair_workers;
738 func = btrfs_endio_repair_helper;
739 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
717 wq = fs_info->endio_raid56_workers; 740 wq = fs_info->endio_raid56_workers;
718 func = btrfs_endio_raid56_helper; 741 func = btrfs_endio_raid56_helper;
719 } else if (end_io_wq->metadata) { 742 } else if (end_io_wq->metadata) {
@@ -729,19 +752,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
729 btrfs_queue_work(wq, &end_io_wq->work); 752 btrfs_queue_work(wq, &end_io_wq->work);
730} 753}
731 754
732/*
733 * For the metadata arg you want
734 *
735 * 0 - if data
736 * 1 - if normal metadta
737 * 2 - if writing to the free space cache area
738 * 3 - raid parity work
739 */
740int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 755int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
741 int metadata) 756 enum btrfs_wq_endio_type metadata)
742{ 757{
743 struct end_io_wq *end_io_wq; 758 struct btrfs_end_io_wq *end_io_wq;
744 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); 759
760 end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
745 if (!end_io_wq) 761 if (!end_io_wq)
746 return -ENOMEM; 762 return -ENOMEM;
747 763
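The int metadata argument with its magic 0/1/2/3 values becomes a typed enum. Presumably declared in disk-io.h along these lines, matching the old numbering plus the new DIO repair class (the exact declaration is outside this hunk):

enum btrfs_wq_endio_type {
	BTRFS_WQ_ENDIO_DATA = 0,
	BTRFS_WQ_ENDIO_METADATA = 1,
	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
	BTRFS_WQ_ENDIO_RAID56 = 3,
	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
};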
@@ -925,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
925 * can happen in the async kernel threads 941 * can happen in the async kernel threads
926 */ 942 */
927 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 943 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
928 bio, 1); 944 bio, BTRFS_WQ_ENDIO_METADATA);
929 if (ret) 945 if (ret)
930 goto out_w_error; 946 goto out_w_error;
931 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 947 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1057,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
1057 .set_page_dirty = btree_set_page_dirty, 1073 .set_page_dirty = btree_set_page_dirty,
1058}; 1074};
1059 1075
1060int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1076void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
1061 u64 parent_transid)
1062{ 1077{
1063 struct extent_buffer *buf = NULL; 1078 struct extent_buffer *buf = NULL;
1064 struct inode *btree_inode = root->fs_info->btree_inode; 1079 struct inode *btree_inode = root->fs_info->btree_inode;
1065 int ret = 0;
1066 1080
1067 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1081 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1068 if (!buf) 1082 if (!buf)
1069 return 0; 1083 return;
1070 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1084 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1071 buf, 0, WAIT_NONE, btree_get_extent, 0); 1085 buf, 0, WAIT_NONE, btree_get_extent, 0);
1072 free_extent_buffer(buf); 1086 free_extent_buffer(buf);
1073 return ret;
1074} 1087}
1075 1088
1076int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1089int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1106,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1106} 1119}
1107 1120
1108struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1121struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1109 u64 bytenr, u32 blocksize) 1122 u64 bytenr)
1110{ 1123{
1111 return find_extent_buffer(root->fs_info, bytenr); 1124 return find_extent_buffer(root->fs_info, bytenr);
1112} 1125}
@@ -1114,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1114struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1127struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1115 u64 bytenr, u32 blocksize) 1128 u64 bytenr, u32 blocksize)
1116{ 1129{
1117#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1130 if (btrfs_test_is_dummy_root(root))
1118 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1119 return alloc_test_extent_buffer(root->fs_info, bytenr, 1131 return alloc_test_extent_buffer(root->fs_info, bytenr,
1120 blocksize); 1132 blocksize);
1121#endif
1122 return alloc_extent_buffer(root->fs_info, bytenr, blocksize); 1133 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1123} 1134}
1124 1135
@@ -1136,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1136} 1147}
1137 1148
1138struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 1149struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1139 u32 blocksize, u64 parent_transid) 1150 u64 parent_transid)
1140{ 1151{
1141 struct extent_buffer *buf = NULL; 1152 struct extent_buffer *buf = NULL;
1142 int ret; 1153 int ret;
1143 1154
1144 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1155 buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
1145 if (!buf) 1156 if (!buf)
1146 return NULL; 1157 return NULL;
1147 1158
@@ -1200,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1200 kfree(writers); 1211 kfree(writers);
1201} 1212}
1202 1213
1203static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1214static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1204 u32 stripesize, struct btrfs_root *root, 1215 struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1205 struct btrfs_fs_info *fs_info,
1206 u64 objectid) 1216 u64 objectid)
1207{ 1217{
1208 root->node = NULL; 1218 root->node = NULL;
1209 root->commit_root = NULL; 1219 root->commit_root = NULL;
1210 root->sectorsize = sectorsize; 1220 root->sectorsize = sectorsize;
1211 root->nodesize = nodesize; 1221 root->nodesize = nodesize;
1212 root->leafsize = leafsize;
1213 root->stripesize = stripesize; 1222 root->stripesize = stripesize;
1214 root->state = 0; 1223 root->state = 0;
1215 root->orphan_cleanup_state = 0; 1224 root->orphan_cleanup_state = 0;
@@ -1295,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
1295 root = btrfs_alloc_root(NULL); 1304 root = btrfs_alloc_root(NULL);
1296 if (!root) 1305 if (!root)
1297 return ERR_PTR(-ENOMEM); 1306 return ERR_PTR(-ENOMEM);
1298 __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); 1307 __setup_root(4096, 4096, 4096, root, NULL, 1);
1299 set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); 1308 set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
1300 root->alloc_bytenr = 0; 1309 root->alloc_bytenr = 0;
1301 1310
@@ -1318,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1318 if (!root) 1327 if (!root)
1319 return ERR_PTR(-ENOMEM); 1328 return ERR_PTR(-ENOMEM);
1320 1329
1321 __setup_root(tree_root->nodesize, tree_root->leafsize, 1330 __setup_root(tree_root->nodesize, tree_root->sectorsize,
1322 tree_root->sectorsize, tree_root->stripesize, 1331 tree_root->stripesize, root, fs_info, objectid);
1323 root, fs_info, objectid);
1324 root->root_key.objectid = objectid; 1332 root->root_key.objectid = objectid;
1325 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1333 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1326 root->root_key.offset = 0; 1334 root->root_key.offset = 0;
1327 1335
1328 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 1336 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
1329 0, objectid, NULL, 0, 0, 0);
1330 if (IS_ERR(leaf)) { 1337 if (IS_ERR(leaf)) {
1331 ret = PTR_ERR(leaf); 1338 ret = PTR_ERR(leaf);
1332 leaf = NULL; 1339 leaf = NULL;
@@ -1396,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1396 if (!root) 1403 if (!root)
1397 return ERR_PTR(-ENOMEM); 1404 return ERR_PTR(-ENOMEM);
1398 1405
1399 __setup_root(tree_root->nodesize, tree_root->leafsize, 1406 __setup_root(tree_root->nodesize, tree_root->sectorsize,
1400 tree_root->sectorsize, tree_root->stripesize, 1407 tree_root->stripesize, root, fs_info,
1401 root, fs_info, BTRFS_TREE_LOG_OBJECTID); 1408 BTRFS_TREE_LOG_OBJECTID);
1402 1409
1403 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1410 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1404 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1411 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1413,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1413 * updated (along with back refs to the log tree). 1420 * updated (along with back refs to the log tree).
1414 */ 1421 */
1415 1422
1416 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1423 leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1417 BTRFS_TREE_LOG_OBJECTID, NULL, 1424 NULL, 0, 0, 0);
1418 0, 0, 0);
1419 if (IS_ERR(leaf)) { 1425 if (IS_ERR(leaf)) {
1420 kfree(root); 1426 kfree(root);
1421 return ERR_CAST(leaf); 1427 return ERR_CAST(leaf);
@@ -1465,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1465 btrfs_set_stack_inode_generation(inode_item, 1); 1471 btrfs_set_stack_inode_generation(inode_item, 1);
1466 btrfs_set_stack_inode_size(inode_item, 3); 1472 btrfs_set_stack_inode_size(inode_item, 3);
1467 btrfs_set_stack_inode_nlink(inode_item, 1); 1473 btrfs_set_stack_inode_nlink(inode_item, 1);
1468 btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); 1474 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
1469 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 1475 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1470 1476
1471 btrfs_set_root_node(&log_root->root_item, log_root->node); 1477 btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1485,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1485 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1491 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1486 struct btrfs_path *path; 1492 struct btrfs_path *path;
1487 u64 generation; 1493 u64 generation;
1488 u32 blocksize;
1489 int ret; 1494 int ret;
1490 1495
1491 path = btrfs_alloc_path(); 1496 path = btrfs_alloc_path();
@@ -1498,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1498 goto alloc_fail; 1503 goto alloc_fail;
1499 } 1504 }
1500 1505
1501 __setup_root(tree_root->nodesize, tree_root->leafsize, 1506 __setup_root(tree_root->nodesize, tree_root->sectorsize,
1502 tree_root->sectorsize, tree_root->stripesize, 1507 tree_root->stripesize, root, fs_info, key->objectid);
1503 root, fs_info, key->objectid);
1504 1508
1505 ret = btrfs_find_root(tree_root, key, path, 1509 ret = btrfs_find_root(tree_root, key, path,
1506 &root->root_item, &root->root_key); 1510 &root->root_item, &root->root_key);
@@ -1511,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1511 } 1515 }
1512 1516
1513 generation = btrfs_root_generation(&root->root_item); 1517 generation = btrfs_root_generation(&root->root_item);
1514 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1515 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1518 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1516 blocksize, generation); 1519 generation);
1517 if (!root->node) { 1520 if (!root->node) {
1518 ret = -ENOMEM; 1521 ret = -ENOMEM;
1519 goto find_fail; 1522 goto find_fail;
@@ -1573,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1573 root->subv_writers = writers; 1576 root->subv_writers = writers;
1574 1577
1575 btrfs_init_free_ino_ctl(root); 1578 btrfs_init_free_ino_ctl(root);
1576 spin_lock_init(&root->cache_lock); 1579 spin_lock_init(&root->ino_cache_lock);
1577 init_waitqueue_head(&root->cache_wait); 1580 init_waitqueue_head(&root->ino_cache_wait);
1578 1581
1579 ret = get_anon_bdev(&root->anon_dev); 1582 ret = get_anon_bdev(&root->anon_dev);
1580 if (ret) 1583 if (ret)
@@ -1708,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1708 return ret; 1711 return ret;
1709} 1712}
1710 1713
1711/*
1712 * If this fails, caller must call bdi_destroy() to get rid of the
1713 * bdi again.
1714 */
1715static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) 1714static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1716{ 1715{
1717 int err; 1716 int err;
@@ -1734,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1734static void end_workqueue_fn(struct btrfs_work *work) 1733static void end_workqueue_fn(struct btrfs_work *work)
1735{ 1734{
1736 struct bio *bio; 1735 struct bio *bio;
1737 struct end_io_wq *end_io_wq; 1736 struct btrfs_end_io_wq *end_io_wq;
1738 int error; 1737 int error;
1739 1738
1740 end_io_wq = container_of(work, struct end_io_wq, work); 1739 end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1741 bio = end_io_wq->bio; 1740 bio = end_io_wq->bio;
1742 1741
1743 error = end_io_wq->error; 1742 error = end_io_wq->error;
1744 bio->bi_private = end_io_wq->private; 1743 bio->bi_private = end_io_wq->private;
1745 bio->bi_end_io = end_io_wq->end_io; 1744 bio->bi_end_io = end_io_wq->end_io;
1746 kfree(end_io_wq); 1745 kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1747 bio_endio_nodec(bio, error); 1746 bio_endio_nodec(bio, error);
1748} 1747}
1749 1748
@@ -1772,6 +1771,7 @@ static int cleaner_kthread(void *arg)
1772 } 1771 }
1773 1772
1774 btrfs_run_delayed_iputs(root); 1773 btrfs_run_delayed_iputs(root);
1774 btrfs_delete_unused_bgs(root->fs_info);
1775 again = btrfs_clean_one_deleted_snapshot(root); 1775 again = btrfs_clean_one_deleted_snapshot(root);
1776 mutex_unlock(&root->fs_info->cleaner_mutex); 1776 mutex_unlock(&root->fs_info->cleaner_mutex);
1777 1777
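The cleaner thread gains a new duty: draining fs_info->unused_bgs, the list (initialized later in this patch, guarded by unused_bgs_lock) onto which block groups that become empty are queued for deletion. A sketch of the producer side; the list-head member name is an assumption:

/* queue an empty block group for the cleaner to delete */
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&block_group->bg_list)) {	/* bg_list: assumed name */
	btrfs_get_block_group(block_group);
	list_add_tail(&block_group->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);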
@@ -2063,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2063 btrfs_destroy_workqueue(fs_info->endio_workers); 2063 btrfs_destroy_workqueue(fs_info->endio_workers);
2064 btrfs_destroy_workqueue(fs_info->endio_meta_workers); 2064 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
2065 btrfs_destroy_workqueue(fs_info->endio_raid56_workers); 2065 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
2066 btrfs_destroy_workqueue(fs_info->endio_repair_workers);
2066 btrfs_destroy_workqueue(fs_info->rmw_workers); 2067 btrfs_destroy_workqueue(fs_info->rmw_workers);
2067 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); 2068 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2068 btrfs_destroy_workqueue(fs_info->endio_write_workers); 2069 btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2143,8 +2144,6 @@ int open_ctree(struct super_block *sb,
2143{ 2144{
2144 u32 sectorsize; 2145 u32 sectorsize;
2145 u32 nodesize; 2146 u32 nodesize;
2146 u32 leafsize;
2147 u32 blocksize;
2148 u32 stripesize; 2147 u32 stripesize;
2149 u64 generation; 2148 u64 generation;
2150 u64 features; 2149 u64 features;
@@ -2233,6 +2232,7 @@ int open_ctree(struct super_block *sb,
2233 spin_lock_init(&fs_info->super_lock); 2232 spin_lock_init(&fs_info->super_lock);
2234 spin_lock_init(&fs_info->qgroup_op_lock); 2233 spin_lock_init(&fs_info->qgroup_op_lock);
2235 spin_lock_init(&fs_info->buffer_lock); 2234 spin_lock_init(&fs_info->buffer_lock);
2235 spin_lock_init(&fs_info->unused_bgs_lock);
2236 rwlock_init(&fs_info->tree_mod_log_lock); 2236 rwlock_init(&fs_info->tree_mod_log_lock);
2237 mutex_init(&fs_info->reloc_mutex); 2237 mutex_init(&fs_info->reloc_mutex);
2238 mutex_init(&fs_info->delalloc_root_mutex); 2238 mutex_init(&fs_info->delalloc_root_mutex);
@@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb,
2242 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2242 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2243 INIT_LIST_HEAD(&fs_info->space_info); 2243 INIT_LIST_HEAD(&fs_info->space_info);
2244 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2244 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2245 INIT_LIST_HEAD(&fs_info->unused_bgs);
2245 btrfs_mapping_init(&fs_info->mapping_tree); 2246 btrfs_mapping_init(&fs_info->mapping_tree);
2246 btrfs_init_block_rsv(&fs_info->global_block_rsv, 2247 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2247 BTRFS_BLOCK_RSV_GLOBAL); 2248 BTRFS_BLOCK_RSV_GLOBAL);
@@ -2260,7 +2261,7 @@ int open_ctree(struct super_block *sb,
2260 atomic_set(&fs_info->qgroup_op_seq, 0); 2261 atomic_set(&fs_info->qgroup_op_seq, 0);
2261 atomic64_set(&fs_info->tree_mod_seq, 0); 2262 atomic64_set(&fs_info->tree_mod_seq, 0);
2262 fs_info->sb = sb; 2263 fs_info->sb = sb;
2263 fs_info->max_inline = 8192 * 1024; 2264 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2264 fs_info->metadata_ratio = 0; 2265 fs_info->metadata_ratio = 0;
2265 fs_info->defrag_inodes = RB_ROOT; 2266 fs_info->defrag_inodes = RB_ROOT;
2266 fs_info->free_chunk_space = 0; 2267 fs_info->free_chunk_space = 0;
@@ -2389,7 +2390,7 @@ int open_ctree(struct super_block *sb,
2389 goto fail_alloc; 2390 goto fail_alloc;
2390 } 2391 }
2391 2392
2392 __setup_root(4096, 4096, 4096, 4096, tree_root, 2393 __setup_root(4096, 4096, 4096, tree_root,
2393 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2394 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2394 2395
2395 invalidate_bdev(fs_devices->latest_bdev); 2396 invalidate_bdev(fs_devices->latest_bdev);
@@ -2469,19 +2470,22 @@ int open_ctree(struct super_block *sb,
2469 goto fail_alloc; 2470 goto fail_alloc;
2470 } 2471 }
2471 2472
2472 if (btrfs_super_leafsize(disk_super) != 2473 /*
2474 * Leafsize and nodesize were always equal, this is only a sanity check.
2475 */
2476 if (le32_to_cpu(disk_super->__unused_leafsize) !=
2473 btrfs_super_nodesize(disk_super)) { 2477 btrfs_super_nodesize(disk_super)) {
2474 printk(KERN_ERR "BTRFS: couldn't mount because metadata " 2478 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2475 "blocksizes don't match. node %d leaf %d\n", 2479 "blocksizes don't match. node %d leaf %d\n",
2476 btrfs_super_nodesize(disk_super), 2480 btrfs_super_nodesize(disk_super),
2477 btrfs_super_leafsize(disk_super)); 2481 le32_to_cpu(disk_super->__unused_leafsize));
2478 err = -EINVAL; 2482 err = -EINVAL;
2479 goto fail_alloc; 2483 goto fail_alloc;
2480 } 2484 }
2481 if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { 2485 if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
2482 printk(KERN_ERR "BTRFS: couldn't mount because metadata " 2486 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2483 "blocksize (%d) was too large\n", 2487 "blocksize (%d) was too large\n",
2484 btrfs_super_leafsize(disk_super)); 2488 btrfs_super_nodesize(disk_super));
2485 err = -EINVAL; 2489 err = -EINVAL;
2486 goto fail_alloc; 2490 goto fail_alloc;
2487 } 2491 }
@@ -2498,17 +2502,16 @@ int open_ctree(struct super_block *sb,
2498 * flag our filesystem as having big metadata blocks if 2502 * flag our filesystem as having big metadata blocks if
2499 * they are bigger than the page size 2503 * they are bigger than the page size
2500 */ 2504 */
2501 if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { 2505 if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
2502 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) 2506 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2503 printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); 2507 printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
2504 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; 2508 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2505 } 2509 }
2506 2510
2507 nodesize = btrfs_super_nodesize(disk_super); 2511 nodesize = btrfs_super_nodesize(disk_super);
2508 leafsize = btrfs_super_leafsize(disk_super);
2509 sectorsize = btrfs_super_sectorsize(disk_super); 2512 sectorsize = btrfs_super_sectorsize(disk_super);
2510 stripesize = btrfs_super_stripesize(disk_super); 2513 stripesize = btrfs_super_stripesize(disk_super);
2511 fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); 2514 fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
2512 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); 2515 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2513 2516
2514 /* 2517 /*
@@ -2516,7 +2519,7 @@ int open_ctree(struct super_block *sb,
2516 * extent buffers for the same range. It leads to corruptions 2519 * extent buffers for the same range. It leads to corruptions
2517 */ 2520 */
2518 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && 2521 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2519 (sectorsize != leafsize)) { 2522 (sectorsize != nodesize)) {
2520 printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " 2523 printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
2521 "are not allowed for mixed block groups on %s\n", 2524 "are not allowed for mixed block groups on %s\n",
2522 sb->s_id); 2525 sb->s_id);
@@ -2579,6 +2582,8 @@ int open_ctree(struct super_block *sb,
2579 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); 2582 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2580 fs_info->endio_raid56_workers = 2583 fs_info->endio_raid56_workers =
2581 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); 2584 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2585 fs_info->endio_repair_workers =
2586 btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2582 fs_info->rmw_workers = 2587 fs_info->rmw_workers =
2583 btrfs_alloc_workqueue("rmw", flags, max_active, 2); 2588 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2584 fs_info->endio_write_workers = 2589 fs_info->endio_write_workers =
@@ -2600,11 +2605,12 @@ int open_ctree(struct super_block *sb,
2600 fs_info->submit_workers && fs_info->flush_workers && 2605 fs_info->submit_workers && fs_info->flush_workers &&
2601 fs_info->endio_workers && fs_info->endio_meta_workers && 2606 fs_info->endio_workers && fs_info->endio_meta_workers &&
2602 fs_info->endio_meta_write_workers && 2607 fs_info->endio_meta_write_workers &&
2608 fs_info->endio_repair_workers &&
2603 fs_info->endio_write_workers && fs_info->endio_raid56_workers && 2609 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2604 fs_info->endio_freespace_worker && fs_info->rmw_workers && 2610 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2605 fs_info->caching_workers && fs_info->readahead_workers && 2611 fs_info->caching_workers && fs_info->readahead_workers &&
2606 fs_info->fixup_workers && fs_info->delayed_workers && 2612 fs_info->fixup_workers && fs_info->delayed_workers &&
2607 fs_info->fixup_workers && fs_info->extent_workers && 2613 fs_info->extent_workers &&
2608 fs_info->qgroup_rescan_workers)) { 2614 fs_info->qgroup_rescan_workers)) {
2609 err = -ENOMEM; 2615 err = -ENOMEM;
2610 goto fail_sb_buffer; 2616 goto fail_sb_buffer;
@@ -2615,7 +2621,6 @@ int open_ctree(struct super_block *sb,
2615 4 * 1024 * 1024 / PAGE_CACHE_SIZE); 2621 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
2616 2622
2617 tree_root->nodesize = nodesize; 2623 tree_root->nodesize = nodesize;
2618 tree_root->leafsize = leafsize;
2619 tree_root->sectorsize = sectorsize; 2624 tree_root->sectorsize = sectorsize;
2620 tree_root->stripesize = stripesize; 2625 tree_root->stripesize = stripesize;
2621 2626
@@ -2642,16 +2647,14 @@ int open_ctree(struct super_block *sb,
2642 goto fail_sb_buffer; 2647 goto fail_sb_buffer;
2643 } 2648 }
2644 2649
2645 blocksize = btrfs_level_size(tree_root,
2646 btrfs_super_chunk_root_level(disk_super));
2647 generation = btrfs_super_chunk_root_generation(disk_super); 2650 generation = btrfs_super_chunk_root_generation(disk_super);
2648 2651
2649 __setup_root(nodesize, leafsize, sectorsize, stripesize, 2652 __setup_root(nodesize, sectorsize, stripesize, chunk_root,
2650 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); 2653 fs_info, BTRFS_CHUNK_TREE_OBJECTID);
2651 2654
2652 chunk_root->node = read_tree_block(chunk_root, 2655 chunk_root->node = read_tree_block(chunk_root,
2653 btrfs_super_chunk_root(disk_super), 2656 btrfs_super_chunk_root(disk_super),
2654 blocksize, generation); 2657 generation);
2655 if (!chunk_root->node || 2658 if (!chunk_root->node ||
2656 !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2659 !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
2657 printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", 2660 printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2684,13 +2687,11 @@ int open_ctree(struct super_block *sb,
2684 } 2687 }
2685 2688
2686retry_root_backup: 2689retry_root_backup:
2687 blocksize = btrfs_level_size(tree_root,
2688 btrfs_super_root_level(disk_super));
2689 generation = btrfs_super_generation(disk_super); 2690 generation = btrfs_super_generation(disk_super);
2690 2691
2691 tree_root->node = read_tree_block(tree_root, 2692 tree_root->node = read_tree_block(tree_root,
2692 btrfs_super_root(disk_super), 2693 btrfs_super_root(disk_super),
2693 blocksize, generation); 2694 generation);
2694 if (!tree_root->node || 2695 if (!tree_root->node ||
2695 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { 2696 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
2696 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", 2697 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2859,9 +2860,6 @@ retry_root_backup:
2859 err = -EIO; 2860 err = -EIO;
2860 goto fail_qgroup; 2861 goto fail_qgroup;
2861 } 2862 }
2862 blocksize =
2863 btrfs_level_size(tree_root,
2864 btrfs_super_log_root_level(disk_super));
2865 2863
2866 log_tree_root = btrfs_alloc_root(fs_info); 2864 log_tree_root = btrfs_alloc_root(fs_info);
2867 if (!log_tree_root) { 2865 if (!log_tree_root) {
@@ -2869,11 +2867,10 @@ retry_root_backup:
2869 goto fail_qgroup; 2867 goto fail_qgroup;
2870 } 2868 }
2871 2869
2872 __setup_root(nodesize, leafsize, sectorsize, stripesize, 2870 __setup_root(nodesize, sectorsize, stripesize,
2873 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); 2871 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
2874 2872
2875 log_tree_root->node = read_tree_block(tree_root, bytenr, 2873 log_tree_root->node = read_tree_block(tree_root, bytenr,
2876 blocksize,
2877 generation + 1); 2874 generation + 1);
2878 if (!log_tree_root->node || 2875 if (!log_tree_root->node ||
2879 !extent_buffer_uptodate(log_tree_root->node)) { 2876 !extent_buffer_uptodate(log_tree_root->node)) {
@@ -2980,6 +2977,8 @@ retry_root_backup:
2980 fs_info->update_uuid_tree_gen = 1; 2977 fs_info->update_uuid_tree_gen = 1;
2981 } 2978 }
2982 2979
2980 fs_info->open = 1;
2981
2983 return 0; 2982 return 0;
2984 2983
2985fail_qgroup: 2984fail_qgroup:
@@ -3139,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
3139 3138
3140 for (i = 0; i < max_mirrors; i++) { 3139 for (i = 0; i < max_mirrors; i++) {
3141 bytenr = btrfs_sb_offset(i); 3140 bytenr = btrfs_sb_offset(i);
3142 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 3141 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3142 device->commit_total_bytes)
3143 break; 3143 break;
3144 3144
3145 if (wait) { 3145 if (wait) {
@@ -3456,8 +3456,9 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3456 btrfs_set_stack_device_type(dev_item, dev->type); 3456 btrfs_set_stack_device_type(dev_item, dev->type);
3457 btrfs_set_stack_device_id(dev_item, dev->devid); 3457 btrfs_set_stack_device_id(dev_item, dev->devid);
3458 btrfs_set_stack_device_total_bytes(dev_item, 3458 btrfs_set_stack_device_total_bytes(dev_item,
3459 dev->disk_total_bytes); 3459 dev->commit_total_bytes);
3460 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); 3460 btrfs_set_stack_device_bytes_used(dev_item,
3461 dev->commit_bytes_used);
3461 btrfs_set_stack_device_io_align(dev_item, dev->io_align); 3462 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
3462 btrfs_set_stack_device_io_width(dev_item, dev->io_width); 3463 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
3463 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); 3464 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3532,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3532 3533
3533static void free_fs_root(struct btrfs_root *root) 3534static void free_fs_root(struct btrfs_root *root)
3534{ 3535{
3535 iput(root->cache_inode); 3536 iput(root->ino_cache_inode);
3536 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 3537 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
3537 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3538 btrfs_free_block_rsv(root, root->orphan_block_rsv);
3538 root->orphan_block_rsv = NULL; 3539 root->orphan_block_rsv = NULL;
@@ -3623,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
3623 return btrfs_commit_transaction(trans, root); 3624 return btrfs_commit_transaction(trans, root);
3624} 3625}
3625 3626
3626int close_ctree(struct btrfs_root *root) 3627void close_ctree(struct btrfs_root *root)
3627{ 3628{
3628 struct btrfs_fs_info *fs_info = root->fs_info; 3629 struct btrfs_fs_info *fs_info = root->fs_info;
3629 int ret; 3630 int ret;
@@ -3689,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
3689 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 3690 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3690 btrfs_stop_all_workers(fs_info); 3691 btrfs_stop_all_workers(fs_info);
3691 3692
3693 fs_info->open = 0;
3692 free_root_pointers(fs_info, 1); 3694 free_root_pointers(fs_info, 1);
3693 3695
3694 iput(fs_info->btree_inode); 3696 iput(fs_info->btree_inode);
@@ -3711,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
3711 3713
3712 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3714 btrfs_free_block_rsv(root, root->orphan_block_rsv);
3713 root->orphan_block_rsv = NULL; 3715 root->orphan_block_rsv = NULL;
3714
3715 return 0;
3716} 3716}
3717 3717
3718int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 3718int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3814,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3814static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 3814static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3815 int read_only) 3815 int read_only)
3816{ 3816{
3817 struct btrfs_super_block *sb = fs_info->super_copy;
3818 int ret = 0;
3819
3820 if (sb->root_level > BTRFS_MAX_LEVEL) {
3821 printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
3822 sb->root_level, BTRFS_MAX_LEVEL);
3823 ret = -EINVAL;
3824 }
3825 if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
3826 printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
3827 sb->chunk_root_level, BTRFS_MAX_LEVEL);
3828 ret = -EINVAL;
3829 }
3830 if (sb->log_root_level > BTRFS_MAX_LEVEL) {
3831 printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
3832 sb->log_root_level, BTRFS_MAX_LEVEL);
3833 ret = -EINVAL;
3834 }
3835
3817 /* 3836 /*
 3818 * Placeholder for checks 3837 * The common minimum: we can't yet trust the nodesize/sectorsize
 3838 * items (they are verified later), so issue just a warning.
3819 */ 3839 */
3820 return 0; 3840 if (!IS_ALIGNED(sb->root, 4096))
3841 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
3842 sb->root);
3843 if (!IS_ALIGNED(sb->chunk_root, 4096))
 3844 printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
3845 sb->chunk_root);
3846 if (!IS_ALIGNED(sb->log_root, 4096))
 3847 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
3848 sb->log_root);
3849
3850 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
3851 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
3852 fs_info->fsid, sb->dev_item.fsid);
3853 ret = -EINVAL;
3854 }
3855
3856 /*
 3857 * Hint to catch really bogus numbers or bit flips; more exact checks are
 3858 * done later.
3859 */
3860 if (sb->num_devices > (1UL << 31))
3861 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
3862 sb->num_devices);
3863
3864 if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
3865 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
3866 sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
3867 ret = -EINVAL;
3868 }
3869
3870 /*
 3871 * The generation is a global counter; we trust it more than the other
 3872 * fields, but it could still be the one that's wrong.
3873 */
3874 if (sb->generation < sb->chunk_root_generation)
3875 printk(KERN_WARNING
3876 "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
3877 sb->generation, sb->chunk_root_generation);
3878 if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
3879 printk(KERN_WARNING
3880 "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
3881 sb->generation, sb->cache_generation);
3882
3883 return ret;
3821} 3884}
3822 3885
3823static void btrfs_error_commit_super(struct btrfs_root *root) 3886static void btrfs_error_commit_super(struct btrfs_root *root)
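The new checks split into hard -EINVAL failures (tree levels, fsid mismatch, wrong super bytenr) and plain warnings for fields that can't be fully verified before nodesize/sectorsize are trusted. A compilable sketch of that split; the struct is an illustrative subset, not the on-disk btrfs_super_block, and IS_ALIGNED is open-coded:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL	8
#define IS_ALIGNED(x, a) (((x) & ((uint64_t)(a) - 1)) == 0)

struct super_sketch {		/* illustrative subset of the fields */
	uint64_t root;
	uint8_t root_level;
	uint64_t num_devices;
};

static int check_super_sketch(const struct super_sketch *sb)
{
	int ret = 0;

	/* Structural nonsense: refuse the mount. */
	if (sb->root_level > BTRFS_MAX_LEVEL) {
		fprintf(stderr, "tree_root level too big: %d > %d\n",
			sb->root_level, BTRFS_MAX_LEVEL);
		ret = -22;	/* -EINVAL */
	}
	/* nodesize can't be trusted yet, so misalignment only warns. */
	if (!IS_ALIGNED(sb->root, 4096))
		fprintf(stderr, "tree_root block unaligned: %llu\n",
			(unsigned long long)sb->root);
	/* Bit-flip heuristic; the exact check happens later. */
	if (sb->num_devices > (1ULL << 31))
		fprintf(stderr, "suspicious number of devices: %llu\n",
			(unsigned long long)sb->num_devices);
	return ret;
}

int main(void)
{
	struct super_sketch sb = { .root = 4096, .root_level = 1,
				   .num_devices = 1 };
	return check_super_sketch(&sb) ? 1 : 0;
}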
@@ -4009,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
4009 4072
4010 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 4073 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
4011 while (start <= end) { 4074 while (start <= end) {
4012 eb = btrfs_find_tree_block(root, start, 4075 eb = btrfs_find_tree_block(root, start);
4013 root->leafsize); 4076 start += root->nodesize;
4014 start += root->leafsize;
4015 if (!eb) 4077 if (!eb)
4016 continue; 4078 continue;
4017 wait_on_extent_buffer_writeback(eb); 4079 wait_on_extent_buffer_writeback(eb);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ceba0a9..414651821fb3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
25#define BTRFS_SUPER_MIRROR_MAX 3 25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12 26#define BTRFS_SUPER_MIRROR_SHIFT 12
27 27
28enum { 28enum btrfs_wq_endio_type {
29 BTRFS_WQ_ENDIO_DATA = 0, 29 BTRFS_WQ_ENDIO_DATA = 0,
30 BTRFS_WQ_ENDIO_METADATA = 1, 30 BTRFS_WQ_ENDIO_METADATA = 1,
31 BTRFS_WQ_ENDIO_FREE_SPACE = 2, 31 BTRFS_WQ_ENDIO_FREE_SPACE = 2,
32 BTRFS_WQ_ENDIO_RAID56 = 3, 32 BTRFS_WQ_ENDIO_RAID56 = 3,
33 BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
33}; 34};
34 35
35static inline u64 btrfs_sb_offset(int mirror) 36static inline u64 btrfs_sb_offset(int mirror)
@@ -44,9 +45,8 @@ struct btrfs_device;
44struct btrfs_fs_devices; 45struct btrfs_fs_devices;
45 46
46struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 47struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
47 u32 blocksize, u64 parent_transid); 48 u64 parent_transid);
48int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 49void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
49 u64 parent_transid);
50int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 50int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
51 int mirror_num, struct extent_buffer **eb); 51 int mirror_num, struct extent_buffer **eb);
52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
56int open_ctree(struct super_block *sb, 56int open_ctree(struct super_block *sb,
57 struct btrfs_fs_devices *fs_devices, 57 struct btrfs_fs_devices *fs_devices,
58 char *options); 58 char *options);
59int close_ctree(struct btrfs_root *root); 59void close_ctree(struct btrfs_root *root);
60int write_ctree_super(struct btrfs_trans_handle *trans, 60int write_ctree_super(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, int max_mirrors); 61 struct btrfs_root *root, int max_mirrors);
62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr);
66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root); 68int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
119u32 btrfs_csum_data(char *data, u32 seed, size_t len); 119u32 btrfs_csum_data(char *data, u32 seed, size_t len);
120void btrfs_csum_final(u32 crc, char *result); 120void btrfs_csum_final(u32 crc, char *result);
121int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 121int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
122 int metadata); 122 enum btrfs_wq_endio_type metadata);
123int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 123int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
124 int rw, struct bio *bio, int mirror_num, 124 int rw, struct bio *bio, int mirror_num,
125 unsigned long bio_flags, u64 bio_offset, 125 unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
141 void (*flush_fn)(void *)); 141 void (*flush_fn)(void *));
142int btrfs_calc_num_tolerated_disk_barrier_failures( 142int btrfs_calc_num_tolerated_disk_barrier_failures(
143 struct btrfs_fs_info *fs_info); 143 struct btrfs_fs_info *fs_info);
144int __init btrfs_end_io_wq_init(void);
145void btrfs_end_io_wq_exit(void);
144 146
145#ifdef CONFIG_DEBUG_LOCK_ALLOC 147#ifdef CONFIG_DEBUG_LOCK_ALLOC
146void btrfs_init_lockdep(void); 148void btrfs_init_lockdep(void);
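Naming the formerly anonymous enum lets btrfs_bio_wq_end_io() declare its parameter as enum btrfs_wq_endio_type rather than a bare int, so callers are type-checked and the new BTRFS_WQ_ENDIO_DIO_REPAIR value is visibly part of the same set. A minimal sketch of the pattern; pick_queue() is a made-up stand-in, only the enum comes from the header:

#include <stdio.h>

enum btrfs_wq_endio_type {
	BTRFS_WQ_ENDIO_DATA = 0,
	BTRFS_WQ_ENDIO_METADATA = 1,
	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
	BTRFS_WQ_ENDIO_RAID56 = 3,
	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
};

/* Stand-in for btrfs_bio_wq_end_io(): the typed parameter documents
 * intent where a plain int accepted anything. */
static const char *pick_queue(enum btrfs_wq_endio_type type)
{
	switch (type) {
	case BTRFS_WQ_ENDIO_DIO_REPAIR:
		return "endio-repair";	/* new helper workqueue */
	case BTRFS_WQ_ENDIO_METADATA:
		return "endio-meta";
	default:
		return "endio";
	}
}

int main(void)
{
	printf("%s\n", pick_queue(BTRFS_WQ_ENDIO_DIO_REPAIR));
	return 0;
}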
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3de8ed..37d164540c3a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
70 return ERR_PTR(-ESTALE); 70 return ERR_PTR(-ESTALE);
71 71
72 key.objectid = root_objectid; 72 key.objectid = root_objectid;
73 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 73 key.type = BTRFS_ROOT_ITEM_KEY;
74 key.offset = (u64)-1; 74 key.offset = (u64)-1;
75 75
76 index = srcu_read_lock(&fs_info->subvol_srcu); 76 index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 } 82 }
83 83
84 key.objectid = objectid; 84 key.objectid = objectid;
85 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 85 key.type = BTRFS_INODE_ITEM_KEY;
86 key.offset = 0; 86 key.offset = 0;
87 87
88 inode = btrfs_iget(sb, &key, root, NULL); 88 inode = btrfs_iget(sb, &key, root, NULL);
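btrfs_set_key_type() was an assignment in disguise, hence the plain key.type stores adopted across the series. A sketch with a stand-in struct (field order follows struct btrfs_key; the key-type constant is the on-disk value):

#include <stdint.h>

#define BTRFS_ROOT_ITEM_KEY	132	/* on-disk key type */
#define BTRFS_FS_TREE_OBJECTID	5ULL

struct key_sketch {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

int main(void)
{
	struct key_sketch key;

	key.objectid = BTRFS_FS_TREE_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;	/* direct store, no wrapper */
	key.offset = (uint64_t)-1;	/* match the highest offset */
	return key.type != BTRFS_ROOT_ITEM_KEY;
}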
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index caaf015d6e4b..d56589571012 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
491 key.objectid); 491 key.objectid);
492 if (key.type == BTRFS_METADATA_ITEM_KEY) 492 if (key.type == BTRFS_METADATA_ITEM_KEY)
493 last = key.objectid + 493 last = key.objectid +
494 fs_info->tree_root->leafsize; 494 fs_info->tree_root->nodesize;
495 else 495 else
496 last = key.objectid + key.offset; 496 last = key.objectid + key.offset;
497 497
@@ -765,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
765 * different 765 * different
766 */ 766 */
767 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 767 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
768 offset = root->leafsize; 768 offset = root->nodesize;
769 metadata = 0; 769 metadata = 0;
770 } 770 }
771 771
@@ -799,13 +799,13 @@ again:
799 path->slots[0]); 799 path->slots[0]);
800 if (key.objectid == bytenr && 800 if (key.objectid == bytenr &&
801 key.type == BTRFS_EXTENT_ITEM_KEY && 801 key.type == BTRFS_EXTENT_ITEM_KEY &&
802 key.offset == root->leafsize) 802 key.offset == root->nodesize)
803 ret = 0; 803 ret = 0;
804 } 804 }
805 if (ret) { 805 if (ret) {
806 key.objectid = bytenr; 806 key.objectid = bytenr;
807 key.type = BTRFS_EXTENT_ITEM_KEY; 807 key.type = BTRFS_EXTENT_ITEM_KEY;
808 key.offset = root->leafsize; 808 key.offset = root->nodesize;
809 btrfs_release_path(path); 809 btrfs_release_path(path);
810 goto again; 810 goto again;
811 } 811 }
@@ -2651,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2651 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2651 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2652 num_heads = heads_to_leaves(root, num_heads); 2652 num_heads = heads_to_leaves(root, num_heads);
2653 if (num_heads > 1) 2653 if (num_heads > 1)
2654 num_bytes += (num_heads - 1) * root->leafsize; 2654 num_bytes += (num_heads - 1) * root->nodesize;
2655 num_bytes <<= 1; 2655 num_bytes <<= 1;
2656 global_rsv = &root->fs_info->global_block_rsv; 2656 global_rsv = &root->fs_info->global_block_rsv;
2657 2657
@@ -3073,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3073 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3073 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3074 u64, u64, u64, u64, u64, u64, int); 3074 u64, u64, u64, u64, u64, u64, int);
3075 3075
3076#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3076
3077 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 3077 if (btrfs_test_is_dummy_root(root))
3078 return 0; 3078 return 0;
3079#endif 3079
3080 ref_root = btrfs_header_owner(buf); 3080 ref_root = btrfs_header_owner(buf);
3081 nritems = btrfs_header_nritems(buf); 3081 nritems = btrfs_header_nritems(buf);
3082 level = btrfs_header_level(buf); 3082 level = btrfs_header_level(buf);
@@ -3097,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3097 for (i = 0; i < nritems; i++) { 3097 for (i = 0; i < nritems; i++) {
3098 if (level == 0) { 3098 if (level == 0) {
3099 btrfs_item_key_to_cpu(buf, &key, i); 3099 btrfs_item_key_to_cpu(buf, &key, i);
3100 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3100 if (key.type != BTRFS_EXTENT_DATA_KEY)
3101 continue; 3101 continue;
3102 fi = btrfs_item_ptr(buf, i, 3102 fi = btrfs_item_ptr(buf, i,
3103 struct btrfs_file_extent_item); 3103 struct btrfs_file_extent_item);
@@ -3117,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3117 goto fail; 3117 goto fail;
3118 } else { 3118 } else {
3119 bytenr = btrfs_node_blockptr(buf, i); 3119 bytenr = btrfs_node_blockptr(buf, i);
3120 num_bytes = btrfs_level_size(root, level - 1); 3120 num_bytes = root->nodesize;
3121 ret = process_func(trans, root, bytenr, num_bytes, 3121 ret = process_func(trans, root, bytenr, num_bytes,
3122 parent, ref_root, level - 1, 0, 3122 parent, ref_root, level - 1, 0,
3123 1); 3123 1);
@@ -4343,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4343} 4343}
4344 4344
4345static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, 4345static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4346 struct btrfs_fs_info *fs_info) 4346 struct btrfs_fs_info *fs_info,
4347 int flush_state)
4347{ 4348{
4348 u64 used; 4349 u64 used;
4349 4350
4350 spin_lock(&space_info->lock); 4351 spin_lock(&space_info->lock);
4352 /*
 4353 * We ran out of space and flush_space found nothing more to free,
4354 * so don't bother doing async reclaim.
4355 */
4356 if (flush_state > COMMIT_TRANS && space_info->full) {
4357 spin_unlock(&space_info->lock);
4358 return 0;
4359 }
4360
4351 used = space_info->bytes_used + space_info->bytes_reserved + 4361 used = space_info->bytes_used + space_info->bytes_reserved +
4352 space_info->bytes_pinned + space_info->bytes_readonly + 4362 space_info->bytes_pinned + space_info->bytes_readonly +
4353 space_info->bytes_may_use; 4363 space_info->bytes_may_use;
@@ -4380,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4380 flush_space(fs_info->fs_root, space_info, to_reclaim, 4390 flush_space(fs_info->fs_root, space_info, to_reclaim,
4381 to_reclaim, flush_state); 4391 to_reclaim, flush_state);
4382 flush_state++; 4392 flush_state++;
4383 if (!btrfs_need_do_async_reclaim(space_info, fs_info)) 4393 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4394 flush_state))
4384 return; 4395 return;
4385 } while (flush_state <= COMMIT_TRANS); 4396 } while (flush_state <= COMMIT_TRANS);
4386 4397
4387 if (btrfs_need_do_async_reclaim(space_info, fs_info)) 4398 if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
4388 queue_work(system_unbound_wq, work); 4399 queue_work(system_unbound_wq, work);
4389} 4400}
4390 4401
@@ -4502,7 +4513,13 @@ again:
4502 space_info->flush = 1; 4513 space_info->flush = 1;
4503 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 4514 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4504 used += orig_bytes; 4515 used += orig_bytes;
4505 if (need_do_async_reclaim(space_info, root->fs_info, used) && 4516 /*
4517 * We will do the space reservation dance during log replay,
4518 * which means we won't have fs_info->fs_root set, so don't do
4519 * the async reclaim as we will panic.
4520 */
4521 if (!root->fs_info->log_root_recovering &&
4522 need_do_async_reclaim(space_info, root->fs_info, used) &&
4506 !work_busy(&root->fs_info->async_reclaim_work)) 4523 !work_busy(&root->fs_info->async_reclaim_work))
4507 queue_work(system_unbound_wq, 4524 queue_work(system_unbound_wq,
4508 &root->fs_info->async_reclaim_work); 4525 &root->fs_info->async_reclaim_work);
@@ -4839,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4839 if (num_bytes * 3 > meta_used) 4856 if (num_bytes * 3 > meta_used)
4840 num_bytes = div64_u64(meta_used, 3); 4857 num_bytes = div64_u64(meta_used, 3);
4841 4858
4842 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4859 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
4843} 4860}
4844 4861
4845static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4862static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4988,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4988 5005
4989 if (root->fs_info->quota_enabled) { 5006 if (root->fs_info->quota_enabled) {
4990 /* One for parent inode, two for dir entries */ 5007 /* One for parent inode, two for dir entries */
4991 num_bytes = 3 * root->leafsize; 5008 num_bytes = 3 * root->nodesize;
4992 ret = btrfs_qgroup_reserve(root, num_bytes); 5009 ret = btrfs_qgroup_reserve(root, num_bytes);
4993 if (ret) 5010 if (ret)
4994 return ret; 5011 return ret;
@@ -5176,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5176 5193
5177 if (root->fs_info->quota_enabled) { 5194 if (root->fs_info->quota_enabled) {
5178 ret = btrfs_qgroup_reserve(root, num_bytes + 5195 ret = btrfs_qgroup_reserve(root, num_bytes +
5179 nr_extents * root->leafsize); 5196 nr_extents * root->nodesize);
5180 if (ret) 5197 if (ret)
5181 goto out_fail; 5198 goto out_fail;
5182 } 5199 }
@@ -5185,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5185 if (unlikely(ret)) { 5202 if (unlikely(ret)) {
5186 if (root->fs_info->quota_enabled) 5203 if (root->fs_info->quota_enabled)
5187 btrfs_qgroup_free(root, num_bytes + 5204 btrfs_qgroup_free(root, num_bytes +
5188 nr_extents * root->leafsize); 5205 nr_extents * root->nodesize);
5189 goto out_fail; 5206 goto out_fail;
5190 } 5207 }
5191 5208
@@ -5301,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5301 btrfs_ino(inode), to_free, 0); 5318 btrfs_ino(inode), to_free, 0);
5302 if (root->fs_info->quota_enabled) { 5319 if (root->fs_info->quota_enabled) {
5303 btrfs_qgroup_free(root, num_bytes + 5320 btrfs_qgroup_free(root, num_bytes +
5304 dropped * root->leafsize); 5321 dropped * root->nodesize);
5305 } 5322 }
5306 5323
5307 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5324 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5422,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root,
5422 spin_unlock(&cache->space_info->lock); 5439 spin_unlock(&cache->space_info->lock);
5423 } else { 5440 } else {
5424 old_val -= num_bytes; 5441 old_val -= num_bytes;
5442
5443 /*
5444 * No longer have used bytes in this block group, queue
5445 * it for deletion.
5446 */
5447 if (old_val == 0) {
5448 spin_lock(&info->unused_bgs_lock);
5449 if (list_empty(&cache->bg_list)) {
5450 btrfs_get_block_group(cache);
5451 list_add_tail(&cache->bg_list,
5452 &info->unused_bgs);
5453 }
5454 spin_unlock(&info->unused_bgs_lock);
5455 }
5425 btrfs_set_block_group_used(&cache->item, old_val); 5456 btrfs_set_block_group_used(&cache->item, old_val);
5426 cache->pinned += num_bytes; 5457 cache->pinned += num_bytes;
5427 cache->space_info->bytes_pinned += num_bytes; 5458 cache->space_info->bytes_pinned += num_bytes;
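update_block_group() now parks a group on fs_info->unused_bgs the moment its used count hits zero; the list_empty() test prevents double-queueing and the extra reference pins the group until btrfs_delete_unused_bgs() puts it. The same publish pattern in miniature, with a pthread mutex standing in for unused_bgs_lock and a hand-rolled intrusive list:

#include <pthread.h>

struct node {
	struct node *prev, *next;	/* self-linked <=> not queued */
	int refs;
};

static pthread_mutex_t unused_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node unused_bgs = { &unused_bgs, &unused_bgs, 0 };

static int list_empty_node(const struct node *n)
{
	return n->next == n;
}

static void list_add_tail_node(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* Called when the group's used byte count drops to zero; bg must be
 * initialized self-linked, as after list_del_init(). */
static void queue_unused(struct node *bg)
{
	pthread_mutex_lock(&unused_lock);
	if (list_empty_node(bg)) {	/* not on the list yet */
		bg->refs++;		/* pin until the scanner puts it */
		list_add_tail_node(bg, &unused_bgs);
	}
	pthread_mutex_unlock(&unused_lock);
}

int main(void)
{
	struct node bg = { &bg, &bg, 1 };

	queue_unused(&bg);
	queue_unused(&bg);	/* second call is a no-op */
	return bg.refs != 2;
}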
@@ -6233,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6233 int ret; 6264 int ret;
6234 struct btrfs_fs_info *fs_info = root->fs_info; 6265 struct btrfs_fs_info *fs_info = root->fs_info;
6235 6266
6236#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6267 if (btrfs_test_is_dummy_root(root))
6237 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
6238 return 0; 6268 return 0;
6239#endif 6269
6240 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6270 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6241 6271
6242 /* 6272 /*
@@ -6263,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6263 return ret; 6293 return ret;
6264} 6294}
6265 6295
6266static u64 stripe_align(struct btrfs_root *root,
6267 struct btrfs_block_group_cache *cache,
6268 u64 val, u64 num_bytes)
6269{
6270 u64 ret = ALIGN(val, root->stripesize);
6271 return ret;
6272}
6273
6274/* 6296/*
 6275 * when we wait for progress in the block group caching, it's because 6297
6276 * our allocation attempt failed at least once. So, we must sleep 6298 * our allocation attempt failed at least once. So, we must sleep
@@ -6464,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6464 bool have_caching_bg = false; 6486 bool have_caching_bg = false;
6465 6487
6466 WARN_ON(num_bytes < root->sectorsize); 6488 WARN_ON(num_bytes < root->sectorsize);
6467 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6489 ins->type = BTRFS_EXTENT_ITEM_KEY;
6468 ins->objectid = 0; 6490 ins->objectid = 0;
6469 ins->offset = 0; 6491 ins->offset = 0;
6470 6492
@@ -6751,8 +6773,7 @@ unclustered_alloc:
6751 goto loop; 6773 goto loop;
6752 } 6774 }
6753checks: 6775checks:
6754 search_start = stripe_align(root, block_group, 6776 search_start = ALIGN(offset, root->stripesize);
6755 offset, num_bytes);
6756 6777
6757 /* move on to the next group */ 6778 /* move on to the next group */
6758 if (search_start + num_bytes > 6779 if (search_start + num_bytes >
@@ -7077,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7077 path = btrfs_alloc_path(); 7098 path = btrfs_alloc_path();
7078 if (!path) { 7099 if (!path) {
7079 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7100 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7080 root->leafsize); 7101 root->nodesize);
7081 return -ENOMEM; 7102 return -ENOMEM;
7082 } 7103 }
7083 7104
@@ -7086,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7086 ins, size); 7107 ins, size);
7087 if (ret) { 7108 if (ret) {
7088 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7109 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7089 root->leafsize); 7110 root->nodesize);
7090 btrfs_free_path(path); 7111 btrfs_free_path(path);
7091 return ret; 7112 return ret;
7092 } 7113 }
@@ -7101,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7101 7122
7102 if (skinny_metadata) { 7123 if (skinny_metadata) {
7103 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7124 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7104 num_bytes = root->leafsize; 7125 num_bytes = root->nodesize;
7105 } else { 7126 } else {
7106 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7127 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7107 btrfs_set_tree_block_key(leaf, block_info, key); 7128 btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7131,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7131 return ret; 7152 return ret;
7132 } 7153 }
7133 7154
7134 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7155 ret = update_block_group(root, ins->objectid, root->nodesize, 1);
7135 if (ret) { /* -ENOENT, logic error */ 7156 if (ret) { /* -ENOENT, logic error */
7136 btrfs_err(fs_info, "update block group failed for %llu %llu", 7157 btrfs_err(fs_info, "update block group failed for %llu %llu",
7137 ins->objectid, ins->offset); 7158 ins->objectid, ins->offset);
7138 BUG(); 7159 BUG();
7139 } 7160 }
7140 7161
7141 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 7162 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7142 return ret; 7163 return ret;
7143} 7164}
7144 7165
@@ -7213,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7213 btrfs_set_buffer_uptodate(buf); 7234 btrfs_set_buffer_uptodate(buf);
7214 7235
7215 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 7236 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7237 buf->log_index = root->log_transid % 2;
7216 /* 7238 /*
 7217 * we allow two log transactions at a time, so use a different 7239
 7218 * EXTENT bit to differentiate the dirty pages. 7240
7219 */ 7241 */
7220 if (root->log_transid % 2 == 0) 7242 if (buf->log_index == 0)
7221 set_extent_dirty(&root->dirty_log_pages, buf->start, 7243 set_extent_dirty(&root->dirty_log_pages, buf->start,
7222 buf->start + buf->len - 1, GFP_NOFS); 7244 buf->start + buf->len - 1, GFP_NOFS);
7223 else 7245 else
7224 set_extent_new(&root->dirty_log_pages, buf->start, 7246 set_extent_new(&root->dirty_log_pages, buf->start,
7225 buf->start + buf->len - 1, GFP_NOFS); 7247 buf->start + buf->len - 1, GFP_NOFS);
7226 } else { 7248 } else {
7249 buf->log_index = -1;
7227 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7250 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7228 buf->start + buf->len - 1, GFP_NOFS); 7251 buf->start + buf->len - 1, GFP_NOFS);
7229 } 7252 }
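Caching root->log_transid % 2 in buf->log_index at allocation time saves later writeback code from re-deriving which of the two in-flight log transactions dirtied the buffer, and -1 cleanly marks ordinary buffers. In sketch form (the structs are illustrative; the objectid constant mirrors ctree.h):

#define BTRFS_TREE_LOG_OBJECTID	(-6ULL)	/* as in ctree.h */

struct eb_sketch {
	short log_index;	/* 0 or 1 for log trees, -1 otherwise */
};

struct root_sketch {
	unsigned long long objectid;
	unsigned long long log_transid;
};

static void tag_new_buffer(const struct root_sketch *root,
			   struct eb_sketch *buf)
{
	if (root->objectid == BTRFS_TREE_LOG_OBJECTID)
		buf->log_index = root->log_transid % 2;	/* dirty-bit set */
	else
		buf->log_index = -1;	/* ordinary transaction buffer */
}

int main(void)
{
	struct root_sketch log_root = { BTRFS_TREE_LOG_OBJECTID, 7 };
	struct eb_sketch buf;

	tag_new_buffer(&log_root, &buf);
	return buf.log_index != 1;	/* 7 % 2 */
}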
@@ -7300,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7300 * 7323 *
7301 * returns the tree buffer or NULL. 7324 * returns the tree buffer or NULL.
7302 */ 7325 */
7303struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7326struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7304 struct btrfs_root *root, u32 blocksize, 7327 struct btrfs_root *root,
7305 u64 parent, u64 root_objectid, 7328 u64 parent, u64 root_objectid,
7306 struct btrfs_disk_key *key, int level, 7329 struct btrfs_disk_key *key, int level,
7307 u64 hint, u64 empty_size) 7330 u64 hint, u64 empty_size)
@@ -7311,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
7311 struct extent_buffer *buf; 7334 struct extent_buffer *buf;
7312 u64 flags = 0; 7335 u64 flags = 0;
7313 int ret; 7336 int ret;
7337 u32 blocksize = root->nodesize;
7314 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7338 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7315 SKINNY_METADATA); 7339 SKINNY_METADATA);
7316 7340
7317#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7341 if (btrfs_test_is_dummy_root(root)) {
7318 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
7319 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7342 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7320 blocksize, level); 7343 blocksize, level);
7321 if (!IS_ERR(buf)) 7344 if (!IS_ERR(buf))
7322 root->alloc_bytenr += blocksize; 7345 root->alloc_bytenr += blocksize;
7323 return buf; 7346 return buf;
7324 } 7347 }
7325#endif 7348
7326 block_rsv = use_block_rsv(trans, root, blocksize); 7349 block_rsv = use_block_rsv(trans, root, blocksize);
7327 if (IS_ERR(block_rsv)) 7350 if (IS_ERR(block_rsv))
7328 return ERR_CAST(block_rsv); 7351 return ERR_CAST(block_rsv);
@@ -7417,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7417 7440
7418 eb = path->nodes[wc->level]; 7441 eb = path->nodes[wc->level];
7419 nritems = btrfs_header_nritems(eb); 7442 nritems = btrfs_header_nritems(eb);
7420 blocksize = btrfs_level_size(root, wc->level - 1); 7443 blocksize = root->nodesize;
7421 7444
7422 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7445 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7423 if (nread >= wc->reada_count) 7446 if (nread >= wc->reada_count)
@@ -7464,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7464 continue; 7487 continue;
7465 } 7488 }
7466reada: 7489reada:
7467 ret = readahead_tree_block(root, bytenr, blocksize, 7490 readahead_tree_block(root, bytenr, blocksize);
7468 generation);
7469 if (ret)
7470 break;
7471 nread++; 7491 nread++;
7472 } 7492 }
7473 wc->reada_slot = slot; 7493 wc->reada_slot = slot;
@@ -7626,7 +7646,6 @@ walk_down:
7626 level = root_level; 7646 level = root_level;
7627 while (level >= 0) { 7647 while (level >= 0) {
7628 if (path->nodes[level] == NULL) { 7648 if (path->nodes[level] == NULL) {
7629 int child_bsize = root->nodesize;
7630 int parent_slot; 7649 int parent_slot;
7631 u64 child_gen; 7650 u64 child_gen;
7632 u64 child_bytenr; 7651 u64 child_bytenr;
@@ -7638,8 +7657,7 @@ walk_down:
7638 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 7657 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7639 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7658 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7640 7659
7641 eb = read_tree_block(root, child_bytenr, child_bsize, 7660 eb = read_tree_block(root, child_bytenr, child_gen);
7642 child_gen);
7643 if (!eb || !extent_buffer_uptodate(eb)) { 7661 if (!eb || !extent_buffer_uptodate(eb)) {
7644 ret = -EIO; 7662 ret = -EIO;
7645 goto out; 7663 goto out;
@@ -7655,7 +7673,7 @@ walk_down:
7655 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7673 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7656 root->objectid, 7674 root->objectid,
7657 child_bytenr, 7675 child_bytenr,
7658 child_bsize, 7676 root->nodesize,
7659 BTRFS_QGROUP_OPER_SUB_SUBTREE, 7677 BTRFS_QGROUP_OPER_SUB_SUBTREE,
7660 0); 7678 0);
7661 if (ret) 7679 if (ret)
@@ -7806,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7806 } 7824 }
7807 7825
7808 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7826 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7809 blocksize = btrfs_level_size(root, level - 1); 7827 blocksize = root->nodesize;
7810 7828
7811 next = btrfs_find_tree_block(root, bytenr, blocksize); 7829 next = btrfs_find_tree_block(root, bytenr);
7812 if (!next) { 7830 if (!next) {
7813 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7831 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
7814 if (!next) 7832 if (!next)
@@ -7870,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7870 if (!next) { 7888 if (!next) {
7871 if (reada && level == 1) 7889 if (reada && level == 1)
7872 reada_walk_down(trans, root, wc, path); 7890 reada_walk_down(trans, root, wc, path);
7873 next = read_tree_block(root, bytenr, blocksize, generation); 7891 next = read_tree_block(root, bytenr, generation);
7874 if (!next || !extent_buffer_uptodate(next)) { 7892 if (!next || !extent_buffer_uptodate(next)) {
7875 free_extent_buffer(next); 7893 free_extent_buffer(next);
7876 return -EIO; 7894 return -EIO;
@@ -8853,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8853 } 8871 }
8854 up_write(&info->commit_root_sem); 8872 up_write(&info->commit_root_sem);
8855 8873
8874 spin_lock(&info->unused_bgs_lock);
8875 while (!list_empty(&info->unused_bgs)) {
8876 block_group = list_first_entry(&info->unused_bgs,
8877 struct btrfs_block_group_cache,
8878 bg_list);
8879 list_del_init(&block_group->bg_list);
8880 btrfs_put_block_group(block_group);
8881 }
8882 spin_unlock(&info->unused_bgs_lock);
8883
8856 spin_lock(&info->block_group_cache_lock); 8884 spin_lock(&info->block_group_cache_lock);
8857 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8885 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8858 block_group = rb_entry(n, struct btrfs_block_group_cache, 8886 block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8987,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8987 init_rwsem(&cache->data_rwsem); 9015 init_rwsem(&cache->data_rwsem);
8988 INIT_LIST_HEAD(&cache->list); 9016 INIT_LIST_HEAD(&cache->list);
8989 INIT_LIST_HEAD(&cache->cluster_list); 9017 INIT_LIST_HEAD(&cache->cluster_list);
8990 INIT_LIST_HEAD(&cache->new_bg_list); 9018 INIT_LIST_HEAD(&cache->bg_list);
8991 btrfs_init_free_space_ctl(cache); 9019 btrfs_init_free_space_ctl(cache);
8992 9020
8993 return cache; 9021 return cache;
@@ -9009,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
9009 root = info->extent_root; 9037 root = info->extent_root;
9010 key.objectid = 0; 9038 key.objectid = 0;
9011 key.offset = 0; 9039 key.offset = 0;
9012 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 9040 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9013 path = btrfs_alloc_path(); 9041 path = btrfs_alloc_path();
9014 if (!path) 9042 if (!path)
9015 return -ENOMEM; 9043 return -ENOMEM;
@@ -9128,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
9128 __link_block_group(space_info, cache); 9156 __link_block_group(space_info, cache);
9129 9157
9130 set_avail_alloc_bits(root->fs_info, cache->flags); 9158 set_avail_alloc_bits(root->fs_info, cache->flags);
9131 if (btrfs_chunk_readonly(root, cache->key.objectid)) 9159 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9132 set_block_group_ro(cache, 1); 9160 set_block_group_ro(cache, 1);
9161 } else if (btrfs_block_group_used(&cache->item) == 0) {
9162 spin_lock(&info->unused_bgs_lock);
9163 /* Should always be true but just in case. */
9164 if (list_empty(&cache->bg_list)) {
9165 btrfs_get_block_group(cache);
9166 list_add_tail(&cache->bg_list,
9167 &info->unused_bgs);
9168 }
9169 spin_unlock(&info->unused_bgs_lock);
9170 }
9133 } 9171 }
9134 9172
9135 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 9173 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9170,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9170 struct btrfs_key key; 9208 struct btrfs_key key;
9171 int ret = 0; 9209 int ret = 0;
9172 9210
9173 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, 9211 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9174 new_bg_list) { 9212 list_del_init(&block_group->bg_list);
9175 list_del_init(&block_group->new_bg_list);
9176
9177 if (ret) 9213 if (ret)
9178 continue; 9214 continue;
9179 9215
@@ -9259,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9259 9295
9260 __link_block_group(cache->space_info, cache); 9296 __link_block_group(cache->space_info, cache);
9261 9297
9262 list_add_tail(&cache->new_bg_list, &trans->new_bgs); 9298 list_add_tail(&cache->bg_list, &trans->new_bgs);
9263 9299
9264 set_avail_alloc_bits(extent_root->fs_info, type); 9300 set_avail_alloc_bits(extent_root->fs_info, type);
9265 9301
@@ -9413,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9413 9449
9414 memcpy(&key, &block_group->key, sizeof(key)); 9450 memcpy(&key, &block_group->key, sizeof(key));
9415 9451
9416 btrfs_clear_space_info_full(root->fs_info);
9417
9418 btrfs_put_block_group(block_group); 9452 btrfs_put_block_group(block_group);
9419 btrfs_put_block_group(block_group); 9453 btrfs_put_block_group(block_group);
9420 9454
@@ -9430,6 +9464,101 @@ out:
9430 return ret; 9464 return ret;
9431} 9465}
9432 9466
9467/*
9468 * Process the unused_bgs list and remove any that don't have any allocated
9469 * space inside of them.
9470 */
9471void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9472{
9473 struct btrfs_block_group_cache *block_group;
9474 struct btrfs_space_info *space_info;
9475 struct btrfs_root *root = fs_info->extent_root;
9476 struct btrfs_trans_handle *trans;
9477 int ret = 0;
9478
9479 if (!fs_info->open)
9480 return;
9481
9482 spin_lock(&fs_info->unused_bgs_lock);
9483 while (!list_empty(&fs_info->unused_bgs)) {
9484 u64 start, end;
9485
9486 block_group = list_first_entry(&fs_info->unused_bgs,
9487 struct btrfs_block_group_cache,
9488 bg_list);
9489 space_info = block_group->space_info;
9490 list_del_init(&block_group->bg_list);
9491 if (ret || btrfs_mixed_space_info(space_info)) {
9492 btrfs_put_block_group(block_group);
9493 continue;
9494 }
9495 spin_unlock(&fs_info->unused_bgs_lock);
9496
9497 /* Don't want to race with allocators so take the groups_sem */
9498 down_write(&space_info->groups_sem);
9499 spin_lock(&block_group->lock);
9500 if (block_group->reserved ||
9501 btrfs_block_group_used(&block_group->item) ||
9502 block_group->ro) {
9503 /*
9504 * We want to bail if we made new allocations or have
9505 * outstanding allocations in this block group. We do
9506 * the ro check in case balance is currently acting on
9507 * this block group.
9508 */
9509 spin_unlock(&block_group->lock);
9510 up_write(&space_info->groups_sem);
9511 goto next;
9512 }
9513 spin_unlock(&block_group->lock);
9514
9515 /* We don't want to force the issue, only flip if it's ok. */
9516 ret = set_block_group_ro(block_group, 0);
9517 up_write(&space_info->groups_sem);
9518 if (ret < 0) {
9519 ret = 0;
9520 goto next;
9521 }
9522
9523 /*
9524 * Want to do this before we do anything else so we can recover
9525 * properly if we fail to join the transaction.
9526 */
9527 trans = btrfs_join_transaction(root);
9528 if (IS_ERR(trans)) {
9529 btrfs_set_block_group_rw(root, block_group);
9530 ret = PTR_ERR(trans);
9531 goto next;
9532 }
9533
9534 /*
9535 * We could have pending pinned extents for this block group,
9536 * just delete them, we don't care about them anymore.
9537 */
9538 start = block_group->key.objectid;
9539 end = start + block_group->key.offset - 1;
9540 clear_extent_bits(&fs_info->freed_extents[0], start, end,
9541 EXTENT_DIRTY, GFP_NOFS);
9542 clear_extent_bits(&fs_info->freed_extents[1], start, end,
9543 EXTENT_DIRTY, GFP_NOFS);
9544
9545 /* Reset pinned so btrfs_put_block_group doesn't complain */
9546 block_group->pinned = 0;
9547
9548 /*
 9549 * btrfs_remove_chunk() will abort the transaction if things go
9550 * horribly wrong.
9551 */
9552 ret = btrfs_remove_chunk(trans, root,
9553 block_group->key.objectid);
9554 btrfs_end_transaction(trans, root);
9555next:
9556 btrfs_put_block_group(block_group);
9557 spin_lock(&fs_info->unused_bgs_lock);
9558 }
9559 spin_unlock(&fs_info->unused_bgs_lock);
9560}
9561
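btrfs_delete_unused_bgs() drops the unused_bgs_lock spinlock before taking groups_sem or joining a transaction, both of which can block, and re-takes it at next: before looking at the following entry. The skeleton of that lock dance, with pthread primitives standing in for the kernel locks and every name local to the sketch:

#include <pthread.h>
#include <stddef.h>

struct bg { struct bg *next; int busy; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;
static struct bg *unused_head;

static void delete_unused_sketch(void)
{
	pthread_mutex_lock(&list_lock);
	while (unused_head) {
		struct bg *bg = unused_head;

		unused_head = bg->next;		/* list_del_init() */
		pthread_mutex_unlock(&list_lock); /* can't block under it */

		pthread_rwlock_wrlock(&groups_sem);
		if (bg->busy) {		/* reserved, used or read-only */
			pthread_rwlock_unlock(&groups_sem);
			goto next;
		}
		pthread_rwlock_unlock(&groups_sem);
		/* ...set ro, join transaction, btrfs_remove_chunk()... */
next:
		pthread_mutex_lock(&list_lock);	/* re-take for next entry */
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct bg one = { NULL, 1 };

	unused_head = &one;
	delete_unused_sketch();
	return unused_head != NULL;
}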
9433int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 9562int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
9434{ 9563{
9435 struct btrfs_space_info *space_info; 9564 struct btrfs_space_info *space_info;
@@ -9561,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
9561 9690
9562int btrfs_start_nocow_write(struct btrfs_root *root) 9691int btrfs_start_nocow_write(struct btrfs_root *root)
9563{ 9692{
9564 if (unlikely(atomic_read(&root->will_be_snapshoted))) 9693 if (atomic_read(&root->will_be_snapshoted))
9565 return 0; 9694 return 0;
9566 9695
9567 percpu_counter_inc(&root->subv_writers->counter); 9696 percpu_counter_inc(&root->subv_writers->counter);
@@ -9569,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
9569 * Make sure counter is updated before we check for snapshot creation. 9698 * Make sure counter is updated before we check for snapshot creation.
9570 */ 9699 */
9571 smp_mb(); 9700 smp_mb();
9572 if (unlikely(atomic_read(&root->will_be_snapshoted))) { 9701 if (atomic_read(&root->will_be_snapshoted)) {
9573 btrfs_end_nocow_write(root); 9702 btrfs_end_nocow_write(root);
9574 return 0; 9703 return 0;
9575 } 9704 }
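The unlikely() hints are dropped here, but the ordering protocol stays: the writer count must be published before will_be_snapshoted is re-read, which is what the smp_mb() in the context above enforces. The same handshake in C11 atomics, as a sketch rather than the kernel implementation:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int writers;
static atomic_int will_be_snapshoted;

/* Modeled on btrfs_start_nocow_write(); returns false when a pending
 * snapshot forces the caller onto the COW path. */
static bool start_nocow_write(void)
{
	if (atomic_load(&will_be_snapshoted))
		return false;

	atomic_fetch_add(&writers, 1);
	/* Counter visible before the flag is re-read (the smp_mb()). */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&will_be_snapshoted)) {
		atomic_fetch_sub(&writers, 1);	/* end_nocow_write() */
		return false;
	}
	return true;
}

int main(void)
{
	return start_nocow_write() ? 0 : 1;
}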
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index af0359dcf337..bf3f424e0013 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
25static struct kmem_cache *extent_buffer_cache; 25static struct kmem_cache *extent_buffer_cache;
26static struct bio_set *btrfs_bioset; 26static struct bio_set *btrfs_bioset;
27 27
28static inline bool extent_state_in_tree(const struct extent_state *state)
29{
30 return !RB_EMPTY_NODE(&state->rb_node);
31}
32
28#ifdef CONFIG_BTRFS_DEBUG 33#ifdef CONFIG_BTRFS_DEBUG
29static LIST_HEAD(buffers); 34static LIST_HEAD(buffers);
30static LIST_HEAD(states); 35static LIST_HEAD(states);
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)
59 64
60 while (!list_empty(&states)) { 65 while (!list_empty(&states)) {
61 state = list_entry(states.next, struct extent_state, leak_list); 66 state = list_entry(states.next, struct extent_state, leak_list);
62 printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " 67 pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
63 "state %lu in tree %p refs %d\n", 68 state->start, state->end, state->state,
64 state->start, state->end, state->state, state->tree, 69 extent_state_in_tree(state),
65 atomic_read(&state->refs)); 70 atomic_read(&state->refs));
66 list_del(&state->leak_list); 71 list_del(&state->leak_list);
67 kmem_cache_free(extent_state_cache, state); 72 kmem_cache_free(extent_state_cache, state);
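extent_state_in_tree() replaces the old state->tree back-pointer: a node left self-linked by RB_CLEAR_NODE() belongs to no tree, so membership costs one comparison and one less field per extent_state. The convention in isolation, with a single parent pointer standing in for rb_node's packed parent/color word:

#include <stdbool.h>
#include <stdio.h>

struct rbn_sketch {
	struct rbn_sketch *parent;	/* stand-in for __rb_parent_color */
};

/* RB_CLEAR_NODE(): mark a node detached by self-linking it. */
static void rb_clear_node_sketch(struct rbn_sketch *n)
{
	n->parent = n;
}

/* RB_EMPTY_NODE(): a self-parented node is in no tree. */
static bool rb_empty_node_sketch(const struct rbn_sketch *n)
{
	return n->parent == n;
}

int main(void)
{
	struct rbn_sketch state;

	rb_clear_node_sketch(&state);	/* as in alloc_extent_state() */
	printf("in tree: %d\n", !rb_empty_node_sketch(&state));
	state.parent = NULL;		/* pretend it was inserted at root */
	printf("in tree: %d\n", !rb_empty_node_sketch(&state));
	return 0;
}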
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
209 return state; 214 return state;
210 state->state = 0; 215 state->state = 0;
211 state->private = 0; 216 state->private = 0;
212 state->tree = NULL; 217 RB_CLEAR_NODE(&state->rb_node);
213 btrfs_leak_debug_add(&state->leak_list, &states); 218 btrfs_leak_debug_add(&state->leak_list, &states);
214 atomic_set(&state->refs, 1); 219 atomic_set(&state->refs, 1);
215 init_waitqueue_head(&state->wq); 220 init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
222 if (!state) 227 if (!state)
223 return; 228 return;
224 if (atomic_dec_and_test(&state->refs)) { 229 if (atomic_dec_and_test(&state->refs)) {
225 WARN_ON(state->tree); 230 WARN_ON(extent_state_in_tree(state));
226 btrfs_leak_debug_del(&state->leak_list); 231 btrfs_leak_debug_del(&state->leak_list);
227 trace_free_extent_state(state, _RET_IP_); 232 trace_free_extent_state(state, _RET_IP_);
228 kmem_cache_free(extent_state_cache, state); 233 kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
371 other->state == state->state) { 376 other->state == state->state) {
372 merge_cb(tree, state, other); 377 merge_cb(tree, state, other);
373 state->start = other->start; 378 state->start = other->start;
374 other->tree = NULL;
375 rb_erase(&other->rb_node, &tree->state); 379 rb_erase(&other->rb_node, &tree->state);
380 RB_CLEAR_NODE(&other->rb_node);
376 free_extent_state(other); 381 free_extent_state(other);
377 } 382 }
378 } 383 }
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
383 other->state == state->state) { 388 other->state == state->state) {
384 merge_cb(tree, state, other); 389 merge_cb(tree, state, other);
385 state->end = other->end; 390 state->end = other->end;
386 other->tree = NULL;
387 rb_erase(&other->rb_node, &tree->state); 391 rb_erase(&other->rb_node, &tree->state);
392 RB_CLEAR_NODE(&other->rb_node);
388 free_extent_state(other); 393 free_extent_state(other);
389 } 394 }
390 } 395 }
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
442 found->start, found->end, start, end); 447 found->start, found->end, start, end);
443 return -EEXIST; 448 return -EEXIST;
444 } 449 }
445 state->tree = tree;
446 merge_state(tree, state); 450 merge_state(tree, state);
447 return 0; 451 return 0;
448} 452}
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
486 free_extent_state(prealloc); 490 free_extent_state(prealloc);
487 return -EEXIST; 491 return -EEXIST;
488 } 492 }
489 prealloc->tree = tree;
490 return 0; 493 return 0;
491} 494}
492 495
@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
524 wake_up(&state->wq); 527 wake_up(&state->wq);
525 if (state->state == 0) { 528 if (state->state == 0) {
526 next = next_state(state); 529 next = next_state(state);
527 if (state->tree) { 530 if (extent_state_in_tree(state)) {
528 rb_erase(&state->rb_node, &tree->state); 531 rb_erase(&state->rb_node, &tree->state);
529 state->tree = NULL; 532 RB_CLEAR_NODE(&state->rb_node);
530 free_extent_state(state); 533 free_extent_state(state);
531 } else { 534 } else {
532 WARN_ON(1); 535 WARN_ON(1);
@@ -606,8 +609,8 @@ again:
606 cached_state = NULL; 609 cached_state = NULL;
607 } 610 }
608 611
609 if (cached && cached->tree && cached->start <= start && 612 if (cached && extent_state_in_tree(cached) &&
610 cached->end > start) { 613 cached->start <= start && cached->end > start) {
611 if (clear) 614 if (clear)
612 atomic_dec(&cached->refs); 615 atomic_dec(&cached->refs);
613 state = cached; 616 state = cached;
@@ -843,7 +846,7 @@ again:
843 if (cached_state && *cached_state) { 846 if (cached_state && *cached_state) {
844 state = *cached_state; 847 state = *cached_state;
845 if (state->start <= start && state->end > start && 848 if (state->start <= start && state->end > start &&
846 state->tree) { 849 extent_state_in_tree(state)) {
847 node = &state->rb_node; 850 node = &state->rb_node;
848 goto hit_next; 851 goto hit_next;
849 } 852 }
@@ -1069,7 +1072,7 @@ again:
1069 if (cached_state && *cached_state) { 1072 if (cached_state && *cached_state) {
1070 state = *cached_state; 1073 state = *cached_state;
1071 if (state->start <= start && state->end > start && 1074 if (state->start <= start && state->end > start &&
1072 state->tree) { 1075 extent_state_in_tree(state)) {
1073 node = &state->rb_node; 1076 node = &state->rb_node;
1074 goto hit_next; 1077 goto hit_next;
1075 } 1078 }
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1459 spin_lock(&tree->lock); 1462 spin_lock(&tree->lock);
1460 if (cached_state && *cached_state) { 1463 if (cached_state && *cached_state) {
1461 state = *cached_state; 1464 state = *cached_state;
1462 if (state->end == start - 1 && state->tree) { 1465 if (state->end == start - 1 && extent_state_in_tree(state)) {
1463 n = rb_next(&state->rb_node); 1466 n = rb_next(&state->rb_node);
1464 while (n) { 1467 while (n) {
1465 state = rb_entry(n, struct extent_state, 1468 state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1905 int bitset = 0; 1908 int bitset = 0;
1906 1909
1907 spin_lock(&tree->lock); 1910 spin_lock(&tree->lock);
1908 if (cached && cached->tree && cached->start <= start && 1911 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
1909 cached->end > start) 1912 cached->end > start)
1910 node = &cached->rb_node; 1913 node = &cached->rb_node;
1911 else 1914 else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1959 SetPageUptodate(page); 1962 SetPageUptodate(page);
1960} 1963}
1961 1964
1962/* 1965int free_io_failure(struct inode *inode, struct io_failure_record *rec)
1963 * When IO fails, either with EIO or csum verification fails, we
1964 * try other mirrors that might have a good copy of the data. This
1965 * io_failure_record is used to record state as we go through all the
1966 * mirrors. If another mirror has good data, the page is set up to date
1967 * and things continue. If a good mirror can't be found, the original
1968 * bio end_io callback is called to indicate things have failed.
1969 */
1970struct io_failure_record {
1971 struct page *page;
1972 u64 start;
1973 u64 len;
1974 u64 logical;
1975 unsigned long bio_flags;
1976 int this_mirror;
1977 int failed_mirror;
1978 int in_validation;
1979};
1980
1981static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1982 int did_repair)
1983{ 1966{
1984 int ret; 1967 int ret;
1985 int err = 0; 1968 int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
2012 * currently, there can be no more than two copies of every data bit. thus, 1995 * currently, there can be no more than two copies of every data bit. thus,
2013 * exactly one rewrite is required. 1996 * exactly one rewrite is required.
2014 */ 1997 */
2015int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 1998int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
2016 u64 length, u64 logical, struct page *page, 1999 struct page *page, unsigned int pg_offset, int mirror_num)
2017 int mirror_num)
2018{ 2000{
2001 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2019 struct bio *bio; 2002 struct bio *bio;
2020 struct btrfs_device *dev; 2003 struct btrfs_device *dev;
2021 u64 map_length = 0; 2004 u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2053 return -EIO; 2036 return -EIO;
2054 } 2037 }
2055 bio->bi_bdev = dev->bdev; 2038 bio->bi_bdev = dev->bdev;
2056 bio_add_page(bio, page, length, start - page_offset(page)); 2039 bio_add_page(bio, page, length, pg_offset);
2057 2040
2058 if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { 2041 if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
2059 /* try to remap that extent elsewhere? */ 2042 /* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2063 } 2046 }
2064 2047
2065 printk_ratelimited_in_rcu(KERN_INFO 2048 printk_ratelimited_in_rcu(KERN_INFO
2066 "BTRFS: read error corrected: ino %lu off %llu " 2049 "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
2067 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 2050 btrfs_ino(inode), start,
2068 start, rcu_str_deref(dev->name), sector); 2051 rcu_str_deref(dev->name), sector);
2069
2070 bio_put(bio); 2052 bio_put(bio);
2071 return 0; 2053 return 0;
2072} 2054}
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2082 return -EROFS; 2064 return -EROFS;
2083 2065
2084 for (i = 0; i < num_pages; i++) { 2066 for (i = 0; i < num_pages; i++) {
2085 struct page *p = extent_buffer_page(eb, i); 2067 struct page *p = eb->pages[i];
2086 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 2068
2087 start, p, mirror_num); 2069 ret = repair_io_failure(root->fs_info->btree_inode, start,
2070 PAGE_CACHE_SIZE, start, p,
2071 start - page_offset(p), mirror_num);
2088 if (ret) 2072 if (ret)
2089 break; 2073 break;
2090 start += PAGE_CACHE_SIZE; 2074 start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2097 * each time an IO finishes, we do a fast check in the IO failure tree 2081 * each time an IO finishes, we do a fast check in the IO failure tree
2098 * to see if we need to process or clean up an io_failure_record 2082 * to see if we need to process or clean up an io_failure_record
2099 */ 2083 */
2100static int clean_io_failure(u64 start, struct page *page) 2084int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2085 unsigned int pg_offset)
2101{ 2086{
2102 u64 private; 2087 u64 private;
2103 u64 private_failure; 2088 u64 private_failure;
2104 struct io_failure_record *failrec; 2089 struct io_failure_record *failrec;
2105 struct inode *inode = page->mapping->host;
2106 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2090 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2107 struct extent_state *state; 2091 struct extent_state *state;
2108 int num_copies; 2092 int num_copies;
2109 int did_repair = 0;
2110 int ret; 2093 int ret;
2111 2094
2112 private = 0; 2095 private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
2127 /* there was no real error, just free the record */ 2110 /* there was no real error, just free the record */
2128 pr_debug("clean_io_failure: freeing dummy error at %llu\n", 2111 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2129 failrec->start); 2112 failrec->start);
2130 did_repair = 1;
2131 goto out; 2113 goto out;
2132 } 2114 }
2133 if (fs_info->sb->s_flags & MS_RDONLY) 2115 if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
2144 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2126 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2145 failrec->len); 2127 failrec->len);
2146 if (num_copies > 1) { 2128 if (num_copies > 1) {
2147 ret = repair_io_failure(fs_info, start, failrec->len, 2129 repair_io_failure(inode, start, failrec->len,
2148 failrec->logical, page, 2130 failrec->logical, page,
2149 failrec->failed_mirror); 2131 pg_offset, failrec->failed_mirror);
2150 did_repair = !ret;
2151 } 2132 }
2152 ret = 0;
2153 } 2133 }
2154 2134
2155out: 2135out:
2156 if (!ret) 2136 free_io_failure(inode, failrec);
2157 ret = free_io_failure(inode, failrec, did_repair);
2158 2137
2159 return ret; 2138 return 0;
2160} 2139}
2161 2140
2162/* 2141/*
2163 * this is a generic handler for readpage errors (default 2142 * Can be called when
2164 * readpage_io_failed_hook). if other copies exist, read those and write back 2143 * - hold extent lock
2165 * good data to the failed position. does not investigate in remapping the 2144 * - under ordered extent
 2166 * failed extent elsewhere, hoping the device will be smart enough to do this as 2145 * - the inode is being freed
2167 * needed
2168 */ 2146 */
2147void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2148{
2149 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2150 struct io_failure_record *failrec;
2151 struct extent_state *state, *next;
2169 2152
2170static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2153 if (RB_EMPTY_ROOT(&failure_tree->state))
2171 struct page *page, u64 start, u64 end, 2154 return;
2172 int failed_mirror) 2155
2156 spin_lock(&failure_tree->lock);
2157 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2158 while (state) {
2159 if (state->start > end)
2160 break;
2161
2162 ASSERT(state->end <= end);
2163
2164 next = next_state(state);
2165
2166 failrec = (struct io_failure_record *)state->private;
2167 free_extent_state(state);
2168 kfree(failrec);
2169
2170 state = next;
2171 }
2172 spin_unlock(&failure_tree->lock);
2173}
2174
2175int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2176 struct io_failure_record **failrec_ret)
2173{ 2177{
2174 struct io_failure_record *failrec = NULL; 2178 struct io_failure_record *failrec;
2175 u64 private; 2179 u64 private;
2176 struct extent_map *em; 2180 struct extent_map *em;
2177 struct inode *inode = page->mapping->host;
2178 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2181 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2179 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2182 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2180 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2183 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2181 struct bio *bio;
2182 struct btrfs_io_bio *btrfs_failed_bio;
2183 struct btrfs_io_bio *btrfs_bio;
2184 int num_copies;
2185 int ret; 2184 int ret;
2186 int read_mode;
2187 u64 logical; 2185 u64 logical;
2188 2186
2189 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2190
2191 ret = get_state_private(failure_tree, start, &private); 2187 ret = get_state_private(failure_tree, start, &private);
2192 if (ret) { 2188 if (ret) {
2193 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2189 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2194 if (!failrec) 2190 if (!failrec)
2195 return -ENOMEM; 2191 return -ENOMEM;
2192
2196 failrec->start = start; 2193 failrec->start = start;
2197 failrec->len = end - start + 1; 2194 failrec->len = end - start + 1;
2198 failrec->this_mirror = 0; 2195 failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2212 em = NULL; 2209 em = NULL;
2213 } 2210 }
2214 read_unlock(&em_tree->lock); 2211 read_unlock(&em_tree->lock);
2215
2216 if (!em) { 2212 if (!em) {
2217 kfree(failrec); 2213 kfree(failrec);
2218 return -EIO; 2214 return -EIO;
2219 } 2215 }
2216
2220 logical = start - em->start; 2217 logical = start - em->start;
2221 logical = em->block_start + logical; 2218 logical = em->block_start + logical;
2222 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2219 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2225 extent_set_compress_type(&failrec->bio_flags, 2222 extent_set_compress_type(&failrec->bio_flags,
2226 em->compress_type); 2223 em->compress_type);
2227 } 2224 }
2228 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " 2225
2229 "len=%llu\n", logical, start, failrec->len); 2226 pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
2227 logical, start, failrec->len);
2228
2230 failrec->logical = logical; 2229 failrec->logical = logical;
2231 free_extent_map(em); 2230 free_extent_map(em);
2232 2231
@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2246 } 2245 }
2247 } else { 2246 } else {
2248 failrec = (struct io_failure_record *)(unsigned long)private; 2247 failrec = (struct io_failure_record *)(unsigned long)private;
2249 pr_debug("bio_readpage_error: (found) logical=%llu, " 2248 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
2250 "start=%llu, len=%llu, validation=%d\n",
2251 failrec->logical, failrec->start, failrec->len, 2249 failrec->logical, failrec->start, failrec->len,
2252 failrec->in_validation); 2250 failrec->in_validation);
2253 /* 2251 /*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2256 * clean_io_failure() clean all those errors at once. 2254 * clean_io_failure() clean all those errors at once.
2257 */ 2255 */
2258 } 2256 }
2257
2258 *failrec_ret = failrec;
2259
2260 return 0;
2261}
2262
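The new btrfs_get_io_failure_record is a get-or-create lookup: whether the record already sat in the failure tree or had to be allocated, the caller always receives it through *failrec_ret. A minimal user-space sketch of that idiom, using a hypothetical fixed-size table in place of the extent-state tree (all names illustrative):

	#include <stdio.h>
	#include <stdlib.h>

	struct failure_record {
		unsigned long long start;
		unsigned long long len;
		int this_mirror;
	};

	/* hypothetical stand-in for the failure tree: one slot per 4K block */
	static struct failure_record *table[256];

	static int get_failure_record(unsigned long long start,
				      unsigned long long end,
				      struct failure_record **out)
	{
		unsigned slot = (start >> 12) % 256;
		struct failure_record *rec = table[slot];

		if (!rec) {		/* not found: create and publish */
			rec = calloc(1, sizeof(*rec));
			if (!rec)
				return -1;	/* -ENOMEM analog */
			rec->start = start;
			rec->len = end - start + 1;
			table[slot] = rec;
		}
		*out = rec;	/* found or created, caller gets it either way */
		return 0;
	}

	int main(void)
	{
		struct failure_record *rec;

		get_failure_record(4096, 8191, &rec);	/* creates */
		get_failure_record(4096, 8191, &rec);	/* finds the same one */
		printf("start=%llu len=%llu\n", rec->start, rec->len);
		return 0;
	}
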
2263int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
2264 struct io_failure_record *failrec, int failed_mirror)
2265{
2266 int num_copies;
2267
2259 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2268 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2260 failrec->logical, failrec->len); 2269 failrec->logical, failrec->len);
2261 if (num_copies == 1) { 2270 if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2264 * all the retry and error correction code that follows. no 2273 * all the retry and error correction code that follows. no
2265 * matter what the error is, it is very likely to persist. 2274 * matter what the error is, it is very likely to persist.
2266 */ 2275 */
2267 pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", 2276 pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
2268 num_copies, failrec->this_mirror, failed_mirror); 2277 num_copies, failrec->this_mirror, failed_mirror);
2269 free_io_failure(inode, failrec, 0); 2278 return 0;
2270 return -EIO;
2271 } 2279 }
2272 2280
2273 /* 2281 /*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2287 BUG_ON(failrec->in_validation); 2295 BUG_ON(failrec->in_validation);
2288 failrec->in_validation = 1; 2296 failrec->in_validation = 1;
2289 failrec->this_mirror = failed_mirror; 2297 failrec->this_mirror = failed_mirror;
2290 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2291 } else { 2298 } else {
2292 /* 2299 /*
2293 * we're ready to fulfill a) and b) alongside. get a good copy 2300 * we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2303 failrec->this_mirror++; 2310 failrec->this_mirror++;
2304 if (failrec->this_mirror == failed_mirror) 2311 if (failrec->this_mirror == failed_mirror)
2305 failrec->this_mirror++; 2312 failrec->this_mirror++;
2306 read_mode = READ_SYNC;
2307 } 2313 }
2308 2314
2309 if (failrec->this_mirror > num_copies) { 2315 if (failrec->this_mirror > num_copies) {
2310 pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", 2316 pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
2311 num_copies, failrec->this_mirror, failed_mirror); 2317 num_copies, failrec->this_mirror, failed_mirror);
2312 free_io_failure(inode, failrec, 0); 2318 return 0;
2313 return -EIO;
2314 } 2319 }
2315 2320
2321 return 1;
2322}
2323
2324
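A quick way to see the retry policy btrfs_check_repairable encodes: this_mirror walks through the copies, skips the mirror that already failed, and gives up once it exceeds num_copies. A standalone sketch of just that rotation (names are illustrative):

	#include <stdio.h>

	/* returns the next mirror to try, or 0 when all copies are exhausted */
	static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
	{
		this_mirror++;
		if (this_mirror == failed_mirror)	/* never retry the bad copy */
			this_mirror++;
		return this_mirror > num_copies ? 0 : this_mirror;
	}

	int main(void)
	{
		int mirror = 0;

		/* 3 copies, mirror 2 failed: expect mirror 1, then 3, then stop */
		while ((mirror = next_mirror(mirror, 2, 3)))
			printf("try mirror %d\n", mirror);
		return 0;
	}
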
2325struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2326 struct io_failure_record *failrec,
2327 struct page *page, int pg_offset, int icsum,
2328 bio_end_io_t *endio_func, void *data)
2329{
2330 struct bio *bio;
2331 struct btrfs_io_bio *btrfs_failed_bio;
2332 struct btrfs_io_bio *btrfs_bio;
2333
2316 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2334 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2317 if (!bio) { 2335 if (!bio)
2318 free_io_failure(inode, failrec, 0); 2336 return NULL;
2319 return -EIO; 2337
2320 } 2338 bio->bi_end_io = endio_func;
2321 bio->bi_end_io = failed_bio->bi_end_io;
2322 bio->bi_iter.bi_sector = failrec->logical >> 9; 2339 bio->bi_iter.bi_sector = failrec->logical >> 9;
2323 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2340 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2324 bio->bi_iter.bi_size = 0; 2341 bio->bi_iter.bi_size = 0;
2342 bio->bi_private = data;
2325 2343
2326 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2344 btrfs_failed_bio = btrfs_io_bio(failed_bio);
2327 if (btrfs_failed_bio->csum) { 2345 if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2330 2348
2331 btrfs_bio = btrfs_io_bio(bio); 2349 btrfs_bio = btrfs_io_bio(bio);
2332 btrfs_bio->csum = btrfs_bio->csum_inline; 2350 btrfs_bio->csum = btrfs_bio->csum_inline;
2333 phy_offset >>= inode->i_sb->s_blocksize_bits; 2351 icsum *= csum_size;
2334 phy_offset *= csum_size; 2352 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
2335 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
2336 csum_size); 2353 csum_size);
2337 } 2354 }
2338 2355
2339 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2356 bio_add_page(bio, page, failrec->len, pg_offset);
2357
2358 return bio;
2359}
2360
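The icsum parameter replaces the old in-place phy_offset arithmetic: the caller shifts the physical offset down by the block size to get a checksum index, and btrfs_create_repair_bio scales it by csum_size to slice the failed block's checksum out of the original bio's csum array. The arithmetic as a runnable sketch, assuming 4K blocks and crc32c-sized sums:

	#include <stdio.h>
	#include <string.h>

	#define BLOCKSIZE_BITS	12	/* assumed 4K block size */
	#define CSUM_SIZE	4	/* assumed crc32c checksum */

	int main(void)
	{
		unsigned char failed_csums[8 * CSUM_SIZE]; /* csums of an 8-block bio */
		unsigned char repair_csum[CSUM_SIZE];
		unsigned long long phy_offset = 3 << BLOCKSIZE_BITS; /* block 3 failed */
		int icsum = phy_offset >> BLOCKSIZE_BITS;

		memset(failed_csums, 0, sizeof(failed_csums));
		failed_csums[3 * CSUM_SIZE] = 0xab;	/* mark block 3's csum */

		/* same indexing as the repair bio: icsum * csum_size into the array */
		memcpy(repair_csum, failed_csums + icsum * CSUM_SIZE, CSUM_SIZE);
		printf("icsum=%d first byte=0x%02x\n", icsum, repair_csum[0]);
		return 0;
	}
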
2361/*
2362 * This is a generic handler for readpage errors (default
2363 * readpage_io_failed_hook). If other copies exist, read those and write back
2364 * good data to the failed position. It does not try to remap the failed
2365 * extent elsewhere, hoping the device will be smart enough to do this as
2366 * needed.
2367 */
2368
2369static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2370 struct page *page, u64 start, u64 end,
2371 int failed_mirror)
2372{
2373 struct io_failure_record *failrec;
2374 struct inode *inode = page->mapping->host;
2375 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2376 struct bio *bio;
2377 int read_mode;
2378 int ret;
2379
2380 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2381
2382 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2383 if (ret)
2384 return ret;
2385
2386 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
2387 if (!ret) {
2388 free_io_failure(inode, failrec);
2389 return -EIO;
2390 }
2391
2392 if (failed_bio->bi_vcnt > 1)
2393 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2394 else
2395 read_mode = READ_SYNC;
2396
2397 phy_offset >>= inode->i_sb->s_blocksize_bits;
2398 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2399 start - page_offset(page),
2400 (int)phy_offset, failed_bio->bi_end_io,
2401 NULL);
2402 if (!bio) {
2403 free_io_failure(inode, failrec);
2404 return -EIO;
2405 }
2340 2406
2341 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2407 pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
2342 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2408 read_mode, failrec->this_mirror, failrec->in_validation);
2343 failrec->this_mirror, num_copies, failrec->in_validation);
2344 2409
2345 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2410 ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2346 failrec->this_mirror, 2411 failrec->this_mirror,
2347 failrec->bio_flags, 0); 2412 failrec->bio_flags, 0);
2413 if (ret) {
2414 free_io_failure(inode, failrec);
2415 bio_put(bio);
2416 }
2417
2348 return ret; 2418 return ret;
2349} 2419}
2350 2420
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2469 struct inode *inode = page->mapping->host; 2539 struct inode *inode = page->mapping->host;
2470 2540
2471 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2541 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2472 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, 2542 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
2473 io_bio->mirror_num); 2543 io_bio->mirror_num);
2474 tree = &BTRFS_I(inode)->io_tree; 2544 tree = &BTRFS_I(inode)->io_tree;
2475 2545
@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2503 if (ret) 2573 if (ret)
2504 uptodate = 0; 2574 uptodate = 0;
2505 else 2575 else
2506 clean_io_failure(start, page); 2576 clean_io_failure(inode, start, page, 0);
2507 } 2577 }
2508 2578
2509 if (likely(uptodate)) 2579 if (likely(uptodate))
@@ -2540,12 +2610,12 @@ readpage_ok:
2540 if (likely(uptodate)) { 2610 if (likely(uptodate)) {
2541 loff_t i_size = i_size_read(inode); 2611 loff_t i_size = i_size_read(inode);
2542 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2612 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2543 unsigned offset; 2613 unsigned off;
2544 2614
2545 /* Zero out the end if this page straddles i_size */ 2615 /* Zero out the end if this page straddles i_size */
2546 offset = i_size & (PAGE_CACHE_SIZE-1); 2616 off = i_size & (PAGE_CACHE_SIZE-1);
2547 if (page->index == end_index && offset) 2617 if (page->index == end_index && off)
2548 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2618 zero_user_segment(page, off, PAGE_CACHE_SIZE);
2549 SetPageUptodate(page); 2619 SetPageUptodate(page);
2550 } else { 2620 } else {
2551 ClearPageUptodate(page); 2621 ClearPageUptodate(page);
@@ -2618,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2618 2688
2619struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2689struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
2620{ 2690{
2621 return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2691 struct btrfs_io_bio *btrfs_bio;
2622} 2692 struct bio *new;
2623 2693
2694 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
2695 if (new) {
2696 btrfs_bio = btrfs_io_bio(new);
2697 btrfs_bio->csum = NULL;
2698 btrfs_bio->csum_allocated = NULL;
2699 btrfs_bio->end_io = NULL;
2700 }
2701 return new;
2702}
2624 2703
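btrfs_bio_clone now clears the per-bio checksum and end_io fields on the clone: bio_clone_bioset copies the whole btrfs_io_bio wrapper area, so without the reset a clone could appear to own the original's csum buffer and free it a second time. The same pattern in a generic user-space form (types are illustrative):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct io_wrapper {
		char data[64];
		char *csum;		/* owned buffer, must not be shared by clones */
		void (*end_io)(void);
	};

	static struct io_wrapper *clone_wrapper(const struct io_wrapper *orig)
	{
		struct io_wrapper *new = malloc(sizeof(*new));

		if (!new)
			return NULL;
		memcpy(new, orig, sizeof(*new)); /* blind copy, like bio_clone_bioset */
		new->csum = NULL;		 /* reset per-instance ownership */
		new->end_io = NULL;
		return new;
	}

	int main(void)
	{
		struct io_wrapper orig = { .csum = malloc(16) };
		struct io_wrapper *c = clone_wrapper(&orig);

		printf("clone csum is %s\n", c->csum ? "shared (bug)" : "NULL (safe)");
		free(orig.csum);
		free(c);
		return 0;
	}
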
2625/* this also allocates from the btrfs_bioset */ 2704/* this also allocates from the btrfs_bioset */
2626struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2705struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -3501,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
3501 3580
3502 num_pages = num_extent_pages(eb->start, eb->len); 3581 num_pages = num_extent_pages(eb->start, eb->len);
3503 for (i = 0; i < num_pages; i++) { 3582 for (i = 0; i < num_pages; i++) {
3504 struct page *p = extent_buffer_page(eb, i); 3583 struct page *p = eb->pages[i];
3505 3584
3506 if (!trylock_page(p)) { 3585 if (!trylock_page(p)) {
3507 if (!flush) { 3586 if (!flush) {
@@ -3522,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
3522 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3601 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3523} 3602}
3524 3603
3604static void set_btree_ioerr(struct page *page)
3605{
3606 struct extent_buffer *eb = (struct extent_buffer *)page->private;
3607 struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
3608
3609 SetPageError(page);
3610 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3611 return;
3612
3613 /*
3614 * If writeback for a btree extent that doesn't belong to a log tree
3615 * failed, increment the counter transaction->eb_write_errors.
3616 * We do this because while the transaction is running and before it's
3617 * committing (when we call filemap_fdata[write|wait]_range against
3618 * the btree inode), we might have
3619 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3620 * returns an error or an error happens during writeback, when we're
3621 * committing the transaction we wouldn't know about it, since the pages
3622 * may no longer be dirty nor marked for writeback (if a
3623 * subsequent modification to the extent buffer didn't happen before the
3624 * transaction commit), which makes filemap_fdata[write|wait]_range not
3625 * able to find the pages tagged with SetPageError at transaction
3626 * commit time. So if this happens we must abort the transaction,
3627 * otherwise we commit a super block with btree roots that point to
3628 * btree nodes/leaves whose content on disk is invalid - either garbage
3629 * or the content of some node/leaf from a past generation that got
3630 * cowed or deleted and is no longer valid.
3631 *
3632 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3633 * not be enough - we need to distinguish between log tree extents vs
3634 * non-log tree extents, and the next filemap_fdatawait_range() call
3635 * will catch and clear such errors in the mapping - and that call might
3636 * be from a log sync and not from a transaction commit. Also, checking
3637 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3638 * not done and would not be reliable - the eb might have been released
3639 * from memory and reading it back again means that flag would not be
3640 * set (since it's a runtime flag, not persisted on disk).
3641 *
3642 * Using the flags below in the btree inode also covers the case where
3643 * writepages() returns success after starting writeback for all dirty
3644 * pages, but the writeback for all those pages finishes with errors
3645 * before filemap_fdatawait_range() is called - because we were not
3646 * using AS_EIO/AS_ENOSPC,
3647 * filemap_fdatawait_range() would return success, as it could not know
3648 * that writeback errors happened (the pages were no longer tagged for
3649 * writeback).
3650 */
3651 switch (eb->log_index) {
3652 case -1:
3653 set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
3654 break;
3655 case 0:
3656 set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
3657 break;
3658 case 1:
3659 set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
3660 break;
3661 default:
3662 BUG(); /* unexpected, logic error */
3663 }
3664}
3665
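The switch at the end of set_btree_ioerr is the whole point of the new log_index field: a write error is routed to a per-tree runtime flag that survives until transaction commit, even after the pages stop being tagged for writeback. A compact demo of the dispatch (bit numbers and names are illustrative):

	#include <stdio.h>

	enum { BTREE_ERR, BTREE_LOG1_ERR, BTREE_LOG2_ERR };

	static int ioerr_bit(short log_index)
	{
		switch (log_index) {
		case -1: return BTREE_ERR;	/* not a log tree */
		case  0: return BTREE_LOG1_ERR;
		case  1: return BTREE_LOG2_ERR;
		default: return -1;		/* logic error, would BUG() */
		}
	}

	int main(void)
	{
		unsigned long runtime_flags = 0;
		short idx;

		for (idx = -1; idx <= 1; idx++)
			runtime_flags |= 1UL << ioerr_bit(idx);
		printf("flags=%#lx\n", runtime_flags);	/* prints flags=0x7 */
		return 0;
	}
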
3525static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3666static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3526{ 3667{
3527 struct bio_vec *bvec; 3668 struct bio_vec *bvec;
@@ -3535,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3535 BUG_ON(!eb); 3676 BUG_ON(!eb);
3536 done = atomic_dec_and_test(&eb->io_pages); 3677 done = atomic_dec_and_test(&eb->io_pages);
3537 3678
3538 if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3679 if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3539 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3540 ClearPageUptodate(page); 3680 ClearPageUptodate(page);
3541 SetPageError(page); 3681 set_btree_ioerr(page);
3542 } 3682 }
3543 3683
3544 end_page_writeback(page); 3684 end_page_writeback(page);
@@ -3565,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3565 int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; 3705 int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
3566 int ret = 0; 3706 int ret = 0;
3567 3707
3568 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3708 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3569 num_pages = num_extent_pages(eb->start, eb->len); 3709 num_pages = num_extent_pages(eb->start, eb->len);
3570 atomic_set(&eb->io_pages, num_pages); 3710 atomic_set(&eb->io_pages, num_pages);
3571 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3711 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3572 bio_flags = EXTENT_BIO_TREE_LOG; 3712 bio_flags = EXTENT_BIO_TREE_LOG;
3573 3713
3574 for (i = 0; i < num_pages; i++) { 3714 for (i = 0; i < num_pages; i++) {
3575 struct page *p = extent_buffer_page(eb, i); 3715 struct page *p = eb->pages[i];
3576 3716
3577 clear_page_dirty_for_io(p); 3717 clear_page_dirty_for_io(p);
3578 set_page_writeback(p); 3718 set_page_writeback(p);
@@ -3582,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3582 0, epd->bio_flags, bio_flags); 3722 0, epd->bio_flags, bio_flags);
3583 epd->bio_flags = bio_flags; 3723 epd->bio_flags = bio_flags;
3584 if (ret) { 3724 if (ret) {
3585 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3725 set_btree_ioerr(p);
3586 SetPageError(p); 3726 end_page_writeback(p);
3587 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3727 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3588 end_extent_buffer_writeback(eb); 3728 end_extent_buffer_writeback(eb);
3589 ret = -EIO; 3729 ret = -EIO;
@@ -3596,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3596 3736
3597 if (unlikely(ret)) { 3737 if (unlikely(ret)) {
3598 for (; i < num_pages; i++) { 3738 for (; i < num_pages; i++) {
3599 struct page *p = extent_buffer_page(eb, i); 3739 struct page *p = eb->pages[i];
3740 clear_page_dirty_for_io(p);
3600 unlock_page(p); 3741 unlock_page(p);
3601 } 3742 }
3602 } 3743 }
@@ -4166,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
4166 return NULL; 4307 return NULL;
4167} 4308}
4168 4309
4169static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
4170{
4171 unsigned long cnt = *((unsigned long *)ctx);
4172
4173 cnt++;
4174 *((unsigned long *)ctx) = cnt;
4175
4176 /* Now we're sure that the extent is shared. */
4177 if (cnt > 1)
4178 return 1;
4179 return 0;
4180}
4181
4182int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4310int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4183 __u64 start, __u64 len, get_extent_t *get_extent) 4311 __u64 start, __u64 len, get_extent_t *get_extent)
4184{ 4312{
@@ -4195,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4195 struct extent_map *em = NULL; 4323 struct extent_map *em = NULL;
4196 struct extent_state *cached_state = NULL; 4324 struct extent_state *cached_state = NULL;
4197 struct btrfs_path *path; 4325 struct btrfs_path *path;
4326 struct btrfs_root *root = BTRFS_I(inode)->root;
4198 int end = 0; 4327 int end = 0;
4199 u64 em_start = 0; 4328 u64 em_start = 0;
4200 u64 em_len = 0; 4329 u64 em_len = 0;
@@ -4215,8 +4344,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4215 * lookup the last file extent. We're not using i_size here 4344 * lookup the last file extent. We're not using i_size here
4216 * because there might be preallocation past i_size 4345 * because there might be preallocation past i_size
4217 */ 4346 */
4218 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 4347 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
4219 path, btrfs_ino(inode), -1, 0); 4348 0);
4220 if (ret < 0) { 4349 if (ret < 0) {
4221 btrfs_free_path(path); 4350 btrfs_free_path(path);
4222 return ret; 4351 return ret;
@@ -4224,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4224 WARN_ON(!ret); 4353 WARN_ON(!ret);
4225 path->slots[0]--; 4354 path->slots[0]--;
4226 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4355 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4227 found_type = btrfs_key_type(&found_key); 4356 found_type = found_key.type;
4228 4357
4229 /* No extents, but there might be delalloc bits */ 4358 /* No extents, but there might be delalloc bits */
4230 if (found_key.objectid != btrfs_ino(inode) || 4359 if (found_key.objectid != btrfs_ino(inode) ||
@@ -4309,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4309 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4438 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
4310 flags |= (FIEMAP_EXTENT_DELALLOC | 4439 flags |= (FIEMAP_EXTENT_DELALLOC |
4311 FIEMAP_EXTENT_UNKNOWN); 4440 FIEMAP_EXTENT_UNKNOWN);
4312 } else { 4441 } else if (fieinfo->fi_extents_max) {
4313 unsigned long ref_cnt = 0; 4442 u64 bytenr = em->block_start -
4443 (em->start - em->orig_start);
4314 4444
4315 disko = em->block_start + offset_in_extent; 4445 disko = em->block_start + offset_in_extent;
4316 4446
4317 /* 4447 /*
4318 * As btrfs supports shared space, this information 4448 * As btrfs supports shared space, this information
4319 * can be exported to userspace tools via 4449 * can be exported to userspace tools via
4320 * flag FIEMAP_EXTENT_SHARED. 4450 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
4451 * then we're just getting a count and we can skip the
4452 * lookup stuff.
4321 */ 4453 */
4322 ret = iterate_inodes_from_logical( 4454 ret = btrfs_check_shared(NULL, root->fs_info,
4323 em->block_start, 4455 root->objectid,
4324 BTRFS_I(inode)->root->fs_info, 4456 btrfs_ino(inode), bytenr);
4325 path, count_ext_ref, &ref_cnt); 4457 if (ret < 0)
4326 if (ret < 0 && ret != -ENOENT)
4327 goto out_free; 4458 goto out_free;
4328 4459 if (ret)
4329 if (ref_cnt > 1)
4330 flags |= FIEMAP_EXTENT_SHARED; 4460 flags |= FIEMAP_EXTENT_SHARED;
4461 ret = 0;
4331 } 4462 }
4332 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4463 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4333 flags |= FIEMAP_EXTENT_ENCODED; 4464 flags |= FIEMAP_EXTENT_ENCODED;
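From user space, the effect of switching to btrfs_check_shared is visible through the FIEMAP ioctl: extents it classifies as shared come back with FIEMAP_EXTENT_SHARED set. A minimal caller, with error handling trimmed:

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		char buf[sizeof(struct fiemap) + 32 * sizeof(struct fiemap_extent)];
		struct fiemap *fm = (struct fiemap *)buf;
		unsigned i;
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		memset(buf, 0, sizeof(buf));
		fm->fm_length = ~0ULL;		/* map the whole file */
		fm->fm_extent_count = 32;
		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
			return 1;
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("extent %u: %s\n", i,
			       (fm->fm_extents[i].fe_flags & FIEMAP_EXTENT_SHARED) ?
			       "shared" : "exclusive");
		close(fd);
		return 0;
	}
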
@@ -4381,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
4381/* 4512/*
4382 * Helper for releasing extent buffer page. 4513 * Helper for releasing extent buffer page.
4383 */ 4514 */
4384static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 4515static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
4385 unsigned long start_idx)
4386{ 4516{
4387 unsigned long index; 4517 unsigned long index;
4388 unsigned long num_pages;
4389 struct page *page; 4518 struct page *page;
4390 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4519 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4391 4520
4392 BUG_ON(extent_buffer_under_io(eb)); 4521 BUG_ON(extent_buffer_under_io(eb));
4393 4522
4394 num_pages = num_extent_pages(eb->start, eb->len); 4523 index = num_extent_pages(eb->start, eb->len);
4395 index = start_idx + num_pages; 4524 if (index == 0)
4396 if (start_idx >= index)
4397 return; 4525 return;
4398 4526
4399 do { 4527 do {
4400 index--; 4528 index--;
4401 page = extent_buffer_page(eb, index); 4529 page = eb->pages[index];
4402 if (page && mapped) { 4530 if (page && mapped) {
4403 spin_lock(&page->mapping->private_lock); 4531 spin_lock(&page->mapping->private_lock);
4404 /* 4532 /*
@@ -4429,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4429 /* One for when we alloced the page */ 4557 /* One for when we alloced the page */
4430 page_cache_release(page); 4558 page_cache_release(page);
4431 } 4559 }
4432 } while (index != start_idx); 4560 } while (index != 0);
4433} 4561}
4434 4562
4435/* 4563/*
@@ -4437,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4437 */ 4565 */
4438static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4566static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4439{ 4567{
4440 btrfs_release_extent_buffer_page(eb, 0); 4568 btrfs_release_extent_buffer_page(eb);
4441 __free_extent_buffer(eb); 4569 __free_extent_buffer(eb);
4442} 4570}
4443 4571
@@ -4580,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4580 4708
4581 num_pages = num_extent_pages(eb->start, eb->len); 4709 num_pages = num_extent_pages(eb->start, eb->len);
4582 for (i = 0; i < num_pages; i++) { 4710 for (i = 0; i < num_pages; i++) {
4583 struct page *p = extent_buffer_page(eb, i); 4711 struct page *p = eb->pages[i];
4712
4584 if (p != accessed) 4713 if (p != accessed)
4585 mark_page_accessed(p); 4714 mark_page_accessed(p);
4586 } 4715 }
@@ -4749,7 +4878,7 @@ again:
4749 */ 4878 */
4750 SetPageChecked(eb->pages[0]); 4879 SetPageChecked(eb->pages[0]);
4751 for (i = 1; i < num_pages; i++) { 4880 for (i = 1; i < num_pages; i++) {
4752 p = extent_buffer_page(eb, i); 4881 p = eb->pages[i];
4753 ClearPageChecked(p); 4882 ClearPageChecked(p);
4754 unlock_page(p); 4883 unlock_page(p);
4755 } 4884 }
@@ -4794,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
4794 } 4923 }
4795 4924
4796 /* Should be safe to release our pages at this point */ 4925 /* Should be safe to release our pages at this point */
4797 btrfs_release_extent_buffer_page(eb, 0); 4926 btrfs_release_extent_buffer_page(eb);
4798 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4927 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4799 return 1; 4928 return 1;
4800 } 4929 }
@@ -4860,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
4860 num_pages = num_extent_pages(eb->start, eb->len); 4989 num_pages = num_extent_pages(eb->start, eb->len);
4861 4990
4862 for (i = 0; i < num_pages; i++) { 4991 for (i = 0; i < num_pages; i++) {
4863 page = extent_buffer_page(eb, i); 4992 page = eb->pages[i];
4864 if (!PageDirty(page)) 4993 if (!PageDirty(page))
4865 continue; 4994 continue;
4866 4995
@@ -4896,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
4896 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5025 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4897 5026
4898 for (i = 0; i < num_pages; i++) 5027 for (i = 0; i < num_pages; i++)
4899 set_page_dirty(extent_buffer_page(eb, i)); 5028 set_page_dirty(eb->pages[i]);
4900 return was_dirty; 5029 return was_dirty;
4901} 5030}
4902 5031
@@ -4909,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4909 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5038 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4910 num_pages = num_extent_pages(eb->start, eb->len); 5039 num_pages = num_extent_pages(eb->start, eb->len);
4911 for (i = 0; i < num_pages; i++) { 5040 for (i = 0; i < num_pages; i++) {
4912 page = extent_buffer_page(eb, i); 5041 page = eb->pages[i];
4913 if (page) 5042 if (page)
4914 ClearPageUptodate(page); 5043 ClearPageUptodate(page);
4915 } 5044 }
@@ -4925,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
4925 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5054 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4926 num_pages = num_extent_pages(eb->start, eb->len); 5055 num_pages = num_extent_pages(eb->start, eb->len);
4927 for (i = 0; i < num_pages; i++) { 5056 for (i = 0; i < num_pages; i++) {
4928 page = extent_buffer_page(eb, i); 5057 page = eb->pages[i];
4929 SetPageUptodate(page); 5058 SetPageUptodate(page);
4930 } 5059 }
4931 return 0; 5060 return 0;
@@ -4965,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
4965 5094
4966 num_pages = num_extent_pages(eb->start, eb->len); 5095 num_pages = num_extent_pages(eb->start, eb->len);
4967 for (i = start_i; i < num_pages; i++) { 5096 for (i = start_i; i < num_pages; i++) {
4968 page = extent_buffer_page(eb, i); 5097 page = eb->pages[i];
4969 if (wait == WAIT_NONE) { 5098 if (wait == WAIT_NONE) {
4970 if (!trylock_page(page)) 5099 if (!trylock_page(page))
4971 goto unlock_exit; 5100 goto unlock_exit;
@@ -4984,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
4984 goto unlock_exit; 5113 goto unlock_exit;
4985 } 5114 }
4986 5115
4987 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 5116 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
4988 eb->read_mirror = 0; 5117 eb->read_mirror = 0;
4989 atomic_set(&eb->io_pages, num_reads); 5118 atomic_set(&eb->io_pages, num_reads);
4990 for (i = start_i; i < num_pages; i++) { 5119 for (i = start_i; i < num_pages; i++) {
4991 page = extent_buffer_page(eb, i); 5120 page = eb->pages[i];
4992 if (!PageUptodate(page)) { 5121 if (!PageUptodate(page)) {
4993 ClearPageError(page); 5122 ClearPageError(page);
4994 err = __extent_read_full_page(tree, page, 5123 err = __extent_read_full_page(tree, page,
@@ -5013,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5013 return ret; 5142 return ret;
5014 5143
5015 for (i = start_i; i < num_pages; i++) { 5144 for (i = start_i; i < num_pages; i++) {
5016 page = extent_buffer_page(eb, i); 5145 page = eb->pages[i];
5017 wait_on_page_locked(page); 5146 wait_on_page_locked(page);
5018 if (!PageUptodate(page)) 5147 if (!PageUptodate(page))
5019 ret = -EIO; 5148 ret = -EIO;
@@ -5024,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5024unlock_exit: 5153unlock_exit:
5025 i = start_i; 5154 i = start_i;
5026 while (locked_pages > 0) { 5155 while (locked_pages > 0) {
5027 page = extent_buffer_page(eb, i); 5156 page = eb->pages[i];
5028 i++; 5157 i++;
5029 unlock_page(page); 5158 unlock_page(page);
5030 locked_pages--; 5159 locked_pages--;
@@ -5050,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5050 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5179 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5051 5180
5052 while (len > 0) { 5181 while (len > 0) {
5053 page = extent_buffer_page(eb, i); 5182 page = eb->pages[i];
5054 5183
5055 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5184 cur = min(len, (PAGE_CACHE_SIZE - offset));
5056 kaddr = page_address(page); 5185 kaddr = page_address(page);
@@ -5082,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5082 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5211 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5083 5212
5084 while (len > 0) { 5213 while (len > 0) {
5085 page = extent_buffer_page(eb, i); 5214 page = eb->pages[i];
5086 5215
5087 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5216 cur = min(len, (PAGE_CACHE_SIZE - offset));
5088 kaddr = page_address(page); 5217 kaddr = page_address(page);
@@ -5131,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5131 return -EINVAL; 5260 return -EINVAL;
5132 } 5261 }
5133 5262
5134 p = extent_buffer_page(eb, i); 5263 p = eb->pages[i];
5135 kaddr = page_address(p); 5264 kaddr = page_address(p);
5136 *map = kaddr + offset; 5265 *map = kaddr + offset;
5137 *map_len = PAGE_CACHE_SIZE - offset; 5266 *map_len = PAGE_CACHE_SIZE - offset;
@@ -5157,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
5157 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5286 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5158 5287
5159 while (len > 0) { 5288 while (len > 0) {
5160 page = extent_buffer_page(eb, i); 5289 page = eb->pages[i];
5161 5290
5162 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5291 cur = min(len, (PAGE_CACHE_SIZE - offset));
5163 5292
@@ -5191,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5191 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5320 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5192 5321
5193 while (len > 0) { 5322 while (len > 0) {
5194 page = extent_buffer_page(eb, i); 5323 page = eb->pages[i];
5195 WARN_ON(!PageUptodate(page)); 5324 WARN_ON(!PageUptodate(page));
5196 5325
5197 cur = min(len, PAGE_CACHE_SIZE - offset); 5326 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5221,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
5221 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5350 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5222 5351
5223 while (len > 0) { 5352 while (len > 0) {
5224 page = extent_buffer_page(eb, i); 5353 page = eb->pages[i];
5225 WARN_ON(!PageUptodate(page)); 5354 WARN_ON(!PageUptodate(page));
5226 5355
5227 cur = min(len, PAGE_CACHE_SIZE - offset); 5356 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5252,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5252 (PAGE_CACHE_SIZE - 1); 5381 (PAGE_CACHE_SIZE - 1);
5253 5382
5254 while (len > 0) { 5383 while (len > 0) {
5255 page = extent_buffer_page(dst, i); 5384 page = dst->pages[i];
5256 WARN_ON(!PageUptodate(page)); 5385 WARN_ON(!PageUptodate(page));
5257 5386
5258 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 5387 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5330,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5330 cur = min_t(unsigned long, cur, 5459 cur = min_t(unsigned long, cur,
5331 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 5460 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
5332 5461
5333 copy_pages(extent_buffer_page(dst, dst_i), 5462 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5334 extent_buffer_page(dst, src_i),
5335 dst_off_in_page, src_off_in_page, cur); 5463 dst_off_in_page, src_off_in_page, cur);
5336 5464
5337 src_offset += cur; 5465 src_offset += cur;
@@ -5377,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5377 5505
5378 cur = min_t(unsigned long, len, src_off_in_page + 1); 5506 cur = min_t(unsigned long, len, src_off_in_page + 1);
5379 cur = min(cur, dst_off_in_page + 1); 5507 cur = min(cur, dst_off_in_page + 1);
5380 copy_pages(extent_buffer_page(dst, dst_i), 5508 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5381 extent_buffer_page(dst, src_i),
5382 dst_off_in_page - cur + 1, 5509 dst_off_in_page - cur + 1,
5383 src_off_in_page - cur + 1, cur); 5510 src_off_in_page - cur + 1, cur);
5384 5511
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e7bde1..6d4b938be986 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
11#define EXTENT_NEW (1 << 4) 11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5) 12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 14#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 15#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 16#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
34 32
35/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
36#define EXTENT_BUFFER_UPTODATE 0 34#define EXTENT_BUFFER_UPTODATE 0
37#define EXTENT_BUFFER_BLOCKING 1
38#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
39#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
40#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ 37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
41#define EXTENT_BUFFER_TREE_REF 5 38#define EXTENT_BUFFER_TREE_REF 5
42#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
43#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
44#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
45#define EXTENT_BUFFER_DUMMY 9 42#define EXTENT_BUFFER_DUMMY 9
46#define EXTENT_BUFFER_IN_TREE 10 43#define EXTENT_BUFFER_IN_TREE 10
44#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
47 45
48/* these are flags for extent_clear_unlock_delalloc */ 46/* these are flags for extent_clear_unlock_delalloc */
49#define PAGE_UNLOCK (1 << 0) 47#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
57 * map has page->private set to one. 55 * map has page->private set to one.
58 */ 56 */
59#define EXTENT_PAGE_PRIVATE 1 57#define EXTENT_PAGE_PRIVATE 1
60#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
61 58
62struct extent_state; 59struct extent_state;
63struct btrfs_root; 60struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
108 struct rb_node rb_node; 105 struct rb_node rb_node;
109 106
110 /* ADD NEW ELEMENTS AFTER THIS */ 107 /* ADD NEW ELEMENTS AFTER THIS */
111 struct extent_io_tree *tree;
112 wait_queue_head_t wq; 108 wait_queue_head_t wq;
113 atomic_t refs; 109 atomic_t refs;
114 unsigned long state; 110 unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
126struct extent_buffer { 122struct extent_buffer {
127 u64 start; 123 u64 start;
128 unsigned long len; 124 unsigned long len;
129 unsigned long map_start;
130 unsigned long map_len;
131 unsigned long bflags; 125 unsigned long bflags;
132 struct btrfs_fs_info *fs_info; 126 struct btrfs_fs_info *fs_info;
133 spinlock_t refs_lock; 127 spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
144 atomic_t blocking_readers; 138 atomic_t blocking_readers;
145 atomic_t spinning_readers; 139 atomic_t spinning_readers;
146 atomic_t spinning_writers; 140 atomic_t spinning_writers;
147 int lock_nested; 141 short lock_nested;
142 /* >= 0 if eb belongs to a log tree, -1 otherwise */
143 short log_index;
148 144
149 /* protects write locks */ 145 /* protects write locks */
150 rwlock_t lock; 146 rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
286 (start >> PAGE_CACHE_SHIFT); 282 (start >> PAGE_CACHE_SHIFT);
287} 283}
288 284
289static inline struct page *extent_buffer_page(struct extent_buffer *eb,
290 unsigned long i)
291{
292 return eb->pages[i];
293}
294
295static inline void extent_buffer_get(struct extent_buffer *eb) 285static inline void extent_buffer_get(struct extent_buffer *eb)
296{ 286{
297 atomic_inc(&eb->refs); 287 atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
341 331
342struct btrfs_fs_info; 332struct btrfs_fs_info;
343 333
344int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 334int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
345 u64 length, u64 logical, struct page *page, 335 struct page *page, unsigned int pg_offset,
346 int mirror_num); 336 int mirror_num);
337int clean_io_failure(struct inode *inode, u64 start, struct page *page,
338 unsigned int pg_offset);
347int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 339int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
348int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 340int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
349 int mirror_num); 341 int mirror_num);
342
343/*
344 * When IO fails, either with EIO or csum verification fails, we
345 * try other mirrors that might have a good copy of the data. This
346 * io_failure_record is used to record state as we go through all the
347 * mirrors. If another mirror has good data, the page is set up to date
348 * and things continue. If a good mirror can't be found, the original
349 * bio end_io callback is called to indicate things have failed.
350 */
351struct io_failure_record {
352 struct page *page;
353 u64 start;
354 u64 len;
355 u64 logical;
356 unsigned long bio_flags;
357 int this_mirror;
358 int failed_mirror;
359 int in_validation;
360};
361
362void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
363int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
364 struct io_failure_record **failrec_ret);
365int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
366 struct io_failure_record *failrec, int fail_mirror);
367struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
368 struct io_failure_record *failrec,
369 struct page *page, int pg_offset, int icsum,
370 bio_end_io_t *endio_func, void *data);
371int free_io_failure(struct inode *inode, struct io_failure_record *rec);
350#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 372#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
351noinline u64 find_lock_delalloc_range(struct inode *inode, 373noinline u64 find_lock_delalloc_range(struct inode *inode,
352 struct extent_io_tree *tree, 374 struct extent_io_tree *tree,
353 struct page *locked_page, u64 *start, 375 struct page *locked_page, u64 *start,
354 u64 *end, u64 max_bytes); 376 u64 *end, u64 max_bytes);
377#endif
355struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 378struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
356 u64 start, unsigned long len); 379 u64 start, unsigned long len);
357#endif 380#endif
358#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54c84daec9b5..783a94355efd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
55 return -ENOMEM; 55 return -ENOMEM;
56 file_key.objectid = objectid; 56 file_key.objectid = objectid;
57 file_key.offset = pos; 57 file_key.offset = pos;
58 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 58 file_key.type = BTRFS_EXTENT_DATA_KEY;
59 59
60 path->leave_spinning = 1; 60 path->leave_spinning = 1;
61 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 61 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
100 100
101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
102 file_key.offset = bytenr; 102 file_key.offset = bytenr;
103 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 103 file_key.type = BTRFS_EXTENT_CSUM_KEY;
104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); 104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
105 if (ret < 0) 105 if (ret < 0)
106 goto fail; 106 goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
111 goto fail; 111 goto fail;
112 path->slots[0]--; 112 path->slots[0]--;
113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
114 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) 114 if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
115 goto fail; 115 goto fail;
116 116
117 csum_offset = (bytenr - found_key.offset) >> 117 csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148 148
149 file_key.objectid = objectid; 149 file_key.objectid = objectid;
150 file_key.offset = offset; 150 file_key.offset = offset;
151 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 151 file_key.type = BTRFS_EXTENT_DATA_KEY;
152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); 152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
153 return ret; 153 return ret;
154} 154}
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
299} 299}
300 300
301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
302 struct btrfs_dio_private *dip, struct bio *bio, 302 struct bio *bio, u64 offset)
303 u64 offset)
304{ 303{
305 int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; 304 return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
306 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
307 int ret;
308
309 len >>= inode->i_sb->s_blocksize_bits;
310 len *= csum_size;
311
312 ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
313 (u32 *)(dip->csum + len), 1);
314 return ret;
315} 305}
316 306
317int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 307int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
329 u64 csum_end; 319 u64 csum_end;
330 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 320 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
331 321
332 ASSERT(start == ALIGN(start, root->sectorsize) && 322 ASSERT(IS_ALIGNED(start, root->sectorsize) &&
333 (end + 1) == ALIGN(end + 1, root->sectorsize)); 323 IS_ALIGNED(end + 1, root->sectorsize));
334 324
335 path = btrfs_alloc_path(); 325 path = btrfs_alloc_path();
336 if (!path) 326 if (!path)
@@ -720,7 +710,7 @@ again:
720 bytenr = sums->bytenr + total_bytes; 710 bytenr = sums->bytenr + total_bytes;
721 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 711 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
722 file_key.offset = bytenr; 712 file_key.offset = bytenr;
723 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 713 file_key.type = BTRFS_EXTENT_CSUM_KEY;
724 714
725 item = btrfs_lookup_csum(trans, root, path, bytenr, 1); 715 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
726 if (!IS_ERR(item)) { 716 if (!IS_ERR(item)) {
@@ -790,7 +780,7 @@ again:
790 csum_offset = (bytenr - found_key.offset) >> 780 csum_offset = (bytenr - found_key.offset) >>
791 root->fs_info->sb->s_blocksize_bits; 781 root->fs_info->sb->s_blocksize_bits;
792 782
793 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || 783 if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
794 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 784 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
795 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { 785 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
796 goto insert; 786 goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ff1cc0399b9a..a18ceabd99a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
299 299
300 /* get the inode */ 300 /* get the inode */
301 key.objectid = defrag->root; 301 key.objectid = defrag->root;
302 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 302 key.type = BTRFS_ROOT_ITEM_KEY;
303 key.offset = (u64)-1; 303 key.offset = (u64)-1;
304 304
305 index = srcu_read_lock(&fs_info->subvol_srcu); 305 index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
311 } 311 }
312 312
313 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 key.type = BTRFS_INODE_ITEM_KEY;
315 key.offset = 0; 315 key.offset = 0;
316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
317 if (IS_ERR(inode)) { 317 if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
452 if (unlikely(copied == 0)) 452 if (unlikely(copied == 0))
453 break; 453 break;
454 454
455 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 455 if (copied < PAGE_CACHE_SIZE - offset) {
456 offset += copied; 456 offset += copied;
457 } else { 457 } else {
458 pg++; 458 pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1481 bool force_page_uptodate = false; 1481 bool force_page_uptodate = false;
1482 bool need_unlock; 1482 bool need_unlock;
1483 1483
1484 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1484 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
1485 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1485 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1486 (sizeof(struct page *)));
1487 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1486 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1488 nrptrs = max(nrptrs, 8); 1487 nrptrs = max(nrptrs, 8);
1489 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1488 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
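DIV_ROUND_UP(n, d) is the kernel's ((n) + (d) - 1) / (d); this hunk and the following ones replace open-coded shift arithmetic with it. A quick standalone check of the equivalence, assuming 4K pages:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE		4096UL
	#define PAGE_CACHE_SHIFT	12
	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned long bytes;

		for (bytes = 0; bytes <= 3 * PAGE_CACHE_SIZE; bytes += 2048) {
			unsigned long a = DIV_ROUND_UP(bytes, PAGE_CACHE_SIZE);
			unsigned long b = (bytes + PAGE_CACHE_SIZE - 1)
						>> PAGE_CACHE_SHIFT;

			if (a != b)
				printf("mismatch at %lu\n", bytes);
		}
		printf("equivalent for all tested sizes\n");
		return 0;
	}
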
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1497 size_t write_bytes = min(iov_iter_count(i), 1496 size_t write_bytes = min(iov_iter_count(i),
1498 nrptrs * (size_t)PAGE_CACHE_SIZE - 1497 nrptrs * (size_t)PAGE_CACHE_SIZE -
1499 offset); 1498 offset);
1500 size_t num_pages = (write_bytes + offset + 1499 size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1501 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1500 PAGE_CACHE_SIZE);
1502 size_t reserve_bytes; 1501 size_t reserve_bytes;
1503 size_t dirty_pages; 1502 size_t dirty_pages;
1504 size_t copied; 1503 size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1526 * our prealloc extent may be smaller than 1525 * our prealloc extent may be smaller than
1527 * write_bytes, so scale down. 1526 * write_bytes, so scale down.
1528 */ 1527 */
1529 num_pages = (write_bytes + offset + 1528 num_pages = DIV_ROUND_UP(write_bytes + offset,
1530 PAGE_CACHE_SIZE - 1) >> 1529 PAGE_CACHE_SIZE);
1531 PAGE_CACHE_SHIFT;
1532 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1530 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1533 ret = 0; 1531 ret = 0;
1534 } else { 1532 } else {
@@ -1590,9 +1588,8 @@ again:
1590 dirty_pages = 0; 1588 dirty_pages = 0;
1591 } else { 1589 } else {
1592 force_page_uptodate = false; 1590 force_page_uptodate = false;
1593 dirty_pages = (copied + offset + 1591 dirty_pages = DIV_ROUND_UP(copied + offset,
1594 PAGE_CACHE_SIZE - 1) >> 1592 PAGE_CACHE_SIZE);
1595 PAGE_CACHE_SHIFT;
1596 } 1593 }
1597 1594
1598 /* 1595 /*
@@ -1653,7 +1650,7 @@ again:
1653 cond_resched(); 1650 cond_resched();
1654 1651
1655 balance_dirty_pages_ratelimited(inode->i_mapping); 1652 balance_dirty_pages_ratelimited(inode->i_mapping);
1656 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1653 if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
1657 btrfs_btree_balance_dirty(root); 1654 btrfs_btree_balance_dirty(root);
1658 1655
1659 pos += copied; 1656 pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1795 if (sync) 1792 if (sync)
1796 atomic_inc(&BTRFS_I(inode)->sync_writers); 1793 atomic_inc(&BTRFS_I(inode)->sync_writers);
1797 1794
1798 if (unlikely(file->f_flags & O_DIRECT)) { 1795 if (file->f_flags & O_DIRECT) {
1799 num_written = __btrfs_direct_write(iocb, from, pos); 1796 num_written = __btrfs_direct_write(iocb, from, pos);
1800 } else { 1797 } else {
1801 num_written = __btrfs_buffered_write(file, from, pos); 1798 num_written = __btrfs_buffered_write(file, from, pos);
@@ -1852,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1852 return 0; 1849 return 0;
1853} 1850}
1854 1851
1852static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1853{
1854 int ret;
1855
1856 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862
1863 return ret;
1864}
1865
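start_ordered_ops factors out the flush pattern fsync was open-coding: write the range once and, if the inode may have async (compressed) extents in flight, write it again, since the first pass can leave pages redirtied as compression completes. The shape of it as a toy sketch, with the flag and the flush as stand-ins:

	#include <stdio.h>

	static int has_async_extents = 1; /* stand-in for BTRFS_INODE_HAS_ASYNC_EXTENT */
	static int passes;

	static int flush_range(void)
	{
		printf("flush pass %d\n", ++passes);
		return 0;	/* 0 on success, like filemap_fdatawrite_range */
	}

	static int start_ordered_ops(void)
	{
		int ret = flush_range();

		if (!ret && has_async_extents) /* compression may redirty pages */
			ret = flush_range();
		return ret;
	}

	int main(void)
	{
		return start_ordered_ops();
	}
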
1855/* 1866/*
1856 * fsync call for both files and directories. This logs the inode into 1867 * fsync call for both files and directories. This logs the inode into
1857 * the tree log instead of forcing full commits whenever possible. 1868 * the tree log instead of forcing full commits whenever possible.
@@ -1881,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1881 * multi-task, and improve performance. See 1892
1882 * btrfs_wait_ordered_range for an explanation of the ASYNC check. 1893 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1883 */ 1894 */
1884 atomic_inc(&BTRFS_I(inode)->sync_writers); 1895 ret = start_ordered_ops(inode, start, end);
1885 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1886 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1887 &BTRFS_I(inode)->runtime_flags))
1888 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1889 atomic_dec(&BTRFS_I(inode)->sync_writers);
1890 if (ret) 1896 if (ret)
1891 return ret; 1897 return ret;
1892 1898
1893 mutex_lock(&inode->i_mutex); 1899 mutex_lock(&inode->i_mutex);
1894
1895 /*
1896 * We flush the dirty pages again to avoid some dirty pages in the
1897 * range being left.
1898 */
1899 atomic_inc(&root->log_batch); 1900 atomic_inc(&root->log_batch);
1900 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1901 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1901 &BTRFS_I(inode)->runtime_flags); 1902 &BTRFS_I(inode)->runtime_flags);
1903 /*
1904 * We might have had more pages made dirty after calling
1905 * start_ordered_ops and before acquiring the inode's i_mutex.
1906 */
1902 if (full_sync) { 1907 if (full_sync) {
1908 /*
1909 * For a full sync, we need to make sure any ordered operations
1910 * start and finish before we start logging the inode, so that
1911 * all extents are persisted and the respective file extent
1912 * items are in the fs/subvol btree.
1913 */
1903 ret = btrfs_wait_ordered_range(inode, start, end - start + 1); 1914 ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1904 if (ret) { 1915 } else {
1905 mutex_unlock(&inode->i_mutex); 1916 /*
1906 goto out; 1917 * Start any new ordered operations before starting to log the
1907 } 1918 * inode. We will wait for them to finish in btrfs_sync_log().
1919 *
1920 * Right before acquiring the inode's mutex, we might have new
1921 * writes dirtying pages, which won't immediately start the
1922 * respective ordered operations - that is done through the
1923 * fill_delalloc callbacks invoked from the writepage and
1924 * writepages address space operations. So make sure we start
1925 * all ordered operations before starting to log our inode. Not
1926 * doing this means that while logging the inode, writeback
1927 * could start and invoke writepage/writepages, which would call
1928 * the fill_delalloc callbacks (cow_file_range,
1929 * submit_compressed_extents). These callbacks add first an
1930 * extent map to the modified list of extents and then create
1931 * the respective ordered operation, which means in
1932 * tree-log.c:btrfs_log_inode() we might capture all existing
1933 * ordered operations (with btrfs_get_logged_extents()) before
1934 * the fill_delalloc callback adds its ordered operation, and by
1935 * the time we visit the modified list of extent maps (with
1936 * btrfs_log_changed_extents()), we see and process the extent
1937 * map they created. We then use the extent map to construct a
1938 * file extent item for logging without waiting for the
1939 * respective ordered operation to finish - this file extent
1940 * item points to a disk location that might not have yet been
1941 * written to, containing random data - so after a crash a log
1942 * replay will make our inode have file extent items that point
1943 * to disk locations containing invalid data, as we returned
1944 * success to userspace without waiting for the respective
1945 * ordered operation to finish, because it wasn't captured by
1946 * btrfs_get_logged_extents().
1947 */
1948 ret = start_ordered_ops(inode, start, end);
1949 }
1950 if (ret) {
1951 mutex_unlock(&inode->i_mutex);
1952 goto out;
1908 } 1953 }
1909 atomic_inc(&root->log_batch); 1954 atomic_inc(&root->log_batch);
1910 1955
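The restructured branch makes the fsync ordering policy explicit: a full sync must wait for ordered extents to finish before logging the inode, while the fast path only needs to start them, since btrfs_sync_log() waits for them later. A stub sketch of just that control flow (the two functions are stand-ins):

	#include <stdio.h>

	static int wait_ordered_range(void)
	{
		printf("full sync: wait for ordered extents to finish\n");
		return 0;
	}

	static int start_ordered(void)
	{
		printf("fast path: start ordered ops, sync_log waits later\n");
		return 0;
	}

	static int prepare_log(int full_sync)
	{
		return full_sync ? wait_ordered_range() : start_ordered();
	}

	int main(void)
	{
		return prepare_log(0) || prepare_log(1);
	}
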
@@ -1984,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1984 */ 2029 */
1985 mutex_unlock(&inode->i_mutex); 2030 mutex_unlock(&inode->i_mutex);
1986 2031
2032 /*
2033 * If any of the ordered extents had an error, just return it to user
2034 * space, so that the application knows some writes didn't succeed and
2035 * can take proper action (e.g. retry). Blindly committing the
2036 * transaction in this case would fool userspace into thinking everything
2037 * was successful. And we also want to make sure our log doesn't contain
2038 * file extent items pointing to extents that weren't fully written to -
2039 * just like in the non fast fsync path, where we check for the ordered
2040 * operation's error flag before writing to the log tree and return -EIO
2041 * if any of them had this flag set (btrfs_wait_ordered_range) -
2042 * therefore we need to check for errors in the ordered operations,
2043 * which are indicated by ctx.io_err.
2044 */
2045 if (ctx.io_err) {
2046 btrfs_end_transaction(trans, root);
2047 ret = ctx.io_err;
2048 goto out;
2049 }
2050
1987 if (ret != BTRFS_NO_LOG_SYNC) { 2051 if (ret != BTRFS_NO_LOG_SYNC) {
1988 if (!ret) { 2052 if (!ret) {
1989 ret = btrfs_sync_log(trans, root, &ctx); 2053 ret = btrfs_sync_log(trans, root, &ctx);
@@ -2621,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2621 struct btrfs_root *root = BTRFS_I(inode)->root; 2685 struct btrfs_root *root = BTRFS_I(inode)->root;
2622 struct extent_map *em = NULL; 2686 struct extent_map *em = NULL;
2623 struct extent_state *cached_state = NULL; 2687 struct extent_state *cached_state = NULL;
2624 u64 lockstart = *offset; 2688 u64 lockstart;
2625 u64 lockend = i_size_read(inode); 2689 u64 lockend;
2626 u64 start = *offset; 2690 u64 start;
2627 u64 len = i_size_read(inode); 2691 u64 len;
2628 int ret = 0; 2692 int ret = 0;
2629 2693
2630 lockend = max_t(u64, root->sectorsize, lockend); 2694 if (inode->i_size == 0)
2695 return -ENXIO;
2696
2697 /*
2698 * *offset can be negative, in this case we start finding DATA/HOLE from
2699 * the very start of the file.
2700 */
2701 start = max_t(loff_t, 0, *offset);
2702
2703 lockstart = round_down(start, root->sectorsize);
2704 lockend = round_up(i_size_read(inode), root->sectorsize);
2631 if (lockend <= lockstart) 2705 if (lockend <= lockstart)
2632 lockend = lockstart + root->sectorsize; 2706 lockend = lockstart + root->sectorsize;
2633
2634 lockend--; 2707 lockend--;
2635 len = lockend - lockstart + 1; 2708 len = lockend - lockstart + 1;
2636 2709
2637 len = max_t(u64, len, root->sectorsize);
2638 if (inode->i_size == 0)
2639 return -ENXIO;
2640
2641 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2710 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2642 &cached_state); 2711 &cached_state);
2643 2712
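
The find_desired_extent() rework above clamps a possibly negative *offset to zero and rounds the lock range to sector boundaries before taking the extent lock. A minimal userspace sketch of that range computation follows; ROUND_DOWN/ROUND_UP are simplified stand-ins for the kernel's round_down/round_up and assume a power-of-two sector size:

#include <stdint.h>
#include <stdio.h>

/* Simplified round helpers; sectorsize must be a power of two. */
#define ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))
#define ROUND_UP(x, a)   ROUND_DOWN((x) + (a) - 1, (a))

/* Compute the inclusive [lockstart, lockend] range the function locks. */
static void lock_range(int64_t offset, uint64_t isize, uint64_t sectorsize,
                       uint64_t *lockstart, uint64_t *lockend)
{
        /* *offset can be negative; start from the file start then */
        uint64_t start = offset < 0 ? 0 : (uint64_t)offset;

        *lockstart = ROUND_DOWN(start, sectorsize);
        *lockend = ROUND_UP(isize, sectorsize);
        if (*lockend <= *lockstart)
                *lockend = *lockstart + sectorsize;
        (*lockend)--;
}

int main(void)
{
        uint64_t s, e;

        lock_range(-100, 8192, 4096, &s, &e);
        printf("lock [%llu, %llu]\n", (unsigned long long)s,
               (unsigned long long)e); /* lock [0, 8191] */
        return 0;
}
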
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627cb5f9..33848196550e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
279 int num_pages; 279 int num_pages;
280 int check_crcs = 0; 280 int check_crcs = 0;
281 281
282 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 282 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
283 PAGE_CACHE_SHIFT;
284 283
285 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
286 check_crcs = 1; 285 check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1998 return merged; 1997 return merged;
1999} 1998}
2000 1999
2000static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2001 struct btrfs_free_space *info,
2002 bool update_stat)
2003{
2004 struct btrfs_free_space *bitmap;
2005 unsigned long i;
2006 unsigned long j;
2007 const u64 end = info->offset + info->bytes;
2008 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2009 u64 bytes;
2010
2011 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2012 if (!bitmap)
2013 return false;
2014
2015 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2016 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2017 if (j == i)
2018 return false;
2019 bytes = (j - i) * ctl->unit;
2020 info->bytes += bytes;
2021
2022 if (update_stat)
2023 bitmap_clear_bits(ctl, bitmap, end, bytes);
2024 else
2025 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2026
2027 if (!bitmap->bytes)
2028 free_bitmap(ctl, bitmap);
2029
2030 return true;
2031}
2032
2033static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2034 struct btrfs_free_space *info,
2035 bool update_stat)
2036{
2037 struct btrfs_free_space *bitmap;
2038 u64 bitmap_offset;
2039 unsigned long i;
2040 unsigned long j;
2041 unsigned long prev_j;
2042 u64 bytes;
2043
2044 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2045 /* If we're on a boundary, try the previous logical bitmap. */
2046 if (bitmap_offset == info->offset) {
2047 if (info->offset == 0)
2048 return false;
2049 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2050 }
2051
2052 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2053 if (!bitmap)
2054 return false;
2055
2056 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2057 j = 0;
2058 prev_j = (unsigned long)-1;
2059 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2060 if (j > i)
2061 break;
2062 prev_j = j;
2063 }
2064 if (prev_j == i)
2065 return false;
2066
2067 if (prev_j == (unsigned long)-1)
2068 bytes = (i + 1) * ctl->unit;
2069 else
2070 bytes = (i - prev_j) * ctl->unit;
2071
2072 info->offset -= bytes;
2073 info->bytes += bytes;
2074
2075 if (update_stat)
2076 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2077 else
2078 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2079
2080 if (!bitmap->bytes)
2081 free_bitmap(ctl, bitmap);
2082
2083 return true;
2084}
2085
2086/*
 2087	 * We always prefer to allocate from extent entries, both for clustered and
2088 * non-clustered allocation requests. So when attempting to add a new extent
2089 * entry, try to see if there's adjacent free space in bitmap entries, and if
2090 * there is, migrate that space from the bitmaps to the extent.
 2091	 * This way we get better chances of satisfying space allocation requests
2092 * because we attempt to satisfy them based on a single cache entry, and never
2093 * on 2 or more entries - even if the entries represent a contiguous free space
2094 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2095 * ends).
2096 */
2097static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2098 struct btrfs_free_space *info,
2099 bool update_stat)
2100{
2101 /*
 2102	 * Only works with disconnected entries, as we can change their offset,
 2103	 * and they must be extent entries.
2104 */
2105 ASSERT(!info->bitmap);
2106 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2107
2108 if (ctl->total_bitmaps > 0) {
2109 bool stole_end;
2110 bool stole_front = false;
2111
2112 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2113 if (ctl->total_bitmaps > 0)
2114 stole_front = steal_from_bitmap_to_front(ctl, info,
2115 update_stat);
2116
2117 if (stole_end || stole_front)
2118 try_merge_free_space(ctl, info, update_stat);
2119 }
2120}
2121
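
The two steal_from_bitmap_to_* helpers above migrate free space that sits in a bitmap entry immediately adjacent to a new extent entry into the extent itself, so a later allocation can be satisfied from one cache entry. A minimal userspace model of the steal-to-end direction, on a single 64-bit word where a set bit means one free unit (a deliberate simplification; the real code walks page-sized bitmaps with find_next_zero_bit):

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t offset, bytes; };

/*
 * Grow the extent by consuming free (set) bits that start exactly at
 * its end, clearing them from the bitmap. Assumes the extent ends
 * inside the range covered by this bitmap word.
 */
static int steal_to_end(struct extent *e, uint64_t *bitmap,
                        uint64_t bitmap_base, uint64_t unit)
{
        uint64_t i = (e->offset + e->bytes - bitmap_base) / unit;
        uint64_t stolen = 0;

        while (i + stolen < 64 && (*bitmap & (1ULL << (i + stolen))))
                stolen++;
        if (!stolen)
                return 0;

        e->bytes += stolen * unit;             /* grow the extent entry */
        while (stolen--)
                *bitmap &= ~(1ULL << i++);     /* drop the stolen bits */
        return 1;
}

int main(void)
{
        struct extent e = { .offset = 0, .bytes = 4096 };
        uint64_t bm = 0x7; /* units 0..2 of the adjacent bitmap are free */

        steal_to_end(&e, &bm, 4096, 4096);
        printf("extent now %llu bytes, bitmap %#llx\n",
               (unsigned long long)e.bytes, (unsigned long long)bm);
        /* extent now 16384 bytes, bitmap 0 */
        return 0;
}
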
2001int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 2122int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2002 u64 offset, u64 bytes) 2123 u64 offset, u64 bytes)
2003{ 2124{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2010 2131
2011 info->offset = offset; 2132 info->offset = offset;
2012 info->bytes = bytes; 2133 info->bytes = bytes;
2134 RB_CLEAR_NODE(&info->offset_index);
2013 2135
2014 spin_lock(&ctl->tree_lock); 2136 spin_lock(&ctl->tree_lock);
2015 2137
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2029 goto out; 2151 goto out;
2030 } 2152 }
2031link: 2153link:
2154 /*
2155 * Only steal free space from adjacent bitmaps if we're sure we're not
2156 * going to add the new free space to existing bitmap entries - because
2157 * that would mean unnecessary work that would be reverted. Therefore
2158 * attempt to steal space from bitmaps if we're adding an extent entry.
2159 */
2160 steal_from_bitmap(ctl, info, true);
2161
2032 ret = link_free_space(ctl, info); 2162 ret = link_free_space(ctl, info);
2033 if (ret) 2163 if (ret)
2034 kmem_cache_free(btrfs_free_space_cachep, info); 2164 kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
2205 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2335 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2206 node = rb_next(&entry->offset_index); 2336 node = rb_next(&entry->offset_index);
2207 rb_erase(&entry->offset_index, &cluster->root); 2337 rb_erase(&entry->offset_index, &cluster->root);
2338 RB_CLEAR_NODE(&entry->offset_index);
2208 2339
2209 bitmap = (entry->bitmap != NULL); 2340 bitmap = (entry->bitmap != NULL);
2210 if (!bitmap) 2341 if (!bitmap) {
2211 try_merge_free_space(ctl, entry, false); 2342 try_merge_free_space(ctl, entry, false);
2343 steal_from_bitmap(ctl, entry, false);
2344 }
2212 tree_insert_offset(&ctl->free_space_offset, 2345 tree_insert_offset(&ctl->free_space_offset,
2213 entry->offset, &entry->offset_index, bitmap); 2346 entry->offset, &entry->offset_index, bitmap);
2214 } 2347 }
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3033{ 3166{
3034 struct inode *inode = NULL; 3167 struct inode *inode = NULL;
3035 3168
3036 spin_lock(&root->cache_lock); 3169 spin_lock(&root->ino_cache_lock);
3037 if (root->cache_inode) 3170 if (root->ino_cache_inode)
3038 inode = igrab(root->cache_inode); 3171 inode = igrab(root->ino_cache_inode);
3039 spin_unlock(&root->cache_lock); 3172 spin_unlock(&root->ino_cache_lock);
3040 if (inode) 3173 if (inode)
3041 return inode; 3174 return inode;
3042 3175
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3044 if (IS_ERR(inode)) 3177 if (IS_ERR(inode))
3045 return inode; 3178 return inode;
3046 3179
3047 spin_lock(&root->cache_lock); 3180 spin_lock(&root->ino_cache_lock);
3048 if (!btrfs_fs_closing(root->fs_info)) 3181 if (!btrfs_fs_closing(root->fs_info))
3049 root->cache_inode = igrab(inode); 3182 root->ino_cache_inode = igrab(inode);
3050 spin_unlock(&root->cache_lock); 3183 spin_unlock(&root->ino_cache_lock);
3051 3184
3052 return inode; 3185 return inode;
3053} 3186}
@@ -3176,6 +3309,7 @@ again:
3176 map = NULL; 3309 map = NULL;
3177 add_new_bitmap(ctl, info, offset); 3310 add_new_bitmap(ctl, info, offset);
3178 bitmap_info = info; 3311 bitmap_info = info;
3312 info = NULL;
3179 } 3313 }
3180 3314
3181 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3315 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
3186 if (bytes) 3320 if (bytes)
3187 goto again; 3321 goto again;
3188 3322
3323 if (info)
3324 kmem_cache_free(btrfs_free_space_cachep, info);
3189 if (map) 3325 if (map)
3190 kfree(map); 3326 kfree(map);
3191 return 0; 3327 return 0;
@@ -3260,6 +3396,7 @@ have_info:
3260 goto have_info; 3396 goto have_info;
3261 } 3397 }
3262 3398
3399 ret = 0;
3263 goto out; 3400 goto out;
3264 } 3401 }
3265 3402
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa82c62..64f15bb30a81 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
20int __init btrfs_hash_init(void) 20int __init btrfs_hash_init(void)
21{ 21{
22 tfm = crypto_alloc_shash("crc32c", 0, 0); 22 tfm = crypto_alloc_shash("crc32c", 0, 0);
23 if (IS_ERR(tfm))
24 return PTR_ERR(tfm);
25 23
26 return 0; 24 return PTR_ERR_OR_ZERO(tfm);
27} 25}
28 26
29void btrfs_hash_exit(void) 27void btrfs_hash_exit(void)
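
The hash.c hunk is a routine cleanup: the IS_ERR/PTR_ERR/return-0 tail collapses into PTR_ERR_OR_ZERO. A userspace model of what that helper boils down to, with err_ptr/is_err/ptr_err as simplified stand-ins for the kernel's ERR_PTR machinery:

#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Userspace model of ERR_PTR encoding: the last page of the address
 * space holds negative errno values disguised as pointers. */
static void *err_ptr(long err) { return (void *)err; }
static int is_err(const void *p)
{
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}
static long ptr_err(const void *p) { return (long)(intptr_t)p; }

/* What PTR_ERR_OR_ZERO() boils down to. */
static long ptr_err_or_zero(const void *p)
{
        return is_err(p) ? ptr_err(p) : 0;
}

int main(void)
{
        printf("%ld\n", ptr_err_or_zero(err_ptr(-12))); /* -12 (ENOMEM) */
        printf("%ld\n", ptr_err_or_zero("crc32c"));     /* 0: valid tfm */
        return 0;
}
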
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df703c9..8ffa4783cbf4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
135 u32 item_size; 135 u32 item_size;
136 136
137 key.objectid = inode_objectid; 137 key.objectid = inode_objectid;
138 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 138 key.type = BTRFS_INODE_EXTREF_KEY;
139 key.offset = btrfs_extref_hash(ref_objectid, name, name_len); 139 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
140 140
141 path = btrfs_alloc_path(); 141 path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
209 209
210 key.objectid = inode_objectid; 210 key.objectid = inode_objectid;
211 key.offset = ref_objectid; 211 key.offset = ref_objectid;
212 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 212 key.type = BTRFS_INODE_REF_KEY;
213 213
214 path = btrfs_alloc_path(); 214 path = btrfs_alloc_path();
215 if (!path) 215 if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
337 337
338 key.objectid = inode_objectid; 338 key.objectid = inode_objectid;
339 key.offset = ref_objectid; 339 key.offset = ref_objectid;
340 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 340 key.type = BTRFS_INODE_REF_KEY;
341 341
342 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
343 if (!path) 343 if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
400 struct btrfs_key key; 400 struct btrfs_key key;
401 int ret; 401 int ret;
402 key.objectid = objectid; 402 key.objectid = objectid;
403 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 403 key.type = BTRFS_INODE_ITEM_KEY;
404 key.offset = 0; 404 key.offset = 0;
405 405
406 ret = btrfs_insert_empty_item(trans, root, path, &key, 406 ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
420 struct btrfs_key found_key; 420 struct btrfs_key found_key;
421 421
422 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); 422 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
423 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && 423 if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
424 location->offset == (u64)-1 && path->slots[0] != 0) { 424 location->offset == (u64)-1 && path->slots[0] != 0) {
425 slot = path->slots[0] - 1; 425 slot = path->slots[0] - 1;
426 leaf = path->nodes[0]; 426 leaf = path->nodes[0];
427 btrfs_item_key_to_cpu(leaf, &found_key, slot); 427 btrfs_item_key_to_cpu(leaf, &found_key, slot);
428 if (found_key.objectid == location->objectid && 428 if (found_key.objectid == location->objectid &&
429 btrfs_key_type(&found_key) == btrfs_key_type(location)) { 429 found_key.type == location->type) {
430 path->slots[0]--; 430 path->slots[0]--;
431 return 0; 431 return 0;
432 } 432 }
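
The btrfs_set_key_type()/btrfs_key_type() conversions in this and the following files are mechanical: the CPU-order key is a plain struct, so the accessor added nothing over direct member access. A small sketch of the idea (struct layout abbreviated from ctree.h; the key-type value shown is illustrative):

#include <stdint.h>

/* CPU-order key, abbreviated from ctree.h. */
struct btrfs_key {
        uint64_t objectid;
        uint8_t type;
        uint64_t offset;
};

#define BTRFS_INODE_ITEM_KEY 1 /* illustrative value */

int main(void)
{
        struct btrfs_key key;

        /* The old btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY)
         * expanded to nothing more than the assignment below. */
        key.objectid = 257;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
        return key.type != BTRFS_INODE_ITEM_KEY;
}
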
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe19079f..83d646bd2e4b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
87 */ 87 */
88 btrfs_item_key_to_cpu(leaf, &key, 0); 88 btrfs_item_key_to_cpu(leaf, &key, 0);
89 btrfs_release_path(path); 89 btrfs_release_path(path);
90 root->cache_progress = last; 90 root->ino_cache_progress = last;
91 up_read(&fs_info->commit_root_sem); 91 up_read(&fs_info->commit_root_sem);
92 schedule_timeout(1); 92 schedule_timeout(1);
93 goto again; 93 goto again;
@@ -106,7 +106,7 @@ again:
106 if (last != (u64)-1 && last + 1 != key.objectid) { 106 if (last != (u64)-1 && last + 1 != key.objectid) {
107 __btrfs_add_free_space(ctl, last + 1, 107 __btrfs_add_free_space(ctl, last + 1,
108 key.objectid - last - 1); 108 key.objectid - last - 1);
109 wake_up(&root->cache_wait); 109 wake_up(&root->ino_cache_wait);
110 } 110 }
111 111
112 last = key.objectid; 112 last = key.objectid;
@@ -119,14 +119,14 @@ next:
119 root->highest_objectid - last - 1); 119 root->highest_objectid - last - 1);
120 } 120 }
121 121
122 spin_lock(&root->cache_lock); 122 spin_lock(&root->ino_cache_lock);
123 root->cached = BTRFS_CACHE_FINISHED; 123 root->ino_cache_state = BTRFS_CACHE_FINISHED;
124 spin_unlock(&root->cache_lock); 124 spin_unlock(&root->ino_cache_lock);
125 125
126 root->cache_progress = (u64)-1; 126 root->ino_cache_progress = (u64)-1;
127 btrfs_unpin_free_ino(root); 127 btrfs_unpin_free_ino(root);
128out: 128out:
129 wake_up(&root->cache_wait); 129 wake_up(&root->ino_cache_wait);
130 up_read(&fs_info->commit_root_sem); 130 up_read(&fs_info->commit_root_sem);
131 131
132 btrfs_free_path(path); 132 btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
144 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 144 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
145 return; 145 return;
146 146
147 spin_lock(&root->cache_lock); 147 spin_lock(&root->ino_cache_lock);
148 if (root->cached != BTRFS_CACHE_NO) { 148 if (root->ino_cache_state != BTRFS_CACHE_NO) {
149 spin_unlock(&root->cache_lock); 149 spin_unlock(&root->ino_cache_lock);
150 return; 150 return;
151 } 151 }
152 152
153 root->cached = BTRFS_CACHE_STARTED; 153 root->ino_cache_state = BTRFS_CACHE_STARTED;
154 spin_unlock(&root->cache_lock); 154 spin_unlock(&root->ino_cache_lock);
155 155
156 ret = load_free_ino_cache(root->fs_info, root); 156 ret = load_free_ino_cache(root->fs_info, root);
157 if (ret == 1) { 157 if (ret == 1) {
158 spin_lock(&root->cache_lock); 158 spin_lock(&root->ino_cache_lock);
159 root->cached = BTRFS_CACHE_FINISHED; 159 root->ino_cache_state = BTRFS_CACHE_FINISHED;
160 spin_unlock(&root->cache_lock); 160 spin_unlock(&root->ino_cache_lock);
161 return; 161 return;
162 } 162 }
163 163
@@ -196,11 +196,11 @@ again:
196 196
197 start_caching(root); 197 start_caching(root);
198 198
199 wait_event(root->cache_wait, 199 wait_event(root->ino_cache_wait,
200 root->cached == BTRFS_CACHE_FINISHED || 200 root->ino_cache_state == BTRFS_CACHE_FINISHED ||
201 root->free_ino_ctl->free_space > 0); 201 root->free_ino_ctl->free_space > 0);
202 202
203 if (root->cached == BTRFS_CACHE_FINISHED && 203 if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
204 root->free_ino_ctl->free_space == 0) 204 root->free_ino_ctl->free_space == 0)
205 return -ENOSPC; 205 return -ENOSPC;
206 else 206 else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
214 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 214 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
215 return; 215 return;
216again: 216again:
217 if (root->cached == BTRFS_CACHE_FINISHED) { 217 if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
218 __btrfs_add_free_space(pinned, objectid, 1); 218 __btrfs_add_free_space(pinned, objectid, 1);
219 } else { 219 } else {
220 down_write(&root->fs_info->commit_root_sem); 220 down_write(&root->fs_info->commit_root_sem);
221 spin_lock(&root->cache_lock); 221 spin_lock(&root->ino_cache_lock);
222 if (root->cached == BTRFS_CACHE_FINISHED) { 222 if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
223 spin_unlock(&root->cache_lock); 223 spin_unlock(&root->ino_cache_lock);
224 up_write(&root->fs_info->commit_root_sem); 224 up_write(&root->fs_info->commit_root_sem);
225 goto again; 225 goto again;
226 } 226 }
227 spin_unlock(&root->cache_lock); 227 spin_unlock(&root->ino_cache_lock);
228 228
229 start_caching(root); 229 start_caching(root);
230 230
@@ -235,10 +235,10 @@ again:
235} 235}
236 236
237/* 237/*
238 * When a transaction is committed, we'll move those inode numbers which 238 * When a transaction is committed, we'll move those inode numbers which are
239 * are smaller than root->cache_progress from pinned tree to free_ino tree, 239 * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
240 * and others will just be dropped, because the commit root we were 240 * others will just be dropped, because the commit root we were searching has
241 * searching has changed. 241 * changed.
242 * 242 *
243 * Must be called with root->fs_info->commit_root_sem held 243 * Must be called with root->fs_info->commit_root_sem held
244 */ 244 */
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
261 info = rb_entry(n, struct btrfs_free_space, offset_index); 261 info = rb_entry(n, struct btrfs_free_space, offset_index);
262 BUG_ON(info->bitmap); /* Logic error */ 262 BUG_ON(info->bitmap); /* Logic error */
263 263
264 if (info->offset > root->cache_progress) 264 if (info->offset > root->ino_cache_progress)
265 goto free; 265 goto free;
266 else if (info->offset + info->bytes > root->cache_progress) 266 else if (info->offset + info->bytes > root->ino_cache_progress)
267 count = root->cache_progress - info->offset + 1; 267 count = root->ino_cache_progress - info->offset + 1;
268 else 268 else
269 count = info->bytes; 269 count = info->bytes;
270 270
@@ -462,13 +462,13 @@ again:
462 } 462 }
463 } 463 }
464 464
465 spin_lock(&root->cache_lock); 465 spin_lock(&root->ino_cache_lock);
466 if (root->cached != BTRFS_CACHE_FINISHED) { 466 if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
467 ret = -1; 467 ret = -1;
468 spin_unlock(&root->cache_lock); 468 spin_unlock(&root->ino_cache_lock);
469 goto out_put; 469 goto out_put;
470 } 470 }
471 spin_unlock(&root->cache_lock); 471 spin_unlock(&root->ino_cache_lock);
472 472
473 spin_lock(&ctl->tree_lock); 473 spin_lock(&ctl->tree_lock);
474 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; 474 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 016c403bfe7e..fc9c0439caa3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
153 153
154 key.objectid = btrfs_ino(inode); 154 key.objectid = btrfs_ino(inode);
155 key.offset = start; 155 key.offset = start;
156 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 156 key.type = BTRFS_EXTENT_DATA_KEY;
157 157
158 datasize = btrfs_file_extent_calc_inline_size(cur_size); 158 datasize = btrfs_file_extent_calc_inline_size(cur_size);
159 path->leave_spinning = 1; 159 path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
249 data_len = compressed_size; 249 data_len = compressed_size;
250 250
251 if (start > 0 || 251 if (start > 0 ||
252 actual_end >= PAGE_CACHE_SIZE || 252 actual_end > PAGE_CACHE_SIZE ||
253 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 253 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
254 (!compressed_size && 254 (!compressed_size &&
255 (actual_end & (root->sectorsize - 1)) == 0) || 255 (actual_end & (root->sectorsize - 1)) == 0) ||
256 end + 1 < isize || 256 end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
348 return 0; 348 return 0;
349} 349}
350 350
351static inline int inode_need_compress(struct inode *inode)
352{
353 struct btrfs_root *root = BTRFS_I(inode)->root;
354
355 /* force compress */
356 if (btrfs_test_opt(root, FORCE_COMPRESS))
357 return 1;
358 /* bad compression ratios */
359 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
360 return 0;
361 if (btrfs_test_opt(root, COMPRESS) ||
362 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
363 BTRFS_I(inode)->force_compress)
364 return 1;
365 return 0;
366}
367
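
inode_need_compress() consolidates a compression decision that used to be open-coded at each call site. Note the precedence it encodes: a compress-force mount option wins over a per-inode NOCOMPRESS flag, which in turn wins over the opt-in compress flags. A userspace model of that precedence (the flag names below are stand-ins, not the kernel constants):

#include <stdio.h>

enum {
        OPT_FORCE_COMPRESS = 1 << 0, /* mount -o compress-force */
        OPT_COMPRESS       = 1 << 1, /* mount -o compress */
        INO_NOCOMPRESS     = 1 << 2, /* heuristics gave up on this file */
        INO_COMPRESS       = 1 << 3, /* per-file compression flag */
};

static int need_compress(unsigned int mount_opts, unsigned int inode_flags)
{
        if (mount_opts & OPT_FORCE_COMPRESS)
                return 1;                /* force beats everything */
        if (inode_flags & INO_NOCOMPRESS)
                return 0;                /* bad ratios seen earlier */
        return (mount_opts & OPT_COMPRESS) || (inode_flags & INO_COMPRESS);
}

int main(void)
{
        /* compress-force overrides a NOCOMPRESS inode: prints 1 */
        printf("%d\n", need_compress(OPT_FORCE_COMPRESS, INO_NOCOMPRESS));
        /* plain compress still respects NOCOMPRESS: prints 0 */
        printf("%d\n", need_compress(OPT_COMPRESS, INO_NOCOMPRESS));
        return 0;
}
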
351/* 368/*
352 * we create compressed extents in two phases. The first 369 * we create compressed extents in two phases. The first
353 * phase compresses a range of pages that have already been 370 * phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
444 * inode has not been flagged as nocompress. This flag can 461 * inode has not been flagged as nocompress. This flag can
445 * change at any time if we discover bad compression ratios. 462 * change at any time if we discover bad compression ratios.
446 */ 463 */
447 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 464 if (inode_need_compress(inode)) {
448 (btrfs_test_opt(root, COMPRESS) ||
449 (BTRFS_I(inode)->force_compress) ||
450 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
451 WARN_ON(pages); 465 WARN_ON(pages);
452 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 466 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
453 if (!pages) { 467 if (!pages) {
@@ -1094,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1094 async_cow->locked_page = locked_page; 1108 async_cow->locked_page = locked_page;
1095 async_cow->start = start; 1109 async_cow->start = start;
1096 1110
1097 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1111 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1112 !btrfs_test_opt(root, FORCE_COMPRESS))
1098 cur_end = end; 1113 cur_end = end;
1099 else 1114 else
1100 cur_end = min(end, start + 512 * 1024 - 1); 1115 cur_end = min(end, start + 512 * 1024 - 1);
@@ -1445,6 +1460,26 @@ error:
1445 return ret; 1460 return ret;
1446} 1461}
1447 1462
1463static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1464{
1465
1466 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1467 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1468 return 0;
1469
1470 /*
 1471	 * @defrag_bytes is a hint value; no spinlock is held here.
 1472	 * If it is not zero, it means the file is being defragged.
 1473	 * Force COW if the given extent needs to be defragged.
1474 */
1475 if (BTRFS_I(inode)->defrag_bytes &&
1476 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1477 EXTENT_DEFRAG, 0, NULL))
1478 return 1;
1479
1480 return 0;
1481}
1482
1448/* 1483/*
1449 * extent_io.c call back to do delayed allocation processing 1484 * extent_io.c call back to do delayed allocation processing
1450 */ 1485 */
@@ -1453,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1453 unsigned long *nr_written) 1488 unsigned long *nr_written)
1454{ 1489{
1455 int ret; 1490 int ret;
1456 struct btrfs_root *root = BTRFS_I(inode)->root; 1491 int force_cow = need_force_cow(inode, start, end);
1457 1492
1458 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { 1493 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1459 ret = run_delalloc_nocow(inode, locked_page, start, end, 1494 ret = run_delalloc_nocow(inode, locked_page, start, end,
1460 page_started, 1, nr_written); 1495 page_started, 1, nr_written);
1461 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { 1496 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1462 ret = run_delalloc_nocow(inode, locked_page, start, end, 1497 ret = run_delalloc_nocow(inode, locked_page, start, end,
1463 page_started, 0, nr_written); 1498 page_started, 0, nr_written);
1464 } else if (!btrfs_test_opt(root, COMPRESS) && 1499 } else if (!inode_need_compress(inode)) {
1465 !(BTRFS_I(inode)->force_compress) &&
1466 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1467 ret = cow_file_range(inode, locked_page, start, end, 1500 ret = cow_file_range(inode, locked_page, start, end,
1468 page_started, nr_written, 1); 1501 page_started, nr_written, 1);
1469 } else { 1502 } else {
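
With need_force_cow() in place, run_delalloc_range() keeps NODATACOW and PREALLOC inodes on the nocow paths only when the range is not tagged for defrag, and defers the compression question to inode_need_compress(). The resulting dispatch, modeled as a plain decision function under those assumptions:

#include <stdio.h>

enum delalloc_path {
        PATH_NOCOW,          /* overwrite in place */
        PATH_PREALLOC_NOCOW, /* fill preallocated extents */
        PATH_COW,            /* plain COW allocation */
        PATH_COMPRESS,       /* async compressed COW */
};

static enum delalloc_path pick_path(int nodatacow, int prealloc,
                                    int force_cow, int need_compress)
{
        if (nodatacow && !force_cow)
                return PATH_NOCOW;
        if (prealloc && !force_cow)
                return PATH_PREALLOC_NOCOW;
        if (!need_compress)
                return PATH_COW;
        return PATH_COMPRESS;
}

int main(void)
{
        /* a defragged NODATACOW range is forced onto the COW path */
        printf("%d\n", pick_path(1, 0, 1, 0) == PATH_COW); /* 1 */
        return 0;
}
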
@@ -1555,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1555 struct extent_state *state, unsigned long *bits) 1588 struct extent_state *state, unsigned long *bits)
1556{ 1589{
1557 1590
1591 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1592 WARN_ON(1);
1558 /* 1593 /*
1559 * set_bit and clear bit hooks normally require _irqsave/restore 1594 * set_bit and clear bit hooks normally require _irqsave/restore
1560 * but in this case, we are only testing for the DELALLOC 1595 * but in this case, we are only testing for the DELALLOC
@@ -1577,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1577 root->fs_info->delalloc_batch); 1612 root->fs_info->delalloc_batch);
1578 spin_lock(&BTRFS_I(inode)->lock); 1613 spin_lock(&BTRFS_I(inode)->lock);
1579 BTRFS_I(inode)->delalloc_bytes += len; 1614 BTRFS_I(inode)->delalloc_bytes += len;
1615 if (*bits & EXTENT_DEFRAG)
1616 BTRFS_I(inode)->defrag_bytes += len;
1580 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1617 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1581 &BTRFS_I(inode)->runtime_flags)) 1618 &BTRFS_I(inode)->runtime_flags))
1582 btrfs_add_delalloc_inodes(root, inode); 1619 btrfs_add_delalloc_inodes(root, inode);
@@ -1591,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1591 struct extent_state *state, 1628 struct extent_state *state,
1592 unsigned long *bits) 1629 unsigned long *bits)
1593{ 1630{
1631 u64 len = state->end + 1 - state->start;
1632
1633 spin_lock(&BTRFS_I(inode)->lock);
1634 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1635 BTRFS_I(inode)->defrag_bytes -= len;
1636 spin_unlock(&BTRFS_I(inode)->lock);
1637
1594 /* 1638 /*
1595 * set_bit and clear bit hooks normally require _irqsave/restore 1639 * set_bit and clear bit hooks normally require _irqsave/restore
1596 * but in this case, we are only testing for the DELALLOC 1640 * but in this case, we are only testing for the DELALLOC
@@ -1598,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1598 */ 1642 */
1599 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1643 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1600 struct btrfs_root *root = BTRFS_I(inode)->root; 1644 struct btrfs_root *root = BTRFS_I(inode)->root;
1601 u64 len = state->end + 1 - state->start;
1602 bool do_list = !btrfs_is_free_space_inode(inode); 1645 bool do_list = !btrfs_is_free_space_inode(inode);
1603 1646
1604 if (*bits & EXTENT_FIRST_DELALLOC) { 1647 if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -2660,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2660 goto out; 2703 goto out;
2661 } 2704 }
2662 2705
2706 btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2707 ordered_extent->file_offset +
2708 ordered_extent->len - 1);
2709
2663 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2710 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2664 truncated = true; 2711 truncated = true;
2665 logical_len = ordered_extent->truncated_len; 2712 logical_len = ordered_extent->truncated_len;
@@ -2856,6 +2903,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2856 return 0; 2903 return 0;
2857} 2904}
2858 2905
2906static int __readpage_endio_check(struct inode *inode,
2907 struct btrfs_io_bio *io_bio,
2908 int icsum, struct page *page,
2909 int pgoff, u64 start, size_t len)
2910{
2911 char *kaddr;
2912 u32 csum_expected;
2913 u32 csum = ~(u32)0;
2914 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2915 DEFAULT_RATELIMIT_BURST);
2916
2917 csum_expected = *(((u32 *)io_bio->csum) + icsum);
2918
2919 kaddr = kmap_atomic(page);
2920 csum = btrfs_csum_data(kaddr + pgoff, csum, len);
2921 btrfs_csum_final(csum, (char *)&csum);
2922 if (csum != csum_expected)
2923 goto zeroit;
2924
2925 kunmap_atomic(kaddr);
2926 return 0;
2927zeroit:
2928 if (__ratelimit(&_rs))
2929 btrfs_info(BTRFS_I(inode)->root->fs_info,
2930 "csum failed ino %llu off %llu csum %u expected csum %u",
2931 btrfs_ino(inode), start, csum, csum_expected);
2932 memset(kaddr + pgoff, 1, len);
2933 flush_dcache_page(page);
2934 kunmap_atomic(kaddr);
2935 if (csum_expected == 0)
2936 return 0;
2937 return -EIO;
2938}
2939
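
__readpage_endio_check() factors the verify-or-poison logic out of btrfs_readpage_end_io_hook() so the new DIO retry code can reuse it: recompute the checksum, compare against the expected value from io_bio->csum, and on mismatch overwrite the range (so stale data never reaches userspace) and fail unless the expected csum is 0. A userspace model with a toy checksum standing in for the kernel's CRC32C:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy checksum; NOT the kernel's CRC32C. */
static uint32_t toy_csum(const uint8_t *p, size_t len)
{
        uint32_t c = ~0u;

        while (len--)
                c = c * 31 + *p++;
        return c;
}

static int check_block(uint8_t *buf, size_t len, uint32_t expected)
{
        if (toy_csum(buf, len) == expected)
                return 0;
        /* Poison the range so callers can't consume bad data. */
        memset(buf, 1, len);
        return expected == 0 ? 0 : -EIO; /* csum 0 means "no csum" */
}

int main(void)
{
        uint8_t blk[16] = "hello";
        uint32_t good = toy_csum(blk, sizeof(blk));

        printf("%d\n", check_block(blk, sizeof(blk), good));     /* 0 */
        printf("%d\n", check_block(blk, sizeof(blk), good + 1)); /* -5 */
        return 0;
}
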
2859/* 2940/*
2860 * when reads are done, we need to check csums to verify the data is correct 2941 * when reads are done, we need to check csums to verify the data is correct
2861 * if there's a match, we allow the bio to finish. If not, the code in 2942 * if there's a match, we allow the bio to finish. If not, the code in
@@ -2868,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2868 size_t offset = start - page_offset(page); 2949 size_t offset = start - page_offset(page);
2869 struct inode *inode = page->mapping->host; 2950 struct inode *inode = page->mapping->host;
2870 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2951 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2871 char *kaddr;
2872 struct btrfs_root *root = BTRFS_I(inode)->root; 2952 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 u32 csum_expected;
2874 u32 csum = ~(u32)0;
2875 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2876 DEFAULT_RATELIMIT_BURST);
2877 2953
2878 if (PageChecked(page)) { 2954 if (PageChecked(page)) {
2879 ClearPageChecked(page); 2955 ClearPageChecked(page);
2880 goto good; 2956 return 0;
2881 } 2957 }
2882 2958
2883 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2959 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2884 goto good; 2960 return 0;
2885 2961
2886 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2962 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2887 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2963 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2891,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2891 } 2967 }
2892 2968
2893 phy_offset >>= inode->i_sb->s_blocksize_bits; 2969 phy_offset >>= inode->i_sb->s_blocksize_bits;
2894 csum_expected = *(((u32 *)io_bio->csum) + phy_offset); 2970 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
2895 2971 start, (size_t)(end - start + 1));
2896 kaddr = kmap_atomic(page);
2897 csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2898 btrfs_csum_final(csum, (char *)&csum);
2899 if (csum != csum_expected)
2900 goto zeroit;
2901
2902 kunmap_atomic(kaddr);
2903good:
2904 return 0;
2905
2906zeroit:
2907 if (__ratelimit(&_rs))
2908 btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2909 btrfs_ino(page->mapping->host), start, csum, csum_expected);
2910 memset(kaddr + offset, 1, end - start + 1);
2911 flush_dcache_page(page);
2912 kunmap_atomic(kaddr);
2913 if (csum_expected == 0)
2914 return 0;
2915 return -EIO;
2916} 2972}
2917 2973
2918struct delayed_iput { 2974struct delayed_iput {
@@ -3159,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3159 path->reada = -1; 3215 path->reada = -1;
3160 3216
3161 key.objectid = BTRFS_ORPHAN_OBJECTID; 3217 key.objectid = BTRFS_ORPHAN_OBJECTID;
3162 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 3218 key.type = BTRFS_ORPHAN_ITEM_KEY;
3163 key.offset = (u64)-1; 3219 key.offset = (u64)-1;
3164 3220
3165 while (1) { 3221 while (1) {
@@ -3186,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3186 /* make sure the item matches what we want */ 3242 /* make sure the item matches what we want */
3187 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3243 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3188 break; 3244 break;
3189 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 3245 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3190 break; 3246 break;
3191 3247
3192 /* release the path since we're done with it */ 3248 /* release the path since we're done with it */
@@ -3662,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3662 * without delay 3718 * without delay
3663 */ 3719 */
3664 if (!btrfs_is_free_space_inode(inode) 3720 if (!btrfs_is_free_space_inode(inode)
3665 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 3721 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3722 && !root->fs_info->log_root_recovering) {
3666 btrfs_update_root_times(trans, root); 3723 btrfs_update_root_times(trans, root);
3667 3724
3668 ret = btrfs_delayed_update_inode(trans, root, inode); 3725 ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4085,7 +4142,7 @@ search_again:
4085 fi = NULL; 4142 fi = NULL;
4086 leaf = path->nodes[0]; 4143 leaf = path->nodes[0];
4087 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4088 found_type = btrfs_key_type(&found_key); 4145 found_type = found_key.type;
4089 4146
4090 if (found_key.objectid != ino) 4147 if (found_key.objectid != ino)
4091 break; 4148 break;
@@ -4747,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
4747 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 4804 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4748 btrfs_wait_ordered_range(inode, 0, (u64)-1); 4805 btrfs_wait_ordered_range(inode, 0, (u64)-1);
4749 4806
4807 btrfs_free_io_failure_record(inode, 0, (u64)-1);
4808
4750 if (root->fs_info->log_root_recovering) { 4809 if (root->fs_info->log_root_recovering) {
4751 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 4810 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4752 &BTRFS_I(inode)->runtime_flags)); 4811 &BTRFS_I(inode)->runtime_flags));
@@ -5331,7 +5390,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5331 btrfs_get_delayed_items(inode, &ins_list, &del_list); 5390 btrfs_get_delayed_items(inode, &ins_list, &del_list);
5332 } 5391 }
5333 5392
5334 btrfs_set_key_type(&key, key_type); 5393 key.type = key_type;
5335 key.offset = ctx->pos; 5394 key.offset = ctx->pos;
5336 key.objectid = btrfs_ino(inode); 5395 key.objectid = btrfs_ino(inode);
5337 5396
@@ -5356,7 +5415,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5356 5415
5357 if (found_key.objectid != key.objectid) 5416 if (found_key.objectid != key.objectid)
5358 break; 5417 break;
5359 if (btrfs_key_type(&found_key) != key_type) 5418 if (found_key.type != key_type)
5360 break; 5419 break;
5361 if (found_key.offset < ctx->pos) 5420 if (found_key.offset < ctx->pos)
5362 goto next; 5421 goto next;
@@ -5568,7 +5627,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
5568 int ret; 5627 int ret;
5569 5628
5570 key.objectid = btrfs_ino(inode); 5629 key.objectid = btrfs_ino(inode);
5571 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 5630 key.type = BTRFS_DIR_INDEX_KEY;
5572 key.offset = (u64)-1; 5631 key.offset = (u64)-1;
5573 5632
5574 path = btrfs_alloc_path(); 5633 path = btrfs_alloc_path();
@@ -5600,7 +5659,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
5600 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5659 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5601 5660
5602 if (found_key.objectid != btrfs_ino(inode) || 5661 if (found_key.objectid != btrfs_ino(inode) ||
5603 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 5662 found_key.type != BTRFS_DIR_INDEX_KEY) {
5604 BTRFS_I(inode)->index_cnt = 2; 5663 BTRFS_I(inode)->index_cnt = 2;
5605 goto out; 5664 goto out;
5606 } 5665 }
@@ -5718,7 +5777,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5718 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 5777 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5719 5778
5720 key[0].objectid = objectid; 5779 key[0].objectid = objectid;
5721 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5780 key[0].type = BTRFS_INODE_ITEM_KEY;
5722 key[0].offset = 0; 5781 key[0].offset = 0;
5723 5782
5724 sizes[0] = sizeof(struct btrfs_inode_item); 5783 sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5731,7 +5790,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5731 * add more hard links than can fit in the ref item. 5790 * add more hard links than can fit in the ref item.
5732 */ 5791 */
5733 key[1].objectid = objectid; 5792 key[1].objectid = objectid;
5734 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 5793 key[1].type = BTRFS_INODE_REF_KEY;
5735 key[1].offset = ref_objectid; 5794 key[1].offset = ref_objectid;
5736 5795
5737 sizes[1] = name_len + sizeof(*ref); 5796 sizes[1] = name_len + sizeof(*ref);
@@ -5740,7 +5799,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5740 location = &BTRFS_I(inode)->location; 5799 location = &BTRFS_I(inode)->location;
5741 location->objectid = objectid; 5800 location->objectid = objectid;
5742 location->offset = 0; 5801 location->offset = 0;
5743 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 5802 location->type = BTRFS_INODE_ITEM_KEY;
5744 5803
5745 ret = btrfs_insert_inode_locked(inode); 5804 ret = btrfs_insert_inode_locked(inode);
5746 if (ret < 0) 5805 if (ret < 0)
@@ -5832,7 +5891,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
5832 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 5891 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5833 } else { 5892 } else {
5834 key.objectid = ino; 5893 key.objectid = ino;
5835 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 5894 key.type = BTRFS_INODE_ITEM_KEY;
5836 key.offset = 0; 5895 key.offset = 0;
5837 } 5896 }
5838 5897
@@ -6191,21 +6250,60 @@ out_fail_inode:
6191 goto out_fail; 6250 goto out_fail;
6192} 6251}
6193 6252
 6253/* Find the next extent map of a given extent map; the caller must ensure proper locking */
6254static struct extent_map *next_extent_map(struct extent_map *em)
6255{
6256 struct rb_node *next;
6257
6258 next = rb_next(&em->rb_node);
6259 if (!next)
6260 return NULL;
6261 return container_of(next, struct extent_map, rb_node);
6262}
6263
6264static struct extent_map *prev_extent_map(struct extent_map *em)
6265{
6266 struct rb_node *prev;
6267
6268 prev = rb_prev(&em->rb_node);
6269 if (!prev)
6270 return NULL;
6271 return container_of(prev, struct extent_map, rb_node);
6272}
6273
 6194/* helper for btrfs_get_extent. Given an existing extent in the tree, 6274/* helper for btrfs_get_extent. Given an existing extent in the tree,
6275 * the existing extent is the nearest extent to map_start,
6195 * and an extent that you want to insert, deal with overlap and insert 6276 * and an extent that you want to insert, deal with overlap and insert
 6196 * the new extent into the tree. 6277 * the best-fitting new extent into the tree.
6197 */ 6278 */
6198static int merge_extent_mapping(struct extent_map_tree *em_tree, 6279static int merge_extent_mapping(struct extent_map_tree *em_tree,
6199 struct extent_map *existing, 6280 struct extent_map *existing,
6200 struct extent_map *em, 6281 struct extent_map *em,
6201 u64 map_start) 6282 u64 map_start)
6202{ 6283{
6284 struct extent_map *prev;
6285 struct extent_map *next;
6286 u64 start;
6287 u64 end;
6203 u64 start_diff; 6288 u64 start_diff;
6204 6289
6205 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 6290 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6206 start_diff = map_start - em->start; 6291
6207 em->start = map_start; 6292 if (existing->start > map_start) {
6208 em->len = existing->start - em->start; 6293 next = existing;
6294 prev = prev_extent_map(next);
6295 } else {
6296 prev = existing;
6297 next = next_extent_map(prev);
6298 }
6299
6300 start = prev ? extent_map_end(prev) : em->start;
6301 start = max_t(u64, start, em->start);
6302 end = next ? next->start : extent_map_end(em);
6303 end = min_t(u64, end, extent_map_end(em));
6304 start_diff = start - em->start;
6305 em->start = start;
6306 em->len = end - start;
6209 if (em->block_start < EXTENT_MAP_LAST_BYTE && 6307 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6210 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 6308 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6211 em->block_start += start_diff; 6309 em->block_start += start_diff;
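
The reworked merge_extent_mapping() above no longer assumes the existing map lies entirely ahead of map_start; it inspects the rb-tree neighbors on both sides and clips the new map to the hole around map_start. The clamp in isolation, as a sketch over plain offsets (prev_end and next_start model extent_map_end(prev) and next->start):

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

/* Clip [em.start, em.start + em.len) to the hole (prev_end, next_start). */
static void clip_to_hole(struct range *em, uint64_t prev_end,
                         uint64_t next_start)
{
        uint64_t start = prev_end > em->start ? prev_end : em->start;
        uint64_t end = em->start + em->len;

        if (next_start < end)
                end = next_start;
        em->len = end - start;
        em->start = start;
}

int main(void)
{
        struct range em = { .start = 0, .len = 100 };

        /* neighbors occupy [0, 40) and [80, ...): the hole is [40, 80) */
        clip_to_hole(&em, 40, 80);
        printf("[%llu, +%llu)\n", (unsigned long long)em.start,
               (unsigned long long)em.len); /* [40, +40) */
        return 0;
}
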
@@ -6333,7 +6431,7 @@ again:
6333 struct btrfs_file_extent_item); 6431 struct btrfs_file_extent_item);
6334 /* are we inside the extent that was found? */ 6432 /* are we inside the extent that was found? */
6335 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6433 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6336 found_type = btrfs_key_type(&found_key); 6434 found_type = found_key.type;
6337 if (found_key.objectid != objectid || 6435 if (found_key.objectid != objectid ||
6338 found_type != BTRFS_EXTENT_DATA_KEY) { 6436 found_type != BTRFS_EXTENT_DATA_KEY) {
6339 /* 6437 /*
@@ -6482,25 +6580,21 @@ insert:
6482 6580
6483 ret = 0; 6581 ret = 0;
6484 6582
6485 existing = lookup_extent_mapping(em_tree, start, len); 6583 existing = search_extent_mapping(em_tree, start, len);
6486 if (existing && (existing->start > start || 6584 /*
6487 existing->start + existing->len <= start)) { 6585 * existing will always be non-NULL, since there must be
 6586	 * an extent causing the -EEXIST.
6587 */
6588 if (start >= extent_map_end(existing) ||
6589 start <= existing->start) {
6590 /*
6591 * The existing extent map is the one nearest to
 6592	 * the [start, start + len) range which overlaps it
6593 */
6594 err = merge_extent_mapping(em_tree, existing,
6595 em, start);
6488 free_extent_map(existing); 6596 free_extent_map(existing);
6489 existing = NULL; 6597 if (err) {
6490 }
6491 if (!existing) {
6492 existing = lookup_extent_mapping(em_tree, em->start,
6493 em->len);
6494 if (existing) {
6495 err = merge_extent_mapping(em_tree, existing,
6496 em, start);
6497 free_extent_map(existing);
6498 if (err) {
6499 free_extent_map(em);
6500 em = NULL;
6501 }
6502 } else {
6503 err = -EIO;
6504 free_extent_map(em); 6598 free_extent_map(em);
6505 em = NULL; 6599 em = NULL;
6506 } 6600 }
@@ -7112,8 +7206,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7112 block_start, len, 7206 block_start, len,
7113 orig_block_len, 7207 orig_block_len,
7114 ram_bytes, type); 7208 ram_bytes, type);
7115 if (IS_ERR(em)) 7209 if (IS_ERR(em)) {
7210 ret = PTR_ERR(em);
7116 goto unlock_err; 7211 goto unlock_err;
7212 }
7117 } 7213 }
7118 7214
7119 ret = btrfs_add_ordered_extent_dio(inode, start, 7215 ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7188,45 +7284,277 @@ unlock_err:
7188 return ret; 7284 return ret;
7189} 7285}
7190 7286
7191static void btrfs_endio_direct_read(struct bio *bio, int err) 7287static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7288 int rw, int mirror_num)
7192{ 7289{
7193 struct btrfs_dio_private *dip = bio->bi_private;
7194 struct bio_vec *bvec;
7195 struct inode *inode = dip->inode;
7196 struct btrfs_root *root = BTRFS_I(inode)->root; 7290 struct btrfs_root *root = BTRFS_I(inode)->root;
7197 struct bio *dio_bio; 7291 int ret;
7198 u32 *csums = (u32 *)dip->csum; 7292
7293 BUG_ON(rw & REQ_WRITE);
7294
7295 bio_get(bio);
7296
7297 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7298 BTRFS_WQ_ENDIO_DIO_REPAIR);
7299 if (ret)
7300 goto err;
7301
7302 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
7303err:
7304 bio_put(bio);
7305 return ret;
7306}
7307
7308static int btrfs_check_dio_repairable(struct inode *inode,
7309 struct bio *failed_bio,
7310 struct io_failure_record *failrec,
7311 int failed_mirror)
7312{
7313 int num_copies;
7314
7315 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
7316 failrec->logical, failrec->len);
7317 if (num_copies == 1) {
7318 /*
7319 * we only have a single copy of the data, so don't bother with
7320 * all the retry and error correction code that follows. no
7321 * matter what the error is, it is very likely to persist.
7322 */
7323 pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
7324 num_copies, failrec->this_mirror, failed_mirror);
7325 return 0;
7326 }
7327
7328 failrec->failed_mirror = failed_mirror;
7329 failrec->this_mirror++;
7330 if (failrec->this_mirror == failed_mirror)
7331 failrec->this_mirror++;
7332
7333 if (failrec->this_mirror > num_copies) {
7334 pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
7335 num_copies, failrec->this_mirror, failed_mirror);
7336 return 0;
7337 }
7338
7339 return 1;
7340}
7341
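
btrfs_check_dio_repairable() applies the same mirror-walk policy as the buffered read-repair path: with a single copy there is nothing to retry; otherwise advance this_mirror, skip the copy that just failed, and give up once every mirror has been tried. The selection logic in isolation:

#include <stdio.h>

/*
 * Return the next mirror to read from, or 0 when repair is hopeless.
 * this_mirror carries state across retries and starts at 0.
 */
static int next_mirror(int num_copies, int failed_mirror, int this_mirror)
{
        if (num_copies == 1)
                return 0;        /* one copy: the error will persist */
        this_mirror++;
        if (this_mirror == failed_mirror)
                this_mirror++;   /* don't re-read the copy that failed */
        if (this_mirror > num_copies)
                return 0;        /* every copy has been tried */
        return this_mirror;
}

int main(void)
{
        int m = 0;

        /* RAID1 (2 copies), mirror 1 failed: one retry, on mirror 2 */
        while ((m = next_mirror(2, 1, m)))
                printf("retrying mirror %d\n", m);
        return 0;
}
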
7342static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7343 struct page *page, u64 start, u64 end,
7344 int failed_mirror, bio_end_io_t *repair_endio,
7345 void *repair_arg)
7346{
7347 struct io_failure_record *failrec;
7348 struct bio *bio;
7349 int isector;
7350 int read_mode;
7351 int ret;
7352
7353 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
7354
7355 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7356 if (ret)
7357 return ret;
7358
7359 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7360 failed_mirror);
7361 if (!ret) {
7362 free_io_failure(inode, failrec);
7363 return -EIO;
7364 }
7365
7366 if (failed_bio->bi_vcnt > 1)
7367 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7368 else
7369 read_mode = READ_SYNC;
7370
7371 isector = start - btrfs_io_bio(failed_bio)->logical;
7372 isector >>= inode->i_sb->s_blocksize_bits;
7373 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7374 0, isector, repair_endio, repair_arg);
7375 if (!bio) {
7376 free_io_failure(inode, failrec);
7377 return -EIO;
7378 }
7379
7380 btrfs_debug(BTRFS_I(inode)->root->fs_info,
7381 "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
7382 read_mode, failrec->this_mirror, failrec->in_validation);
7383
7384 ret = submit_dio_repair_bio(inode, bio, read_mode,
7385 failrec->this_mirror);
7386 if (ret) {
7387 free_io_failure(inode, failrec);
7388 bio_put(bio);
7389 }
7390
7391 return ret;
7392}
7393
7394struct btrfs_retry_complete {
7395 struct completion done;
7396 struct inode *inode;
7397 u64 start;
7398 int uptodate;
7399};
7400
7401static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
7402{
7403 struct btrfs_retry_complete *done = bio->bi_private;
7404 struct bio_vec *bvec;
7405 int i;
7406
7407 if (err)
7408 goto end;
7409
7410 done->uptodate = 1;
7411 bio_for_each_segment_all(bvec, bio, i)
7412 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
7413end:
7414 complete(&done->done);
7415 bio_put(bio);
7416}
7417
7418static int __btrfs_correct_data_nocsum(struct inode *inode,
7419 struct btrfs_io_bio *io_bio)
7420{
7421 struct bio_vec *bvec;
7422 struct btrfs_retry_complete done;
7199 u64 start; 7423 u64 start;
7200 int i; 7424 int i;
7425 int ret;
7426
7427 start = io_bio->logical;
7428 done.inode = inode;
7429
7430 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7431try_again:
7432 done.uptodate = 0;
7433 done.start = start;
7434 init_completion(&done.done);
7435
7436 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7437 start + bvec->bv_len - 1,
7438 io_bio->mirror_num,
7439 btrfs_retry_endio_nocsum, &done);
7440 if (ret)
7441 return ret;
7442
7443 wait_for_completion(&done.done);
7444
7445 if (!done.uptodate) {
7446 /* We might have another mirror, so try again */
7447 goto try_again;
7448 }
7449
7450 start += bvec->bv_len;
7451 }
7452
7453 return 0;
7454}
7455
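
__btrfs_correct_data_nocsum() retries each bio segment synchronously: submit a one-segment repair read, wait on a completion, and loop until the data comes back uptodate. The goto-based loop terminates because dio_read_error() fails once btrfs_check_dio_repairable() runs out of mirrors. A compact model of one segment's loop; resubmit() is a hypothetical stand-in for dio_read_error() plus wait_for_completion():

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for dio_read_error() + wait_for_completion():
 * pretend mirror 2 of 3 holds good data. */
static int resubmit(int mirror, int *uptodate)
{
        if (mirror > 3)
                return -EIO; /* mirrors exhausted */
        *uptodate = (mirror == 2);
        return 0;
}

static int correct_segment(void)
{
        int mirror = 1; /* the mirror that originally failed */
        int uptodate, ret;

        for (;;) { /* the "goto try_again" loop */
                ret = resubmit(++mirror, &uptodate);
                if (ret)
                        return ret; /* no more copies to try */
                if (uptodate)
                        return 0;   /* repaired; next segment */
        }
}

int main(void)
{
        printf("%d\n", correct_segment()); /* 0: mirror 2 was good */
        return 0;
}
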
7456static void btrfs_retry_endio(struct bio *bio, int err)
7457{
7458 struct btrfs_retry_complete *done = bio->bi_private;
7459 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7460 struct bio_vec *bvec;
7461 int uptodate;
7462 int ret;
7463 int i;
7464
7465 if (err)
7466 goto end;
7201 7467
7202 start = dip->logical_offset; 7468 uptodate = 1;
7203 bio_for_each_segment_all(bvec, bio, i) { 7469 bio_for_each_segment_all(bvec, bio, i) {
7204 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 7470 ret = __readpage_endio_check(done->inode, io_bio, i,
7205 struct page *page = bvec->bv_page; 7471 bvec->bv_page, 0,
7206 char *kaddr; 7472 done->start, bvec->bv_len);
7207 u32 csum = ~(u32)0; 7473 if (!ret)
7208 unsigned long flags; 7474 clean_io_failure(done->inode, done->start,
7209 7475 bvec->bv_page, 0);
7210 local_irq_save(flags); 7476 else
7211 kaddr = kmap_atomic(page); 7477 uptodate = 0;
7212 csum = btrfs_csum_data(kaddr + bvec->bv_offset, 7478 }
7213 csum, bvec->bv_len); 7479
7214 btrfs_csum_final(csum, (char *)&csum); 7480 done->uptodate = uptodate;
7215 kunmap_atomic(kaddr); 7481end:
7216 local_irq_restore(flags); 7482 complete(&done->done);
7217 7483 bio_put(bio);
7218 flush_dcache_page(bvec->bv_page); 7484}
7219 if (csum != csums[i]) { 7485
7220 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", 7486static int __btrfs_subio_endio_read(struct inode *inode,
7221 btrfs_ino(inode), start, csum, 7487 struct btrfs_io_bio *io_bio, int err)
7222 csums[i]); 7488{
7223 err = -EIO; 7489 struct bio_vec *bvec;
7224 } 7490 struct btrfs_retry_complete done;
7491 u64 start;
7492 u64 offset = 0;
7493 int i;
7494 int ret;
7495
7496 err = 0;
7497 start = io_bio->logical;
7498 done.inode = inode;
7499
7500 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7501 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
7502 0, start, bvec->bv_len);
7503 if (likely(!ret))
7504 goto next;
7505try_again:
7506 done.uptodate = 0;
7507 done.start = start;
7508 init_completion(&done.done);
7509
7510 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7511 start + bvec->bv_len - 1,
7512 io_bio->mirror_num,
7513 btrfs_retry_endio, &done);
7514 if (ret) {
7515 err = ret;
7516 goto next;
7225 } 7517 }
7226 7518
7519 wait_for_completion(&done.done);
7520
7521 if (!done.uptodate) {
7522 /* We might have another mirror, so try again */
7523 goto try_again;
7524 }
7525next:
7526 offset += bvec->bv_len;
7227 start += bvec->bv_len; 7527 start += bvec->bv_len;
7228 } 7528 }
7229 7529
7530 return err;
7531}
7532
7533static int btrfs_subio_endio_read(struct inode *inode,
7534 struct btrfs_io_bio *io_bio, int err)
7535{
7536 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7537
7538 if (skip_csum) {
7539 if (unlikely(err))
7540 return __btrfs_correct_data_nocsum(inode, io_bio);
7541 else
7542 return 0;
7543 } else {
7544 return __btrfs_subio_endio_read(inode, io_bio, err);
7545 }
7546}
7547
7548static void btrfs_endio_direct_read(struct bio *bio, int err)
7549{
7550 struct btrfs_dio_private *dip = bio->bi_private;
7551 struct inode *inode = dip->inode;
7552 struct bio *dio_bio;
7553 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7554
7555 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
7556 err = btrfs_subio_endio_read(inode, io_bio, err);
7557
7230 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 7558 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
7231 dip->logical_offset + dip->bytes - 1); 7559 dip->logical_offset + dip->bytes - 1);
7232 dio_bio = dip->dio_bio; 7560 dio_bio = dip->dio_bio;
@@ -7237,6 +7565,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
7237 if (err) 7565 if (err)
7238 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 7566 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7239 dio_end_io(dio_bio, err); 7567 dio_end_io(dio_bio, err);
7568
7569 if (io_bio->end_io)
7570 io_bio->end_io(io_bio, err);
7240 bio_put(bio); 7571 bio_put(bio);
7241} 7572}
7242 7573
@@ -7302,12 +7633,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
7302{ 7633{
7303 struct btrfs_dio_private *dip = bio->bi_private; 7634 struct btrfs_dio_private *dip = bio->bi_private;
7304 7635
7636 if (err)
7637 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7638 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7639 btrfs_ino(dip->inode), bio->bi_rw,
7640 (unsigned long long)bio->bi_iter.bi_sector,
7641 bio->bi_iter.bi_size, err);
7642
7643 if (dip->subio_endio)
7644 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
7645
7305 if (err) { 7646 if (err) {
7306 btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
7307 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7308 btrfs_ino(dip->inode), bio->bi_rw,
7309 (unsigned long long)bio->bi_iter.bi_sector,
7310 bio->bi_iter.bi_size, err);
7311 dip->errors = 1; 7647 dip->errors = 1;
7312 7648
7313 /* 7649 /*
@@ -7338,6 +7674,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7338 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 7674 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7339} 7675}
7340 7676
7677static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7678 struct inode *inode,
7679 struct btrfs_dio_private *dip,
7680 struct bio *bio,
7681 u64 file_offset)
7682{
7683 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7684 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7685 int ret;
7686
7687 /*
7688 * We load all the csum data we need when we submit
7689 * the first bio to reduce the csum tree search and
7690 * contention.
7691 */
7692 if (dip->logical_offset == file_offset) {
7693 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7694 file_offset);
7695 if (ret)
7696 return ret;
7697 }
7698
7699 if (bio == dip->orig_bio)
7700 return 0;
7701
7702 file_offset -= dip->logical_offset;
7703 file_offset >>= inode->i_sb->s_blocksize_bits;
7704 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7705
7706 return 0;
7707}
7708
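
btrfs_lookup_and_bind_dio_csum() loads the checksum array once for the whole original bio and then points each split bio's csum pointer into that array; the slot index is the bio's offset into the DIO, in blocks. The pointer arithmetic in isolation (one u32 checksum per block, as with CRC32C):

#include <stdint.h>
#include <stdio.h>

/* One u32 checksum per block; blocksize_bits = log2(block size). */
static const uint32_t *csum_slice(const uint32_t *orig_csums,
                                  uint64_t dio_logical_offset,
                                  uint64_t bio_file_offset,
                                  int blocksize_bits)
{
        uint64_t idx = (bio_file_offset - dio_logical_offset)
                       >> blocksize_bits;

        return orig_csums + idx;
}

int main(void)
{
        uint32_t csums[4] = { 0xaaaa, 0xbbbb, 0xcccc, 0xdddd };

        /* a split bio starting 8K into a DIO of 4K blocks uses slot 2 */
        printf("%#x\n", *csum_slice(csums, 0, 8192, 12)); /* 0xcccc */
        return 0;
}
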
7341static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 7709static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7342 int rw, u64 file_offset, int skip_sum, 7710 int rw, u64 file_offset, int skip_sum,
7343 int async_submit) 7711 int async_submit)
@@ -7353,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7353 bio_get(bio); 7721 bio_get(bio);
7354 7722
7355 if (!write) { 7723 if (!write) {
7356 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 7724 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7725 BTRFS_WQ_ENDIO_DATA);
7357 if (ret) 7726 if (ret)
7358 goto err; 7727 goto err;
7359 } 7728 }
@@ -7376,13 +7745,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7376 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 7745 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7377 if (ret) 7746 if (ret)
7378 goto err; 7747 goto err;
7379 } else if (!skip_sum) { 7748 } else {
7380 ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, 7749 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
7381 file_offset); 7750 file_offset);
7382 if (ret) 7751 if (ret)
7383 goto err; 7752 goto err;
7384 } 7753 }
7385
7386map: 7754map:
7387 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 7755 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
7388err: 7756err:
@@ -7403,7 +7771,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7403 u64 submit_len = 0; 7771 u64 submit_len = 0;
7404 u64 map_length; 7772 u64 map_length;
7405 int nr_pages = 0; 7773 int nr_pages = 0;
7406 int ret = 0; 7774 int ret;
7407 int async_submit = 0; 7775 int async_submit = 0;
7408 7776
7409 map_length = orig_bio->bi_iter.bi_size; 7777 map_length = orig_bio->bi_iter.bi_size;
@@ -7414,6 +7782,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7414 7782
7415 if (map_length >= orig_bio->bi_iter.bi_size) { 7783 if (map_length >= orig_bio->bi_iter.bi_size) {
7416 bio = orig_bio; 7784 bio = orig_bio;
7785 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
7417 goto submit; 7786 goto submit;
7418 } 7787 }
7419 7788
@@ -7430,12 +7799,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7430 7799
7431 bio->bi_private = dip; 7800 bio->bi_private = dip;
7432 bio->bi_end_io = btrfs_end_dio_bio; 7801 bio->bi_end_io = btrfs_end_dio_bio;
7802 btrfs_io_bio(bio)->logical = file_offset;
7433 atomic_inc(&dip->pending_bios); 7803 atomic_inc(&dip->pending_bios);
7434 7804
7435 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 7805 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
7436 if (unlikely(map_length < submit_len + bvec->bv_len || 7806 if (map_length < submit_len + bvec->bv_len ||
7437 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 7807 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7438 bvec->bv_offset) < bvec->bv_len)) { 7808 bvec->bv_offset) < bvec->bv_len) {
7439 /* 7809 /*
7440 * inc the count before we submit the bio so 7810 * inc the count before we submit the bio so
7441 * we know the end IO handler won't happen before 7811 * we know the end IO handler won't happen before
@@ -7464,6 +7834,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7464 goto out_err; 7834 goto out_err;
7465 bio->bi_private = dip; 7835 bio->bi_private = dip;
7466 bio->bi_end_io = btrfs_end_dio_bio; 7836 bio->bi_end_io = btrfs_end_dio_bio;
7837 btrfs_io_bio(bio)->logical = file_offset;
7467 7838
7468 map_length = orig_bio->bi_iter.bi_size; 7839 map_length = orig_bio->bi_iter.bi_size;
7469 ret = btrfs_map_block(root->fs_info, rw, 7840 ret = btrfs_map_block(root->fs_info, rw,
@@ -7507,11 +7878,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7507 struct btrfs_root *root = BTRFS_I(inode)->root; 7878 struct btrfs_root *root = BTRFS_I(inode)->root;
7508 struct btrfs_dio_private *dip; 7879 struct btrfs_dio_private *dip;
7509 struct bio *io_bio; 7880 struct bio *io_bio;
7881 struct btrfs_io_bio *btrfs_bio;
7510 int skip_sum; 7882 int skip_sum;
7511 int sum_len;
7512 int write = rw & REQ_WRITE; 7883 int write = rw & REQ_WRITE;
7513 int ret = 0; 7884 int ret = 0;
7514 u16 csum_size;
7515 7885
7516 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 7886 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7517 7887
@@ -7521,16 +7891,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7521 goto free_ordered; 7891 goto free_ordered;
7522 } 7892 }
7523 7893
7524 if (!skip_sum && !write) { 7894 dip = kzalloc(sizeof(*dip), GFP_NOFS);
7525 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7526 sum_len = dio_bio->bi_iter.bi_size >>
7527 inode->i_sb->s_blocksize_bits;
7528 sum_len *= csum_size;
7529 } else {
7530 sum_len = 0;
7531 }
7532
7533 dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7534 if (!dip) { 7895 if (!dip) {
7535 ret = -ENOMEM; 7896 ret = -ENOMEM;
7536 goto free_io_bio; 7897 goto free_io_bio;
@@ -7542,20 +7903,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7542 dip->bytes = dio_bio->bi_iter.bi_size; 7903 dip->bytes = dio_bio->bi_iter.bi_size;
7543 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 7904 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7544 io_bio->bi_private = dip; 7905 io_bio->bi_private = dip;
7545 dip->errors = 0;
7546 dip->orig_bio = io_bio; 7906 dip->orig_bio = io_bio;
7547 dip->dio_bio = dio_bio; 7907 dip->dio_bio = dio_bio;
7548 atomic_set(&dip->pending_bios, 0); 7908 atomic_set(&dip->pending_bios, 0);
7909 btrfs_bio = btrfs_io_bio(io_bio);
7910 btrfs_bio->logical = file_offset;
7549 7911
7550 if (write) 7912 if (write) {
7551 io_bio->bi_end_io = btrfs_endio_direct_write; 7913 io_bio->bi_end_io = btrfs_endio_direct_write;
7552 else 7914 } else {
7553 io_bio->bi_end_io = btrfs_endio_direct_read; 7915 io_bio->bi_end_io = btrfs_endio_direct_read;
7916 dip->subio_endio = btrfs_subio_endio_read;
7917 }
7554 7918
7555 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 7919 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7556 if (!ret) 7920 if (!ret)
7557 return; 7921 return;
7558 7922
7923 if (btrfs_bio->end_io)
7924 btrfs_bio->end_io(btrfs_bio, ret);
7559free_io_bio: 7925free_io_bio:
7560 bio_put(io_bio); 7926 bio_put(io_bio);
7561 7927
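Two earlier hunks explain this allocation change: the per-bio checksum array moves out of btrfs_dio_private (into btrfs_io_bio), so the variable-length kmalloc() becomes a fixed-size kzalloc(), and the zeroing is also why the explicit 'dip->errors = 0' above disappears. A condensed before/after sketch (field names illustrative):

/* before: csums tacked onto the end, every field inited by hand */
dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
if (dip)
        dip->errors = 0;

/* after: fixed-size struct, all fields start out zeroed */
dip = kzalloc(sizeof(*dip), GFP_NOFS);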
@@ -7652,8 +8018,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7652 ret = btrfs_delalloc_reserve_space(inode, count); 8018 ret = btrfs_delalloc_reserve_space(inode, count);
7653 if (ret) 8019 if (ret)
7654 goto out; 8020 goto out;
7655 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 8021 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7656 &BTRFS_I(inode)->runtime_flags))) { 8022 &BTRFS_I(inode)->runtime_flags)) {
7657 inode_dio_done(inode); 8023 inode_dio_done(inode);
7658 flags = DIO_LOCKING | DIO_SKIP_HOLES; 8024 flags = DIO_LOCKING | DIO_SKIP_HOLES;
7659 wakeup = false; 8025 wakeup = false;
@@ -8173,6 +8539,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8173 ei->last_sub_trans = 0; 8539 ei->last_sub_trans = 0;
8174 ei->logged_trans = 0; 8540 ei->logged_trans = 0;
8175 ei->delalloc_bytes = 0; 8541 ei->delalloc_bytes = 0;
8542 ei->defrag_bytes = 0;
8176 ei->disk_i_size = 0; 8543 ei->disk_i_size = 0;
8177 ei->flags = 0; 8544 ei->flags = 0;
8178 ei->csum_bytes = 0; 8545 ei->csum_bytes = 0;
@@ -8231,6 +8598,7 @@ void btrfs_destroy_inode(struct inode *inode)
8231 WARN_ON(BTRFS_I(inode)->reserved_extents); 8598 WARN_ON(BTRFS_I(inode)->reserved_extents);
8232 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 8599 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8233 WARN_ON(BTRFS_I(inode)->csum_bytes); 8600 WARN_ON(BTRFS_I(inode)->csum_bytes);
8601 WARN_ON(BTRFS_I(inode)->defrag_bytes);
8234 8602
8235 /* 8603 /*
8236 * This can happen where we create an inode, but somebody else also 8604 * This can happen where we create an inode, but somebody else also
@@ -8646,7 +9014,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8646 spin_unlock(&root->delalloc_lock); 9014 spin_unlock(&root->delalloc_lock);
8647 9015
8648 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 9016 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8649 if (unlikely(!work)) { 9017 if (!work) {
8650 if (delay_iput) 9018 if (delay_iput)
8651 btrfs_add_delayed_iput(inode); 9019 btrfs_add_delayed_iput(inode);
8652 else 9020 else
@@ -8832,7 +9200,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8832 } 9200 }
8833 key.objectid = btrfs_ino(inode); 9201 key.objectid = btrfs_ino(inode);
8834 key.offset = 0; 9202 key.offset = 0;
8835 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 9203 key.type = BTRFS_EXTENT_DATA_KEY;
8836 datasize = btrfs_file_extent_calc_inline_size(name_len); 9204 datasize = btrfs_file_extent_calc_inline_size(name_len);
8837 err = btrfs_insert_empty_item(trans, root, path, &key, 9205 err = btrfs_insert_empty_item(trans, root, path, &key,
8838 datasize); 9206 datasize);
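The btrfs_set_key_type() conversions repeated across this patch (inode.c, ioctl.c, orphan.c, scrub.c) are mechanical: the helper was a thin wrapper over the key's 'type' member, roughly as sketched below, so direct assignment is equivalent.

/* Sketch of the assumed old helpers. */
static inline void btrfs_set_key_type(struct btrfs_key *key, u8 type)
{
        key->type = type;       /* same effect as 'key.type = ...' */
}

static inline u8 btrfs_key_type(const struct btrfs_key *key)
{
        return key->type;
}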
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8a8e29878c34..e732274f1afd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
332 goto out_drop; 332 goto out_drop;
333 333
334 } else { 334 } else {
335 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
336 if (ret && ret != -ENODATA)
337 goto out_drop;
335 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 338 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
336 } 339 }
337 340
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
477 if (ret) 480 if (ret)
478 goto fail; 481 goto fail;
479 482
480 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 483 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
481 0, objectid, NULL, 0, 0, 0);
482 if (IS_ERR(leaf)) { 484 if (IS_ERR(leaf)) {
483 ret = PTR_ERR(leaf); 485 ret = PTR_ERR(leaf);
484 goto fail; 486 goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
503 btrfs_set_stack_inode_generation(inode_item, 1); 505 btrfs_set_stack_inode_generation(inode_item, 1);
504 btrfs_set_stack_inode_size(inode_item, 3); 506 btrfs_set_stack_inode_size(inode_item, 3);
505 btrfs_set_stack_inode_nlink(inode_item, 1); 507 btrfs_set_stack_inode_nlink(inode_item, 1);
506 btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); 508 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
507 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 509 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
508 510
509 btrfs_set_root_flags(&root_item, 0); 511 btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
535 537
536 key.objectid = objectid; 538 key.objectid = objectid;
537 key.offset = 0; 539 key.offset = 0;
538 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 540 key.type = BTRFS_ROOT_ITEM_KEY;
539 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 541 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
540 &root_item); 542 &root_item);
541 if (ret) 543 if (ret)
@@ -882,7 +884,7 @@ out_unlock:
882 * file you want to defrag, we return 0 to let you know to skip this 884 * file you want to defrag, we return 0 to let you know to skip this
883 * part of the file 885 * part of the file
884 */ 886 */
885static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) 887static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
886{ 888{
887 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 889 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
888 struct extent_map *em = NULL; 890 struct extent_map *em = NULL;
@@ -917,7 +919,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
917 */ 919 */
918static int find_new_extents(struct btrfs_root *root, 920static int find_new_extents(struct btrfs_root *root,
919 struct inode *inode, u64 newer_than, 921 struct inode *inode, u64 newer_than,
920 u64 *off, int thresh) 922 u64 *off, u32 thresh)
921{ 923{
922 struct btrfs_path *path; 924 struct btrfs_path *path;
923 struct btrfs_key min_key; 925 struct btrfs_key min_key;
@@ -936,12 +938,9 @@ static int find_new_extents(struct btrfs_root *root,
936 min_key.offset = *off; 938 min_key.offset = *off;
937 939
938 while (1) { 940 while (1) {
939 path->keep_locks = 1;
940 ret = btrfs_search_forward(root, &min_key, path, newer_than); 941 ret = btrfs_search_forward(root, &min_key, path, newer_than);
941 if (ret != 0) 942 if (ret != 0)
942 goto none; 943 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot: 944process_slot:
946 if (min_key.objectid != ino) 945 if (min_key.objectid != ino)
947 goto none; 946 goto none;
@@ -1029,7 +1028,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
1029 return ret; 1028 return ret;
1030} 1029}
1031 1030
1032static int should_defrag_range(struct inode *inode, u64 start, int thresh, 1031static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
1033 u64 *last_len, u64 *skip, u64 *defrag_end, 1032 u64 *last_len, u64 *skip, u64 *defrag_end,
1034 int compress) 1033 int compress)
1035{ 1034{
@@ -1259,7 +1258,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1259 int ret; 1258 int ret;
1260 int defrag_count = 0; 1259 int defrag_count = 0;
1261 int compress_type = BTRFS_COMPRESS_ZLIB; 1260 int compress_type = BTRFS_COMPRESS_ZLIB;
1262 int extent_thresh = range->extent_thresh; 1261 u32 extent_thresh = range->extent_thresh;
1263 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 1262 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1264 unsigned long cluster = max_cluster; 1263 unsigned long cluster = max_cluster;
1265 u64 new_align = ~((u64)128 * 1024 - 1); 1264 u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1335,8 +1334,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1335 inode->i_mapping->writeback_index = i; 1334 inode->i_mapping->writeback_index = i;
1336 1335
1337 while (i <= last_index && defrag_count < max_to_defrag && 1336 while (i <= last_index && defrag_count < max_to_defrag &&
1338 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 1337 (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
1339 PAGE_CACHE_SHIFT)) {
1340 /* 1338 /*
1341 * make sure we stop running if someone unmounts 1339 * make sure we stop running if someone unmounts
1342 * the FS 1340 * the FS
@@ -1359,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1359 * the should_defrag function tells us how much to skip 1357 * the should_defrag function tells us how much to skip
1360 * bump our counter by the suggested amount 1358 * bump our counter by the suggested amount
1361 */ 1359 */
1362 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1360 next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
1363 i = max(i + 1, next); 1361 i = max(i + 1, next);
1364 continue; 1362 continue;
1365 } 1363 }
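This hunk, like the lzo.c and raid56.c ones below, swaps an open-coded round-up for DIV_ROUND_UP(). The identity being relied on, checkable in userspace:

#include <assert.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        uint64_t skip = 12345, page_size = 4096;
        unsigned int page_shift = 12;   /* page_size == 1 << page_shift */

        /* ceiling division == add-then-shift for power-of-two sizes */
        assert(((skip + page_size - 1) >> page_shift) ==
               DIV_ROUND_UP(skip, page_size));
        return 0;
}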
@@ -1554,7 +1552,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1554 goto out_free; 1552 goto out_free;
1555 } 1553 }
1556 1554
1557 old_size = device->total_bytes; 1555 old_size = btrfs_device_get_total_bytes(device);
1558 1556
1559 if (mod < 0) { 1557 if (mod < 0) {
1560 if (new_size > old_size) { 1558 if (new_size > old_size) {
@@ -2089,8 +2087,6 @@ static noinline int search_ioctl(struct inode *inode,
2089 key.type = sk->min_type; 2087 key.type = sk->min_type;
2090 key.offset = sk->min_offset; 2088 key.offset = sk->min_offset;
2091 2089
2092 path->keep_locks = 1;
2093
2094 while (1) { 2090 while (1) {
2095 ret = btrfs_search_forward(root, &key, path, sk->min_transid); 2091 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
2096 if (ret != 0) { 2092 if (ret != 0) {
@@ -2526,9 +2522,9 @@ out_unlock:
2526 ASSERT(dest->send_in_progress == 0); 2522 ASSERT(dest->send_in_progress == 0);
2527 2523
2528 /* the last ref */ 2524 /* the last ref */
2529 if (dest->cache_inode) { 2525 if (dest->ino_cache_inode) {
2530 iput(dest->cache_inode); 2526 iput(dest->ino_cache_inode);
2531 dest->cache_inode = NULL; 2527 dest->ino_cache_inode = NULL;
2532 } 2528 }
2533 } 2529 }
2534out_dput: 2530out_dput:
@@ -2634,6 +2630,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2634 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2630 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2635 ret = btrfs_init_new_device(root, vol_args->name); 2631 ret = btrfs_init_new_device(root, vol_args->name);
2636 2632
2633 if (!ret)
2634 btrfs_info(root->fs_info, "disk added %s",vol_args->name);
2635
2637 kfree(vol_args); 2636 kfree(vol_args);
2638out: 2637out:
2639 mutex_unlock(&root->fs_info->volume_mutex); 2638 mutex_unlock(&root->fs_info->volume_mutex);
@@ -2673,6 +2672,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2673 mutex_unlock(&root->fs_info->volume_mutex); 2672 mutex_unlock(&root->fs_info->volume_mutex);
2674 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2673 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2675 2674
2675 if (!ret)
2676	 	btrfs_info(root->fs_info, "disk deleted %s", vol_args->name);
2677
2676out: 2678out:
2677 kfree(vol_args); 2679 kfree(vol_args);
2678err_drop: 2680err_drop:
@@ -2737,8 +2739,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2737 } 2739 }
2738 2740
2739 di_args->devid = dev->devid; 2741 di_args->devid = dev->devid;
2740 di_args->bytes_used = dev->bytes_used; 2742 di_args->bytes_used = btrfs_device_get_bytes_used(dev);
2741 di_args->total_bytes = dev->total_bytes; 2743 di_args->total_bytes = btrfs_device_get_total_bytes(dev);
2742 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2744 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2743 if (dev->name) { 2745 if (dev->name) {
2744 struct rcu_string *name; 2746 struct rcu_string *name;
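The raw dev->total_bytes/dev->bytes_used reads become accessor calls; per the volumes.h changes listed in this series' diffstat, the getters funnel the reads through one place so a concurrent resize or replace publishes sizes consistently (the real implementation may synchronize, e.g. with a seqcount). A deliberately naive sketch:

/* Assumed accessor shape, not the verbatim implementation. */
static inline u64 btrfs_device_get_total_bytes(struct btrfs_device *dev)
{
        return dev->total_bytes;
}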
@@ -3164,7 +3166,7 @@ static void clone_update_extent_map(struct inode *inode,
3164 em->start + em->len - 1, 0); 3166 em->start + em->len - 1, 0);
3165 } 3167 }
3166 3168
3167 if (unlikely(ret)) 3169 if (ret)
3168 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3170 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3169 &BTRFS_I(inode)->runtime_flags); 3171 &BTRFS_I(inode)->runtime_flags);
3170} 3172}
@@ -3199,7 +3201,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3199 u64 last_dest_end = destoff; 3201 u64 last_dest_end = destoff;
3200 3202
3201 ret = -ENOMEM; 3203 ret = -ENOMEM;
3202 buf = vmalloc(btrfs_level_size(root, 0)); 3204 buf = vmalloc(root->nodesize);
3203 if (!buf) 3205 if (!buf)
3204 return ret; 3206 return ret;
3205 3207
@@ -3252,11 +3254,11 @@ process_slot:
3252 slot = path->slots[0]; 3254 slot = path->slots[0];
3253 3255
3254 btrfs_item_key_to_cpu(leaf, &key, slot); 3256 btrfs_item_key_to_cpu(leaf, &key, slot);
3255 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 3257 if (key.type > BTRFS_EXTENT_DATA_KEY ||
3256 key.objectid != btrfs_ino(src)) 3258 key.objectid != btrfs_ino(src))
3257 break; 3259 break;
3258 3260
3259 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 3261 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3260 struct btrfs_file_extent_item *extent; 3262 struct btrfs_file_extent_item *extent;
3261 int type; 3263 int type;
3262 u32 size; 3264 u32 size;
@@ -5283,6 +5285,12 @@ long btrfs_ioctl(struct file *file, unsigned int
5283 if (ret) 5285 if (ret)
5284 return ret; 5286 return ret;
5285 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 5287 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
5288 /*
5289 * The transaction thread may want to do more work,
5290	 * namely it pokes the cleaner kthread that will start
5291 * processing uncleaned subvols.
5292 */
5293 wake_up_process(root->fs_info->transaction_kthread);
5286 return ret; 5294 return ret;
5287 } 5295 }
5288 case BTRFS_IOC_START_SYNC: 5296 case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad8514f0da..78285f30909e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
266 char *data_in; 266 char *data_in;
267 unsigned long page_in_index = 0; 267 unsigned long page_in_index = 0;
268 unsigned long page_out_index = 0; 268 unsigned long page_out_index = 0;
269 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 269 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
270 PAGE_CACHE_SIZE;
271 unsigned long buf_start; 270 unsigned long buf_start;
272 unsigned long buf_offset = 0; 271 unsigned long buf_offset = 0;
273 unsigned long bytes; 272 unsigned long bytes;
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793edb38ca..47767d5b8f0b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
27 int ret = 0; 27 int ret = 0;
28 28
29 key.objectid = BTRFS_ORPHAN_OBJECTID; 29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 30 key.type = BTRFS_ORPHAN_ITEM_KEY;
31 key.offset = offset; 31 key.offset = offset;
32 32
33 path = btrfs_alloc_path(); 33 path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
48 int ret = 0; 48 int ret = 0;
49 49
50 key.objectid = BTRFS_ORPHAN_OBJECTID; 50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 51 key.type = BTRFS_ORPHAN_ITEM_KEY;
52 key.offset = offset; 52 key.offset = offset;
53 53
54 path = btrfs_alloc_path(); 54 path = btrfs_alloc_path();
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4ad3b9a..647ab12fdf5d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
195 for (i = 0 ; i < nr ; i++) { 195 for (i = 0 ; i < nr ; i++) {
196 item = btrfs_item_nr(i); 196 item = btrfs_item_nr(i);
197 btrfs_item_key_to_cpu(l, &key, i); 197 btrfs_item_key_to_cpu(l, &key, i);
198 type = btrfs_key_type(&key); 198 type = key.type;
199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " 199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
200 "itemsize %d\n", 200 "itemsize %d\n",
201 i, key.objectid, type, key.offset, 201 i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
336 for (i = 0; i < nr; i++) { 336 for (i = 0; i < nr; i++) {
337 struct extent_buffer *next = read_tree_block(root, 337 struct extent_buffer *next = read_tree_block(root,
338 btrfs_node_blockptr(c, i), 338 btrfs_node_blockptr(c, i),
339 btrfs_level_size(root, level - 1),
340 btrfs_node_ptr_generation(c, i)); 339 btrfs_node_ptr_generation(c, i));
341 if (btrfs_is_leaf(next) && 340 if (btrfs_is_leaf(next) &&
342 level != 1) 341 level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ded5c601d916..48b60dbf807f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
539 struct extent_buffer *leaf; 539 struct extent_buffer *leaf;
540 struct btrfs_key key; 540 struct btrfs_key key;
541 541
542#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 542 if (btrfs_test_is_dummy_root(quota_root))
543 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
544 return 0; 543 return 0;
545#endif 544
546 path = btrfs_alloc_path(); 545 path = btrfs_alloc_path();
547 if (!path) 546 if (!path)
548 return -ENOMEM; 547 return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
551 key.type = BTRFS_QGROUP_INFO_KEY; 550 key.type = BTRFS_QGROUP_INFO_KEY;
552 key.offset = qgroupid; 551 key.offset = qgroupid;
553 552
553 /*
554 * Avoid a transaction abort by catching -EEXIST here. In that
555 * case, we proceed by re-initializing the existing structure
556 * on disk.
557 */
558
554 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 559 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
555 sizeof(*qgroup_info)); 560 sizeof(*qgroup_info));
556 if (ret) 561 if (ret && ret != -EEXIST)
557 goto out; 562 goto out;
558 563
559 leaf = path->nodes[0]; 564 leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
572 key.type = BTRFS_QGROUP_LIMIT_KEY; 577 key.type = BTRFS_QGROUP_LIMIT_KEY;
573 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 578 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
574 sizeof(*qgroup_limit)); 579 sizeof(*qgroup_limit));
575 if (ret) 580 if (ret && ret != -EEXIST)
576 goto out; 581 goto out;
577 582
578 leaf = path->nodes[0]; 583 leaf = path->nodes[0];
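Tolerating -EEXIST turns this into an insert-or-reuse operation: on a duplicate key the btree path still points at the existing slot, and the initialization that follows rewrites the item either way. Reduced to a sketch:

ret = btrfs_insert_empty_item(trans, quota_root, path, &key, size);
if (ret && ret != -EEXIST)
        goto out;               /* genuine failure */
/* ret == 0 or -EEXIST: path->nodes[0]/path->slots[0] address the
 * (new or pre-existing) item, re-initialized unconditionally below */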
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
692 int ret; 697 int ret;
693 int slot; 698 int slot;
694 699
695#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 700 if (btrfs_test_is_dummy_root(root))
696 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
697 return 0; 701 return 0;
698#endif 702
699 key.objectid = 0; 703 key.objectid = 0;
700 key.type = BTRFS_QGROUP_INFO_KEY; 704 key.type = BTRFS_QGROUP_INFO_KEY;
701 key.offset = qgroup->qgroupid; 705 key.offset = qgroup->qgroupid;
@@ -1335,6 +1339,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1335 INIT_LIST_HEAD(&oper->elem.list); 1339 INIT_LIST_HEAD(&oper->elem.list);
1336 oper->elem.seq = 0; 1340 oper->elem.seq = 0;
1337 1341
1342 trace_btrfs_qgroup_record_ref(oper);
1343
1338 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { 1344 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1339 /* 1345 /*
1340 * If any operation for this bytenr/ref_root combo 1346 * If any operation for this bytenr/ref_root combo
@@ -2077,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
2077 2083
2078 ASSERT(is_fstree(oper->ref_root)); 2084 ASSERT(is_fstree(oper->ref_root));
2079 2085
2086 trace_btrfs_qgroup_account(oper);
2087
2080 switch (oper->type) { 2088 switch (oper->type) {
2081 case BTRFS_QGROUP_OPER_ADD_EXCL: 2089 case BTRFS_QGROUP_OPER_ADD_EXCL:
2082 case BTRFS_QGROUP_OPER_SUB_EXCL: 2090 case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -2237,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2237 if (srcid) { 2245 if (srcid) {
2238 struct btrfs_root *srcroot; 2246 struct btrfs_root *srcroot;
2239 struct btrfs_key srckey; 2247 struct btrfs_key srckey;
2240 int srcroot_level;
2241 2248
2242 srckey.objectid = srcid; 2249 srckey.objectid = srcid;
2243 srckey.type = BTRFS_ROOT_ITEM_KEY; 2250 srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2249,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2249 } 2256 }
2250 2257
2251 rcu_read_lock(); 2258 rcu_read_lock();
2252 srcroot_level = btrfs_header_level(srcroot->node); 2259 level_size = srcroot->nodesize;
2253 level_size = btrfs_level_size(srcroot, srcroot_level);
2254 rcu_read_unlock(); 2260 rcu_read_unlock();
2255 } 2261 }
2256 2262
@@ -2566,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2566 found.type != BTRFS_METADATA_ITEM_KEY) 2572 found.type != BTRFS_METADATA_ITEM_KEY)
2567 continue; 2573 continue;
2568 if (found.type == BTRFS_METADATA_ITEM_KEY) 2574 if (found.type == BTRFS_METADATA_ITEM_KEY)
2569 num_bytes = fs_info->extent_root->leafsize; 2575 num_bytes = fs_info->extent_root->nodesize;
2570 else 2576 else
2571 num_bytes = found.offset; 2577 num_bytes = found.offset;
2572 2578
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0a6b6e4bcbb9..6a41631cb959 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913{ 913{
914 unsigned long nr = stripe_len * nr_stripes; 914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 915 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
916} 916}
917 917
918/* 918/*
@@ -1442,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1442 struct btrfs_bio *bbio = rbio->bbio; 1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list; 1443 struct bio_list bio_list;
1444 int ret; 1444 int ret;
1445 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1446 int pagenr; 1446 int pagenr;
1447 int stripe; 1447 int stripe;
1448 struct bio *bio; 1448 struct bio *bio;
@@ -1725,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1725 int pagenr, stripe; 1725 int pagenr, stripe;
1726 void **pointers; 1726 void **pointers;
1727 int faila = -1, failb = -1; 1727 int faila = -1, failb = -1;
1728 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1728 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1729 struct page *page; 1729 struct page *page;
1730 int err; 1730 int err;
1731 int i; 1731 int i;
@@ -1940,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1940 struct btrfs_bio *bbio = rbio->bbio; 1940 struct btrfs_bio *bbio = rbio->bbio;
1941 struct bio_list bio_list; 1941 struct bio_list bio_list;
1942 int ret; 1942 int ret;
1943 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1944 int pagenr; 1944 int pagenr;
1945 int stripe; 1945 int stripe;
1946 struct bio *bio; 1946 struct bio *bio;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 20408c6b665a..b63ae20618fb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
347 if (!re) 347 if (!re)
348 return NULL; 348 return NULL;
349 349
350 blocksize = btrfs_level_size(root, level); 350 blocksize = root->nodesize;
351 re->logical = logical; 351 re->logical = logical;
352 re->blocksize = blocksize; 352 re->blocksize = blocksize;
353 re->top = *top; 353 re->top = *top;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..74257d6436ad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
736 err = ret; 736 err = ret;
737 goto out; 737 goto out;
738 } 738 }
739 BUG_ON(!ret || !path1->slots[0]); 739 ASSERT(ret);
740 ASSERT(path1->slots[0]);
740 741
741 path1->slots[0]--; 742 path1->slots[0]--;
742 743
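The BUG_ON()-to-ASSERT() conversions running through build_backref_tree() trade an unconditional crash for a debug-build check. btrfs's ASSERT compiles away unless CONFIG_BTRFS_ASSERT is set; paraphrasing its ctree.h definition:

#ifdef CONFIG_BTRFS_ASSERT
#define ASSERT(expr) \
        (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr) ((void)0)
#endif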
@@ -746,10 +747,10 @@ again:
746 * the backref was added previously when processing 747 * the backref was added previously when processing
747 * backref of type BTRFS_TREE_BLOCK_REF_KEY 748 * backref of type BTRFS_TREE_BLOCK_REF_KEY
748 */ 749 */
749 BUG_ON(!list_is_singular(&cur->upper)); 750 ASSERT(list_is_singular(&cur->upper));
750 edge = list_entry(cur->upper.next, struct backref_edge, 751 edge = list_entry(cur->upper.next, struct backref_edge,
751 list[LOWER]); 752 list[LOWER]);
752 BUG_ON(!list_empty(&edge->list[UPPER])); 753 ASSERT(list_empty(&edge->list[UPPER]));
753 exist = edge->node[UPPER]; 754 exist = edge->node[UPPER];
754 /* 755 /*
755 * add the upper level block to pending list if we need 756 * add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
831 cur->cowonly = 1; 832 cur->cowonly = 1;
832 } 833 }
833#else 834#else
834 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 835 ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
835 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { 836 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
836#endif 837#endif
837 if (key.objectid == key.offset) { 838 if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
840 * backref of this type. 841 * backref of this type.
841 */ 842 */
842 root = find_reloc_root(rc, cur->bytenr); 843 root = find_reloc_root(rc, cur->bytenr);
843 BUG_ON(!root); 844 ASSERT(root);
844 cur->root = root; 845 cur->root = root;
845 break; 846 break;
846 } 847 }
@@ -868,7 +869,7 @@ again:
868 } else { 869 } else {
869 upper = rb_entry(rb_node, struct backref_node, 870 upper = rb_entry(rb_node, struct backref_node,
870 rb_node); 871 rb_node);
871 BUG_ON(!upper->checked); 872 ASSERT(upper->checked);
872 INIT_LIST_HEAD(&edge->list[UPPER]); 873 INIT_LIST_HEAD(&edge->list[UPPER]);
873 } 874 }
874 list_add_tail(&edge->list[LOWER], &cur->upper); 875 list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
892 893
893 if (btrfs_root_level(&root->root_item) == cur->level) { 894 if (btrfs_root_level(&root->root_item) == cur->level) {
894 /* tree root */ 895 /* tree root */
895 BUG_ON(btrfs_root_bytenr(&root->root_item) != 896 ASSERT(btrfs_root_bytenr(&root->root_item) ==
896 cur->bytenr); 897 cur->bytenr);
897 if (should_ignore_root(root)) 898 if (should_ignore_root(root))
898 list_add(&cur->list, &useless); 899 list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
927 need_check = true; 928 need_check = true;
928 for (; level < BTRFS_MAX_LEVEL; level++) { 929 for (; level < BTRFS_MAX_LEVEL; level++) {
929 if (!path2->nodes[level]) { 930 if (!path2->nodes[level]) {
930 BUG_ON(btrfs_root_bytenr(&root->root_item) != 931 ASSERT(btrfs_root_bytenr(&root->root_item) ==
931 lower->bytenr); 932 lower->bytenr);
932 if (should_ignore_root(root)) 933 if (should_ignore_root(root))
933 list_add(&lower->list, &useless); 934 list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
977 need_check = false; 978 need_check = false;
978 list_add_tail(&edge->list[UPPER], 979 list_add_tail(&edge->list[UPPER],
979 &list); 980 &list);
980 } else 981 } else {
982 if (upper->checked)
983 need_check = true;
981 INIT_LIST_HEAD(&edge->list[UPPER]); 984 INIT_LIST_HEAD(&edge->list[UPPER]);
985 }
982 } else { 986 } else {
983 upper = rb_entry(rb_node, struct backref_node, 987 upper = rb_entry(rb_node, struct backref_node,
984 rb_node); 988 rb_node);
985 BUG_ON(!upper->checked); 989 ASSERT(upper->checked);
986 INIT_LIST_HEAD(&edge->list[UPPER]); 990 INIT_LIST_HEAD(&edge->list[UPPER]);
987 if (!upper->owner) 991 if (!upper->owner)
988 upper->owner = btrfs_header_owner(eb); 992 upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
1026 * everything goes well, connect backref nodes and insert backref nodes 1030 * everything goes well, connect backref nodes and insert backref nodes
1027 * into the cache. 1031 * into the cache.
1028 */ 1032 */
1029 BUG_ON(!node->checked); 1033 ASSERT(node->checked);
1030 cowonly = node->cowonly; 1034 cowonly = node->cowonly;
1031 if (!cowonly) { 1035 if (!cowonly) {
1032 rb_node = tree_insert(&cache->rb_root, node->bytenr, 1036 rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
1062 continue; 1066 continue;
1063 } 1067 }
1064 1068
1065 BUG_ON(!upper->checked); 1069 if (!upper->checked) {
1066 BUG_ON(cowonly != upper->cowonly); 1070 /*
1071 * Still want to blow up for developers since this is a
1072 * logic bug.
1073 */
1074 ASSERT(0);
1075 err = -EINVAL;
1076 goto out;
1077 }
1078 if (cowonly != upper->cowonly) {
1079 ASSERT(0);
1080 err = -EINVAL;
1081 goto out;
1082 }
1083
1067 if (!cowonly) { 1084 if (!cowonly) {
1068 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1085 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1069 &upper->rb_node); 1086 &upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
1086 while (!list_empty(&useless)) { 1103 while (!list_empty(&useless)) {
1087 upper = list_entry(useless.next, struct backref_node, list); 1104 upper = list_entry(useless.next, struct backref_node, list);
1088 list_del_init(&upper->list); 1105 list_del_init(&upper->list);
1089 BUG_ON(!list_empty(&upper->upper)); 1106 ASSERT(list_empty(&upper->upper));
1090 if (upper == node) 1107 if (upper == node)
1091 node = NULL; 1108 node = NULL;
1092 if (upper->lowest) { 1109 if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
1119 if (err) { 1136 if (err) {
1120 while (!list_empty(&useless)) { 1137 while (!list_empty(&useless)) {
1121 lower = list_entry(useless.next, 1138 lower = list_entry(useless.next,
1122 struct backref_node, upper); 1139 struct backref_node, list);
1123 list_del_init(&lower->upper); 1140 list_del_init(&lower->list);
1124 } 1141 }
1125 upper = node; 1142 while (!list_empty(&list)) {
1126 INIT_LIST_HEAD(&list); 1143 edge = list_first_entry(&list, struct backref_edge,
1127 while (upper) { 1144 list[UPPER]);
1128 if (RB_EMPTY_NODE(&upper->rb_node)) { 1145 list_del(&edge->list[UPPER]);
1129 list_splice_tail(&upper->upper, &list);
1130 free_backref_node(cache, upper);
1131 }
1132
1133 if (list_empty(&list))
1134 break;
1135
1136 edge = list_entry(list.next, struct backref_edge,
1137 list[LOWER]);
1138 list_del(&edge->list[LOWER]); 1146 list_del(&edge->list[LOWER]);
1147 lower = edge->node[LOWER];
1139 upper = edge->node[UPPER]; 1148 upper = edge->node[UPPER];
1140 free_backref_edge(cache, edge); 1149 free_backref_edge(cache, edge);
1150
1151 /*
1152 * Lower is no longer linked to any upper backref nodes
1153	 * and isn't in the cache, so we can free it ourselves.
1154 */
1155 if (list_empty(&lower->upper) &&
1156 RB_EMPTY_NODE(&lower->rb_node))
1157 list_add(&lower->list, &useless);
1158
1159 if (!RB_EMPTY_NODE(&upper->rb_node))
1160 continue;
1161
1162	 		/* Add this guy's upper edges to the list to process */
1163 list_for_each_entry(edge, &upper->upper, list[LOWER])
1164 list_add_tail(&edge->list[UPPER], &list);
1165 if (list_empty(&upper->upper))
1166 list_add(&upper->list, &useless);
1167 }
1168
1169 while (!list_empty(&useless)) {
1170 lower = list_entry(useless.next,
1171 struct backref_node, list);
1172 list_del_init(&lower->list);
1173 free_backref_node(cache, lower);
1141 } 1174 }
1142 return ERR_PTR(err); 1175 return ERR_PTR(err);
1143 } 1176 }
1144 BUG_ON(node && node->detached); 1177 ASSERT(!node || !node->detached);
1145 return node; 1178 return node;
1146} 1179}
1147 1180
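The rewritten error path walks pending UPPER edges iteratively, detaching each edge from both endpoints; a node is queued on 'useless' for freeing only once nothing references it. The guarding invariant, isolated:

/* Free a node on the error path only when it is fully unlinked:
 * no upper edges left, and never inserted into the cache rbtree
 * (otherwise normal cache teardown owns it). */
if (list_empty(&lower->upper) && RB_EMPTY_NODE(&lower->rb_node))
        list_add(&lower->list, &useless);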
@@ -1787,7 +1820,7 @@ again:
1787 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1820 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1788 1821
1789 old_bytenr = btrfs_node_blockptr(parent, slot); 1822 old_bytenr = btrfs_node_blockptr(parent, slot);
1790 blocksize = btrfs_level_size(dest, level - 1); 1823 blocksize = dest->nodesize;
1791 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1824 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1792 1825
1793 if (level <= max_level) { 1826 if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
1813 break; 1846 break;
1814 } 1847 }
1815 1848
1816 eb = read_tree_block(dest, old_bytenr, blocksize, 1849 eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1817 old_ptr_gen);
1818 if (!eb || !extent_buffer_uptodate(eb)) { 1850 if (!eb || !extent_buffer_uptodate(eb)) {
1819 ret = (!eb) ? -ENOMEM : -EIO; 1851 ret = (!eb) ? -ENOMEM : -EIO;
1820 free_extent_buffer(eb); 1852 free_extent_buffer(eb);
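read_tree_block() loses its blocksize argument here and in the other relocation.c hunks; with leafsize folded into nodesize, the size is implied by the root. The new prototype is presumably:

/* Assumed prototype, consistent with every converted call site;
 * internally the block size comes from root->nodesize. */
struct extent_buffer *read_tree_block(struct btrfs_root *root,
                                      u64 bytenr, u64 parent_transid);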
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1944 u64 bytenr; 1976 u64 bytenr;
1945 u64 ptr_gen = 0; 1977 u64 ptr_gen = 0;
1946 u64 last_snapshot; 1978 u64 last_snapshot;
1947 u32 blocksize;
1948 u32 nritems; 1979 u32 nritems;
1949 1980
1950 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1981 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1970 } 2001 }
1971 2002
1972 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2003 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1973 blocksize = btrfs_level_size(root, i - 1); 2004 eb = read_tree_block(root, bytenr, ptr_gen);
1974 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1975 if (!eb || !extent_buffer_uptodate(eb)) { 2005 if (!eb || !extent_buffer_uptodate(eb)) {
1976 free_extent_buffer(eb); 2006 free_extent_buffer(eb);
1977 return -EIO; 2007 return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
2316} 2346}
2317 2347
2318static noinline_for_stack 2348static noinline_for_stack
2319int merge_reloc_roots(struct reloc_control *rc) 2349void merge_reloc_roots(struct reloc_control *rc)
2320{ 2350{
2321 struct btrfs_root *root; 2351 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2352 struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
2397 } 2427 }
2398 2428
2399 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2429 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2400 return ret;
2401} 2430}
2402 2431
2403static void free_block_list(struct rb_root *blocks) 2432static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
2544 if (next->processed && (reserve || next != node)) 2573 if (next->processed && (reserve || next != node))
2545 break; 2574 break;
2546 2575
2547 num_bytes += btrfs_level_size(rc->extent_root, 2576 num_bytes += rc->extent_root->nodesize;
2548 next->level);
2549 2577
2550 if (list_empty(&next->upper)) 2578 if (list_empty(&next->upper))
2551 break; 2579 break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2679 goto next; 2707 goto next;
2680 } 2708 }
2681 2709
2682 blocksize = btrfs_level_size(root, node->level); 2710 blocksize = root->nodesize;
2683 generation = btrfs_node_ptr_generation(upper->eb, slot); 2711 generation = btrfs_node_ptr_generation(upper->eb, slot);
2684 eb = read_tree_block(root, bytenr, blocksize, generation); 2712 eb = read_tree_block(root, bytenr, generation);
2685 if (!eb || !extent_buffer_uptodate(eb)) { 2713 if (!eb || !extent_buffer_uptodate(eb)) {
2686 free_extent_buffer(eb); 2714 free_extent_buffer(eb);
2687 err = -EIO; 2715 err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
2789 u32 blocksize; 2817 u32 blocksize;
2790 if (node->level == 0 || 2818 if (node->level == 0 ||
2791 in_block_group(node->bytenr, rc->block_group)) { 2819 in_block_group(node->bytenr, rc->block_group)) {
2792 blocksize = btrfs_level_size(rc->extent_root, node->level); 2820 blocksize = rc->extent_root->nodesize;
2793 mark_block_processed(rc, node->bytenr, blocksize); 2821 mark_block_processed(rc, node->bytenr, blocksize);
2794 } 2822 }
2795 node->processed = 1; 2823 node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2843 2871
2844 BUG_ON(block->key_ready); 2872 BUG_ON(block->key_ready);
2845 eb = read_tree_block(rc->extent_root, block->bytenr, 2873 eb = read_tree_block(rc->extent_root, block->bytenr,
2846 block->key.objectid, block->key.offset); 2874 block->key.offset);
2847 if (!eb || !extent_buffer_uptodate(eb)) { 2875 if (!eb || !extent_buffer_uptodate(eb)) {
2848 free_extent_buffer(eb); 2876 free_extent_buffer(eb);
2849 return -EIO; 2877 return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
2858 return 0; 2886 return 0;
2859} 2887}
2860 2888
2861static int reada_tree_block(struct reloc_control *rc,
2862 struct tree_block *block)
2863{
2864 BUG_ON(block->key_ready);
2865 if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2866 readahead_tree_block(rc->extent_root, block->bytenr,
2867 block->key.objectid,
2868 rc->extent_root->leafsize);
2869 else
2870 readahead_tree_block(rc->extent_root, block->bytenr,
2871 block->key.objectid, block->key.offset);
2872 return 0;
2873}
2874
2875/* 2889/*
2876 * helper function to relocate a tree block 2890 * helper function to relocate a tree block
2877 */ 2891 */
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2951 while (rb_node) { 2965 while (rb_node) {
2952 block = rb_entry(rb_node, struct tree_block, rb_node); 2966 block = rb_entry(rb_node, struct tree_block, rb_node);
2953 if (!block->key_ready) 2967 if (!block->key_ready)
2954 reada_tree_block(rc, block); 2968 readahead_tree_block(rc->extent_root, block->bytenr,
2969 block->key.objectid);
2955 rb_node = rb_next(rb_node); 2970 rb_node = rb_next(rb_node);
2956 } 2971 }
2957 2972
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
3313 return -ENOMEM; 3328 return -ENOMEM;
3314 3329
3315 block->bytenr = extent_key->objectid; 3330 block->bytenr = extent_key->objectid;
3316 block->key.objectid = rc->extent_root->leafsize; 3331 block->key.objectid = rc->extent_root->nodesize;
3317 block->key.offset = generation; 3332 block->key.offset = generation;
3318 block->level = level; 3333 block->level = level;
3319 block->key_ready = 0; 3334 block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
3640 struct btrfs_extent_inline_ref *iref; 3655 struct btrfs_extent_inline_ref *iref;
3641 unsigned long ptr; 3656 unsigned long ptr;
3642 unsigned long end; 3657 unsigned long end;
3643 u32 blocksize = btrfs_level_size(rc->extent_root, 0); 3658 u32 blocksize = rc->extent_root->nodesize;
3644 int ret = 0; 3659 int ret = 0;
3645 int err = 0; 3660 int err = 0;
3646 3661
@@ -3783,7 +3798,7 @@ next:
3783 } 3798 }
3784 3799
3785 if (key.type == BTRFS_METADATA_ITEM_KEY && 3800 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3786 key.objectid + rc->extent_root->leafsize <= 3801 key.objectid + rc->extent_root->nodesize <=
3787 rc->search_start) { 3802 rc->search_start) {
3788 path->slots[0]++; 3803 path->slots[0]++;
3789 goto next; 3804 goto next;
@@ -3801,7 +3816,7 @@ next:
3801 rc->search_start = key.objectid + key.offset; 3816 rc->search_start = key.objectid + key.offset;
3802 else 3817 else
3803 rc->search_start = key.objectid + 3818 rc->search_start = key.objectid +
3804 rc->extent_root->leafsize; 3819 rc->extent_root->nodesize;
3805 memcpy(extent_key, &key, sizeof(key)); 3820 memcpy(extent_key, &key, sizeof(key));
3806 return 0; 3821 return 0;
3807 } 3822 }
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4096 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4111 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4097 BTRFS_INODE_PREALLOC); 4112 BTRFS_INODE_PREALLOC);
4098 btrfs_mark_buffer_dirty(leaf); 4113 btrfs_mark_buffer_dirty(leaf);
4099 btrfs_release_path(path);
4100out: 4114out:
4101 btrfs_free_path(path); 4115 btrfs_free_path(path);
4102 return ret; 4116 return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f4a41f37be22..efa083113827 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
137 int pages_per_rd_bio; 137 int pages_per_rd_bio;
138 u32 sectorsize; 138 u32 sectorsize;
139 u32 nodesize; 139 u32 nodesize;
140 u32 leafsize;
141 140
142 int is_dev_replace; 141 int is_dev_replace;
143 struct scrub_wr_ctx wr_ctx; 142 struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
178struct scrub_warning { 177struct scrub_warning {
179 struct btrfs_path *path; 178 struct btrfs_path *path;
180 u64 extent_item_size; 179 u64 extent_item_size;
181 char *scratch_buf;
182 char *msg_buf;
183 const char *errstr; 180 const char *errstr;
184 sector_t sector; 181 sector_t sector;
185 u64 logical; 182 u64 logical;
186 struct btrfs_device *dev; 183 struct btrfs_device *dev;
187 int msg_bufsize;
188 int scratch_bufsize;
189}; 184};
190 185
191
192static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 186static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
193static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 187static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
194static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 188static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
438 } 432 }
439 sctx->first_free = 0; 433 sctx->first_free = 0;
440 sctx->nodesize = dev->dev_root->nodesize; 434 sctx->nodesize = dev->dev_root->nodesize;
441 sctx->leafsize = dev->dev_root->leafsize;
442 sctx->sectorsize = dev->dev_root->sectorsize; 435 sctx->sectorsize = dev->dev_root->sectorsize;
443 atomic_set(&sctx->bios_in_flight, 0); 436 atomic_set(&sctx->bios_in_flight, 0);
444 atomic_set(&sctx->workers_pending, 0); 437 atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
553 u64 ref_root; 546 u64 ref_root;
554 u32 item_size; 547 u32 item_size;
555 u8 ref_level; 548 u8 ref_level;
556 const int bufsize = 4096;
557 int ret; 549 int ret;
558 550
559 WARN_ON(sblock->page_count < 1); 551 WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
561 fs_info = sblock->sctx->dev_root->fs_info; 553 fs_info = sblock->sctx->dev_root->fs_info;
562 554
563 path = btrfs_alloc_path(); 555 path = btrfs_alloc_path();
556 if (!path)
557 return;
564 558
565 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
566 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
567 swarn.sector = (sblock->pagev[0]->physical) >> 9; 559 swarn.sector = (sblock->pagev[0]->physical) >> 9;
568 swarn.logical = sblock->pagev[0]->logical; 560 swarn.logical = sblock->pagev[0]->logical;
569 swarn.errstr = errstr; 561 swarn.errstr = errstr;
570 swarn.dev = NULL; 562 swarn.dev = NULL;
571 swarn.msg_bufsize = bufsize;
572 swarn.scratch_bufsize = bufsize;
573
574 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
575 goto out;
576 563
577 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 564 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
578 &flags); 565 &flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
613 600
614out: 601out:
615 btrfs_free_path(path); 602 btrfs_free_path(path);
616 kfree(swarn.scratch_buf);
617 kfree(swarn.msg_buf);
618} 603}
619 604
620static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 605static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
681 ret = -EIO; 666 ret = -EIO;
682 goto out; 667 goto out;
683 } 668 }
684 fs_info = BTRFS_I(inode)->root->fs_info; 669 ret = repair_io_failure(inode, offset, PAGE_SIZE,
685 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
686 fixup->logical, page, 670 fixup->logical, page,
671 offset - page_offset(page),
687 fixup->mirror_num); 672 fixup->mirror_num);
688 unlock_page(page); 673 unlock_page(page);
689 corrected = !ret; 674 corrected = !ret;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1361 return; 1346 return;
1362} 1347}
1363 1348
1349static inline int scrub_check_fsid(u8 fsid[],
1350 struct scrub_page *spage)
1351{
1352 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1353 int ret;
1354
1355 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1356 return !ret;
1357}
1358
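scrub_check_fsid() compares a block's fsid against the owning device's fs_devices rather than the global fs_info->fsid, which is what lets blocks on seed devices validate against the seed filesystem's identity. Note the inverted sense versus memcmp(): it returns 1 on match, so call sites read as:

if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
        ++fail;         /* header doesn't belong to this device's fs */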
1364static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1359static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1365 struct scrub_block *sblock, 1360 struct scrub_block *sblock,
1366 int is_metadata, int have_csum, 1361 int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1380 h = (struct btrfs_header *)mapped_buffer; 1375 h = (struct btrfs_header *)mapped_buffer;
1381 1376
1382 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || 1377 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1383 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1378 !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1384 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1379 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1385 BTRFS_UUID_SIZE)) { 1380 BTRFS_UUID_SIZE)) {
1386 sblock->header_error = 1; 1381 sblock->header_error = 1;
@@ -1751,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1751 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) 1746 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1752 ++fail; 1747 ++fail;
1753 1748
1754 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1749 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1755 ++fail; 1750 ++fail;
1756 1751
1757 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1752 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1758 BTRFS_UUID_SIZE)) 1753 BTRFS_UUID_SIZE))
1759 ++fail; 1754 ++fail;
1760 1755
1761 WARN_ON(sctx->nodesize != sctx->leafsize);
1762 len = sctx->nodesize - BTRFS_CSUM_SIZE; 1756 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1763 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1757 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1764 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1758 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1791,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1791{ 1785{
1792 struct btrfs_super_block *s; 1786 struct btrfs_super_block *s;
1793 struct scrub_ctx *sctx = sblock->sctx; 1787 struct scrub_ctx *sctx = sblock->sctx;
1794 struct btrfs_root *root = sctx->dev_root;
1795 struct btrfs_fs_info *fs_info = root->fs_info;
1796 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1788 u8 calculated_csum[BTRFS_CSUM_SIZE];
1797 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1789 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1798 struct page *page; 1790 struct page *page;
@@ -1817,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1817 if (sblock->pagev[0]->generation != btrfs_super_generation(s)) 1809 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1818 ++fail_gen; 1810 ++fail_gen;
1819 1811
1820 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1812 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1821 ++fail_cor; 1813 ++fail_cor;
1822 1814
1823 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1815 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2196,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2196 sctx->stat.data_bytes_scrubbed += len; 2188 sctx->stat.data_bytes_scrubbed += len;
2197 spin_unlock(&sctx->stat_lock); 2189 spin_unlock(&sctx->stat_lock);
2198 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2190 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2199 WARN_ON(sctx->nodesize != sctx->leafsize);
2200 blocksize = sctx->nodesize; 2191 blocksize = sctx->nodesize;
2201 spin_lock(&sctx->stat_lock); 2192 spin_lock(&sctx->stat_lock);
2202 sctx->stat.tree_extents_scrubbed++; 2193 sctx->stat.tree_extents_scrubbed++;
@@ -2487,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2487 btrfs_item_key_to_cpu(l, &key, slot); 2478 btrfs_item_key_to_cpu(l, &key, slot);
2488 2479
2489 if (key.type == BTRFS_METADATA_ITEM_KEY) 2480 if (key.type == BTRFS_METADATA_ITEM_KEY)
2490 bytes = root->leafsize; 2481 bytes = root->nodesize;
2491 else 2482 else
2492 bytes = key.offset; 2483 bytes = key.offset;
2493 2484
@@ -2714,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2714 if (found_key.objectid != scrub_dev->devid) 2705 if (found_key.objectid != scrub_dev->devid)
2715 break; 2706 break;
2716 2707
2717 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2708 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2718 break; 2709 break;
2719 2710
2720 if (found_key.offset >= end) 2711 if (found_key.offset >= end)
@@ -2828,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2828 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 2819 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2829 return -EIO; 2820 return -EIO;
2830 2821
2831	 	gen = root->fs_info->last_trans_committed; 2822	 	/* Seed devices of a new filesystem have their own generation. */
2823 if (scrub_dev->fs_devices != root->fs_info->fs_devices)
2824 gen = scrub_dev->generation;
2825 else
2826 gen = root->fs_info->last_trans_committed;
2832 2827
2833 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2828 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2834 bytenr = btrfs_sb_offset(i); 2829 bytenr = btrfs_sb_offset(i);
2835 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2830 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2831 scrub_dev->commit_total_bytes)
2836 break; 2832 break;
2837 2833
2838 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2834 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2910,17 +2906,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2910 if (btrfs_fs_closing(fs_info)) 2906 if (btrfs_fs_closing(fs_info))
2911 return -EINVAL; 2907 return -EINVAL;
2912 2908
2913 /*
2914 * check some assumptions
2915 */
2916 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2917 btrfs_err(fs_info,
2918 "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2919 fs_info->chunk_root->nodesize,
2920 fs_info->chunk_root->leafsize);
2921 return -EINVAL;
2922 }
2923
2924 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2909 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2925 /* 2910 /*
2926 * in this case scrub is unable to calculate the checksum 2911 * in this case scrub is unable to calculate the checksum
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa662181..874828dd0a86 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
515 set_fs(KERNEL_DS); 515 set_fs(KERNEL_DS);
516 516
517 while (pos < len) { 517 while (pos < len) {
518 ret = vfs_write(filp, (char *)buf + pos, len - pos, off); 518 ret = vfs_write(filp, (__force const char __user *)buf + pos,
519 len - pos, off);
519 /* TODO handle that correctly */ 520 /* TODO handle that correctly */
520 /*if (ret == -ERESTARTSYS) { 521 /*if (ret == -ERESTARTSYS) {
521 continue; 522 continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
985 int num; 986 int num;
986 u8 type; 987 u8 type;
987 988
988 if (found_key->type == BTRFS_XATTR_ITEM_KEY) 989 /*
989 buf_len = BTRFS_MAX_XATTR_SIZE(root); 990 * Start with a small buffer (1 page). If later we end up needing more
990 else 991 * space, which can happen for xattrs on a fs with a leaf size greater
991	 		buf_len = PATH_MAX; 992	 * than the page size, attempt to increase the buffer. Typically xattr
992 993 * values are small.
994 */
995 buf_len = PATH_MAX;
993 buf = kmalloc(buf_len, GFP_NOFS); 996 buf = kmalloc(buf_len, GFP_NOFS);
994 if (!buf) { 997 if (!buf) {
995 ret = -ENOMEM; 998 ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1016 ret = -ENAMETOOLONG; 1019 ret = -ENAMETOOLONG;
1017 goto out; 1020 goto out;
1018 } 1021 }
1019 if (name_len + data_len > buf_len) { 1022 if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
1020 ret = -E2BIG; 1023 ret = -E2BIG;
1021 goto out; 1024 goto out;
1022 } 1025 }
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1024 /* 1027 /*
1025 * Path too long 1028 * Path too long
1026 */ 1029 */
1027 if (name_len + data_len > buf_len) { 1030 if (name_len + data_len > PATH_MAX) {
1028 ret = -ENAMETOOLONG; 1031 ret = -ENAMETOOLONG;
1029 goto out; 1032 goto out;
1030 } 1033 }
1031 } 1034 }
1032 1035
1036 if (name_len + data_len > buf_len) {
1037 buf_len = name_len + data_len;
1038 if (is_vmalloc_addr(buf)) {
1039 vfree(buf);
1040 buf = NULL;
1041 } else {
1042 char *tmp = krealloc(buf, buf_len,
1043 GFP_NOFS | __GFP_NOWARN);
1044
1045 if (!tmp)
1046 kfree(buf);
1047 buf = tmp;
1048 }
1049 if (!buf) {
1050 buf = vmalloc(buf_len);
1051 if (!buf) {
1052 ret = -ENOMEM;
1053 goto out;
1054 }
1055 }
1056 }
1057
1033 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1058 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1034 name_len + data_len); 1059 name_len + data_len);
1035 1060
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1050 } 1075 }
1051 1076
1052out: 1077out:
1053 kfree(buf); 1078 kvfree(buf);
1054 return ret; 1079 return ret;
1055} 1080}
1056 1081
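
The reallocation dance above is a stock kernel pattern: try krealloc() with __GFP_NOWARN, free the kmalloc buffer if that fails, then fall back to vmalloc() for the large allocation, and let kvfree() at the out label release whichever kind survived. A condensed in-kernel sketch of the same pattern; the helper name is made up, and as in the patch the old contents are not preserved, since the buffer is refilled right afterwards:

/* Illustrative only: assumes <linux/slab.h>, <linux/vmalloc.h>, <linux/mm.h>. */
static void *grow_buf(void *buf, size_t new_len)
{
	if (is_vmalloc_addr(buf)) {
		/* krealloc() cannot resize vmalloc memory */
		vfree(buf);
		buf = NULL;
	} else {
		void *tmp = krealloc(buf, new_len, GFP_NOFS | __GFP_NOWARN);

		if (!tmp)
			kfree(buf);	/* krealloc failure keeps the old buffer alive */
		buf = tmp;
	}
	if (!buf)		/* contiguous allocation failed; go virtual */
		buf = vmalloc(new_len);
	return buf;		/* NULL on failure; free with kvfree() */
}
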
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3302 if (ret < 0 && ret != -ENOENT) { 3327 if (ret < 0 && ret != -ENOENT) {
3303 goto out; 3328 goto out;
3304 } else if (ret == -ENOENT) { 3329 } else if (ret == -ENOENT) {
3305 ret = 1; 3330 ret = 0;
3306 break; 3331 break;
3307 } 3332 }
3308 3333
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5703 NULL); 5728 NULL);
5704 sort_clone_roots = 1; 5729 sort_clone_roots = 1;
5705 5730
5706 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; 5731 current->journal_info = BTRFS_SEND_TRANS_STUB;
5707 ret = send_subvol(sctx); 5732 ret = send_subvol(sctx);
5708 current->journal_info = NULL; 5733 current->journal_info = NULL;
5709 if (ret < 0) 5734 if (ret < 0)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c4124de4435b..a2b97ef10317 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
60#include "backref.h" 60#include "backref.h"
61#include "tests/btrfs-tests.h" 61#include "tests/btrfs-tests.h"
62 62
63#include "qgroup.h"
63#define CREATE_TRACE_POINTS 64#define CREATE_TRACE_POINTS
64#include <trace/events/btrfs.h> 65#include <trace/events/btrfs.h>
65 66
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
307 308
308static void btrfs_put_super(struct super_block *sb) 309static void btrfs_put_super(struct super_block *sb)
309{ 310{
310 (void)close_ctree(btrfs_sb(sb)->tree_root); 311 close_ctree(btrfs_sb(sb)->tree_root);
311 /* FIXME: need to fix VFS to return error? */
312 /* AV: return it _where_? ->put_super() can be triggered by any number
313 * of async events, up to and including delivery of SIGKILL to the
314 * last process that kept it busy. Or segfault in the aforementioned
315 * process... Whom would you report that to?
316 */
317} 312}
318 313
319enum { 314enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
400 int ret = 0; 395 int ret = 0;
401 char *compress_type; 396 char *compress_type;
402 bool compress_force = false; 397 bool compress_force = false;
403 bool compress = false;
404 398
405 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 399 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
406 if (cache_gen) 400 if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
478 /* Fallthrough */ 472 /* Fallthrough */
479 case Opt_compress: 473 case Opt_compress:
480 case Opt_compress_type: 474 case Opt_compress_type:
481 compress = true;
482 if (token == Opt_compress || 475 if (token == Opt_compress ||
483 token == Opt_compress_force || 476 token == Opt_compress_force ||
484 strcmp(args[0].from, "zlib") == 0) { 477 strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
508 btrfs_set_and_info(root, FORCE_COMPRESS, 501 btrfs_set_and_info(root, FORCE_COMPRESS,
509 "force %s compression", 502 "force %s compression",
510 compress_type); 503 compress_type);
511 } else if (compress) { 504 } else {
512 if (!btrfs_test_opt(root, COMPRESS)) 505 if (!btrfs_test_opt(root, COMPRESS))
513 btrfs_info(root->fs_info, 506 btrfs_info(root->fs_info,
514 "btrfs: use %s compression", 507 "btrfs: use %s compression",
515 compress_type); 508 compress_type);
509 /*
510 * If we remount from compress-force=xxx to
511 * compress=xxx, we need to clear the FORCE_COMPRESS
512 * flag; otherwise, there is no way for users
513 * to disable forcible compression separately.
514 */
515 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
516 } 516 }
517 break; 517 break;
518 case Opt_ssd: 518 case Opt_ssd:
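
The FORCE_COMPRESS fix works because mount options are independent bits in fs_info->mount_opt: compress-force sets both COMPRESS and FORCE_COMPRESS, so remounting with plain compress must clear the force bit or it silently survives. A toy model of those bit semantics (flag values are invented; the kernel uses btrfs_set_opt()/btrfs_clear_opt()):

#include <stdio.h>

#define OPT_COMPRESS		(1UL << 0)
#define OPT_FORCE_COMPRESS	(1UL << 1)

int main(void)
{
	unsigned long mount_opt = 0;

	/* mount -o compress-force=zlib */
	mount_opt |= OPT_COMPRESS | OPT_FORCE_COMPRESS;

	/* mount -o remount,compress=zlib */
	mount_opt |= OPT_COMPRESS;
	mount_opt &= ~OPT_FORCE_COMPRESS;	/* the patched line */

	printf("forced compression: %s\n",
	       (mount_opt & OPT_FORCE_COMPRESS) ? "on" : "off");
	return 0;
}
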
@@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1014 seq_puts(seq, ",nodatacow"); 1014 seq_puts(seq, ",nodatacow");
1015 if (btrfs_test_opt(root, NOBARRIER)) 1015 if (btrfs_test_opt(root, NOBARRIER))
1016 seq_puts(seq, ",nobarrier"); 1016 seq_puts(seq, ",nobarrier");
1017 if (info->max_inline != 8192 * 1024) 1017 if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
1018 seq_printf(seq, ",max_inline=%llu", info->max_inline); 1018 seq_printf(seq, ",max_inline=%llu", info->max_inline);
1019 if (info->alloc_start != 0) 1019 if (info->alloc_start != 0)
1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start); 1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1215,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1215 return root; 1215 return root;
1216} 1216}
1217 1217
1218static int parse_security_options(char *orig_opts,
1219 struct security_mnt_opts *sec_opts)
1220{
1221 char *secdata = NULL;
1222 int ret = 0;
1223
1224 secdata = alloc_secdata();
1225 if (!secdata)
1226 return -ENOMEM;
1227 ret = security_sb_copy_data(orig_opts, secdata);
1228 if (ret) {
1229 free_secdata(secdata);
1230 return ret;
1231 }
1232 ret = security_sb_parse_opts_str(secdata, sec_opts);
1233 free_secdata(secdata);
1234 return ret;
1235}
1236
1237static int setup_security_options(struct btrfs_fs_info *fs_info,
1238 struct super_block *sb,
1239 struct security_mnt_opts *sec_opts)
1240{
1241 int ret = 0;
1242
1243 /*
1244 * Call security_sb_set_mnt_opts() to check whether new sec_opts
1245 * is valid.
1246 */
1247 ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
1248 if (ret)
1249 return ret;
1250
1251#ifdef CONFIG_SECURITY
1252 if (!fs_info->security_opts.num_mnt_opts) {
1253 /* first time security setup, copy sec_opts to fs_info */
1254 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
1255 } else {
1256 /*
1257 * Since SELinux (the only LSM that supports security_mnt_opts)
1258 * does NOT support changing the context when remounting or
1259 * mounting the same sb, this must be the same security options
1260 * (or a subset of them), so just free it.
1261 */
1262 security_free_mnt_opts(sec_opts);
1263 }
1264#endif
1265 return ret;
1266}
1267
1218/* 1268/*
1219 * Find a superblock for the given device / mount point. 1269 * Find a superblock for the given device / mount point.
1220 * 1270 *
@@ -1229,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1229 struct dentry *root; 1279 struct dentry *root;
1230 struct btrfs_fs_devices *fs_devices = NULL; 1280 struct btrfs_fs_devices *fs_devices = NULL;
1231 struct btrfs_fs_info *fs_info = NULL; 1281 struct btrfs_fs_info *fs_info = NULL;
1282 struct security_mnt_opts new_sec_opts;
1232 fmode_t mode = FMODE_READ; 1283 fmode_t mode = FMODE_READ;
1233 char *subvol_name = NULL; 1284 char *subvol_name = NULL;
1234 u64 subvol_objectid = 0; 1285 u64 subvol_objectid = 0;
@@ -1251,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1251 return root; 1302 return root;
1252 } 1303 }
1253 1304
1305 security_init_mnt_opts(&new_sec_opts);
1306 if (data) {
1307 error = parse_security_options(data, &new_sec_opts);
1308 if (error)
1309 return ERR_PTR(error);
1310 }
1311
1254 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 1312 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
1255 if (error) 1313 if (error)
1256 return ERR_PTR(error); 1314 goto error_sec_opts;
1257 1315
1258 /* 1316 /*
1259 * Setup a dummy root and fs_info for test/set super. This is because 1317 * Setup a dummy root and fs_info for test/set super. This is because
@@ -1262,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1262 * then open_ctree will properly initialize everything later. 1320 * then open_ctree will properly initialize everything later.
1263 */ 1321 */
1264 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 1322 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
1265 if (!fs_info) 1323 if (!fs_info) {
1266 return ERR_PTR(-ENOMEM); 1324 error = -ENOMEM;
1325 goto error_sec_opts;
1326 }
1267 1327
1268 fs_info->fs_devices = fs_devices; 1328 fs_info->fs_devices = fs_devices;
1269 1329
1270 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1330 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1271 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1331 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1332 security_init_mnt_opts(&fs_info->security_opts);
1272 if (!fs_info->super_copy || !fs_info->super_for_commit) { 1333 if (!fs_info->super_copy || !fs_info->super_for_commit) {
1273 error = -ENOMEM; 1334 error = -ENOMEM;
1274 goto error_fs_info; 1335 goto error_fs_info;
@@ -1306,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1306 } 1367 }
1307 1368
1308 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); 1369 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
1309 if (IS_ERR(root)) 1370 if (IS_ERR(root)) {
1371 deactivate_locked_super(s);
1372 error = PTR_ERR(root);
1373 goto error_sec_opts;
1374 }
1375
1376 fs_info = btrfs_sb(s);
1377 error = setup_security_options(fs_info, s, &new_sec_opts);
1378 if (error) {
1379 dput(root);
1310 deactivate_locked_super(s); 1380 deactivate_locked_super(s);
1381 goto error_sec_opts;
1382 }
1311 1383
1312 return root; 1384 return root;
1313 1385
@@ -1315,6 +1387,8 @@ error_close_devices:

1315 btrfs_close_devices(fs_devices); 1387 btrfs_close_devices(fs_devices);
1316error_fs_info: 1388error_fs_info:
1317 free_fs_info(fs_info); 1389 free_fs_info(fs_info);
1390error_sec_opts:
1391 security_free_mnt_opts(&new_sec_opts);
1318 return ERR_PTR(error); 1392 return ERR_PTR(error);
1319} 1393}
1320 1394
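
All of the mount error paths above funnel failures through pointer returns, relying on the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() convention: a single pointer carries either a valid object or a negated errno squeezed into the top page of the address space. A simplified userspace re-implementation, just to show the encoding (the real macros live in <linux/err.h>):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
	/* errors occupy the last page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *get_default_root(int fail)
{
	static int dentry;	/* stand-in object */

	return fail ? ERR_PTR(-ENOMEM) : &dentry;
}

int main(void)
{
	void *root = get_default_root(1);

	if (IS_ERR(root))
		printf("mount failed: %ld\n", PTR_ERR(root));
	return 0;
}
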
@@ -1396,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1396 sync_filesystem(sb); 1470 sync_filesystem(sb);
1397 btrfs_remount_prepare(fs_info); 1471 btrfs_remount_prepare(fs_info);
1398 1472
1473 if (data) {
1474 struct security_mnt_opts new_sec_opts;
1475
1476 security_init_mnt_opts(&new_sec_opts);
1477 ret = parse_security_options(data, &new_sec_opts);
1478 if (ret)
1479 goto restore;
1480 ret = setup_security_options(fs_info, sb,
1481 &new_sec_opts);
1482 if (ret) {
1483 security_free_mnt_opts(&new_sec_opts);
1484 goto restore;
1485 }
1486 }
1487
1399 ret = btrfs_parse_options(root, data); 1488 ret = btrfs_parse_options(root, data);
1400 if (ret) { 1489 if (ret) {
1401 ret = -EINVAL; 1490 ret = -EINVAL;
@@ -1694,7 +1783,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1694 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 1783 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1695 int ret; 1784 int ret;
1696 1785
1697 /* holding chunk_muext to avoid allocating new chunks */ 1786 /*
1787 * holding chunk_mutex to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed
1789 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1698 mutex_lock(&fs_info->chunk_mutex); 1791 mutex_lock(&fs_info->chunk_mutex);
1699 rcu_read_lock(); 1792 rcu_read_lock();
1700 list_for_each_entry_rcu(found, head, list) { 1793 list_for_each_entry_rcu(found, head, list) {
@@ -1735,11 +1828,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1735 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1736 if (ret) { 1829 if (ret) {
1737 mutex_unlock(&fs_info->chunk_mutex); 1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1738 return ret; 1832 return ret;
1739 } 1833 }
1740 buf->f_bavail += div_u64(total_free_data, factor); 1834 buf->f_bavail += div_u64(total_free_data, factor);
1741 buf->f_bavail = buf->f_bavail >> bits; 1835 buf->f_bavail = buf->f_bavail >> bits;
1742 mutex_unlock(&fs_info->chunk_mutex); 1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1743 1838
1744 buf->f_type = BTRFS_SUPER_MAGIC; 1839 buf->f_type = BTRFS_SUPER_MAGIC;
1745 buf->f_bsize = dentry->d_sb->s_blocksize; 1840 buf->f_bsize = dentry->d_sb->s_blocksize;
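
Note the discipline the statfs hunks enforce: device_list_mutex is taken before chunk_mutex and dropped after it, on the success path and on the early-return error path alike. A minimal pthread rendering of that ordering (the calculation body is a placeholder):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t chunk_mutex = PTHREAD_MUTEX_INITIALIZER;

static int calc_avail_data_space(void) { return 0; /* placeholder */ }

static int statfs_like(void)
{
	int ret;

	pthread_mutex_lock(&device_list_mutex);	/* devices can't vanish */
	pthread_mutex_lock(&chunk_mutex);	/* no new chunk allocation */

	ret = calc_avail_data_space();

	/* every exit releases in reverse acquisition order */
	pthread_mutex_unlock(&chunk_mutex);
	pthread_mutex_unlock(&device_list_mutex);
	return ret;
}

int main(void)
{
	printf("statfs: %d\n", statfs_like());
	return 0;
}
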
@@ -1769,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
1769 .name = "btrfs", 1864 .name = "btrfs",
1770 .mount = btrfs_mount, 1865 .mount = btrfs_mount,
1771 .kill_sb = btrfs_kill_super, 1866 .kill_sb = btrfs_kill_super,
1772 .fs_flags = FS_REQUIRES_DEV, 1867 .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
1773}; 1868};
1774MODULE_ALIAS_FS("btrfs"); 1869MODULE_ALIAS_FS("btrfs");
1775 1870
@@ -1993,11 +2088,15 @@ static int __init init_btrfs_fs(void)
1993 2088
1994 err = btrfs_prelim_ref_init(); 2089 err = btrfs_prelim_ref_init();
1995 if (err) 2090 if (err)
2091 goto free_delayed_ref;
2092
2093 err = btrfs_end_io_wq_init();
2094 if (err)
1996 goto free_prelim_ref; 2095 goto free_prelim_ref;
1997 2096
1998 err = btrfs_interface_init(); 2097 err = btrfs_interface_init();
1999 if (err) 2098 if (err)
2000 goto free_delayed_ref; 2099 goto free_end_io_wq;
2001 2100
2002 btrfs_init_lockdep(); 2101 btrfs_init_lockdep();
2003 2102
@@ -2015,6 +2114,8 @@ static int __init init_btrfs_fs(void)
2015 2114
2016unregister_ioctl: 2115unregister_ioctl:
2017 btrfs_interface_exit(); 2116 btrfs_interface_exit();
2117free_end_io_wq:
2118 btrfs_end_io_wq_exit();
2018free_prelim_ref: 2119free_prelim_ref:
2019 btrfs_prelim_ref_exit(); 2120 btrfs_prelim_ref_exit();
2020free_delayed_ref: 2121free_delayed_ref:
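
The init_btrfs_fs() hunks show why the goto ladder scales: each new subsystem contributes one init call, one failure target, and one teardown line, and the labels run in exact reverse order of initialization. A compact model of the idiom (subsystem names are invented):

#include <stdio.h>

static int  init_cachep(void)    { return 0; }
static int  init_prelim(void)    { return 0; }
static int  init_end_io_wq(void) { return -1; /* simulate failure */ }
static void exit_prelim(void)    { puts("exit_prelim"); }
static void exit_cachep(void)    { puts("exit_cachep"); }

static int init_all(void)
{
	int err;

	err = init_cachep();
	if (err)
		goto out;
	err = init_prelim();
	if (err)
		goto free_cachep;
	err = init_end_io_wq();
	if (err)
		goto free_prelim;	/* the new step slots in like this */
	return 0;

free_prelim:
	exit_prelim();
free_cachep:
	exit_cachep();
out:
	return err;
}

int main(void)
{
	return init_all() ? 1 : 0;
}
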
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 12e53556e214..b2e7bb4393f6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); 243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
244} 244}
245BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); 245BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
246 246
247static ssize_t global_rsv_reserved_show(struct kobject *kobj, 247static ssize_t global_rsv_reserved_show(struct kobject *kobj,
248 struct kobj_attribute *a, char *buf) 248 struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); 252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
253} 253}
254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); 254BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
255 255
256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) 257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
306 struct btrfs_space_info *sinfo = to_space_info(kobj); \ 306 struct btrfs_space_info *sinfo = to_space_info(kobj); \
307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ 307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
308} \ 308} \
309BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) 309BTRFS_ATTR(field, btrfs_space_info_show_##field)
310 310
311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
312 struct kobj_attribute *a, 312 struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
325SPACE_INFO_ATTR(bytes_may_use); 325SPACE_INFO_ATTR(bytes_may_use);
326SPACE_INFO_ATTR(disk_used); 326SPACE_INFO_ATTR(disk_used);
327SPACE_INFO_ATTR(disk_total); 327SPACE_INFO_ATTR(disk_total);
328BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); 328BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
329 329
330static struct attribute *space_info_attrs[] = { 330static struct attribute *space_info_attrs[] = {
331 BTRFS_ATTR_PTR(flags), 331 BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
363 struct kobj_attribute *a, char *buf) 363 struct kobj_attribute *a, char *buf)
364{ 364{
365 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 365 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
366 return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); 366 char *label = fs_info->super_copy->label;
367 return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
367} 368}
368 369
369static ssize_t btrfs_label_store(struct kobject *kobj, 370static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_trans_handle *trans; 375 struct btrfs_trans_handle *trans;
375 struct btrfs_root *root = fs_info->fs_root; 376 struct btrfs_root *root = fs_info->fs_root;
376 int ret; 377 int ret;
378 size_t p_len;
377 379
378 if (len >= BTRFS_LABEL_SIZE) 380 if (fs_info->sb->s_flags & MS_RDONLY)
381 return -EROFS;
382
383 /*
384 * p_len is the length up to the first occurrence of either
385 * '\n' or '\0'
386 */
387 p_len = strcspn(buf, "\n");
388
389 if (p_len >= BTRFS_LABEL_SIZE)
379 return -EINVAL; 390 return -EINVAL;
380 391
381 trans = btrfs_start_transaction(root, 0); 392 trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
383 return PTR_ERR(trans); 394 return PTR_ERR(trans);
384 395
385 spin_lock(&root->fs_info->super_lock); 396 spin_lock(&root->fs_info->super_lock);
386 strcpy(fs_info->super_copy->label, buf); 397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len);
387 spin_unlock(&root->fs_info->super_lock); 399 spin_unlock(&root->fs_info->super_lock);
388 ret = btrfs_commit_transaction(trans, root); 400 ret = btrfs_commit_transaction(trans, root);
389 401
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
392 404
393 return ret; 405 return ret;
394} 406}
395BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); 407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
396
397static ssize_t btrfs_no_store(struct kobject *kobj,
398 struct kobj_attribute *a,
399 const char *buf, size_t len)
400{
401 return -EPERM;
402}
403 408
404static ssize_t btrfs_nodesize_show(struct kobject *kobj, 409static ssize_t btrfs_nodesize_show(struct kobject *kobj,
405 struct kobj_attribute *a, char *buf) 410 struct kobj_attribute *a, char *buf)
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
409 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); 414 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
410} 415}
411 416
412BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); 417BTRFS_ATTR(nodesize, btrfs_nodesize_show);
413 418
414static ssize_t btrfs_sectorsize_show(struct kobject *kobj, 419static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
415 struct kobj_attribute *a, char *buf) 420 struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
419 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 424 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
420} 425}
421 426
422BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); 427BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
423 428
424static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, 429static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
425 struct kobj_attribute *a, char *buf) 430 struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
429 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 434 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
430} 435}
431 436
432BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); 437BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
433 438
434static struct attribute *btrfs_attrs[] = { 439static struct attribute *btrfs_attrs[] = {
435 BTRFS_ATTR_PTR(label), 440 BTRFS_ATTR_PTR(label),
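
The label store now accepts echo-style input: strcspn() measures up to the first newline, the bound check leaves room for the NUL, and the memset-before-memcpy pair guarantees a shorter new label fully overwrites a longer old one. A userspace sketch of exactly that sanitization (the 256-byte size matches BTRFS_LABEL_SIZE; the function name is illustrative):

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define BTRFS_LABEL_SIZE 256

static char label[BTRFS_LABEL_SIZE];

static int store_label(const char *buf)
{
	/* length up to the first '\n' or '\0', like the patch */
	size_t p_len = strcspn(buf, "\n");

	if (p_len >= BTRFS_LABEL_SIZE)
		return -EINVAL;	/* must leave room for the NUL terminator */

	memset(label, 0, BTRFS_LABEL_SIZE);	/* clear any old, longer label */
	memcpy(label, buf, p_len);
	return 0;
}

int main(void)
{
	store_label("data\n");		/* echo appends the newline */
	printf("label=\"%s\"\n", label);
	return 0;
}
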
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df37504c..f7dd298b3cf6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
20 .store = _store, \ 20 .store = _store, \
21} 21}
22 22
23#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ 23#define BTRFS_ATTR_RW(_name, _show, _store) \
24static struct kobj_attribute btrfs_attr_##_name = \ 24 static struct kobj_attribute btrfs_attr_##_name = \
25 __INIT_KOBJ_ATTR(_name, _mode, _show, _store) 25 __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
26#define BTRFS_ATTR(_name, _mode, _show) \ 26
27 BTRFS_ATTR_RW(_name, _mode, _show, NULL) 27#define BTRFS_ATTR(_name, _show) \
28 static struct kobj_attribute btrfs_attr_##_name = \
29 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
30
28#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) 31#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
29 32
30#define BTRFS_RAID_ATTR(_name, _show) \ 33#define BTRFS_RAID_ATTR(_name, _show) \
31static struct kobj_attribute btrfs_raid_attr_##_name = \ 34 static struct kobj_attribute btrfs_raid_attr_##_name = \
32 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 35 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
36
33#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) 37#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
34 38
35 39
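
With the mode baked into the macro, a sysfs attribute's permissions can no longer disagree with whether it has a store callback: BTRFS_ATTR() is always 0444 and BTRFS_ATTR_RW() always 0644. A toy version of the mode-embedding trick (struct and macro names are simplified stand-ins):

#include <stdio.h>

struct attr { const char *name; unsigned int mode; };

/* simplified stand-ins for BTRFS_ATTR()/BTRFS_ATTR_RW(): mode is baked in */
#define ATTR_RO(_name)	static struct attr attr_##_name = { #_name, 0444 }
#define ATTR_RW(_name)	static struct attr attr_##_name = { #_name, 0644 }

ATTR_RO(nodesize);
ATTR_RW(label);

int main(void)
{
	printf("%s: %o\n", attr_nodesize.name, attr_nodesize.mode);
	printf("%s: %o\n", attr_label.name, attr_label.mode);
	return 0;
}
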
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf84c69..2299bfde39ee 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
40 cache->key.offset = 1024 * 1024 * 1024; 40 cache->key.offset = 1024 * 1024 * 1024;
41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
42 cache->sectorsize = 4096; 42 cache->sectorsize = 4096;
43 cache->full_stripe_len = 4096;
43 44
44 spin_lock_init(&cache->lock); 45 spin_lock_init(&cache->lock);
45 INIT_LIST_HEAD(&cache->list); 46 INIT_LIST_HEAD(&cache->list);
46 INIT_LIST_HEAD(&cache->cluster_list); 47 INIT_LIST_HEAD(&cache->cluster_list);
47 INIT_LIST_HEAD(&cache->new_bg_list); 48 INIT_LIST_HEAD(&cache->bg_list);
48 49
49 btrfs_init_free_space_ctl(cache); 50 btrfs_init_free_space_ctl(cache);
50 51
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
364 return 0; 365 return 0;
365} 366}
366 367
368/* Used by test_steal_space_from_bitmap_to_extent(). */
369static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
370 struct btrfs_free_space *info)
371{
372 return ctl->free_extents > 0;
373}
374
375/* Used by test_steal_space_from_bitmap_to_extent(). */
376static int
377check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
378 const int num_extents,
379 const int num_bitmaps)
380{
381 if (cache->free_space_ctl->free_extents != num_extents) {
382 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
383 cache->free_space_ctl->free_extents, num_extents);
384 return -EINVAL;
385 }
386 if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
387 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
388 cache->free_space_ctl->total_bitmaps, num_bitmaps);
389 return -EINVAL;
390 }
391 return 0;
392}
393
394/* Used by test_steal_space_from_bitmap_to_extent(). */
395static int check_cache_empty(struct btrfs_block_group_cache *cache)
396{
397 u64 offset;
398 u64 max_extent_size;
399
400 /*
401 * Now let's confirm that there's absolutely no free space left to
402 * allocate.
403 */
404 if (cache->free_space_ctl->free_space != 0) {
405 test_msg("Cache free space is not 0\n");
406 return -EINVAL;
407 }
408
409 /* And any allocation request, no matter how small, should fail now. */
410 offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
411 &max_extent_size);
412 if (offset != 0) {
413 test_msg("Space allocation did not fail, returned offset: %llu",
414 offset);
415 return -EINVAL;
416 }
417
418 /* And no extent nor bitmap entries in the cache anymore. */
419 return check_num_extents_and_bitmaps(cache, 0, 0);
420}
421
422/*
423 * Before we were able to steal free space from a bitmap entry to an extent
424 * entry, we could end up with 2 entries representing a contiguous free space range.
425 * One would be an extent entry and the other a bitmap entry. Since in order
426 * to allocate space to a caller we use only 1 entry, we couldn't return that
427 * whole range to the caller if it was requested. This forced the caller to
428 * either assume ENOSPC or perform several smaller space allocations, which
429 * wasn't optimal as they could be spread all over the block group while under
430 * concurrency (extra overhead and fragmentation).
431 *
432 * This stealing approach is benefical, since we always prefer to allocate from
433 * extent entries, both for clustered and non-clustered allocation requests.
434 */
435static int
436test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
437{
438 int ret;
439 u64 offset;
440 u64 max_extent_size;
441
442 bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
443 struct btrfs_free_space *);
444
445 test_msg("Running space stealing from bitmap to extent\n");
446
447 /*
448 * For this test, we want to ensure we end up with an extent entry
449 * immediately adjacent to a bitmap entry, where the bitmap starts
450 * at an offset where the extent entry ends. We keep adding and
451 * removing free space to reach this state, but to get there
452 * we need to reach a point where marking new free space doesn't
453 * result in adding new extent entries or merging the new space
454 * with existing extent entries - the space ends up being marked
455 * in an existing bitmap that covers the new free space range.
456 *
457 * To get there, we need to reach the threshold set at
458 * cache->free_space_ctl->extents_thresh, which currently is
459 * 256 extents on an x86_64 system at least, and a few other
460 * conditions (check free_space_cache.c). Instead of making the
461 * test much longer and more complicated, use a "use_bitmap" operation
462 * that forces use of bitmaps as soon as we have at least 1
463 * extent entry.
464 */
465 use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
466 cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
467
468 /*
469 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
470 */
471 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
472 128 * 1024, 0);
473 if (ret) {
474 test_msg("Couldn't add extent entry %d\n", ret);
475 return ret;
476 }
477
478 /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
479 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
480 128 * 1024 * 1024 - 512 * 1024, 1);
481 if (ret) {
482 test_msg("Couldn't add bitmap entry %d\n", ret);
483 return ret;
484 }
485
486 ret = check_num_extents_and_bitmaps(cache, 2, 1);
487 if (ret)
488 return ret;
489
490 /*
491 * Now make only the first 256Kb of the bitmap marked as free, so that
492 * we end up with only the following ranges marked as free space:
493 *
494 * [128Mb - 256Kb, 128Mb - 128Kb[
495 * [128Mb + 512Kb, 128Mb + 768Kb[
496 */
497 ret = btrfs_remove_free_space(cache,
498 128 * 1024 * 1024 + 768 * 1024,
499 128 * 1024 * 1024 - 768 * 1024);
500 if (ret) {
501 test_msg("Failed to free part of bitmap space %d\n", ret);
502 return ret;
503 }
504
505 /* Confirm that only those 2 ranges are marked as free. */
506 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
507 128 * 1024)) {
508 test_msg("Free space range missing\n");
509 return -ENOENT;
510 }
511 if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
512 256 * 1024)) {
513 test_msg("Free space range missing\n");
514 return -ENOENT;
515 }
516
517 /*
518 * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
519 * as free anymore.
520 */
521 if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
522 128 * 1024 * 1024 - 768 * 1024)) {
523 test_msg("Bitmap region not removed from space cache\n");
524 return -EINVAL;
525 }
526
527 /*
528 * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
529 * covered by the bitmap, isn't marked as free.
530 */
531 if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
532 256 * 1024)) {
533 test_msg("Invalid bitmap region marked as free\n");
534 return -EINVAL;
535 }
536
537 /*
538 * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
539 * by the bitmap too, isn't marked as free either.
540 */
541 if (test_check_exists(cache, 128 * 1024 * 1024,
542 256 * 1024)) {
543 test_msg("Invalid bitmap region marked as free\n");
544 return -EINVAL;
545 }
546
547 /*
548 * Now let's mark the region [128Mb, 128Mb + 512Kb[ as free too. But
549 * let's make sure the free space cache marks it as free in the bitmap,
550 * and doesn't insert a new extent entry to represent this region.
551 */
552 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
553 if (ret) {
554 test_msg("Error adding free space: %d\n", ret);
555 return ret;
556 }
557 /* Confirm the region is marked as free. */
558 if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
559 test_msg("Bitmap region not marked as free\n");
560 return -ENOENT;
561 }
562
563 /*
564 * Confirm that no new extent entries or bitmap entries were added to
565 * the cache after adding that free space region.
566 */
567 ret = check_num_extents_and_bitmaps(cache, 2, 1);
568 if (ret)
569 return ret;
570
571 /*
572 * Now let's add a small free space region to the right of the previous
573 * one, which is not contiguous with it and is part of the bitmap too.
574 * The goal is to test that the bitmap entry space stealing doesn't
575 * steal this space region.
576 */
577 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
578 4096);
579 if (ret) {
580 test_msg("Error adding free space: %d\n", ret);
581 return ret;
582 }
583
584 /*
585 * Confirm that no new extent entries or bitmap entries were added to
586 * the cache after adding that free space region.
587 */
588 ret = check_num_extents_and_bitmaps(cache, 2, 1);
589 if (ret)
590 return ret;
591
592 /*
593 * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
594 * expand the range covered by the existing extent entry that represents
595 * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
596 */
597 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
598 128 * 1024);
599 if (ret) {
600 test_msg("Error adding free space: %d\n", ret);
601 return ret;
602 }
603 /* Confirm the region is marked as free. */
604 if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
605 128 * 1024)) {
606 test_msg("Extent region not marked as free\n");
607 return -ENOENT;
608 }
609
610 /*
611 * Confirm that our extent entry didn't stole all free space from the
612 * bitmap, because of the small 4Kb free space region.
613 */
614 ret = check_num_extents_and_bitmaps(cache, 2, 1);
615 if (ret)
616 return ret;
617
618 /*
619 * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
620 * space. Without stealing bitmap free space into extent entry space,
621 * we would have all this free space represented by 2 entries in the
622 * cache:
623 *
624 * extent entry covering range: [128Mb - 256Kb, 128Mb[
625 * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
626 *
627 * Attempting to allocate the whole free space (1Mb) would fail, because
628 * we can't allocate from multiple entries.
629 * With the bitmap free space stealing, we get a single extent entry
630 * that represents the 1Mb free space, and therefore we're able to
631 * allocate the whole free space at once.
632 */
633 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
634 1 * 1024 * 1024)) {
635 test_msg("Expected region not marked as free\n");
636 return -ENOENT;
637 }
638
639 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
640 test_msg("Cache free space is not 1Mb + 4Kb\n");
641 return -EINVAL;
642 }
643
644 offset = btrfs_find_space_for_alloc(cache,
645 0, 1 * 1024 * 1024, 0,
646 &max_extent_size);
647 if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
648 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
649 offset);
650 return -EINVAL;
651 }
652
653 /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
654 ret = check_num_extents_and_bitmaps(cache, 1, 1);
655 if (ret)
656 return ret;
657
658 if (cache->free_space_ctl->free_space != 4096) {
659 test_msg("Cache free space is not 4Kb\n");
660 return -EINVAL;
661 }
662
663 offset = btrfs_find_space_for_alloc(cache,
664 0, 4096, 0,
665 &max_extent_size);
666 if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
667 test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
668 offset);
669 return -EINVAL;
670 }
671
672 ret = check_cache_empty(cache);
673 if (ret)
674 return ret;
675
676 __btrfs_remove_free_space_cache(cache->free_space_ctl);
677
678 /*
679 * Now test a similar scenario, but where our extent entry is located
680 * to the right of the bitmap entry, so that we can check that stealing
681 * space from a bitmap to the front of an extent entry works.
682 */
683
684 /*
685 * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
686 */
687 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
688 128 * 1024, 0);
689 if (ret) {
690 test_msg("Couldn't add extent entry %d\n", ret);
691 return ret;
692 }
693
694 /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
695 ret = test_add_free_space_entry(cache, 0,
696 128 * 1024 * 1024 - 512 * 1024, 1);
697 if (ret) {
698 test_msg("Couldn't add bitmap entry %d\n", ret);
699 return ret;
700 }
701
702 ret = check_num_extents_and_bitmaps(cache, 2, 1);
703 if (ret)
704 return ret;
705
706 /*
707 * Now make only the last 256Kb of the bitmap marked as free, so that
708 * we end up with only the following ranges marked as free space:
709 *
710 * [128Mb + 128Kb, 128Mb + 256Kb[
711 * [128Mb - 768Kb, 128Mb - 512Kb[
712 */
713 ret = btrfs_remove_free_space(cache,
714 0,
715 128 * 1024 * 1024 - 768 * 1024);
716 if (ret) {
717 test_msg("Failed to free part of bitmap space %d\n", ret);
718 return ret;
719 }
720
721 /* Confirm that only those 2 ranges are marked as free. */
722 if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
723 128 * 1024)) {
724 test_msg("Free space range missing\n");
725 return -ENOENT;
726 }
727 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
728 256 * 1024)) {
729 test_msg("Free space range missing\n");
730 return -ENOENT;
731 }
732
733 /*
734 * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
735 * as free anymore.
736 */
737 if (test_check_exists(cache, 0,
738 128 * 1024 * 1024 - 768 * 1024)) {
739 test_msg("Bitmap region not removed from space cache\n");
740 return -EINVAL;
741 }
742
743 /*
744 * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
745 * covered by the bitmap, isn't marked as free.
746 */
747 if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
748 512 * 1024)) {
749 test_msg("Invalid bitmap region marked as free\n");
750 return -EINVAL;
751 }
752
753 /*
754 * Now let's mark the region [128Mb - 512Kb, 128Mb[ as free too. But
755 * let's make sure the free space cache marks it as free in the bitmap,
756 * and doesn't insert a new extent entry to represent this region.
757 */
758 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
759 512 * 1024);
760 if (ret) {
761 test_msg("Error adding free space: %d\n", ret);
762 return ret;
763 }
764 /* Confirm the region is marked as free. */
765 if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
766 512 * 1024)) {
767 test_msg("Bitmap region not marked as free\n");
768 return -ENOENT;
769 }
770
771 /*
772 * Confirm that no new extent entries or bitmap entries were added to
773 * the cache after adding that free space region.
774 */
775 ret = check_num_extents_and_bitmaps(cache, 2, 1);
776 if (ret)
777 return ret;
778
779 /*
780 * Now let's add a small free space region to the left of the previous
781 * one, which is not contiguous with it and is part of the bitmap too.
782 * The goal is to test that the bitmap entry space stealing doesn't
783 * steal this space region.
784 */
785 ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
786 if (ret) {
787 test_msg("Error adding free space: %d\n", ret);
788 return ret;
789 }
790
791 /*
792 * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
793 * expand the range covered by the existing extent entry that represents
794 * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
795 */
796 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
797 if (ret) {
798 test_msg("Error adding free space: %d\n", ret);
799 return ret;
800 }
801 /* Confirm the region is marked as free. */
802 if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
803 test_msg("Extent region not marked as free\n");
804 return -ENOENT;
805 }
806
807 /*
808 * Confirm that our extent entry didn't steal all free space from the
809 * bitmap, because of the small 8Kb free space region.
810 */
811 ret = check_num_extents_and_bitmaps(cache, 2, 1);
812 if (ret)
813 return ret;
814
815 /*
816 * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
817 * space. Without stealing bitmap free space into extent entry space,
818 * we would have all this free space represented by 2 entries in the
819 * cache:
820 *
821 * extent entry covering range: [128Mb, 128Mb + 256Kb[
822 * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
823 *
824 * Attempting to allocate the whole free space (1Mb) would fail, because
825 * we can't allocate from multiple entries.
826 * With the bitmap free space stealing, we get a single extent entry
827 * that represents the 1Mb free space, and therefore we're able to
828 * allocate the whole free space at once.
829 */
830 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
831 1 * 1024 * 1024)) {
832 test_msg("Expected region not marked as free\n");
833 return -ENOENT;
834 }
835
836 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
837 test_msg("Cache free space is not 1Mb + 8Kb\n");
838 return -EINVAL;
839 }
840
841 offset = btrfs_find_space_for_alloc(cache,
842 0, 1 * 1024 * 1024, 0,
843 &max_extent_size);
844 if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
845 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
846 offset);
847 return -EINVAL;
848 }
849
850 /* All that remains is an 8Kb free space region in a bitmap. Confirm. */
851 ret = check_num_extents_and_bitmaps(cache, 1, 1);
852 if (ret)
853 return ret;
854
855 if (cache->free_space_ctl->free_space != 8192) {
856 test_msg("Cache free space is not 8Kb\n");
857 return -EINVAL;
858 }
859
860 offset = btrfs_find_space_for_alloc(cache,
861 0, 8192, 0,
862 &max_extent_size);
863 if (offset != (32 * 1024 * 1024)) {
864 test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
865 offset);
866 return -EINVAL;
867 }
868
869 ret = check_cache_empty(cache);
870 if (ret)
871 return ret;
872
873 cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
874 __btrfs_remove_free_space_cache(cache->free_space_ctl);
875
876 return 0;
877}
878
367int btrfs_test_free_space_cache(void) 879int btrfs_test_free_space_cache(void)
368{ 880{
369 struct btrfs_block_group_cache *cache; 881 struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
386 ret = test_bitmaps_and_extents(cache); 898 ret = test_bitmaps_and_extents(cache);
387 if (ret) 899 if (ret)
388 goto out; 900 goto out;
901
902 ret = test_steal_space_from_bitmap_to_extent(cache);
389out: 903out:
390 __btrfs_remove_free_space_cache(cache->free_space_ctl); 904 __btrfs_remove_free_space_cache(cache->free_space_ctl);
391 kfree(cache->free_space_ctl); 905 kfree(cache->free_space_ctl);
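
test_steal_space_from_bitmap_to_extent() gets into bitmap-heavy state quickly by swapping the ctl's use_bitmap op for a stub and restoring the saved pointer at the end, which is dependency injection through the existing ops struct rather than a new test hook. A condensed sketch of that pattern (types and thresholds are invented):

#include <stdio.h>
#include <stdbool.h>

struct ctl;
struct ops { bool (*use_bitmap)(struct ctl *); };
struct ctl { int free_extents; struct ops *op; };

static bool real_use_bitmap(struct ctl *c) { return c->free_extents > 256; }
static bool test_use_bitmap(struct ctl *c) { return c->free_extents > 0; }

static struct ops default_ops = { .use_bitmap = real_use_bitmap };

int main(void)
{
	struct ctl c = { .free_extents = 1, .op = &default_ops };
	bool (*saved)(struct ctl *) = c.op->use_bitmap;

	c.op->use_bitmap = test_use_bitmap;	/* force bitmaps immediately */
	printf("use bitmap? %d\n", c.op->use_bitmap(&c));
	c.op->use_bitmap = saved;		/* restore, as the test does */
	return 0;
}
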
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d89c6d3542ca..dcaae3616728 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
386 int ret; 386 int ret;
387 387
388 /* Send isn't supposed to start transactions. */ 388 /* Send isn't supposed to start transactions. */
389 ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); 389 ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
390 390
391 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 391 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
392 return ERR_PTR(-EROFS); 392 return ERR_PTR(-EROFS);
@@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
408 if (num_items > 0 && root != root->fs_info->chunk_root) { 408 if (num_items > 0 && root != root->fs_info->chunk_root) {
409 if (root->fs_info->quota_enabled && 409 if (root->fs_info->quota_enabled &&
410 is_fstree(root->root_key.objectid)) { 410 is_fstree(root->root_key.objectid)) {
411 qgroup_reserved = num_items * root->leafsize; 411 qgroup_reserved = num_items * root->nodesize;
412 ret = btrfs_qgroup_reserve(root, qgroup_reserved); 412 ret = btrfs_qgroup_reserve(root, qgroup_reserved);
413 if (ret) 413 if (ret)
414 return ERR_PTR(ret); 414 return ERR_PTR(ret);
@@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
418 /* 418 /*
419 * Do the reservation for the relocation root creation 419 * Do the reservation for the relocation root creation
420 */ 420 */
421 if (unlikely(need_reserve_reloc_root(root))) { 421 if (need_reserve_reloc_root(root)) {
422 num_bytes += root->nodesize; 422 num_bytes += root->nodesize;
423 reloc_reserved = true; 423 reloc_reserved = true;
424 } 424 }
@@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
609 if (transid <= root->fs_info->last_trans_committed) 609 if (transid <= root->fs_info->last_trans_committed)
610 goto out; 610 goto out;
611 611
612 ret = -EINVAL;
613 /* find specified transaction */ 612 /* find specified transaction */
614 spin_lock(&root->fs_info->trans_lock); 613 spin_lock(&root->fs_info->trans_lock);
615 list_for_each_entry(t, &root->fs_info->trans_list, list) { 614 list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
625 } 624 }
626 } 625 }
627 spin_unlock(&root->fs_info->trans_lock); 626 spin_unlock(&root->fs_info->trans_lock);
628 /* The specified transaction doesn't exist */ 627
629 if (!cur_trans) 628 /*
629 * The specified transaction doesn't exist, or we
630 * raced with btrfs_commit_transaction
631 */
632 if (!cur_trans) {
633 if (transid > root->fs_info->last_trans_committed)
634 ret = -EINVAL;
630 goto out; 635 goto out;
636 }
631 } else { 637 } else {
632 /* find newest transaction that is committing | committed */ 638 /* find newest transaction that is committing | committed */
633 spin_lock(&root->fs_info->trans_lock); 639 spin_lock(&root->fs_info->trans_lock);
@@ -851,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
851 struct extent_state *cached_state = NULL; 857 struct extent_state *cached_state = NULL;
852 u64 start = 0; 858 u64 start = 0;
853 u64 end; 859 u64 end;
860 struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
861 bool errors = false;
854 862
855 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 863 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
856 EXTENT_NEED_WAIT, &cached_state)) { 864 EXTENT_NEED_WAIT, &cached_state)) {
@@ -864,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
864 } 872 }
865 if (err) 873 if (err)
866 werr = err; 874 werr = err;
875
876 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
877 if ((mark & EXTENT_DIRTY) &&
878 test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
879 &btree_ino->runtime_flags))
880 errors = true;
881
882 if ((mark & EXTENT_NEW) &&
883 test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
884 &btree_ino->runtime_flags))
885 errors = true;
886 } else {
887 if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
888 &btree_ino->runtime_flags))
889 errors = true;
890 }
891
892 if (errors && !werr)
893 werr = -EIO;
894
867 return werr; 895 return werr;
868} 896}
869 897
@@ -1629,6 +1657,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1629{ 1657{
1630 struct btrfs_transaction *cur_trans = trans->transaction; 1658 struct btrfs_transaction *cur_trans = trans->transaction;
1631 struct btrfs_transaction *prev_trans = NULL; 1659 struct btrfs_transaction *prev_trans = NULL;
1660 struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
1632 int ret; 1661 int ret;
1633 1662
1634 /* Stop the commit early if ->aborted is set */ 1663 /* Stop the commit early if ->aborted is set */
@@ -1868,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1868 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1897 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1869 sizeof(*root->fs_info->super_copy)); 1898 sizeof(*root->fs_info->super_copy));
1870 1899
1900 btrfs_update_commit_device_size(root->fs_info);
1901 btrfs_update_commit_device_bytes_used(root, cur_trans);
1902
1903 clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
1904 clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
1905
1871 spin_lock(&root->fs_info->trans_lock); 1906 spin_lock(&root->fs_info->trans_lock);
1872 cur_trans->state = TRANS_STATE_UNBLOCKED; 1907 cur_trans->state = TRANS_STATE_UNBLOCKED;
1873 root->fs_info->running_transaction = NULL; 1908 root->fs_info->running_transaction = NULL;
@@ -1981,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1981 ret = btrfs_drop_snapshot(root, NULL, 0, 0); 2016 ret = btrfs_drop_snapshot(root, NULL, 0, 0);
1982 else 2017 else
1983 ret = btrfs_drop_snapshot(root, NULL, 1, 0); 2018 ret = btrfs_drop_snapshot(root, NULL, 1, 0);
1984 /* 2019
1985 * If we encounter a transaction abort during snapshot cleaning, we
1986 * don't want to crash here
1987 */
1988 return (ret < 0) ? 0 : 1; 2020 return (ret < 0) ? 0 : 1;
1989} 2021}
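
The btrfs_wait_marked_extents() change turns sticky per-inode error bits into a return value: writeback latches a flag, and test_and_clear_bit() both observes and consumes it, so each error surfaces exactly once and a clean commit can clear the slate. A single-file model of that latch, using GCC/Clang atomic builtins in place of the kernel's bitops:

#include <stdio.h>
#include <errno.h>

static unsigned long runtime_flags;

#define BTREE_ERR_BIT 0

/* writeback side: latch the failure */
static void note_write_error(void)
{
	__atomic_fetch_or(&runtime_flags, 1UL << BTREE_ERR_BIT,
			  __ATOMIC_RELAXED);
}

/* commit side: consume the latch exactly once */
static int test_and_clear_err(void)
{
	unsigned long old = __atomic_fetch_and(&runtime_flags,
					       ~(1UL << BTREE_ERR_BIT),
					       __ATOMIC_RELAXED);
	return !!(old & (1UL << BTREE_ERR_BIT));
}

int main(void)
{
	note_write_error();
	printf("first wait:  %d\n", test_and_clear_err() ? -EIO : 0);
	printf("second wait: %d\n", test_and_clear_err() ? -EIO : 0);
	return 0;
}
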
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 579be51b27e5..d8f40e1a5d2d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,7 +79,7 @@ struct btrfs_transaction {
79#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ 79#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
80 __TRANS_ATTACH) 80 __TRANS_ATTACH)
81 81
82#define BTRFS_SEND_TRANS_STUB 1 82#define BTRFS_SEND_TRANS_STUB ((void *)1)
83 83
84struct btrfs_trans_handle { 84struct btrfs_trans_handle {
85 u64 transid; 85 u64 transid;
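
Defining BTRFS_SEND_TRANS_STUB as ((void *)1) lets every comparison against current->journal_info drop its cast, as the send.c and transaction.c hunks above show; the value is a sentinel no allocator can ever return. A tiny illustration of the sentinel-pointer idiom:

#include <stdio.h>

/* sentinel: a pointer value that can never be a real allocation */
#define SEND_TRANS_STUB ((void *)1)

static void *journal_info;	/* stands in for current->journal_info */

int main(void)
{
	journal_info = SEND_TRANS_STUB;		/* no cast needed now */
	if (journal_info == SEND_TRANS_STUB)
		puts("send in progress; refusing to start a transaction");
	journal_info = NULL;
	return 0;
}
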
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d0262ceb85e1..1475979e5718 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode, 97 struct btrfs_root *root, struct inode *inode,
98 int inode_only, 98 int inode_only,
99 const loff_t start, 99 const loff_t start,
100 const loff_t end); 100 const loff_t end,
101 struct btrfs_log_ctx *ctx);
101static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 102static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root, 103 struct btrfs_root *root,
103 struct btrfs_path *path, u64 objectid); 104 struct btrfs_path *path, u64 objectid);
@@ -1498,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1498 return -EIO; 1499 return -EIO;
1499 1500
1500 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1501 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1501 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1502 key.type = BTRFS_ORPHAN_ITEM_KEY;
1502 key.offset = objectid; 1503 key.offset = objectid;
1503 1504
1504 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1505 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1637,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1637 found_key.type == log_key.type && 1638 found_key.type == log_key.type &&
1638 found_key.offset == log_key.offset && 1639 found_key.offset == log_key.offset &&
1639 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1640 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1641 update_size = false;
1640 goto out; 1642 goto out;
1641 } 1643 }
1642 1644
@@ -2157,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2157 2159
2158 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2160 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2159 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2161 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2160 blocksize = btrfs_level_size(root, *level - 1); 2162 blocksize = root->nodesize;
2161 2163
2162 parent = path->nodes[*level]; 2164 parent = path->nodes[*level];
2163 root_owner = btrfs_header_owner(parent); 2165 root_owner = btrfs_header_owner(parent);
@@ -2983,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2983 min_key.type = key_type; 2985 min_key.type = key_type;
2984 min_key.offset = min_offset; 2986 min_key.offset = min_offset;
2985 2987
2986 path->keep_locks = 1;
2987
2988 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 2988 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
2989 2989
2990 /* 2990 /*
@@ -3364,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3364 * or deletes of this inode don't have to relog the inode 3364 * or deletes of this inode don't have to relog the inode
3365 * again 3365 * again
3366 */ 3366 */
3367 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3367 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3368 !skip_csum) { 3368 !skip_csum) {
3369 int found_type; 3369 int found_type;
3370 extent = btrfs_item_ptr(src, start_slot + i, 3370 extent = btrfs_item_ptr(src, start_slot + i,
@@ -3573,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3573 return 0; 3573 return 0;
3574} 3574}
3575 3575
3576static int log_one_extent(struct btrfs_trans_handle *trans, 3576static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3577 struct inode *inode, struct btrfs_root *root, 3577 struct inode *inode,
3578 struct extent_map *em, struct btrfs_path *path, 3578 struct btrfs_root *root,
3579 struct list_head *logged_list) 3579 const struct extent_map *em,
3580 const struct list_head *logged_list,
3581 bool *ordered_io_error)
3580{ 3582{
3581 struct btrfs_root *log = root->log_root;
3582 struct btrfs_file_extent_item *fi;
3583 struct extent_buffer *leaf;
3584 struct btrfs_ordered_extent *ordered; 3583 struct btrfs_ordered_extent *ordered;
3585 struct list_head ordered_sums; 3584 struct btrfs_root *log = root->log_root;
3586 struct btrfs_map_token token;
3587 struct btrfs_key key;
3588 u64 mod_start = em->mod_start; 3585 u64 mod_start = em->mod_start;
3589 u64 mod_len = em->mod_len; 3586 u64 mod_len = em->mod_len;
3587 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3590 u64 csum_offset; 3588 u64 csum_offset;
3591 u64 csum_len; 3589 u64 csum_len;
3592 u64 extent_offset = em->start - em->orig_start; 3590 LIST_HEAD(ordered_sums);
3593 u64 block_len; 3591 int ret = 0;
3594 int ret;
3595 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3596 int extent_inserted = 0;
3597
3598 INIT_LIST_HEAD(&ordered_sums);
3599 btrfs_init_map_token(&token);
3600
3601 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3602 em->start + em->len, NULL, 0, 1,
3603 sizeof(*fi), &extent_inserted);
3604 if (ret)
3605 return ret;
3606
3607 if (!extent_inserted) {
3608 key.objectid = btrfs_ino(inode);
3609 key.type = BTRFS_EXTENT_DATA_KEY;
3610 key.offset = em->start;
3611
3612 ret = btrfs_insert_empty_item(trans, log, path, &key,
3613 sizeof(*fi));
3614 if (ret)
3615 return ret;
3616 }
3617 leaf = path->nodes[0];
3618 fi = btrfs_item_ptr(leaf, path->slots[0],
3619 struct btrfs_file_extent_item);
3620
3621 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3622 &token);
3623 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3624 skip_csum = true;
3625 btrfs_set_token_file_extent_type(leaf, fi,
3626 BTRFS_FILE_EXTENT_PREALLOC,
3627 &token);
3628 } else {
3629 btrfs_set_token_file_extent_type(leaf, fi,
3630 BTRFS_FILE_EXTENT_REG,
3631 &token);
3632 if (em->block_start == EXTENT_MAP_HOLE)
3633 skip_csum = true;
3634 }
3635
3636 block_len = max(em->block_len, em->orig_block_len);
3637 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3638 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3639 em->block_start,
3640 &token);
3641 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3642 &token);
3643 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3644 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3645 em->block_start -
3646 extent_offset, &token);
3647 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3648 &token);
3649 } else {
3650 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3651 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3652 &token);
3653 }
3654
3655 btrfs_set_token_file_extent_offset(leaf, fi,
3656 em->start - em->orig_start,
3657 &token);
3658 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3659 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3660 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3661 &token);
3662 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3663 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3664 btrfs_mark_buffer_dirty(leaf);
3665 3592
3666 btrfs_release_path(path); 3593 *ordered_io_error = false;
3667 if (ret) {
3668 return ret;
3669 }
3670 3594
3671 if (skip_csum) 3595 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3596 em->block_start == EXTENT_MAP_HOLE)
3672 return 0; 3597 return 0;
3673 3598
3674 /* 3599 /*
3675 * First check and see if our csums are on our outstanding ordered 3600 * Wait for any ordered extent that covers our extent map. If it
3676 * extents. 3601 * finishes without an error, first check and see if our csums are on
3602 * our outstanding ordered extents.
3677 */ 3603 */
3678 list_for_each_entry(ordered, logged_list, log_list) { 3604 list_for_each_entry(ordered, logged_list, log_list) {
3679 struct btrfs_ordered_sum *sum; 3605 struct btrfs_ordered_sum *sum;
@@ -3685,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3685 mod_start + mod_len <= ordered->file_offset) 3611 mod_start + mod_len <= ordered->file_offset)
3686 continue; 3612 continue;
3687 3613
3614 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3615 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3616 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3617 const u64 start = ordered->file_offset;
3618 const u64 end = ordered->file_offset + ordered->len - 1;
3619
3620 WARN_ON(ordered->inode != inode);
3621 filemap_fdatawrite_range(inode->i_mapping, start, end);
3622 }
3623
3624 wait_event(ordered->wait,
3625 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3626 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3627
3628 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3629 *ordered_io_error = true;
3630 break;
3631 }
3688 /* 3632 /*
3689 * We are going to copy all the csums on this ordered extent, so 3633 * We are going to copy all the csums on this ordered extent, so
3690 * go ahead and adjust mod_start and mod_len in case this 3634 * go ahead and adjust mod_start and mod_len in case this
@@ -3716,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3716 } 3660 }
3717 } 3661 }
3718 3662
3663 if (skip_csum)
3664 continue;
3665
3719 /* 3666 /*
3720 * To keep us from looping for the above case of an ordered 3667 * To keep us from looping for the above case of an ordered
3721 * extent that falls inside of the logged extent. 3668 * extent that falls inside of the logged extent.
@@ -3733,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3733 list_for_each_entry(sum, &ordered->list, list) { 3680 list_for_each_entry(sum, &ordered->list, list) {
3734 ret = btrfs_csum_file_blocks(trans, log, sum); 3681 ret = btrfs_csum_file_blocks(trans, log, sum);
3735 if (ret) 3682 if (ret)
3736 goto unlocked; 3683 break;
3737 } 3684 }
3738
3739 } 3685 }
3740unlocked:
3741 3686
3742 if (!mod_len || ret) 3687 if (*ordered_io_error || !mod_len || ret || skip_csum)
3743 return ret; 3688 return ret;
3744 3689
3745 if (em->compress_type) { 3690 if (em->compress_type) {
3746 csum_offset = 0; 3691 csum_offset = 0;
3747 csum_len = block_len; 3692 csum_len = max(em->block_len, em->orig_block_len);
3748 } else { 3693 } else {
3749 csum_offset = mod_start - em->start; 3694 csum_offset = mod_start - em->start;
3750 csum_len = mod_len; 3695 csum_len = mod_len;
@@ -3771,11 +3716,106 @@ unlocked:
3771 return ret; 3716 return ret;
3772} 3717}
3773 3718
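A condensed sketch of the wait introduced above, assuming the usual BTRFS_ORDERED_* bit semantics: writeback is kicked for any ordered extent that has neither completed nor failed and is not a direct-IO write, then the caller sleeps until the extent reports completion or an IO error.

	/* flush one ordered extent and wait for its outcome (simplified) */
	if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
		filemap_fdatawrite_range(inode->i_mapping,
					 ordered->file_offset,
					 ordered->file_offset + ordered->len - 1);

	wait_event(ordered->wait,
		   test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
		   test_bit(BTRFS_ORDERED_IOERR, &ordered->flags));

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
		*ordered_io_error = true;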
3719static int log_one_extent(struct btrfs_trans_handle *trans,
3720 struct inode *inode, struct btrfs_root *root,
3721 const struct extent_map *em,
3722 struct btrfs_path *path,
3723 const struct list_head *logged_list,
3724 struct btrfs_log_ctx *ctx)
3725{
3726 struct btrfs_root *log = root->log_root;
3727 struct btrfs_file_extent_item *fi;
3728 struct extent_buffer *leaf;
3729 struct btrfs_map_token token;
3730 struct btrfs_key key;
3731 u64 extent_offset = em->start - em->orig_start;
3732 u64 block_len;
3733 int ret;
3734 int extent_inserted = 0;
3735 bool ordered_io_err = false;
3736
3737 ret = wait_ordered_extents(trans, inode, root, em, logged_list,
3738 &ordered_io_err);
3739 if (ret)
3740 return ret;
3741
3742 if (ordered_io_err) {
3743 ctx->io_err = -EIO;
3744 return 0;
3745 }
3746
3747 btrfs_init_map_token(&token);
3748
3749 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3750 em->start + em->len, NULL, 0, 1,
3751 sizeof(*fi), &extent_inserted);
3752 if (ret)
3753 return ret;
3754
3755 if (!extent_inserted) {
3756 key.objectid = btrfs_ino(inode);
3757 key.type = BTRFS_EXTENT_DATA_KEY;
3758 key.offset = em->start;
3759
3760 ret = btrfs_insert_empty_item(trans, log, path, &key,
3761 sizeof(*fi));
3762 if (ret)
3763 return ret;
3764 }
3765 leaf = path->nodes[0];
3766 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item);
3768
3769 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3770 &token);
3771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3772 btrfs_set_token_file_extent_type(leaf, fi,
3773 BTRFS_FILE_EXTENT_PREALLOC,
3774 &token);
3775 else
3776 btrfs_set_token_file_extent_type(leaf, fi,
3777 BTRFS_FILE_EXTENT_REG,
3778 &token);
3779
3780 block_len = max(em->block_len, em->orig_block_len);
3781 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3782 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3783 em->block_start,
3784 &token);
3785 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3786 &token);
3787 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3788 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3789 em->block_start -
3790 extent_offset, &token);
3791 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3792 &token);
3793 } else {
3794 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3795 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3796 &token);
3797 }
3798
3799 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
3800 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3801 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3802 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3803 &token);
3804 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3805 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3806 btrfs_mark_buffer_dirty(leaf);
3807
3808 btrfs_release_path(path);
3809
3810 return ret;
3811}
3812
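Note how the error split works in the new log_one_extent(): a failure of the wait itself is returned as a hard error, while an ordered-IO error is only recorded in the log context, letting the fsync path decide how to react. Condensed from the function above:

	ret = wait_ordered_extents(trans, inode, root, em, logged_list,
				   &ordered_io_err);
	if (ret)
		return ret;		/* hard error from the wait itself */

	if (ordered_io_err) {
		ctx->io_err = -EIO;	/* surfaced through the log ctx */
		return 0;		/* not fatal at this level */
	}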
3774static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3813static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3775 struct btrfs_root *root, 3814 struct btrfs_root *root,
3776 struct inode *inode, 3815 struct inode *inode,
3777 struct btrfs_path *path, 3816 struct btrfs_path *path,
3778 struct list_head *logged_list) 3817 struct list_head *logged_list,
3818 struct btrfs_log_ctx *ctx)
3779{ 3819{
3780 struct extent_map *em, *n; 3820 struct extent_map *em, *n;
3781 struct list_head extents; 3821 struct list_head extents;
@@ -3833,7 +3873,8 @@ process:
3833 3873
3834 write_unlock(&tree->lock); 3874 write_unlock(&tree->lock);
3835 3875
3836 ret = log_one_extent(trans, inode, root, em, path, logged_list); 3876 ret = log_one_extent(trans, inode, root, em, path, logged_list,
3877 ctx);
3837 write_lock(&tree->lock); 3878 write_lock(&tree->lock);
3838 clear_em_logging(tree, em); 3879 clear_em_logging(tree, em);
3839 free_extent_map(em); 3880 free_extent_map(em);
@@ -3863,7 +3904,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3863 struct btrfs_root *root, struct inode *inode, 3904 struct btrfs_root *root, struct inode *inode,
3864 int inode_only, 3905 int inode_only,
3865 const loff_t start, 3906 const loff_t start,
3866 const loff_t end) 3907 const loff_t end,
3908 struct btrfs_log_ctx *ctx)
3867{ 3909{
3868 struct btrfs_path *path; 3910 struct btrfs_path *path;
3869 struct btrfs_path *dst_path; 3911 struct btrfs_path *dst_path;
@@ -3964,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3964 err = ret; 4006 err = ret;
3965 goto out_unlock; 4007 goto out_unlock;
3966 } 4008 }
3967 path->keep_locks = 1;
3968 4009
3969 while (1) { 4010 while (1) {
3970 ins_nr = 0; 4011 ins_nr = 0;
@@ -4049,7 +4090,7 @@ log_extents:
4049 btrfs_release_path(dst_path); 4090 btrfs_release_path(dst_path);
4050 if (fast_search) { 4091 if (fast_search) {
4051 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4092 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4052 &logged_list); 4093 &logged_list, ctx);
4053 if (ret) { 4094 if (ret) {
4054 err = ret; 4095 err = ret;
4055 goto out_unlock; 4096 goto out_unlock;
@@ -4239,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4239 if (ret) 4280 if (ret)
4240 goto end_no_trans; 4281 goto end_no_trans;
4241 4282
4242 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); 4283 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
4243 if (ret) 4284 if (ret)
4244 goto end_trans; 4285 goto end_trans;
4245 4286
@@ -4268,7 +4309,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4268 if (BTRFS_I(inode)->generation > 4309 if (BTRFS_I(inode)->generation >
4269 root->fs_info->last_trans_committed) { 4310 root->fs_info->last_trans_committed) {
4270 ret = btrfs_log_inode(trans, root, inode, inode_only, 4311 ret = btrfs_log_inode(trans, root, inode, inode_only,
4271 0, LLONG_MAX); 4312 0, LLONG_MAX, ctx);
4272 if (ret) 4313 if (ret)
4273 goto end_trans; 4314 goto end_trans;
4274 } 4315 }
@@ -4360,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
4360again: 4401again:
4361 key.objectid = BTRFS_TREE_LOG_OBJECTID; 4402 key.objectid = BTRFS_TREE_LOG_OBJECTID;
4362 key.offset = (u64)-1; 4403 key.offset = (u64)-1;
4363 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 4404 key.type = BTRFS_ROOT_ITEM_KEY;
4364 4405
4365 while (1) { 4406 while (1) {
4366 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 4407 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
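This series also drops the btrfs_set_key_type() wrapper in favor of assigning the CPU-order key field directly, as in the hunk above:

	struct btrfs_key key;

	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;	/* was btrfs_set_key_type(&key, ...) */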
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index e2e798ae7cd7..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
28struct btrfs_log_ctx { 28struct btrfs_log_ctx {
29 int log_ret; 29 int log_ret;
30 int log_transid; 30 int log_transid;
31 int io_err;
31 struct list_head list; 32 struct list_head list;
32}; 33};
33 34
@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
35{ 36{
36 ctx->log_ret = 0; 37 ctx->log_ret = 0;
37 ctx->log_transid = 0; 38 ctx->log_transid = 0;
39 ctx->io_err = 0;
38 INIT_LIST_HEAD(&ctx->list); 40 INIT_LIST_HEAD(&ctx->list);
39} 41}
40 42
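For reference, reconstructed from the two hunks above, the log context now carries the ordered-IO error alongside the log return code:

	struct btrfs_log_ctx {
		int log_ret;
		int log_transid;
		int io_err;
		struct list_head list;
	};

	static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
	{
		ctx->log_ret = 0;
		ctx->log_transid = 0;
		ctx->io_err = 0;
		INIT_LIST_HEAD(&ctx->list);
	}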
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03ee7d8..778282944530 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
279 key.offset = 0; 279 key.offset = 0;
280 280
281again_search_slot: 281again_search_slot:
282 path->keep_locks = 1;
283 ret = btrfs_search_forward(root, &key, path, 0); 282 ret = btrfs_search_forward(root, &key, path, 0);
284 if (ret) { 283 if (ret) {
285 if (ret > 0) 284 if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2c2d6d1d8eee..d47289c715c8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
50static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 50static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
51static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 51static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
52 52
53static DEFINE_MUTEX(uuid_mutex); 53DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids); 54static LIST_HEAD(fs_uuids);
55 55
56static void lock_chunks(struct btrfs_root *root) 56static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
74 mutex_init(&fs_devs->device_list_mutex); 74 mutex_init(&fs_devs->device_list_mutex);
75 75
76 INIT_LIST_HEAD(&fs_devs->devices); 76 INIT_LIST_HEAD(&fs_devs->devices);
77 INIT_LIST_HEAD(&fs_devs->resized_devices);
77 INIT_LIST_HEAD(&fs_devs->alloc_list); 78 INIT_LIST_HEAD(&fs_devs->alloc_list);
78 INIT_LIST_HEAD(&fs_devs->list); 79 INIT_LIST_HEAD(&fs_devs->list);
79 80
@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)
154 155
155 INIT_LIST_HEAD(&dev->dev_list); 156 INIT_LIST_HEAD(&dev->dev_list);
156 INIT_LIST_HEAD(&dev->dev_alloc_list); 157 INIT_LIST_HEAD(&dev->dev_alloc_list);
158 INIT_LIST_HEAD(&dev->resized_list);
157 159
158 spin_lock_init(&dev->io_lock); 160 spin_lock_init(&dev->io_lock);
159 161
160 spin_lock_init(&dev->reada_lock); 162 spin_lock_init(&dev->reada_lock);
161 atomic_set(&dev->reada_in_flight, 0); 163 atomic_set(&dev->reada_in_flight, 0);
164 atomic_set(&dev->dev_stats_ccnt, 0);
162 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); 165 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
163 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); 166 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
164 167
@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
474 return PTR_ERR(fs_devices); 477 return PTR_ERR(fs_devices);
475 478
476 list_add(&fs_devices->list, &fs_uuids); 479 list_add(&fs_devices->list, &fs_uuids);
477 fs_devices->latest_devid = devid;
478 fs_devices->latest_trans = found_transid;
479 480
480 device = NULL; 481 device = NULL;
481 } else { 482 } else {
482 device = __find_device(&fs_devices->devices, devid, 483 device = __find_device(&fs_devices->devices, devid,
483 disk_super->dev_item.uuid); 484 disk_super->dev_item.uuid);
484 } 485 }
486
485 if (!device) { 487 if (!device) {
486 if (fs_devices->opened) 488 if (fs_devices->opened)
487 return -EBUSY; 489 return -EBUSY;
@@ -565,10 +567,6 @@ static noinline int device_list_add(const char *path,
565 if (!fs_devices->opened) 567 if (!fs_devices->opened)
566 device->generation = found_transid; 568 device->generation = found_transid;
567 569
568 if (found_transid > fs_devices->latest_trans) {
569 fs_devices->latest_devid = devid;
570 fs_devices->latest_trans = found_transid;
571 }
572 *fs_devices_ret = fs_devices; 570 *fs_devices_ret = fs_devices;
573 571
574 return ret; 572 return ret;
@@ -584,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
584 if (IS_ERR(fs_devices)) 582 if (IS_ERR(fs_devices))
585 return fs_devices; 583 return fs_devices;
586 584
587 fs_devices->latest_devid = orig->latest_devid; 585 mutex_lock(&orig->device_list_mutex);
588 fs_devices->latest_trans = orig->latest_trans;
589 fs_devices->total_devices = orig->total_devices; 586 fs_devices->total_devices = orig->total_devices;
590 587
591 /* We have held the volume lock, it is safe to get the devices. */ 588 /* We have held the volume lock, it is safe to get the devices. */
@@ -614,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
614 device->fs_devices = fs_devices; 611 device->fs_devices = fs_devices;
615 fs_devices->num_devices++; 612 fs_devices->num_devices++;
616 } 613 }
614 mutex_unlock(&orig->device_list_mutex);
617 return fs_devices; 615 return fs_devices;
618error: 616error:
617 mutex_unlock(&orig->device_list_mutex);
619 free_fs_devices(fs_devices); 618 free_fs_devices(fs_devices);
620 return ERR_PTR(-ENOMEM); 619 return ERR_PTR(-ENOMEM);
621} 620}
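clone_fs_devices() now walks the source list under device_list_mutex, and both exits drop the lock. A condensed sketch of the shape, with the per-device clone body elided (orig_dev and alloc_failed are hypothetical placeholders):

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		/* allocate the clone, copy the rcu-protected name, ... */
		if (alloc_failed)	/* hypothetical failure condition */
			goto error;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);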
@@ -624,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
624 struct btrfs_fs_devices *fs_devices, int step) 623 struct btrfs_fs_devices *fs_devices, int step)
625{ 624{
626 struct btrfs_device *device, *next; 625 struct btrfs_device *device, *next;
627 626 struct btrfs_device *latest_dev = NULL;
628 struct block_device *latest_bdev = NULL;
629 u64 latest_devid = 0;
630 u64 latest_transid = 0;
631 627
632 mutex_lock(&uuid_mutex); 628 mutex_lock(&uuid_mutex);
633again: 629again:
@@ -635,11 +631,9 @@ again:
635 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 631 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
636 if (device->in_fs_metadata) { 632 if (device->in_fs_metadata) {
637 if (!device->is_tgtdev_for_dev_replace && 633 if (!device->is_tgtdev_for_dev_replace &&
638 (!latest_transid || 634 (!latest_dev ||
639 device->generation > latest_transid)) { 635 device->generation > latest_dev->generation)) {
640 latest_devid = device->devid; 636 latest_dev = device;
641 latest_transid = device->generation;
642 latest_bdev = device->bdev;
643 } 637 }
644 continue; 638 continue;
645 } 639 }
@@ -681,9 +675,7 @@ again:
681 goto again; 675 goto again;
682 } 676 }
683 677
684 fs_devices->latest_bdev = latest_bdev; 678 fs_devices->latest_bdev = latest_dev->bdev;
685 fs_devices->latest_devid = latest_devid;
686 fs_devices->latest_trans = latest_transid;
687 679
688 mutex_unlock(&uuid_mutex); 680 mutex_unlock(&uuid_mutex);
689} 681}
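The latest_devid/latest_trans/latest_bdev triple collapses into a single device pointer; selection becomes a plain generation comparison and the bdev is read off the winner afterwards. Simplified (the stale-device pruning the real loop also performs is elided):

	struct btrfs_device *latest_dev = NULL;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->in_fs_metadata ||
		    device->is_tgtdev_for_dev_replace)
			continue;
		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	fs_devices->latest_bdev = latest_dev->bdev;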
@@ -732,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
732 fs_devices->rw_devices--; 724 fs_devices->rw_devices--;
733 } 725 }
734 726
735 if (device->can_discard)
736 fs_devices->num_can_discard--;
737 if (device->missing) 727 if (device->missing)
738 fs_devices->missing_devices--; 728 fs_devices->missing_devices--;
739 729
@@ -798,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
798 struct block_device *bdev; 788 struct block_device *bdev;
799 struct list_head *head = &fs_devices->devices; 789 struct list_head *head = &fs_devices->devices;
800 struct btrfs_device *device; 790 struct btrfs_device *device;
801 struct block_device *latest_bdev = NULL; 791 struct btrfs_device *latest_dev = NULL;
802 struct buffer_head *bh; 792 struct buffer_head *bh;
803 struct btrfs_super_block *disk_super; 793 struct btrfs_super_block *disk_super;
804 u64 latest_devid = 0;
805 u64 latest_transid = 0;
806 u64 devid; 794 u64 devid;
807 int seeding = 1; 795 int seeding = 1;
808 int ret = 0; 796 int ret = 0;
@@ -830,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
830 goto error_brelse; 818 goto error_brelse;
831 819
832 device->generation = btrfs_super_generation(disk_super); 820 device->generation = btrfs_super_generation(disk_super);
833 if (!latest_transid || device->generation > latest_transid) { 821 if (!latest_dev ||
834 latest_devid = devid; 822 device->generation > latest_dev->generation)
835 latest_transid = device->generation; 823 latest_dev = device;
836 latest_bdev = bdev;
837 }
838 824
839 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 825 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
840 device->writeable = 0; 826 device->writeable = 0;
@@ -844,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
844 } 830 }
845 831
846 q = bdev_get_queue(bdev); 832 q = bdev_get_queue(bdev);
847 if (blk_queue_discard(q)) { 833 if (blk_queue_discard(q))
848 device->can_discard = 1; 834 device->can_discard = 1;
849 fs_devices->num_can_discard++;
850 }
851 835
852 device->bdev = bdev; 836 device->bdev = bdev;
853 device->in_fs_metadata = 0; 837 device->in_fs_metadata = 0;
@@ -877,9 +861,7 @@ error_brelse:
877 } 861 }
878 fs_devices->seeding = seeding; 862 fs_devices->seeding = seeding;
879 fs_devices->opened = 1; 863 fs_devices->opened = 1;
880 fs_devices->latest_bdev = latest_bdev; 864 fs_devices->latest_bdev = latest_dev->bdev;
881 fs_devices->latest_devid = latest_devid;
882 fs_devices->latest_trans = latest_transid;
883 fs_devices->total_rw_bytes = 0; 865 fs_devices->total_rw_bytes = 0;
884out: 866out:
885 return ret; 867 return ret;
@@ -1053,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1053 if (key.objectid > device->devid) 1035 if (key.objectid > device->devid)
1054 break; 1036 break;
1055 1037
1056 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1038 if (key.type != BTRFS_DEV_EXTENT_KEY)
1057 goto next; 1039 goto next;
1058 1040
1059 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1041 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1205,7 +1187,7 @@ again:
1205 if (key.objectid > device->devid) 1187 if (key.objectid > device->devid)
1206 break; 1188 break;
1207 1189
1208 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1190 if (key.type != BTRFS_DEV_EXTENT_KEY)
1209 goto next; 1191 goto next;
1210 1192
1211 if (key.offset > search_start) { 1193 if (key.offset > search_start) {
@@ -1284,7 +1266,7 @@ out:
1284 1266
1285static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1267static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1286 struct btrfs_device *device, 1268 struct btrfs_device *device,
1287 u64 start) 1269 u64 start, u64 *dev_extent_len)
1288{ 1270{
1289 int ret; 1271 int ret;
1290 struct btrfs_path *path; 1272 struct btrfs_path *path;
@@ -1326,13 +1308,8 @@ again:
1326 goto out; 1308 goto out;
1327 } 1309 }
1328 1310
1329 if (device->bytes_used > 0) { 1311 *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1330 u64 len = btrfs_dev_extent_length(leaf, extent); 1312
1331 device->bytes_used -= len;
1332 spin_lock(&root->fs_info->free_chunk_lock);
1333 root->fs_info->free_chunk_space += len;
1334 spin_unlock(&root->fs_info->free_chunk_lock);
1335 }
1336 ret = btrfs_del_item(trans, root, path); 1313 ret = btrfs_del_item(trans, root, path);
1337 if (ret) { 1314 if (ret) {
1338 btrfs_error(root->fs_info, ret, 1315 btrfs_error(root->fs_info, ret,
@@ -1482,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
1482 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1459 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1483 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1460 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1484 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1461 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1485 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1462 btrfs_set_device_total_bytes(leaf, dev_item,
1486 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1463 btrfs_device_get_disk_total_bytes(device));
1464 btrfs_set_device_bytes_used(leaf, dev_item,
1465 btrfs_device_get_bytes_used(device));
1487 btrfs_set_device_group(leaf, dev_item, 0); 1466 btrfs_set_device_group(leaf, dev_item, 0);
1488 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1467 btrfs_set_device_seek_speed(leaf, dev_item, 0);
1489 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1468 btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1539,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1539 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1518 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1540 key.type = BTRFS_DEV_ITEM_KEY; 1519 key.type = BTRFS_DEV_ITEM_KEY;
1541 key.offset = device->devid; 1520 key.offset = device->devid;
1542 lock_chunks(root);
1543 1521
1544 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1522 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1545 if (ret < 0) 1523 if (ret < 0)
@@ -1555,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1555 goto out; 1533 goto out;
1556out: 1534out:
1557 btrfs_free_path(path); 1535 btrfs_free_path(path);
1558 unlock_chunks(root);
1559 btrfs_commit_transaction(trans, root); 1536 btrfs_commit_transaction(trans, root);
1560 return ret; 1537 return ret;
1561} 1538}
@@ -1671,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1671 if (device->writeable) { 1648 if (device->writeable) {
1672 lock_chunks(root); 1649 lock_chunks(root);
1673 list_del_init(&device->dev_alloc_list); 1650 list_del_init(&device->dev_alloc_list);
1651 device->fs_devices->rw_devices--;
1674 unlock_chunks(root); 1652 unlock_chunks(root);
1675 root->fs_info->fs_devices->rw_devices--;
1676 clear_super = true; 1653 clear_super = true;
1677 } 1654 }
1678 1655
@@ -1691,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1691 if (ret) 1668 if (ret)
1692 goto error_undo; 1669 goto error_undo;
1693 1670
1694 spin_lock(&root->fs_info->free_chunk_lock);
1695 root->fs_info->free_chunk_space = device->total_bytes -
1696 device->bytes_used;
1697 spin_unlock(&root->fs_info->free_chunk_lock);
1698
1699 device->in_fs_metadata = 0; 1671 device->in_fs_metadata = 0;
1700 btrfs_scrub_cancel_dev(root->fs_info, device); 1672 btrfs_scrub_cancel_dev(root->fs_info, device);
1701 1673
@@ -1749,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1749 fs_devices = fs_devices->seed; 1721 fs_devices = fs_devices->seed;
1750 } 1722 }
1751 cur_devices->seed = NULL; 1723 cur_devices->seed = NULL;
1752 lock_chunks(root);
1753 __btrfs_close_devices(cur_devices); 1724 __btrfs_close_devices(cur_devices);
1754 unlock_chunks(root);
1755 free_fs_devices(cur_devices); 1725 free_fs_devices(cur_devices);
1756 } 1726 }
1757 1727
@@ -1824,8 +1794,8 @@ error_undo:
1824 lock_chunks(root); 1794 lock_chunks(root);
1825 list_add(&device->dev_alloc_list, 1795 list_add(&device->dev_alloc_list,
1826 &root->fs_info->fs_devices->alloc_list); 1796 &root->fs_info->fs_devices->alloc_list);
1797 device->fs_devices->rw_devices++;
1827 unlock_chunks(root); 1798 unlock_chunks(root);
1828 root->fs_info->fs_devices->rw_devices++;
1829 } 1799 }
1830 goto error_brelse; 1800 goto error_brelse;
1831} 1801}
@@ -1833,29 +1803,57 @@ error_undo:
1833void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1803void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1834 struct btrfs_device *srcdev) 1804 struct btrfs_device *srcdev)
1835{ 1805{
1806 struct btrfs_fs_devices *fs_devices;
1807
1836 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1808 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1837 1809
1810 /*
1811	 * In the case of an fs with no seed, srcdev->fs_devices points
1812	 * to the fs_devices of fs_info. However, when the dev being replaced
1813	 * is a seed dev it points to the seed's local fs_devices. In short,
1814	 * srcdev has its correct fs_devices in both cases.
1815 */
1816 fs_devices = srcdev->fs_devices;
1817
1838 list_del_rcu(&srcdev->dev_list); 1818 list_del_rcu(&srcdev->dev_list);
1839 list_del_rcu(&srcdev->dev_alloc_list); 1819 list_del_rcu(&srcdev->dev_alloc_list);
1840 fs_info->fs_devices->num_devices--; 1820 fs_devices->num_devices--;
1841 if (srcdev->missing) { 1821 if (srcdev->missing)
1842 fs_info->fs_devices->missing_devices--; 1822 fs_devices->missing_devices--;
1843 fs_info->fs_devices->rw_devices++;
1844 }
1845 if (srcdev->can_discard)
1846 fs_info->fs_devices->num_can_discard--;
1847 if (srcdev->bdev) {
1848 fs_info->fs_devices->open_devices--;
1849 1823
1850 /* 1824 if (srcdev->writeable) {
1851 * zero out the old super if it is not writable 1825 fs_devices->rw_devices--;
1852 * (e.g. seed device) 1826 /* zero out the old super if it is writable */
1853 */ 1827 btrfs_scratch_superblock(srcdev);
1854 if (srcdev->writeable)
1855 btrfs_scratch_superblock(srcdev);
1856 } 1828 }
1857 1829
1830 if (srcdev->bdev)
1831 fs_devices->open_devices--;
1832
1858 call_rcu(&srcdev->rcu, free_device); 1833 call_rcu(&srcdev->rcu, free_device);
1834
1835 /*
1836	 * unless fs_devices is a seed fs, num_devices shouldn't go
1837	 * to zero
1838 */
1839 BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
1840
1841	 /* if there are no devs left we'd rather delete the fs_devices */
1842 if (!fs_devices->num_devices) {
1843 struct btrfs_fs_devices *tmp_fs_devices;
1844
1845 tmp_fs_devices = fs_info->fs_devices;
1846 while (tmp_fs_devices) {
1847 if (tmp_fs_devices->seed == fs_devices) {
1848 tmp_fs_devices->seed = fs_devices->seed;
1849 break;
1850 }
1851 tmp_fs_devices = tmp_fs_devices->seed;
1852 }
1853 fs_devices->seed = NULL;
1854 __btrfs_close_devices(fs_devices);
1855 free_fs_devices(fs_devices);
1856 }
1859} 1857}
1860 1858
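When the replaced source was the last member of a seed fs_devices, the new teardown walks the seed chain, unlinks the emptied entry, and frees it. Condensed from the hunk above:

	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices = fs_info->fs_devices;

		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}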
1861void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1859void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
@@ -1863,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1863{ 1861{
1864 struct btrfs_device *next_device; 1862 struct btrfs_device *next_device;
1865 1863
1864 mutex_lock(&uuid_mutex);
1866 WARN_ON(!tgtdev); 1865 WARN_ON(!tgtdev);
1867 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1866 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1868 if (tgtdev->bdev) { 1867 if (tgtdev->bdev) {
@@ -1870,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1870 fs_info->fs_devices->open_devices--; 1869 fs_info->fs_devices->open_devices--;
1871 } 1870 }
1872 fs_info->fs_devices->num_devices--; 1871 fs_info->fs_devices->num_devices--;
1873 if (tgtdev->can_discard)
1874 fs_info->fs_devices->num_can_discard++;
1875 1872
1876 next_device = list_entry(fs_info->fs_devices->devices.next, 1873 next_device = list_entry(fs_info->fs_devices->devices.next,
1877 struct btrfs_device, dev_list); 1874 struct btrfs_device, dev_list);
@@ -1884,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1884 call_rcu(&tgtdev->rcu, free_device); 1881 call_rcu(&tgtdev->rcu, free_device);
1885 1882
1886 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1883 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1884 mutex_unlock(&uuid_mutex);
1887} 1885}
1888 1886
1889static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1887static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1982,17 +1980,17 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1982 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1980 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1983 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1981 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1984 synchronize_rcu); 1982 synchronize_rcu);
1983 list_for_each_entry(device, &seed_devices->devices, dev_list)
1984 device->fs_devices = seed_devices;
1985 1985
1986 lock_chunks(root);
1986 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1987 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1987 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1988 unlock_chunks(root);
1988 device->fs_devices = seed_devices;
1989 }
1990 1989
1991 fs_devices->seeding = 0; 1990 fs_devices->seeding = 0;
1992 fs_devices->num_devices = 0; 1991 fs_devices->num_devices = 0;
1993 fs_devices->open_devices = 0; 1992 fs_devices->open_devices = 0;
1994 fs_devices->missing_devices = 0; 1993 fs_devices->missing_devices = 0;
1995 fs_devices->num_can_discard = 0;
1996 fs_devices->rotating = 0; 1994 fs_devices->rotating = 0;
1997 fs_devices->seed = seed_devices; 1995 fs_devices->seed = seed_devices;
1998 1996
@@ -2092,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2092 struct list_head *devices; 2090 struct list_head *devices;
2093 struct super_block *sb = root->fs_info->sb; 2091 struct super_block *sb = root->fs_info->sb;
2094 struct rcu_string *name; 2092 struct rcu_string *name;
2095 u64 total_bytes; 2093 u64 tmp;
2096 int seeding_dev = 0; 2094 int seeding_dev = 0;
2097 int ret = 0; 2095 int ret = 0;
2098 2096
@@ -2148,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2148 goto error; 2146 goto error;
2149 } 2147 }
2150 2148
2151 lock_chunks(root);
2152
2153 q = bdev_get_queue(bdev); 2149 q = bdev_get_queue(bdev);
2154 if (blk_queue_discard(q)) 2150 if (blk_queue_discard(q))
2155 device->can_discard = 1; 2151 device->can_discard = 1;
@@ -2160,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2160 device->sector_size = root->sectorsize; 2156 device->sector_size = root->sectorsize;
2161 device->total_bytes = i_size_read(bdev->bd_inode); 2157 device->total_bytes = i_size_read(bdev->bd_inode);
2162 device->disk_total_bytes = device->total_bytes; 2158 device->disk_total_bytes = device->total_bytes;
2159 device->commit_total_bytes = device->total_bytes;
2163 device->dev_root = root->fs_info->dev_root; 2160 device->dev_root = root->fs_info->dev_root;
2164 device->bdev = bdev; 2161 device->bdev = bdev;
2165 device->in_fs_metadata = 1; 2162 device->in_fs_metadata = 1;
@@ -2177,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2177 device->fs_devices = root->fs_info->fs_devices; 2174 device->fs_devices = root->fs_info->fs_devices;
2178 2175
2179 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2176 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2177 lock_chunks(root);
2180 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2178 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2181 list_add(&device->dev_alloc_list, 2179 list_add(&device->dev_alloc_list,
2182 &root->fs_info->fs_devices->alloc_list); 2180 &root->fs_info->fs_devices->alloc_list);
@@ -2184,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2184 root->fs_info->fs_devices->open_devices++; 2182 root->fs_info->fs_devices->open_devices++;
2185 root->fs_info->fs_devices->rw_devices++; 2183 root->fs_info->fs_devices->rw_devices++;
2186 root->fs_info->fs_devices->total_devices++; 2184 root->fs_info->fs_devices->total_devices++;
2187 if (device->can_discard)
2188 root->fs_info->fs_devices->num_can_discard++;
2189 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2185 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2190 2186
2191 spin_lock(&root->fs_info->free_chunk_lock); 2187 spin_lock(&root->fs_info->free_chunk_lock);
@@ -2195,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2195 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2191 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2196 root->fs_info->fs_devices->rotating = 1; 2192 root->fs_info->fs_devices->rotating = 1;
2197 2193
2198 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2194 tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2199 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2195 btrfs_set_super_total_bytes(root->fs_info->super_copy,
2200 total_bytes + device->total_bytes); 2196 tmp + device->total_bytes);
2201 2197
2202 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2198 tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2203 btrfs_set_super_num_devices(root->fs_info->super_copy, 2199 btrfs_set_super_num_devices(root->fs_info->super_copy,
2204 total_bytes + 1); 2200 tmp + 1);
2205 2201
2206 /* add sysfs device entry */ 2202 /* add sysfs device entry */
2207 btrfs_kobj_add_device(root->fs_info, device); 2203 btrfs_kobj_add_device(root->fs_info, device);
2208 2204
2205 /*
2206 * we've got more storage, clear any full flags on the space
2207 * infos
2208 */
2209 btrfs_clear_space_info_full(root->fs_info);
2210
2211 unlock_chunks(root);
2209 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2212 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2210 2213
2211 if (seeding_dev) { 2214 if (seeding_dev) {
2212 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2215 lock_chunks(root);
2213 ret = init_first_rw_device(trans, root, device); 2216 ret = init_first_rw_device(trans, root, device);
2217 unlock_chunks(root);
2214 if (ret) { 2218 if (ret) {
2215 btrfs_abort_transaction(trans, root, ret); 2219 btrfs_abort_transaction(trans, root, ret);
2216 goto error_trans; 2220 goto error_trans;
2217 } 2221 }
2222 }
2223
2224 ret = btrfs_add_device(trans, root, device);
2225 if (ret) {
2226 btrfs_abort_transaction(trans, root, ret);
2227 goto error_trans;
2228 }
2229
2230 if (seeding_dev) {
2231 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2232
2218 ret = btrfs_finish_sprout(trans, root); 2233 ret = btrfs_finish_sprout(trans, root);
2219 if (ret) { 2234 if (ret) {
2220 btrfs_abort_transaction(trans, root, ret); 2235 btrfs_abort_transaction(trans, root, ret);
@@ -2228,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2228 root->fs_info->fsid); 2243 root->fs_info->fsid);
2229 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2244 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2230 goto error_trans; 2245 goto error_trans;
2231 } else {
2232 ret = btrfs_add_device(trans, root, device);
2233 if (ret) {
2234 btrfs_abort_transaction(trans, root, ret);
2235 goto error_trans;
2236 }
2237 } 2246 }
2238 2247
2239 /*
2240 * we've got more storage, clear any full flags on the space
2241 * infos
2242 */
2243 btrfs_clear_space_info_full(root->fs_info);
2244
2245 unlock_chunks(root);
2246 root->fs_info->num_tolerated_disk_barrier_failures = 2248 root->fs_info->num_tolerated_disk_barrier_failures =
2247 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2249 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2248 ret = btrfs_commit_transaction(trans, root); 2250 ret = btrfs_commit_transaction(trans, root);
@@ -2274,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2274 return ret; 2276 return ret;
2275 2277
2276error_trans: 2278error_trans:
2277 unlock_chunks(root);
2278 btrfs_end_transaction(trans, root); 2279 btrfs_end_transaction(trans, root);
2279 rcu_string_free(device->name); 2280 rcu_string_free(device->name);
2280 btrfs_kobj_rm_device(root->fs_info, device); 2281 btrfs_kobj_rm_device(root->fs_info, device);
@@ -2289,6 +2290,7 @@ error:
2289} 2290}
2290 2291
2291int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2292int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2293 struct btrfs_device *srcdev,
2292 struct btrfs_device **device_out) 2294 struct btrfs_device **device_out)
2293{ 2295{
2294 struct request_queue *q; 2296 struct request_queue *q;
@@ -2301,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2301 int ret = 0; 2303 int ret = 0;
2302 2304
2303 *device_out = NULL; 2305 *device_out = NULL;
2304 if (fs_info->fs_devices->seeding) 2306 if (fs_info->fs_devices->seeding) {
2307 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2305 return -EINVAL; 2308 return -EINVAL;
2309 }
2306 2310
2307 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2311 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2308 fs_info->bdev_holder); 2312 fs_info->bdev_holder);
2309 if (IS_ERR(bdev)) 2313 if (IS_ERR(bdev)) {
2314 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2310 return PTR_ERR(bdev); 2315 return PTR_ERR(bdev);
2316 }
2311 2317
2312 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2318 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2313 2319
2314 devices = &fs_info->fs_devices->devices; 2320 devices = &fs_info->fs_devices->devices;
2315 list_for_each_entry(device, devices, dev_list) { 2321 list_for_each_entry(device, devices, dev_list) {
2316 if (device->bdev == bdev) { 2322 if (device->bdev == bdev) {
2323 btrfs_err(fs_info, "target device is in the filesystem!");
2317 ret = -EEXIST; 2324 ret = -EEXIST;
2318 goto error; 2325 goto error;
2319 } 2326 }
2320 } 2327 }
2321 2328
2329
2330 if (i_size_read(bdev->bd_inode) <
2331 btrfs_device_get_total_bytes(srcdev)) {
2332 btrfs_err(fs_info, "target device is smaller than source device!");
2333 ret = -EINVAL;
2334 goto error;
2335 }
2336
2337
2322 device = btrfs_alloc_device(NULL, &devid, NULL); 2338 device = btrfs_alloc_device(NULL, &devid, NULL);
2323 if (IS_ERR(device)) { 2339 if (IS_ERR(device)) {
2324 ret = PTR_ERR(device); 2340 ret = PTR_ERR(device);
@@ -2342,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2342 device->io_width = root->sectorsize; 2358 device->io_width = root->sectorsize;
2343 device->io_align = root->sectorsize; 2359 device->io_align = root->sectorsize;
2344 device->sector_size = root->sectorsize; 2360 device->sector_size = root->sectorsize;
2345 device->total_bytes = i_size_read(bdev->bd_inode); 2361 device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2346 device->disk_total_bytes = device->total_bytes; 2362 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2363 device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2364 ASSERT(list_empty(&srcdev->resized_list));
2365 device->commit_total_bytes = srcdev->commit_total_bytes;
2366 device->commit_bytes_used = device->bytes_used;
2347 device->dev_root = fs_info->dev_root; 2367 device->dev_root = fs_info->dev_root;
2348 device->bdev = bdev; 2368 device->bdev = bdev;
2349 device->in_fs_metadata = 1; 2369 device->in_fs_metadata = 1;
@@ -2355,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2355 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2375 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2356 fs_info->fs_devices->num_devices++; 2376 fs_info->fs_devices->num_devices++;
2357 fs_info->fs_devices->open_devices++; 2377 fs_info->fs_devices->open_devices++;
2358 if (device->can_discard)
2359 fs_info->fs_devices->num_can_discard++;
2360 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2378 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2361 2379
2362 *device_out = device; 2380 *device_out = device;
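The new target-device validation is worth isolating: the replace target must be at least as large as the source device, where previously the target's own size was used unchecked:

	if (i_size_read(bdev->bd_inode) <
	    btrfs_device_get_total_bytes(srcdev)) {
		btrfs_err(fs_info,
			  "target device is smaller than source device!");
		ret = -EINVAL;
		goto error;
	}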
@@ -2415,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2415 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2433 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2416 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2434 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2417 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2435 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2418 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2436 btrfs_set_device_total_bytes(leaf, dev_item,
2419 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2437 btrfs_device_get_disk_total_bytes(device));
2438 btrfs_set_device_bytes_used(leaf, dev_item,
2439 btrfs_device_get_bytes_used(device));
2420 btrfs_mark_buffer_dirty(leaf); 2440 btrfs_mark_buffer_dirty(leaf);
2421 2441
2422out: 2442out:
@@ -2424,40 +2444,44 @@ out:
2424 return ret; 2444 return ret;
2425} 2445}
2426 2446
2427static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2447int btrfs_grow_device(struct btrfs_trans_handle *trans,
2428 struct btrfs_device *device, u64 new_size) 2448 struct btrfs_device *device, u64 new_size)
2429{ 2449{
2430 struct btrfs_super_block *super_copy = 2450 struct btrfs_super_block *super_copy =
2431 device->dev_root->fs_info->super_copy; 2451 device->dev_root->fs_info->super_copy;
2432 u64 old_total = btrfs_super_total_bytes(super_copy); 2452 struct btrfs_fs_devices *fs_devices;
2433 u64 diff = new_size - device->total_bytes; 2453 u64 old_total;
2454 u64 diff;
2434 2455
2435 if (!device->writeable) 2456 if (!device->writeable)
2436 return -EACCES; 2457 return -EACCES;
2458
2459 lock_chunks(device->dev_root);
2460 old_total = btrfs_super_total_bytes(super_copy);
2461 diff = new_size - device->total_bytes;
2462
2437 if (new_size <= device->total_bytes || 2463 if (new_size <= device->total_bytes ||
2438 device->is_tgtdev_for_dev_replace) 2464 device->is_tgtdev_for_dev_replace) {
2465 unlock_chunks(device->dev_root);
2439 return -EINVAL; 2466 return -EINVAL;
2467 }
2468
2469 fs_devices = device->dev_root->fs_info->fs_devices;
2440 2470
2441 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2471 btrfs_set_super_total_bytes(super_copy, old_total + diff);
2442 device->fs_devices->total_rw_bytes += diff; 2472 device->fs_devices->total_rw_bytes += diff;
2443 2473
2444 device->total_bytes = new_size; 2474 btrfs_device_set_total_bytes(device, new_size);
2445 device->disk_total_bytes = new_size; 2475 btrfs_device_set_disk_total_bytes(device, new_size);
2446 btrfs_clear_space_info_full(device->dev_root->fs_info); 2476 btrfs_clear_space_info_full(device->dev_root->fs_info);
2477 if (list_empty(&device->resized_list))
2478 list_add_tail(&device->resized_list,
2479 &fs_devices->resized_devices);
2480 unlock_chunks(device->dev_root);
2447 2481
2448 return btrfs_update_device(trans, device); 2482 return btrfs_update_device(trans, device);
2449} 2483}
2450 2484
2451int btrfs_grow_device(struct btrfs_trans_handle *trans,
2452 struct btrfs_device *device, u64 new_size)
2453{
2454 int ret;
2455 lock_chunks(device->dev_root);
2456 ret = __btrfs_grow_device(trans, device, new_size);
2457 unlock_chunks(device->dev_root);
2458 return ret;
2459}
2460
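btrfs_grow_device() absorbs its __btrfs_grow_device() helper and takes lock_chunks() itself; inside the locked section both size fields go through the new accessors and the device is queued on resized_devices so the commit path can persist the change. Condensed:

	lock_chunks(device->dev_root);
	btrfs_set_super_total_bytes(super_copy, old_total + diff);
	device->fs_devices->total_rw_bytes += diff;
	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->dev_root->fs_info);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_devices->resized_devices);
	unlock_chunks(device->dev_root);

	return btrfs_update_device(trans, device);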
2461static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2485static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2462 struct btrfs_root *root, 2486 struct btrfs_root *root,
2463 u64 chunk_tree, u64 chunk_objectid, 2487 u64 chunk_tree, u64 chunk_objectid,
@@ -2509,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2509 u32 cur; 2533 u32 cur;
2510 struct btrfs_key key; 2534 struct btrfs_key key;
2511 2535
2536 lock_chunks(root);
2512 array_size = btrfs_super_sys_array_size(super_copy); 2537 array_size = btrfs_super_sys_array_size(super_copy);
2513 2538
2514 ptr = super_copy->sys_chunk_array; 2539 ptr = super_copy->sys_chunk_array;
@@ -2538,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2538 cur += len; 2563 cur += len;
2539 } 2564 }
2540 } 2565 }
2566 unlock_chunks(root);
2541 return ret; 2567 return ret;
2542} 2568}
2543 2569
2544static int btrfs_relocate_chunk(struct btrfs_root *root, 2570int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2545 u64 chunk_tree, u64 chunk_objectid, 2571 struct btrfs_root *root, u64 chunk_offset)
2546 u64 chunk_offset)
2547{ 2572{
2548 struct extent_map_tree *em_tree; 2573 struct extent_map_tree *em_tree;
2549 struct btrfs_root *extent_root;
2550 struct btrfs_trans_handle *trans;
2551 struct extent_map *em; 2574 struct extent_map *em;
2575 struct btrfs_root *extent_root = root->fs_info->extent_root;
2552 struct map_lookup *map; 2576 struct map_lookup *map;
2553 int ret; 2577 u64 dev_extent_len = 0;
2554 int i; 2578 u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2579 u64 chunk_tree = root->fs_info->chunk_root->objectid;
2580 int i, ret = 0;
2555 2581
2582 /* Just in case */
2556 root = root->fs_info->chunk_root; 2583 root = root->fs_info->chunk_root;
2557 extent_root = root->fs_info->extent_root;
2558 em_tree = &root->fs_info->mapping_tree.map_tree; 2584 em_tree = &root->fs_info->mapping_tree.map_tree;
2559 2585
2560 ret = btrfs_can_relocate(extent_root, chunk_offset);
2561 if (ret)
2562 return -ENOSPC;
2563
2564 /* step one, relocate all the extents inside this chunk */
2565 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2566 if (ret)
2567 return ret;
2568
2569 trans = btrfs_start_transaction(root, 0);
2570 if (IS_ERR(trans)) {
2571 ret = PTR_ERR(trans);
2572 btrfs_std_error(root->fs_info, ret);
2573 return ret;
2574 }
2575
2576 lock_chunks(root);
2577
2578 /*
2579 * step two, delete the device extents and the
2580 * chunk tree entries
2581 */
2582 read_lock(&em_tree->lock); 2586 read_lock(&em_tree->lock);
2583 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2587 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2584 read_unlock(&em_tree->lock); 2588 read_unlock(&em_tree->lock);
2585 2589
2586 BUG_ON(!em || em->start > chunk_offset || 2590 if (!em || em->start > chunk_offset ||
2587 em->start + em->len < chunk_offset); 2591 em->start + em->len < chunk_offset) {
2592 /*
2593 * This is a logic error, but we don't want to just rely on the
2594 * user having built with ASSERT enabled, so if ASSERT doesn't
2595 * do anything we still error out.
2596 */
2597 ASSERT(0);
2598 if (em)
2599 free_extent_map(em);
2600 return -EINVAL;
2601 }
2588 map = (struct map_lookup *)em->bdev; 2602 map = (struct map_lookup *)em->bdev;
2589 2603
2590 for (i = 0; i < map->num_stripes; i++) { 2604 for (i = 0; i < map->num_stripes; i++) {
2591 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2605 struct btrfs_device *device = map->stripes[i].dev;
2592 map->stripes[i].physical); 2606 ret = btrfs_free_dev_extent(trans, device,
2593 BUG_ON(ret); 2607 map->stripes[i].physical,
2608 &dev_extent_len);
2609 if (ret) {
2610 btrfs_abort_transaction(trans, root, ret);
2611 goto out;
2612 }
2613
2614 if (device->bytes_used > 0) {
2615 lock_chunks(root);
2616 btrfs_device_set_bytes_used(device,
2617 device->bytes_used - dev_extent_len);
2618 spin_lock(&root->fs_info->free_chunk_lock);
2619 root->fs_info->free_chunk_space += dev_extent_len;
2620 spin_unlock(&root->fs_info->free_chunk_lock);
2621 btrfs_clear_space_info_full(root->fs_info);
2622 unlock_chunks(root);
2623 }
2594 2624
2595 if (map->stripes[i].dev) { 2625 if (map->stripes[i].dev) {
2596 ret = btrfs_update_device(trans, map->stripes[i].dev); 2626 ret = btrfs_update_device(trans, map->stripes[i].dev);
2597 BUG_ON(ret); 2627 if (ret) {
2628 btrfs_abort_transaction(trans, root, ret);
2629 goto out;
2630 }
2598 } 2631 }
2599 } 2632 }
2600 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2633 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2601 chunk_offset); 2634 chunk_offset);
2602 2635 if (ret) {
2603 BUG_ON(ret); 2636 btrfs_abort_transaction(trans, root, ret);
2637 goto out;
2638 }
2604 2639
2605 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2640 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2606 2641
2607 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2642 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2608 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2643 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2609 BUG_ON(ret); 2644 if (ret) {
2645 btrfs_abort_transaction(trans, root, ret);
2646 goto out;
2647 }
2610 } 2648 }
2611 2649
2612 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2613 BUG_ON(ret); 2651 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out;
2654 }
2614 2655
2615 write_lock(&em_tree->lock); 2656 write_lock(&em_tree->lock);
2616 remove_extent_mapping(em_tree, em); 2657 remove_extent_mapping(em_tree, em);
@@ -2618,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
2618 2659
2619 /* once for the tree */ 2660 /* once for the tree */
2620 free_extent_map(em); 2661 free_extent_map(em);
2662out:
2621 /* once for us */ 2663 /* once for us */
2622 free_extent_map(em); 2664 free_extent_map(em);
2665 return ret;
2666}
2623 2667
2624 unlock_chunks(root); 2668static int btrfs_relocate_chunk(struct btrfs_root *root,
2669 u64 chunk_tree, u64 chunk_objectid,
2670 u64 chunk_offset)
2671{
2672 struct btrfs_root *extent_root;
2673 struct btrfs_trans_handle *trans;
2674 int ret;
2675
2676 root = root->fs_info->chunk_root;
2677 extent_root = root->fs_info->extent_root;
2678
2679 ret = btrfs_can_relocate(extent_root, chunk_offset);
2680 if (ret)
2681 return -ENOSPC;
2682
2683 /* step one, relocate all the extents inside this chunk */
2684 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2685 if (ret)
2686 return ret;
2687
2688 trans = btrfs_start_transaction(root, 0);
2689 if (IS_ERR(trans)) {
2690 ret = PTR_ERR(trans);
2691 btrfs_std_error(root->fs_info, ret);
2692 return ret;
2693 }
2694
2695 /*
2696 * step two, delete the device extents and the
2697 * chunk tree entries
2698 */
2699 ret = btrfs_remove_chunk(trans, root, chunk_offset);
2625 btrfs_end_transaction(trans, root); 2700 btrfs_end_transaction(trans, root);
2626 return 0; 2701 return ret;
2627} 2702}
2628 2703
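With chunk removal factored into the transaction-scoped btrfs_remove_chunk(), relocation reduces to three steps: check, relocate the block group, then remove the chunk inside one transaction. Condensed from the rewritten function:

	ret = btrfs_can_relocate(extent_root, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_std_error(root->fs_info, PTR_ERR(trans));
		return PTR_ERR(trans);
	}

	/* step two, delete the device extents and chunk tree entries */
	ret = btrfs_remove_chunk(trans, root, chunk_offset);
	btrfs_end_transaction(trans, root);
	return ret;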
2629static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2704static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
@@ -2676,8 +2751,8 @@ again:
2676 found_key.offset); 2751 found_key.offset);
2677 if (ret == -ENOSPC) 2752 if (ret == -ENOSPC)
2678 failed++; 2753 failed++;
2679 else if (ret) 2754 else
2680 BUG(); 2755 BUG_ON(ret);
2681 } 2756 }
2682 2757
2683 if (found_key.offset == 0) 2758 if (found_key.offset == 0)
@@ -3084,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3084 /* step one make some room on all the devices */ 3159 /* step one make some room on all the devices */
3085 devices = &fs_info->fs_devices->devices; 3160 devices = &fs_info->fs_devices->devices;
3086 list_for_each_entry(device, devices, dev_list) { 3161 list_for_each_entry(device, devices, dev_list) {
3087 old_size = device->total_bytes; 3162 old_size = btrfs_device_get_total_bytes(device);
3088 size_to_free = div_factor(old_size, 1); 3163 size_to_free = div_factor(old_size, 1);
3089 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3164 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3090 if (!device->writeable || 3165 if (!device->writeable ||
3091 device->total_bytes - device->bytes_used > size_to_free || 3166 btrfs_device_get_total_bytes(device) -
3167 btrfs_device_get_bytes_used(device) > size_to_free ||
3092 device->is_tgtdev_for_dev_replace) 3168 device->is_tgtdev_for_dev_replace)
3093 continue; 3169 continue;
3094 3170
@@ -3643,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
3643 max_key.type = BTRFS_ROOT_ITEM_KEY; 3719 max_key.type = BTRFS_ROOT_ITEM_KEY;
3644 max_key.offset = (u64)-1; 3720 max_key.offset = (u64)-1;
3645 3721
3646 path->keep_locks = 1;
3647
3648 while (1) { 3722 while (1) {
3649 ret = btrfs_search_forward(root, &key, path, 0); 3723 ret = btrfs_search_forward(root, &key, path, 0);
3650 if (ret) { 3724 if (ret) {
@@ -3896,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3896 struct btrfs_key key; 3970 struct btrfs_key key;
3897 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3971 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3898 u64 old_total = btrfs_super_total_bytes(super_copy); 3972 u64 old_total = btrfs_super_total_bytes(super_copy);
3899 u64 old_size = device->total_bytes; 3973 u64 old_size = btrfs_device_get_total_bytes(device);
3900 u64 diff = device->total_bytes - new_size; 3974 u64 diff = old_size - new_size;
3901 3975
3902 if (device->is_tgtdev_for_dev_replace) 3976 if (device->is_tgtdev_for_dev_replace)
3903 return -EINVAL; 3977 return -EINVAL;
@@ -3910,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3910 3984
3911 lock_chunks(root); 3985 lock_chunks(root);
3912 3986
3913 device->total_bytes = new_size; 3987 btrfs_device_set_total_bytes(device, new_size);
3914 if (device->writeable) { 3988 if (device->writeable) {
3915 device->fs_devices->total_rw_bytes -= diff; 3989 device->fs_devices->total_rw_bytes -= diff;
3916 spin_lock(&root->fs_info->free_chunk_lock); 3990 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3976,7 +4050,7 @@ again:
3976 ret = -ENOSPC; 4050 ret = -ENOSPC;
3977 lock_chunks(root); 4051 lock_chunks(root);
3978 4052
3979 device->total_bytes = old_size; 4053 btrfs_device_set_total_bytes(device, old_size);
3980 if (device->writeable) 4054 if (device->writeable)
3981 device->fs_devices->total_rw_bytes += diff; 4055 device->fs_devices->total_rw_bytes += diff;
3982 spin_lock(&root->fs_info->free_chunk_lock); 4056 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3994,18 +4068,17 @@ again:
3994 } 4068 }
3995 4069
3996 lock_chunks(root); 4070 lock_chunks(root);
4071 btrfs_device_set_disk_total_bytes(device, new_size);
4072 if (list_empty(&device->resized_list))
4073 list_add_tail(&device->resized_list,
4074 &root->fs_info->fs_devices->resized_devices);
3997 4075
3998 device->disk_total_bytes = new_size;
3999 /* Now btrfs_update_device() will change the on-disk size. */
4000 ret = btrfs_update_device(trans, device);
4001 if (ret) {
4002 unlock_chunks(root);
4003 btrfs_end_transaction(trans, root);
4004 goto done;
4005 }
4006 WARN_ON(diff > old_total); 4076 WARN_ON(diff > old_total);
4007 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4077 btrfs_set_super_total_bytes(super_copy, old_total - diff);
4008 unlock_chunks(root); 4078 unlock_chunks(root);
4079
4080 /* Now btrfs_update_device() will change the on-disk size. */
4081 ret = btrfs_update_device(trans, device);
4009 btrfs_end_transaction(trans, root); 4082 btrfs_end_transaction(trans, root);
4010done: 4083done:
4011 btrfs_free_path(path); 4084 btrfs_free_path(path);
@@ -4021,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
4021 u32 array_size; 4094 u32 array_size;
4022 u8 *ptr; 4095 u8 *ptr;
4023 4096
4097 lock_chunks(root);
4024 array_size = btrfs_super_sys_array_size(super_copy); 4098 array_size = btrfs_super_sys_array_size(super_copy);
4025 if (array_size + item_size + sizeof(disk_key) 4099 if (array_size + item_size + sizeof(disk_key)
4026 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4100 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4101 unlock_chunks(root);
4027 return -EFBIG; 4102 return -EFBIG;
4103 }
4028 4104
4029 ptr = super_copy->sys_chunk_array + array_size; 4105 ptr = super_copy->sys_chunk_array + array_size;
4030 btrfs_cpu_key_to_disk(&disk_key, key); 4106 btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4033,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
4033 memcpy(ptr, chunk, item_size); 4109 memcpy(ptr, chunk, item_size);
4034 item_size += sizeof(disk_key); 4110 item_size += sizeof(disk_key);
4035 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4111 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4112 unlock_chunks(root);
4113
4036 return 0; 4114 return 0;
4037} 4115}
4038 4116
@@ -4402,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4402 if (ret) 4480 if (ret)
4403 goto error_del_extent; 4481 goto error_del_extent;
4404 4482
4483 for (i = 0; i < map->num_stripes; i++) {
4484 num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4485 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4486 }
4487
4488 spin_lock(&extent_root->fs_info->free_chunk_lock);
4489 extent_root->fs_info->free_chunk_space -= (stripe_size *
4490 map->num_stripes);
4491 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4492
4405 free_extent_map(em); 4493 free_extent_map(em);
4406 check_raid56_incompat_flag(extent_root->fs_info, type); 4494 check_raid56_incompat_flag(extent_root->fs_info, type);
4407 4495
@@ -4473,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4473 device = map->stripes[i].dev; 4561 device = map->stripes[i].dev;
4474 dev_offset = map->stripes[i].physical; 4562 dev_offset = map->stripes[i].physical;
4475 4563
4476 device->bytes_used += stripe_size;
4477 ret = btrfs_update_device(trans, device); 4564 ret = btrfs_update_device(trans, device);
4478 if (ret) 4565 if (ret)
4479 goto out; 4566 goto out;
@@ -4486,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4486 goto out; 4573 goto out;
4487 } 4574 }
4488 4575
4489 spin_lock(&extent_root->fs_info->free_chunk_lock);
4490 extent_root->fs_info->free_chunk_space -= (stripe_size *
4491 map->num_stripes);
4492 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4493
4494 stripe = &chunk->stripe; 4576 stripe = &chunk->stripe;
4495 for (i = 0; i < map->num_stripes; i++) { 4577 for (i = 0; i < map->num_stripes; i++) {
4496 device = map->stripes[i].dev; 4578 device = map->stripes[i].dev;
@@ -4570,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4570 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4652 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4571 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4653 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4572 alloc_profile); 4654 alloc_profile);
4573 if (ret) { 4655 return ret;
4574 btrfs_abort_transaction(trans, root, ret); 4656}
4575 goto out; 4657
4658static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4659{
4660 int max_errors;
4661
4662 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4663 BTRFS_BLOCK_GROUP_RAID10 |
4664 BTRFS_BLOCK_GROUP_RAID5 |
4665 BTRFS_BLOCK_GROUP_DUP)) {
4666 max_errors = 1;
4667 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4668 max_errors = 2;
4669 } else {
4670 max_errors = 0;
4576 } 4671 }
4577 4672
4578 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4673 return max_errors;
4579 if (ret)
4580 btrfs_abort_transaction(trans, root, ret);
4581out:
4582 return ret;
4583} 4674}
4584 4675
4585int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4676int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
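btrfs_chunk_max_errors() above factors out the per-profile write tolerance that __btrfs_map_block() used to open-code: RAID1, RAID10, RAID5 and DUP keep one redundant copy (or one parity stripe) and so tolerate a single failed write, RAID6 tolerates two, and everything else none. A minimal userspace sketch of the same decision, with illustrative stand-ins for the BTRFS_BLOCK_GROUP_* bits:

    #include <stdio.h>

    /* Illustrative stand-ins for the BTRFS_BLOCK_GROUP_* profile bits. */
    #define BG_RAID1  (1ULL << 0)
    #define BG_RAID10 (1ULL << 1)
    #define BG_RAID5  (1ULL << 2)
    #define BG_RAID6  (1ULL << 3)
    #define BG_DUP    (1ULL << 4)

    static int chunk_max_errors(unsigned long long type)
    {
        if (type & (BG_RAID1 | BG_RAID10 | BG_RAID5 | BG_DUP))
            return 1;   /* one redundant copy / one parity stripe */
        if (type & BG_RAID6)
            return 2;   /* two parity stripes */
        return 0;       /* RAID0/single: no tolerance */
    }

    int main(void)
    {
        printf("raid6 tolerates %d errors\n", chunk_max_errors(BG_RAID6));
        return 0;
    }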
@@ -4588,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4588 struct map_lookup *map; 4679 struct map_lookup *map;
4589 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4680 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4590 int readonly = 0; 4681 int readonly = 0;
4682 int miss_ndevs = 0;
4591 int i; 4683 int i;
4592 4684
4593 read_lock(&map_tree->map_tree.lock); 4685 read_lock(&map_tree->map_tree.lock);
@@ -4596,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4596 if (!em) 4688 if (!em)
4597 return 1; 4689 return 1;
4598 4690
4599 if (btrfs_test_opt(root, DEGRADED)) {
4600 free_extent_map(em);
4601 return 0;
4602 }
4603
4604 map = (struct map_lookup *)em->bdev; 4691 map = (struct map_lookup *)em->bdev;
4605 for (i = 0; i < map->num_stripes; i++) { 4692 for (i = 0; i < map->num_stripes; i++) {
4693 if (map->stripes[i].dev->missing) {
4694 miss_ndevs++;
4695 continue;
4696 }
4697
4606 if (!map->stripes[i].dev->writeable) { 4698 if (!map->stripes[i].dev->writeable) {
4607 readonly = 1; 4699 readonly = 1;
4608 break; 4700 goto end;
4609 } 4701 }
4610 } 4702 }
4703
4704 /*
4705 * If the number of missing devices is larger than max errors,
4706 * we cannot write the data into that chunk successfully, so
4707 * set it readonly.
4708 */
4709 if (miss_ndevs > btrfs_chunk_max_errors(map))
4710 readonly = 1;
4711end:
4611 free_extent_map(em); 4712 free_extent_map(em);
4612 return readonly; 4713 return readonly;
4613} 4714}
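With the helper in place, btrfs_chunk_readonly() no longer goes writable just because DEGRADED is set: missing devices are counted per chunk, and the chunk stays writable only if that count is within the profile's tolerance. A sketch of the decision, assuming max_errors comes from a helper like the one above (struct stripe here is illustrative):

    struct stripe { int missing; int writeable; };

    /* Return 1 if the chunk must be treated as read-only. */
    static int chunk_readonly(const struct stripe *stripes, int n,
                              int max_errors)
    {
        int i, miss = 0;

        for (i = 0; i < n; i++) {
            if (stripes[i].missing) {
                miss++;        /* tolerated, up to max_errors */
                continue;
            }
            if (!stripes[i].writeable)
                return 1;      /* a present but read-only device */
        }
        return miss > max_errors;
    }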
@@ -5008,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5008 num_stripes = min_t(u64, map->num_stripes, 5109 num_stripes = min_t(u64, map->num_stripes,
5009 stripe_nr_end - stripe_nr_orig); 5110 stripe_nr_end - stripe_nr_orig);
5010 stripe_index = do_div(stripe_nr, map->num_stripes); 5111 stripe_index = do_div(stripe_nr, map->num_stripes);
5112 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5113 mirror_num = 1;
5011 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5114 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5012 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 5115 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
5013 num_stripes = map->num_stripes; 5116 num_stripes = map->num_stripes;
@@ -5111,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5111 /* We distribute the parity blocks across stripes */ 5214 /* We distribute the parity blocks across stripes */
5112 tmp = stripe_nr + stripe_index; 5215 tmp = stripe_nr + stripe_index;
5113 stripe_index = do_div(tmp, map->num_stripes); 5216 stripe_index = do_div(tmp, map->num_stripes);
5217 if (!(rw & (REQ_WRITE | REQ_DISCARD |
5218 REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5219 mirror_num = 1;
5114 } 5220 }
5115 } else { 5221 } else {
5116 /* 5222 /*
@@ -5218,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5218 } 5324 }
5219 } 5325 }
5220 5326
5221 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5222 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5328 max_errors = btrfs_chunk_max_errors(map);
5223 BTRFS_BLOCK_GROUP_RAID10 |
5224 BTRFS_BLOCK_GROUP_RAID5 |
5225 BTRFS_BLOCK_GROUP_DUP)) {
5226 max_errors = 1;
5227 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5228 max_errors = 2;
5229 }
5230 }
5231 5329
5232 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5233 dev_replace->tgtdev != NULL) { 5331 dev_replace->tgtdev != NULL) {
@@ -5610,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5610 name = rcu_dereference(dev->name); 5708 name = rcu_dereference(dev->name);
5611 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5709 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5612 "(%s id %llu), size=%u\n", rw, 5710 "(%s id %llu), size=%u\n", rw,
5613 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5711 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5614 name->str, dev->devid, bio->bi_size); 5712 name->str, dev->devid, bio->bi_iter.bi_size);
5615 rcu_read_unlock(); 5713 rcu_read_unlock();
5616 } 5714 }
5617#endif 5715#endif
@@ -5789,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5789} 5887}
5790 5888
5791static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5889static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5890 struct btrfs_fs_devices *fs_devices,
5792 u64 devid, u8 *dev_uuid) 5891 u64 devid, u8 *dev_uuid)
5793{ 5892{
5794 struct btrfs_device *device; 5893 struct btrfs_device *device;
5795 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5796 5894
5797 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5895 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5798 if (IS_ERR(device)) 5896 if (IS_ERR(device))
@@ -5929,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5929 } 6027 }
5930 if (!map->stripes[i].dev) { 6028 if (!map->stripes[i].dev) {
5931 map->stripes[i].dev = 6029 map->stripes[i].dev =
5932 add_missing_dev(root, devid, uuid); 6030 add_missing_dev(root, root->fs_info->fs_devices,
6031 devid, uuid);
5933 if (!map->stripes[i].dev) { 6032 if (!map->stripes[i].dev) {
5934 free_extent_map(em); 6033 free_extent_map(em);
5935 return -EIO; 6034 return -EIO;
@@ -5956,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5956 device->devid = btrfs_device_id(leaf, dev_item); 6055 device->devid = btrfs_device_id(leaf, dev_item);
5957 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6056 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5958 device->total_bytes = device->disk_total_bytes; 6057 device->total_bytes = device->disk_total_bytes;
6058 device->commit_total_bytes = device->disk_total_bytes;
5959 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6059 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6060 device->commit_bytes_used = device->bytes_used;
5960 device->type = btrfs_device_type(leaf, dev_item); 6061 device->type = btrfs_device_type(leaf, dev_item);
5961 device->io_align = btrfs_device_io_align(leaf, dev_item); 6062 device->io_align = btrfs_device_io_align(leaf, dev_item);
5962 device->io_width = btrfs_device_io_width(leaf, dev_item); 6063 device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5968,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5968 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6069 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5969} 6070}
5970 6071
5971static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 6072static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6073 u8 *fsid)
5972{ 6074{
5973 struct btrfs_fs_devices *fs_devices; 6075 struct btrfs_fs_devices *fs_devices;
5974 int ret; 6076 int ret;
@@ -5977,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5977 6079
5978 fs_devices = root->fs_info->fs_devices->seed; 6080 fs_devices = root->fs_info->fs_devices->seed;
5979 while (fs_devices) { 6081 while (fs_devices) {
5980 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 6082 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
5981 ret = 0; 6083 return fs_devices;
5982 goto out; 6084
5983 }
5984 fs_devices = fs_devices->seed; 6085 fs_devices = fs_devices->seed;
5985 } 6086 }
5986 6087
5987 fs_devices = find_fsid(fsid); 6088 fs_devices = find_fsid(fsid);
5988 if (!fs_devices) { 6089 if (!fs_devices) {
5989 ret = -ENOENT; 6090 if (!btrfs_test_opt(root, DEGRADED))
5990 goto out; 6091 return ERR_PTR(-ENOENT);
6092
6093 fs_devices = alloc_fs_devices(fsid);
6094 if (IS_ERR(fs_devices))
6095 return fs_devices;
6096
6097 fs_devices->seeding = 1;
6098 fs_devices->opened = 1;
6099 return fs_devices;
5991 } 6100 }
5992 6101
5993 fs_devices = clone_fs_devices(fs_devices); 6102 fs_devices = clone_fs_devices(fs_devices);
5994 if (IS_ERR(fs_devices)) { 6103 if (IS_ERR(fs_devices))
5995 ret = PTR_ERR(fs_devices); 6104 return fs_devices;
5996 goto out;
5997 }
5998 6105
5999 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6106 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6000 root->fs_info->bdev_holder); 6107 root->fs_info->bdev_holder);
6001 if (ret) { 6108 if (ret) {
6002 free_fs_devices(fs_devices); 6109 free_fs_devices(fs_devices);
6110 fs_devices = ERR_PTR(ret);
6003 goto out; 6111 goto out;
6004 } 6112 }
6005 6113
6006 if (!fs_devices->seeding) { 6114 if (!fs_devices->seeding) {
6007 __btrfs_close_devices(fs_devices); 6115 __btrfs_close_devices(fs_devices);
6008 free_fs_devices(fs_devices); 6116 free_fs_devices(fs_devices);
6009 ret = -EINVAL; 6117 fs_devices = ERR_PTR(-EINVAL);
6010 goto out; 6118 goto out;
6011 } 6119 }
6012 6120
6013 fs_devices->seed = root->fs_info->fs_devices->seed; 6121 fs_devices->seed = root->fs_info->fs_devices->seed;
6014 root->fs_info->fs_devices->seed = fs_devices; 6122 root->fs_info->fs_devices->seed = fs_devices;
6015out: 6123out:
6016 return ret; 6124 return fs_devices;
6017} 6125}
6018 6126
6019static int read_one_dev(struct btrfs_root *root, 6127static int read_one_dev(struct btrfs_root *root,
6020 struct extent_buffer *leaf, 6128 struct extent_buffer *leaf,
6021 struct btrfs_dev_item *dev_item) 6129 struct btrfs_dev_item *dev_item)
6022{ 6130{
6131 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6023 struct btrfs_device *device; 6132 struct btrfs_device *device;
6024 u64 devid; 6133 u64 devid;
6025 int ret; 6134 int ret;
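open_seed_devices() is converted from returning an int to returning the fs_devices pointer itself, using the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention so a single return value carries either the object or the errno. A userspace model of that convention; the macros below imitate include/linux/err.h and open_seed() is a made-up stand-in:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095
    #define ERR_PTR(err) ((void *)(long)(err))
    #define PTR_ERR(ptr) ((long)(ptr))
    #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    struct fs_devices { int seeding; };

    static struct fs_devices *open_seed(int degraded_ok)
    {
        struct fs_devices *fsd;

        if (!degraded_ok)
            return ERR_PTR(-ENOENT);   /* the errno travels in the pointer */
        fsd = calloc(1, sizeof(*fsd));
        if (!fsd)
            return ERR_PTR(-ENOMEM);
        fsd->seeding = 1;
        return fsd;
    }

    int main(void)
    {
        struct fs_devices *fsd = open_seed(0);

        if (IS_ERR(fsd))
            printf("failed: %ld\n", PTR_ERR(fsd));  /* prints -2 (ENOENT) */
        return 0;
    }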
@@ -6033,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
6033 BTRFS_UUID_SIZE); 6142 BTRFS_UUID_SIZE);
6034 6143
6035 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 6144 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
6036 ret = open_seed_devices(root, fs_uuid); 6145 fs_devices = open_seed_devices(root, fs_uuid);
6037 if (ret && !btrfs_test_opt(root, DEGRADED)) 6146 if (IS_ERR(fs_devices))
6038 return ret; 6147 return PTR_ERR(fs_devices);
6039 } 6148 }
6040 6149
6041 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 6150 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
6042 if (!device || !device->bdev) { 6151 if (!device) {
6043 if (!btrfs_test_opt(root, DEGRADED)) 6152 if (!btrfs_test_opt(root, DEGRADED))
6044 return -EIO; 6153 return -EIO;
6045 6154
6046 if (!device) { 6155 btrfs_warn(root->fs_info, "devid %llu missing", devid);
6047 btrfs_warn(root->fs_info, "devid %llu missing", devid); 6156 device = add_missing_dev(root, fs_devices, devid, dev_uuid);
6048 device = add_missing_dev(root, devid, dev_uuid); 6157 if (!device)
6049 if (!device) 6158 return -ENOMEM;
6050 return -ENOMEM; 6159 } else {
6051 } else if (!device->missing) { 6160 if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6161 return -EIO;
6162
6163 if (!device->bdev && !device->missing) {
6052 /* 6164 /*
6053 * this happens when a device that was properly setup 6165 * this happens when a device that was properly setup
6054 * in the device info lists suddenly goes bad. 6166 * in the device info lists suddenly goes bad.
6055 * device->bdev is NULL, and so we have to set 6167 * device->bdev is NULL, and so we have to set
6056 * device->missing to one here 6168 * device->missing to one here
6057 */ 6169 */
6058 root->fs_info->fs_devices->missing_devices++; 6170 device->fs_devices->missing_devices++;
6059 device->missing = 1; 6171 device->missing = 1;
6060 } 6172 }
6173
6174 /* Move the device to its own fs_devices */
6175 if (device->fs_devices != fs_devices) {
6176 ASSERT(device->missing);
6177
6178 list_move(&device->dev_list, &fs_devices->devices);
6179 device->fs_devices->num_devices--;
6180 fs_devices->num_devices++;
6181
6182 device->fs_devices->missing_devices--;
6183 fs_devices->missing_devices++;
6184
6185 device->fs_devices = fs_devices;
6186 }
6061 } 6187 }
6062 6188
6063 if (device->fs_devices != root->fs_info->fs_devices) { 6189 if (device->fs_devices != root->fs_info->fs_devices) {
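When a stripe references a device that is currently attached to the wrong fs_devices (possible with seed filesystems read in DEGRADED mode), read_one_dev() now re-parents it with list_move() and keeps num_devices/missing_devices consistent on both sides. A sketch of just the counter bookkeeping, with an illustrative struct in place of the real ones:

    /* Illustrative: mirrors the counter handling, not the real structures. */
    struct devlist { int num_devices; int missing_devices; };

    static void move_device(struct devlist *from, struct devlist *to,
                            int device_is_missing)
    {
        /* list_move(&device->dev_list, &to->devices) happens here */
        from->num_devices--;
        to->num_devices++;
        if (device_is_missing) {       /* the kernel asserts this case */
            from->missing_devices--;
            to->missing_devices++;
        }
    }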
@@ -6373,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6373 struct btrfs_root *dev_root = fs_info->dev_root; 6499 struct btrfs_root *dev_root = fs_info->dev_root;
6374 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6500 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6375 struct btrfs_device *device; 6501 struct btrfs_device *device;
6502 int stats_cnt;
6376 int ret = 0; 6503 int ret = 0;
6377 6504
6378 mutex_lock(&fs_devices->device_list_mutex); 6505 mutex_lock(&fs_devices->device_list_mutex);
6379 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6506 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6380 if (!device->dev_stats_valid || !device->dev_stats_dirty) 6507 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6381 continue; 6508 continue;
6382 6509
6510 stats_cnt = atomic_read(&device->dev_stats_ccnt);
6383 ret = update_dev_stat_item(trans, dev_root, device); 6511 ret = update_dev_stat_item(trans, dev_root, device);
6384 if (!ret) 6512 if (!ret)
6385 device->dev_stats_dirty = 0; 6513 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6386 } 6514 }
6387 mutex_unlock(&fs_devices->device_list_mutex); 6515 mutex_unlock(&fs_devices->device_list_mutex);
6388 6516
@@ -6481,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
6481 6609
6482 return 0; 6610 return 0;
6483} 6611}
6612
6613/*
6614 * Update the size of all devices, which is used for writing out the
6615 * super blocks.
6616 */
6617void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6618{
6619 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6620 struct btrfs_device *curr, *next;
6621
6622 if (list_empty(&fs_devices->resized_devices))
6623 return;
6624
6625 mutex_lock(&fs_devices->device_list_mutex);
6626 lock_chunks(fs_info->dev_root);
6627 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6628 resized_list) {
6629 list_del_init(&curr->resized_list);
6630 curr->commit_total_bytes = curr->disk_total_bytes;
6631 }
6632 unlock_chunks(fs_info->dev_root);
6633 mutex_unlock(&fs_devices->device_list_mutex);
6634}
6635
6636/* Must be invoked during the transaction commit */
6637void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6638 struct btrfs_transaction *transaction)
6639{
6640 struct extent_map *em;
6641 struct map_lookup *map;
6642 struct btrfs_device *dev;
6643 int i;
6644
6645 if (list_empty(&transaction->pending_chunks))
6646 return;
6647
6648 /* In order to kick the device replace finish process */
6649 lock_chunks(root);
6650 list_for_each_entry(em, &transaction->pending_chunks, list) {
6651 map = (struct map_lookup *)em->bdev;
6652
6653 for (i = 0; i < map->num_stripes; i++) {
6654 dev = map->stripes[i].dev;
6655 dev->commit_bytes_used = dev->bytes_used;
6656 }
6657 }
6658 unlock_chunks(root);
6659}
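btrfs_update_commit_device_size() drains resized_devices at commit time, so commit_total_bytes (the value the super block write uses) only moves forward when a transaction actually commits, never while a resize is still in flight. A minimal sketch of the drain, with a singly linked list standing in for the kernel's list_head (assume the caller holds the equivalent of device_list_mutex and chunk_mutex):

    #include <stddef.h>

    struct device {
        unsigned long long disk_total_bytes;
        unsigned long long commit_total_bytes;
        struct device *resized_next;   /* stand-in for resized_list */
    };

    static void update_commit_device_size(struct device **resized_head)
    {
        struct device *curr;

        while ((curr = *resized_head) != NULL) {
            *resized_head = curr->resized_next;
            curr->resized_next = NULL;  /* list_del_init() equivalent */
            curr->commit_total_bytes = curr->disk_total_bytes;
        }
    }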
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c47816..08980fa23039 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
24#include <linux/btrfs.h> 24#include <linux/btrfs.h>
25#include "async-thread.h" 25#include "async-thread.h"
26 26
27extern struct mutex uuid_mutex;
28
27#define BTRFS_STRIPE_LEN (64 * 1024) 29#define BTRFS_STRIPE_LEN (64 * 1024)
28 30
29struct buffer_head; 31struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
32 struct bio *tail; 34 struct bio *tail;
33}; 35};
34 36
37/*
38 * Use sequence counter to get consistent device stat data on
39 * 32-bit processors.
40 */
41#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
42#include <linux/seqlock.h>
43#define __BTRFS_NEED_DEVICE_DATA_ORDERED
44#define btrfs_device_data_ordered_init(device) \
45 seqcount_init(&device->data_seqcount)
46#else
47#define btrfs_device_data_ordered_init(device) do { } while (0)
48#endif
49
35struct btrfs_device { 50struct btrfs_device {
36 struct list_head dev_list; 51 struct list_head dev_list;
37 struct list_head dev_alloc_list; 52 struct list_head dev_alloc_list;
38 struct btrfs_fs_devices *fs_devices; 53 struct btrfs_fs_devices *fs_devices;
54
39 struct btrfs_root *dev_root; 55 struct btrfs_root *dev_root;
40 56
57 struct rcu_string *name;
58
59 u64 generation;
60
61 spinlock_t io_lock ____cacheline_aligned;
62 int running_pending;
41 /* regular prio bios */ 63 /* regular prio bios */
42 struct btrfs_pending_bios pending_bios; 64 struct btrfs_pending_bios pending_bios;
43 /* WRITE_SYNC bios */ 65 /* WRITE_SYNC bios */
44 struct btrfs_pending_bios pending_sync_bios; 66 struct btrfs_pending_bios pending_sync_bios;
45 67
46 u64 generation; 68 struct block_device *bdev;
47 int running_pending; 69
70 /* the mode sent to blkdev_get */
71 fmode_t mode;
72
48 int writeable; 73 int writeable;
49 int in_fs_metadata; 74 int in_fs_metadata;
50 int missing; 75 int missing;
51 int can_discard; 76 int can_discard;
52 int is_tgtdev_for_dev_replace; 77 int is_tgtdev_for_dev_replace;
53 78
54 spinlock_t io_lock; 79#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
55 /* the mode sent to blkdev_get */ 80 seqcount_t data_seqcount;
56 fmode_t mode; 81#endif
57
58 struct block_device *bdev;
59
60
61 struct rcu_string *name;
62 82
63 /* the internal btrfs device id */ 83 /* the internal btrfs device id */
64 u64 devid; 84 u64 devid;
65 85
66 /* size of the device */ 86 /* size of the device in memory */
67 u64 total_bytes; 87 u64 total_bytes;
68 88
69 /* size of the disk */ 89 /* size of the device on disk */
70 u64 disk_total_bytes; 90 u64 disk_total_bytes;
71 91
72 /* bytes used */ 92 /* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
83 /* minimal io size for this device */ 103 /* minimal io size for this device */
84 u32 sector_size; 104 u32 sector_size;
85 105
86
87 /* physical drive uuid (or lvm uuid) */ 106 /* physical drive uuid (or lvm uuid) */
88 u8 uuid[BTRFS_UUID_SIZE]; 107 u8 uuid[BTRFS_UUID_SIZE];
89 108
109 /*
110 * size of the device on the current transaction
111 *
112 * This variant is updated when committing the transaction,
113 * and is protected by device_list_mutex
114 */
115 u64 commit_total_bytes;
116
117 /* bytes used on the current transaction */
118 u64 commit_bytes_used;
119 /*
120 * used to track devices that have been resized
121 *
122 * It is protected by chunk_lock.
123 */
124 struct list_head resized_list;
125
90 /* for sending down flush barriers */ 126 /* for sending down flush barriers */
91 int nobarriers; 127 int nobarriers;
92 struct bio *flush_bio; 128 struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
107 struct radix_tree_root reada_zones; 143 struct radix_tree_root reada_zones;
108 struct radix_tree_root reada_extents; 144 struct radix_tree_root reada_extents;
109 145
110
111 /* disk I/O failure stats. For detailed description refer to 146 /* disk I/O failure stats. For detailed description refer to
112 * enum btrfs_dev_stat_values in ioctl.h */ 147 * enum btrfs_dev_stat_values in ioctl.h */
113 int dev_stats_valid; 148 int dev_stats_valid;
114 int dev_stats_dirty; /* counters need to be written to disk */ 149
150 /* Counter to record the change of device stats */
151 atomic_t dev_stats_ccnt;
115 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 152 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
116}; 153};
117 154
155/*
156 * If we read these variants while holding the lock that protects them,
157 * we needn't use the following helpers; reading them directly is safe.
158 */
159#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
160#define BTRFS_DEVICE_GETSET_FUNCS(name) \
161static inline u64 \
162btrfs_device_get_##name(const struct btrfs_device *dev) \
163{ \
164 u64 size; \
165 unsigned int seq; \
166 \
167 do { \
168 seq = read_seqcount_begin(&dev->data_seqcount); \
169 size = dev->name; \
170 } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
171 return size; \
172} \
173 \
174static inline void \
175btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
176{ \
177 preempt_disable(); \
178 write_seqcount_begin(&dev->data_seqcount); \
179 dev->name = size; \
180 write_seqcount_end(&dev->data_seqcount); \
181 preempt_enable(); \
182}
183#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
184#define BTRFS_DEVICE_GETSET_FUNCS(name) \
185static inline u64 \
186btrfs_device_get_##name(const struct btrfs_device *dev) \
187{ \
188 u64 size; \
189 \
190 preempt_disable(); \
191 size = dev->name; \
192 preempt_enable(); \
193 return size; \
194} \
195 \
196static inline void \
197btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
198{ \
199 preempt_disable(); \
200 dev->name = size; \
201 preempt_enable(); \
202}
203#else
204#define BTRFS_DEVICE_GETSET_FUNCS(name) \
205static inline u64 \
206btrfs_device_get_##name(const struct btrfs_device *dev) \
207{ \
208 return dev->name; \
209} \
210 \
211static inline void \
212btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
213{ \
214 dev->name = size; \
215}
216#endif
217
218BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
219BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
220BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
221
118struct btrfs_fs_devices { 222struct btrfs_fs_devices {
119 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 223 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
120 224
121 /* the device with this id has the most recent copy of the super */
122 u64 latest_devid;
123 u64 latest_trans;
124 u64 num_devices; 225 u64 num_devices;
125 u64 open_devices; 226 u64 open_devices;
126 u64 rw_devices; 227 u64 rw_devices;
127 u64 missing_devices; 228 u64 missing_devices;
128 u64 total_rw_bytes; 229 u64 total_rw_bytes;
129 u64 num_can_discard;
130 u64 total_devices; 230 u64 total_devices;
131 struct block_device *latest_bdev; 231 struct block_device *latest_bdev;
132 232
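On 32-bit SMP a u64 store is two machine words, so a concurrent reader of total_bytes can observe a torn value; the BTRFS_DEVICE_GETSET_FUNCS macros wrap each field in a seqcount so readers retry until the count is stable and even. A simplified C11 model of the retry loop; note that a real seqlock also disables preemption around the write and uses proper fencing for the non-atomic data access, which this sketch glosses over:

    #include <stdatomic.h>
    #include <stdint.h>

    static atomic_uint seq;        /* even = stable, odd = write in progress */
    static uint64_t total_bytes;   /* the field being protected */

    static void set_total_bytes(uint64_t v)
    {
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* odd */
        total_bytes = v;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* even */
    }

    static uint64_t get_total_bytes(void)
    {
        unsigned int s;
        uint64_t v;

        do {
            s = atomic_load_explicit(&seq, memory_order_acquire);
            v = total_bytes;               /* may be torn; loop catches it */
        } while ((s & 1) ||
                 atomic_load_explicit(&seq, memory_order_acquire) != s);
        return v;
    }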
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
139 struct mutex device_list_mutex; 239 struct mutex device_list_mutex;
140 struct list_head devices; 240 struct list_head devices;
141 241
242 struct list_head resized_devices;
142 /* devices not currently being allocated */ 243 /* devices not currently being allocated */
143 struct list_head alloc_list; 244 struct list_head alloc_list;
144 struct list_head list; 245 struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
167 */ 268 */
168typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); 269typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
169struct btrfs_io_bio { 270struct btrfs_io_bio {
170 unsigned long mirror_num; 271 unsigned int mirror_num;
171 unsigned long stripe_index; 272 unsigned int stripe_index;
273 u64 logical;
172 u8 *csum; 274 u8 *csum;
173 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; 275 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
174 u8 *csum_allocated; 276 u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
325int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 427int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
326int btrfs_init_new_device(struct btrfs_root *root, char *path); 428int btrfs_init_new_device(struct btrfs_root *root, char *path);
327int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 429int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
430 struct btrfs_device *srcdev,
328 struct btrfs_device **device_out); 431 struct btrfs_device **device_out);
329int btrfs_balance(struct btrfs_balance_control *bctl, 432int btrfs_balance(struct btrfs_balance_control *bctl,
330 struct btrfs_ioctl_balance_args *bargs); 433 struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
360int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 463int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
361 struct btrfs_root *extent_root, 464 struct btrfs_root *extent_root,
362 u64 chunk_offset, u64 chunk_size); 465 u64 chunk_offset, u64 chunk_size);
466int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
467 struct btrfs_root *root, u64 chunk_offset);
468
469static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
470{
471 return atomic_read(&dev->dev_stats_ccnt);
472}
473
363static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 474static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
364 int index) 475 int index)
365{ 476{
366 atomic_inc(dev->dev_stat_values + index); 477 atomic_inc(dev->dev_stat_values + index);
367 dev->dev_stats_dirty = 1; 478 smp_mb__before_atomic();
479 atomic_inc(&dev->dev_stats_ccnt);
368} 480}
369 481
370static inline int btrfs_dev_stat_read(struct btrfs_device *dev, 482static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
379 int ret; 491 int ret;
380 492
381 ret = atomic_xchg(dev->dev_stat_values + index, 0); 493 ret = atomic_xchg(dev->dev_stat_values + index, 0);
382 dev->dev_stats_dirty = 1; 494 smp_mb__before_atomic();
495 atomic_inc(&dev->dev_stats_ccnt);
383 return ret; 496 return ret;
384} 497}
385 498
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
387 int index, unsigned long val) 500 int index, unsigned long val)
388{ 501{
389 atomic_set(dev->dev_stat_values + index, val); 502 atomic_set(dev->dev_stat_values + index, val);
390 dev->dev_stats_dirty = 1; 503 smp_mb__before_atomic();
504 atomic_inc(&dev->dev_stats_ccnt);
391} 505}
392 506
393static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, 507static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
395{ 509{
396 btrfs_dev_stat_set(dev, index, 0); 510 btrfs_dev_stat_set(dev, index, 0);
397} 511}
512
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction);
398#endif 516#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d797ea..dcf20131fbe4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
237 * first xattr that we find and walk forward 237 * first xattr that we find and walk forward
238 */ 238 */
239 key.objectid = btrfs_ino(inode); 239 key.objectid = btrfs_ino(inode);
240 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 240 key.type = BTRFS_XATTR_ITEM_KEY;
241 key.offset = 0; 241 key.offset = 0;
242 242
243 path = btrfs_alloc_path(); 243 path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
273 /* check to make sure this item is what we want */ 273 /* check to make sure this item is what we want */
274 if (found_key.objectid != key.objectid) 274 if (found_key.objectid != key.objectid)
275 break; 275 break;
276 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) 276 if (found_key.type != BTRFS_XATTR_ITEM_KEY)
277 break; 277 break;
278 278
279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc81277..759fa4e2de8f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
33#include "compression.h" 33#include "compression.h"
34 34
35struct workspace { 35struct workspace {
36 z_stream inf_strm; 36 z_stream strm;
37 z_stream def_strm;
38 char *buf; 37 char *buf;
39 struct list_head list; 38 struct list_head list;
40}; 39};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
43{ 42{
44 struct workspace *workspace = list_entry(ws, struct workspace, list); 43 struct workspace *workspace = list_entry(ws, struct workspace, list);
45 44
46 vfree(workspace->def_strm.workspace); 45 vfree(workspace->strm.workspace);
47 vfree(workspace->inf_strm.workspace);
48 kfree(workspace->buf); 46 kfree(workspace->buf);
49 kfree(workspace); 47 kfree(workspace);
50} 48}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
52static struct list_head *zlib_alloc_workspace(void) 50static struct list_head *zlib_alloc_workspace(void)
53{ 51{
54 struct workspace *workspace; 52 struct workspace *workspace;
53 int workspacesize;
55 54
56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 55 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
57 if (!workspace) 56 if (!workspace)
58 return ERR_PTR(-ENOMEM); 57 return ERR_PTR(-ENOMEM);
59 58
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( 59 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
61 MAX_WBITS, MAX_MEM_LEVEL)); 60 zlib_inflate_workspacesize());
62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->strm.workspace = vmalloc(workspacesize);
63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
64 if (!workspace->def_strm.workspace || 63 if (!workspace->strm.workspace || !workspace->buf)
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail; 64 goto fail;
67 65
68 INIT_LIST_HEAD(&workspace->list); 66 INIT_LIST_HEAD(&workspace->list);
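A workspace only ever runs one direction at a time, so keeping separate inflate and deflate streams doubled the vmalloc for no benefit; one z_stream whose workspace buffer is sized max(deflate, inflate) serves both. The same reuse works with ordinary userspace zlib, where each *Init call sets up fresh internal state: a round-trip through a single z_stream:

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
        static const char msg[] = "btrfs zlib workspace reuse";
        unsigned char comp[128], out[128];
        z_stream strm;                 /* one stream for both directions */
        uLong clen;

        memset(&strm, 0, sizeof(strm));        /* NULL allocators = defaults */
        deflateInit(&strm, 3);                 /* same level btrfs uses */
        strm.next_in = (unsigned char *)msg;
        strm.avail_in = sizeof(msg);
        strm.next_out = comp;
        strm.avail_out = sizeof(comp);
        deflate(&strm, Z_FINISH);
        clen = sizeof(comp) - strm.avail_out;
        deflateEnd(&strm);

        memset(&strm, 0, sizeof(strm));        /* reinit, reuse the object */
        inflateInit(&strm);
        strm.next_in = comp;
        strm.avail_in = clen;
        strm.next_out = out;
        strm.avail_out = sizeof(out);
        inflate(&strm, Z_FINISH);
        inflateEnd(&strm);

        printf("%s\n", out);
        return 0;
    }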
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
96 *total_out = 0; 94 *total_out = 0;
97 *total_in = 0; 95 *total_in = 0;
98 96
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 97 if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
100 printk(KERN_WARNING "BTRFS: deflateInit failed\n"); 98 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -EIO; 99 ret = -EIO;
102 goto out; 100 goto out;
103 } 101 }
104 102
105 workspace->def_strm.total_in = 0; 103 workspace->strm.total_in = 0;
106 workspace->def_strm.total_out = 0; 104 workspace->strm.total_out = 0;
107 105
108 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 106 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
109 data_in = kmap(in_page); 107 data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
117 pages[0] = out_page; 115 pages[0] = out_page;
118 nr_pages = 1; 116 nr_pages = 1;
119 117
120 workspace->def_strm.next_in = data_in; 118 workspace->strm.next_in = data_in;
121 workspace->def_strm.next_out = cpage_out; 119 workspace->strm.next_out = cpage_out;
122 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 120 workspace->strm.avail_out = PAGE_CACHE_SIZE;
123 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 121 workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
124 122
125 while (workspace->def_strm.total_in < len) { 123 while (workspace->strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 124 ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 125 if (ret != Z_OK) {
128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 126 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 127 ret);
130 zlib_deflateEnd(&workspace->def_strm); 128 zlib_deflateEnd(&workspace->strm);
131 ret = -EIO; 129 ret = -EIO;
132 goto out; 130 goto out;
133 } 131 }
134 132
135 /* we're making it bigger, give up */ 133 /* we're making it bigger, give up */
136 if (workspace->def_strm.total_in > 8192 && 134 if (workspace->strm.total_in > 8192 &&
137 workspace->def_strm.total_in < 135 workspace->strm.total_in <
138 workspace->def_strm.total_out) { 136 workspace->strm.total_out) {
139 ret = -E2BIG; 137 ret = -E2BIG;
140 goto out; 138 goto out;
141 } 139 }
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
143 * before the total_in so we will pull in a new page for 141 * before the total_in so we will pull in a new page for
144 * the stream end if required 142 * the stream end if required
145 */ 143 */
146 if (workspace->def_strm.avail_out == 0) { 144 if (workspace->strm.avail_out == 0) {
147 kunmap(out_page); 145 kunmap(out_page);
148 if (nr_pages == nr_dest_pages) { 146 if (nr_pages == nr_dest_pages) {
149 out_page = NULL; 147 out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
158 cpage_out = kmap(out_page); 156 cpage_out = kmap(out_page);
159 pages[nr_pages] = out_page; 157 pages[nr_pages] = out_page;
160 nr_pages++; 158 nr_pages++;
161 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 159 workspace->strm.avail_out = PAGE_CACHE_SIZE;
162 workspace->def_strm.next_out = cpage_out; 160 workspace->strm.next_out = cpage_out;
163 } 161 }
164 /* we're all done */ 162 /* we're all done */
165 if (workspace->def_strm.total_in >= len) 163 if (workspace->strm.total_in >= len)
166 break; 164 break;
167 165
168 /* we've read in a full page, get a new one */ 166 /* we've read in a full page, get a new one */
169 if (workspace->def_strm.avail_in == 0) { 167 if (workspace->strm.avail_in == 0) {
170 if (workspace->def_strm.total_out > max_out) 168 if (workspace->strm.total_out > max_out)
171 break; 169 break;
172 170
173 bytes_left = len - workspace->def_strm.total_in; 171 bytes_left = len - workspace->strm.total_in;
174 kunmap(in_page); 172 kunmap(in_page);
175 page_cache_release(in_page); 173 page_cache_release(in_page);
176 174
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
178 in_page = find_get_page(mapping, 176 in_page = find_get_page(mapping,
179 start >> PAGE_CACHE_SHIFT); 177 start >> PAGE_CACHE_SHIFT);
180 data_in = kmap(in_page); 178 data_in = kmap(in_page);
181 workspace->def_strm.avail_in = min(bytes_left, 179 workspace->strm.avail_in = min(bytes_left,
182 PAGE_CACHE_SIZE); 180 PAGE_CACHE_SIZE);
183 workspace->def_strm.next_in = data_in; 181 workspace->strm.next_in = data_in;
184 } 182 }
185 } 183 }
186 workspace->def_strm.avail_in = 0; 184 workspace->strm.avail_in = 0;
187 ret = zlib_deflate(&workspace->def_strm, Z_FINISH); 185 ret = zlib_deflate(&workspace->strm, Z_FINISH);
188 zlib_deflateEnd(&workspace->def_strm); 186 zlib_deflateEnd(&workspace->strm);
189 187
190 if (ret != Z_STREAM_END) { 188 if (ret != Z_STREAM_END) {
191 ret = -EIO; 189 ret = -EIO;
192 goto out; 190 goto out;
193 } 191 }
194 192
195 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { 193 if (workspace->strm.total_out >= workspace->strm.total_in) {
196 ret = -E2BIG; 194 ret = -E2BIG;
197 goto out; 195 goto out;
198 } 196 }
199 197
200 ret = 0; 198 ret = 0;
201 *total_out = workspace->def_strm.total_out; 199 *total_out = workspace->strm.total_out;
202 *total_in = workspace->def_strm.total_in; 200 *total_in = workspace->strm.total_in;
203out: 201out:
204 *out_pages = nr_pages; 202 *out_pages = nr_pages;
205 if (out_page) 203 if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
225 size_t total_out = 0; 223 size_t total_out = 0;
226 unsigned long page_in_index = 0; 224 unsigned long page_in_index = 0;
227 unsigned long page_out_index = 0; 225 unsigned long page_out_index = 0;
228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 226 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
229 PAGE_CACHE_SIZE;
230 unsigned long buf_start; 227 unsigned long buf_start;
231 unsigned long pg_offset; 228 unsigned long pg_offset;
232 229
233 data_in = kmap(pages_in[page_in_index]); 230 data_in = kmap(pages_in[page_in_index]);
234 workspace->inf_strm.next_in = data_in; 231 workspace->strm.next_in = data_in;
235 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); 232 workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
236 workspace->inf_strm.total_in = 0; 233 workspace->strm.total_in = 0;
237 234
238 workspace->inf_strm.total_out = 0; 235 workspace->strm.total_out = 0;
239 workspace->inf_strm.next_out = workspace->buf; 236 workspace->strm.next_out = workspace->buf;
240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 237 workspace->strm.avail_out = PAGE_CACHE_SIZE;
241 pg_offset = 0; 238 pg_offset = 0;
242 239
243 /* If it's deflate, and it's got no preset dictionary, then 240 /* If it's deflate, and it's got no preset dictionary, then
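DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE) replaces the hand-rolled (srclen + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE above; the kernel macro expands to exactly that arithmetic, it just states the intent once:

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* DIV_ROUND_UP(4097, 4096) == 2: one byte past a page needs two pages */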
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
247 !(((data_in[0]<<8) + data_in[1]) % 31)) { 244 !(((data_in[0]<<8) + data_in[1]) % 31)) {
248 245
249 wbits = -((data_in[0] >> 4) + 8); 246 wbits = -((data_in[0] >> 4) + 8);
250 workspace->inf_strm.next_in += 2; 247 workspace->strm.next_in += 2;
251 workspace->inf_strm.avail_in -= 2; 248 workspace->strm.avail_in -= 2;
252 } 249 }
253 250
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 251 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
255 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 252 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -EIO; 253 return -EIO;
257 } 254 }
258 while (workspace->inf_strm.total_in < srclen) { 255 while (workspace->strm.total_in < srclen) {
259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 256 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
260 if (ret != Z_OK && ret != Z_STREAM_END) 257 if (ret != Z_OK && ret != Z_STREAM_END)
261 break; 258 break;
262 259
263 buf_start = total_out; 260 buf_start = total_out;
264 total_out = workspace->inf_strm.total_out; 261 total_out = workspace->strm.total_out;
265 262
266 /* we didn't make progress in this inflate call, we're done */ 263 /* we didn't make progress in this inflate call, we're done */
267 if (buf_start == total_out) 264 if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
276 goto done; 273 goto done;
277 } 274 }
278 275
279 workspace->inf_strm.next_out = workspace->buf; 276 workspace->strm.next_out = workspace->buf;
280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 277 workspace->strm.avail_out = PAGE_CACHE_SIZE;
281 278
282 if (workspace->inf_strm.avail_in == 0) { 279 if (workspace->strm.avail_in == 0) {
283 unsigned long tmp; 280 unsigned long tmp;
284 kunmap(pages_in[page_in_index]); 281 kunmap(pages_in[page_in_index]);
285 page_in_index++; 282 page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
288 break; 285 break;
289 } 286 }
290 data_in = kmap(pages_in[page_in_index]); 287 data_in = kmap(pages_in[page_in_index]);
291 workspace->inf_strm.next_in = data_in; 288 workspace->strm.next_in = data_in;
292 tmp = srclen - workspace->inf_strm.total_in; 289 tmp = srclen - workspace->strm.total_in;
293 workspace->inf_strm.avail_in = min(tmp, 290 workspace->strm.avail_in = min(tmp,
294 PAGE_CACHE_SIZE); 291 PAGE_CACHE_SIZE);
295 } 292 }
296 } 293 }
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
299 else 296 else
300 ret = 0; 297 ret = 0;
301done: 298done:
302 zlib_inflateEnd(&workspace->inf_strm); 299 zlib_inflateEnd(&workspace->strm);
303 if (data_in) 300 if (data_in)
304 kunmap(pages_in[page_in_index]); 301 kunmap(pages_in[page_in_index]);
305 return ret; 302 return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
317 unsigned long total_out = 0; 314 unsigned long total_out = 0;
318 char *kaddr; 315 char *kaddr;
319 316
320 workspace->inf_strm.next_in = data_in; 317 workspace->strm.next_in = data_in;
321 workspace->inf_strm.avail_in = srclen; 318 workspace->strm.avail_in = srclen;
322 workspace->inf_strm.total_in = 0; 319 workspace->strm.total_in = 0;
323 320
324 workspace->inf_strm.next_out = workspace->buf; 321 workspace->strm.next_out = workspace->buf;
325 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 322 workspace->strm.avail_out = PAGE_CACHE_SIZE;
326 workspace->inf_strm.total_out = 0; 323 workspace->strm.total_out = 0;
327 /* If it's deflate, and it's got no preset dictionary, then 324 /* If it's deflate, and it's got no preset dictionary, then
328 we can tell zlib to skip the adler32 check. */ 325 we can tell zlib to skip the adler32 check. */
329 if (srclen > 2 && !(data_in[1] & PRESET_DICT) && 326 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
331 !(((data_in[0]<<8) + data_in[1]) % 31)) { 328 !(((data_in[0]<<8) + data_in[1]) % 31)) {
332 329
333 wbits = -((data_in[0] >> 4) + 8); 330 wbits = -((data_in[0] >> 4) + 8);
334 workspace->inf_strm.next_in += 2; 331 workspace->strm.next_in += 2;
335 workspace->inf_strm.avail_in -= 2; 332 workspace->strm.avail_in -= 2;
336 } 333 }
337 334
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 335 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
339 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 336 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -EIO; 337 return -EIO;
341 } 338 }
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
346 unsigned long bytes; 343 unsigned long bytes;
347 unsigned long pg_offset = 0; 344 unsigned long pg_offset = 0;
348 345
349 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 346 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
350 if (ret != Z_OK && ret != Z_STREAM_END) 347 if (ret != Z_OK && ret != Z_STREAM_END)
351 break; 348 break;
352 349
353 buf_start = total_out; 350 buf_start = total_out;
354 total_out = workspace->inf_strm.total_out; 351 total_out = workspace->strm.total_out;
355 352
356 if (total_out == buf_start) { 353 if (total_out == buf_start) {
357 ret = -EIO; 354 ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
377 pg_offset += bytes; 374 pg_offset += bytes;
378 bytes_left -= bytes; 375 bytes_left -= bytes;
379next: 376next:
380 workspace->inf_strm.next_out = workspace->buf; 377 workspace->strm.next_out = workspace->buf;
381 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 378 workspace->strm.avail_out = PAGE_CACHE_SIZE;
382 } 379 }
383 380
384 if (ret != Z_STREAM_END && bytes_left != 0) 381 if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
386 else 383 else
387 ret = 0; 384 ret = 0;
388 385
389 zlib_inflateEnd(&workspace->inf_strm); 386 zlib_inflateEnd(&workspace->strm);
390 return ret; 387 return ret;
391} 388}
392 389
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4ee4e30d26d9..1faecea101f3 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -23,6 +23,7 @@ struct map_lookup;
23struct extent_buffer; 23struct extent_buffer;
24struct btrfs_work; 24struct btrfs_work;
25struct __btrfs_workqueue; 25struct __btrfs_workqueue;
26struct btrfs_qgroup_operation;
26 27
27#define show_ref_type(type) \ 28#define show_ref_type(type) \
28 __print_symbolic(type, \ 29 __print_symbolic(type, \
@@ -157,12 +158,13 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
157 158
158#define show_map_flags(flag) \ 159#define show_map_flags(flag) \
159 __print_flags(flag, "|", \ 160 __print_flags(flag, "|", \
160 { EXTENT_FLAG_PINNED, "PINNED" }, \ 161 { (1 << EXTENT_FLAG_PINNED), "PINNED" },\
161 { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \ 162 { (1 << EXTENT_FLAG_COMPRESSED), "COMPRESSED" },\
162 { EXTENT_FLAG_VACANCY, "VACANCY" }, \ 163 { (1 << EXTENT_FLAG_VACANCY), "VACANCY" },\
163 { EXTENT_FLAG_PREALLOC, "PREALLOC" }, \ 164 { (1 << EXTENT_FLAG_PREALLOC), "PREALLOC" },\
164 { EXTENT_FLAG_LOGGING, "LOGGING" }, \ 165 { (1 << EXTENT_FLAG_LOGGING), "LOGGING" },\
165 { EXTENT_FLAG_FILLING, "FILLING" }) 166 { (1 << EXTENT_FLAG_FILLING), "FILLING" },\
167 { (1 << EXTENT_FLAG_FS_MAPPING), "FS_MAPPING" })
166 168
167TRACE_EVENT_CONDITION(btrfs_get_extent, 169TRACE_EVENT_CONDITION(btrfs_get_extent,
168 170
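The EXTENT_FLAG_* constants are bit numbers (used with set_bit()/test_bit()), while __print_flags() tests masks, so without the (1 << flag) shift the decoded trace output never matched the stored flags. The distinction in miniature:

    #include <stdio.h>

    #define EXTENT_FLAG_PINNED 0   /* a bit *number*, as used with set_bit() */

    int main(void)
    {
        unsigned long flags = 1UL << EXTENT_FLAG_PINNED;  /* the mask is 1 */

        /* old, broken test: AND with the bare bit number (always 0 here) */
        printf("%lu\n", flags & EXTENT_FLAG_PINNED);
        /* fixed test: shift the bit number into a mask first (prints 1) */
        printf("%lu\n", flags & (1UL << EXTENT_FLAG_PINNED));
        return 0;
    }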
@@ -996,6 +998,7 @@ DECLARE_EVENT_CLASS(btrfs__work,
996 __field( void *, func ) 998 __field( void *, func )
997 __field( void *, ordered_func ) 999 __field( void *, ordered_func )
998 __field( void *, ordered_free ) 1000 __field( void *, ordered_free )
1001 __field( void *, normal_work )
999 ), 1002 ),
1000 1003
1001 TP_fast_assign( 1004 TP_fast_assign(
@@ -1004,11 +1007,13 @@ DECLARE_EVENT_CLASS(btrfs__work,
1004 __entry->func = work->func; 1007 __entry->func = work->func;
1005 __entry->ordered_func = work->ordered_func; 1008 __entry->ordered_func = work->ordered_func;
1006 __entry->ordered_free = work->ordered_free; 1009 __entry->ordered_free = work->ordered_free;
1010 __entry->normal_work = &work->normal_work;
1007 ), 1011 ),
1008 1012
1009 TP_printk("work=%p, wq=%p, func=%p, ordered_func=%p, ordered_free=%p", 1013 TP_printk("work=%p (normal_work=%p), wq=%p, func=%pf, ordered_func=%p,"
1010 __entry->work, __entry->wq, __entry->func, 1014 " ordered_free=%p",
1011 __entry->ordered_func, __entry->ordered_free) 1015 __entry->work, __entry->normal_work, __entry->wq,
1016 __entry->func, __entry->ordered_func, __entry->ordered_free)
1012); 1017);
1013 1018
1014/* For situations where the work is freed */ 1019
@@ -1043,13 +1048,6 @@ DEFINE_EVENT(btrfs__work, btrfs_work_sched,
1043 TP_ARGS(work) 1048 TP_ARGS(work)
1044); 1049);
1045 1050
1046DEFINE_EVENT(btrfs__work, btrfs_normal_work_done,
1047
1048 TP_PROTO(struct btrfs_work *work),
1049
1050 TP_ARGS(work)
1051);
1052
1053DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done, 1051DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done,
1054 1052
1055 TP_PROTO(struct btrfs_work *work), 1053 TP_PROTO(struct btrfs_work *work),
@@ -1119,6 +1117,61 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
1119 TP_ARGS(wq) 1117 TP_ARGS(wq)
1120); 1118);
1121 1119
1120#define show_oper_type(type) \
1121 __print_symbolic(type, \
1122 { BTRFS_QGROUP_OPER_ADD_EXCL, "OPER_ADD_EXCL" }, \
1123 { BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" }, \
1124 { BTRFS_QGROUP_OPER_SUB_EXCL, "OPER_SUB_EXCL" }, \
1125 { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" })
1126
1127DECLARE_EVENT_CLASS(btrfs_qgroup_oper,
1128
1129 TP_PROTO(struct btrfs_qgroup_operation *oper),
1130
1131 TP_ARGS(oper),
1132
1133 TP_STRUCT__entry(
1134 __field( u64, ref_root )
1135 __field( u64, bytenr )
1136 __field( u64, num_bytes )
1137 __field( u64, seq )
1138 __field( int, type )
1139 __field( u64, elem_seq )
1140 ),
1141
1142 TP_fast_assign(
1143 __entry->ref_root = oper->ref_root;
1144 __entry->bytenr = oper->bytenr;
1145 __entry->num_bytes = oper->num_bytes;
1146 __entry->seq = oper->seq;
1147 __entry->type = oper->type;
1148 __entry->elem_seq = oper->elem.seq;
1149 ),
1150
1151 TP_printk("ref_root = %llu, bytenr = %llu, num_bytes = %llu, "
1152 "seq = %llu, elem.seq = %llu, type = %s",
1153 (unsigned long long)__entry->ref_root,
1154 (unsigned long long)__entry->bytenr,
1155 (unsigned long long)__entry->num_bytes,
1156 (unsigned long long)__entry->seq,
1157 (unsigned long long)__entry->elem_seq,
1158 show_oper_type(__entry->type))
1159);
1160
1161DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_account,
1162
1163 TP_PROTO(struct btrfs_qgroup_operation *oper),
1164
1165 TP_ARGS(oper)
1166);
1167
1168DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_record_ref,
1169
1170 TP_PROTO(struct btrfs_qgroup_operation *oper),
1171
1172 TP_ARGS(oper)
1173);
1174
1122#endif /* _TRACE_BTRFS_H */ 1175#endif /* _TRACE_BTRFS_H */
1123 1176
1124/* This part must be outside protection */ 1177/* This part must be outside protection */