Diffstat (limited to 'fs/btrfs')
-rw-r--r--   fs/btrfs/Makefile            |    2
-rw-r--r--   fs/btrfs/acl.c               |    2
-rw-r--r--   fs/btrfs/backref.c           |  323
-rw-r--r--   fs/btrfs/backref.h           |   14
-rw-r--r--   fs/btrfs/btrfs_inode.h       |   19
-rw-r--r--   fs/btrfs/check-integrity.c   |   47
-rw-r--r--   fs/btrfs/compression.c       |   19
-rw-r--r--   fs/btrfs/ctree.c             |  459
-rw-r--r--   fs/btrfs/ctree.h             |  296
-rw-r--r--   fs/btrfs/delayed-inode.c     |   17
-rw-r--r--   fs/btrfs/dev-replace.c       |  856
-rw-r--r--   fs/btrfs/dev-replace.h       |   44
-rw-r--r--   fs/btrfs/dir-item.c          |   59
-rw-r--r--   fs/btrfs/disk-io.c           |  370
-rw-r--r--   fs/btrfs/disk-io.h           |    6
-rw-r--r--   fs/btrfs/extent-tree.c       |  593
-rw-r--r--   fs/btrfs/extent_io.c         |  167
-rw-r--r--   fs/btrfs/extent_io.h         |   27
-rw-r--r--   fs/btrfs/extent_map.c        |   52
-rw-r--r--   fs/btrfs/extent_map.h        |   10
-rw-r--r--   fs/btrfs/file-item.c         |   26
-rw-r--r--   fs/btrfs/file.c              |  820
-rw-r--r--   fs/btrfs/free-space-cache.c  |   61
-rw-r--r--   fs/btrfs/hash.h              |   10
-rw-r--r--   fs/btrfs/inode-item.c        |  285
-rw-r--r--   fs/btrfs/inode-map.c         |    5
-rw-r--r--   fs/btrfs/inode.c             |  861
-rw-r--r--   fs/btrfs/ioctl.c             |  425
-rw-r--r--   fs/btrfs/ioctl.h             |   48
-rw-r--r--   fs/btrfs/math.h              |   44
-rw-r--r--   fs/btrfs/ordered-data.c      |  187
-rw-r--r--   fs/btrfs/ordered-data.h      |   21
-rw-r--r--   fs/btrfs/print-tree.c        |    3
-rw-r--r--   fs/btrfs/qgroup.c            |   57
-rw-r--r--   fs/btrfs/reada.c             |   31
-rw-r--r--   fs/btrfs/relocation.c        |   51
-rw-r--r--   fs/btrfs/root-tree.c         |   33
-rw-r--r--   fs/btrfs/scrub.c             | 1862
-rw-r--r--   fs/btrfs/send.c              | 1073
-rw-r--r--   fs/btrfs/send.h              |    1
-rw-r--r--   fs/btrfs/super.c             |  122
-rw-r--r--   fs/btrfs/transaction.c       |  409
-rw-r--r--   fs/btrfs/transaction.h       |   20
-rw-r--r--   fs/btrfs/tree-log.c          |  994
-rw-r--r--   fs/btrfs/ulist.c             |    7
-rw-r--r--   fs/btrfs/ulist.h             |    9
-rw-r--r--   fs/btrfs/volumes.c           | 1044
-rw-r--r--   fs/btrfs/volumes.h           |   35
-rw-r--r--   fs/btrfs/xattr.c             |   13
-rw-r--r--   fs/btrfs/zlib.c              |    8
50 files changed, 8749 insertions, 3198 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 		ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 		if (ret < 0)
 			return ret;
+		if (ret == 0)
+			acl = NULL;
 	}
 	ret = 0;
 	break;
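
The two added lines complete the posix_acl_equiv_mode() contract in btrfs_set_acl(): that helper returns a negative errno on failure, 1 when an extended ACL is still required, and 0 when the ACL collapses entirely into the mode bits, in which case the ACL itself should be dropped rather than stored. A minimal userspace sketch of that contract (fake_acl_equiv_mode is a hypothetical stand-in for the kernel helper):

#include <stdio.h>

static int fake_acl_equiv_mode(int has_named_entries, unsigned int *mode)
{
	if (has_named_entries)
		return 1;	/* must keep the extended ACL */
	/* ...mode bits would be updated from the ACL here... */
	return 0;		/* ACL is fully represented by *mode */
}

int main(void)
{
	unsigned int mode = 0644;
	void *acl = &mode;	/* stand-in for a struct posix_acl * */
	int ret = fake_acl_equiv_mode(0, &mode);

	if (ret < 0)
		return 1;
	if (ret == 0)
		acl = NULL;	/* same move as the patch: don't store it */
	printf("acl %s\n", acl ? "kept" : "dropped");
	return 0;
}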
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d6..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 		}
 		if (!ret) {
 			ret = ulist_add(parents, eb->start,
-					(unsigned long)eie, GFP_NOFS);
+					(uintptr_t)eie, GFP_NOFS);
 			if (ret < 0)
 				break;
 			if (!extent_item_pos) {
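
The (unsigned long) to (uintptr_t) conversions in this and the following backref.c hunks all serve one purpose: the ulist aux member is a u64, and on 32-bit targets a direct cast between a pointer and u64 draws pointer/integer-size warnings. Going through uintptr_t makes the pointer round-trip explicit and warning-free. A standalone sketch of the idiom (userspace types, not the kernel ulist API):

#include <stdint.h>

struct extent_inode_elem;	/* opaque here, as in the callers */

/* pointer -> uintptr_t -> u64: no pointer-size warning on 32-bit */
static inline uint64_t eie_to_aux(struct extent_inode_elem *eie)
{
	return (uintptr_t)eie;
}

/* u64 -> uintptr_t -> pointer: the exact inverse round-trip */
static inline struct extent_inode_elem *aux_to_eie(uint64_t aux)
{
	return (struct extent_inode_elem *)(uintptr_t)aux;
}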
@@ -282,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
-	rcu_read_lock();
-	root_level = btrfs_header_level(root->node);
-	rcu_read_unlock();
+	root_level = btrfs_old_root_level(root, time_seq);
 
 	if (root_level + 1 == level)
 		goto out;
@@ -363,8 +362,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
-		ref->inode_list =
-			node ? (struct extent_inode_elem *)node->aux : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +374,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
-			new_ref->inode_list =
-				(struct extent_inode_elem *)node->aux;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -462,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
 			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
@@ -473,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
 					ref1 = ref2;
 					ref2 = xchg;
 				}
-				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
 					continue;
-				ref1->count += ref2->count;
 			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
 			list_del(&ref2->list);
 			kfree(ref2);
 		}
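
The new block splices ref2's inode_list onto the tail of ref1's list before ref2 is freed, so the extent_inode_elem chains gathered for both refs survive the merge instead of being dropped with ref2. A self-contained sketch of that splice over a simplified singly linked list:

#include <stddef.h>

struct elem {
	struct elem *next;
};

static struct elem *splice(struct elem *a, struct elem *b)
{
	struct elem *tail = a;

	if (!tail)
		return b;		/* a was empty: b becomes the list */
	while (tail->next)
		tail = tail->next;	/* walk to the tail of a */
	tail->next = b;			/* append b's whole chain */
	return a;
}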
@@ -891,8 +899,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
-		if (ref->count < 0)
-			WARN_ON(1);
+		WARN_ON(ref->count < 0);
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
@@ -914,8 +921,8 @@ again:
 			free_extent_buffer(eb);
 		}
 		ret = ulist_add_merge(refs, ref->parent,
-				      (unsigned long)ref->inode_list,
-				      (unsigned long *)&eie, GFP_NOFS);
+				      (uintptr_t)ref->inode_list,
+				      (u64 *)&eie, GFP_NOFS);
 		if (!ret && extent_item_pos) {
 			/*
 			 * we've recorded that parent, so we must extend
@@ -959,7 +966,7 @@ static void free_leaf_list(struct ulist *blocks)
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
-		eie = (struct extent_inode_elem *)node->aux;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
 		for (; eie; eie = eie_next) {
 			eie_next = eie->next;
 			kfree(eie);
@@ -1108,44 +1115,97 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			   found_key);
 }
 
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref,
-			 struct extent_buffer *eb_in, u64 parent,
-			 char *dest, u32 size)
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_REF_KEY_V2 item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size)
 {
-	u32 len;
 	int slot;
 	u64 next_inum;
 	int ret;
-	s64 bytes_left = size - 1;
+	s64 bytes_left = ((s64)size) - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
 	path->leave_spinning = 1;
 	while (1) {
-		len = btrfs_inode_ref_name_len(eb, iref);
-		bytes_left -= len;
+		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
-					   (unsigned long)(iref + 1), len);
+					   name_off, name_len);
 		if (eb != eb_in) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
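
btrfs_find_one_extref() follows the usual btree scan pattern: btrfs_search_slot() positions the path at (or just before) the first candidate key, the loop hops to the next leaf with btrfs_next_leaf() when the slot runs off the current leaf, and the scan stops as soon as the objectid or key type no longer match. A rough userspace model of that pattern over a sorted key array (a simplified stand-in, not the kernel API):

#include <stdio.h>

struct key { unsigned obj, type, off; };

static int find_one(const struct key *keys, int n, unsigned obj,
		    unsigned type, unsigned start_off, unsigned *found_off)
{
	int i;

	/* btrfs_search_slot analogue: first slot not before the search key */
	for (i = 0; i < n; i++)
		if (keys[i].obj > obj ||
		    (keys[i].obj == obj && keys[i].type > type) ||
		    (keys[i].obj == obj && keys[i].type == type &&
		     keys[i].off >= start_off))
			break;

	/* stop when objectid/type no longer match (-ENOENT analogue) */
	if (i >= n || keys[i].obj != obj || keys[i].type != type)
		return -1;
	*found_off = keys[i].off;
	return 0;
}

int main(void)
{
	struct key keys[] = { {5, 1, 0}, {7, 13, 10}, {7, 13, 40}, {9, 1, 0} };
	unsigned off;

	if (!find_one(keys, 4, 7, 13, 11, &off))
		printf("next extref at offset %u\n", off);	/* prints 40 */
	return 0;
}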
@@ -1155,6 +1215,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			ret = -ENOENT;
 		if (ret)
 			break;
+
 		next_inum = found_key.offset;
 
 		/* regular exit ahead */
@@ -1170,8 +1231,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		}
 		btrfs_release_path(path);
-
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
 		parent = next_inum;
 		--bytes_left;
 		if (bytes_left >= 0)
@@ -1188,12 +1252,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 }
 
 /*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
+{
+	return btrfs_ref_to_path(fs_root, path,
+				 btrfs_inode_ref_name_len(eb_in, iref),
+				 (unsigned long)(iref + 1),
+				 eb_in, parent, dest, size);
+}
+
+/*
  * this makes the path point to (logical EXTENT_ITEM *)
  * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
  * tree blocks and <0 on error.
  */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key)
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
 {
 	int ret;
 	u64 flags;
@@ -1237,10 +1328,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		 (unsigned long long)found_key->objectid,
 		 (unsigned long long)found_key->offset,
 		 (unsigned long long)flags, item_size);
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		return BTRFS_EXTENT_FLAG_DATA;
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
 
 	return -EIO;
 }
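
The reworked extent_from_logical() separates status from payload: the old version returned either a BTRFS_EXTENT_FLAG_* bit or a negative errno in the same int, so a caller testing ret against a flag bit could be confused by an error's bit pattern. Now the return is 0/-errno and the flag travels through *flags_ret, which the iterate_inodes_from_logical() hunk below adopts by testing flags instead of ret. A tiny sketch of the convention (names are illustrative, not the kernel API):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_DATA	(1ULL << 0)
#define FLAG_TREE_BLOCK	(1ULL << 1)

/* new-style: 0 or -errno as return, flag via out parameter */
static int lookup_flags(int is_tree, uint64_t *flags_ret)
{
	if (!flags_ret)
		return -EINVAL;
	*flags_ret = is_tree ? FLAG_TREE_BLOCK : FLAG_DATA;
	return 0;
}

int main(void)
{
	uint64_t flags = 0;

	if (lookup_flags(1, &flags) < 0)
		return 1;
	printf("%s\n", (flags & FLAG_TREE_BLOCK) ? "tree block" : "data");
	return 0;
}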
@@ -1404,12 +1502,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&root_uiter);
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
-				 "%#lx\n", root_node->val, ref_node->val,
-				 ref_node->aux);
-			ret = iterate_leaf_refs(
-				(struct extent_inode_elem *)ref_node->aux,
-				root_node->val, extent_item_objectid,
-				iterate, ctx);
+				 "%#llx\n", root_node->val, ref_node->val,
+				 (long long)ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
 		}
 		ulist_free(roots);
 		roots = NULL;
@@ -1432,15 +1531,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 {
 	int ret;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_key found_key;
 	int search_commit_root = path->search_commit_root;
 
-	ret = extent_from_logical(fs_info, logical, path,
-				  &found_key);
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 	btrfs_release_path(path);
 	if (ret < 0)
 		return ret;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 
 	extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1550,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-			 struct btrfs_path *path,
-			 iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
 {
 	int ret = 0;
 	int slot;
@@ -1470,7 +1572,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	while (!ret) {
 		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-				     &found_key);
+					&found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
@@ -1498,7 +1600,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1498 "tree %llu\n", cur, 1600 "tree %llu\n", cur,
1499 (unsigned long long)found_key.objectid, 1601 (unsigned long long)found_key.objectid,
1500 (unsigned long long)fs_root->objectid); 1602 (unsigned long long)fs_root->objectid);
1501 ret = iterate(parent, iref, eb, ctx); 1603 ret = iterate(parent, name_len,
1604 (unsigned long)(iref + 1), eb, ctx);
1502 if (ret) 1605 if (ret)
1503 break; 1606 break;
1504 len = sizeof(*iref) + name_len; 1607 len = sizeof(*iref) + name_len;
@@ -1513,12 +1616,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
 /*
  * returns 0 if the path could be dumped (probably truncated)
  * returns <0 in case of an error
  */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
 			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	char *fspath;
@@ -1531,20 +1720,16 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 		       ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-				inum, fspath_min, bytes_left);
+	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+				   name_off, eb, inum, fspath_min, bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
-		pr_debug("missed path, not enough space. missing bytes: %lu, "
-			 "constructed so far: %s\n",
-			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
@@ -1566,7 +1751,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-			     inode_to_path, ipath);
+				inode_to_path, ipath);
 }
 
 struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1760,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = kmalloc(alloc_bytes, GFP_NOFS);
+	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -1626,6 +1811,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (!ipath)
 		return;
-	kfree(ipath->fspath);
+	vfree(ipath->fspath);
 	kfree(ipath);
 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab8..d61feca79455 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,14 +33,13 @@ struct inode_fs_paths {
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
 				      void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-			      struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 		    struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key);
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 			    struct btrfs_extent_item *ei, u32 item_size,
@@ -63,10 +62,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			 struct btrfs_inode_ref *iref, struct extent_buffer *eb,
 			 u64 parent, char *dest, u32 size);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 					struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+
 #endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe7..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,8 @@
 #define BTRFS_INODE_DELALLOC_META_RESERVED	4
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
+#define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -89,6 +91,9 @@ struct btrfs_inode {
 
 	unsigned long runtime_flags;
 
+	/* Keep track of who's O_SYNC/fsycing currently */
+	atomic_t sync_writers;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
@@ -143,6 +148,9 @@ struct btrfs_inode {
 	/* flags field from the on disk inode */
 	u32 flags;
 
+	/* a local copy of root's last_log_commit */
+	unsigned long last_log_commit;
+
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
@@ -202,15 +210,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
-
-	mutex_lock(&root->log_mutex);
 	if (BTRFS_I(inode)->logged_trans == generation &&
-	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-		ret = 1;
-	mutex_unlock(&root->log_mutex);
-	return ret;
+	    BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+		return 1;
+	return 0;
 }
 
 #endif
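
The btrfs_inode_in_log() rewrite removes root->log_mutex from the fsync fast path: each inode now carries its own last_log_commit snapshot (added earlier in this header's diff), so the check reduces to two plain field comparisons. A rough userspace model of the resulting lockless test; the _Atomic qualifiers merely stand in for the word-sized reads the kernel relies on, and the field names mirror the patch:

#include <stdatomic.h>

struct fake_inode {
	_Atomic unsigned long logged_trans;
	_Atomic unsigned long last_sub_trans;
	_Atomic unsigned long last_log_commit;	/* per-inode snapshot */
};

static int inode_in_log(struct fake_inode *in, unsigned long generation)
{
	/* no shared lock: just compare against the inode's own snapshot */
	return atomic_load(&in->logged_trans) == generation &&
	       atomic_load(&in->last_sub_trans) <=
	       atomic_load(&in->last_log_commit);
}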
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e33407..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
  *      the file system was mounted, (i.e., they have been
  *      referenced by the super block) or they have been
  *      written since then and the write completion callback
- *      was called and a FLUSH request to the device where
- *      these blocks are located was received and completed.
+ *      was called and no write error was indicated and a
+ *      FLUSH request to the device where these blocks are
+ *      located was received and completed.
  * 2b. All referenced blocks need to have a generation
  *      number which is equal to the parent's number.
  *
@@ -136,7 +137,7 @@ struct btrfsic_block {
 	unsigned int never_written:1;	/* block was added because it was
 					 * referenced, not because it was
 					 * written */
-	unsigned int mirror_num:2;	/* large enough to hold
+	unsigned int mirror_num;	/* large enough to hold
 					 * BTRFS_SUPER_MIRROR_MAX */
 	struct btrfsic_dev_state *dev_state;
 	u64 dev_bytenr;		/* key, physical byte num on disk */
@@ -722,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -902,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1286,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
 	*next_blockp = NULL;
 	if (0 == *num_copiesp) {
 		*num_copiesp =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1488,7 +1489,7 @@ static int btrfsic_handle_extent_data(
 			chunk_len = num_bytes;
 
 		num_copies =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->datablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1581,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	struct btrfs_device *device;
 
 	length = len;
-	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+	ret = btrfs_map_block(state->root->fs_info, READ,
 			      bytenr, &length, &multi, mirror_num);
 
+	if (ret) {
+		block_ctx_out->start = 0;
+		block_ctx_out->dev_bytenr = 0;
+		block_ctx_out->len = 0;
+		block_ctx_out->dev = NULL;
+		block_ctx_out->datav = NULL;
+		block_ctx_out->pagev = NULL;
+		block_ctx_out->mem_to_free = NULL;
+
+		return ret;
+	}
+
 	device = multi->stripes[0].dev;
 	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
 	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
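
The added error path in btrfsic_map_block() makes sure a failed btrfs_map_block() never leaves the caller's block_ctx_out partially initialized, and never dereferences the unset multi pointer. A small sketch of the pattern, with simplified types:

#include <errno.h>
#include <string.h>

struct block_ctx {
	unsigned long start, dev_bytenr, len;
	void *dev, *datav, *pagev, *mem_to_free;
};

static int map_block(int fail, struct block_ctx *out)
{
	if (fail) {
		memset(out, 0, sizeof(*out));	/* every field defined */
		return -EIO;			/* bail before using results */
	}
	/* ...fill out from the successful mapping here... */
	return 0;
}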
@@ -1593,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	block_ctx_out->pagev = NULL;
 	block_ctx_out->mem_to_free = NULL;
 
-	if (0 == ret)
-		kfree(multi);
+	kfree(multi);
 	if (NULL == block_ctx_out->dev) {
 		ret = -ENXIO;
 		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2462,7 +2474,7 @@ static int btrfsic_process_written_superblock(
 		}
 
 		num_copies =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, BTRFS_SUPER_INFO_SIZE);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2601,6 +2613,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       (unsigned long long)l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 			ret = -1;
+		} else if (l->block_ref_to->iodone_w_error) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which has write error!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
 		} else if (l->parent_generation !=
 			   l->block_ref_to->generation &&
 			   BTRFSIC_GENERATION_UNKNOWN !=
@@ -2948,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 	struct btrfsic_block_data_ctx block_ctx;
 	int match = 0;
 
-	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	num_copies = btrfs_num_copies(state->root->fs_info,
 				      bytenr, state->metablock_size);
 
 	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a030..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_start;
 	struct extent_map *em;
 	int ret = -ENOMEM;
+	int faili = 0;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
-		if (!cb->compressed_pages[pg_index])
+		if (!cb->compressed_pages[pg_index]) {
+			faili = pg_index - 1;
+			ret = -ENOMEM;
 			goto fail2;
+		}
 	}
+	faili = nr_pages - 1;
 	cb->nr_pages = nr_pages;
 
 	add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -682,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
-			BUG_ON(ret); /* -ENOMEM */
+			if (ret)
+				bio_endio(comp_bio, ret);
 
 			bio_put(comp_bio);
 
@@ -707,14 +713,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-	BUG_ON(ret); /* -ENOMEM */
+	if (ret)
+		bio_endio(comp_bio, ret);
 
 	bio_put(comp_bio);
 	return 0;
 
 fail2:
-	for (pg_index = 0; pg_index < nr_pages; pg_index++)
-		free_page((unsigned long)cb->compressed_pages[pg_index]);
+	while (faili >= 0) {
+		__free_page(cb->compressed_pages[faili]);
+		faili--;
+	}
 
 	kfree(cb->compressed_pages);
 fail1:
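
The fail2 rework is the classic partial-unwind pattern: remember the index of the last successfully allocated page (faili) and free backwards from there, instead of sweeping all nr_pages slots, some of which were never filled. A standalone sketch of the same pattern:

#include <stdlib.h>

static void **alloc_all(int nr)
{
	void **arr = calloc(nr, sizeof(*arr));
	int faili, i;

	if (!arr)
		return NULL;
	for (i = 0; i < nr; i++) {
		arr[i] = malloc(4096);
		if (!arr[i]) {
			faili = i - 1;	/* last slot that succeeded */
			goto fail;
		}
	}
	return arr;
fail:
	while (faili >= 0)		/* only touch slots that were filled */
		free(arr[faili--]);
	free(arr);
	return NULL;
}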
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63a..c7b67cf24bba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *dst_buf,
 			      struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot,
-		    int tree_mod_log);
+		    struct btrfs_path *path, int level, int slot);
 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb);
 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -596,6 +595,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	if (tree_mod_dont_log(fs_info, eb))
 		return 0;
 
+	/*
+	 * When we override something during the move, we log these removals.
+	 * This can only happen when we move towards the beginning of the
+	 * buffer, i.e. dst_slot < src_slot.
+	 */
 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
 				MOD_LOG_KEY_REMOVE_WHILE_MOVING);
@@ -647,8 +651,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return 0;
 
-	__tree_mod_log_free_eb(fs_info, old_root);
-
 	ret = tree_mod_alloc(fs_info, flags, &tm);
 	if (ret < 0)
 		goto out;
@@ -773,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 
 static noinline void
 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
-			  struct extent_buffer *eb,
-			  struct btrfs_disk_key *disk_key, int slot, int atomic)
+			  struct extent_buffer *eb, int slot, int atomic)
 {
 	int ret;
 
@@ -926,12 +927,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		/*
-		 * don't log freeing in case we're freeing the root node, this
-		 * is done by tree_mod_log_set_root_pointer later
-		 */
-		if (buf != root->node && btrfs_header_level(buf) != 0)
-			tree_mod_log_free_eb(root->fs_info, buf);
+		tree_mod_log_free_eb(root->fs_info, buf);
 		clean_tree_block(trans, root, buf);
 		*last_ref = 1;
 	}
@@ -1142,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		switch (tm->op) {
 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 			BUG_ON(tm->slot < n);
-		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 		case MOD_LOG_KEY_REMOVE:
+			n++;
+		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 			btrfs_set_node_key(eb, &tm->key, tm->slot);
 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
 			btrfs_set_node_ptr_generation(eb, tm->slot,
 						      tm->generation);
-			n++;
 			break;
 		case MOD_LOG_KEY_REPLACE:
 			BUG_ON(tm->slot >= n);
@@ -1225,6 +1221,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 	free_extent_buffer(eb);
 
 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+	WARN_ON(btrfs_header_nritems(eb_rewin) >
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
 
 	return eb_rewin;
 }
@@ -1241,9 +1239,11 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 {
 	struct tree_mod_elem *tm;
 	struct extent_buffer *eb;
+	struct extent_buffer *old;
 	struct tree_mod_root *old_root = NULL;
 	u64 old_generation = 0;
 	u64 logical;
+	u32 blocksize;
 
 	eb = btrfs_read_lock_root_node(root);
 	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
@@ -1259,14 +1259,32 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	}
 
 	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
-	if (old_root)
+	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+		btrfs_tree_read_unlock(root->node);
+		free_extent_buffer(root->node);
+		blocksize = btrfs_level_size(root, old_root->level);
+		old = read_tree_block(root, logical, blocksize, 0);
+		if (!old) {
+			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
+				logical);
+			WARN_ON(1);
+		} else {
+			eb = btrfs_clone_extent_buffer(old);
+			free_extent_buffer(old);
+		}
+	} else if (old_root) {
+		btrfs_tree_read_unlock(root->node);
+		free_extent_buffer(root->node);
 		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
-	else
+	} else {
 		eb = btrfs_clone_extent_buffer(root->node);
-	btrfs_tree_read_unlock(root->node);
-	free_extent_buffer(root->node);
+		btrfs_tree_read_unlock(root->node);
+		free_extent_buffer(root->node);
+	}
+
 	if (!eb)
 		return NULL;
+	extent_buffer_get(eb);
 	btrfs_tree_read_lock(eb);
 	if (old_root) {
 		btrfs_set_header_bytenr(eb, eb->start);
@@ -1279,11 +1297,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		__tree_mod_log_rewind(eb, time_seq, tm);
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
-	extent_buffer_get(eb);
+	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
 
 	return eb;
 }
 
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	int level;
+
+	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+		level = tm->old_root.level;
+	} else {
+		rcu_read_lock();
+		level = btrfs_header_level(root->node);
+		rcu_read_unlock();
+	}
+
+	return level;
+}
+
 static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
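
btrfs_old_root_level() gives tree-mod-log users (such as the __resolve_indirect_ref hunk earlier in this diff) the root level as it was at time_seq: if the log records a root replacement newer than that sequence number, the level of the replaced root applies; otherwise the live root's level is still authoritative. A simplified userspace model of that decision (the struct is a stand-in for tree_mod_elem, not the kernel type):

#include <stddef.h>

struct mod_root_replace {
	unsigned long seq;	/* when the replacement was logged */
	int old_level;		/* level the replaced root had */
};

static int old_root_level(const struct mod_root_replace *tm,
			  int live_level, unsigned long time_seq)
{
	/* a replacement logged at or after time_seq shadows the live root */
	if (tm && tm->seq >= time_seq)
		return tm->old_level;
	return live_level;
}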
@@ -1324,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	u64 search_start;
 	int ret;
 
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %llu running %llu\n",
-		       (unsigned long long)trans->transid,
-		       (unsigned long long)
-		       root->fs_info->running_transaction->transid);
-		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %llu running %llu\n",
-		       (unsigned long long)trans->transid,
-		       (unsigned long long)root->fs_info->generation);
-		WARN_ON(1);
-	}
+	if (trans->transaction != root->fs_info->running_transaction)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
+		     (unsigned long long)trans->transid,
+		     (unsigned long long)
+		     root->fs_info->running_transaction->transid);
+
+	if (trans->transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
+		     (unsigned long long)trans->transid,
+		     (unsigned long long)root->fs_info->generation);
 
 	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
@@ -1432,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction)
-		WARN_ON(1);
-	if (trans->transid != root->fs_info->generation)
-		WARN_ON(1);
+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
+	WARN_ON(trans->transid != root->fs_info->generation);
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1725,6 +1755,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		goto enospc;
 	}
 
+	tree_mod_log_free_eb(root->fs_info, root->node);
 	tree_mod_log_set_root_pointer(root, child);
 	rcu_assign_pointer(root->node, child);
 
@@ -1789,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(right) == 0) {
 		clean_tree_block(trans, root, right);
 		btrfs_tree_unlock(right);
-		del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+		del_ptr(trans, root, path, level + 1, pslot + 1);
 		root_sub_used(root, right->len);
 		btrfs_free_tree_block(trans, root, right, 0, 1);
 		free_extent_buffer_stale(right);
@@ -1798,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		struct btrfs_disk_key right_key;
 		btrfs_node_key(right, &right_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &right_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &right_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 	}
@@ -1833,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(mid) == 0) {
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		del_ptr(trans, root, path, level + 1, pslot, 1);
+		del_ptr(trans, root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		free_extent_buffer_stale(mid);
@@ -1842,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
 		btrfs_node_key(mid, &mid_key, 0);
-		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
-					  pslot, 0);
+		tree_mod_log_set_node_key(root->fs_info, parent,
+					  pslot, 0);
 		btrfs_set_node_key(parent, &mid_key, pslot);
 		btrfs_mark_buffer_dirty(parent);
@@ -1942,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			orig_slot += left_nr;
 			btrfs_node_key(mid, &disk_key, 0);
 			tree_mod_log_set_node_key(root->fs_info, parent,
-						  &disk_key, pslot, 0);
+						  pslot, 0);
 			btrfs_set_node_key(parent, &disk_key, pslot);
 			btrfs_mark_buffer_dirty(parent);
 			if (btrfs_header_nritems(left) > orig_slot) {
@@ -1995,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
 		btrfs_node_key(right, &disk_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &disk_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &disk_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 
@@ -2181,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 	int no_skips = 0;
 	struct extent_buffer *t;
 
+	if (path->really_keep_locks)
+		return;
+
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
 			break;
@@ -2228,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks)
+	if (path->keep_locks || path->really_keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2461,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!cow)
 		write_lock_level = -1;
 
-	if (cow && (p->keep_locks || p->lowest_level))
+	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
 		write_lock_level = BTRFS_MAX_LEVEL;
 
 	min_write_lock_level = write_lock_level;
@@ -2530,7 +2564,10 @@ again:
2530 * must have write locks on this node and the 2564 * must have write locks on this node and the
2531 * parent 2565 * parent
2532 */ 2566 */
2533 if (level + 1 > write_lock_level) { 2567 if (level > write_lock_level ||
2568 (level + 1 > write_lock_level &&
2569 level + 1 < BTRFS_MAX_LEVEL &&
2570 p->nodes[level + 1])) {
2534 write_lock_level = level + 1; 2571 write_lock_level = level + 1;
2535 btrfs_release_path(p); 2572 btrfs_release_path(p);
2536 goto again; 2573 goto again;
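[Editor's note] The rewritten condition above is worth spelling out: the search must restart with wider write locking not only when the current node itself lacks a write lock, but also when a parent node exists in the path and is not yet write locked, since balancing at this level may modify the parent. A minimal sketch of the decision (illustrative helper, not kernel code):

static int must_restart_for_locks(struct btrfs_path *p, int level,
                                  int write_lock_level)
{
        /* the node being balanced needs a write lock itself */
        if (level > write_lock_level)
                return 1;
        /* its parent needs one too, but only if a parent exists */
        if (level + 1 > write_lock_level &&
            level + 1 < BTRFS_MAX_LEVEL &&
            p->nodes[level + 1])
                return 1;
        return 0;
}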
@@ -2879,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
2879 if (!path->nodes[i]) 2916 if (!path->nodes[i])
2880 break; 2917 break;
2881 t = path->nodes[i]; 2918 t = path->nodes[i];
2882 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2919 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2883 btrfs_set_node_key(t, key, tslot); 2920 btrfs_set_node_key(t, key, tslot);
2884 btrfs_mark_buffer_dirty(path->nodes[i]); 2921 btrfs_mark_buffer_dirty(path->nodes[i]);
2885 if (tslot != 0) 2922 if (tslot != 0)
@@ -2970,8 +3007,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2970 push_items * sizeof(struct btrfs_key_ptr)); 3007 push_items * sizeof(struct btrfs_key_ptr));
2971 3008
2972 if (push_items < src_nritems) { 3009 if (push_items < src_nritems) {
2973 tree_mod_log_eb_move(root->fs_info, src, 0, push_items, 3010 /*
2974 src_nritems - push_items); 3011 * don't call tree_mod_log_eb_move here, key removal was already
3012 * fully logged by tree_mod_log_eb_copy above.
3013 */
2975 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 3014 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2976 btrfs_node_key_ptr_offset(push_items), 3015 btrfs_node_key_ptr_offset(push_items),
2977 (src_nritems - push_items) * 3016 (src_nritems - push_items) *
@@ -3262,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3262 */ 3301 */
3263static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3302static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3264{ 3303{
3304 struct btrfs_item *start_item;
3305 struct btrfs_item *end_item;
3306 struct btrfs_map_token token;
3265 int data_len; 3307 int data_len;
3266 int nritems = btrfs_header_nritems(l); 3308 int nritems = btrfs_header_nritems(l);
3267 int end = min(nritems, start + nr) - 1; 3309 int end = min(nritems, start + nr) - 1;
3268 3310
3269 if (!nr) 3311 if (!nr)
3270 return 0; 3312 return 0;
3271 data_len = btrfs_item_end_nr(l, start); 3313 btrfs_init_map_token(&token);
3272 data_len = data_len - btrfs_item_offset_nr(l, end); 3314 start_item = btrfs_item_nr(l, start);
3315 end_item = btrfs_item_nr(l, end);
3316 data_len = btrfs_token_item_offset(l, start_item, &token) +
3317 btrfs_token_item_size(l, start_item, &token);
3318 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3273 data_len += sizeof(struct btrfs_item) * nr; 3319 data_len += sizeof(struct btrfs_item) * nr;
3274 WARN_ON(data_len < 0); 3320 WARN_ON(data_len < 0);
3275 return data_len; 3321 return data_len;
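[Editor's note] leaf_space_used() now goes through a btrfs_map_token: the token caches the kernel address of the last mapped extent-buffer page, so a burst of item-header reads on the same leaf avoids remapping on every accessor call. A sketch of the pattern, using only helpers that appear in this patch:

static u32 first_item_end(struct extent_buffer *leaf)
{
        struct btrfs_map_token token;
        struct btrfs_item *item = btrfs_item_nr(leaf, 0);

        btrfs_init_map_token(&token);   /* now just token->kaddr = NULL */
        /* both reads below reuse the mapping cached in the token */
        return btrfs_token_item_offset(leaf, item, &token) +
               btrfs_token_item_size(leaf, item, &token);
}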
@@ -3363,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3363 if (push_items == 0) 3409 if (push_items == 0)
3364 goto out_unlock; 3410 goto out_unlock;
3365 3411
3366 if (!empty && push_items == left_nritems) 3412 WARN_ON(!empty && push_items == left_nritems);
3367 WARN_ON(1);
3368 3413
3369 /* push left to right */ 3414 /* push left to right */
3370 right_nritems = btrfs_header_nritems(right); 3415 right_nritems = btrfs_header_nritems(right);
@@ -3602,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3602 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3647 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3603 3648
3604 /* fixup right node */ 3649 /* fixup right node */
3605 if (push_items > right_nritems) { 3650 if (push_items > right_nritems)
3606 printk(KERN_CRIT "push items %d nr %u\n", push_items, 3651 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3607 right_nritems); 3652 right_nritems);
3608 WARN_ON(1);
3609 }
3610 3653
3611 if (push_items < right_nritems) { 3654 if (push_items < right_nritems) {
3612 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3655 push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4402,149 +4445,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
4402} 4445}
4403 4446
4404/* 4447/*
4405 * Given a key and some data, insert items into the tree.
4406 * This does all the path init required, making room in the tree if needed.
4407 * Returns the number of keys that were inserted.
4408 */
4409int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4410 struct btrfs_root *root,
4411 struct btrfs_path *path,
4412 struct btrfs_key *cpu_key, u32 *data_size,
4413 int nr)
4414{
4415 struct extent_buffer *leaf;
4416 struct btrfs_item *item;
4417 int ret = 0;
4418 int slot;
4419 int i;
4420 u32 nritems;
4421 u32 total_data = 0;
4422 u32 total_size = 0;
4423 unsigned int data_end;
4424 struct btrfs_disk_key disk_key;
4425 struct btrfs_key found_key;
4426 struct btrfs_map_token token;
4427
4428 btrfs_init_map_token(&token);
4429
4430 for (i = 0; i < nr; i++) {
4431 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4432 BTRFS_LEAF_DATA_SIZE(root)) {
4433 break;
4434 nr = i;
4435 }
4436 total_data += data_size[i];
4437 total_size += data_size[i] + sizeof(struct btrfs_item);
4438 }
4439 BUG_ON(nr == 0);
4440
4441 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4442 if (ret == 0)
4443 return -EEXIST;
4444 if (ret < 0)
4445 goto out;
4446
4447 leaf = path->nodes[0];
4448
4449 nritems = btrfs_header_nritems(leaf);
4450 data_end = leaf_data_end(root, leaf);
4451
4452 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4453 for (i = nr; i >= 0; i--) {
4454 total_data -= data_size[i];
4455 total_size -= data_size[i] + sizeof(struct btrfs_item);
4456 if (total_size < btrfs_leaf_free_space(root, leaf))
4457 break;
4458 }
4459 nr = i;
4460 }
4461
4462 slot = path->slots[0];
4463 BUG_ON(slot < 0);
4464
4465 if (slot != nritems) {
4466 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4467
4468 item = btrfs_item_nr(leaf, slot);
4469 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4470
4471 /* figure out how many keys we can insert in here */
4472 total_data = data_size[0];
4473 for (i = 1; i < nr; i++) {
4474 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4475 break;
4476 total_data += data_size[i];
4477 }
4478 nr = i;
4479
4480 if (old_data < data_end) {
4481 btrfs_print_leaf(root, leaf);
4482 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4483 slot, old_data, data_end);
4484 BUG_ON(1);
4485 }
4486 /*
4487 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4488 */
4489 /* first correct the data pointers */
4490 for (i = slot; i < nritems; i++) {
4491 u32 ioff;
4492
4493 item = btrfs_item_nr(leaf, i);
4494 ioff = btrfs_token_item_offset(leaf, item, &token);
4495 btrfs_set_token_item_offset(leaf, item,
4496 ioff - total_data, &token);
4497 }
4498 /* shift the items */
4499 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4500 btrfs_item_nr_offset(slot),
4501 (nritems - slot) * sizeof(struct btrfs_item));
4502
4503 /* shift the data */
4504 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4505 data_end - total_data, btrfs_leaf_data(leaf) +
4506 data_end, old_data - data_end);
4507 data_end = old_data;
4508 } else {
4509 /*
4510 * this sucks but it has to be done, if we are inserting at
4511 * the end of the leaf only insert 1 of the items, since we
4512 * have no way of knowing whats on the next leaf and we'd have
4513 * to drop our current locks to figure it out
4514 */
4515 nr = 1;
4516 }
4517
4518 /* setup the item for the new data */
4519 for (i = 0; i < nr; i++) {
4520 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4521 btrfs_set_item_key(leaf, &disk_key, slot + i);
4522 item = btrfs_item_nr(leaf, slot + i);
4523 btrfs_set_token_item_offset(leaf, item,
4524 data_end - data_size[i], &token);
4525 data_end -= data_size[i];
4526 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4527 }
4528 btrfs_set_header_nritems(leaf, nritems + nr);
4529 btrfs_mark_buffer_dirty(leaf);
4530
4531 ret = 0;
4532 if (slot == 0) {
4533 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4534 fixup_low_keys(trans, root, path, &disk_key, 1);
4535 }
4536
4537 if (btrfs_leaf_free_space(root, leaf) < 0) {
4538 btrfs_print_leaf(root, leaf);
4539 BUG();
4540 }
4541out:
4542 if (!ret)
4543 ret = nr;
4544 return ret;
4545}
4546
4547/*
4548 * this is a helper for btrfs_insert_empty_items, the main goal here is 4448 * this is a helper for btrfs_insert_empty_items, the main goal here is
4549 * to save stack depth by doing the bulk of the work in a function 4449 * to save stack depth by doing the bulk of the work in a function
4550 * that doesn't call btrfs_search_slot 4450 * that doesn't call btrfs_search_slot
@@ -4705,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4705 * empty a node. 4605 * empty a node.
4706 */ 4606 */
4707static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4607static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4708 struct btrfs_path *path, int level, int slot, 4608 struct btrfs_path *path, int level, int slot)
4709 int tree_mod_log)
4710{ 4609{
4711 struct extent_buffer *parent = path->nodes[level]; 4610 struct extent_buffer *parent = path->nodes[level];
4712 u32 nritems; 4611 u32 nritems;
4713 int ret; 4612 int ret;
4714 4613
4614 if (level) {
4615 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4616 MOD_LOG_KEY_REMOVE);
4617 BUG_ON(ret < 0);
4618 }
4619
4715 nritems = btrfs_header_nritems(parent); 4620 nritems = btrfs_header_nritems(parent);
4716 if (slot != nritems - 1) { 4621 if (slot != nritems - 1) {
4717 if (tree_mod_log && level) 4622 if (level)
4718 tree_mod_log_eb_move(root->fs_info, parent, slot, 4623 tree_mod_log_eb_move(root->fs_info, parent, slot,
4719 slot + 1, nritems - slot - 1); 4624 slot + 1, nritems - slot - 1);
4720 memmove_extent_buffer(parent, 4625 memmove_extent_buffer(parent,
@@ -4722,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4722 btrfs_node_key_ptr_offset(slot + 1), 4627 btrfs_node_key_ptr_offset(slot + 1),
4723 sizeof(struct btrfs_key_ptr) * 4628 sizeof(struct btrfs_key_ptr) *
4724 (nritems - slot - 1)); 4629 (nritems - slot - 1));
4725 } else if (tree_mod_log && level) {
4726 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4727 MOD_LOG_KEY_REMOVE);
4728 BUG_ON(ret < 0);
4729 } 4630 }
4730 4631
4731 nritems--; 4632 nritems--;
@@ -4759,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4759 struct extent_buffer *leaf) 4660 struct extent_buffer *leaf)
4760{ 4661{
4761 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4662 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4762 del_ptr(trans, root, path, 1, path->slots[1], 1); 4663 del_ptr(trans, root, path, 1, path->slots[1]);
4763 4664
4764 /* 4665 /*
4765 * btrfs_free_extent is expensive, we want to make sure we 4666 * btrfs_free_extent is expensive, we want to make sure we
@@ -5073,6 +4974,7 @@ static void tree_move_down(struct btrfs_root *root,
5073 struct btrfs_path *path, 4974 struct btrfs_path *path,
5074 int *level, int root_level) 4975 int *level, int root_level)
5075{ 4976{
4977 BUG_ON(*level == 0);
5076 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4978 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5077 path->slots[*level]); 4979 path->slots[*level]);
5078 path->slots[*level - 1] = 0; 4980 path->slots[*level - 1] = 0;
@@ -5089,7 +4991,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
5089 4991
5090 path->slots[*level]++; 4992 path->slots[*level]++;
5091 4993
5092 while (path->slots[*level] == nritems) { 4994 while (path->slots[*level] >= nritems) {
5093 if (*level == root_level) 4995 if (*level == root_level)
5094 return -1; 4996 return -1;
5095 4997
@@ -5225,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5225 right_path->search_commit_root = 1; 5127 right_path->search_commit_root = 1;
5226 right_path->skip_locking = 1; 5128 right_path->skip_locking = 1;
5227 5129
5228 spin_lock(&left_root->root_times_lock); 5130 spin_lock(&left_root->root_item_lock);
5229 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5131 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5230 spin_unlock(&left_root->root_times_lock); 5132 spin_unlock(&left_root->root_item_lock);
5231 5133
5232 spin_lock(&right_root->root_times_lock); 5134 spin_lock(&right_root->root_item_lock);
5233 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5135 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5234 spin_unlock(&right_root->root_times_lock); 5136 spin_unlock(&right_root->root_item_lock);
5235 5137
5236 trans = btrfs_join_transaction(left_root); 5138 trans = btrfs_join_transaction(left_root);
5237 if (IS_ERR(trans)) { 5139 if (IS_ERR(trans)) {
@@ -5326,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5326 goto out; 5228 goto out;
5327 } 5229 }
5328 5230
5329 spin_lock(&left_root->root_times_lock); 5231 spin_lock(&left_root->root_item_lock);
5330 ctransid = btrfs_root_ctransid(&left_root->root_item); 5232 ctransid = btrfs_root_ctransid(&left_root->root_item);
5331 spin_unlock(&left_root->root_times_lock); 5233 spin_unlock(&left_root->root_item_lock);
5332 if (ctransid != left_start_ctransid) 5234 if (ctransid != left_start_ctransid)
5333 left_start_ctransid = 0; 5235 left_start_ctransid = 0;
5334 5236
5335 spin_lock(&right_root->root_times_lock); 5237 spin_lock(&right_root->root_item_lock);
5336 ctransid = btrfs_root_ctransid(&right_root->root_item); 5238 ctransid = btrfs_root_ctransid(&right_root->root_item);
5337 spin_unlock(&right_root->root_times_lock); 5239 spin_unlock(&right_root->root_item_lock);
5338 if (ctransid != right_start_ctransid) 5240 if (ctransid != right_start_ctransid)
5339 right_start_ctransid = 0; 5241 right_start_ctransid = 0;
5340 5242
@@ -5433,9 +5335,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5433 goto out; 5335 goto out;
5434 advance_right = ADVANCE; 5336 advance_right = ADVANCE;
5435 } else { 5337 } else {
5338 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5436 ret = tree_compare_item(left_root, left_path, 5339 ret = tree_compare_item(left_root, left_path,
5437 right_path, tmp_buf); 5340 right_path, tmp_buf);
5438 if (ret) { 5341 if (ret) {
5342 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5439 ret = changed_cb(left_root, right_root, 5343 ret = changed_cb(left_root, right_root,
5440 left_path, right_path, 5344 left_path, right_path,
5441 &left_key, 5345 &left_key,
@@ -5596,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5596 return btrfs_next_old_leaf(root, path, 0); 5500 return btrfs_next_old_leaf(root, path, 0);
5597} 5501}
5598 5502
5503/* Release the path up to but not including the given level */
5504static void btrfs_release_level(struct btrfs_path *path, int level)
5505{
5506 int i;
5507
5508 for (i = 0; i < level; i++) {
5509 path->slots[i] = 0;
5510 if (!path->nodes[i])
5511 continue;
5512 if (path->locks[i]) {
5513 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
5514 path->locks[i] = 0;
5515 }
5516 free_extent_buffer(path->nodes[i]);
5517 path->nodes[i] = NULL;
5518 }
5519}
5520
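[Editor's note] btrfs_release_level() is a partial btrfs_release_path(): everything strictly below the given level is unlocked and dropped, while the node at that level and above stays referenced and locked, so the caller can keep walking from there. For example (sketch):

        /* drop levels 0 and 1, keep level 2 and above held */
        btrfs_release_level(path, 2);
        /* path->nodes[0..1] are now NULL; path->nodes[2] is untouched */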
5521/*
5522 * This function assumes 2 things
5523 *
5524 * 1) You are using path->keep_locks
5525 * 2) You are not inserting items.
5526 *
 5527 * If either of these is not true do not use this function. If you need a next
5528 * leaf with either of these not being true then this function can be easily
5529 * adapted to do that, but at the moment these are the limitations.
5530 */
5531int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
5532 struct btrfs_root *root, struct btrfs_path *path,
5533 int del)
5534{
5535 struct extent_buffer *b;
5536 struct btrfs_key key;
5537 u32 nritems;
5538 int level = 1;
5539 int slot;
5540 int ret = 1;
5541 int write_lock_level = BTRFS_MAX_LEVEL;
5542 int ins_len = del ? -1 : 0;
5543
5544 WARN_ON(!(path->keep_locks || path->really_keep_locks));
5545
5546 nritems = btrfs_header_nritems(path->nodes[0]);
5547 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5548
5549 while (path->nodes[level]) {
5550 nritems = btrfs_header_nritems(path->nodes[level]);
5551 if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
5552search:
5553 btrfs_release_path(path);
5554 ret = btrfs_search_slot(trans, root, &key, path,
5555 ins_len, 1);
5556 if (ret < 0)
5557 goto out;
5558 level = 1;
5559 continue;
5560 }
5561
5562 if (path->slots[level] >= nritems - 1) {
5563 level++;
5564 continue;
5565 }
5566
5567 btrfs_release_level(path, level);
5568 break;
5569 }
5570
5571 if (!path->nodes[level]) {
5572 ret = 1;
5573 goto out;
5574 }
5575
5576 path->slots[level]++;
5577 b = path->nodes[level];
5578
5579 while (b) {
5580 level = btrfs_header_level(b);
5581
5582 if (!should_cow_block(trans, root, b))
5583 goto cow_done;
5584
5585 btrfs_set_path_blocking(path);
5586 ret = btrfs_cow_block(trans, root, b,
5587 path->nodes[level + 1],
5588 path->slots[level + 1], &b);
5589 if (ret)
5590 goto out;
5591cow_done:
5592 path->nodes[level] = b;
5593 btrfs_clear_path_blocking(path, NULL, 0);
5594 if (level != 0) {
5595 ret = setup_nodes_for_search(trans, root, path, b,
5596 level, ins_len,
5597 &write_lock_level);
5598 if (ret == -EAGAIN)
5599 goto search;
5600 if (ret)
5601 goto out;
5602
5603 b = path->nodes[level];
5604 slot = path->slots[level];
5605
5606 ret = read_block_for_search(trans, root, path,
5607 &b, level, slot, &key, 0);
5608 if (ret == -EAGAIN)
5609 goto search;
5610 if (ret)
5611 goto out;
5612 level = btrfs_header_level(b);
5613 if (!btrfs_try_tree_write_lock(b)) {
5614 btrfs_set_path_blocking(path);
5615 btrfs_tree_lock(b);
5616 btrfs_clear_path_blocking(path, b,
5617 BTRFS_WRITE_LOCK);
5618 }
5619 path->locks[level] = BTRFS_WRITE_LOCK;
5620 path->nodes[level] = b;
5621 path->slots[level] = 0;
5622 } else {
5623 path->slots[level] = 0;
5624 ret = 0;
5625 break;
5626 }
5627 }
5628
5629out:
5630 if (ret)
5631 btrfs_release_path(path);
5632
5633 return ret;
5634}
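[Editor's note] A hedged usage sketch for btrfs_next_leaf_write(), respecting the two preconditions documented above (locks kept, no insertions; passing del selects ins_len == -1 so deletions remain safe). The flag and return convention are taken from the function above; the surrounding calls are illustrative:

        path->really_keep_locks = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;
        /* ... consume or delete items in path->nodes[0] ... */
        ret = btrfs_next_leaf_write(trans, root, path, 1 /* del */);
        if (ret > 0)
                ;       /* no next leaf, path released */
        else if (ret == 0)
                ;       /* path->nodes[0] is the next leaf, write locked */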
5635
5599int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5636int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5600 u64 time_seq) 5637 u64 time_seq)
5601{ 5638{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9821b672f5a2..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
48 48
49#define BTRFS_MAGIC "_BHRfS_M" 49#define BTRFS_MAGIC "_BHRfS_M"
50 50
51#define BTRFS_MAX_MIRRORS 2 51#define BTRFS_MAX_MIRRORS 3
52 52
53#define BTRFS_MAX_LEVEL 8 53#define BTRFS_MAX_LEVEL 8
54 54
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
142 142
143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
144 144
145#define BTRFS_DEV_REPLACE_DEVID 0
146
145/* 147/*
146 * the max metadata block size. This limit is somewhat artificial, 148 * the max metadata block size. This limit is somewhat artificial,
147 * but the memmove costs go through the roof for larger blocks. 149 * but the memmove costs go through the roof for larger blocks.
@@ -154,6 +156,13 @@ struct btrfs_ordered_sum;
154 */ 156 */
155#define BTRFS_NAME_LEN 255 157#define BTRFS_NAME_LEN 255
156 158
159/*
160 * Theoretical limit is larger, but we keep this down to a sane
161 * value. That should limit greatly the possibility of collisions on
162 * inode ref items.
163 */
164#define BTRFS_LINK_MAX 65535U
165
157/* 32 bytes in various csum fields */ 166/* 32 bytes in various csum fields */
158#define BTRFS_CSUM_SIZE 32 167#define BTRFS_CSUM_SIZE 32
159 168
@@ -165,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
165/* four bytes for CRC32 */ 174/* four bytes for CRC32 */
166#define BTRFS_EMPTY_DIR_SIZE 0 175#define BTRFS_EMPTY_DIR_SIZE 0
167 176
 177/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
178#define REQ_GET_READ_MIRRORS (1 << 30)
179
168#define BTRFS_FT_UNKNOWN 0 180#define BTRFS_FT_UNKNOWN 0
169#define BTRFS_FT_REG_FILE 1 181#define BTRFS_FT_REG_FILE 1
170#define BTRFS_FT_DIR 2 182#define BTRFS_FT_DIR 2
@@ -406,7 +418,7 @@ struct btrfs_root_backup {
406 __le64 bytes_used; 418 __le64 bytes_used;
407 __le64 num_devices; 419 __le64 num_devices;
408 /* future */ 420 /* future */
409 __le64 unsed_64[4]; 421 __le64 unused_64[4];
410 422
411 u8 tree_root_level; 423 u8 tree_root_level;
412 u8 chunk_root_level; 424 u8 chunk_root_level;
@@ -489,6 +501,8 @@ struct btrfs_super_block {
489 */ 501 */
490#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 502#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
491 503
504#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
505
492#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 506#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
493#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 507#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
494#define BTRFS_FEATURE_INCOMPAT_SUPP \ 508#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -496,7 +510,8 @@ struct btrfs_super_block {
496 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 510 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
497 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 511 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
498 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 512 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
499 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) 513 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
514 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
500 515
501/* 516/*
502 * A leaf is full of items. offset and size tell us where to find 517 * A leaf is full of items. offset and size tell us where to find
@@ -561,6 +576,7 @@ struct btrfs_path {
561 unsigned int skip_locking:1; 576 unsigned int skip_locking:1;
562 unsigned int leave_spinning:1; 577 unsigned int leave_spinning:1;
563 unsigned int search_commit_root:1; 578 unsigned int search_commit_root:1;
579 unsigned int really_keep_locks:1;
564}; 580};
565 581
566/* 582/*
@@ -643,6 +659,14 @@ struct btrfs_inode_ref {
643 /* name goes here */ 659 /* name goes here */
644} __attribute__ ((__packed__)); 660} __attribute__ ((__packed__));
645 661
662struct btrfs_inode_extref {
663 __le64 parent_objectid;
664 __le64 index;
665 __le16 name_len;
666 __u8 name[0];
667 /* name goes here */
668} __attribute__ ((__packed__));
669
646struct btrfs_timespec { 670struct btrfs_timespec {
647 __le64 sec; 671 __le64 sec;
648 __le32 nsec; 672 __le32 nsec;
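[Editor's note] btrfs_inode_extref extends btrfs_inode_ref for inodes whose back references overflow a single item (see BTRFS_LINK_MAX above). As a sketch of how such an item is presumably addressed — the key layout is not shown in this hunk, so treat the offset-is-a-name-hash part as an assumption:

        /* key.objectid = inode objectid
         * key.type     = BTRFS_INODE_EXTREF_KEY
         * key.offset   = hash(parent objectid, name)  -- assumed
         * hash collisions are possible, so lookups still compare names,
         * which is why btrfs_lookup_inode_extref() (declared further
         * down in this header) takes both the parent objectid and the
         * name. */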
@@ -867,6 +891,59 @@ struct btrfs_dev_stats_item {
867 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 891 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
868} __attribute__ ((__packed__)); 892} __attribute__ ((__packed__));
869 893
894#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
895#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
896#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
897#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
898#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
899#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
900#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
901
902struct btrfs_dev_replace {
903 u64 replace_state; /* see #define above */
904 u64 time_started; /* seconds since 1-Jan-1970 */
905 u64 time_stopped; /* seconds since 1-Jan-1970 */
906 atomic64_t num_write_errors;
907 atomic64_t num_uncorrectable_read_errors;
908
909 u64 cursor_left;
910 u64 committed_cursor_left;
911 u64 cursor_left_last_write_of_item;
912 u64 cursor_right;
913
914 u64 cont_reading_from_srcdev_mode; /* see #define above */
915
916 int is_valid;
917 int item_needs_writeback;
918 struct btrfs_device *srcdev;
919 struct btrfs_device *tgtdev;
920
921 pid_t lock_owner;
922 atomic_t nesting_level;
923 struct mutex lock_finishing_cancel_unmount;
924 struct mutex lock_management_lock;
925 struct mutex lock;
926
927 struct btrfs_scrub_progress scrub_progress;
928};
929
930struct btrfs_dev_replace_item {
931 /*
932 * grow this item struct at the end for future enhancements and keep
933 * the existing values unchanged
934 */
935 __le64 src_devid;
936 __le64 cursor_left;
937 __le64 cursor_right;
938 __le64 cont_reading_from_srcdev_mode;
939
940 __le64 replace_state;
941 __le64 time_started;
942 __le64 time_stopped;
943 __le64 num_write_errors;
944 __le64 num_uncorrectable_read_errors;
945} __attribute__ ((__packed__));
946
870/* different types of block groups (and chunks) */ 947/* different types of block groups (and chunks) */
871#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 948#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
872#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 949#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1028,12 +1105,22 @@ struct btrfs_space_info {
1028 wait_queue_head_t wait; 1105 wait_queue_head_t wait;
1029}; 1106};
1030 1107
1108#define BTRFS_BLOCK_RSV_GLOBAL 1
1109#define BTRFS_BLOCK_RSV_DELALLOC 2
1110#define BTRFS_BLOCK_RSV_TRANS 3
1111#define BTRFS_BLOCK_RSV_CHUNK 4
1112#define BTRFS_BLOCK_RSV_DELOPS 5
1113#define BTRFS_BLOCK_RSV_EMPTY 6
1114#define BTRFS_BLOCK_RSV_TEMP 7
1115
1031struct btrfs_block_rsv { 1116struct btrfs_block_rsv {
1032 u64 size; 1117 u64 size;
1033 u64 reserved; 1118 u64 reserved;
1034 struct btrfs_space_info *space_info; 1119 struct btrfs_space_info *space_info;
1035 spinlock_t lock; 1120 spinlock_t lock;
1036 unsigned int full; 1121 unsigned short full;
1122 unsigned short type;
1123 unsigned short failfast;
1037}; 1124};
1038 1125
1039/* 1126/*
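[Editor's note] The reservation types above let code identify where a btrfs_block_rsv came from without comparing its address against the fs_info globals; the delayed-inode hunk further down switches to exactly that, testing src_rsv->type against BTRFS_BLOCK_RSV_DELALLOC. A sketch using the new init signature declared later in this header:

        struct btrfs_block_rsv rsv;

        btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);
        if (rsv.type == BTRFS_BLOCK_RSV_TEMP)
                ;       /* a short-lived reservation, not a global pool */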
@@ -1127,6 +1214,9 @@ struct btrfs_block_group_cache {
1127 * Today it will only have one thing on it, but that may change 1214 * Today it will only have one thing on it, but that may change
1128 */ 1215 */
1129 struct list_head cluster_list; 1216 struct list_head cluster_list;
1217
1218 /* For delayed block group creation */
1219 struct list_head new_bg_list;
1130}; 1220};
1131 1221
1132/* delayed seq elem */ 1222/* delayed seq elem */
@@ -1240,7 +1330,6 @@ struct btrfs_fs_info {
1240 struct mutex reloc_mutex; 1330 struct mutex reloc_mutex;
1241 1331
1242 struct list_head trans_list; 1332 struct list_head trans_list;
1243 struct list_head hashers;
1244 struct list_head dead_roots; 1333 struct list_head dead_roots;
1245 struct list_head caching_block_groups; 1334 struct list_head caching_block_groups;
1246 1335
@@ -1303,6 +1392,7 @@ struct btrfs_fs_info {
1303 struct btrfs_workers generic_worker; 1392 struct btrfs_workers generic_worker;
1304 struct btrfs_workers workers; 1393 struct btrfs_workers workers;
1305 struct btrfs_workers delalloc_workers; 1394 struct btrfs_workers delalloc_workers;
1395 struct btrfs_workers flush_workers;
1306 struct btrfs_workers endio_workers; 1396 struct btrfs_workers endio_workers;
1307 struct btrfs_workers endio_meta_workers; 1397 struct btrfs_workers endio_meta_workers;
1308 struct btrfs_workers endio_meta_write_workers; 1398 struct btrfs_workers endio_meta_write_workers;
@@ -1366,9 +1456,6 @@ struct btrfs_fs_info {
1366 struct rb_root defrag_inodes; 1456 struct rb_root defrag_inodes;
1367 atomic_t defrag_running; 1457 atomic_t defrag_running;
1368 1458
1369 spinlock_t ref_cache_lock;
1370 u64 total_ref_cache_size;
1371
1372 /* 1459 /*
1373 * these three are in extended format (availability of single 1460 * these three are in extended format (availability of single
1374 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other 1461 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1402,6 +1489,8 @@ struct btrfs_fs_info {
1402 struct rw_semaphore scrub_super_lock; 1489 struct rw_semaphore scrub_super_lock;
1403 int scrub_workers_refcnt; 1490 int scrub_workers_refcnt;
1404 struct btrfs_workers scrub_workers; 1491 struct btrfs_workers scrub_workers;
1492 struct btrfs_workers scrub_wr_completion_workers;
1493 struct btrfs_workers scrub_nocow_workers;
1405 1494
1406#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1495#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1407 u32 check_integrity_print_mask; 1496 u32 check_integrity_print_mask;
@@ -1441,6 +1530,13 @@ struct btrfs_fs_info {
1441 1530
1442 /* next backup root to be overwritten */ 1531 /* next backup root to be overwritten */
1443 int backup_root_index; 1532 int backup_root_index;
1533
1534 int num_tolerated_disk_barrier_failures;
1535
1536 /* device replace state */
1537 struct btrfs_dev_replace dev_replace;
1538
1539 atomic_t mutually_exclusive_operation_running;
1444}; 1540};
1445 1541
1446/* 1542/*
@@ -1481,9 +1577,9 @@ struct btrfs_root {
1481 wait_queue_head_t log_commit_wait[2]; 1577 wait_queue_head_t log_commit_wait[2];
1482 atomic_t log_writers; 1578 atomic_t log_writers;
1483 atomic_t log_commit[2]; 1579 atomic_t log_commit[2];
1580 atomic_t log_batch;
1484 unsigned long log_transid; 1581 unsigned long log_transid;
1485 unsigned long last_log_commit; 1582 unsigned long last_log_commit;
1486 unsigned long log_batch;
1487 pid_t log_start_pid; 1583 pid_t log_start_pid;
1488 bool log_multiple_pids; 1584 bool log_multiple_pids;
1489 1585
@@ -1550,7 +1646,7 @@ struct btrfs_root {
1550 1646
1551 int force_cow; 1647 int force_cow;
1552 1648
1553 spinlock_t root_times_lock; 1649 spinlock_t root_item_lock;
1554}; 1650};
1555 1651
1556struct btrfs_ioctl_defrag_range_args { 1652struct btrfs_ioctl_defrag_range_args {
@@ -1592,6 +1688,7 @@ struct btrfs_ioctl_defrag_range_args {
1592 */ 1688 */
1593#define BTRFS_INODE_ITEM_KEY 1 1689#define BTRFS_INODE_ITEM_KEY 1
1594#define BTRFS_INODE_REF_KEY 12 1690#define BTRFS_INODE_REF_KEY 12
1691#define BTRFS_INODE_EXTREF_KEY 13
1595#define BTRFS_XATTR_ITEM_KEY 24 1692#define BTRFS_XATTR_ITEM_KEY 24
1596#define BTRFS_ORPHAN_ITEM_KEY 48 1693#define BTRFS_ORPHAN_ITEM_KEY 48
1597/* reserve 2-15 close to the inode for later flexibility */ 1694/* reserve 2-15 close to the inode for later flexibility */
@@ -1693,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
1693#define BTRFS_DEV_STATS_KEY 249 1790#define BTRFS_DEV_STATS_KEY 249
1694 1791
1695/* 1792/*
 1793 * Persistently stores the device replace state in the device tree.
1794 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
1795 */
1796#define BTRFS_DEV_REPLACE_KEY 250
1797
1798/*
1696 * string items are for debugging. They just store a short string of 1799 * string items are for debugging. They just store a short string of
1697 * data in the FS 1800 * data in the FS
1698 */ 1801 */
@@ -1757,7 +1860,7 @@ struct btrfs_map_token {
1757 1860
1758static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1861static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1759{ 1862{
1760 memset(token, 0, sizeof(*token)); 1863 token->kaddr = NULL;
1761} 1864}
1762 1865
1763/* some macros to generate set/get funcs for the struct fields. This 1866/* some macros to generate set/get funcs for the struct fields. This
@@ -1978,6 +2081,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1978BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); 2081BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1979BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); 2082BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1980 2083
2084/* struct btrfs_inode_extref */
2085BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
2086 parent_objectid, 64);
2087BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
2088 name_len, 16);
2089BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
2090
1981/* struct btrfs_inode_item */ 2091/* struct btrfs_inode_item */
1982BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); 2092BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1983BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); 2093BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2718,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2718BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, 2828BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2719 rsv_excl, 64); 2829 rsv_excl, 64);
2720 2830
2831/* btrfs_dev_replace_item */
2832BTRFS_SETGET_FUNCS(dev_replace_src_devid,
2833 struct btrfs_dev_replace_item, src_devid, 64);
2834BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
2835 struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
2836 64);
2837BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
2838 replace_state, 64);
2839BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
2840 time_started, 64);
2841BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
2842 time_stopped, 64);
2843BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
2844 num_write_errors, 64);
2845BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
2846 struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
2847 64);
2848BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
2849 cursor_left, 64);
2850BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
2851 cursor_right, 64);
2852
2853BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
2854 struct btrfs_dev_replace_item, src_devid, 64);
2855BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
2856 struct btrfs_dev_replace_item,
2857 cont_reading_from_srcdev_mode, 64);
2858BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
2859 struct btrfs_dev_replace_item, replace_state, 64);
2860BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
2861 struct btrfs_dev_replace_item, time_started, 64);
2862BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
2863 struct btrfs_dev_replace_item, time_stopped, 64);
2864BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
2865 struct btrfs_dev_replace_item, num_write_errors, 64);
2866BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
2867 struct btrfs_dev_replace_item,
2868 num_uncorrectable_read_errors, 64);
2869BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
2870 struct btrfs_dev_replace_item, cursor_left, 64);
2871BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
2872 struct btrfs_dev_replace_item, cursor_right, 64);
2873
2721static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2874static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2722{ 2875{
2723 return sb->s_fs_info; 2876 return sb->s_fs_info;
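[Editor's note] For readers unfamiliar with these macros: BTRFS_SETGET_FUNCS emits endian-safe accessors that read a field through an extent_buffer, while the _STACK_ variants operate on a plain in-memory copy of the item. Roughly what one of the stack pairs above expands to (a sketch, not the exact macro output):

static inline u64 btrfs_stack_dev_replace_src_devid(
                                struct btrfs_dev_replace_item *s)
{
        /* on-disk fields are little endian; convert on every access */
        return le64_to_cpu(s->src_devid);
}

static inline void btrfs_set_stack_dev_replace_src_devid(
                                struct btrfs_dev_replace_item *s, u64 val)
{
        s->src_devid = cpu_to_le64(val);
}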
@@ -2858,9 +3011,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2858 u64 size); 3011 u64 size);
2859int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3012int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2860 struct btrfs_root *root, u64 group_start); 3013 struct btrfs_root *root, u64 group_start);
3014void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3015 struct btrfs_root *root);
2861u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3016u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2862u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3017u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2863void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 3018void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
3019
3020enum btrfs_reserve_flush_enum {
3021 /* If we are in the transaction, we can't flush anything.*/
3022 BTRFS_RESERVE_NO_FLUSH,
3023 /*
3024 * Flushing delalloc may cause deadlock somewhere, in this
3025 * case, use FLUSH LIMIT
3026 */
3027 BTRFS_RESERVE_FLUSH_LIMIT,
3028 BTRFS_RESERVE_FLUSH_ALL,
3029};
3030
2864int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3031int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2865void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3032void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2866void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3033void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
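[Editor's note] With the flush policy folded into an argument, the old _noflush variants disappear and callers state their context explicitly. For instance, a caller already inside a transaction must not trigger flushing; this exact call appears in the delayed-inode hunk below:

        ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
                                  BTRFS_RESERVE_NO_FLUSH);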
@@ -2874,24 +3041,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2874void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); 3041void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2875int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); 3042int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2876void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); 3043void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2877void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); 3044void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2878struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 3045struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
3046 unsigned short type);
2879void btrfs_free_block_rsv(struct btrfs_root *root, 3047void btrfs_free_block_rsv(struct btrfs_root *root,
2880 struct btrfs_block_rsv *rsv); 3048 struct btrfs_block_rsv *rsv);
2881int btrfs_block_rsv_add(struct btrfs_root *root, 3049int btrfs_block_rsv_add(struct btrfs_root *root,
2882 struct btrfs_block_rsv *block_rsv, 3050 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
2883 u64 num_bytes); 3051 enum btrfs_reserve_flush_enum flush);
2884int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2885 struct btrfs_block_rsv *block_rsv,
2886 u64 num_bytes);
2887int btrfs_block_rsv_check(struct btrfs_root *root, 3052int btrfs_block_rsv_check(struct btrfs_root *root,
2888 struct btrfs_block_rsv *block_rsv, int min_factor); 3053 struct btrfs_block_rsv *block_rsv, int min_factor);
2889int btrfs_block_rsv_refill(struct btrfs_root *root, 3054int btrfs_block_rsv_refill(struct btrfs_root *root,
2890 struct btrfs_block_rsv *block_rsv, 3055 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
2891 u64 min_reserved); 3056 enum btrfs_reserve_flush_enum flush);
2892int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2893 struct btrfs_block_rsv *block_rsv,
2894 u64 min_reserved);
2895int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3057int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2896 struct btrfs_block_rsv *dst_rsv, 3058 struct btrfs_block_rsv *dst_rsv,
2897 u64 num_bytes); 3059 u64 num_bytes);
@@ -2915,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2915int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3077int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2916int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3078int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2917 struct btrfs_fs_info *fs_info); 3079 struct btrfs_fs_info *fs_info);
3080int __get_raid_index(u64 flags);
2918/* ctree.c */ 3081/* ctree.c */
2919int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2920 int level, int *slot); 3083 int level, int *slot);
@@ -3025,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3025} 3188}
3026 3189
3027int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3190int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3191int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
3192 struct btrfs_root *root, struct btrfs_path *path,
3193 int del);
3028int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3194int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3029 u64 time_seq); 3195 u64 time_seq);
3030static inline int btrfs_next_old_item(struct btrfs_root *root, 3196static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3080,6 +3246,7 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
3080{ 3246{
3081 return atomic_inc_return(&fs_info->tree_mod_seq); 3247 return atomic_inc_return(&fs_info->tree_mod_seq);
3082} 3248}
3249int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
3083 3250
3084/* root-item.c */ 3251/* root-item.c */
3085int btrfs_find_root_ref(struct btrfs_root *tree_root, 3252int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -3116,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3116 struct btrfs_root *root); 3283 struct btrfs_root *root);
3117 3284
3118/* dir-item.c */ 3285/* dir-item.c */
3286int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
3287 const char *name, int name_len);
3119int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3288int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
3120 struct btrfs_root *root, const char *name, 3289 struct btrfs_root *root, const char *name,
3121 int name_len, struct inode *dir, 3290 int name_len, struct inode *dir,
@@ -3172,12 +3341,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3172 struct btrfs_root *root, 3341 struct btrfs_root *root,
3173 const char *name, int name_len, 3342 const char *name, int name_len,
3174 u64 inode_objectid, u64 ref_objectid, u64 *index); 3343 u64 inode_objectid, u64 ref_objectid, u64 *index);
3175struct btrfs_inode_ref * 3344int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3176btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 3345 struct btrfs_root *root,
3177 struct btrfs_root *root, 3346 struct btrfs_path *path,
3178 struct btrfs_path *path, 3347 const char *name, int name_len,
3179 const char *name, int name_len, 3348 u64 inode_objectid, u64 ref_objectid, int mod,
3180 u64 inode_objectid, u64 ref_objectid, int mod); 3349 u64 *ret_index);
3181int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3350int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3182 struct btrfs_root *root, 3351 struct btrfs_root *root,
3183 struct btrfs_path *path, u64 objectid); 3352 struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3354,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
3185 *root, struct btrfs_path *path, 3354 *root, struct btrfs_path *path,
3186 struct btrfs_key *location, int mod); 3355 struct btrfs_key *location, int mod);
3187 3356
3357struct btrfs_inode_extref *
3358btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
3359 struct btrfs_root *root,
3360 struct btrfs_path *path,
3361 const char *name, int name_len,
3362 u64 inode_objectid, u64 ref_objectid, int ins_len,
3363 int cow);
3364
3365int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
3366 u64 ref_objectid, const char *name,
3367 int name_len,
3368 struct btrfs_inode_extref **extref_ret);
3369
3188/* file-item.c */ 3370/* file-item.c */
3189int btrfs_del_csums(struct btrfs_trans_handle *trans, 3371int btrfs_del_csums(struct btrfs_trans_handle *trans,
3190 struct btrfs_root *root, u64 bytenr, u64 len); 3372 struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3202,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
3202 struct btrfs_root *root, 3384 struct btrfs_root *root,
3203 struct btrfs_path *path, u64 objectid, 3385 struct btrfs_path *path, u64 objectid,
3204 u64 bytenr, int mod); 3386 u64 bytenr, int mod);
3387u64 btrfs_file_extent_length(struct btrfs_path *path);
3205int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
3206 struct btrfs_root *root, 3389 struct btrfs_root *root,
3207 struct btrfs_ordered_sum *sums); 3390 struct btrfs_ordered_sum *sums);
@@ -3217,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
3217int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3400int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
3218 struct list_head *list, int search_commit); 3401 struct list_head *list, int search_commit);
3219/* inode.c */ 3402/* inode.c */
3403struct btrfs_delalloc_work {
3404 struct inode *inode;
3405 int wait;
3406 int delay_iput;
3407 struct completion completion;
3408 struct list_head list;
3409 struct btrfs_work work;
3410};
3411
3412struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
3413 int wait, int delay_iput);
3414void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3415
3220struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3416struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3221 size_t pg_offset, u64 start, u64 len, 3417 size_t pg_offset, u64 start, u64 len,
3222 int create); 3418 int create);
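[Editor's note] The delalloc work items above are meant to be queued on the new flush_workers pool added to btrfs_fs_info earlier in this patch. A sketch of the intended lifecycle — using btrfs_queue_worker() as the queueing call is an assumption here:

        struct btrfs_delalloc_work *work;

        work = btrfs_alloc_delalloc_work(inode, 1 /* wait */, 0);
        if (!work)
                return -ENOMEM;
        btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
        /* blocks on work->completion, then frees the work item */
        btrfs_wait_and_free_delalloc_work(work);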
@@ -3249,6 +3445,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3249 struct btrfs_root *root, 3445 struct btrfs_root *root,
3250 struct inode *dir, u64 objectid, 3446 struct inode *dir, u64 objectid,
3251 const char *name, int name_len); 3447 const char *name, int name_len);
3448int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3449 int front);
3252int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3450int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3253 struct btrfs_root *root, 3451 struct btrfs_root *root,
3254 struct inode *inode, u64 new_size, 3452 struct inode *inode, u64 new_size,
@@ -3283,6 +3481,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3283int btrfs_update_inode(struct btrfs_trans_handle *trans, 3481int btrfs_update_inode(struct btrfs_trans_handle *trans,
3284 struct btrfs_root *root, 3482 struct btrfs_root *root,
3285 struct inode *inode); 3483 struct inode *inode);
3484int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3485 struct btrfs_root *root, struct inode *inode);
3286int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 3486int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
3287int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 3487int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
3288int btrfs_orphan_cleanup(struct btrfs_root *root); 3488int btrfs_orphan_cleanup(struct btrfs_root *root);
@@ -3308,16 +3508,30 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
3308int btrfs_defrag_file(struct inode *inode, struct file *file, 3508int btrfs_defrag_file(struct inode *inode, struct file *file,
3309 struct btrfs_ioctl_defrag_range_args *range, 3509 struct btrfs_ioctl_defrag_range_args *range,
3310 u64 newer_than, unsigned long max_pages); 3510 u64 newer_than, unsigned long max_pages);
3511void btrfs_get_block_group_info(struct list_head *groups_list,
3512 struct btrfs_ioctl_space_info *space);
3513
3311/* file.c */ 3514/* file.c */
3515int btrfs_auto_defrag_init(void);
3516void btrfs_auto_defrag_exit(void);
3312int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3517int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3313 struct inode *inode); 3518 struct inode *inode);
3314int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3519int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3520void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
3315int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3521int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3316int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3522void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3317 int skip_pinned); 3523 int skip_pinned);
3524int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
3525 u64 start, u64 end, int skip_pinned,
3526 int modified);
3318extern const struct file_operations btrfs_file_operations; 3527extern const struct file_operations btrfs_file_operations;
3319int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 3528int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3320 u64 start, u64 end, u64 *hint_byte, int drop_cache); 3529 struct btrfs_root *root, struct inode *inode,
3530 struct btrfs_path *path, u64 start, u64 end,
3531 u64 *drop_end, int drop_cache);
3532int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3533 struct btrfs_root *root, struct inode *inode, u64 start,
3534 u64 end, int drop_cache);
3321int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 3535int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
3322 struct inode *inode, u64 start, u64 end); 3536 struct inode *inode, u64 start, u64 end);
3323int btrfs_release_file(struct inode *inode, struct file *file); 3537int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3592,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3378 } 3592 }
3379} 3593}
3380 3594
3595/*
3596 * Call btrfs_abort_transaction as early as possible when an error condition is
3597 * detected, that way the exact line number is reported.
3598 */
3599
3381#define btrfs_abort_transaction(trans, root, errno) \ 3600#define btrfs_abort_transaction(trans, root, errno) \
3382do { \ 3601do { \
3383 __btrfs_abort_transaction(trans, root, __func__, \ 3602 __btrfs_abort_transaction(trans, root, __func__, \
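[Editor's note] The macro records __func__ and __LINE__ at the call site, which is why the comment asks for it at the earliest point the error is seen. The typical shape of a caller (illustrative):

        ret = btrfs_update_inode(trans, root, inode);
        if (ret) {
                /* abort here, not in the caller, so this line is logged */
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        }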
@@ -3445,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3445 struct btrfs_pending_snapshot *pending); 3664 struct btrfs_pending_snapshot *pending);
3446 3665
3447/* scrub.c */ 3666/* scrub.c */
3448int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3667int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3449 struct btrfs_scrub_progress *progress, int readonly); 3668 u64 end, struct btrfs_scrub_progress *progress,
3669 int readonly, int is_dev_replace);
3450void btrfs_scrub_pause(struct btrfs_root *root); 3670void btrfs_scrub_pause(struct btrfs_root *root);
3451void btrfs_scrub_pause_super(struct btrfs_root *root); 3671void btrfs_scrub_pause_super(struct btrfs_root *root);
3452void btrfs_scrub_continue(struct btrfs_root *root); 3672void btrfs_scrub_continue(struct btrfs_root *root);
3453void btrfs_scrub_continue_super(struct btrfs_root *root); 3673void btrfs_scrub_continue_super(struct btrfs_root *root);
3454int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674int btrfs_scrub_cancel(struct btrfs_fs_info *info);
3455int btrfs_scrub_cancel(struct btrfs_root *root); 3675int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
3456int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3676 struct btrfs_device *dev);
3457int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
3458int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3459 struct btrfs_scrub_progress *progress); 3679 struct btrfs_scrub_progress *progress);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 52c85e2b95d0..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
29 29
30int __init btrfs_delayed_inode_init(void) 30int __init btrfs_delayed_inode_init(void)
31{ 31{
32 delayed_node_cache = kmem_cache_create("delayed_node", 32 delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
33 sizeof(struct btrfs_delayed_node), 33 sizeof(struct btrfs_delayed_node),
34 0, 34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,8 +650,9 @@ static int btrfs_delayed_inode_reserve_metadata(
650 * we're accounted for. 650 * we're accounted for.
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv != &root->fs_info->delalloc_block_rsv)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
655 BTRFS_RESERVE_NO_FLUSH);
655 /* 656 /*
656 * Since we're under a transaction reserve_metadata_bytes could 657 * Since we're under a transaction reserve_metadata_bytes could
657 * try to commit the transaction which will make it return 658 * try to commit the transaction which will make it return
@@ -668,7 +669,7 @@ static int btrfs_delayed_inode_reserve_metadata(
668 num_bytes, 1); 669 num_bytes, 1);
669 } 670 }
670 return ret; 671 return ret;
671 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 672 } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
672 spin_lock(&BTRFS_I(inode)->lock); 673 spin_lock(&BTRFS_I(inode)->lock);
673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 674 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
674 &BTRFS_I(inode)->runtime_flags)) { 675 &BTRFS_I(inode)->runtime_flags)) {
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
686 * reserve something strictly for us. If not be a pain and try 687 * reserve something strictly for us. If not be a pain and try
687 * to steal from the delalloc block rsv. 688 * to steal from the delalloc block rsv.
688 */ 689 */
689 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 690 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
691 BTRFS_RESERVE_NO_FLUSH);
690 if (!ret) 692 if (!ret)
691 goto out; 693 goto out;
692 694
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1255 struct btrfs_delayed_node *delayed_node = NULL; 1257 struct btrfs_delayed_node *delayed_node = NULL;
1256 struct btrfs_root *root; 1258 struct btrfs_root *root;
1257 struct btrfs_block_rsv *block_rsv; 1259 struct btrfs_block_rsv *block_rsv;
1258 unsigned long nr = 0;
1259 int need_requeue = 0; 1260 int need_requeue = 0;
1260 int ret; 1261 int ret;
1261 1262
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1316 delayed_node); 1317 delayed_node);
1317 mutex_unlock(&delayed_node->mutex); 1318 mutex_unlock(&delayed_node->mutex);
1318 1319
1319 nr = trans->blocks_used;
1320
1321 trans->block_rsv = block_rsv; 1320 trans->block_rsv = block_rsv;
1322 btrfs_end_transaction_dmeta(trans, root); 1321 btrfs_end_transaction_dmeta(trans, root);
1323 __btrfs_btree_balance_dirty(root, nr); 1322 btrfs_btree_balance_dirty_nodelay(root);
1324free_path: 1323free_path:
1325 btrfs_free_path(path); 1324 btrfs_free_path(path);
1326out: 1325out:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
107 pr_warn("btrfs: dev_replace entry found has unexpected size, ignoring entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
156 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
223 * need to delete old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
235 pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
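/*
 * A minimal sketch (illustration only, not part of the patch) of the
 * cursor hand-off above: btrfs_run_dev_replace() snapshots the live
 * scrub position into the state item, and btrfs_after_dev_replace_commit()
 * promotes that snapshot to "committed" only once the transaction is on
 * disk, so a resume after a crash restarts from a durable position.
 */
struct cursor_pipeline {
	u64 cursor_left;			/* live scrub position */
	u64 cursor_left_last_write_of_item;	/* serialized this commit */
	u64 committed_cursor_left;		/* durable; resume starts here */
};

static void write_item_sketch(struct cursor_pipeline *c)
{
	/* as in btrfs_run_dev_replace(): snapshot into the tree item */
	c->cursor_left_last_write_of_item = c->cursor_left;
}

static void after_commit_sketch(struct cursor_pipeline *c)
{
	/* as above: the snapshot is durable only after the commit */
	c->committed_cursor_left = c->cursor_left_last_write_of_item;
}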
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
373 "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
430
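/*
 * How userspace reaches btrfs_dev_replace_start(): a hedged sketch that
 * assumes the btrfs_ioctl_dev_replace_args layout and the
 * BTRFS_IOC_DEV_REPLACE number introduced alongside this file (ioctl.h
 * in this series); error handling is reduced to perror().
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* btrfs ioctl ABI, copied from the kernel tree */

static int start_replace(const char *mnt, __u64 srcdevid, const char *tgtdev)
{
	struct btrfs_ioctl_dev_replace_args args;
	int fd = open(mnt, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return -1;
	}
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
	args.start.srcdevid = srcdevid;		/* or 0 plus srcdev_name */
	strncpy((char *)args.start.tgtdev_name, tgtdev,
		sizeof(args.start.tgtdev_name) - 1);
	args.start.cont_reading_from_srcdev_mode =
	    BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID;
	if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0)
		perror("BTRFS_IOC_DEV_REPLACE");
	return (int)args.result;
}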
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is not part of the filesystem anymore and its 1st
537 * superblock is scratched out so that it is no longer marked to
538 * belong to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
604 /* even if !dev_replace->is_valid, the values are good enough for
605 * the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
630
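/*
 * A plain-C sketch of the progress_1000 computation above (the kernel
 * uses div64_u64 because native 64-bit division is unavailable on some
 * 32-bit targets). The zero-slice guard is added for the illustration;
 * the function above relies on srcdev->total_bytes being large.
 */
static unsigned int progress_permille_sketch(unsigned long long cursor_left,
					     unsigned long long total_bytes)
{
	unsigned long long slice = total_bytes / 1000;	/* bytes per 0.1% */

	if (slice == 0)
		return 1000;
	return (unsigned int)(cursor_left / slice);
}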
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
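/*
 * Sketch of the test-and-set used above: atomic_xchg() returns the old
 * value, so a single call both claims the fs-wide exclusivity flag and
 * reports whether another exclusive operation was already running (the
 * code above turns that case into a WARN_ON). "flag" stands in for
 * fs_info->mutually_exclusive_operation_running.
 */
static int claim_exclusive_op(atomic_t *flag)
{
	return atomic_xchg(flag, 1) == 0;	/* nonzero: we own the flag */
}

static void release_exclusive_op(atomic_t *flag)
{
	atomic_set(flag, 0);	/* as done when the kthread finishes */
}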
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
800 * return true even if tgtdev is missing: this can
801 * happen if the dev_replace procedure is suspended
802 * by an umount, the tgtdev then goes missing (or
803 * "btrfs dev scan" is not called) and the
804 * filesystem is remounted in degraded state. This
805 * does not stop the dev_replace procedure; it has
806 * to be canceled manually if cancellation is
807 * wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
820 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
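/*
 * A condensed sketch of the nesting-lock technique implemented by
 * btrfs_dev_replace_lock()/btrfs_dev_replace_unlock(): an ordinary mutex
 * made re-entrant with an owner pid and a depth counter. The real code
 * additionally serializes the owner/depth bookkeeping with the second
 * lock_management_lock mutex; this sketch leans on the fact that only
 * the current owner ever sees owner == current->pid.
 */
struct rlock {
	struct mutex lock;
	pid_t owner;
	int depth;
};

static void rlock_lock(struct rlock *r)
{
	if (r->depth > 0 && r->owner == current->pid) {
		r->depth++;		/* nested acquisition by the owner */
		return;
	}
	mutex_lock(&r->lock);
	r->owner = current->pid;
	r->depth = 1;
}

static void rlock_unlock(struct rlock *r)
{
	if (--r->depth == 0) {
		r->owner = 0;
		mutex_unlock(&r->lock);
	}
}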
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
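/*
 * Hedged usage sketch for the interface above: sample the replace state
 * under the lock before duplicating a write to the target device, the
 * pattern btrfs_dev_replace_start() describes for btrfs_map_block().
 * submit_duplicate_write() is a hypothetical helper, not a kernel API.
 */
static void maybe_duplicate_write(struct btrfs_fs_info *fs_info,
				  struct bio *bio)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *tgtdev = NULL;

	btrfs_dev_replace_lock(dev_replace);
	if (btrfs_dev_replace_is_ongoing(dev_replace))
		tgtdev = dev_replace->tgtdev;
	btrfs_dev_replace_unlock(dev_replace);

	if (tgtdev && tgtdev->bdev)
		submit_duplicate_write(tgtdev, bio);	/* hypothetical */
}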
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238 /* return back any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
257 * see if there is room in the item to insert this
258 * name
259 */
260 data_size = sizeof(*di) + name_len;
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
274
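/*
 * Hedged usage sketch for btrfs_check_dir_item_collision() above: names
 * whose btrfs_name_hash() values collide share one DIR_ITEM btree item,
 * so the probe reports -EEXIST for an exact match and -EOVERFLOW when a
 * colliding name no longer fits in the leaf. The caller shown here is
 * illustrative, not from the patch.
 */
static int can_create_name(struct btrfs_root *root, u64 dir,
			   const char *name, int name_len)
{
	int ret = btrfs_check_dir_item_collision(root, dir, name, name_len);

	if (ret == -EOVERFLOW)
		pr_debug("btrfs: hash collision, no room in leaf for %.*s\n",
			 name_len, name);
	return ret;	/* 0 means the name can be inserted */
}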
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,11 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
49
50#ifdef CONFIG_X86
51#include <asm/cpufeature.h>
52#endif
48 53
49static struct extent_io_ops btree_extent_io_ops; 54static struct extent_io_ops btree_extent_io_ops;
50static void end_workqueue_fn(struct btrfs_work *work); 55static void end_workqueue_fn(struct btrfs_work *work);
@@ -217,26 +222,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
217 write_lock(&em_tree->lock); 222 write_lock(&em_tree->lock);
218 ret = add_extent_mapping(em_tree, em); 223 ret = add_extent_mapping(em_tree, em);
219 if (ret == -EEXIST) { 224 if (ret == -EEXIST) {
220 u64 failed_start = em->start;
221 u64 failed_len = em->len;
222
223 free_extent_map(em); 225 free_extent_map(em);
224 em = lookup_extent_mapping(em_tree, start, len); 226 em = lookup_extent_mapping(em_tree, start, len);
225 if (em) { 227 if (!em)
226 ret = 0; 228 em = ERR_PTR(-EIO);
227 } else {
228 em = lookup_extent_mapping(em_tree, failed_start,
229 failed_len);
230 ret = -EIO;
231 }
232 } else if (ret) { 229 } else if (ret) {
233 free_extent_map(em); 230 free_extent_map(em);
234 em = NULL; 231 em = ERR_PTR(ret);
235 } 232 }
236 write_unlock(&em_tree->lock); 233 write_unlock(&em_tree->lock);
237 234
238 if (ret)
239 em = ERR_PTR(ret);
240out: 235out:
241 return em; 236 return em;
242} 237}
@@ -393,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
393 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
394 break; 389 break;
395 390
396 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
397 eb->start, eb->len); 392 eb->start, eb->len);
398 if (num_copies == 1) 393 if (num_copies == 1)
399 break; 394 break;
@@ -439,10 +434,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
439 WARN_ON(1); 434 WARN_ON(1);
440 return 0; 435 return 0;
441 } 436 }
442 if (eb->pages[0] != page) {
443 WARN_ON(1);
444 return 0;
445 }
446 if (!PageUptodate(page)) { 437 if (!PageUptodate(page)) {
447 WARN_ON(1); 438 WARN_ON(1);
448 return 0; 439 return 0;
@@ -862,21 +853,37 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
862 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
863 u64 bio_offset) 854 u64 bio_offset)
864{ 855{
856 int ret;
857
865 /* 858 /*
866 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
867 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
868 */ 861 */
869 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
866}
867
868static int check_async_write(struct inode *inode, unsigned long bio_flags)
869{
870 if (bio_flags & EXTENT_BIO_TREE_LOG)
871 return 0;
872#ifdef CONFIG_X86
873 if (cpu_has_xmm4_2)
874 return 0;
875#endif
876 return 1;
870} 877}
871 878
872static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 879static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
873 int mirror_num, unsigned long bio_flags, 880 int mirror_num, unsigned long bio_flags,
874 u64 bio_offset) 881 u64 bio_offset)
875{ 882{
883 int async = check_async_write(inode, bio_flags);
876 int ret; 884 int ret;
877 885
878 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
879
880 /* 887 /*
881 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
882 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -884,20 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
884 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
885 bio, 1); 892 bio, 1);
886 if (ret) 893 if (ret)
887 return ret; 894 goto out_w_error;
888 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
889 mirror_num, 0); 896 mirror_num, 0);
897 } else if (!async) {
898 ret = btree_csum_one_bio(bio);
899 if (ret)
900 goto out_w_error;
901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
890 } 913 }
891 914
892 /* 915 if (ret) {
893 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
894 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
895 */ 918 }
896 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
897 inode, rw, bio, mirror_num, 0,
898 bio_offset,
899 __btree_submit_bio_start,
900 __btree_submit_bio_done);
901} 920}
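/*
 * A condensed sketch of the three-way dispatch that btree_submit_bio_hook()
 * now implements (control flow only; the helpers are the ones in the hunk
 * above). Writes are checksummed inline when that is cheap -- tree-log bios,
 * or CPUs whose SSE4.2 crc32c instruction makes csums fast -- and handed to
 * the worker threads otherwise.
 */
static int btree_submit_sketch(struct inode *inode, int rw, struct bio *bio,
			       int mirror_num, unsigned long bio_flags,
			       u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	if (!(rw & REQ_WRITE)) {
		/* reads: hook up the end_io workqueue, then map */
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
		if (!ret)
			ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
	} else if (!check_async_write(inode, bio_flags)) {
		/* tree-log bio or hardware crc32c: checksum inline */
		ret = btree_csum_one_bio(bio);
		if (!ret)
			ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
	} else {
		/* otherwise checksum on the worker threads */
		ret = btrfs_wq_submit_bio(root->fs_info, inode, rw, bio,
					  mirror_num, 0, bio_offset,
					  __btree_submit_bio_start,
					  __btree_submit_bio_done);
	}
	if (ret)
		bio_endio(bio, ret);	/* fail the bio instead of leaking it */
	return ret;
}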
902 921
903#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -982,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
982 1001
983static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
984{ 1003{
1004#ifdef DEBUG
985 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
986 1006
987 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -990,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
990 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
991 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
992 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
993 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
994} 1015}
995 1016
@@ -1121,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1121 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1122 } 1143 }
1123 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1124 }
1125 1145
1126 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1127 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1128 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1129 } 1150 }
1130} 1151}
1131 1152
@@ -1168,8 +1189,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1168 atomic_set(&root->log_commit[0], 0); 1189 atomic_set(&root->log_commit[0], 0);
1169 atomic_set(&root->log_commit[1], 0); 1190 atomic_set(&root->log_commit[1], 0);
1170 atomic_set(&root->log_writers, 0); 1191 atomic_set(&root->log_writers, 0);
1192 atomic_set(&root->log_batch, 0);
1171 atomic_set(&root->orphan_inodes, 0); 1193 atomic_set(&root->orphan_inodes, 0);
1172 root->log_batch = 0;
1173 root->log_transid = 0; 1194 root->log_transid = 0;
1174 root->last_log_commit = 0; 1195 root->last_log_commit = 0;
1175 extent_io_tree_init(&root->dirty_log_pages, 1196 extent_io_tree_init(&root->dirty_log_pages,
@@ -1185,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1185 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1186 root->anon_dev = 0; 1207 root->anon_dev = 0;
1187 1208
1188 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1189} 1210}
1190 1211
1191static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1667,9 +1688,10 @@ static int transaction_kthread(void *arg)
1667 spin_unlock(&root->fs_info->trans_lock); 1688 spin_unlock(&root->fs_info->trans_lock);
1668 1689
1669 /* If the file system is aborted, this will always fail. */ 1690 /* If the file system is aborted, this will always fail. */
1670 trans = btrfs_join_transaction(root); 1691 trans = btrfs_attach_transaction(root);
1671 if (IS_ERR(trans)) { 1692 if (IS_ERR(trans)) {
1672 cannot_commit = true; 1693 if (PTR_ERR(trans) != -ENOENT)
1694 cannot_commit = true;
1673 goto sleep; 1695 goto sleep;
1674 } 1696 }
1675 if (transid == trans->transid) { 1697 if (transid == trans->transid) {
@@ -1994,13 +2016,11 @@ int open_ctree(struct super_block *sb,
1994 INIT_LIST_HEAD(&fs_info->trans_list); 2016 INIT_LIST_HEAD(&fs_info->trans_list);
1995 INIT_LIST_HEAD(&fs_info->dead_roots); 2017 INIT_LIST_HEAD(&fs_info->dead_roots);
1996 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2018 INIT_LIST_HEAD(&fs_info->delayed_iputs);
1997 INIT_LIST_HEAD(&fs_info->hashers);
1998 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2019 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1999 INIT_LIST_HEAD(&fs_info->ordered_operations); 2020 INIT_LIST_HEAD(&fs_info->ordered_operations);
2000 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2021 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2001 spin_lock_init(&fs_info->delalloc_lock); 2022 spin_lock_init(&fs_info->delalloc_lock);
2002 spin_lock_init(&fs_info->trans_lock); 2023 spin_lock_init(&fs_info->trans_lock);
2003 spin_lock_init(&fs_info->ref_cache_lock);
2004 spin_lock_init(&fs_info->fs_roots_radix_lock); 2024 spin_lock_init(&fs_info->fs_roots_radix_lock);
2005 spin_lock_init(&fs_info->delayed_iput_lock); 2025 spin_lock_init(&fs_info->delayed_iput_lock);
2006 spin_lock_init(&fs_info->defrag_inodes_lock); 2026 spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2034,15 @@ int open_ctree(struct super_block *sb,
2014 INIT_LIST_HEAD(&fs_info->space_info); 2034 INIT_LIST_HEAD(&fs_info->space_info);
2015 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2035 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2016 btrfs_mapping_init(&fs_info->mapping_tree); 2036 btrfs_mapping_init(&fs_info->mapping_tree);
2017 btrfs_init_block_rsv(&fs_info->global_block_rsv); 2037 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2018 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 2038 BTRFS_BLOCK_RSV_GLOBAL);
2019 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 2039 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2020 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 2040 BTRFS_BLOCK_RSV_DELALLOC);
2021 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 2041 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2022 btrfs_init_block_rsv(&fs_info->delayed_block_rsv); 2042 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2043 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2044 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2045 BTRFS_BLOCK_RSV_DELOPS);
2023 atomic_set(&fs_info->nr_async_submits, 0); 2046 atomic_set(&fs_info->nr_async_submits, 0);
2024 atomic_set(&fs_info->async_delalloc_pages, 0); 2047 atomic_set(&fs_info->async_delalloc_pages, 0);
2025 atomic_set(&fs_info->async_submit_draining, 0); 2048 atomic_set(&fs_info->async_submit_draining, 0);
@@ -2121,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2121 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2122 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2123 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2124 2152
2125 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2126 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2269,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2269 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2270 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2271 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2272 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2273 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2274 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2340,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2340 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2341 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2342 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2343 if (ret) { 2376 if (ret) {
2344 err = -ENOMEM; 2377 err = -ENOMEM;
2345 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2408,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2408 goto fail_tree_roots; 2441 goto fail_tree_roots;
2409 } 2442 }
2410 2443
2411 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2412 2449
2413 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2414 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2480,6 +2517,14 @@ retry_root_backup:
2480 goto fail_block_groups; 2517 goto fail_block_groups;
2481 } 2518 }
2482 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2483 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2484 if (ret) { 2529 if (ret) {
2485 printk(KERN_ERR "Failed to initialize space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
@@ -2491,6 +2536,15 @@ retry_root_backup:
2491 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2536 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2492 goto fail_block_groups; 2537 goto fail_block_groups;
2493 } 2538 }
2539 fs_info->num_tolerated_disk_barrier_failures =
2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2494 2548
2495 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2496 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2619,6 +2673,13 @@ retry_root_backup:
2619 return ret; 2673 return ret;
2620 } 2674 }
2621 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2622 return 0; 2683 return 0;
2623 2684
2624fail_qgroup: 2685fail_qgroup:
@@ -2655,6 +2716,7 @@ fail_sb_buffer:
2655 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2656 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2657 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2658fail_alloc: 2720fail_alloc:
2659fail_iput: 2721fail_iput:
2660 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2874,12 +2936,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2874 printk_in_rcu("btrfs: disabling barriers on dev %s\n", 2936 printk_in_rcu("btrfs: disabling barriers on dev %s\n",
2875 rcu_str_deref(device->name)); 2937 rcu_str_deref(device->name));
2876 device->nobarriers = 1; 2938 device->nobarriers = 1;
2877 } 2939 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
2878 if (!bio_flagged(bio, BIO_UPTODATE)) {
2879 ret = -EIO; 2940 ret = -EIO;
2880 if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2941 btrfs_dev_stat_inc_and_print(device,
2881 btrfs_dev_stat_inc_and_print(device, 2942 BTRFS_DEV_STAT_FLUSH_ERRS);
2882 BTRFS_DEV_STAT_FLUSH_ERRS);
2883 } 2943 }
2884 2944
2885 /* drop the reference from the wait == 0 run */ 2945 /* drop the reference from the wait == 0 run */
@@ -2918,14 +2978,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2918{ 2978{
2919 struct list_head *head; 2979 struct list_head *head;
2920 struct btrfs_device *dev; 2980 struct btrfs_device *dev;
2921 int errors = 0; 2981 int errors_send = 0;
2982 int errors_wait = 0;
2922 int ret; 2983 int ret;
2923 2984
2924 /* send down all the barriers */ 2985 /* send down all the barriers */
2925 head = &info->fs_devices->devices; 2986 head = &info->fs_devices->devices;
2926 list_for_each_entry_rcu(dev, head, dev_list) { 2987 list_for_each_entry_rcu(dev, head, dev_list) {
2927 if (!dev->bdev) { 2988 if (!dev->bdev) {
2928 errors++; 2989 errors_send++;
2929 continue; 2990 continue;
2930 } 2991 }
2931 if (!dev->in_fs_metadata || !dev->writeable) 2992 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2994,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2933 2994
2934 ret = write_dev_flush(dev, 0); 2995 ret = write_dev_flush(dev, 0);
2935 if (ret) 2996 if (ret)
2936 errors++; 2997 errors_send++;
2937 } 2998 }
2938 2999
2939 /* wait for all the barriers */ 3000 /* wait for all the barriers */
2940 list_for_each_entry_rcu(dev, head, dev_list) { 3001 list_for_each_entry_rcu(dev, head, dev_list) {
2941 if (!dev->bdev) { 3002 if (!dev->bdev) {
2942 errors++; 3003 errors_wait++;
2943 continue; 3004 continue;
2944 } 3005 }
2945 if (!dev->in_fs_metadata || !dev->writeable) 3006 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +3008,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2947 3008
2948 ret = write_dev_flush(dev, 1); 3009 ret = write_dev_flush(dev, 1);
2949 if (ret) 3010 if (ret)
2950 errors++; 3011 errors_wait++;
2951 } 3012 }
2952 if (errors) 3013 if (errors_send > info->num_tolerated_disk_barrier_failures ||
3014 errors_wait > info->num_tolerated_disk_barrier_failures)
2953 return -EIO; 3015 return -EIO;
2954 return 0; 3016 return 0;
2955} 3017}
2956 3018
3019int btrfs_calc_num_tolerated_disk_barrier_failures(
3020 struct btrfs_fs_info *fs_info)
3021{
3022 struct btrfs_ioctl_space_info space;
3023 struct btrfs_space_info *sinfo;
3024 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
3025 BTRFS_BLOCK_GROUP_SYSTEM,
3026 BTRFS_BLOCK_GROUP_METADATA,
3027 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
3028 int num_types = 4;
3029 int i;
3030 int c;
3031 int num_tolerated_disk_barrier_failures =
3032 (int)fs_info->fs_devices->num_devices;
3033
3034 for (i = 0; i < num_types; i++) {
3035 struct btrfs_space_info *tmp;
3036
3037 sinfo = NULL;
3038 rcu_read_lock();
3039 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
3040 if (tmp->flags == types[i]) {
3041 sinfo = tmp;
3042 break;
3043 }
3044 }
3045 rcu_read_unlock();
3046
3047 if (!sinfo)
3048 continue;
3049
3050 down_read(&sinfo->groups_sem);
3051 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3052 if (!list_empty(&sinfo->block_groups[c])) {
3053 u64 flags;
3054
3055 btrfs_get_block_group_info(
3056 &sinfo->block_groups[c], &space);
3057 if (space.total_bytes == 0 ||
3058 space.used_bytes == 0)
3059 continue;
3060 flags = space.flags;
3061 /*
3062 * return
3063 * 0: if dup, single or RAID0 is configured for
3064 * any of metadata, system or data, else
3065 * 1: if RAID5 is configured, or if RAID1 or
3066 * RAID10 is configured and only two mirrors
3067 * are used, else
3068 * 2: if RAID6 is configured, else
3069 * num_mirrors - 1: if RAID1 or RAID10 is
3070 * configured and more than
3071 * 2 mirrors are used.
3072 */
3073 if (num_tolerated_disk_barrier_failures > 0 &&
3074 ((flags & (BTRFS_BLOCK_GROUP_DUP |
3075 BTRFS_BLOCK_GROUP_RAID0)) ||
3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3077 == 0)))
3078 num_tolerated_disk_barrier_failures = 0;
3079 else if (num_tolerated_disk_barrier_failures > 1
3080 &&
3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3082 BTRFS_BLOCK_GROUP_RAID10)))
3083 num_tolerated_disk_barrier_failures = 1;
3084 }
3085 }
3086 up_read(&sinfo->groups_sem);
3087 }
3088
3089 return num_tolerated_disk_barrier_failures;
3090}
3091
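/*
 * The per-profile rule from the comment above, pulled out as a helper for
 * illustration. The function above starts from num_devices and clamps a
 * running minimum across every allocated profile that holds used bytes;
 * only the 0 and 1 cases can occur until RAID5/6 profiles exist.
 */
static int profile_tolerated_failures(u64 flags)
{
	if ((flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0)) ||
	    (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
		return 0;	/* single, DUP, RAID0: no disk may fail */
	if (flags & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
		return 1;	/* two-copy mirrors tolerate one lost disk */
	return 0;		/* unknown profile: stay conservative */
}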
2957int write_all_supers(struct btrfs_root *root, int max_mirrors) 3092int write_all_supers(struct btrfs_root *root, int max_mirrors)
2958{ 3093{
2959 struct list_head *head; 3094 struct list_head *head;
@@ -2976,8 +3111,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2976 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3111 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2977 head = &root->fs_info->fs_devices->devices; 3112 head = &root->fs_info->fs_devices->devices;
2978 3113
2979 if (do_barriers) 3114 if (do_barriers) {
2980 barrier_all_devices(root->fs_info); 3115 ret = barrier_all_devices(root->fs_info);
3116 if (ret) {
3117 mutex_unlock(
3118 &root->fs_info->fs_devices->device_list_mutex);
3119 btrfs_error(root->fs_info, ret,
3120 "errors while submitting device barriers.");
3121 return ret;
3122 }
3123 }
2981 3124
2982 list_for_each_entry_rcu(dev, head, dev_list) { 3125 list_for_each_entry_rcu(dev, head, dev_list) {
2983 if (!dev->bdev) { 3126 if (!dev->bdev) {
@@ -3177,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3177 smp_mb(); 3320 smp_mb();
3178 3321
3179 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3180 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3181 3324
3182 btrfs_scrub_cancel(root); 3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3326
3327 btrfs_scrub_cancel(fs_info);
3183 3328
3184 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3185 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3186 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3187 3332
3188 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3189 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3190 3335
3191 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3192 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3211,10 +3356,6 @@ int close_ctree(struct btrfs_root *root)
3211 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3356 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3212 (unsigned long long)fs_info->delalloc_bytes); 3357 (unsigned long long)fs_info->delalloc_bytes);
3213 } 3358 }
3214 if (fs_info->total_ref_cache_size) {
3215 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
3216 (unsigned long long)fs_info->total_ref_cache_size);
3217 }
3218 3359
3219 free_extent_buffer(fs_info->extent_root->node); 3360 free_extent_buffer(fs_info->extent_root->node);
3220 free_extent_buffer(fs_info->extent_root->commit_root); 3361 free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3250,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3250 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3251 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3252 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3253 3395
3254#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3255 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3294,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3294 int was_dirty; 3436 int was_dirty;
3295 3437
3296 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3297 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3298 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3299 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3300 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3301 (unsigned long long)transid, 3443 (unsigned long long)transid,
3302 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3303 WARN_ON(1);
3304 }
3305 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3306 if (!was_dirty) { 3446 if (!was_dirty) {
3307 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
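The printk() plus WARN_ON(1) pair collapses into a single WARN(), which emits the message and the stack trace together and drops a pair of braces. The same transformation in isolation:

	/* before */
	if (cond) {
		printk(KERN_CRIT "btrfs: bad state\n");
		WARN_ON(1);
	}

	/* after: one call, message and backtrace together */
	if (cond)
		WARN(1, KERN_CRIT "btrfs: bad state\n");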
@@ -3310,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3310 } 3450 }
3311} 3451}
3312 3452
3313void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3314{ 3455{
3315 /* 3456 /*
3316 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3322,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3322 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3323 return; 3464 return;
3324 3465
3325 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3326 3468
3327 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3328 3470
3329 if (num_dirty > thresh) { 3471 if (num_dirty > thresh) {
3330 balance_dirty_pages_ratelimited_nr( 3472 balance_dirty_pages_ratelimited(
3331 root->fs_info->btree_inode->i_mapping, 1); 3473 root->fs_info->btree_inode->i_mapping);
3332 } 3474 }
3333 return; 3475 return;
3334} 3476}
3335 3477
3336void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3337{ 3479{
3338 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3339 * looks as though older kernels can get into trouble with 3481}
3340 * this code, they end up stuck in balance_dirty_pages forever
3341 */
3342 u64 num_dirty;
3343 unsigned long thresh = 32 * 1024 * 1024;
3344
3345 if (current->flags & PF_MEMALLOC)
3346 return;
3347
3348 num_dirty = root->fs_info->dirty_metadata_bytes;
3349 3482
3350 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3351 balance_dirty_pages_ratelimited_nr( 3484{
3352 root->fs_info->btree_inode->i_mapping, 1); 3485 __btrfs_btree_balance_dirty(root, 0);
3353 }
3354 return;
3355} 3486}
3356 3487
3357int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
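The two near-identical balance functions fold into one helper taking a flush_delayed flag, and the unused nr argument disappears from the public API (see the disk-io.h hunk below). A typical caller, shown for illustration only:

	/* transaction-heavy write paths keep the delayed-item flush */
	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);

	/* paths that must not recurse into the delayed-item code */
	btrfs_btree_balance_dirty_nodelay(root);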
@@ -3360,52 +3491,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3360 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3491 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3361} 3492}
3362 3493
3363int btree_lock_page_hook(struct page *page, void *data,
3364 void (*flush_fn)(void *))
3365{
3366 struct inode *inode = page->mapping->host;
3367 struct btrfs_root *root = BTRFS_I(inode)->root;
3368 struct extent_buffer *eb;
3369
3370 /*
3371 * We culled this eb but the page is still hanging out on the mapping,
3372 * carry on.
3373 */
3374 if (!PagePrivate(page))
3375 goto out;
3376
3377 eb = (struct extent_buffer *)page->private;
3378 if (!eb) {
3379 WARN_ON(1);
3380 goto out;
3381 }
3382 if (page != eb->pages[0])
3383 goto out;
3384
3385 if (!btrfs_try_tree_write_lock(eb)) {
3386 flush_fn(data);
3387 btrfs_tree_lock(eb);
3388 }
3389 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3390
3391 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3392 spin_lock(&root->fs_info->delalloc_lock);
3393 if (root->fs_info->dirty_metadata_bytes >= eb->len)
3394 root->fs_info->dirty_metadata_bytes -= eb->len;
3395 else
3396 WARN_ON(1);
3397 spin_unlock(&root->fs_info->delalloc_lock);
3398 }
3399
3400 btrfs_tree_unlock(eb);
3401out:
3402 if (!trylock_page(page)) {
3403 flush_fn(data);
3404 lock_page(page);
3405 }
3406 return 0;
3407}
3408
3409static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 3494static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3410 int read_only) 3495 int read_only)
3411{ 3496{
@@ -3608,7 +3693,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3608 3693
3609 while (1) { 3694 while (1) {
3610 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 3695 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3611 mark); 3696 mark, NULL);
3612 if (ret) 3697 if (ret)
3613 break; 3698 break;
3614 3699
@@ -3663,7 +3748,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3663again: 3748again:
3664 while (1) { 3749 while (1) {
3665 ret = find_first_extent_bit(unpin, 0, &start, &end, 3750 ret = find_first_extent_bit(unpin, 0, &start, &end,
3666 EXTENT_DIRTY); 3751 EXTENT_DIRTY, NULL);
3667 if (ret) 3752 if (ret)
3668 break; 3753 break;
3669 3754
@@ -3800,7 +3885,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3800} 3885}
3801 3886
3802static struct extent_io_ops btree_extent_io_ops = { 3887static struct extent_io_ops btree_extent_io_ops = {
3803 .write_cache_pages_lock_hook = btree_lock_page_hook,
3804 .readpage_end_io_hook = btree_readpage_end_io_hook, 3888 .readpage_end_io_hook = btree_readpage_end_io_hook,
3805 .readpage_io_failed_hook = btree_io_failed_hook, 3889 .readpage_io_failed_hook = btree_io_failed_hook,
3806 .submit_bio_hook = btree_submit_bio_hook, 3890 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
95 u64 objectid); 95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data, 96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *)); 97 void (*flush_fn)(void *));
98int btrfs_calc_num_tolerated_disk_barrier_failures(
99 struct btrfs_fs_info *fs_info);
98 100
99#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
100void btrfs_init_lockdep(void); 102void btrfs_init_lockdep(void);
(The btree_lock_page_hook() declaration kept here is now stale; the function itself is deleted in the disk-io.c hunk above.)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d3..521e9d4424f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -94,8 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 u64 flags, struct btrfs_disk_key *key, 95 u64 flags, struct btrfs_disk_key *key,
95 int level, struct btrfs_key *ins); 96 int level, struct btrfs_key *ins);
96static int do_chunk_alloc(struct btrfs_trans_handle *trans, 97static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97 struct btrfs_root *extent_root, u64 alloc_bytes, 98 struct btrfs_root *extent_root, u64 flags,
98 u64 flags, int force); 99 int force);
99static int find_next_key(struct btrfs_path *path, int level, 100static int find_next_key(struct btrfs_path *path, int level,
100 struct btrfs_key *key); 101 struct btrfs_key *key);
101static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 102static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +313,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
312 while (start < end) { 313 while (start < end) {
313 ret = find_first_extent_bit(info->pinned_extents, start, 314 ret = find_first_extent_bit(info->pinned_extents, start,
314 &extent_start, &extent_end, 315 &extent_start, &extent_end,
315 EXTENT_DIRTY | EXTENT_UPTODATE); 316 EXTENT_DIRTY | EXTENT_UPTODATE,
317 NULL);
316 if (ret) 318 if (ret)
317 break; 319 break;
318 320
@@ -648,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
648 rcu_read_unlock(); 650 rcu_read_unlock();
649} 651}
650 652
651static u64 div_factor(u64 num, int factor)
652{
653 if (factor == 10)
654 return num;
655 num *= factor;
656 do_div(num, 10);
657 return num;
658}
659
660static u64 div_factor_fine(u64 num, int factor)
661{
662 if (factor == 100)
663 return num;
664 num *= factor;
665 do_div(num, 100);
666 return num;
667}
668
669u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
670 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
671{ 655{
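div_factor() and div_factor_fine() are not deleted outright: the diffstat adds fs/btrfs/math.h (44 lines) and extent-tree.c gains #include "math.h" above, so the helpers presumably move there as static inlines shared with other files. Roughly:

/* fs/btrfs/math.h (sketch of the moved helpers) */
#ifndef __BTRFS_MATH_H
#define __BTRFS_MATH_H

#include <asm/div64.h>

static inline u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}

#endif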
@@ -1834,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1834 1818
1835 1819
1836 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1837 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1838 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1839 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1840 if (!ret) { 1824 if (!ret) {
@@ -2313,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2313 kfree(extent_op); 2297 kfree(extent_op);
2314 2298
2315 if (ret) { 2299 if (ret) {
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2316 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2317 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2318 return ret; 2305 return ret;
@@ -2355,16 +2342,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2355 count++; 2342 count++;
2356 2343
2357 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2358 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2359 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2360 return ret; 2351 return ret;
2361 } 2352 }
2362 2353
2363next: 2354next:
2364 do_chunk_alloc(trans, fs_info->extent_root,
2365 2 * 1024 * 1024,
2366 btrfs_get_alloc_profile(root, 0),
2367 CHUNK_ALLOC_NO_FORCE);
2368 cond_resched(); 2355 cond_resched();
2369 spin_lock(&delayed_refs->lock); 2356 spin_lock(&delayed_refs->lock);
2370 } 2357 }
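Both error returns in run_clustered_refs() previously bailed out with locked_ref still on the cluster list and its mutex held, so the next pass over the cluster would deadlock on the abandoned head. Both sites now unwind the same way:

	if (ret) {
		/* release the ref head before reporting the error */
		if (locked_ref) {
			list_del_init(&locked_ref->cluster);
			mutex_unlock(&locked_ref->mutex);
		}
		spin_lock(&delayed_refs->lock);
		return ret;
	}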
@@ -2478,10 +2465,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2478 if (root == root->fs_info->extent_root) 2465 if (root == root->fs_info->extent_root)
2479 root = root->fs_info->tree_root; 2466 root = root->fs_info->tree_root;
2480 2467
2481 do_chunk_alloc(trans, root->fs_info->extent_root,
2482 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2483 CHUNK_ALLOC_NO_FORCE);
2484
2485 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2468 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2486 2469
2487 delayed_refs = &trans->transaction->delayed_refs; 2470 delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2534,12 @@ again:
2551 } 2534 }
2552 2535
2553 if (run_all) { 2536 if (run_all) {
2537 if (!list_empty(&trans->new_bgs)) {
2538 spin_unlock(&delayed_refs->lock);
2539 btrfs_create_pending_block_groups(trans, root);
2540 spin_lock(&delayed_refs->lock);
2541 }
2542
2554 node = rb_first(&delayed_refs->root); 2543 node = rb_first(&delayed_refs->root);
2555 if (!node) 2544 if (!node)
2556 goto out; 2545 goto out;
@@ -3406,7 +3395,6 @@ alloc:
3406 return PTR_ERR(trans); 3395 return PTR_ERR(trans);
3407 3396
3408 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3397 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3409 bytes + 2 * 1024 * 1024,
3410 alloc_target, 3398 alloc_target,
3411 CHUNK_ALLOC_NO_FORCE); 3399 CHUNK_ALLOC_NO_FORCE);
3412 btrfs_end_transaction(trans, root); 3400 btrfs_end_transaction(trans, root);
@@ -3488,8 +3476,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3488} 3476}
3489 3477
3490static int should_alloc_chunk(struct btrfs_root *root, 3478static int should_alloc_chunk(struct btrfs_root *root,
3491 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3479 struct btrfs_space_info *sinfo, int force)
3492 int force)
3493{ 3480{
3494 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3481 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3495 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3482 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3491,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3504 * and purposes it's used space. Don't worry about locking the 3491 * and purposes it's used space. Don't worry about locking the
3505 * global_rsv, it doesn't change except when the transaction commits. 3492 * global_rsv, it doesn't change except when the transaction commits.
3506 */ 3493 */
3507 num_allocated += global_rsv->size; 3494 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3495 num_allocated += global_rsv->size;
3508 3496
3509 /* 3497 /*
3510 * in limited mode, we want to have some free space up to 3498 * in limited mode, we want to have some free space up to
@@ -3518,15 +3506,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3518 if (num_bytes - num_allocated < thresh) 3506 if (num_bytes - num_allocated < thresh)
3519 return 1; 3507 return 1;
3520 } 3508 }
3521 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3522
3523 /* 256MB or 2% of the FS */
3524 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3525 /* system chunks need a much small threshold */
3526 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3527 thresh = 32 * 1024 * 1024;
3528 3509
3529 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) 3510 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3530 return 0; 3511 return 0;
3531 return 1; 3512 return 1;
3532} 3513}
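With the 256MB/2% floor gone, the decision outside limited mode is purely proportional: allocate a new chunk only once existing chunks are roughly 80% committed. Illustrative numbers: for num_bytes = 100GiB of non-readonly space, div_factor(num_bytes, 8) is 80GiB, so:

	/* 60GiB allocated of 100GiB: 60GiB + 2MiB < 80GiB -> no new chunk */
	/* 81GiB allocated of 100GiB: 81GiB + 2MiB >= 80GiB -> allocate   */
	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
		return 0;
	return 1;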
@@ -3576,8 +3557,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
3576} 3557}
3577 3558
3578static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3559static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3579 struct btrfs_root *extent_root, u64 alloc_bytes, 3560 struct btrfs_root *extent_root, u64 flags, int force)
3580 u64 flags, int force)
3581{ 3561{
3582 struct btrfs_space_info *space_info; 3562 struct btrfs_space_info *space_info;
3583 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3563 struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3581,7 @@ again:
3601 return 0; 3581 return 0;
3602 } 3582 }
3603 3583
3604 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3584 if (!should_alloc_chunk(extent_root, space_info, force)) {
3605 spin_unlock(&space_info->lock); 3585 spin_unlock(&space_info->lock);
3606 return 0; 3586 return 0;
3607 } else if (space_info->chunk_alloc) { 3587 } else if (space_info->chunk_alloc) {
@@ -3669,6 +3649,60 @@ out:
3669 return ret; 3649 return ret;
3670} 3650}
3671 3651
3652static int can_overcommit(struct btrfs_root *root,
3653 struct btrfs_space_info *space_info, u64 bytes,
3654 enum btrfs_reserve_flush_enum flush)
3655{
3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3657 u64 avail;
3658 u64 used;
3659
3660 used = space_info->bytes_used + space_info->bytes_reserved +
3661 space_info->bytes_pinned + space_info->bytes_readonly +
3662 space_info->bytes_may_use;
3663
3664 spin_lock(&root->fs_info->free_chunk_lock);
3665 avail = root->fs_info->free_chunk_space;
3666 spin_unlock(&root->fs_info->free_chunk_lock);
3667
3668 /*
3669 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable.
3671 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 |
3674 BTRFS_BLOCK_GROUP_RAID10))
3675 avail >>= 1;
3676
3677 /*
3678 * If we aren't flushing all things, let us overcommit up to
 3679 * 1/2 of the space. If we can flush, don't let us overcommit
3680 * too much, let it overcommit up to 1/8 of the space.
3681 */
3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3683 avail >>= 3;
3684 else
3685 avail >>= 1;
3686
3687 if (used + bytes < space_info->total_bytes + avail)
3688 return 1;
3689 return 0;
3690}
3691
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3672/* 3706/*
3673 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3674 */ 3708 */
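can_overcommit() hoists the heuristic that reserve_metadata_bytes() used to open-code (removed in the -3922 hunk further down) so that shrink_delalloc() can reuse it. Illustrative arithmetic: with 8GiB of unallocated chunk space on RAID1, avail halves to 4GiB; a BTRFS_RESERVE_FLUSH_ALL caller may then overcommit by 4GiB >> 3 = 512MiB, any other caller by 4GiB >> 1 = 2GiB:

	/* FLUSH_ALL:  succeed while used + bytes < total + 512MiB */
	/* otherwise:  succeed while used + bytes < total + 2GiB   */
	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;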
@@ -3683,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3683 long time_left; 3717 long time_left;
3684 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3685 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3686 3721
3687 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3688 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3693,21 +3728,30 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3693 if (delalloc_bytes == 0) { 3728 if (delalloc_bytes == 0) {
3694 if (trans) 3729 if (trans)
3695 return; 3730 return;
3696 btrfs_wait_ordered_extents(root, 0, 0); 3731 btrfs_wait_ordered_extents(root, 0);
3697 return; 3732 return;
3698 } 3733 }
3699 3734
3700 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3701 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3702 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3703 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3704 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3705 3741
3742 /*
3743 * We need to wait for the async pages to actually start before
3744 * we do anything.
3745 */
3746 wait_event(root->fs_info->async_submit_wait,
3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3706 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3707 if (space_info->bytes_used + space_info->bytes_reserved + 3754 if (can_overcommit(root, space_info, orig, flush)) {
3708 space_info->bytes_pinned + space_info->bytes_readonly +
3709 space_info->bytes_may_use + orig <=
3710 space_info->total_bytes) {
3711 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3712 break; 3756 break;
3713 } 3757 }
@@ -3715,7 +3759,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3715 3759
3716 loops++; 3760 loops++;
3717 if (wait_ordered && !trans) { 3761 if (wait_ordered && !trans) {
3718 btrfs_wait_ordered_extents(root, 0, 0); 3762 btrfs_wait_ordered_extents(root, 0);
3719 } else { 3763 } else {
3720 time_left = schedule_timeout_killable(1); 3764 time_left = schedule_timeout_killable(1);
3721 if (time_left) 3765 if (time_left)
@@ -3784,11 +3828,12 @@ commit:
3784} 3828}
3785 3829
3786enum flush_state { 3830enum flush_state {
3787 FLUSH_DELALLOC = 1, 3831 FLUSH_DELAYED_ITEMS_NR = 1,
3788 FLUSH_DELALLOC_WAIT = 2, 3832 FLUSH_DELAYED_ITEMS = 2,
3789 FLUSH_DELAYED_ITEMS_NR = 3, 3833 FLUSH_DELALLOC = 3,
3790 FLUSH_DELAYED_ITEMS = 4, 3834 FLUSH_DELALLOC_WAIT = 4,
3791 COMMIT_TRANS = 5, 3835 ALLOC_CHUNK = 5,
3836 COMMIT_TRANS = 6,
3792}; 3837};
3793 3838
3794static int flush_space(struct btrfs_root *root, 3839static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3845,6 @@ static int flush_space(struct btrfs_root *root,
3800 int ret = 0; 3845 int ret = 0;
3801 3846
3802 switch (state) { 3847 switch (state) {
3803 case FLUSH_DELALLOC:
3804 case FLUSH_DELALLOC_WAIT:
3805 shrink_delalloc(root, num_bytes, orig_bytes,
3806 state == FLUSH_DELALLOC_WAIT);
3807 break;
3808 case FLUSH_DELAYED_ITEMS_NR: 3848 case FLUSH_DELAYED_ITEMS_NR:
3809 case FLUSH_DELAYED_ITEMS: 3849 case FLUSH_DELAYED_ITEMS:
3810 if (state == FLUSH_DELAYED_ITEMS_NR) { 3850 if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3865,24 @@ static int flush_space(struct btrfs_root *root,
3825 ret = btrfs_run_delayed_items_nr(trans, root, nr); 3865 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3826 btrfs_end_transaction(trans, root); 3866 btrfs_end_transaction(trans, root);
3827 break; 3867 break;
3868 case FLUSH_DELALLOC:
3869 case FLUSH_DELALLOC_WAIT:
3870 shrink_delalloc(root, num_bytes, orig_bytes,
3871 state == FLUSH_DELALLOC_WAIT);
3872 break;
3873 case ALLOC_CHUNK:
3874 trans = btrfs_join_transaction(root);
3875 if (IS_ERR(trans)) {
3876 ret = PTR_ERR(trans);
3877 break;
3878 }
3879 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3880 btrfs_get_alloc_profile(root, 0),
3881 CHUNK_ALLOC_NO_FORCE);
3882 btrfs_end_transaction(trans, root);
3883 if (ret == -ENOSPC)
3884 ret = 0;
3885 break;
3828 case COMMIT_TRANS: 3886 case COMMIT_TRANS:
3829 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 3887 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3830 break; 3888 break;
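flush_space() remains a plain escalation ladder driven by reserve_metadata_bytes(): each failed attempt moves one enum value up until the reservation fits or COMMIT_TRANS has been tried. With the reorder above, cheap delayed-item flushing comes before delalloc writeback, and ALLOC_CHUNK is tried before committing. The driving loop is roughly as follows (condensed; the real code is goto-based and 'reserved' stands in for the success condition):

	int flush_state = FLUSH_DELAYED_ITEMS_NR;

	do {
		ret = flush_space(root, space_info, num_bytes, orig_bytes,
				  flush_state);
		flush_state++;
	} while (!ret && !reserved && flush_state <= COMMIT_TRANS);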
@@ -3840,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
3840 * @root - the root we're allocating for 3898 * @root - the root we're allocating for
3841 * @block_rsv - the block_rsv we're allocating for 3899 * @block_rsv - the block_rsv we're allocating for
3842 * @orig_bytes - the number of bytes we want 3900 * @orig_bytes - the number of bytes we want
3843 * @flush - wether or not we can flush to make our reservation 3901 * @flush - whether or not we can flush to make our reservation
3844 * 3902 *
3845 * This will reserve orgi_bytes number of bytes from the space info associated 3903 * This will reserve orgi_bytes number of bytes from the space info associated
3846 * with the block_rsv. If there is not enough space it will make an attempt to 3904 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3851,24 +3909,25 @@ static int flush_space(struct btrfs_root *root,
3851 */ 3909 */
3852static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3853 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3854 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3855{ 3914{
3856 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3857 u64 used; 3916 u64 used;
3858 u64 num_bytes = orig_bytes; 3917 u64 num_bytes = orig_bytes;
3859 int flush_state = FLUSH_DELALLOC; 3918 int flush_state = FLUSH_DELAYED_ITEMS_NR;
3860 int ret = 0; 3919 int ret = 0;
3861 bool flushing = false; 3920 bool flushing = false;
3862 bool committed = false;
3863 3921
3864again: 3922again:
3865 ret = 0; 3923 ret = 0;
3866 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3867 /* 3925 /*
3868 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3869 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3870 */ 3928 */
3871 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3872 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3873 /* 3932 /*
3874 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3922,80 +3981,52 @@ again:
3922 (orig_bytes * 2); 3981 (orig_bytes * 2);
3923 } 3982 }
3924 3983
3925 if (ret) { 3984 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3926 u64 profile = btrfs_get_alloc_profile(root, 0); 3985 space_info->bytes_may_use += orig_bytes;
3927 u64 avail; 3986 trace_btrfs_space_reservation(root->fs_info, "space_info",
3928 3987 space_info->flags, orig_bytes,
3929 /* 3988 1);
3930 * If we have a lot of space that's pinned, don't bother doing 3989 ret = 0;
3931 * the overcommit dance yet and just commit the transaction.
3932 */
3933 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3934 do_div(avail, 10);
3935 if (space_info->bytes_pinned >= avail && flush && !committed) {
3936 space_info->flush = 1;
3937 flushing = true;
3938 spin_unlock(&space_info->lock);
3939 ret = may_commit_transaction(root, space_info,
3940 orig_bytes, 1);
3941 if (ret)
3942 goto out;
3943 committed = true;
3944 goto again;
3945 }
3946
3947 spin_lock(&root->fs_info->free_chunk_lock);
3948 avail = root->fs_info->free_chunk_space;
3949
3950 /*
3951 * If we have dup, raid1 or raid10 then only half of the free
3952 * space is actually useable.
3953 */
3954 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3955 BTRFS_BLOCK_GROUP_RAID1 |
3956 BTRFS_BLOCK_GROUP_RAID10))
3957 avail >>= 1;
3958
3959 /*
3960 * If we aren't flushing don't let us overcommit too much, say
3961 * 1/8th of the space. If we can flush, let it overcommit up to
3962 * 1/2 of the space.
3963 */
3964 if (flush)
3965 avail >>= 3;
3966 else
3967 avail >>= 1;
3968 spin_unlock(&root->fs_info->free_chunk_lock);
3969
3970 if (used + num_bytes < space_info->total_bytes + avail) {
3971 space_info->bytes_may_use += orig_bytes;
3972 trace_btrfs_space_reservation(root->fs_info,
3973 "space_info", space_info->flags, orig_bytes, 1);
3974 ret = 0;
3975 }
3976 } 3990 }
3977 3991
3978 /* 3992 /*
3979 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3980 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3981 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3982 */ 3999 */
3983 if (ret && flush) { 4000 if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
3984 flushing = true; 4001 flushing = true;
3985 space_info->flush = 1; 4002 space_info->flush = 1;
3986 } 4003 }
3987 4004
3988 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3989 4006
3990 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3991 goto out; 4008 goto out;
3992 4009
3993 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3994 flush_state); 4011 flush_state);
3995 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4016 * would happen. So skip delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3996 if (!ret) 4023 if (!ret)
3997 goto again; 4024 goto again;
3998 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
3999 goto again; 4030 goto again;
4000 4031
4001out: 4032out:
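The int flush flag becomes a three-way enum so that callers already inside a transaction, or otherwise unable to touch delalloc (such as the delayed-inode code), can still flush the cheap things. Presumably defined next to the block-rsv declarations in ctree.h, along the lines of:

enum btrfs_reserve_flush_enum {
	/* caller cannot afford to block or recurse at all */
	BTRFS_RESERVE_NO_FLUSH,
	/* flush cheap state only; skip delalloc to avoid deadlocks */
	BTRFS_RESERVE_FLUSH_LIMIT,
	/* flush anything, including waiting on ordered extents */
	BTRFS_RESERVE_FLUSH_ALL,
};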
@@ -4114,13 +4145,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4114 return 0; 4145 return 0;
4115} 4146}
4116 4147
4117void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) 4148void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4118{ 4149{
4119 memset(rsv, 0, sizeof(*rsv)); 4150 memset(rsv, 0, sizeof(*rsv));
4120 spin_lock_init(&rsv->lock); 4151 spin_lock_init(&rsv->lock);
4152 rsv->type = type;
4121} 4153}
4122 4154
4123struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 4155struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4156 unsigned short type)
4124{ 4157{
4125 struct btrfs_block_rsv *block_rsv; 4158 struct btrfs_block_rsv *block_rsv;
4126 struct btrfs_fs_info *fs_info = root->fs_info; 4159 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4162,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4129 if (!block_rsv) 4162 if (!block_rsv)
4130 return NULL; 4163 return NULL;
4131 4164
4132 btrfs_init_block_rsv(block_rsv); 4165 btrfs_init_block_rsv(block_rsv, type);
4133 block_rsv->space_info = __find_space_info(fs_info, 4166 block_rsv->space_info = __find_space_info(fs_info,
4134 BTRFS_BLOCK_GROUP_METADATA); 4167 BTRFS_BLOCK_GROUP_METADATA);
4135 return block_rsv; 4168 return block_rsv;
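Every reservation now carries a type tag, used later in the series to attribute space in tracepoints and debugging output. Callers name the reservation at allocation time; the BTRFS_BLOCK_RSV_TEMP constant below comes from the same series and is shown here as an assumption:

	struct btrfs_block_rsv *rsv;

	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;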
@@ -4138,13 +4171,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4138void btrfs_free_block_rsv(struct btrfs_root *root, 4171void btrfs_free_block_rsv(struct btrfs_root *root,
4139 struct btrfs_block_rsv *rsv) 4172 struct btrfs_block_rsv *rsv)
4140{ 4173{
4174 if (!rsv)
4175 return;
4141 btrfs_block_rsv_release(root, rsv, (u64)-1); 4176 btrfs_block_rsv_release(root, rsv, (u64)-1);
4142 kfree(rsv); 4177 kfree(rsv);
4143} 4178}
4144 4179
4145static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4146 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4147 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4148{ 4183{
4149 int ret; 4184 int ret;
4150 4185
@@ -4160,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4160 return ret; 4195 return ret;
4161} 4196}
4162 4197
4163int btrfs_block_rsv_add(struct btrfs_root *root,
4164 struct btrfs_block_rsv *block_rsv,
4165 u64 num_bytes)
4166{
4167 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4168}
4169
4170int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4171 struct btrfs_block_rsv *block_rsv,
4172 u64 num_bytes)
4173{
4174 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4175}
4176
4177int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4178 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4179{ 4200{
@@ -4192,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4192 return ret; 4213 return ret;
4193} 4214}
4194 4215
4195static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4196 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4197 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4198{ 4219{
4199 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4200 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4222,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4222 return ret; 4243 return ret;
4223} 4244}
4224 4245
4225int btrfs_block_rsv_refill(struct btrfs_root *root,
4226 struct btrfs_block_rsv *block_rsv,
4227 u64 min_reserved)
4228{
4229 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4230}
4231
4232int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4233 struct btrfs_block_rsv *block_rsv,
4234 u64 min_reserved)
4235{
4236 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4237}
4238
4239int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4240 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4241 u64 num_bytes) 4248 u64 num_bytes)
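With the _noflush wrappers gone, every call site states its flush policy explicitly, which reads better at a glance. Typical migrations, shown for illustration:

	/* was: btrfs_block_rsv_add(root, rsv, num_bytes); */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				  BTRFS_RESERVE_FLUSH_ALL);

	/* was: btrfs_block_rsv_refill_noflush(root, rsv, min_reserved); */
	ret = btrfs_block_rsv_refill(root, rsv, min_reserved,
				     BTRFS_RESERVE_NO_FLUSH);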
@@ -4416,10 +4423,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4416 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4423 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4417 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4424 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4418 /* 4425 /*
4419 * two for root back/forward refs, two for directory entries 4426 * two for root back/forward refs, two for directory entries,
4420 * and one for root of the snapshot. 4427 * one for root of the snapshot and one for parent inode.
4421 */ 4428 */
4422 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 4429 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4423 dst_rsv->space_info = src_rsv->space_info; 4430 dst_rsv->space_info = src_rsv->space_info;
4424 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4431 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4425} 4432}
@@ -4526,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4526 u64 csum_bytes; 4533 u64 csum_bytes;
4527 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4528 int extra_reserve = 0; 4535 int extra_reserve = 0;
4529 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4530 int ret; 4537 int ret;
4538 bool delalloc_lock = true;
4531 4539
4532 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we need to not flush since we will be in
4533 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4534 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4535 4549
4536 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4537 schedule_timeout(1); 4552 schedule_timeout(1);
4538 4553
4539 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4540 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4541 4558
4542 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4566,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4566 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4567 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4568 if (ret) { 4585 if (ret) {
4569 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4586 spin_lock(&BTRFS_I(inode)->lock);
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4570 return ret; 4591 return ret;
4571 } 4592 }
4572 } 4593 }
@@ -4601,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4601 btrfs_ino(inode), 4622 btrfs_ino(inode),
4602 to_free, 0); 4623 to_free, 0);
4603 } 4624 }
4604 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize);
4628 }
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4605 return ret; 4631 return ret;
4606 } 4632 }
4607 4633
@@ -4613,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4613 } 4639 }
4614 BTRFS_I(inode)->reserved_extents += nr_extents; 4640 BTRFS_I(inode)->reserved_extents += nr_extents;
4615 spin_unlock(&BTRFS_I(inode)->lock); 4641 spin_unlock(&BTRFS_I(inode)->lock);
4616 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4642
4643 if (delalloc_lock)
4644 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4617 4645
4618 if (to_reserve) 4646 if (to_reserve)
4619 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4647 trace_btrfs_space_reservation(root->fs_info,"delalloc",
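Two leaks are plugged in the hunks above: the qgroup failure path now backs out the csum accounting it had already added, and the later reservation failure path frees the qgroup reservation taken before it. The unwind mirrors the setup order; a condensed sketch with illustrative labels:

	calc_csum_metadata_size(inode, num_bytes, 1);	/* 1: reserve */
	ret = btrfs_qgroup_reserve(root, to_reserve);
	if (ret)
		goto out_csum;		/* calc_csum_metadata_size(.., 0) */
	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
	if (ret)
		goto out_qgroup;	/* btrfs_qgroup_free() first */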
@@ -4963,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4963{ 4991{
4964 struct btrfs_fs_info *fs_info = root->fs_info; 4992 struct btrfs_fs_info *fs_info = root->fs_info;
4965 struct btrfs_block_group_cache *cache = NULL; 4993 struct btrfs_block_group_cache *cache = NULL;
4994 struct btrfs_space_info *space_info;
4995 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4966 u64 len; 4996 u64 len;
4997 bool readonly;
4967 4998
4968 while (start <= end) { 4999 while (start <= end) {
5000 readonly = false;
4969 if (!cache || 5001 if (!cache ||
4970 start >= cache->key.objectid + cache->key.offset) { 5002 start >= cache->key.objectid + cache->key.offset) {
4971 if (cache) 5003 if (cache)
@@ -4983,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4983 } 5015 }
4984 5016
4985 start += len; 5017 start += len;
5018 space_info = cache->space_info;
4986 5019
4987 spin_lock(&cache->space_info->lock); 5020 spin_lock(&space_info->lock);
4988 spin_lock(&cache->lock); 5021 spin_lock(&cache->lock);
4989 cache->pinned -= len; 5022 cache->pinned -= len;
4990 cache->space_info->bytes_pinned -= len; 5023 space_info->bytes_pinned -= len;
4991 if (cache->ro) 5024 if (cache->ro) {
4992 cache->space_info->bytes_readonly += len; 5025 space_info->bytes_readonly += len;
5026 readonly = true;
5027 }
4993 spin_unlock(&cache->lock); 5028 spin_unlock(&cache->lock);
4994 spin_unlock(&cache->space_info->lock); 5029 if (!readonly && global_rsv->space_info == space_info) {
5030 spin_lock(&global_rsv->lock);
5031 if (!global_rsv->full) {
5032 len = min(len, global_rsv->size -
5033 global_rsv->reserved);
5034 global_rsv->reserved += len;
5035 space_info->bytes_may_use += len;
5036 if (global_rsv->reserved >= global_rsv->size)
5037 global_rsv->full = 1;
5038 }
5039 spin_unlock(&global_rsv->lock);
5040 }
5041 spin_unlock(&space_info->lock);
4995 } 5042 }
4996 5043
4997 if (cache) 5044 if (cache)
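Unpinned extents now top up the global reserve before the space is handed back to the pool, rather than leaving the reserve to refill lazily under pressure. Illustrative numbers: unpin 1MiB while the global rsv is 256KiB short of full, and 256KiB is retained as reserved (still counted in bytes_may_use) while the remaining 768KiB becomes freely available:

	len = min(len, global_rsv->size - global_rsv->reserved);
	global_rsv->reserved += len;		/* the 256KiB shortfall */
	space_info->bytes_may_use += len;	/* stays accounted as in use */
	if (global_rsv->reserved >= global_rsv->size)
		global_rsv->full = 1;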
@@ -5018,7 +5065,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5018 5065
5019 while (1) { 5066 while (1) {
5020 ret = find_first_extent_bit(unpin, 0, &start, &end, 5067 ret = find_first_extent_bit(unpin, 0, &start, &end,
5021 EXTENT_DIRTY); 5068 EXTENT_DIRTY, NULL);
5022 if (ret) 5069 if (ret)
5023 break; 5070 break;
5024 5071
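All of these call sites grow a trailing NULL because find_first_extent_bit() gains an optional cached-state output parameter elsewhere in this series (the convert_extent_bit() change in extent_io.c below applies the same idea). The new signature is presumably:

int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits,
			  struct extent_state **cached_state);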
@@ -5096,8 +5143,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5096 ret = remove_extent_backref(trans, extent_root, path, 5143 ret = remove_extent_backref(trans, extent_root, path,
5097 NULL, refs_to_drop, 5144 NULL, refs_to_drop,
5098 is_data); 5145 is_data);
5099 if (ret) 5146 if (ret) {
5100 goto abort; 5147 btrfs_abort_transaction(trans, extent_root, ret);
5148 goto out;
5149 }
5101 btrfs_release_path(path); 5150 btrfs_release_path(path);
5102 path->leave_spinning = 1; 5151 path->leave_spinning = 1;
5103 5152
@@ -5115,8 +5164,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5115 btrfs_print_leaf(extent_root, 5164 btrfs_print_leaf(extent_root,
5116 path->nodes[0]); 5165 path->nodes[0]);
5117 } 5166 }
5118 if (ret < 0) 5167 if (ret < 0) {
5119 goto abort; 5168 btrfs_abort_transaction(trans, extent_root, ret);
5169 goto out;
5170 }
5120 extent_slot = path->slots[0]; 5171 extent_slot = path->slots[0];
5121 } 5172 }
5122 } else if (ret == -ENOENT) { 5173 } else if (ret == -ENOENT) {
@@ -5130,7 +5181,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5130 (unsigned long long)owner_objectid, 5181 (unsigned long long)owner_objectid,
5131 (unsigned long long)owner_offset); 5182 (unsigned long long)owner_offset);
5132 } else { 5183 } else {
5133 goto abort; 5184 btrfs_abort_transaction(trans, extent_root, ret);
5185 goto out;
5134 } 5186 }
5135 5187
5136 leaf = path->nodes[0]; 5188 leaf = path->nodes[0];
@@ -5140,8 +5192,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5140 BUG_ON(found_extent || extent_slot != path->slots[0]); 5192 BUG_ON(found_extent || extent_slot != path->slots[0]);
5141 ret = convert_extent_item_v0(trans, extent_root, path, 5193 ret = convert_extent_item_v0(trans, extent_root, path,
5142 owner_objectid, 0); 5194 owner_objectid, 0);
5143 if (ret < 0) 5195 if (ret < 0) {
5144 goto abort; 5196 btrfs_abort_transaction(trans, extent_root, ret);
5197 goto out;
5198 }
5145 5199
5146 btrfs_release_path(path); 5200 btrfs_release_path(path);
5147 path->leave_spinning = 1; 5201 path->leave_spinning = 1;
@@ -5158,8 +5212,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5158 (unsigned long long)bytenr); 5212 (unsigned long long)bytenr);
5159 btrfs_print_leaf(extent_root, path->nodes[0]); 5213 btrfs_print_leaf(extent_root, path->nodes[0]);
5160 } 5214 }
5161 if (ret < 0) 5215 if (ret < 0) {
5162 goto abort; 5216 btrfs_abort_transaction(trans, extent_root, ret);
5217 goto out;
5218 }
5219
5163 extent_slot = path->slots[0]; 5220 extent_slot = path->slots[0];
5164 leaf = path->nodes[0]; 5221 leaf = path->nodes[0];
5165 item_size = btrfs_item_size_nr(leaf, extent_slot); 5222 item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5253,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5196 ret = remove_extent_backref(trans, extent_root, path, 5253 ret = remove_extent_backref(trans, extent_root, path,
5197 iref, refs_to_drop, 5254 iref, refs_to_drop,
5198 is_data); 5255 is_data);
5199 if (ret) 5256 if (ret) {
5200 goto abort; 5257 btrfs_abort_transaction(trans, extent_root, ret);
5258 goto out;
5259 }
5201 } 5260 }
5202 } else { 5261 } else {
5203 if (found_extent) { 5262 if (found_extent) {
@@ -5214,27 +5273,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5214 5273
5215 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5274 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5216 num_to_del); 5275 num_to_del);
5217 if (ret) 5276 if (ret) {
5218 goto abort; 5277 btrfs_abort_transaction(trans, extent_root, ret);
5278 goto out;
5279 }
5219 btrfs_release_path(path); 5280 btrfs_release_path(path);
5220 5281
5221 if (is_data) { 5282 if (is_data) {
5222 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5283 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5223 if (ret) 5284 if (ret) {
5224 goto abort; 5285 btrfs_abort_transaction(trans, extent_root, ret);
5286 goto out;
5287 }
5225 } 5288 }
5226 5289
5227 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5290 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5228 if (ret) 5291 if (ret) {
5229 goto abort; 5292 btrfs_abort_transaction(trans, extent_root, ret);
5293 goto out;
5294 }
5230 } 5295 }
5231out: 5296out:
5232 btrfs_free_path(path); 5297 btrfs_free_path(path);
5233 return ret; 5298 return ret;
5234
5235abort:
5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238} 5299}
5239 5300
5240/* 5301/*
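Instead of funnelling every failure through a single abort: label, each failure site in __btrfs_free_extent() now aborts the transaction right where the error is seen, so the resulting warning points at the real call site, then jumps to the common cleanup, which only frees the path. The repeated shape (operation name illustrative):

	ret = some_extent_tree_op(trans, extent_root, path, ...);
	if (ret) {
		btrfs_abort_transaction(trans, extent_root, ret);
		goto out;		/* out: btrfs_free_path(path) */
	}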
@@ -5446,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5446 return 0; 5507 return 0;
5447} 5508}
5448 5509
5449static int __get_block_group_index(u64 flags) 5510int __get_raid_index(u64 flags)
5450{ 5511{
5451 int index; 5512 int index;
5452 5513
@@ -5466,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
5466 5527
5467static int get_block_group_index(struct btrfs_block_group_cache *cache) 5528static int get_block_group_index(struct btrfs_block_group_cache *cache)
5468{ 5529{
5469 return __get_block_group_index(cache->flags); 5530 return __get_raid_index(cache->flags);
5470} 5531}
5471 5532
5472enum btrfs_loop_type { 5533enum btrfs_loop_type {
@@ -5497,8 +5558,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5497 struct btrfs_block_group_cache *used_block_group; 5558 struct btrfs_block_group_cache *used_block_group;
5498 u64 search_start = 0; 5559 u64 search_start = 0;
5499 int empty_cluster = 2 * 1024 * 1024; 5560 int empty_cluster = 2 * 1024 * 1024;
5500 int allowed_chunk_alloc = 0;
5501 int done_chunk_alloc = 0;
5502 struct btrfs_space_info *space_info; 5561 struct btrfs_space_info *space_info;
5503 int loop = 0; 5562 int loop = 0;
5504 int index = 0; 5563 int index = 0;
@@ -5530,9 +5589,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5530 if (btrfs_mixed_space_info(space_info)) 5589 if (btrfs_mixed_space_info(space_info))
5531 use_cluster = false; 5590 use_cluster = false;
5532 5591
5533 if (orig_root->ref_cows || empty_size)
5534 allowed_chunk_alloc = 1;
5535
5536 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5592 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5537 last_ptr = &root->fs_info->meta_alloc_cluster; 5593 last_ptr = &root->fs_info->meta_alloc_cluster;
5538 if (!btrfs_test_opt(root, SSD)) 5594 if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5862,6 @@ checks:
5806 5862
5807 trace_btrfs_reserve_extent(orig_root, block_group, 5863 trace_btrfs_reserve_extent(orig_root, block_group,
5808 search_start, num_bytes); 5864 search_start, num_bytes);
5809 if (offset < search_start)
5810 btrfs_add_free_space(used_block_group, offset,
5811 search_start - offset);
5812 BUG_ON(offset > search_start);
5813 if (used_block_group != block_group) 5865 if (used_block_group != block_group)
5814 btrfs_put_block_group(used_block_group); 5866 btrfs_put_block_group(used_block_group);
5815 btrfs_put_block_group(block_group); 5867 btrfs_put_block_group(block_group);
@@ -5842,34 +5894,17 @@ loop:
5842 index = 0; 5894 index = 0;
5843 loop++; 5895 loop++;
5844 if (loop == LOOP_ALLOC_CHUNK) { 5896 if (loop == LOOP_ALLOC_CHUNK) {
5845 if (allowed_chunk_alloc) { 5897 ret = do_chunk_alloc(trans, root, data,
5846 ret = do_chunk_alloc(trans, root, num_bytes + 5898 CHUNK_ALLOC_FORCE);
5847 2 * 1024 * 1024, data, 5899 /*
5848 CHUNK_ALLOC_LIMITED); 5900 * Do not bail out on ENOSPC since we
5849 /* 5901 * can do more things.
5850 * Do not bail out on ENOSPC since we 5902 */
5851 * can do more things. 5903 if (ret < 0 && ret != -ENOSPC) {
5852 */ 5904 btrfs_abort_transaction(trans,
5853 if (ret < 0 && ret != -ENOSPC) { 5905 root, ret);
5854 btrfs_abort_transaction(trans, 5906 goto out;
5855 root, ret);
5856 goto out;
5857 }
5858 allowed_chunk_alloc = 0;
5859 if (ret == 1)
5860 done_chunk_alloc = 1;
5861 } else if (!done_chunk_alloc &&
5862 space_info->force_alloc ==
5863 CHUNK_ALLOC_NO_FORCE) {
5864 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5865 } 5907 }
5866
5867 /*
5868 * We didn't allocate a chunk, go ahead and drop the
5869 * empty size and loop again.
5870 */
5871 if (!done_chunk_alloc)
5872 loop = LOOP_NO_EMPTY_SIZE;
5873 } 5908 }
5874 5909
5875 if (loop == LOOP_NO_EMPTY_SIZE) { 5910 if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5979,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5944 5979
5945 data = btrfs_get_alloc_profile(root, data); 5980 data = btrfs_get_alloc_profile(root, data);
5946again: 5981again:
5947 /*
5948 * the only place that sets empty_size is btrfs_realloc_node, which
5949 * is not called recursively on allocations
5950 */
5951 if (empty_size || root->ref_cows) {
5952 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5953 num_bytes + 2 * 1024 * 1024, data,
5954 CHUNK_ALLOC_NO_FORCE);
5955 if (ret < 0 && ret != -ENOSPC) {
5956 btrfs_abort_transaction(trans, root, ret);
5957 return ret;
5958 }
5959 }
5960
5961 WARN_ON(num_bytes < root->sectorsize); 5982 WARN_ON(num_bytes < root->sectorsize);
5962 ret = find_free_extent(trans, root, num_bytes, empty_size, 5983 ret = find_free_extent(trans, root, num_bytes, empty_size,
5963 hint_byte, ins, data); 5984 hint_byte, ins, data);
@@ -5967,12 +5988,6 @@ again:
5967 num_bytes = num_bytes >> 1; 5988 num_bytes = num_bytes >> 1;
5968 num_bytes = num_bytes & ~(root->sectorsize - 1); 5989 num_bytes = num_bytes & ~(root->sectorsize - 1);
5969 num_bytes = max(num_bytes, min_alloc_size); 5990 num_bytes = max(num_bytes, min_alloc_size);
5970 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5971 num_bytes, data, CHUNK_ALLOC_FORCE);
5972 if (ret < 0 && ret != -ENOSPC) {
5973 btrfs_abort_transaction(trans, root, ret);
5974 return ret;
5975 }
5976 if (num_bytes == min_alloc_size) 5991 if (num_bytes == min_alloc_size)
5977 final_tried = true; 5992 final_tried = true;
5978 goto again; 5993 goto again;
@@ -6295,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6295 block_rsv = get_block_rsv(trans, root); 6310 block_rsv = get_block_rsv(trans, root);
6296 6311
6297 if (block_rsv->size == 0) { 6312 if (block_rsv->size == 0) {
6298 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6313 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6314 BTRFS_RESERVE_NO_FLUSH);
6299 /* 6315 /*
6300 * If we couldn't reserve metadata bytes try and use some from 6316 * If we couldn't reserve metadata bytes try and use some from
6301 * the global reserve. 6317 * the global reserve.
@@ -6314,15 +6330,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6314 ret = block_rsv_use_bytes(block_rsv, blocksize); 6330 ret = block_rsv_use_bytes(block_rsv, blocksize);
6315 if (!ret) 6331 if (!ret)
6316 return block_rsv; 6332 return block_rsv;
6317 if (ret) { 6333 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6334 static DEFINE_RATELIMIT_STATE(_rs,
6319 DEFAULT_RATELIMIT_INTERVAL, 6335 DEFAULT_RATELIMIT_INTERVAL,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6336 /*DEFAULT_RATELIMIT_BURST*/ 2);
6321 if (__ratelimit(&_rs)) { 6337 if (__ratelimit(&_rs))
6322 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6338 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6323 WARN_ON(1); 6339 ret);
6324 } 6340 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6325 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6341 BTRFS_RESERVE_NO_FLUSH);
6326 if (!ret) { 6342 if (!ret) {
6327 return block_rsv; 6343 return block_rsv;
6328 } else if (ret && block_rsv != global_rsv) { 6344 } else if (ret && block_rsv != global_rsv) {
@@ -7279,7 +7295,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7279 7295
7280 alloc_flags = update_block_group_flags(root, cache->flags); 7296 alloc_flags = update_block_group_flags(root, cache->flags);
7281 if (alloc_flags != cache->flags) { 7297 if (alloc_flags != cache->flags) {
7282 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7298 ret = do_chunk_alloc(trans, root, alloc_flags,
7283 CHUNK_ALLOC_FORCE); 7299 CHUNK_ALLOC_FORCE);
7284 if (ret < 0) 7300 if (ret < 0)
7285 goto out; 7301 goto out;
@@ -7289,7 +7305,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7289 if (!ret) 7305 if (!ret)
7290 goto out; 7306 goto out;
7291 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7307 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7292 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7308 ret = do_chunk_alloc(trans, root, alloc_flags,
7293 CHUNK_ALLOC_FORCE); 7309 CHUNK_ALLOC_FORCE);
7294 if (ret < 0) 7310 if (ret < 0)
7295 goto out; 7311 goto out;
@@ -7303,7 +7319,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7303 struct btrfs_root *root, u64 type) 7319 struct btrfs_root *root, u64 type)
7304{ 7320{
7305 u64 alloc_flags = get_alloc_profile(root, type); 7321 u64 alloc_flags = get_alloc_profile(root, type);
7306 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7322 return do_chunk_alloc(trans, root, alloc_flags,
7307 CHUNK_ALLOC_FORCE); 7323 CHUNK_ALLOC_FORCE);
7308} 7324}
7309 7325
@@ -7453,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7453 */ 7469 */
7454 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7455 if (target) { 7471 if (target) {
7456 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7457 } else { 7473 } else {
7458 /* 7474 /*
7459 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7487,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7487 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7488 * space to fit our block group in. 7504 * space to fit our block group in.
7489 */ 7505 */
7490 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7491 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7492 &dev_offset, NULL); 7509 &dev_offset, NULL);
7493 if (!ret) 7510 if (!ret)
@@ -7810,6 +7827,34 @@ error:
7810 return ret; 7827 return ret;
7811} 7828}
7812 7829
7830void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7831 struct btrfs_root *root)
7832{
7833 struct btrfs_block_group_cache *block_group, *tmp;
7834 struct btrfs_root *extent_root = root->fs_info->extent_root;
7835 struct btrfs_block_group_item item;
7836 struct btrfs_key key;
7837 int ret = 0;
7838
7839 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7840 new_bg_list) {
7841 list_del_init(&block_group->new_bg_list);
7842
7843 if (ret)
7844 continue;
7845
7846 spin_lock(&block_group->lock);
7847 memcpy(&item, &block_group->item, sizeof(item));
7848 memcpy(&key, &block_group->key, sizeof(key));
7849 spin_unlock(&block_group->lock);
7850
7851 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7852 sizeof(item));
7853 if (ret)
7854 btrfs_abort_transaction(trans, extent_root, ret);
7855 }
7856}
7857
7813int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7858int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7814 struct btrfs_root *root, u64 bytes_used, 7859 struct btrfs_root *root, u64 bytes_used,
7815 u64 type, u64 chunk_objectid, u64 chunk_offset, 7860 u64 type, u64 chunk_objectid, u64 chunk_offset,
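btrfs_create_pending_block_groups() drains trans->new_bgs and inserts one block group item per entry; btrfs_make_block_group() (next hunk) now only queues onto that list instead of inserting inline. Per the -2551 hunk earlier in this file, the runner is invoked like this before walking the remaining delayed refs:

	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, root);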
@@ -7843,6 +7888,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7843 spin_lock_init(&cache->lock); 7888 spin_lock_init(&cache->lock);
7844 INIT_LIST_HEAD(&cache->list); 7889 INIT_LIST_HEAD(&cache->list);
7845 INIT_LIST_HEAD(&cache->cluster_list); 7890 INIT_LIST_HEAD(&cache->cluster_list);
7891 INIT_LIST_HEAD(&cache->new_bg_list);
7846 7892
7847 btrfs_init_free_space_ctl(cache); 7893 btrfs_init_free_space_ctl(cache);
7848 7894
@@ -7874,12 +7920,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7874 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7920 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7875 BUG_ON(ret); /* Logic error */ 7921 BUG_ON(ret); /* Logic error */
7876 7922
7877 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7923 list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7878 sizeof(cache->item));
7879 if (ret) {
7880 btrfs_abort_transaction(trans, extent_root, ret);
7881 return ret;
7882 }
7883 7924
7884 set_avail_alloc_bits(extent_root->fs_info, type); 7925 set_avail_alloc_bits(extent_root->fs_info, type);
7885 7926
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b08ea4717e9d..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
45 struct bio *bio; 45 struct bio *bio;
46 struct extent_io_tree *tree; 46 struct extent_io_tree *tree;
47 get_extent_t *get_extent; 47 get_extent_t *get_extent;
48 unsigned long bio_flags;
48 49
49 /* tells writepage not to lock the state bits for this range 50 /* tells writepage not to lock the state bits for this range
50 * it still does the unlocking 51 * it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
64 65
65int __init extent_io_init(void) 66int __init extent_io_init(void)
66{ 67{
67 extent_state_cache = kmem_cache_create("extent_state", 68 extent_state_cache = kmem_cache_create("btrfs_extent_state",
68 sizeof(struct extent_state), 0, 69 sizeof(struct extent_state), 0,
69 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
70 if (!extent_state_cache) 71 if (!extent_state_cache)
71 return -ENOMEM; 72 return -ENOMEM;
72 73
73 extent_buffer_cache = kmem_cache_create("extent_buffers", 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
74 sizeof(struct extent_buffer), 0, 75 sizeof(struct extent_buffer), 0,
75 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
76 if (!extent_buffer_cache) 77 if (!extent_buffer_cache)
@@ -340,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
340{ 341{
341 struct rb_node *node; 342 struct rb_node *node;
342 343
343 if (end < start) { 344 if (end < start)
344 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
345 (unsigned long long)end, 346 (unsigned long long)end,
346 (unsigned long long)start); 347 (unsigned long long)start);
347 WARN_ON(1);
348 }
349 state->start = start; 348 state->start = start;
350 state->end = end; 349 state->end = end;
351 350
@@ -942,6 +941,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
942 * @end: the end offset in bytes (inclusive) 941 * @end: the end offset in bytes (inclusive)
943 * @bits: the bits to set in this range 942 * @bits: the bits to set in this range
944 * @clear_bits: the bits to clear in this range 943 * @clear_bits: the bits to clear in this range
944 * @cached_state: state that we're going to cache
945 * @mask: the allocation mask 945 * @mask: the allocation mask
946 * 946 *
947 * This will go through and set bits for the given range. If any states exist 947 * This will go through and set bits for the given range. If any states exist
@@ -951,7 +951,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
951 * boundary bits like LOCK. 951 * boundary bits like LOCK.
952 */ 952 */
953int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 953int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
954 int bits, int clear_bits, gfp_t mask) 954 int bits, int clear_bits,
955 struct extent_state **cached_state, gfp_t mask)
955{ 956{
956 struct extent_state *state; 957 struct extent_state *state;
957 struct extent_state *prealloc = NULL; 958 struct extent_state *prealloc = NULL;
@@ -968,6 +969,15 @@ again:
968 } 969 }
969 970
970 spin_lock(&tree->lock); 971 spin_lock(&tree->lock);
972 if (cached_state && *cached_state) {
973 state = *cached_state;
974 if (state->start <= start && state->end > start &&
975 state->tree) {
976 node = &state->rb_node;
977 goto hit_next;
978 }
979 }
980
971 /* 981 /*
972 * this search will find all the extents that end after 982 * this search will find all the extents that end after
973 * our range starts. 983 * our range starts.
@@ -998,6 +1008,7 @@ hit_next:
998 */ 1008 */
999 if (state->start == start && state->end <= end) { 1009 if (state->start == start && state->end <= end) {
1000 set_state_bits(tree, state, &bits); 1010 set_state_bits(tree, state, &bits);
1011 cache_state(state, cached_state);
1001 state = clear_state_bit(tree, state, &clear_bits, 0); 1012 state = clear_state_bit(tree, state, &clear_bits, 0);
1002 if (last_end == (u64)-1) 1013 if (last_end == (u64)-1)
1003 goto out; 1014 goto out;
@@ -1038,6 +1049,7 @@ hit_next:
1038 goto out; 1049 goto out;
1039 if (state->end <= end) { 1050 if (state->end <= end) {
1040 set_state_bits(tree, state, &bits); 1051 set_state_bits(tree, state, &bits);
1052 cache_state(state, cached_state);
1041 state = clear_state_bit(tree, state, &clear_bits, 0); 1053 state = clear_state_bit(tree, state, &clear_bits, 0);
1042 if (last_end == (u64)-1) 1054 if (last_end == (u64)-1)
1043 goto out; 1055 goto out;
@@ -1076,6 +1088,7 @@ hit_next:
1076 &bits); 1088 &bits);
1077 if (err) 1089 if (err)
1078 extent_io_tree_panic(tree, err); 1090 extent_io_tree_panic(tree, err);
1091 cache_state(prealloc, cached_state);
1079 prealloc = NULL; 1092 prealloc = NULL;
1080 start = this_end + 1; 1093 start = this_end + 1;
1081 goto search_again; 1094 goto search_again;
@@ -1098,6 +1111,7 @@ hit_next:
1098 extent_io_tree_panic(tree, err); 1111 extent_io_tree_panic(tree, err);
1099 1112
1100 set_state_bits(tree, prealloc, &bits); 1113 set_state_bits(tree, prealloc, &bits);
1114 cache_state(prealloc, cached_state);
1101 clear_state_bit(tree, prealloc, &clear_bits, 0); 1115 clear_state_bit(tree, prealloc, &clear_bits, 0);
1102 prealloc = NULL; 1116 prealloc = NULL;
1103 goto out; 1117 goto out;
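
Threading @cached_state through convert_extent_bit() lets a caller that converts a series of adjacent ranges skip the rbtree search on each call after the first. A hedged sketch of the calling convention, loosely modeled on the marked-extent writeback pattern; tree, start, mid and end are illustrative:

	/* Sketch: convert two adjacent ranges; the second call resumes from
	 * the extent_state cached by the first instead of re-searching. */
	struct extent_state *cached = NULL;
	int ret;

	ret = convert_extent_bit(tree, start, mid, EXTENT_NEED_WAIT,
				 EXTENT_DIRTY, &cached, GFP_NOFS);
	if (!ret)
		ret = convert_extent_bit(tree, mid + 1, end, EXTENT_NEED_WAIT,
					 EXTENT_DIRTY, &cached, GFP_NOFS);
	free_extent_state(cached);	/* drop the cache_state() reference */
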
@@ -1150,6 +1164,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1150 NULL, cached_state, mask); 1164 NULL, cached_state, mask);
1151} 1165}
1152 1166
1167int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1168 struct extent_state **cached_state, gfp_t mask)
1169{
1170 return set_extent_bit(tree, start, end,
1171 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1172 NULL, cached_state, mask);
1173}
1174
1153int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1175int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1154 gfp_t mask) 1176 gfp_t mask)
1155{ 1177{
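
set_extent_defrag() is a convenience wrapper that marks a delalloc range with EXTENT_DEFRAG as well, so later writeback can tell ranges dirtied by defrag apart from ordinary dirty data. A hedged usage sketch; page_start, page_end and cached_state are illustrative locals from a defrag-style caller:

	/* Sketch: tag a locked page range in the inode's io_tree as
	 * delalloc + uptodate + defrag in a single pass. */
	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end,
			  &cached_state, GFP_NOFS);
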
@@ -1294,18 +1316,42 @@ out:
1294 * If nothing was found, 1 is returned. If found something, return 0. 1316 * If nothing was found, 1 is returned. If found something, return 0.
1295 */ 1317 */
1296int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1318int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1297 u64 *start_ret, u64 *end_ret, int bits) 1319 u64 *start_ret, u64 *end_ret, int bits,
1320 struct extent_state **cached_state)
1298{ 1321{
1299 struct extent_state *state; 1322 struct extent_state *state;
1323 struct rb_node *n;
1300 int ret = 1; 1324 int ret = 1;
1301 1325
1302 spin_lock(&tree->lock); 1326 spin_lock(&tree->lock);
1327 if (cached_state && *cached_state) {
1328 state = *cached_state;
1329 if (state->end == start - 1 && state->tree) {
1330 n = rb_next(&state->rb_node);
1331 while (n) {
1332 state = rb_entry(n, struct extent_state,
1333 rb_node);
1334 if (state->state & bits)
1335 goto got_it;
1336 n = rb_next(n);
1337 }
1338 free_extent_state(*cached_state);
1339 *cached_state = NULL;
1340 goto out;
1341 }
1342 free_extent_state(*cached_state);
1343 *cached_state = NULL;
1344 }
1345
1303 state = find_first_extent_bit_state(tree, start, bits); 1346 state = find_first_extent_bit_state(tree, start, bits);
1347got_it:
1304 if (state) { 1348 if (state) {
1349 cache_state(state, cached_state);
1305 *start_ret = state->start; 1350 *start_ret = state->start;
1306 *end_ret = state->end; 1351 *end_ret = state->end;
1307 ret = 0; 1352 ret = 0;
1308 } 1353 }
1354out:
1309 spin_unlock(&tree->lock); 1355 spin_unlock(&tree->lock);
1310 return ret; 1356 return ret;
1311} 1357}
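
The cached-state fast path added above only pays off when consecutive calls move strictly forward (state->end == start - 1); anything else drops the cache and falls back to the normal search. A sketch of the ascending scan loop this is designed for; EXTENT_DIRTY and the processing step are illustrative:

	/* Sketch: visit every DIRTY range in ascending order. Resuming at
	 * found_end + 1 is exactly the pattern the cached state accelerates. */
	struct extent_state *cached = NULL;
	u64 start = 0, found_start, found_end;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      EXTENT_DIRTY, &cached)) {
		/* ... process [found_start, found_end] ... */
		start = found_end + 1;
	}
	free_extent_state(cached);	/* NULL-safe */
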
@@ -1871,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1871 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1872 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1873 * submit_bio directly. 1919 * submit_bio directly.
1874 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1875 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1876 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1877 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1878 */ 1924 */
1879int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1880 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1881 int mirror_num) 1927 int mirror_num)
1882{ 1928{
@@ -1898,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1898 bio->bi_size = 0; 1944 bio->bi_size = 0;
1899 map_length = length; 1945 map_length = length;
1900 1946
1901 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1902 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1903 if (ret) { 1949 if (ret) {
1904 bio_put(bio); 1950 bio_put(bio);
@@ -1936,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1936int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1937 int mirror_num) 1983 int mirror_num)
1938{ 1984{
1939 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1940 u64 start = eb->start; 1985 u64 start = eb->start;
1941 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1942 int ret = 0; 1987 int ret = 0;
1943 1988
1944 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1945 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1946 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1947 start, p, mirror_num); 1992 start, p, mirror_num);
1948 if (ret) 1993 if (ret)
1949 break; 1994 break;
@@ -1962,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
1962 u64 private; 2007 u64 private;
1963 u64 private_failure; 2008 u64 private_failure;
1964 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
1965 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
1966 struct extent_state *state; 2011 struct extent_state *state;
1967 int num_copies; 2012 int num_copies;
1968 int did_repair = 0; 2013 int did_repair = 0;
@@ -1998,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
1998 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1999 2044
2000 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2001 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2002 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2003 failrec->len); 2048 failrec->len);
2004 if (num_copies > 1) { 2049 if (num_copies > 1) {
2005 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2006 failrec->logical, page, 2051 failrec->logical, page,
2007 failrec->failed_mirror); 2052 failrec->failed_mirror);
2008 did_repair = !ret; 2053 did_repair = !ret;
@@ -2068,7 +2113,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2068 } 2113 }
2069 read_unlock(&em_tree->lock); 2114 read_unlock(&em_tree->lock);
2070 2115
2071 if (!em || IS_ERR(em)) { 2116 if (!em) {
2072 kfree(failrec); 2117 kfree(failrec);
2073 return -EIO; 2118 return -EIO;
2074 } 2119 }
@@ -2111,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2111 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2112 */ 2157 */
2113 } 2158 }
2114 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2115 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2116 failrec->logical, failrec->len);
2117 if (num_copies == 1) { 2161 if (num_copies == 1) {
2118 /* 2162 /*
2119 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2304,8 +2348,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2304 struct extent_state *cached = NULL; 2348 struct extent_state *cached = NULL;
2305 struct extent_state *state; 2349 struct extent_state *state;
2306 2350
2307 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " 2351 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2308 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, 2352 "mirror=%ld\n", (u64)bio->bi_sector, err,
2309 (long int)bio->bi_bdev); 2353 (long int)bio->bi_bdev);
2310 tree = &BTRFS_I(page->mapping->host)->io_tree; 2354 tree = &BTRFS_I(page->mapping->host)->io_tree;
2311 2355
@@ -2418,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2418 return bio; 2462 return bio;
2419} 2463}
2420 2464
2421/*
2422 * Since writes are async, they will only return -ENOMEM.
2423 * Reads can return the full range of I/O error conditions.
2424 */
2425static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2426 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2427{ 2467{
@@ -2709,12 +2749,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2709 end_bio_extent_readpage, mirror_num, 2749 end_bio_extent_readpage, mirror_num,
2710 *bio_flags, 2750 *bio_flags,
2711 this_bio_flag); 2751 this_bio_flag);
2712 BUG_ON(ret == -ENOMEM); 2752 if (!ret) {
2713 nr++; 2753 nr++;
2714 *bio_flags = this_bio_flag; 2754 *bio_flags = this_bio_flag;
2755 }
2715 } 2756 }
2716 if (ret) 2757 if (ret) {
2717 SetPageError(page); 2758 SetPageError(page);
2759 unlock_extent(tree, cur, cur + iosize - 1);
2760 }
2718 cur = cur + iosize; 2761 cur = cur + iosize;
2719 pg_offset += iosize; 2762 pg_offset += iosize;
2720 } 2763 }
@@ -3161,12 +3204,16 @@ static int write_one_eb(struct extent_buffer *eb,
3161 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3204 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3162 u64 offset = eb->start; 3205 u64 offset = eb->start;
3163 unsigned long i, num_pages; 3206 unsigned long i, num_pages;
3207 unsigned long bio_flags = 0;
3164 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3208 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3165 int ret = 0; 3209 int ret = 0;
3166 3210
3167 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3211 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3168 num_pages = num_extent_pages(eb->start, eb->len); 3212 num_pages = num_extent_pages(eb->start, eb->len);
3169 atomic_set(&eb->io_pages, num_pages); 3213 atomic_set(&eb->io_pages, num_pages);
3214 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3215 bio_flags = EXTENT_BIO_TREE_LOG;
3216
3170 for (i = 0; i < num_pages; i++) { 3217 for (i = 0; i < num_pages; i++) {
3171 struct page *p = extent_buffer_page(eb, i); 3218 struct page *p = extent_buffer_page(eb, i);
3172 3219
@@ -3175,7 +3222,8 @@ static int write_one_eb(struct extent_buffer *eb,
3175 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3222 ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3176 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3223 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3177 -1, end_bio_extent_buffer_writepage, 3224 -1, end_bio_extent_buffer_writepage,
3178 0, 0, 0); 3225 0, epd->bio_flags, bio_flags);
3226 epd->bio_flags = bio_flags;
3179 if (ret) { 3227 if (ret) {
3180 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3228 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3181 SetPageError(p); 3229 SetPageError(p);
@@ -3210,6 +3258,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3210 .tree = tree, 3258 .tree = tree,
3211 .extent_locked = 0, 3259 .extent_locked = 0,
3212 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3260 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3261 .bio_flags = 0,
3213 }; 3262 };
3214 int ret = 0; 3263 int ret = 0;
3215 int done = 0; 3264 int done = 0;
@@ -3254,19 +3303,34 @@ retry:
3254 break; 3303 break;
3255 } 3304 }
3256 3305
3306 spin_lock(&mapping->private_lock);
3307 if (!PagePrivate(page)) {
3308 spin_unlock(&mapping->private_lock);
3309 continue;
3310 }
3311
3257 eb = (struct extent_buffer *)page->private; 3312 eb = (struct extent_buffer *)page->private;
3313
3314 /*
3315 * Shouldn't happen and normally this would be a BUG_ON
3316 * but no sense in crashing the user's box for something
3317 * we can survive anyway.
3318 */
3258 if (!eb) { 3319 if (!eb) {
3320 spin_unlock(&mapping->private_lock);
3259 WARN_ON(1); 3321 WARN_ON(1);
3260 continue; 3322 continue;
3261 } 3323 }
3262 3324
3263 if (eb == prev_eb) 3325 if (eb == prev_eb) {
3326 spin_unlock(&mapping->private_lock);
3264 continue; 3327 continue;
3328 }
3265 3329
3266 if (!atomic_inc_not_zero(&eb->refs)) { 3330 ret = atomic_inc_not_zero(&eb->refs);
3267 WARN_ON(1); 3331 spin_unlock(&mapping->private_lock);
3332 if (!ret)
3268 continue; 3333 continue;
3269 }
3270 3334
3271 prev_eb = eb; 3335 prev_eb = eb;
3272 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3336 ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3457,7 +3521,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
3457 if (epd->sync_io) 3521 if (epd->sync_io)
3458 rw = WRITE_SYNC; 3522 rw = WRITE_SYNC;
3459 3523
3460 ret = submit_one_bio(rw, epd->bio, 0, 0); 3524 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3461 BUG_ON(ret < 0); /* -ENOMEM */ 3525 BUG_ON(ret < 0); /* -ENOMEM */
3462 epd->bio = NULL; 3526 epd->bio = NULL;
3463 } 3527 }
@@ -3480,6 +3544,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3480 .get_extent = get_extent, 3544 .get_extent = get_extent,
3481 .extent_locked = 0, 3545 .extent_locked = 0,
3482 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3546 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3547 .bio_flags = 0,
3483 }; 3548 };
3484 3549
3485 ret = __extent_writepage(page, wbc, &epd); 3550 ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3569,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3504 .get_extent = get_extent, 3569 .get_extent = get_extent,
3505 .extent_locked = 1, 3570 .extent_locked = 1,
3506 .sync_io = mode == WB_SYNC_ALL, 3571 .sync_io = mode == WB_SYNC_ALL,
3572 .bio_flags = 0,
3507 }; 3573 };
3508 struct writeback_control wbc_writepages = { 3574 struct writeback_control wbc_writepages = {
3509 .sync_mode = mode, 3575 .sync_mode = mode,
@@ -3543,6 +3609,7 @@ int extent_writepages(struct extent_io_tree *tree,
3543 .get_extent = get_extent, 3609 .get_extent = get_extent,
3544 .extent_locked = 0, 3610 .extent_locked = 0,
3545 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3611 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3612 .bio_flags = 0,
3546 }; 3613 };
3547 3614
3548 ret = extent_write_cache_pages(tree, mapping, wbc, 3615 ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3987,6 @@ out:
3920 return ret; 3987 return ret;
3921} 3988}
3922 3989
3923inline struct page *extent_buffer_page(struct extent_buffer *eb,
3924 unsigned long i)
3925{
3926 return eb->pages[i];
3927}
3928
3929inline unsigned long num_extent_pages(u64 start, u64 len)
3930{
3931 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3932 (start >> PAGE_CACHE_SHIFT);
3933}
3934
3935static void __free_extent_buffer(struct extent_buffer *eb) 3990static void __free_extent_buffer(struct extent_buffer *eb)
3936{ 3991{
3937#if LEAK_DEBUG 3992#if LEAK_DEBUG
@@ -4047,8 +4102,8 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4047 4102
4048 return eb; 4103 return eb;
4049err: 4104err:
4050 for (i--; i > 0; i--) 4105 for (; i > 0; i--)
4051 __free_page(eb->pages[i]); 4106 __free_page(eb->pages[i - 1]);
4052 __free_extent_buffer(eb); 4107 __free_extent_buffer(eb);
4053 return NULL; 4108 return NULL;
4054} 4109}
@@ -4192,10 +4247,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4192 4247
4193 for (i = 0; i < num_pages; i++, index++) { 4248 for (i = 0; i < num_pages; i++, index++) {
4194 p = find_or_create_page(mapping, index, GFP_NOFS); 4249 p = find_or_create_page(mapping, index, GFP_NOFS);
4195 if (!p) { 4250 if (!p)
4196 WARN_ON(1);
4197 goto free_eb; 4251 goto free_eb;
4198 }
4199 4252
4200 spin_lock(&mapping->private_lock); 4253 spin_lock(&mapping->private_lock);
4201 if (PagePrivate(p)) { 4254 if (PagePrivate(p)) {
@@ -4338,7 +4391,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4338 4391
4339 /* Should be safe to release our pages at this point */ 4392 /* Should be safe to release our pages at this point */
4340 btrfs_release_extent_buffer_page(eb, 0); 4393 btrfs_release_extent_buffer_page(eb, 0);
4341
4342 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4394 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4343 return 1; 4395 return 1;
4344 } 4396 }
@@ -4661,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4661 } 4713 }
4662 4714
4663 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4664 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4665 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4666 eb->len, start, min_len); 4718 eb->len, start, min_len);
4667 WARN_ON(1);
4668 return -EINVAL; 4719 return -EINVAL;
4669 } 4720 }
4670 4721
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15d..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
27 * type for this bio 27 * type for this bio
28 */ 28 */
29#define EXTENT_BIO_COMPRESSED 1 29#define EXTENT_BIO_COMPRESSED 1
30#define EXTENT_BIO_TREE_LOG 2
30#define EXTENT_BIO_FLAG_SHIFT 16 31#define EXTENT_BIO_FLAG_SHIFT 16
31 32
32/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
232int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 233int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
233 gfp_t mask); 234 gfp_t mask);
234int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 235int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
235 int bits, int clear_bits, gfp_t mask); 236 int bits, int clear_bits,
237 struct extent_state **cached_state, gfp_t mask);
236int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 238int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 239 struct extent_state **cached_state, gfp_t mask);
240int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
241 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 242int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, int bits); 243 u64 *start_ret, u64 *end_ret, int bits,
244 struct extent_state **cached_state);
240struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 245struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
241 u64 start, int bits); 246 u64 start, int bits);
242int extent_invalidatepage(struct extent_io_tree *tree, 247int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
277int read_extent_buffer_pages(struct extent_io_tree *tree, 282int read_extent_buffer_pages(struct extent_io_tree *tree,
278 struct extent_buffer *eb, u64 start, int wait, 283 struct extent_buffer *eb, u64 start, int wait,
279 get_extent_t *get_extent, int mirror_num); 284 get_extent_t *get_extent, int mirror_num);
280unsigned long num_extent_pages(u64 start, u64 len); 285
281struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); 286static inline unsigned long num_extent_pages(u64 start, u64 len)
287{
288 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
289 (start >> PAGE_CACHE_SHIFT);
290}
291
292static inline struct page *extent_buffer_page(struct extent_buffer *eb,
293 unsigned long i)
294{
295 return eb->pages[i];
296}
282 297
283static inline void extent_buffer_get(struct extent_buffer *eb) 298static inline void extent_buffer_get(struct extent_buffer *eb)
284{ 299{
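
Making num_extent_pages() a static inline exposes what it is: plain round-up arithmetic counting how many PAGE_CACHE_SIZE pages the byte range [start, start + len) touches. A standalone userspace check of the same formula, assuming 4 KiB pages for the example:

	#include <assert.h>

	#define EX_PAGE_SHIFT 12		/* assume 4 KiB pages */
	#define EX_PAGE_SIZE (1ULL << EX_PAGE_SHIFT)

	static unsigned long long num_pages(unsigned long long start,
					    unsigned long long len)
	{
		return ((start + len + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT) -
		       (start >> EX_PAGE_SHIFT);
	}

	int main(void)
	{
		assert(num_pages(0, 4096) == 1);	/* one exact page */
		assert(num_pages(4095, 2) == 2);	/* straddles a boundary */
		assert(num_pages(8192, 16384) == 4);	/* aligned 16 KiB */
		return 0;
	}
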
@@ -322,9 +337,9 @@ struct bio *
322btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
323 gfp_t gfp_flags); 338 gfp_t gfp_flags);
324 339
325struct btrfs_mapping_tree; 340struct btrfs_fs_info;
326 341
327int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
328 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
329 int mirror_num); 344 int mirror_num);
330int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..f169d6b11d7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
11 11
12int __init extent_map_init(void) 12int __init extent_map_init(void)
13{ 13{
14 extent_map_cache = kmem_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("btrfs_extent_map",
15 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
17 if (!extent_map_cache) 17 if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
35void extent_map_tree_init(struct extent_map_tree *tree) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock); 39 rwlock_init(&tree->lock);
39} 40}
40 41
@@ -48,13 +49,15 @@ void extent_map_tree_init(struct extent_map_tree *tree)
48struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
49{ 50{
50 struct extent_map *em; 51 struct extent_map *em;
51 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
52 if (!em) 53 if (!em)
53 return NULL; 54 return NULL;
54 em->in_tree = 0; 55 em->in_tree = 0;
55 em->flags = 0; 56 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 57 em->compress_type = BTRFS_COMPRESS_NONE;
58 em->generation = 0;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
60 INIT_LIST_HEAD(&em->list);
58 return em; 61 return em;
59} 62}
60 63
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
72 WARN_ON(atomic_read(&em->refs) == 0); 75 WARN_ON(atomic_read(&em->refs) == 0);
73 if (atomic_dec_and_test(&em->refs)) { 76 if (atomic_dec_and_test(&em->refs)) {
74 WARN_ON(em->in_tree); 77 WARN_ON(em->in_tree);
78 WARN_ON(!list_empty(&em->list));
75 kmem_cache_free(extent_map_cache, em); 79 kmem_cache_free(extent_map_cache, em);
76 } 80 }
77} 81}
@@ -194,10 +198,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
194 merge = rb_entry(rb, struct extent_map, rb_node); 198 merge = rb_entry(rb, struct extent_map, rb_node);
195 if (rb && mergable_maps(merge, em)) { 199 if (rb && mergable_maps(merge, em)) {
196 em->start = merge->start; 200 em->start = merge->start;
201 em->orig_start = merge->orig_start;
197 em->len += merge->len; 202 em->len += merge->len;
198 em->block_len += merge->block_len; 203 em->block_len += merge->block_len;
199 em->block_start = merge->block_start; 204 em->block_start = merge->block_start;
200 merge->in_tree = 0; 205 merge->in_tree = 0;
206 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
207 em->mod_start = merge->mod_start;
208 em->generation = max(em->generation, merge->generation);
209 list_move(&em->list, &tree->modified_extents);
210
211 list_del_init(&merge->list);
201 rb_erase(&merge->rb_node, &tree->map); 212 rb_erase(&merge->rb_node, &tree->map);
202 free_extent_map(merge); 213 free_extent_map(merge);
203 } 214 }
@@ -211,14 +222,30 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
211 em->block_len += merge->len; 222 em->block_len += merge->len;
212 rb_erase(&merge->rb_node, &tree->map); 223 rb_erase(&merge->rb_node, &tree->map);
213 merge->in_tree = 0; 224 merge->in_tree = 0;
225 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
226 em->generation = max(em->generation, merge->generation);
227 list_del_init(&merge->list);
214 free_extent_map(merge); 228 free_extent_map(merge);
215 } 229 }
216} 230}
217 231
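
The mod_start/mod_len updates in try_merge_map() above keep the "modified" sub-range spanning both halves of a merge, which is what a later fsync will log. A tiny userspace check of the two merge formulas, using arbitrary 4 KiB extents:

	#include <assert.h>

	struct em { unsigned long long mod_start, mod_len; };

	int main(void)
	{
		/* backward merge: prev [0, 4096) is absorbed by em [4096, 8192) */
		struct em prev = { 0, 4096 }, em = { 4096, 4096 };

		em.mod_len = (em.mod_len + em.mod_start) - prev.mod_start;
		em.mod_start = prev.mod_start;
		assert(em.mod_start == 0 && em.mod_len == 8192);

		/* forward merge: next [8192, 12288) folds into em [0, 8192) */
		struct em next = { 8192, 4096 };

		em.mod_len = (next.mod_start + next.mod_len) - em.mod_start;
		assert(em.mod_start == 0 && em.mod_len == 12288);
		return 0;
	}
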
218int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) 232/**
233 * unpin_extent_cache - unpin an extent from the cache
234 * @tree: tree to unpin the extent in
235 * @start: logical offset in the file
236 * @len: length of the extent
237 * @gen: generation that this extent has been modified in
238 *
239 * Called after an extent has been written to disk properly. Set the generation
240 * to the generation that actually added the file item to the inode so we know
241 * we need to sync this extent when we call fsync().
242 */
243int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
244 u64 gen)
219{ 245{
220 int ret = 0; 246 int ret = 0;
221 struct extent_map *em; 247 struct extent_map *em;
248 bool prealloc = false;
222 249
223 write_lock(&tree->lock); 250 write_lock(&tree->lock);
224 em = lookup_extent_mapping(tree, start, len); 251 em = lookup_extent_mapping(tree, start, len);
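
Per the kernel-doc above, the unpin is also the moment the mapping learns which transaction made it durable: the extent moves onto modified_extents with @gen recorded, so a later fsync can decide whether the extent still needs logging. A hedged sketch of the call-site shape once an ordered extent completes; "ordered" is an illustrative ordered-extent pointer and the surrounding bookkeeping is elided:

	/* Sketch: the file extent item for this ordered extent is now in the
	 * tree, so unpin the cached mapping with the inserting transid. */
	ret = unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
				 ordered->file_offset, ordered->len,
				 trans->transid);
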
@@ -228,10 +255,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
228 if (!em) 255 if (!em)
229 goto out; 256 goto out;
230 257
258 list_move(&em->list, &tree->modified_extents);
259 em->generation = gen;
231 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 260 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
261 em->mod_start = em->start;
262 em->mod_len = em->len;
263
264 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
265 prealloc = true;
266 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
267 }
232 268
233 try_merge_map(tree, em); 269 try_merge_map(tree, em);
234 270
271 if (prealloc) {
272 em->mod_start = em->start;
273 em->mod_len = em->len;
274 }
275
235 free_extent_map(em); 276 free_extent_map(em);
236out: 277out:
237 write_unlock(&tree->lock); 278 write_unlock(&tree->lock);
@@ -269,6 +310,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
269 } 310 }
270 atomic_inc(&em->refs); 311 atomic_inc(&em->refs);
271 312
313 em->mod_start = em->start;
314 em->mod_len = em->len;
315
272 try_merge_map(tree, em); 316 try_merge_map(tree, em);
273out: 317out:
274 return ret; 318 return ret;
@@ -358,6 +402,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
358 402
359 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 403 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
360 rb_erase(&em->rb_node, &tree->map); 404 rb_erase(&em->rb_node, &tree->map);
405 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
406 list_del_init(&em->list);
361 em->in_tree = 0; 407 em->in_tree = 0;
362 return ret; 408 return ret;
363} 409}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761fe..922943ce29e8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,8 @@
13#define EXTENT_FLAG_COMPRESSED 1 13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
16 18
17struct extent_map { 19struct extent_map {
18 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -20,18 +22,24 @@ struct extent_map {
20 /* all of these are in bytes */ 22 /* all of these are in bytes */
21 u64 start; 23 u64 start;
22 u64 len; 24 u64 len;
25 u64 mod_start;
26 u64 mod_len;
23 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
24 u64 block_start; 29 u64 block_start;
25 u64 block_len; 30 u64 block_len;
31 u64 generation;
26 unsigned long flags; 32 unsigned long flags;
27 struct block_device *bdev; 33 struct block_device *bdev;
28 atomic_t refs; 34 atomic_t refs;
29 unsigned int in_tree; 35 unsigned int in_tree;
30 unsigned int compress_type; 36 unsigned int compress_type;
37 struct list_head list;
31}; 38};
32 39
33struct extent_map_tree { 40struct extent_map_tree {
34 struct rb_root map; 41 struct rb_root map;
42 struct list_head modified_extents;
35 rwlock_t lock; 43 rwlock_t lock;
36}; 44};
37 45
@@ -60,7 +68,7 @@ struct extent_map *alloc_extent_map(void);
60void free_extent_map(struct extent_map *em); 68void free_extent_map(struct extent_map *em);
61int __init extent_map_init(void); 69int __init extent_map_init(void);
62void extent_map_exit(void); 70void extent_map_exit(void);
63int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); 71int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
64struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 72struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
65 u64 start, u64 len); 73 u64 start, u64 len);
66#endif 74#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01dc..bd38cef42358 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "print-tree.h" 26#include "print-tree.h"
27 27
28#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ 28#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
29 sizeof(struct btrfs_item) * 2) / \ 29 sizeof(struct btrfs_item) * 2) / \
30 size) - 1)) 30 size) - 1))
31 31
32#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) 32#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
33 PAGE_CACHE_SIZE))
33 34
34#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
35 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
@@ -132,7 +133,6 @@ fail:
132 return ERR_PTR(ret); 133 return ERR_PTR(ret);
133} 134}
134 135
135
136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
137 struct btrfs_root *root, 137 struct btrfs_root *root,
138 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -150,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
150 return ret; 150 return ret;
151} 151}
152 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
153 173
154static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
155 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
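
btrfs_file_extent_length() reads whatever item path->slots[0] points at, and BUG()s on an unknown extent type, so it is only safe immediately after a search has positioned the path on an EXTENT_DATA item. A hedged usage sketch:

	/* Sketch: assumes a prior search (e.g. btrfs_lookup_file_extent())
	 * left path->slots[0] on a BTRFS_EXTENT_DATA_KEY item. */
	u64 len = btrfs_file_extent_length(path);

For regular and preallocated extents this returns num_bytes; for inline extents it returns the inline length.
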
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6b40e86121b..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,7 +39,9 @@
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h"
42 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
43/* 45/*
44 * when auto defrag is enabled we 46 * when auto defrag is enabled we
45 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -89,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
89 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
90 * pass in is freed 92 * pass in is freed
91 */ 93 */
92static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
93 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
94{ 96{
95 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -117,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
117 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
118 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
119 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
120 goto exists; 122 return -EEXIST;
121 } 123 }
122 } 124 }
123 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
124 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
125 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
126 return; 128 return 0;
129}
127 130
128exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
129 kfree(defrag); 132{
130 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
131 135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
138
139 return 1;
132} 140}
133 141
134/* 142/*
@@ -141,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
141 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
142 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
143 u64 transid; 151 u64 transid;
152 int ret;
144 153
145 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
146 return 0;
147
148 if (btrfs_fs_closing(root->fs_info))
149 return 0; 155 return 0;
150 156
151 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -156,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
156 else 162 else
157 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
158 164
159 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
160 if (!defrag) 166 if (!defrag)
161 return -ENOMEM; 167 return -ENOMEM;
162 168
@@ -165,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
165 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
166 172
167 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
168 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
169 __btrfs_add_inode_defrag(inode, defrag); 175 /*
170 else 176 * If we set the IN_DEFRAG flag and evict the inode from memory,
171 kfree(defrag); 177 * and then re-read this inode, the new inode doesn't have the
178 * IN_DEFRAG flag. In that case, we may find an existing defrag record.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
172 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
173 return 0; 187 return 0;
174} 188}
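
Returning -EEXIST instead of freeing internally moves ownership of the duplicate record to the caller; note that the unchanged comment near the top of this file still describes the old free-on-merge behavior. The pattern both call sites now follow, sketched:

	/* Sketch: the caller owns `defrag` until the insert succeeds; on
	 * -EEXIST the ranges were merged into the existing record, so the
	 * caller frees its own copy. */
	spin_lock(&root->fs_info->defrag_inodes_lock);
	ret = __btrfs_add_inode_defrag(inode, defrag);
	spin_unlock(&root->fs_info->defrag_inodes_lock);
	if (ret)
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
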
175 189
176/* 190/*
177 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
194 */
195void btrfs_requeue_inode_defrag(struct inode *inode,
196 struct inode_defrag *defrag)
197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need to merge
206 * the old and the new defrag records together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
217
218/*
219 * pick the defraggable inode that we want; if it doesn't exist, we will get
220 * the next one.
178 */ 221 */
179struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 222static struct inode_defrag *
180 u64 root, u64 ino, 223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
181 struct rb_node **next)
182{ 224{
183 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
184 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -189,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
189 tmp.ino = ino; 231 tmp.ino = ino;
190 tmp.root = root; 232 tmp.root = root;
191 233
192 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
193 while (p) { 236 while (p) {
194 parent = p; 237 parent = p;
195 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -200,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
200 else if (ret > 0) 243 else if (ret > 0)
201 p = parent->rb_right; 244 p = parent->rb_right;
202 else 245 else
203 return entry; 246 goto out;
204 } 247 }
205 248
206 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
207 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
208 parent = rb_next(parent); 251 if (parent)
209 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
210 } 253 else
211 *next = parent; 254 entry = NULL;
212 } 255 }
213 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
214} 261}
215 262
216/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
217 * run through the list of inodes in the FS that need
218 * defragging
219 */
220int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
221{ 264{
222 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
223 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
224 struct inode *inode; 292 struct inode *inode;
225 struct rb_node *n;
226 struct btrfs_key key; 293 struct btrfs_key key;
227 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
228 u64 first_ino = 0;
229 u64 root_objectid = 0;
230 int num_defrag; 295 int num_defrag;
231 int defrag_batch = 1024;
232 296
297 /* get the inode */
298 key.objectid = defrag->root;
299 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
300 key.offset = (u64)-1;
301 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
302 if (IS_ERR(inode_root)) {
303 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
304 return PTR_ERR(inode_root);
305 }
306
307 key.objectid = defrag->ino;
308 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
309 key.offset = 0;
310 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
311 if (IS_ERR(inode)) {
312 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
313 return PTR_ERR(inode);
314 }
315
316 /* do a chunk of defrag */
317 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
233 memset(&range, 0, sizeof(range)); 318 memset(&range, 0, sizeof(range));
234 range.len = (u64)-1; 319 range.len = (u64)-1;
320 range.start = defrag->last_offset;
321
322 sb_start_write(fs_info->sb);
323 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
324 BTRFS_DEFRAG_BATCH);
325 sb_end_write(fs_info->sb);
326 /*
327 * if we filled the whole defrag batch, there
328 * must be more work to do. Queue this defrag
329 * again
330 */
331 if (num_defrag == BTRFS_DEFRAG_BATCH) {
332 defrag->last_offset = range.start;
333 btrfs_requeue_inode_defrag(inode, defrag);
334 } else if (defrag->last_offset && !defrag->cycled) {
335 /*
336 * we didn't fill our defrag batch, but
337 * we didn't start at zero. Make sure we loop
338 * around to the start of the file.
339 */
340 defrag->last_offset = 0;
341 defrag->cycled = 1;
342 btrfs_requeue_inode_defrag(inode, defrag);
343 } else {
344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
345 }
346
347 iput(inode);
348 return 0;
349}
350
351/*
352 * run through the list of inodes in the FS that need
353 * defragging
354 */
355int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
356{
357 struct inode_defrag *defrag;
358 u64 first_ino = 0;
359 u64 root_objectid = 0;
235 360
236 atomic_inc(&fs_info->defrag_running); 361 atomic_inc(&fs_info->defrag_running);
237 spin_lock(&fs_info->defrag_inodes_lock);
238 while(1) { 362 while(1) {
239 n = NULL; 363 if (!__need_auto_defrag(fs_info->tree_root))
364 break;
240 365
241 /* find an inode to defrag */ 366 /* find an inode to defrag */
242 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 367 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
243 first_ino, &n); 368 first_ino);
244 if (!defrag) { 369 if (!defrag) {
245 if (n) { 370 if (root_objectid || first_ino) {
246 defrag = rb_entry(n, struct inode_defrag,
247 rb_node);
248 } else if (root_objectid || first_ino) {
249 root_objectid = 0; 371 root_objectid = 0;
250 first_ino = 0; 372 first_ino = 0;
251 continue; 373 continue;
@@ -254,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
254 } 376 }
255 } 377 }
256 378
257 /* remove it from the rbtree */
258 first_ino = defrag->ino + 1; 379 first_ino = defrag->ino + 1;
259 root_objectid = defrag->root; 380 root_objectid = defrag->root;
260 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
261
262 if (btrfs_fs_closing(fs_info))
263 goto next_free;
264 381
265 spin_unlock(&fs_info->defrag_inodes_lock); 382 __btrfs_run_defrag_inode(fs_info, defrag);
266
267 /* get the inode */
268 key.objectid = defrag->root;
269 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
270 key.offset = (u64)-1;
271 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
272 if (IS_ERR(inode_root))
273 goto next;
274
275 key.objectid = defrag->ino;
276 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
277 key.offset = 0;
278
279 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
280 if (IS_ERR(inode))
281 goto next;
282
283 /* do a chunk of defrag */
284 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
285 range.start = defrag->last_offset;
286 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
287 defrag_batch);
288 /*
289 * if we filled the whole defrag batch, there
290 * must be more work to do. Queue this defrag
291 * again
292 */
293 if (num_defrag == defrag_batch) {
294 defrag->last_offset = range.start;
295 __btrfs_add_inode_defrag(inode, defrag);
296 /*
297 * we don't want to kfree defrag, we added it back to
298 * the rbtree
299 */
300 defrag = NULL;
301 } else if (defrag->last_offset && !defrag->cycled) {
302 /*
303 * we didn't fill our defrag batch, but
304 * we didn't start at zero. Make sure we loop
305 * around to the start of the file.
306 */
307 defrag->last_offset = 0;
308 defrag->cycled = 1;
309 __btrfs_add_inode_defrag(inode, defrag);
310 defrag = NULL;
311 }
312
313 iput(inode);
314next:
315 spin_lock(&fs_info->defrag_inodes_lock);
316next_free:
317 kfree(defrag);
318 } 383 }
319 spin_unlock(&fs_info->defrag_inodes_lock);
320
321 atomic_dec(&fs_info->defrag_running); 384 atomic_dec(&fs_info->defrag_running);
322 385
323 /* 386 /*
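
btrfs_run_defrag_inodes() now drains the rbtree with btrfs_pick_defrag_inode() and does the per-inode work outside defrag_inodes_lock, with __need_auto_defrag() stopping the drain early on unmount or when the mount option is cleared. A hedged sketch of a background consumer; the surrounding kthread loop is elided and the guard shown is illustrative:

	/* Sketch: a cleaner-style worker periodically drains the queued
	 * defrag records for the whole filesystem. */
	if (btrfs_test_opt(fs_info->tree_root, AUTO_DEFRAG))
		btrfs_run_defrag_inodes(fs_info);
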
@@ -458,14 +521,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
458 * this drops all the extents in the cache that intersect the range 521 * this drops all the extents in the cache that intersect the range
459 * [start, end]. Existing extents are split as required. 522 * [start, end]. Existing extents are split as required.
460 */ 523 */
461int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 524void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
462 int skip_pinned) 525 int skip_pinned)
463{ 526{
464 struct extent_map *em; 527 struct extent_map *em;
465 struct extent_map *split = NULL; 528 struct extent_map *split = NULL;
466 struct extent_map *split2 = NULL; 529 struct extent_map *split2 = NULL;
467 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 530 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
468 u64 len = end - start + 1; 531 u64 len = end - start + 1;
532 u64 gen;
469 int ret; 533 int ret;
470 int testend = 1; 534 int testend = 1;
471 unsigned long flags; 535 unsigned long flags;
@@ -477,11 +541,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
477 testend = 0; 541 testend = 0;
478 } 542 }
479 while (1) { 543 while (1) {
544 int no_splits = 0;
545
480 if (!split) 546 if (!split)
481 split = alloc_extent_map(); 547 split = alloc_extent_map();
482 if (!split2) 548 if (!split2)
483 split2 = alloc_extent_map(); 549 split2 = alloc_extent_map();
484 BUG_ON(!split || !split2); /* -ENOMEM */ 550 if (!split || !split2)
551 no_splits = 1;
485 552
486 write_lock(&em_tree->lock); 553 write_lock(&em_tree->lock);
487 em = lookup_extent_mapping(em_tree, start, len); 554 em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +557,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
490 break; 557 break;
491 } 558 }
492 flags = em->flags; 559 flags = em->flags;
560 gen = em->generation;
493 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 561 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
494 if (testend && em->start + em->len >= start + len) { 562 if (testend && em->start + em->len >= start + len) {
495 free_extent_map(em); 563 free_extent_map(em);
@@ -506,6 +574,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
506 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 574 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
507 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 575 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
508 remove_extent_mapping(em_tree, em); 576 remove_extent_mapping(em_tree, em);
577 if (no_splits)
578 goto next;
509 579
510 if (em->block_start < EXTENT_MAP_LAST_BYTE && 580 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
511 em->start < start) { 581 em->start < start) {
@@ -518,12 +588,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
518 split->block_len = em->block_len; 588 split->block_len = em->block_len;
519 else 589 else
520 split->block_len = split->len; 590 split->block_len = split->len;
521 591 split->orig_block_len = max(split->block_len,
592 em->orig_block_len);
593 split->generation = gen;
522 split->bdev = em->bdev; 594 split->bdev = em->bdev;
523 split->flags = flags; 595 split->flags = flags;
524 split->compress_type = em->compress_type; 596 split->compress_type = em->compress_type;
525 ret = add_extent_mapping(em_tree, split); 597 ret = add_extent_mapping(em_tree, split);
526 BUG_ON(ret); /* Logic error */ 598 BUG_ON(ret); /* Logic error */
599 list_move(&split->list, &em_tree->modified_extents);
527 free_extent_map(split); 600 free_extent_map(split);
528 split = split2; 601 split = split2;
529 split2 = NULL; 602 split2 = NULL;
@@ -537,6 +610,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
537 split->bdev = em->bdev; 610 split->bdev = em->bdev;
538 split->flags = flags; 611 split->flags = flags;
539 split->compress_type = em->compress_type; 612 split->compress_type = em->compress_type;
613 split->generation = gen;
614 split->orig_block_len = max(em->block_len,
615 em->orig_block_len);
540 616
541 if (compressed) { 617 if (compressed) {
542 split->block_len = em->block_len; 618 split->block_len = em->block_len;
@@ -545,14 +621,16 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
545 } else { 621 } else {
546 split->block_len = split->len; 622 split->block_len = split->len;
547 split->block_start = em->block_start + diff; 623 split->block_start = em->block_start + diff;
548 split->orig_start = split->start; 624 split->orig_start = em->orig_start;
549 } 625 }
550 626
551 ret = add_extent_mapping(em_tree, split); 627 ret = add_extent_mapping(em_tree, split);
552 BUG_ON(ret); /* Logic error */ 628 BUG_ON(ret); /* Logic error */
629 list_move(&split->list, &em_tree->modified_extents);
553 free_extent_map(split); 630 free_extent_map(split);
554 split = NULL; 631 split = NULL;
555 } 632 }
633next:
556 write_unlock(&em_tree->lock); 634 write_unlock(&em_tree->lock);
557 635
558 /* once for us */ 636 /* once for us */
@@ -564,7 +642,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
564 free_extent_map(split); 642 free_extent_map(split);
565 if (split2) 643 if (split2)
566 free_extent_map(split2); 644 free_extent_map(split2);
567 return 0;
568} 645}
569 646
570/* 647/*
@@ -576,13 +653,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
576 * it is either truncated or split. Anything entirely inside the range 653 * it is either truncated or split. Anything entirely inside the range
577 * is deleted from the tree. 654 * is deleted from the tree.
578 */ 655 */
579int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 656int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
580 u64 start, u64 end, u64 *hint_byte, int drop_cache) 657 struct btrfs_root *root, struct inode *inode,
658 struct btrfs_path *path, u64 start, u64 end,
659 u64 *drop_end, int drop_cache)
581{ 660{
582 struct btrfs_root *root = BTRFS_I(inode)->root;
583 struct extent_buffer *leaf; 661 struct extent_buffer *leaf;
584 struct btrfs_file_extent_item *fi; 662 struct btrfs_file_extent_item *fi;
585 struct btrfs_path *path;
586 struct btrfs_key key; 663 struct btrfs_key key;
587 struct btrfs_key new_key; 664 struct btrfs_key new_key;
588 u64 ino = btrfs_ino(inode); 665 u64 ino = btrfs_ino(inode);
@@ -597,14 +674,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
597 int recow; 674 int recow;
598 int ret; 675 int ret;
599 int modify_tree = -1; 676 int modify_tree = -1;
677 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
678 int found = 0;
600 679
601 if (drop_cache) 680 if (drop_cache)
602 btrfs_drop_extent_cache(inode, start, end - 1, 0); 681 btrfs_drop_extent_cache(inode, start, end - 1, 0);
603 682
604 path = btrfs_alloc_path();
605 if (!path)
606 return -ENOMEM;
607
608 if (start >= BTRFS_I(inode)->disk_i_size) 683 if (start >= BTRFS_I(inode)->disk_i_size)
609 modify_tree = 0; 684 modify_tree = 0;
610 685
@@ -666,6 +741,7 @@ next_slot:
666 goto next_slot; 741 goto next_slot;
667 } 742 }
668 743
744 found = 1;
669 search_start = max(key.offset, start); 745 search_start = max(key.offset, start);
670 if (recow || !modify_tree) { 746 if (recow || !modify_tree) {
671 modify_tree = -1; 747 modify_tree = -1;
@@ -707,14 +783,13 @@ next_slot:
707 extent_end - start); 783 extent_end - start);
708 btrfs_mark_buffer_dirty(leaf); 784 btrfs_mark_buffer_dirty(leaf);
709 785
710 if (disk_bytenr > 0) { 786 if (update_refs && disk_bytenr > 0) {
711 ret = btrfs_inc_extent_ref(trans, root, 787 ret = btrfs_inc_extent_ref(trans, root,
712 disk_bytenr, num_bytes, 0, 788 disk_bytenr, num_bytes, 0,
713 root->root_key.objectid, 789 root->root_key.objectid,
714 new_key.objectid, 790 new_key.objectid,
715 start - extent_offset, 0); 791 start - extent_offset, 0);
716 BUG_ON(ret); /* -ENOMEM */ 792 BUG_ON(ret); /* -ENOMEM */
717 *hint_byte = disk_bytenr;
718 } 793 }
719 key.offset = start; 794 key.offset = start;
720 } 795 }
@@ -734,10 +809,8 @@ next_slot:
734 btrfs_set_file_extent_num_bytes(leaf, fi, 809 btrfs_set_file_extent_num_bytes(leaf, fi,
735 extent_end - end); 810 extent_end - end);
736 btrfs_mark_buffer_dirty(leaf); 811 btrfs_mark_buffer_dirty(leaf);
737 if (disk_bytenr > 0) { 812 if (update_refs && disk_bytenr > 0)
738 inode_sub_bytes(inode, end - key.offset); 813 inode_sub_bytes(inode, end - key.offset);
739 *hint_byte = disk_bytenr;
740 }
741 break; 814 break;
742 } 815 }
743 816
@@ -753,10 +826,8 @@ next_slot:
753 btrfs_set_file_extent_num_bytes(leaf, fi, 826 btrfs_set_file_extent_num_bytes(leaf, fi,
754 start - key.offset); 827 start - key.offset);
755 btrfs_mark_buffer_dirty(leaf); 828 btrfs_mark_buffer_dirty(leaf);
756 if (disk_bytenr > 0) { 829 if (update_refs && disk_bytenr > 0)
757 inode_sub_bytes(inode, extent_end - start); 830 inode_sub_bytes(inode, extent_end - start);
758 *hint_byte = disk_bytenr;
759 }
760 if (end == extent_end) 831 if (end == extent_end)
761 break; 832 break;
762 833
@@ -777,12 +848,13 @@ next_slot:
777 del_nr++; 848 del_nr++;
778 } 849 }
779 850
780 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 851 if (update_refs &&
852 extent_type == BTRFS_FILE_EXTENT_INLINE) {
781 inode_sub_bytes(inode, 853 inode_sub_bytes(inode,
782 extent_end - key.offset); 854 extent_end - key.offset);
783 extent_end = ALIGN(extent_end, 855 extent_end = ALIGN(extent_end,
784 root->sectorsize); 856 root->sectorsize);
785 } else if (disk_bytenr > 0) { 857 } else if (update_refs && disk_bytenr > 0) {
786 ret = btrfs_free_extent(trans, root, 858 ret = btrfs_free_extent(trans, root,
787 disk_bytenr, num_bytes, 0, 859 disk_bytenr, num_bytes, 0,
788 root->root_key.objectid, 860 root->root_key.objectid,
@@ -791,7 +863,6 @@ next_slot:
791 BUG_ON(ret); /* -ENOMEM */ 863 BUG_ON(ret); /* -ENOMEM */
792 inode_sub_bytes(inode, 864 inode_sub_bytes(inode,
793 extent_end - key.offset); 865 extent_end - key.offset);
794 *hint_byte = disk_bytenr;
795 } 866 }
796 867
797 if (end == extent_end) 868 if (end == extent_end)
@@ -806,7 +877,7 @@ next_slot:
806 del_nr); 877 del_nr);
807 if (ret) { 878 if (ret) {
808 btrfs_abort_transaction(trans, root, ret); 879 btrfs_abort_transaction(trans, root, ret);
809 goto out; 880 break;
810 } 881 }
811 882
812 del_nr = 0; 883 del_nr = 0;
@@ -825,7 +896,24 @@ next_slot:
825 btrfs_abort_transaction(trans, root, ret); 896 btrfs_abort_transaction(trans, root, ret);
826 } 897 }
827 898
828out: 899 if (drop_end)
900 *drop_end = found ? min(end, extent_end) : end;
901 btrfs_release_path(path);
902 return ret;
903}
904
905int btrfs_drop_extents(struct btrfs_trans_handle *trans,
906 struct btrfs_root *root, struct inode *inode, u64 start,
907 u64 end, int drop_cache)
908{
909 struct btrfs_path *path;
910 int ret;
911
912 path = btrfs_alloc_path();
913 if (!path)
914 return -ENOMEM;
915 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
916 drop_cache);
829 btrfs_free_path(path); 917 btrfs_free_path(path);
830 return ret; 918 return ret;
831} 919}
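Note on the API change above: callers lose the hint_byte out-parameter and now pass the root explicitly. Every call site converts mechanically, e.g. the cow_file_range_inline() hunk in inode.c later in this patch:

    /* before: hint_byte reported a disk bytenr usable as an allocation hint */
    ret = btrfs_drop_extents(trans, inode, start, aligned_end,
                             &hint_byte, 1);

    /* after: root is explicit, the unused hint is gone */
    ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);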
@@ -892,8 +980,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
892 int ret; 980 int ret;
893 u64 ino = btrfs_ino(inode); 981 u64 ino = btrfs_ino(inode);
894 982
895 btrfs_drop_extent_cache(inode, start, end - 1, 0);
896
897 path = btrfs_alloc_path(); 983 path = btrfs_alloc_path();
898 if (!path) 984 if (!path)
899 return -ENOMEM; 985 return -ENOMEM;
@@ -935,12 +1021,16 @@ again:
935 btrfs_set_item_key_safe(trans, root, path, &new_key); 1021 btrfs_set_item_key_safe(trans, root, path, &new_key);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 1022 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 1023 struct btrfs_file_extent_item);
1024 btrfs_set_file_extent_generation(leaf, fi,
1025 trans->transid);
938 btrfs_set_file_extent_num_bytes(leaf, fi, 1026 btrfs_set_file_extent_num_bytes(leaf, fi,
939 extent_end - end); 1027 extent_end - end);
940 btrfs_set_file_extent_offset(leaf, fi, 1028 btrfs_set_file_extent_offset(leaf, fi,
941 end - orig_offset); 1029 end - orig_offset);
942 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1030 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
943 struct btrfs_file_extent_item); 1031 struct btrfs_file_extent_item);
1032 btrfs_set_file_extent_generation(leaf, fi,
1033 trans->transid);
944 btrfs_set_file_extent_num_bytes(leaf, fi, 1034 btrfs_set_file_extent_num_bytes(leaf, fi,
945 end - other_start); 1035 end - other_start);
946 btrfs_mark_buffer_dirty(leaf); 1036 btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +1048,16 @@ again:
958 struct btrfs_file_extent_item); 1048 struct btrfs_file_extent_item);
959 btrfs_set_file_extent_num_bytes(leaf, fi, 1049 btrfs_set_file_extent_num_bytes(leaf, fi,
960 start - key.offset); 1050 start - key.offset);
1051 btrfs_set_file_extent_generation(leaf, fi,
1052 trans->transid);
961 path->slots[0]++; 1053 path->slots[0]++;
962 new_key.offset = start; 1054 new_key.offset = start;
963 btrfs_set_item_key_safe(trans, root, path, &new_key); 1055 btrfs_set_item_key_safe(trans, root, path, &new_key);
964 1056
965 fi = btrfs_item_ptr(leaf, path->slots[0], 1057 fi = btrfs_item_ptr(leaf, path->slots[0],
966 struct btrfs_file_extent_item); 1058 struct btrfs_file_extent_item);
1059 btrfs_set_file_extent_generation(leaf, fi,
1060 trans->transid);
967 btrfs_set_file_extent_num_bytes(leaf, fi, 1061 btrfs_set_file_extent_num_bytes(leaf, fi,
968 other_end - start); 1062 other_end - start);
969 btrfs_set_file_extent_offset(leaf, fi, 1063 btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1085,14 @@ again:
991 leaf = path->nodes[0]; 1085 leaf = path->nodes[0];
992 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1086 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
993 struct btrfs_file_extent_item); 1087 struct btrfs_file_extent_item);
1088 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
994 btrfs_set_file_extent_num_bytes(leaf, fi, 1089 btrfs_set_file_extent_num_bytes(leaf, fi,
995 split - key.offset); 1090 split - key.offset);
996 1091
997 fi = btrfs_item_ptr(leaf, path->slots[0], 1092 fi = btrfs_item_ptr(leaf, path->slots[0],
998 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
999 1094
1095 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1000 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1096 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1001 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1002 extent_end - split); 1098 extent_end - split);
@@ -1056,12 +1152,14 @@ again:
1056 struct btrfs_file_extent_item); 1152 struct btrfs_file_extent_item);
1057 btrfs_set_file_extent_type(leaf, fi, 1153 btrfs_set_file_extent_type(leaf, fi,
1058 BTRFS_FILE_EXTENT_REG); 1154 BTRFS_FILE_EXTENT_REG);
1155 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1059 btrfs_mark_buffer_dirty(leaf); 1156 btrfs_mark_buffer_dirty(leaf);
1060 } else { 1157 } else {
1061 fi = btrfs_item_ptr(leaf, del_slot - 1, 1158 fi = btrfs_item_ptr(leaf, del_slot - 1,
1062 struct btrfs_file_extent_item); 1159 struct btrfs_file_extent_item);
1063 btrfs_set_file_extent_type(leaf, fi, 1160 btrfs_set_file_extent_type(leaf, fi,
1064 BTRFS_FILE_EXTENT_REG); 1161 BTRFS_FILE_EXTENT_REG);
1162 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1065 btrfs_set_file_extent_num_bytes(leaf, fi, 1163 btrfs_set_file_extent_num_bytes(leaf, fi,
1066 extent_end - key.offset); 1164 extent_end - key.offset);
1067 btrfs_mark_buffer_dirty(leaf); 1165 btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1271,8 @@ again:
1173 1271
1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1272 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1175 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1273 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1176 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1274 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1177 GFP_NOFS); 1275 0, 0, &cached_state, GFP_NOFS);
1178 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1276 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1179 start_pos, last_pos - 1, &cached_state, 1277 start_pos, last_pos - 1, &cached_state,
1180 GFP_NOFS); 1278 GFP_NOFS);
@@ -1314,10 +1412,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1314 1412
1315 cond_resched(); 1413 cond_resched();
1316 1414
1317 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1415 balance_dirty_pages_ratelimited(inode->i_mapping);
1318 dirty_pages);
1319 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1416 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1320 btrfs_btree_balance_dirty(root, 1); 1417 btrfs_btree_balance_dirty(root);
1321 1418
1322 pos += copied; 1419 pos += copied;
1323 num_written += copied; 1420 num_written += copied;
@@ -1366,6 +1463,24 @@ out:
1366 return written ? written : err; 1463 return written ? written : err;
1367} 1464}
1368 1465
1466static void update_time_for_write(struct inode *inode)
1467{
1468 struct timespec now;
1469
1470 if (IS_NOCMTIME(inode))
1471 return;
1472
1473 now = current_fs_time(inode->i_sb);
1474 if (!timespec_equal(&inode->i_mtime, &now))
1475 inode->i_mtime = now;
1476
1477 if (!timespec_equal(&inode->i_ctime, &now))
1478 inode->i_ctime = now;
1479
1480 if (IS_I_VERSION(inode))
1481 inode_inc_iversion(inode);
1482}
1483
1369static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1484static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1370 const struct iovec *iov, 1485 const struct iovec *iov,
1371 unsigned long nr_segs, loff_t pos) 1486 unsigned long nr_segs, loff_t pos)
@@ -1378,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1378 ssize_t num_written = 0; 1493 ssize_t num_written = 0;
1379 ssize_t err = 0; 1494 ssize_t err = 0;
1380 size_t count, ocount; 1495 size_t count, ocount;
1496 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1381 1497
1382 sb_start_write(inode->i_sb); 1498 sb_start_write(inode->i_sb);
1383 1499
@@ -1420,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1420 goto out; 1536 goto out;
1421 } 1537 }
1422 1538
1423 err = file_update_time(file); 1539 /*
1424 if (err) { 1540 * We reserve space for updating the inode when we reserve space for the
1425 mutex_unlock(&inode->i_mutex); 1541 * extent we are going to write, so we will enospc out there. We don't
1426 goto out; 1542 * need to start yet another transaction to update the inode as we will
1427 } 1543 * update the inode when we finish writing whatever data we write.
1544 */
1545 update_time_for_write(inode);
1428 1546
1429 start_pos = round_down(pos, root->sectorsize); 1547 start_pos = round_down(pos, root->sectorsize);
1430 if (start_pos > i_size_read(inode)) { 1548 if (start_pos > i_size_read(inode)) {
@@ -1435,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1435 } 1553 }
1436 } 1554 }
1437 1555
1556 if (sync)
1557 atomic_inc(&BTRFS_I(inode)->sync_writers);
1558
1438 if (unlikely(file->f_flags & O_DIRECT)) { 1559 if (unlikely(file->f_flags & O_DIRECT)) {
1439 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1560 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1440 pos, ppos, count, ocount); 1561 pos, ppos, count, ocount);
@@ -1461,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1461 * this will either be one more than the running transaction 1582 * this will either be one more than the running transaction
1462 * or the generation used for the next transaction if there isn't 1583 * or the generation used for the next transaction if there isn't
1463 * one running right now. 1584 * one running right now.
1585 *
1586 * We also have to set last_sub_trans to the current log transid,
1587 * otherwise subsequent syncs to a file that's been synced in this
1588 * transaction will appear to have already occurred.
1464 */ 1589 */
1465 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1590 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1591 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1466 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1592 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1467 err = generic_write_sync(file, pos, num_written); 1593 err = generic_write_sync(file, pos, num_written);
1468 if (err < 0 && num_written > 0) 1594 if (err < 0 && num_written > 0)
1469 num_written = err; 1595 num_written = err;
1470 } 1596 }
1471out: 1597out:
1598 if (sync)
1599 atomic_dec(&BTRFS_I(inode)->sync_writers);
1472 sb_end_write(inode->i_sb); 1600 sb_end_write(inode->i_sb);
1473 current->backing_dev_info = NULL; 1601 current->backing_dev_info = NULL;
1474 return num_written ? num_written : err; 1602 return num_written ? num_written : err;
@@ -1514,16 +1642,26 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1514 1642
1515 trace_btrfs_sync_file(file, datasync); 1643 trace_btrfs_sync_file(file, datasync);
1516 1644
1645 /*
1646 * We write the dirty pages in the range and wait until they complete
1647 * outside of the ->i_mutex, so that multiple tasks can flush the
1648 * dirty pages concurrently and improve performance.
1649 */
1650 atomic_inc(&BTRFS_I(inode)->sync_writers);
1651 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1652 atomic_dec(&BTRFS_I(inode)->sync_writers);
1653 if (ret)
1654 return ret;
1655
1517 mutex_lock(&inode->i_mutex); 1656 mutex_lock(&inode->i_mutex);
1518 1657
1519 /* 1658 /*
1520 * we wait first, since the writeback may change the inode, also wait 1659 * We flush the dirty pages again to avoid some dirty pages in the
1521 * ordered range does a filemape_write_and_wait_range which is why we 1660 * range being left.
1522 * don't do it above like other file systems.
1523 */ 1661 */
1524 root->log_batch++; 1662 atomic_inc(&root->log_batch);
1525 btrfs_wait_ordered_range(inode, start, end); 1663 btrfs_wait_ordered_range(inode, start, end - start + 1);
1526 root->log_batch++; 1664 atomic_inc(&root->log_batch);
1527 1665
1528 /* 1666 /*
1529 * check the transaction that last modified this inode 1667 * check the transaction that last modified this inode
@@ -1544,6 +1682,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1544 BTRFS_I(inode)->last_trans <= 1682 BTRFS_I(inode)->last_trans <=
1545 root->fs_info->last_trans_committed) { 1683 root->fs_info->last_trans_committed) {
1546 BTRFS_I(inode)->last_trans = 0; 1684 BTRFS_I(inode)->last_trans = 0;
1685
1686 /*
1687 * We've had everything committed since the last time we were
1688 * modified, so clear this flag in case it was set for whatever
1689 * reason, it's no longer relevant.
1690 */
1691 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1692 &BTRFS_I(inode)->runtime_flags);
1547 mutex_unlock(&inode->i_mutex); 1693 mutex_unlock(&inode->i_mutex);
1548 goto out; 1694 goto out;
1549 } 1695 }
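The sync_writers counter introduced above is bumped around both the O_DSYNC write path and the pre-i_mutex flush in fsync. A minimal sketch of the pattern; the reader of the counter lives elsewhere in this series, and the inline-checksum use shown here is an assumption, not part of this hunk:

    atomic_inc(&BTRFS_I(inode)->sync_writers);  /* a task is waiting on this flush */
    ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
    atomic_dec(&BTRFS_I(inode)->sync_writers);

    /* hypothetical consumer in the submit path: */
    if (atomic_read(&BTRFS_I(inode)->sync_writers))
            csum_inline = 1;  /* don't queue csum work the waiter would block on */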
@@ -1615,6 +1761,329 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1615 return 0; 1761 return 0;
1616} 1762}
1617 1763
1764static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1765 int slot, u64 start, u64 end)
1766{
1767 struct btrfs_file_extent_item *fi;
1768 struct btrfs_key key;
1769
1770 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1771 return 0;
1772
1773 btrfs_item_key_to_cpu(leaf, &key, slot);
1774 if (key.objectid != btrfs_ino(inode) ||
1775 key.type != BTRFS_EXTENT_DATA_KEY)
1776 return 0;
1777
1778 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1779
1780 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1781 return 0;
1782
1783 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1784 return 0;
1785
1786 if (key.offset == end)
1787 return 1;
1788 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1789 return 1;
1790 return 0;
1791}
1792
1793static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1794 struct btrfs_path *path, u64 offset, u64 end)
1795{
1796 struct btrfs_root *root = BTRFS_I(inode)->root;
1797 struct extent_buffer *leaf;
1798 struct btrfs_file_extent_item *fi;
1799 struct extent_map *hole_em;
1800 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1801 struct btrfs_key key;
1802 int ret;
1803
1804 key.objectid = btrfs_ino(inode);
1805 key.type = BTRFS_EXTENT_DATA_KEY;
1806 key.offset = offset;
1807
1808
1809 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1810 if (ret < 0)
1811 return ret;
1812 BUG_ON(!ret);
1813
1814 leaf = path->nodes[0];
1815 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1816 u64 num_bytes;
1817
1818 path->slots[0]--;
1819 fi = btrfs_item_ptr(leaf, path->slots[0],
1820 struct btrfs_file_extent_item);
1821 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1822 end - offset;
1823 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1824 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1825 btrfs_set_file_extent_offset(leaf, fi, 0);
1826 btrfs_mark_buffer_dirty(leaf);
1827 goto out;
1828 }
1829
1830 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1831 u64 num_bytes;
1832
1833 path->slots[0]++;
1834 key.offset = offset;
1835 btrfs_set_item_key_safe(trans, root, path, &key);
1836 fi = btrfs_item_ptr(leaf, path->slots[0],
1837 struct btrfs_file_extent_item);
1838 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1839 offset;
1840 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1841 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1842 btrfs_set_file_extent_offset(leaf, fi, 0);
1843 btrfs_mark_buffer_dirty(leaf);
1844 goto out;
1845 }
1846 btrfs_release_path(path);
1847
1848 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1849 0, 0, end - offset, 0, end - offset,
1850 0, 0, 0);
1851 if (ret)
1852 return ret;
1853
1854out:
1855 btrfs_release_path(path);
1856
1857 hole_em = alloc_extent_map();
1858 if (!hole_em) {
1859 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1860 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1861 &BTRFS_I(inode)->runtime_flags);
1862 } else {
1863 hole_em->start = offset;
1864 hole_em->len = end - offset;
1865 hole_em->orig_start = offset;
1866
1867 hole_em->block_start = EXTENT_MAP_HOLE;
1868 hole_em->block_len = 0;
1869 hole_em->orig_block_len = 0;
1870 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1871 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1872 hole_em->generation = trans->transid;
1873
1874 do {
1875 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1876 write_lock(&em_tree->lock);
1877 ret = add_extent_mapping(em_tree, hole_em);
1878 if (!ret)
1879 list_move(&hole_em->list,
1880 &em_tree->modified_extents);
1881 write_unlock(&em_tree->lock);
1882 } while (ret == -EEXIST);
1883 free_extent_map(hole_em);
1884 if (ret)
1885 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1886 &BTRFS_I(inode)->runtime_flags);
1887 }
1888
1889 return 0;
1890}
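fill_holes() above keeps the extent-map cache coherent with a drop-then-insert retry: add_extent_mapping() returns -EEXIST while any overlapping mapping remains (a reader can race one back in), so the loop evicts and retries until the insert lands; if the map cannot be set up at all, BTRFS_INODE_NEEDS_FULL_SYNC forces the next fsync to fall back to a full log. The idiom, reduced to its essentials with the names used above:

    do {
            /* evict anything overlapping [offset, end) ... */
            btrfs_drop_extent_cache(inode, offset, end - 1, 0);
            write_lock(&em_tree->lock);
            /* ... then try to insert the new hole mapping */
            ret = add_extent_mapping(em_tree, hole_em);
            write_unlock(&em_tree->lock);
    } while (ret == -EEXIST);  /* lost a race with a reader: go again */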
1891
1892static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1893{
1894 struct btrfs_root *root = BTRFS_I(inode)->root;
1895 struct extent_state *cached_state = NULL;
1896 struct btrfs_path *path;
1897 struct btrfs_block_rsv *rsv;
1898 struct btrfs_trans_handle *trans;
1899 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1900 u64 lockend = round_down(offset + len,
1901 BTRFS_I(inode)->root->sectorsize) - 1;
1902 u64 cur_offset = lockstart;
1903 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1904 u64 drop_end;
1905 int ret = 0;
1906 int err = 0;
1907 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1908 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1909
1910 btrfs_wait_ordered_range(inode, offset, len);
1911
1912 mutex_lock(&inode->i_mutex);
1913 /*
1914 * We needn't truncate any page which is beyond the end of the file
1915 * because we are sure there is no data there.
1916 */
1917 /*
1918 * Only do this if we are in the same page and we aren't doing the
1919 * entire page.
1920 */
1921 if (same_page && len < PAGE_CACHE_SIZE) {
1922 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1923 ret = btrfs_truncate_page(inode, offset, len, 0);
1924 mutex_unlock(&inode->i_mutex);
1925 return ret;
1926 }
1927
1928 /* zero back part of the first page */
1929 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1930 ret = btrfs_truncate_page(inode, offset, 0, 0);
1931 if (ret) {
1932 mutex_unlock(&inode->i_mutex);
1933 return ret;
1934 }
1935 }
1936
1937 /* zero the front end of the last page */
1938 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1939 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1940 if (ret) {
1941 mutex_unlock(&inode->i_mutex);
1942 return ret;
1943 }
1944 }
1945
1946 if (lockend < lockstart) {
1947 mutex_unlock(&inode->i_mutex);
1948 return 0;
1949 }
1950
1951 while (1) {
1952 struct btrfs_ordered_extent *ordered;
1953
1954 truncate_pagecache_range(inode, lockstart, lockend);
1955
1956 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1957 0, &cached_state);
1958 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1959
1960 /*
1961 * We need to make sure we have no ordered extents in this range
1962 * and that nobody raced in and read a page in this range; if
1963 * either happened, we need to try again.
1964 */
1965 if ((!ordered ||
1966 (ordered->file_offset + ordered->len < lockstart ||
1967 ordered->file_offset > lockend)) &&
1968 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1969 lockend, EXTENT_UPTODATE, 0,
1970 cached_state)) {
1971 if (ordered)
1972 btrfs_put_ordered_extent(ordered);
1973 break;
1974 }
1975 if (ordered)
1976 btrfs_put_ordered_extent(ordered);
1977 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1978 lockend, &cached_state, GFP_NOFS);
1979 btrfs_wait_ordered_range(inode, lockstart,
1980 lockend - lockstart + 1);
1981 }
1982
1983 path = btrfs_alloc_path();
1984 if (!path) {
1985 ret = -ENOMEM;
1986 goto out;
1987 }
1988
1989 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1990 if (!rsv) {
1991 ret = -ENOMEM;
1992 goto out_free;
1993 }
1994 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1995 rsv->failfast = 1;
1996
1997 /*
1998 * 1 - update the inode
1999 * 1 - removing the extents in the range
2000 * 1 - adding the hole extent
2001 */
2002 trans = btrfs_start_transaction(root, 3);
2003 if (IS_ERR(trans)) {
2004 err = PTR_ERR(trans);
2005 goto out_free;
2006 }
2007
2008 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2009 min_size);
2010 BUG_ON(ret);
2011 trans->block_rsv = rsv;
2012
2013 while (cur_offset < lockend) {
2014 ret = __btrfs_drop_extents(trans, root, inode, path,
2015 cur_offset, lockend + 1,
2016 &drop_end, 1);
2017 if (ret != -ENOSPC)
2018 break;
2019
2020 trans->block_rsv = &root->fs_info->trans_block_rsv;
2021
2022 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2023 if (ret) {
2024 err = ret;
2025 break;
2026 }
2027
2028 cur_offset = drop_end;
2029
2030 ret = btrfs_update_inode(trans, root, inode);
2031 if (ret) {
2032 err = ret;
2033 break;
2034 }
2035
2036 btrfs_end_transaction(trans, root);
2037 btrfs_btree_balance_dirty(root);
2038
2039 trans = btrfs_start_transaction(root, 3);
2040 if (IS_ERR(trans)) {
2041 ret = PTR_ERR(trans);
2042 trans = NULL;
2043 break;
2044 }
2045
2046 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2047 rsv, min_size);
2048 BUG_ON(ret); /* shouldn't happen */
2049 trans->block_rsv = rsv;
2050 }
2051
2052 if (ret) {
2053 err = ret;
2054 goto out_trans;
2055 }
2056
2057 trans->block_rsv = &root->fs_info->trans_block_rsv;
2058 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2059 if (ret) {
2060 err = ret;
2061 goto out_trans;
2062 }
2063
2064out_trans:
2065 if (!trans)
2066 goto out_free;
2067
2068 inode_inc_iversion(inode);
2069 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2070
2071 trans->block_rsv = &root->fs_info->trans_block_rsv;
2072 ret = btrfs_update_inode(trans, root, inode);
2073 btrfs_end_transaction(trans, root);
2074 btrfs_btree_balance_dirty(root);
2075out_free:
2076 btrfs_free_path(path);
2077 btrfs_free_block_rsv(root, rsv);
2078out:
2079 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2080 &cached_state, GFP_NOFS);
2081 mutex_unlock(&inode->i_mutex);
2082 if (ret && !err)
2083 err = ret;
2084 return err;
2085}
2086
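btrfs_punch_hole() is reached through fallocate(2). A minimal userspace sketch; PUNCH_HOLE must be paired with KEEP_SIZE, which the VFS enforces before the filesystem sees the call:

    #define _GNU_SOURCE
    #include <fcntl.h>        /* fallocate(), FALLOC_FL_* */

    /* deallocate [offset, offset + len) without changing i_size;
     * subsequent reads of the hole return zeroes */
    static int punch(int fd, off_t offset, off_t len)
    {
            return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                             offset, len);
    }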
1618static long btrfs_fallocate(struct file *file, int mode, 2087static long btrfs_fallocate(struct file *file, int mode,
1619 loff_t offset, loff_t len) 2088 loff_t offset, loff_t len)
1620{ 2089{
@@ -1626,22 +2095,25 @@ static long btrfs_fallocate(struct file *file, int mode,
1626 u64 alloc_end; 2095 u64 alloc_end;
1627 u64 alloc_hint = 0; 2096 u64 alloc_hint = 0;
1628 u64 locked_end; 2097 u64 locked_end;
1629 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1630 struct extent_map *em; 2098 struct extent_map *em;
2099 int blocksize = BTRFS_I(inode)->root->sectorsize;
1631 int ret; 2100 int ret;
1632 2101
1633 alloc_start = offset & ~mask; 2102 alloc_start = round_down(offset, blocksize);
1634 alloc_end = (offset + len + mask) & ~mask; 2103 alloc_end = round_up(offset + len, blocksize);
1635 2104
1636 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 2105 /* Make sure we aren't being given some crap mode */
1637 if (mode & ~FALLOC_FL_KEEP_SIZE) 2106 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1638 return -EOPNOTSUPP; 2107 return -EOPNOTSUPP;
1639 2108
2109 if (mode & FALLOC_FL_PUNCH_HOLE)
2110 return btrfs_punch_hole(inode, offset, len);
2111
1640 /* 2112 /*
1641 * Make sure we have enough space before we do the 2113 * Make sure we have enough space before we do the
1642 * allocation. 2114 * allocation.
1643 */ 2115 */
1644 ret = btrfs_check_data_free_space(inode, len); 2116 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1645 if (ret) 2117 if (ret)
1646 return ret; 2118 return ret;
1647 2119
@@ -1709,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
1709 } 2181 }
1710 last_byte = min(extent_map_end(em), alloc_end); 2182 last_byte = min(extent_map_end(em), alloc_end);
1711 actual_end = min_t(u64, extent_map_end(em), offset + len); 2183 actual_end = min_t(u64, extent_map_end(em), offset + len);
1712 last_byte = (last_byte + mask) & ~mask; 2184 last_byte = ALIGN(last_byte, blocksize);
1713 2185
1714 if (em->block_start == EXTENT_MAP_HOLE || 2186 if (em->block_start == EXTENT_MAP_HOLE ||
1715 (cur_offset >= inode->i_size && 2187 (cur_offset >= inode->i_size &&
@@ -1748,11 +2220,11 @@ static long btrfs_fallocate(struct file *file, int mode,
1748out: 2220out:
1749 mutex_unlock(&inode->i_mutex); 2221 mutex_unlock(&inode->i_mutex);
1750 /* Let go of our reservation. */ 2222 /* Let go of our reservation. */
1751 btrfs_free_reserved_data_space(inode, len); 2223 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1752 return ret; 2224 return ret;
1753} 2225}
1754 2226
1755static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2227static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
1756{ 2228{
1757 struct btrfs_root *root = BTRFS_I(inode)->root; 2229 struct btrfs_root *root = BTRFS_I(inode)->root;
1758 struct extent_map *em; 2230 struct extent_map *em;
@@ -1786,7 +2258,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
1786 * before the position we want in case there is outstanding delalloc 2258 * before the position we want in case there is outstanding delalloc
1787 * going on here. 2259 * going on here.
1788 */ 2260 */
1789 if (origin == SEEK_HOLE && start != 0) { 2261 if (whence == SEEK_HOLE && start != 0) {
1790 if (start <= root->sectorsize) 2262 if (start <= root->sectorsize)
1791 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2263 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
1792 root->sectorsize, 0); 2264 root->sectorsize, 0);
@@ -1820,13 +2292,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
1820 } 2292 }
1821 } 2293 }
1822 2294
1823 if (origin == SEEK_HOLE) { 2295 if (whence == SEEK_HOLE) {
1824 *offset = start; 2296 *offset = start;
1825 free_extent_map(em); 2297 free_extent_map(em);
1826 break; 2298 break;
1827 } 2299 }
1828 } else { 2300 } else {
1829 if (origin == SEEK_DATA) { 2301 if (whence == SEEK_DATA) {
1830 if (em->block_start == EXTENT_MAP_DELALLOC) { 2302 if (em->block_start == EXTENT_MAP_DELALLOC) {
1831 if (start >= inode->i_size) { 2303 if (start >= inode->i_size) {
1832 free_extent_map(em); 2304 free_extent_map(em);
@@ -1863,16 +2335,16 @@ out:
1863 return ret; 2335 return ret;
1864} 2336}
1865 2337
1866static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2338static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
1867{ 2339{
1868 struct inode *inode = file->f_mapping->host; 2340 struct inode *inode = file->f_mapping->host;
1869 int ret; 2341 int ret;
1870 2342
1871 mutex_lock(&inode->i_mutex); 2343 mutex_lock(&inode->i_mutex);
1872 switch (origin) { 2344 switch (whence) {
1873 case SEEK_END: 2345 case SEEK_END:
1874 case SEEK_CUR: 2346 case SEEK_CUR:
1875 offset = generic_file_llseek(file, offset, origin); 2347 offset = generic_file_llseek(file, offset, whence);
1876 goto out; 2348 goto out;
1877 case SEEK_DATA: 2349 case SEEK_DATA:
1878 case SEEK_HOLE: 2350 case SEEK_HOLE:
@@ -1881,7 +2353,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
1881 return -ENXIO; 2353 return -ENXIO;
1882 } 2354 }
1883 2355
1884 ret = find_desired_extent(inode, &offset, origin); 2356 ret = find_desired_extent(inode, &offset, whence);
1885 if (ret) { 2357 if (ret) {
1886 mutex_unlock(&inode->i_mutex); 2358 mutex_unlock(&inode->i_mutex);
1887 return ret; 2359 return ret;
@@ -1924,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
1924 .compat_ioctl = btrfs_ioctl, 2396 .compat_ioctl = btrfs_ioctl,
1925#endif 2397#endif
1926}; 2398};
2399
2400void btrfs_auto_defrag_exit(void)
2401{
2402 if (btrfs_inode_defrag_cachep)
2403 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2404}
2405
2406int btrfs_auto_defrag_init(void)
2407{
2408 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2409 sizeof(struct inode_defrag), 0,
2410 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2411 NULL);
2412 if (!btrfs_inode_defrag_cachep)
2413 return -ENOMEM;
2414
2415 return 0;
2416}
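btrfs_auto_defrag_init()/_exit() pair up like the other btrfs slab caches; presumably the module init/exit paths in super.c call them roughly like this (a sketch, wiring assumed rather than shown in this excerpt):

    /* hypothetical: in init_btrfs_fs() */
    err = btrfs_auto_defrag_init();
    if (err)
            goto free_delayed_inode;  /* unwind caches created earlier */

    /* hypothetical: in exit_btrfs_fs() */
    btrfs_auto_defrag_exit();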
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5c..59ea2e4349c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -966,7 +965,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
966 block_group->key.offset)) { 965 block_group->key.offset)) {
967 ret = find_first_extent_bit(unpin, start, 966 ret = find_first_extent_bit(unpin, start,
968 &extent_start, &extent_end, 967 &extent_start, &extent_end,
969 EXTENT_DIRTY); 968 EXTENT_DIRTY, NULL);
970 if (ret) { 969 if (ret) {
971 ret = 0; 970 ret = 0;
972 break; 971 break;
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1454,9 +1443,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1454 max_t(u64, *offset, bitmap_info->offset)); 1443 max_t(u64, *offset, bitmap_info->offset));
1455 bits = bytes_to_bits(*bytes, ctl->unit); 1444 bits = bytes_to_bits(*bytes, ctl->unit);
1456 1445
1457 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1446 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1458 i < BITS_PER_BITMAP;
1459 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
1460 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1447 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1461 BITS_PER_BITMAP, i); 1448 BITS_PER_BITMAP, i);
1462 if ((next_zero - i) >= bits) { 1449 if ((next_zero - i) >= bits) {
@@ -1652,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1652 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1653 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1654 */ 1641 */
1655 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1656 block_group->key.offset)
1657 return false; 1643 return false;
1658 1644
1659 return true; 1645 return true;
@@ -2300,16 +2286,14 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2300 unsigned long total_found = 0; 2286 unsigned long total_found = 0;
2301 int ret; 2287 int ret;
2302 2288
2303 i = offset_to_bit(entry->offset, block_group->sectorsize, 2289 i = offset_to_bit(entry->offset, ctl->unit,
2304 max_t(u64, offset, entry->offset)); 2290 max_t(u64, offset, entry->offset));
2305 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2291 want_bits = bytes_to_bits(bytes, ctl->unit);
2306 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2292 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2307 2293
2308again: 2294again:
2309 found_bits = 0; 2295 found_bits = 0;
2310 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); 2296 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2311 i < BITS_PER_BITMAP;
2312 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2313 next_zero = find_next_zero_bit(entry->bitmap, 2297 next_zero = find_next_zero_bit(entry->bitmap,
2314 BITS_PER_BITMAP, i); 2298 BITS_PER_BITMAP, i);
2315 if (next_zero - i >= min_bits) { 2299 if (next_zero - i >= min_bits) {
@@ -2329,23 +2313,22 @@ again:
2329 2313
2330 total_found += found_bits; 2314 total_found += found_bits;
2331 2315
2332 if (cluster->max_size < found_bits * block_group->sectorsize) 2316 if (cluster->max_size < found_bits * ctl->unit)
2333 cluster->max_size = found_bits * block_group->sectorsize; 2317 cluster->max_size = found_bits * ctl->unit;
2334 2318
2335 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2319 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2336 i = next_zero + 1; 2320 i = next_zero + 1;
2337 goto again; 2321 goto again;
2338 } 2322 }
2339 2323
2340 cluster->window_start = start * block_group->sectorsize + 2324 cluster->window_start = start * ctl->unit + entry->offset;
2341 entry->offset;
2342 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2325 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2343 ret = tree_insert_offset(&cluster->root, entry->offset, 2326 ret = tree_insert_offset(&cluster->root, entry->offset,
2344 &entry->offset_index, 1); 2327 &entry->offset_index, 1);
2345 BUG_ON(ret); /* -EEXIST; Logic error */ 2328 BUG_ON(ret); /* -EEXIST; Logic error */
2346 2329
2347 trace_btrfs_setup_cluster(block_group, cluster, 2330 trace_btrfs_setup_cluster(block_group, cluster,
2348 total_found * block_group->sectorsize, 1); 2331 total_found * ctl->unit, 1);
2349 return 0; 2332 return 0;
2350} 2333}
2351 2334
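The two bitmap-walk conversions above are purely cosmetic: for_each_set_bit_from() expands to the very loop being deleted. From include/linux/bitops.h:

    #define for_each_set_bit_from(bit, addr, size)              \
            for ((bit) = find_next_bit((addr), (size), (bit));  \
                 (bit) < (size);                                \
                 (bit) = find_next_bit((addr), (size), (bit) + 1))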
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b99..1d982812ab67 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27
28/*
29 * Figure the key offset of an extended inode ref
30 */
31static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
32 int len)
33{
34 return (u64) crc32c(parent_objectid, name, len);
35}
36
27#endif 37#endif
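btrfs_extref_hash() supplies the key offset for BTRFS_INODE_EXTREF_KEY items, so an extended-backref lookup builds its search key as in the inode-item.c changes below:

    struct btrfs_key key;

    key.objectid = inode_objectid;                /* the inode being linked */
    key.type = BTRFS_INODE_EXTREF_KEY;
    key.offset = btrfs_extref_hash(ref_objectid,  /* parent directory ino */
                                   name, name_len);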
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c73..48b8fda93132 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
18 18
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "hash.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
50 return 0; 51 return 0;
51} 52}
52 53
53struct btrfs_inode_ref * 54int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
55 const char *name, int name_len,
56 struct btrfs_inode_extref **extref_ret)
57{
58 struct extent_buffer *leaf;
59 struct btrfs_inode_extref *extref;
60 unsigned long ptr;
61 unsigned long name_ptr;
62 u32 item_size;
63 u32 cur_offset = 0;
64 int ref_name_len;
65
66 leaf = path->nodes[0];
67 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
68 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
69
70 /*
71 * Search all extended backrefs in this item. We're only
72 * looking through any collisions so most of the time this is
73 * just going to compare against one buffer. If all is well,
74 * we'll return success and the inode ref object.
75 */
76 while (cur_offset < item_size) {
77 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
78 name_ptr = (unsigned long)(&extref->name);
79 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
80
81 if (ref_name_len == name_len &&
82 btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
83 (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
84 if (extref_ret)
85 *extref_ret = extref;
86 return 1;
87 }
88
89 cur_offset += ref_name_len + sizeof(*extref);
90 }
91 return 0;
92}
93
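The scan above works because extrefs whose (parent, name) hashes collide share a single item; each entry is a fixed header followed immediately by its name, packed back to back:

    /* payload of one BTRFS_INODE_EXTREF_KEY item:
     *
     *   [btrfs_inode_extref|name][btrfs_inode_extref|name]...
     *
     * hence the cursor advances by sizeof(*extref) + name_len
     * per entry until item_size is consumed */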
94static struct btrfs_inode_ref *
54btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 95btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, 96 struct btrfs_root *root,
56 struct btrfs_path *path, 97 struct btrfs_path *path,
57 const char *name, int name_len, 98 const char *name, int name_len,
58 u64 inode_objectid, u64 ref_objectid, int mod) 99 u64 inode_objectid, u64 ref_objectid, int ins_len,
100 int cow)
59{ 101{
102 int ret;
60 struct btrfs_key key; 103 struct btrfs_key key;
61 struct btrfs_inode_ref *ref; 104 struct btrfs_inode_ref *ref;
62 int ins_len = mod < 0 ? -1 : 0;
63 int cow = mod != 0;
64 int ret;
65 105
66 key.objectid = inode_objectid; 106 key.objectid = inode_objectid;
67 key.type = BTRFS_INODE_REF_KEY; 107 key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
77 return ref; 117 return ref;
78} 118}
79 119
80int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 120/* Returns NULL if no extref found */
121struct btrfs_inode_extref *
122btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
123 struct btrfs_root *root,
124 struct btrfs_path *path,
125 const char *name, int name_len,
126 u64 inode_objectid, u64 ref_objectid, int ins_len,
127 int cow)
128{
129 int ret;
130 struct btrfs_key key;
131 struct btrfs_inode_extref *extref;
132
133 key.objectid = inode_objectid;
134 key.type = BTRFS_INODE_EXTREF_KEY;
135 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
136
137 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
138 if (ret < 0)
139 return ERR_PTR(ret);
140 if (ret > 0)
141 return NULL;
142 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
143 return NULL;
144 return extref;
145}
146
147int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
148 struct btrfs_root *root,
149 struct btrfs_path *path,
150 const char *name, int name_len,
151 u64 inode_objectid, u64 ref_objectid, int mod,
152 u64 *ret_index)
153{
154 struct btrfs_inode_ref *ref;
155 struct btrfs_inode_extref *extref;
156 int ins_len = mod < 0 ? -1 : 0;
157 int cow = mod != 0;
158
159 ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
160 inode_objectid, ref_objectid, ins_len,
161 cow);
162 if (IS_ERR(ref))
163 return PTR_ERR(ref);
164
165 if (ref != NULL) {
166 *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
167 return 0;
168 }
169
170 btrfs_release_path(path);
171
172 extref = btrfs_lookup_inode_extref(trans, root, path, name,
173 name_len, inode_objectid,
174 ref_objectid, ins_len, cow);
175 if (IS_ERR(extref))
176 return PTR_ERR(extref);
177
178 if (extref) {
179 *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
180 return 0;
181 }
182
183 return -ENOENT;
184}
185
186int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root, 187 struct btrfs_root *root,
82 const char *name, int name_len, 188 const char *name, int name_len,
83 u64 inode_objectid, u64 ref_objectid, u64 *index) 189 u64 inode_objectid, u64 ref_objectid, u64 *index)
84{ 190{
85 struct btrfs_path *path; 191 struct btrfs_path *path;
86 struct btrfs_key key; 192 struct btrfs_key key;
193 struct btrfs_inode_extref *extref;
194 struct extent_buffer *leaf;
195 int ret;
196 int del_len = name_len + sizeof(*extref);
197 unsigned long ptr;
198 unsigned long item_start;
199 u32 item_size;
200
201 key.objectid = inode_objectid;
202 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
203 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
204
205 path = btrfs_alloc_path();
206 if (!path)
207 return -ENOMEM;
208
209 path->leave_spinning = 1;
210
211 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
212 if (ret > 0)
213 ret = -ENOENT;
214 if (ret < 0)
215 goto out;
216
217 /*
218 * Sanity check - did we find the right item for this name?
219 * This should always succeed; an error here will make the FS
220 * readonly.
221 */
222 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
223 name, name_len, &extref)) {
224 btrfs_std_error(root->fs_info, -ENOENT);
225 ret = -EROFS;
226 goto out;
227 }
228
229 leaf = path->nodes[0];
230 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
231 if (index)
232 *index = btrfs_inode_extref_index(leaf, extref);
233
234 if (del_len == item_size) {
235 /*
236 * Common case only one ref in the item, remove the
237 * whole item.
238 */
239 ret = btrfs_del_item(trans, root, path);
240 goto out;
241 }
242
243 ptr = (unsigned long)extref;
244 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
245
246 memmove_extent_buffer(leaf, ptr, ptr + del_len,
247 item_size - (ptr + del_len - item_start));
248
249 btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
250
251out:
252 btrfs_free_path(path);
253
254 return ret;
255}
256
257int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
258 struct btrfs_root *root,
259 const char *name, int name_len,
260 u64 inode_objectid, u64 ref_objectid, u64 *index)
261{
262 struct btrfs_path *path;
263 struct btrfs_key key;
87 struct btrfs_inode_ref *ref; 264 struct btrfs_inode_ref *ref;
88 struct extent_buffer *leaf; 265 struct extent_buffer *leaf;
89 unsigned long ptr; 266 unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
91 u32 item_size; 268 u32 item_size;
92 u32 sub_item_len; 269 u32 sub_item_len;
93 int ret; 270 int ret;
271 int search_ext_refs = 0;
94 int del_len = name_len + sizeof(*ref); 272 int del_len = name_len + sizeof(*ref);
95 273
96 key.objectid = inode_objectid; 274 key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
106 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 284 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
107 if (ret > 0) { 285 if (ret > 0) {
108 ret = -ENOENT; 286 ret = -ENOENT;
287 search_ext_refs = 1;
109 goto out; 288 goto out;
110 } else if (ret < 0) { 289 } else if (ret < 0) {
111 goto out; 290 goto out;
112 } 291 }
113 if (!find_name_in_backref(path, name, name_len, &ref)) { 292 if (!find_name_in_backref(path, name, name_len, &ref)) {
114 ret = -ENOENT; 293 ret = -ENOENT;
294 search_ext_refs = 1;
115 goto out; 295 goto out;
116 } 296 }
117 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
129 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); 309 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
130 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, 310 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
131 item_size - (ptr + sub_item_len - item_start)); 311 item_size - (ptr + sub_item_len - item_start));
132 btrfs_truncate_item(trans, root, path, 312 btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
133 item_size - sub_item_len, 1); 313out:
314 btrfs_free_path(path);
315
316 if (search_ext_refs) {
317 /*
318 * No refs were found, or we could not find the
319 * name in our ref array. Fall back to finding and
320 * removing the extended inode ref instead.
321 */
322 return btrfs_del_inode_extref(trans, root, name, name_len,
323 inode_objectid, ref_objectid, index);
324 }
325
326 return ret;
327}
328
329/*
330 * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
331 *
332 * The caller must have checked against BTRFS_LINK_MAX already.
333 */
334static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
335 struct btrfs_root *root,
336 const char *name, int name_len,
337 u64 inode_objectid, u64 ref_objectid, u64 index)
338{
339 struct btrfs_inode_extref *extref;
340 int ret;
341 int ins_len = name_len + sizeof(*extref);
342 unsigned long ptr;
343 struct btrfs_path *path;
344 struct btrfs_key key;
345 struct extent_buffer *leaf;
346 struct btrfs_item *item;
347
348 key.objectid = inode_objectid;
349 key.type = BTRFS_INODE_EXTREF_KEY;
350 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
351
352 path = btrfs_alloc_path();
353 if (!path)
354 return -ENOMEM;
355
356 path->leave_spinning = 1;
357 ret = btrfs_insert_empty_item(trans, root, path, &key,
358 ins_len);
359 if (ret == -EEXIST) {
360 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
361 name, name_len, NULL))
362 goto out;
363
364 btrfs_extend_item(trans, root, path, ins_len);
365 ret = 0;
366 }
367 if (ret < 0)
368 goto out;
369
370 leaf = path->nodes[0];
371 item = btrfs_item_nr(leaf, path->slots[0]);
372 ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
373 ptr += btrfs_item_size(leaf, item) - ins_len;
374 extref = (struct btrfs_inode_extref *)ptr;
375
376 btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
377 btrfs_set_inode_extref_index(path->nodes[0], extref, index);
378 btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
379
380 ptr = (unsigned long)&extref->name;
381 write_extent_buffer(path->nodes[0], name, ptr, name_len);
382 btrfs_mark_buffer_dirty(path->nodes[0]);
383
134out: 384out:
135 btrfs_free_path(path); 385 btrfs_free_path(path);
136 return ret; 386 return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
191 441
192out: 442out:
193 btrfs_free_path(path); 443 btrfs_free_path(path);
444
445 if (ret == -EMLINK) {
446 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
447 /* We ran out of space in the ref array. Need to
448 * add an extended ref. */
449 if (btrfs_super_incompat_flags(disk_super)
450 & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
451 ret = btrfs_insert_inode_extref(trans, root, name,
452 name_len,
453 inode_objectid,
454 ref_objectid, index);
455 }
456
194 return ret; 457 return ret;
195} 458}
196 459
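The -EMLINK fallback above means that on a filesystem with the EXTENDED_IREF incompat bit set, creating more same-directory hard links than fit in the packed BTRFS_INODE_REF_KEY item transparently spills into extref items. A userspace sketch that exercises the path (names and count are illustrative; the exact overflow point depends on name length and leaf size):

    #include <stdio.h>
    #include <unistd.h>

    static int make_links(const char *target, int n)
    {
            char name[32];

            for (int i = 0; i < n; i++) {
                    snprintf(name, sizeof(name), "link.%d", i);
                    if (link(target, name) != 0) {
                            perror("link");  /* EMLINK without extref support */
                            return -1;
                    }
            }
            return 0;
    }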
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
434 * 3 items for pre-allocation 434 * 3 items for pre-allocation
435 */ 435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 437 ret = btrfs_block_rsv_add(root, trans->block_rsv,
438 trans->bytes_reserved); 438 trans->bytes_reserved,
439 BTRFS_RESERVE_NO_FLUSH);
439 if (ret) 440 if (ret)
440 goto out; 441 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
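btrfs_block_rsv_add_noflush() is folded into btrfs_block_rsv_add() with an explicit flush-mode argument. The enum behind BTRFS_RESERVE_NO_FLUSH comes from ctree.h elsewhere in this series and is reproduced here as an assumption, not from this hunk:

    enum btrfs_reserve_flush_enum {
            BTRFS_RESERVE_NO_FLUSH,     /* never wait on flushing */
            BTRFS_RESERVE_FLUSH_LIMIT,  /* flush a bounded amount */
            BTRFS_RESERVE_FLUSH_ALL,    /* flush as much as needed */
    };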
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6ed6944e50c..67ed24ae86bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
71static struct extent_io_ops btrfs_extent_io_ops; 71static struct extent_io_ops btrfs_extent_io_ops;
72 72
73static struct kmem_cache *btrfs_inode_cachep; 73static struct kmem_cache *btrfs_inode_cachep;
74static struct kmem_cache *btrfs_delalloc_work_cachep;
74struct kmem_cache *btrfs_trans_handle_cachep; 75struct kmem_cache *btrfs_trans_handle_cachep;
75struct kmem_cache *btrfs_transaction_cachep; 76struct kmem_cache *btrfs_transaction_cachep;
76struct kmem_cache *btrfs_path_cachep; 77struct kmem_cache *btrfs_path_cachep;
@@ -94,8 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 95 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 96 u64 start, u64 end, int *page_started,
96 unsigned long *nr_written, int unlock); 97 unsigned long *nr_written, int unlock);
97static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 98static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
98 struct btrfs_root *root, struct inode *inode); 99 u64 len, u64 orig_start,
100 u64 block_start, u64 block_len,
101 u64 orig_block_len, int type);
99 102
100static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 103static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
101 struct inode *inode, struct inode *dir, 104 struct inode *inode, struct inode *dir,
@@ -230,7 +233,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
230 u64 inline_len = actual_end - start; 233 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) & 234 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1); 235 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len; 236 u64 data_len = inline_len;
235 int ret; 237 int ret;
236 238
@@ -247,8 +249,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
247 return 1; 249 return 1;
248 } 250 }
249 251
250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 252 ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
251 &hint_byte, 1);
252 if (ret) 253 if (ret)
253 return ret; 254 return ret;
254 255
@@ -664,7 +665,7 @@ retry:
664 async_extent->compressed_size, 665 async_extent->compressed_size,
665 async_extent->compressed_size, 666 async_extent->compressed_size,
666 0, alloc_hint, &ins, 1); 667 0, alloc_hint, &ins, 1);
667 if (ret) 668 if (ret && ret != -ENOSPC)
668 btrfs_abort_transaction(trans, root, ret); 669 btrfs_abort_transaction(trans, root, ret);
669 btrfs_end_transaction(trans, root); 670 btrfs_end_transaction(trans, root);
670 } 671 }
@@ -702,14 +703,19 @@ retry:
702 703
703 em->block_start = ins.objectid; 704 em->block_start = ins.objectid;
704 em->block_len = ins.offset; 705 em->block_len = ins.offset;
706 em->orig_block_len = ins.offset;
705 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 em->bdev = root->fs_info->fs_devices->latest_bdev;
706 em->compress_type = async_extent->compress_type; 708 em->compress_type = async_extent->compress_type;
707 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 set_bit(EXTENT_FLAG_PINNED, &em->flags);
708 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 710 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
711 em->generation = -1;
709 712
710 while (1) { 713 while (1) {
711 write_lock(&em_tree->lock); 714 write_lock(&em_tree->lock);
712 ret = add_extent_mapping(em_tree, em); 715 ret = add_extent_mapping(em_tree, em);
716 if (!ret)
717 list_move(&em->list,
718 &em_tree->modified_extents);
713 write_unlock(&em_tree->lock); 719 write_unlock(&em_tree->lock);
714 if (ret != -EEXIST) { 720 if (ret != -EEXIST) {
715 free_extent_map(em); 721 free_extent_map(em);
@@ -807,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
  * required to start IO on it.  It may be clean and already done with
  * IO when we return.
  */
-static noinline int cow_file_range(struct inode *inode,
-				   struct page *locked_page,
-				   u64 start, u64 end, int *page_started,
-				   unsigned long *nr_written,
-				   int unlock)
+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     struct btrfs_root *root,
+				     struct page *locked_page,
+				     u64 start, u64 end, int *page_started,
+				     unsigned long *nr_written,
+				     int unlock)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
 	unsigned long ram_size;
@@ -827,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
 	int ret = 0;
 
 	BUG_ON(btrfs_is_free_space_inode(inode));
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
-		return PTR_ERR(trans);
-	}
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize, num_bytes);
 	disk_num_bytes = num_bytes;
-	ret = 0;
 
 	/* if this is a small write inside eof, kick off defrag */
 	if (num_bytes < 64 * 1024 &&
@@ -904,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -956,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	ret = 0;
 out:
-	btrfs_end_transaction(trans, root);
-
 	return ret;
+
 out_unlock:
 	extent_clear_unlock_delalloc(inode,
 		     &BTRFS_I(inode)->io_tree,
@@ -975,6 +969,39 @@ out_unlock:
 	goto out;
 }
 
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		extent_clear_unlock_delalloc(inode,
+			     &BTRFS_I(inode)->io_tree,
+			     start, end, locked_page,
+			     EXTENT_CLEAR_UNLOCK_PAGE |
+			     EXTENT_CLEAR_UNLOCK |
+			     EXTENT_CLEAR_DELALLOC |
+			     EXTENT_CLEAR_DIRTY |
+			     EXTENT_SET_WRITEBACK |
+			     EXTENT_END_WRITEBACK);
+		return PTR_ERR(trans);
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	ret = __cow_file_range(trans, inode, root, locked_page, start, end,
+			       page_started, nr_written, unlock);
+
+	btrfs_end_transaction(trans, root);
+
+	return ret;
+}
+
 /*
  * work queue call back to started compression on a file and pages
  */
@@ -1130,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 extent_offset;
 	u64 disk_bytenr;
 	u64 num_bytes;
+	u64 disk_num_bytes;
 	int extent_type;
 	int ret, err;
 	int type;
@@ -1232,6 +1260,8 @@ next_slot:
 			extent_offset = btrfs_file_extent_offset(leaf, fi);
 			extent_end = found_key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
+			disk_num_bytes =
+				btrfs_file_extent_disk_num_bytes(leaf, fi);
 			if (extent_end <= start) {
 				path->slots[0]++;
 				goto next_slot;
@@ -1285,9 +1315,9 @@ out_check:
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
-			ret = cow_file_range(inode, locked_page, cow_start,
-					     found_key.offset - 1, page_started,
-					     nr_written, 1);
+			ret = __cow_file_range(trans, inode, root, locked_page,
+					       cow_start, found_key.offset - 1,
+					       page_started, nr_written, 1);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
 				goto error;
@@ -1302,15 +1332,21 @@ out_check:
 			em = alloc_extent_map();
 			BUG_ON(!em); /* -ENOMEM */
 			em->start = cur_offset;
-			em->orig_start = em->start;
+			em->orig_start = found_key.offset - extent_offset;
 			em->len = num_bytes;
 			em->block_len = num_bytes;
 			em->block_start = disk_bytenr;
+			em->orig_block_len = disk_num_bytes;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			set_bit(EXTENT_FLAG_FILLING, &em->flags);
+			em->generation = -1;
 			while (1) {
 				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
+				if (!ret)
+					list_move(&em->list,
+						  &em_tree->modified_extents);
 				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
@@ -1355,8 +1391,9 @@ out_check:
 	}
 
 	if (cow_start != (u64)-1) {
-		ret = cow_file_range(inode, locked_page, cow_start, end,
-				     page_started, nr_written, 1);
+		ret = __cow_file_range(trans, inode, root, locked_page,
+				       cow_start, end,
+				       page_started, nr_written, 1);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			goto error;
@@ -1364,11 +1401,7 @@ out_check:
 	}
 
 error:
-	if (nolock) {
-		err = btrfs_end_transaction_nolock(trans, root);
-	} else {
-		err = btrfs_end_transaction(trans, root);
-	}
+	err = btrfs_end_transaction(trans, root);
 	if (!ret)
 		ret = err;
 
@@ -1538,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_mapping_tree *map_tree;
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
@@ -1548,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 		return 0;
 
 	length = bio->bi_size;
-	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, READ, logical,
+	ret = btrfs_map_block(root->fs_info, READ, logical,
 			      &map_length, NULL, 0);
-	/* Will always return 0 or 1 with map_multi == NULL */
+	/* Will always return 0 with map_multi == NULL */
 	BUG_ON(ret < 0);
 	if (map_length < length + size)
 		return 1;
@@ -1593,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	int ret;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
@@ -1608,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	int ret = 0;
 	int skip_sum;
 	int metadata = 0;
+	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -1617,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	if (!(rw & REQ_WRITE)) {
 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
 		if (ret)
-			return ret;
+			goto out;
 
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
-			return btrfs_submit_compressed_read(inode, bio,
-						    mirror_num, bio_flags);
+			ret = btrfs_submit_compressed_read(inode, bio,
+							   mirror_num,
+							   bio_flags);
+			goto out;
 		} else if (!skip_sum) {
 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		goto mapit;
-	} else if (!skip_sum) {
+	} else if (async && !skip_sum) {
 		/* csum items have already been cloned */
 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
-		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   bio_flags, bio_offset,
 				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
+		goto out;
+	} else if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+		if (ret)
+			goto out;
 	}
 
mapit:
-	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+	if (ret < 0)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
@@ -1664,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
-		WARN_ON(1);
+	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   cached_state, GFP_NOFS);
 }
@@ -1785,7 +1833,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
-	u64 hint;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1803,8 +1850,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
-				 &hint, 0);
+	ret = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes, 0);
 	if (ret)
 		goto out;
 
@@ -1828,10 +1875,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
 
-	btrfs_unlock_up_safe(path, 1);
-	btrfs_set_lock_blocking(leaf);
-
 	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
 
 	inode_add_bytes(inode, num_bytes);
 
@@ -1877,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-		if (!ret) {
-			if (nolock)
-				trans = btrfs_join_transaction_nolock(root);
-			else
-				trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-				trans = NULL;
-				goto out;
-			}
-			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-			ret = btrfs_update_inode_fallback(trans, root, inode);
-			if (ret) /* -ENOMEM or corruption */
-				btrfs_abort_transaction(trans, root, ret);
+		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root);
+		else
+			trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
 		}
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+		ret = btrfs_update_inode_fallback(trans, root, inode);
+		if (ret) /* -ENOMEM or corruption */
+			btrfs_abort_transaction(trans, root, ret);
 		goto out;
 	}
 
@@ -1929,11 +1972,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
-		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-				   ordered_extent->file_offset,
-				   ordered_extent->len);
 	}
-
+	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+			   ordered_extent->file_offset, ordered_extent->len,
+			   trans->transid);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_unlock;
@@ -1942,13 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		ret = btrfs_update_inode_fallback(trans, root, inode);
-		if (ret) { /* -ENOMEM or corruption */
-			btrfs_abort_transaction(trans, root, ret);
-			goto out_unlock;
-		}
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode_fallback(trans, root, inode);
+	if (ret) { /* -ENOMEM or corruption */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
 	}
 	ret = 0;
out_unlock:
@@ -1958,12 +1998,8 @@ out_unlock:
out:
 	if (root != root->fs_info->tree_root)
 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (trans) {
-		if (nolock)
-			btrfs_end_transaction_nolock(trans, root);
-		else
-			btrfs_end_transaction(trans, root);
-	}
+	if (trans)
+		btrfs_end_transaction(trans, root);
 
 	if (ret)
 		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -2119,7 +2155,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	if (empty)
 		return;
 
-	down_read(&root->fs_info->cleanup_work_sem);
 	spin_lock(&fs_info->delayed_iput_lock);
 	list_splice_init(&fs_info->delayed_iputs, &list);
 	spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2165,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 		iput(delayed->inode);
 		kfree(delayed);
 	}
-	up_read(&root->fs_info->cleanup_work_sem);
 }
 
 enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2232,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	int ret;
 
 	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(root);
+		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 		if (!block_rsv)
 			return -ENOMEM;
 	}
@@ -2225,7 +2259,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 		insert = 1;
 #endif
 		insert = 1;
-		atomic_dec(&root->orphan_inodes);
+		atomic_inc(&root->orphan_inodes);
 	}
 
 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2590,6 +2624,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+	/*
+	 * If we were modified in the current generation and evicted from memory
+	 * and then re-read we need to do a full sync since we don't have any
+	 * idea about which extents were modified before we were evicted from
+	 * cache.
+	 */
+	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
@@ -2747,8 +2793,9 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	return btrfs_update_inode_item(trans, root, inode);
 }
 
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode)
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct inode *inode)
 {
 	int ret;
 
@@ -2894,7 +2941,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_path *path;
-	struct btrfs_inode_ref *ref;
 	struct btrfs_dir_item *di;
 	struct inode *inode = dentry->d_inode;
 	u64 index;
@@ -3008,17 +3054,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	}
 	btrfs_release_path(path);
 
-	ref = btrfs_lookup_inode_ref(trans, root, path,
-				dentry->d_name.name, dentry->d_name.len,
-				ino, dir_ino, 0);
-	if (IS_ERR(ref)) {
-		err = PTR_ERR(ref);
+	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
+					dentry->d_name.len, ino, dir_ino, 0,
+					&index);
+	if (ret) {
+		err = ret;
 		goto out;
 	}
-	BUG_ON(!ref); /* Logic error */
+
 	if (check_path_shared(root, path))
 		goto out;
-	index = btrfs_inode_ref_index(path->nodes[0], ref);
+
 	btrfs_release_path(path);
 
 	/*
@@ -3061,7 +3107,7 @@ out:
 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
-	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
 		btrfs_block_rsv_release(root, trans->block_rsv,
 					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3077,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	struct inode *inode = dentry->d_inode;
 	int ret;
-	unsigned long nr = 0;
 
 	trans = __unlink_start_trans(dir, dentry);
 	if (IS_ERR(trans))
@@ -3097,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 
out:
-	nr = trans->blocks_used;
 	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return ret;
 }
 
@@ -3189,11 +3233,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-	    btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+		return -EPERM;
 
 	trans = __unlink_start_trans(dir, dentry);
 	if (IS_ERR(trans))
@@ -3217,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!err)
 		btrfs_i_size_write(inode, 0);
out:
-	nr = trans->blocks_used;
 	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 
 	return err;
 }
@@ -3267,8 +3310,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = -1;
 
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
 	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
 
 	/*
 	 * This function is also used to drop the items in the log tree before
@@ -3429,12 +3477,6 @@ delete:
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot) {
-			if (root->ref_cows &&
-			    BTRFS_I(inode)->location.objectid !=
-						BTRFS_FREE_INO_OBJECTID) {
-				err = -EAGAIN;
-				goto out;
-			}
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						      pending_del_slot,
@@ -3465,12 +3507,20 @@ error:
 }
 
 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ *	offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
  */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front)
 {
-	struct inode *inode = mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -3485,17 +3535,18 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	u64 page_start;
 	u64 page_end;
 
-	if ((offset & (blocksize - 1)) == 0)
+	if ((offset & (blocksize - 1)) == 0 &&
+	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
-	ret = -ENOMEM;
again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -3532,7 +3583,8 @@ again:
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3543,10 +3595,14 @@ again:
 		goto out_unlock;
 	}
 
-	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
+		if (!len)
+			len = PAGE_CACHE_SIZE - offset;
 		kaddr = kmap(page);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		if (front)
+			memset(kaddr, 0, offset);
+		else
+			memset(kaddr + offset, 0, len);
 		flush_dcache_page(page);
 		kunmap(page);
 	}
@@ -3577,6 +3633,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (oldsize + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3670,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
-			u64 hint_byte = 0;
+			struct extent_map *hole_em;
 			hole_size = last_byte - cur_offset;
 
 			trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3679,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			err = btrfs_drop_extents(trans, inode, cur_offset,
-						 cur_offset + hole_size,
-						 &hint_byte, 1);
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
+						 cur_offset + hole_size, 1);
 			if (err) {
 				btrfs_abort_transaction(trans, root, err);
 				btrfs_end_transaction(trans, root);
@@ -3641,9 +3698,40 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			btrfs_drop_extent_cache(inode, hole_start,
-						last_byte - 1, 0);
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + hole_size - 1, 0);
+			hole_em = alloc_extent_map();
+			if (!hole_em) {
+				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					&BTRFS_I(inode)->runtime_flags);
+				goto next;
+			}
+			hole_em->start = cur_offset;
+			hole_em->len = hole_size;
+			hole_em->orig_start = cur_offset;
+
+			hole_em->block_start = EXTENT_MAP_HOLE;
+			hole_em->block_len = 0;
+			hole_em->orig_block_len = 0;
+			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->compress_type = BTRFS_COMPRESS_NONE;
+			hole_em->generation = trans->transid;
 
+			while (1) {
+				write_lock(&em_tree->lock);
+				err = add_extent_mapping(em_tree, hole_em);
+				if (!err)
+					list_move(&hole_em->list,
+						  &em_tree->modified_extents);
+				write_unlock(&em_tree->lock);
+				if (err != -EEXIST)
+					break;
+				btrfs_drop_extent_cache(inode, cur_offset,
+							cur_offset +
+							hole_size - 1, 0);
+			}
+			free_extent_map(hole_em);
+next:
 			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
@@ -3740,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv, *global_rsv;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
-	unsigned long nr;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -3768,29 +3855,26 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	rsv->size = min_size;
+	rsv->failfast = 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	btrfs_i_size_write(inode, 0);
 
 	/*
-	 * This is a bit simpler than btrfs_truncate since
-	 *
-	 * 1) We've already reserved our space for our orphan item in the
-	 *    unlink.
-	 * 2) We're going to delete the inode item, so we don't need to update
-	 *    it at all.
-	 *
-	 * So we just need to reserve some slack space in case we add bytes when
-	 * doing the truncate.
+	 * This is a bit simpler than btrfs_truncate since we've already
+	 * reserved our space for our orphan item in the unlink, so we just
+	 * need to reserve some slack space in case we add bytes and update
+	 * inode item when doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
+					     BTRFS_RESERVE_FLUSH_LIMIT);
 
 		/*
 		 * Try and steal from the global reserve since we will
@@ -3808,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction(root, 0);
+		trans = btrfs_start_transaction_lflush(root, 1);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3818,13 +3902,16 @@ void btrfs_evict_inode(struct inode *inode)
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-		if (ret != -EAGAIN)
+		if (ret != -ENOSPC)
 			break;
 
-		nr = trans->blocks_used;
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
+
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 	}
 
 	btrfs_free_block_rsv(root, rsv);
@@ -3840,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(inode));
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
no_delete:
 	clear_inode(inode);
 	return;
@@ -4470,10 +4556,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-		if (nolock)
-			ret = btrfs_end_transaction_nolock(trans, root);
-		else
-			ret = btrfs_commit_transaction(trans, root);
+		ret = btrfs_commit_transaction(trans, root);
 	}
 	return ret;
 }
@@ -4671,6 +4754,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
 
+	/*
+	 * We could have gotten an inode number from somebody who was fsynced
+	 * and then removed in this same transaction, so let's just set full
+	 * sync since it will be a full sync anyway and this will blow away the
+	 * old info in the log.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	if (S_ISDIR(mode))
 		owner = 0;
 	else
@@ -4680,6 +4771,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
 	key[0].offset = 0;
 
+	/*
+	 * Start new inodes with an inode_ref. This is slightly more
+	 * efficient for small numbers of hard links since they will
+	 * be packed into one item. Extended refs will kick in if we
+	 * add more hard links than can fit in the ref item.
+	 */
 	key[1].objectid = objectid;
 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 	key[1].offset = ref_objectid;
@@ -4721,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (S_ISREG(mode)) {
 		if (btrfs_test_opt(root, NODATASUM))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root, NODATACOW) ||
-		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
+		if (btrfs_test_opt(root, NODATACOW))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
 	}
 
@@ -4788,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
 				    parent_inode, &key,
 				    btrfs_inode_type(inode), index);
-	if (ret == -EEXIST)
+	if (ret == -EEXIST || ret == -EOVERFLOW)
 		goto fail_dir_item;
 	else if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
@@ -4843,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	unsigned long nr = 0;
 	u64 index = 0;
 
 	if (!new_valid_dev(rdev))
@@ -4876,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	err = btrfs_update_inode(trans, root, inode);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	/*
 	 * If the active LSM wants to access the inode during
 	 * d_instantiate it needs these. Smack checks to see
@@ -4893,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		d_instantiate(dentry, inode);
 	}
out_unlock:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4909,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int drop_inode = 0;
+	int drop_inode_on_err = 0;
 	int err;
-	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
@@ -4935,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		err = PTR_ERR(inode);
 		goto out_unlock;
 	}
+	drop_inode_on_err = 1;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err) {
-		drop_inode = 1;
+	if (err)
+		goto out_unlock;
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
 		goto out_unlock;
-	}
 
 	/*
 	 * If the active LSM wants to access the inode during
@@ -4953,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		d_instantiate(dentry, inode);
-	}
+		goto out_unlock;
+
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	d_instantiate(dentry, inode);
+
out_unlock:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	if (drop_inode) {
+	if (err && drop_inode_on_err) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -4978,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
 	u64 index;
-	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
 
@@ -4986,7 +5086,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EXDEV;
 
-	if (inode->i_nlink == ~0U)
+	if (inode->i_nlink >= BTRFS_LINK_MAX)
 		return -EMLINK;
 
 	err = btrfs_set_inode_index(dir, &index);
@@ -5008,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
 
@@ -5022,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		btrfs_log_new_name(trans, inode, NULL, parent);
 	}
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5042,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
-	unsigned long nr = 1;
 
 	/*
 	 * 2 items for inode and ref
@@ -5088,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	drop_on_err = 0;
 
out_fail:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	if (drop_on_err)
 		iput(inode);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5286,6 +5384,7 @@ again:
 		if (start + len <= found_key.offset)
 			goto not_found;
 		em->start = start;
+		em->orig_start = start;
 		em->len = found_key.offset - start;
 		goto not_found_em;
 	}
@@ -5296,6 +5395,8 @@ again:
 		em->len = extent_end - extent_start;
 		em->orig_start = extent_start -
 			btrfs_file_extent_offset(leaf, item);
+		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+								      item);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->block_start = EXTENT_MAP_HOLE;
@@ -5305,8 +5406,7 @@ again:
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
 			em->block_start = bytenr;
-			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
-									 item);
+			em->block_len = em->orig_block_len;
 		} else {
 			bytenr += btrfs_file_extent_offset(leaf, item);
 			em->block_start = bytenr;
@@ -5336,7 +5436,8 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-		em->orig_start = EXTENT_MAP_INLINE;
+		em->orig_block_len = em->len;
+		em->orig_start = em->start;
 		if (compress_type) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
@@ -5385,11 +5486,11 @@ again:
 				  extent_map_end(em) - 1, NULL, GFP_NOFS);
 		goto insert;
 	} else {
-		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
 	}
not_found:
 	em->start = start;
+	em->orig_start = start;
 	em->len = len;
not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
@@ -5450,7 +5551,8 @@ insert:
 	write_unlock(&em_tree->lock);
out:
 
-	trace_btrfs_get_extent(root, em);
+	if (em)
+		trace_btrfs_get_extent(root, em);
 
 	if (path)
 		btrfs_free_path(path);
@@ -5590,38 +5692,19 @@ out:
 }
 
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
-						  struct extent_map *em,
 						  u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_key ins;
 	u64 alloc_hint;
 	int ret;
-	bool insert = false;
-
-	/*
-	 * Ok if the extent map we looked up is a hole and is for the exact
-	 * range we want, there is no reason to allocate a new one, however if
-	 * it is not right then we need to free this one and drop the cache for
-	 * our range.
-	 */
-	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
-	    em->len != len) {
-		free_extent_map(em);
-		em = NULL;
-		insert = true;
-		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
-	}
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return ERR_CAST(trans);
 
-	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
-		btrfs_add_inode_defrag(trans, inode);
-
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5632,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 		goto out;
 	}
 
-	if (!em) {
-		em = alloc_extent_map();
-		if (!em) {
-			em = ERR_PTR(-ENOMEM);
-			goto out;
-		}
-	}
-
-	em->start = start;
-	em->orig_start = em->start;
-	em->len = ins.offset;
-
-	em->block_start = ins.objectid;
-	em->block_len = ins.offset;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
-
-	/*
-	 * We need to do this because if we're using the original em we searched
-	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
-	 */
-	em->flags = 0;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
-	while (insert) {
-		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		if (ret != -EEXIST)
-			break;
-		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
-	}
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, 0);
+	if (IS_ERR(em))
+		goto out;
 
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
@@ -5836,6 +5892,53 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 	return ret;
 }
 
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, int type)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	em = alloc_extent_map();
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	em->start = start;
+	em->orig_start = orig_start;
+	em->len = len;
+	em->block_len = block_len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->orig_block_len = orig_block_len;
+	em->generation = -1;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	if (type == BTRFS_ORDERED_PREALLOC)
+		set_bit(EXTENT_FLAG_FILLING, &em->flags);
+
+	do {
+		btrfs_drop_extent_cache(inode, em->start,
+					em->start + em->len - 1, 0);
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		if (!ret)
+			list_move(&em->list,
+				  &em_tree->modified_extents);
+		write_unlock(&em_tree->lock);
+	} while (ret == -EEXIST);
+
+	if (ret) {
+		free_extent_map(em);
+		return ERR_PTR(ret);
+	}
+
+	return em;
+}
+
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -5950,6 +6053,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			u64 orig_start = em->orig_start;
+			u64 orig_block_len = em->orig_block_len;
+
+			if (type == BTRFS_ORDERED_PREALLOC) {
+				free_extent_map(em);
+				em = create_pinned_em(inode, start, len,
+						      orig_start,
+						      block_start, len,
+						      orig_block_len, type);
+				if (IS_ERR(em)) {
+					btrfs_end_transaction(trans, root);
+					goto unlock_err;
+				}
+			}
+
 			ret = btrfs_add_ordered_extent_dio(inode, start,
 					   block_start, len, len, type);
 			btrfs_end_transaction(trans, root);
@@ -5967,7 +6085,8 @@ must_cow:
 	 * it above
 	 */
 	len = bh_result->b_size;
-	em = btrfs_new_extent_direct(inode, em, start, len);
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto unlock_err;
@@ -5999,7 +6118,8 @@ unlock:
5999 if (lockstart < lockend) { 6118 if (lockstart < lockend) {
6000 if (create && len < lockend - lockstart) { 6119 if (create && len < lockend - lockstart) {
6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6120 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6002 lockstart + len - 1, unlock_bits, 1, 0, 6121 lockstart + len - 1,
6122 unlock_bits | EXTENT_DEFRAG, 1, 0,
6003 &cached_state, GFP_NOFS); 6123 &cached_state, GFP_NOFS);
6004 /* 6124 /*
6005 * Beside unlock, we also need to cleanup reserved space 6125 * Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6127,8 @@ unlock:
6007 */ 6127 */
6008 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6128 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6009 lockstart + len, lockend, 6129 lockstart + len, lockend,
6010 unlock_bits | EXTENT_DO_ACCOUNTING, 6130 unlock_bits | EXTENT_DO_ACCOUNTING |
6011 1, 0, NULL, GFP_NOFS); 6131 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6012 } else { 6132 } else {
6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6133 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6014 lockend, unlock_bits, 1, 0, 6134 lockend, unlock_bits, 1, 0,
@@ -6207,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6207 struct btrfs_root *root = BTRFS_I(inode)->root; 6327 struct btrfs_root *root = BTRFS_I(inode)->root;
6208 int ret; 6328 int ret;
6209 6329
6330 if (async_submit)
6331 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6332
6210 bio_get(bio); 6333 bio_get(bio);
6211 6334
6212 if (!write) { 6335 if (!write) {
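
The hunk above stops offloading checksum work to the async helpers while a synchronous writer (O_SYNC or fsync) is accounted in sync_writers, trading throughput for latency. A C11-atomics sketch of the same gate (userspace analogue, invented names):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int sync_writers;  /* analogue of BTRFS_I(inode)->sync_writers */

static void submit_dio_bio(int async_submit)
{
        /* fall back to inline submission while a sync writer is waiting */
        if (async_submit)
                async_submit = !atomic_load(&sync_writers);

        printf("submit %s\n", async_submit ? "async" : "inline");
}

int main(void)
{
        submit_dio_bio(1);                      /* no sync writers -> async */
        atomic_fetch_add(&sync_writers, 1);     /* e.g. entering fsync */
        submit_dio_bio(1);                      /* -> inline */
        atomic_fetch_sub(&sync_writers, 1);
        return 0;
}
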
@@ -6251,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6251{ 6374{
6252 struct inode *inode = dip->inode; 6375 struct inode *inode = dip->inode;
6253 struct btrfs_root *root = BTRFS_I(inode)->root; 6376 struct btrfs_root *root = BTRFS_I(inode)->root;
6254 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6255 struct bio *bio; 6377 struct bio *bio;
6256 struct bio *orig_bio = dip->orig_bio; 6378 struct bio *orig_bio = dip->orig_bio;
6257 struct bio_vec *bvec = orig_bio->bi_io_vec; 6379 struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6264,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6264 int async_submit = 0; 6386 int async_submit = 0;
6265 6387
6266 map_length = orig_bio->bi_size; 6388 map_length = orig_bio->bi_size;
6267 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6389 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6268 &map_length, NULL, 0); 6390 &map_length, NULL, 0);
6269 if (ret) { 6391 if (ret) {
6270 bio_put(orig_bio); 6392 bio_put(orig_bio);
@@ -6318,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6318 bio->bi_end_io = btrfs_end_dio_bio; 6440 bio->bi_end_io = btrfs_end_dio_bio;
6319 6441
6320 map_length = orig_bio->bi_size; 6442 map_length = orig_bio->bi_size;
6321 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6443 ret = btrfs_map_block(root->fs_info, READ,
6444 start_sector << 9,
6322 &map_length, NULL, 0); 6445 &map_length, NULL, 0);
6323 if (ret) { 6446 if (ret) {
6324 bio_put(bio); 6447 bio_put(bio);
@@ -6471,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6471 btrfs_submit_direct, 0); 6594 btrfs_submit_direct, 0);
6472} 6595}
6473 6596
6597#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
6598
6474static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6599static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6475 __u64 start, __u64 len) 6600 __u64 start, __u64 len)
6476{ 6601{
6602 int ret;
6603
6604 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6605 if (ret)
6606 return ret;
6607
6477 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6608 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6478} 6609}
6479 6610
@@ -6573,8 +6704,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6573 */ 6704 */
6574 clear_extent_bit(tree, page_start, page_end, 6705 clear_extent_bit(tree, page_start, page_end,
6575 EXTENT_DIRTY | EXTENT_DELALLOC | 6706 EXTENT_DIRTY | EXTENT_DELALLOC |
6576 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6707 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
6577 &cached_state, GFP_NOFS); 6708 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
6578 /* 6709 /*
6579 * whoever cleared the private bit is responsible 6710 * whoever cleared the private bit is responsible
6580 * for the finish_ordered_io 6711 * for the finish_ordered_io
@@ -6590,7 +6721,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6590 } 6721 }
6591 clear_extent_bit(tree, page_start, page_end, 6722 clear_extent_bit(tree, page_start, page_end,
6592 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6723 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6593 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6724 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
6725 &cached_state, GFP_NOFS);
6594 __btrfs_releasepage(page, GFP_NOFS); 6726 __btrfs_releasepage(page, GFP_NOFS);
6595 6727
6596 ClearPageChecked(page); 6728 ClearPageChecked(page);
@@ -6687,7 +6819,8 @@ again:
6687 * prepare_pages in the normal write path. 6819 * prepare_pages in the normal write path.
6688 */ 6820 */
6689 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6821 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6690 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6822 EXTENT_DIRTY | EXTENT_DELALLOC |
6823 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
6691 0, 0, &cached_state, GFP_NOFS); 6824 0, 0, &cached_state, GFP_NOFS);
6692 6825
6693 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6826 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6851,7 @@ again:
6718 6851
6719 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6852 BTRFS_I(inode)->last_trans = root->fs_info->generation;
6720 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6853 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6854 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
6721 6855
6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6856 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6723 6857
@@ -6741,11 +6875,10 @@ static int btrfs_truncate(struct inode *inode)
6741 int ret; 6875 int ret;
6742 int err = 0; 6876 int err = 0;
6743 struct btrfs_trans_handle *trans; 6877 struct btrfs_trans_handle *trans;
6744 unsigned long nr;
6745 u64 mask = root->sectorsize - 1; 6878 u64 mask = root->sectorsize - 1;
6746 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6879 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6747 6880
6748 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6881 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
6749 if (ret) 6882 if (ret)
6750 return ret; 6883 return ret;
6751 6884
@@ -6788,10 +6921,11 @@ static int btrfs_truncate(struct inode *inode)
6788 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6921 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
6789 * updating the inode. 6922 * updating the inode.
6790 */ 6923 */
6791 rsv = btrfs_alloc_block_rsv(root); 6924 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
6792 if (!rsv) 6925 if (!rsv)
6793 return -ENOMEM; 6926 return -ENOMEM;
6794 rsv->size = min_size; 6927 rsv->size = min_size;
6928 rsv->failfast = 1;
6795 6929
6796 /* 6930 /*
6797 * 1 for the truncate slack space 6931 * 1 for the truncate slack space
@@ -6837,36 +6971,21 @@ static int btrfs_truncate(struct inode *inode)
6837 &BTRFS_I(inode)->runtime_flags)) 6971 &BTRFS_I(inode)->runtime_flags))
6838 btrfs_add_ordered_operation(trans, root, inode); 6972 btrfs_add_ordered_operation(trans, root, inode);
6839 6973
6840 while (1) { 6974 /*
6841 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6975 * If we truncate and then write and fsync, we would normally only
6842 if (ret) { 6976 * log the extents that changed, which is a problem if we first need
6843 /* 6977 * to truncate the entire inode. Set this flag so we write out all
6844 * This can only happen with the original transaction we 6978 * of the inode's extents to the sync log so we're completely
6845 * started above, every other time we shouldn't have a 6979 * safe.
6846 * transaction started yet. 6980 */
6847 */ 6981 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6848 if (ret == -EAGAIN) 6982 trans->block_rsv = rsv;
6849 goto end_trans;
6850 err = ret;
6851 break;
6852 }
6853
6854 if (!trans) {
6855 /* Just need the 1 for updating the inode */
6856 trans = btrfs_start_transaction(root, 1);
6857 if (IS_ERR(trans)) {
6858 ret = err = PTR_ERR(trans);
6859 trans = NULL;
6860 break;
6861 }
6862 }
6863
6864 trans->block_rsv = rsv;
6865 6983
6984 while (1) {
6866 ret = btrfs_truncate_inode_items(trans, root, inode, 6985 ret = btrfs_truncate_inode_items(trans, root, inode,
6867 inode->i_size, 6986 inode->i_size,
6868 BTRFS_EXTENT_DATA_KEY); 6987 BTRFS_EXTENT_DATA_KEY);
6869 if (ret != -EAGAIN) { 6988 if (ret != -ENOSPC) {
6870 err = ret; 6989 err = ret;
6871 break; 6990 break;
6872 } 6991 }
@@ -6877,11 +6996,21 @@ static int btrfs_truncate(struct inode *inode)
6877 err = ret; 6996 err = ret;
6878 break; 6997 break;
6879 } 6998 }
6880end_trans: 6999
6881 nr = trans->blocks_used;
6882 btrfs_end_transaction(trans, root); 7000 btrfs_end_transaction(trans, root);
6883 trans = NULL; 7001 btrfs_btree_balance_dirty(root);
6884 btrfs_btree_balance_dirty(root, nr); 7002
7003 trans = btrfs_start_transaction(root, 2);
7004 if (IS_ERR(trans)) {
7005 ret = err = PTR_ERR(trans);
7006 trans = NULL;
7007 break;
7008 }
7009
7010 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
7011 rsv, min_size);
7012 BUG_ON(ret); /* shouldn't happen */
7013 trans->block_rsv = rsv;
6885 } 7014 }
6886 7015
6887 if (ret == 0 && inode->i_nlink > 0) { 7016 if (ret == 0 && inode->i_nlink > 0) {
@@ -6903,9 +7032,8 @@ end_trans:
6903 if (ret && !err) 7032 if (ret && !err)
6904 err = ret; 7033 err = ret;
6905 7034
6906 nr = trans->blocks_used;
6907 ret = btrfs_end_transaction(trans, root); 7035 ret = btrfs_end_transaction(trans, root);
6908 btrfs_btree_balance_dirty(root, nr); 7036 btrfs_btree_balance_dirty(root);
6909 } 7037 }
6910 7038
6911out: 7039out:
@@ -6965,6 +7093,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6965 ei->csum_bytes = 0; 7093 ei->csum_bytes = 0;
6966 ei->index_cnt = (u64)-1; 7094 ei->index_cnt = (u64)-1;
6967 ei->last_unlink_trans = 0; 7095 ei->last_unlink_trans = 0;
7096 ei->last_log_commit = 0;
6968 7097
6969 spin_lock_init(&ei->lock); 7098 spin_lock_init(&ei->lock);
6970 ei->outstanding_extents = 0; 7099 ei->outstanding_extents = 0;
@@ -6981,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6981 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7110 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6982 ei->io_tree.track_uptodate = 1; 7111 ei->io_tree.track_uptodate = 1;
6983 ei->io_failure_tree.track_uptodate = 1; 7112 ei->io_failure_tree.track_uptodate = 1;
7113 atomic_set(&ei->sync_writers, 0);
6984 mutex_init(&ei->log_mutex); 7114 mutex_init(&ei->log_mutex);
6985 mutex_init(&ei->delalloc_mutex); 7115 mutex_init(&ei->delalloc_mutex);
6986 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7116 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7091,40 +7221,49 @@ void btrfs_destroy_cachep(void)
7091 kmem_cache_destroy(btrfs_path_cachep); 7221 kmem_cache_destroy(btrfs_path_cachep);
7092 if (btrfs_free_space_cachep) 7222 if (btrfs_free_space_cachep)
7093 kmem_cache_destroy(btrfs_free_space_cachep); 7223 kmem_cache_destroy(btrfs_free_space_cachep);
7224 if (btrfs_delalloc_work_cachep)
7225 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7094} 7226}
7095 7227
7096int btrfs_init_cachep(void) 7228int btrfs_init_cachep(void)
7097{ 7229{
7098 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7230 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7099 sizeof(struct btrfs_inode), 0, 7231 sizeof(struct btrfs_inode), 0,
7100 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7232 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7101 if (!btrfs_inode_cachep) 7233 if (!btrfs_inode_cachep)
7102 goto fail; 7234 goto fail;
7103 7235
7104 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7236 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7105 sizeof(struct btrfs_trans_handle), 0, 7237 sizeof(struct btrfs_trans_handle), 0,
7106 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7238 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7107 if (!btrfs_trans_handle_cachep) 7239 if (!btrfs_trans_handle_cachep)
7108 goto fail; 7240 goto fail;
7109 7241
7110 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7242 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7111 sizeof(struct btrfs_transaction), 0, 7243 sizeof(struct btrfs_transaction), 0,
7112 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7244 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7113 if (!btrfs_transaction_cachep) 7245 if (!btrfs_transaction_cachep)
7114 goto fail; 7246 goto fail;
7115 7247
7116 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7248 btrfs_path_cachep = kmem_cache_create("btrfs_path",
7117 sizeof(struct btrfs_path), 0, 7249 sizeof(struct btrfs_path), 0,
7118 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7250 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7119 if (!btrfs_path_cachep) 7251 if (!btrfs_path_cachep)
7120 goto fail; 7252 goto fail;
7121 7253
7122 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7254 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7123 sizeof(struct btrfs_free_space), 0, 7255 sizeof(struct btrfs_free_space), 0,
7124 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7256 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7125 if (!btrfs_free_space_cachep) 7257 if (!btrfs_free_space_cachep)
7126 goto fail; 7258 goto fail;
7127 7259
7260 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7261 sizeof(struct btrfs_delalloc_work), 0,
7262 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7263 NULL);
7264 if (!btrfs_delalloc_work_cachep)
7265 goto fail;
7266
7128 return 0; 7267 return 0;
7129fail: 7268fail:
7130 btrfs_destroy_cachep(); 7269 btrfs_destroy_cachep();
@@ -7196,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7196 if (S_ISDIR(old_inode->i_mode) && new_inode && 7335 if (S_ISDIR(old_inode->i_mode) && new_inode &&
7197 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7336 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7198 return -ENOTEMPTY; 7337 return -ENOTEMPTY;
7338
7339
7340 /* check for collisions, even if the name isn't there */
7341 ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7342 new_dentry->d_name.name,
7343 new_dentry->d_name.len);
7344
7345 if (ret) {
7346 if (ret == -EEXIST) {
7347 /* we shouldn't get -EEXIST
7348 * without a new_inode */
7349 if (!new_inode) {
7350 WARN_ON(1);
7351 return ret;
7352 }
7353 } else {
7354 /* maybe -EOVERFLOW */
7355 return ret;
7356 }
7357 }
7358 ret = 0;
7359
7199 /* 7360 /*
7200 * we're using rename to replace one file with another. 7361 * we're using rename to replace one file with another.
7201 * and the replacement file is large. Start IO on it now so 7362 * and the replacement file is large. Start IO on it now so
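
Dir items in btrfs are indexed by a hash of the name, so a rename target can clash with an existing entry even when the name itself is absent; the new check above fails the rename up front instead of deep inside the transaction. A toy illustration with a deliberately weak hash (btrfs really uses crc32c and can sometimes pack colliding names into one item; the toy simply refuses):

#include <stdio.h>
#include <string.h>

/* deliberately weak 8-bit hash so collisions are easy to provoke */
static unsigned char name_hash(const char *s)
{
        unsigned char h = 0;

        while (*s)
                h += (unsigned char)*s++;
        return h;
}

static const char *existing = "ab";     /* the one entry in our toy directory */

static int check_collision(const char *name)
{
        if (!strcmp(existing, name))
                return -17;             /* -EEXIST: name itself is taken */
        if (name_hash(existing) == name_hash(name))
                return -75;             /* -EOVERFLOW: hash slot occupied */
        return 0;
}

int main(void)
{
        printf("\"ba\" -> %d\n", check_collision("ba"));  /* collides with "ab" */
        printf("\"xy\" -> %d\n", check_collision("xy"));  /* fine */
        return 0;
}
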
@@ -7335,6 +7496,49 @@ out_notrans:
7335 return ret; 7496 return ret;
7336} 7497}
7337 7498
7499static void btrfs_run_delalloc_work(struct btrfs_work *work)
7500{
7501 struct btrfs_delalloc_work *delalloc_work;
7502
7503 delalloc_work = container_of(work, struct btrfs_delalloc_work,
7504 work);
7505 if (delalloc_work->wait)
7506 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7507 else
7508 filemap_flush(delalloc_work->inode->i_mapping);
7509
7510 if (delalloc_work->delay_iput)
7511 btrfs_add_delayed_iput(delalloc_work->inode);
7512 else
7513 iput(delalloc_work->inode);
7514 complete(&delalloc_work->completion);
7515}
7516
7517struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7518 int wait, int delay_iput)
7519{
7520 struct btrfs_delalloc_work *work;
7521
7522 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7523 if (!work)
7524 return NULL;
7525
7526 init_completion(&work->completion);
7527 INIT_LIST_HEAD(&work->list);
7528 work->inode = inode;
7529 work->wait = wait;
7530 work->delay_iput = delay_iput;
7531 work->work.func = btrfs_run_delalloc_work;
7532
7533 return work;
7534}
7535
7536void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7537{
7538 wait_for_completion(&work->completion);
7539 kmem_cache_free(btrfs_delalloc_work_cachep, work);
7540}
7541
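
Each delalloc work item pairs a queued worker with a completion the submitter blocks on, replacing the old inline filemap_flush() calls. A rough userspace analogue of complete()/wait_for_completion() using a pthread per item (build with -pthread; all names are invented):

#include <pthread.h>
#include <stdio.h>

struct delalloc_work {
        pthread_t thread;
        int wait_for_ordered;           /* mirrors work->wait */
        int done;
        pthread_mutex_t lock;
        pthread_cond_t cond;
};

static void *run_delalloc_work(void *arg)
{
        struct delalloc_work *w = arg;

        if (w->wait_for_ordered)
                puts("wait ordered range");     /* btrfs_wait_ordered_range() */
        else
                puts("flush mapping");          /* filemap_flush() */

        pthread_mutex_lock(&w->lock);
        w->done = 1;                            /* complete(&work->completion) */
        pthread_cond_signal(&w->cond);
        pthread_mutex_unlock(&w->lock);
        return NULL;
}

static void wait_and_free(struct delalloc_work *w)
{
        pthread_mutex_lock(&w->lock);
        while (!w->done)                        /* wait_for_completion() */
                pthread_cond_wait(&w->cond, &w->lock);
        pthread_mutex_unlock(&w->lock);
        pthread_join(w->thread, NULL);
}

int main(void)
{
        struct delalloc_work w = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
        };

        pthread_create(&w.thread, NULL, run_delalloc_work, &w);
        wait_and_free(&w);
        return 0;
}
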
7338/* 7542/*
7339 * some fairly slow code that needs optimization. This walks the list 7543 * some fairly slow code that needs optimization. This walks the list
7340 * of all the inodes with pending delalloc and forces them to disk. 7544 * of all the inodes with pending delalloc and forces them to disk.
@@ -7344,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7344 struct list_head *head = &root->fs_info->delalloc_inodes; 7548 struct list_head *head = &root->fs_info->delalloc_inodes;
7345 struct btrfs_inode *binode; 7549 struct btrfs_inode *binode;
7346 struct inode *inode; 7550 struct inode *inode;
7551 struct btrfs_delalloc_work *work, *next;
7552 struct list_head works;
7553 int ret = 0;
7347 7554
7348 if (root->fs_info->sb->s_flags & MS_RDONLY) 7555 if (root->fs_info->sb->s_flags & MS_RDONLY)
7349 return -EROFS; 7556 return -EROFS;
7350 7557
7558 INIT_LIST_HEAD(&works);
7559
7351 spin_lock(&root->fs_info->delalloc_lock); 7560 spin_lock(&root->fs_info->delalloc_lock);
7352 while (!list_empty(head)) { 7561 while (!list_empty(head)) {
7353 binode = list_entry(head->next, struct btrfs_inode, 7562 binode = list_entry(head->next, struct btrfs_inode,
@@ -7357,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7357 list_del_init(&binode->delalloc_inodes); 7566 list_del_init(&binode->delalloc_inodes);
7358 spin_unlock(&root->fs_info->delalloc_lock); 7567 spin_unlock(&root->fs_info->delalloc_lock);
7359 if (inode) { 7568 if (inode) {
7360 filemap_flush(inode->i_mapping); 7569 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7361 if (delay_iput) 7570 if (!work) {
7362 btrfs_add_delayed_iput(inode); 7571 ret = -ENOMEM;
7363 else 7572 goto out;
7364 iput(inode); 7573 }
7574 list_add_tail(&work->list, &works);
7575 btrfs_queue_worker(&root->fs_info->flush_workers,
7576 &work->work);
7365 } 7577 }
7366 cond_resched(); 7578 cond_resched();
7367 spin_lock(&root->fs_info->delalloc_lock); 7579 spin_lock(&root->fs_info->delalloc_lock);
@@ -7380,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7380 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7592 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7381 } 7593 }
7382 atomic_dec(&root->fs_info->async_submit_draining); 7594 atomic_dec(&root->fs_info->async_submit_draining);
7383 return 0; 7595out:
7596 list_for_each_entry_safe(work, next, &works, list) {
7597 list_del_init(&work->list);
7598 btrfs_wait_and_free_delalloc_work(work);
7599 }
7600 return ret;
7384} 7601}
7385 7602
7386static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7603static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7400,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7400 unsigned long ptr; 7617 unsigned long ptr;
7401 struct btrfs_file_extent_item *ei; 7618 struct btrfs_file_extent_item *ei;
7402 struct extent_buffer *leaf; 7619 struct extent_buffer *leaf;
7403 unsigned long nr = 0;
7404 7620
7405 name_len = strlen(symname) + 1; 7621 name_len = strlen(symname) + 1;
7406 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7622 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7498,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7498out_unlock: 7714out_unlock:
7499 if (!err) 7715 if (!err)
7500 d_instantiate(dentry, inode); 7716 d_instantiate(dentry, inode);
7501 nr = trans->blocks_used;
7502 btrfs_end_transaction(trans, root); 7717 btrfs_end_transaction(trans, root);
7503 if (drop_inode) { 7718 if (drop_inode) {
7504 inode_dec_link_count(inode); 7719 inode_dec_link_count(inode);
7505 iput(inode); 7720 iput(inode);
7506 } 7721 }
7507 btrfs_btree_balance_dirty(root, nr); 7722 btrfs_btree_balance_dirty(root);
7508 return err; 7723 return err;
7509} 7724}
7510 7725
@@ -7513,6 +7728,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7513 loff_t actual_len, u64 *alloc_hint, 7728 loff_t actual_len, u64 *alloc_hint,
7514 struct btrfs_trans_handle *trans) 7729 struct btrfs_trans_handle *trans)
7515{ 7730{
7731 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
7732 struct extent_map *em;
7516 struct btrfs_root *root = BTRFS_I(inode)->root; 7733 struct btrfs_root *root = BTRFS_I(inode)->root;
7517 struct btrfs_key ins; 7734 struct btrfs_key ins;
7518 u64 cur_offset = start; 7735 u64 cur_offset = start;
@@ -7553,6 +7770,38 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7553 btrfs_drop_extent_cache(inode, cur_offset, 7770 btrfs_drop_extent_cache(inode, cur_offset,
7554 cur_offset + ins.offset -1, 0); 7771 cur_offset + ins.offset -1, 0);
7555 7772
7773 em = alloc_extent_map();
7774 if (!em) {
7775 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
7776 &BTRFS_I(inode)->runtime_flags);
7777 goto next;
7778 }
7779
7780 em->start = cur_offset;
7781 em->orig_start = cur_offset;
7782 em->len = ins.offset;
7783 em->block_start = ins.objectid;
7784 em->block_len = ins.offset;
7785 em->orig_block_len = ins.offset;
7786 em->bdev = root->fs_info->fs_devices->latest_bdev;
7787 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7788 em->generation = trans->transid;
7789
7790 while (1) {
7791 write_lock(&em_tree->lock);
7792 ret = add_extent_mapping(em_tree, em);
7793 if (!ret)
7794 list_move(&em->list,
7795 &em_tree->modified_extents);
7796 write_unlock(&em_tree->lock);
7797 if (ret != -EEXIST)
7798 break;
7799 btrfs_drop_extent_cache(inode, cur_offset,
7800 cur_offset + ins.offset - 1,
7801 0);
7802 }
7803 free_extent_map(em);
7804next:
7556 num_bytes -= ins.offset; 7805 num_bytes -= ins.offset;
7557 cur_offset += ins.offset; 7806 cur_offset += ins.offset;
7558 *alloc_hint = ins.objectid + ins.offset; 7807 *alloc_hint = ins.objectid + ins.offset;
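
__btrfs_prealloc_file_range() is the backend for fallocate()-style preallocation; the hunk above additionally inserts an extent map flagged EXTENT_FLAG_PREALLOC so the fsync/log code can find the new range. From userspace the path is reached like this:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("prealloc.dat", O_CREAT | O_RDWR, 0644);

        if (fd < 0)
                return 1;
        /* reserve 16 MiB of unwritten extents without growing i_size */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
                perror("fallocate");
        close(fd);
        return 0;
}
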
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47127c1bd290..4b4516770f05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
55#include "backref.h" 55#include "backref.h"
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h"
58 59
59/* Mask out flags that are inappropriate for the given type of inode. */ 60/* Mask out flags that are inappropriate for the given type of inode. */
60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 141 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 } 142 }
142 143
143 if (flags & BTRFS_INODE_NODATACOW) 144 if (flags & BTRFS_INODE_NODATACOW) {
144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
146 if (S_ISREG(inode->i_mode))
147 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
148 }
145 149
146 btrfs_update_iflags(inode); 150 btrfs_update_iflags(inode);
147} 151}
@@ -181,6 +185,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
181 int ret; 185 int ret;
182 u64 ip_oldflags; 186 u64 ip_oldflags;
183 unsigned int i_oldflags; 187 unsigned int i_oldflags;
188 umode_t mode;
184 189
185 if (btrfs_root_readonly(root)) 190 if (btrfs_root_readonly(root))
186 return -EROFS; 191 return -EROFS;
@@ -203,6 +208,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 208
204 ip_oldflags = ip->flags; 209 ip_oldflags = ip->flags;
205 i_oldflags = inode->i_flags; 210 i_oldflags = inode->i_flags;
211 mode = inode->i_mode;
206 212
207 flags = btrfs_mask_flags(inode->i_mode, flags); 213 flags = btrfs_mask_flags(inode->i_mode, flags);
208 oldflags = btrfs_flags_to_ioctl(ip->flags); 214 oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +243,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
237 ip->flags |= BTRFS_INODE_DIRSYNC; 243 ip->flags |= BTRFS_INODE_DIRSYNC;
238 else 244 else
239 ip->flags &= ~BTRFS_INODE_DIRSYNC; 245 ip->flags &= ~BTRFS_INODE_DIRSYNC;
240 if (flags & FS_NOCOW_FL) 246 if (flags & FS_NOCOW_FL) {
241 ip->flags |= BTRFS_INODE_NODATACOW; 247 if (S_ISREG(mode)) {
242 else 248 /*
243 ip->flags &= ~BTRFS_INODE_NODATACOW; 249 * It's safe to turn csums off here, no extents exist.
250 * Otherwise we want the flag to reflect the real COW
251 * status of the file, so we will not set it.
252 */
253 if (inode->i_size == 0)
254 ip->flags |= BTRFS_INODE_NODATACOW
255 | BTRFS_INODE_NODATASUM;
256 } else {
257 ip->flags |= BTRFS_INODE_NODATACOW;
258 }
259 } else {
260 /*
261 * Revert under the same assumptions as above
262 */
263 if (S_ISREG(mode)) {
264 if (inode->i_size == 0)
265 ip->flags &= ~(BTRFS_INODE_NODATACOW
266 | BTRFS_INODE_NODATASUM);
267 } else {
268 ip->flags &= ~BTRFS_INODE_NODATACOW;
269 }
270 }
244 271
245 /* 272 /*
246 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 273 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
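
Per the new rules above, toggling FS_NOCOW_FL on a regular file also drops or restores data checksums only while the file is still empty; on non-regular files just the COW bit moves. A userspace round trip through the standard flag ioctls (the same bit chattr +C flips):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        int flags, fd;

        /* must still be empty, or btrfs keeps the checksums */
        fd = open("nocow.dat", O_CREAT | O_RDWR | O_TRUNC, 0644);
        if (fd < 0)
                return 1;

        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
                return 1;
        flags |= FS_NOCOW_FL;
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
                perror("FS_IOC_SETFLAGS");

        close(fd);
        return 0;
}
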
@@ -320,7 +347,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
320 return -EOPNOTSUPP; 347 return -EOPNOTSUPP;
321 if (copy_from_user(&range, arg, sizeof(range))) 348 if (copy_from_user(&range, arg, sizeof(range)))
322 return -EFAULT; 349 return -EFAULT;
323 if (range.start > total_bytes) 350 if (range.start > total_bytes ||
351 range.len < fs_info->sb->s_blocksize)
324 return -EINVAL; 352 return -EINVAL;
325 353
326 range.len = min(range.len, total_bytes - range.start); 354 range.len = min(range.len, total_bytes - range.start);
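
The extra check rejects trim requests shorter than one filesystem block instead of rounding them into a silent no-op. For reference, a minimal FITRIM caller; "/mnt" is a placeholder mount point, and the ioctl requires CAP_SYS_ADMIN:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        struct fstrim_range range = {
                .start = 0,
                .len = ~0ULL,           /* clamped to the filesystem size */
                .minlen = 0,
        };
        int fd = open("/mnt", O_RDONLY);        /* placeholder mount point */

        if (fd < 0)
                return 1;
        /* a len below one block now fails with -EINVAL instead of no-op */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}
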
@@ -516,7 +544,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
516 if (!pending_snapshot) 544 if (!pending_snapshot)
517 return -ENOMEM; 545 return -ENOMEM;
518 546
519 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 547 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
548 BTRFS_BLOCK_RSV_TEMP);
520 pending_snapshot->dentry = dentry; 549 pending_snapshot->dentry = dentry;
521 pending_snapshot->root = root; 550 pending_snapshot->root = root;
522 pending_snapshot->readonly = readonly; 551 pending_snapshot->readonly = readonly;
@@ -525,7 +554,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
525 *inherit = NULL; /* take responsibility to free it */ 554 *inherit = NULL; /* take responsibility to free it */
526 } 555 }
527 556
528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 557 trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
529 if (IS_ERR(trans)) { 558 if (IS_ERR(trans)) {
530 ret = PTR_ERR(trans); 559 ret = PTR_ERR(trans);
531 goto fail; 560 goto fail;
@@ -546,7 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
546 ret = btrfs_commit_transaction(trans, 575 ret = btrfs_commit_transaction(trans,
547 root->fs_info->extent_root); 576 root->fs_info->extent_root);
548 } 577 }
549 BUG_ON(ret); 578 if (ret) {
579 /* cleanup_transaction has freed this for us */
580 if (trans->aborted)
581 pending_snapshot = NULL;
582 goto fail;
583 }
550 584
551 ret = pending_snapshot->error; 585 ret = pending_snapshot->error;
552 if (ret) 586 if (ret)
@@ -614,7 +648,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
614 return -ENOENT; 648 return -ENOENT;
615 649
616 BUG_ON(victim->d_parent->d_inode != dir); 650 BUG_ON(victim->d_parent->d_inode != dir);
617 audit_inode_child(victim, dir); 651 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
618 652
619 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 653 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
620 if (error) 654 if (error)
@@ -679,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
679 if (error) 713 if (error)
680 goto out_dput; 714 goto out_dput;
681 715
716 /*
717 * Even if this name doesn't exist, we may get hash collisions.
718 * Check for them now, while we can still fail safely.
719 */
720 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
721 dir->i_ino, name,
722 namelen);
723 if (error)
724 goto out_dput;
725
682 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 726 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
683 727
684 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 728 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@ -1022,8 +1066,8 @@ again:
1022 page_start, page_end - 1, 0, &cached_state); 1066 page_start, page_end - 1, 0, &cached_state);
1023 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1067 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1024 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1068 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1025 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1069 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1026 GFP_NOFS); 1070 &cached_state, GFP_NOFS);
1027 1071
1028 if (i_done != page_cnt) { 1072 if (i_done != page_cnt) {
1029 spin_lock(&BTRFS_I(inode)->lock); 1073 spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1078,8 @@ again:
1034 } 1078 }
1035 1079
1036 1080
1037 btrfs_set_extent_delalloc(inode, page_start, page_end - 1, 1081 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
1038 &cached_state); 1082 &cached_state, GFP_NOFS);
1039 1083
1040 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1084 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1041 page_start, page_end - 1, &cached_state, 1085 page_start, page_end - 1, &cached_state,
@@ -1199,7 +1243,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1199 } 1243 }
1200 1244
1201 defrag_count += ret; 1245 defrag_count += ret;
1202 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1246 balance_dirty_pages_ratelimited(inode->i_mapping);
1203 mutex_unlock(&inode->i_mutex); 1247 mutex_unlock(&inode->i_mutex);
1204 1248
1205 if (newer_than) { 1249 if (newer_than) {
@@ -1267,12 +1311,13 @@ out_ra:
1267 return ret; 1311 return ret;
1268} 1312}
1269 1313
1270static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1314static noinline int btrfs_ioctl_resize(struct file *file,
1271 void __user *arg) 1315 void __user *arg)
1272{ 1316{
1273 u64 new_size; 1317 u64 new_size;
1274 u64 old_size; 1318 u64 old_size;
1275 u64 devid = 1; 1319 u64 devid = 1;
1320 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1276 struct btrfs_ioctl_vol_args *vol_args; 1321 struct btrfs_ioctl_vol_args *vol_args;
1277 struct btrfs_trans_handle *trans; 1322 struct btrfs_trans_handle *trans;
1278 struct btrfs_device *device = NULL; 1323 struct btrfs_device *device = NULL;
@@ -1287,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1287 if (!capable(CAP_SYS_ADMIN)) 1332 if (!capable(CAP_SYS_ADMIN))
1288 return -EPERM; 1333 return -EPERM;
1289 1334
1290 mutex_lock(&root->fs_info->volume_mutex); 1335 ret = mnt_want_write_file(file);
1291 if (root->fs_info->balance_ctl) { 1336 if (ret)
1292 printk(KERN_INFO "btrfs: balance in progress\n"); 1337 return ret;
1293 ret = -EINVAL; 1338
1294 goto out; 1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 return -EINPROGRESS;
1295 } 1343 }
1296 1344
1345 mutex_lock(&root->fs_info->volume_mutex);
1297 vol_args = memdup_user(arg, sizeof(*vol_args)); 1346 vol_args = memdup_user(arg, sizeof(*vol_args));
1298 if (IS_ERR(vol_args)) { 1347 if (IS_ERR(vol_args)) {
1299 ret = PTR_ERR(vol_args); 1348 ret = PTR_ERR(vol_args);
@@ -1313,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1362 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1314 (unsigned long long)devid); 1363 (unsigned long long)devid);
1315 } 1364 }
1316 device = btrfs_find_device(root, devid, NULL, NULL); 1365 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1317 if (!device) { 1366 if (!device) {
1318 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1367 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1319 (unsigned long long)devid); 1368 (unsigned long long)devid);
@@ -1345,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1345 } 1394 }
1346 } 1395 }
1347 1396
1397 if (device->is_tgtdev_for_dev_replace) {
1398 ret = -EINVAL;
1399 goto out_free;
1400 }
1401
1348 old_size = device->total_bytes; 1402 old_size = device->total_bytes;
1349 1403
1350 if (mod < 0) { 1404 if (mod < 0) {
@@ -1383,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1383 btrfs_commit_transaction(trans, root); 1437 btrfs_commit_transaction(trans, root);
1384 } else if (new_size < old_size) { 1438 } else if (new_size < old_size) {
1385 ret = btrfs_shrink_device(device, new_size); 1439 ret = btrfs_shrink_device(device, new_size);
1386 } 1440 } /* equal, nothing to do */
1387 1441
1388out_free: 1442out_free:
1389 kfree(vol_args); 1443 kfree(vol_args);
1390out: 1444out:
1391 mutex_unlock(&root->fs_info->volume_mutex); 1445 mutex_unlock(&root->fs_info->volume_mutex);
1446 mnt_drop_write_file(file);
1447 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1392 return ret; 1448 return ret;
1393} 1449}
1394 1450
@@ -2130,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2130 if (btrfs_root_readonly(root)) 2186 if (btrfs_root_readonly(root))
2131 return -EROFS; 2187 return -EROFS;
2132 2188
2189 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2190 1)) {
2191 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2192 return -EINPROGRESS;
2193 }
2133 ret = mnt_want_write_file(file); 2194 ret = mnt_want_write_file(file);
2134 if (ret) 2195 if (ret) {
2196 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
2197 0);
2135 return ret; 2198 return ret;
2199 }
2136 2200
2137 switch (inode->i_mode & S_IFMT) { 2201 switch (inode->i_mode & S_IFMT) {
2138 case S_IFDIR: 2202 case S_IFDIR:
@@ -2184,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2184 } 2248 }
2185out: 2249out:
2186 mnt_drop_write_file(file); 2250 mnt_drop_write_file(file);
2251 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2187 return ret; 2252 return ret;
2188} 2253}
2189 2254
@@ -2195,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2195 if (!capable(CAP_SYS_ADMIN)) 2260 if (!capable(CAP_SYS_ADMIN))
2196 return -EPERM; 2261 return -EPERM;
2197 2262
2198 mutex_lock(&root->fs_info->volume_mutex); 2263 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2199 if (root->fs_info->balance_ctl) { 2264 1)) {
2200 printk(KERN_INFO "btrfs: balance in progress\n"); 2265 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2201 ret = -EINVAL; 2266 return -EINPROGRESS;
2202 goto out;
2203 } 2267 }
2204 2268
2269 mutex_lock(&root->fs_info->volume_mutex);
2205 vol_args = memdup_user(arg, sizeof(*vol_args)); 2270 vol_args = memdup_user(arg, sizeof(*vol_args));
2206 if (IS_ERR(vol_args)) { 2271 if (IS_ERR(vol_args)) {
2207 ret = PTR_ERR(vol_args); 2272 ret = PTR_ERR(vol_args);
@@ -2214,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2214 kfree(vol_args); 2279 kfree(vol_args);
2215out: 2280out:
2216 mutex_unlock(&root->fs_info->volume_mutex); 2281 mutex_unlock(&root->fs_info->volume_mutex);
2282 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2217 return ret; 2283 return ret;
2218} 2284}
2219 2285
2220static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2286static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2221{ 2287{
2288 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2222 struct btrfs_ioctl_vol_args *vol_args; 2289 struct btrfs_ioctl_vol_args *vol_args;
2223 int ret; 2290 int ret;
2224 2291
2225 if (!capable(CAP_SYS_ADMIN)) 2292 if (!capable(CAP_SYS_ADMIN))
2226 return -EPERM; 2293 return -EPERM;
2227 2294
2228 if (root->fs_info->sb->s_flags & MS_RDONLY) 2295 ret = mnt_want_write_file(file);
2229 return -EROFS; 2296 if (ret)
2297 return ret;
2230 2298
2231 mutex_lock(&root->fs_info->volume_mutex); 2299 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2232 if (root->fs_info->balance_ctl) { 2300 1)) {
2233 printk(KERN_INFO "btrfs: balance in progress\n"); 2301 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2234 ret = -EINVAL; 2302 mnt_drop_write_file(file);
2235 goto out; 2303 return -EINPROGRESS;
2236 } 2304 }
2237 2305
2306 mutex_lock(&root->fs_info->volume_mutex);
2238 vol_args = memdup_user(arg, sizeof(*vol_args)); 2307 vol_args = memdup_user(arg, sizeof(*vol_args));
2239 if (IS_ERR(vol_args)) { 2308 if (IS_ERR(vol_args)) {
2240 ret = PTR_ERR(vol_args); 2309 ret = PTR_ERR(vol_args);
@@ -2247,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2247 kfree(vol_args); 2316 kfree(vol_args);
2248out: 2317out:
2249 mutex_unlock(&root->fs_info->volume_mutex); 2318 mutex_unlock(&root->fs_info->volume_mutex);
2319 mnt_drop_write_file(file);
2320 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2250 return ret; 2321 return ret;
2251} 2322}
2252 2323
@@ -2302,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2302 s_uuid = di_args->uuid; 2373 s_uuid = di_args->uuid;
2303 2374
2304 mutex_lock(&fs_devices->device_list_mutex); 2375 mutex_lock(&fs_devices->device_list_mutex);
2305 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2376 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2306 mutex_unlock(&fs_devices->device_list_mutex); 2377 mutex_unlock(&fs_devices->device_list_mutex);
2307 2378
2308 if (!dev) { 2379 if (!dev) {
@@ -2351,7 +2422,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2351 int ret; 2422 int ret;
2352 u64 len = olen; 2423 u64 len = olen;
2353 u64 bs = root->fs_info->sb->s_blocksize; 2424 u64 bs = root->fs_info->sb->s_blocksize;
2354 u64 hint_byte;
2355 2425
2356 /* 2426 /*
2357 * TODO: 2427 * TODO:
@@ -2456,13 +2526,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2456 another, and lock file content */ 2526 another, and lock file content */
2457 while (1) { 2527 while (1) {
2458 struct btrfs_ordered_extent *ordered; 2528 struct btrfs_ordered_extent *ordered;
2459 lock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2529 lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2460 ordered = btrfs_lookup_first_ordered_extent(src, off+len); 2530 ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
2461 if (!ordered && 2531 if (!ordered &&
2462 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, 2532 !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
2463 EXTENT_DELALLOC, 0, NULL)) 2533 EXTENT_DELALLOC, 0, NULL))
2464 break; 2534 break;
2465 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2535 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2466 if (ordered) 2536 if (ordered)
2467 btrfs_put_ordered_extent(ordered); 2537 btrfs_put_ordered_extent(ordered);
2468 btrfs_wait_ordered_range(src, off, len); 2538 btrfs_wait_ordered_range(src, off, len);
@@ -2536,7 +2606,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2536 btrfs_release_path(path); 2606 btrfs_release_path(path);
2537 2607
2538 if (key.offset + datal <= off || 2608 if (key.offset + datal <= off ||
2539 key.offset >= off+len) 2609 key.offset >= off + len - 1)
2540 goto next; 2610 goto next;
2541 2611
2542 memcpy(&new_key, &key, sizeof(new_key)); 2612 memcpy(&new_key, &key, sizeof(new_key));
@@ -2574,10 +2644,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2574 datal -= off - key.offset; 2644 datal -= off - key.offset;
2575 } 2645 }
2576 2646
2577 ret = btrfs_drop_extents(trans, inode, 2647 ret = btrfs_drop_extents(trans, root, inode,
2578 new_key.offset, 2648 new_key.offset,
2579 new_key.offset + datal, 2649 new_key.offset + datal,
2580 &hint_byte, 1); 2650 1);
2581 if (ret) { 2651 if (ret) {
2582 btrfs_abort_transaction(trans, root, 2652 btrfs_abort_transaction(trans, root,
2583 ret); 2653 ret);
@@ -2637,8 +2707,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2637 new_key.offset += skip; 2707 new_key.offset += skip;
2638 } 2708 }
2639 2709
2640 if (key.offset + datal > off+len) 2710 if (key.offset + datal > off + len)
2641 trim = key.offset + datal - (off+len); 2711 trim = key.offset + datal - (off + len);
2642 2712
2643 if (comp && (skip || trim)) { 2713 if (comp && (skip || trim)) {
2644 ret = -EINVAL; 2714 ret = -EINVAL;
@@ -2648,10 +2718,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2648 size -= skip + trim; 2718 size -= skip + trim;
2649 datal -= skip + trim; 2719 datal -= skip + trim;
2650 2720
2651 ret = btrfs_drop_extents(trans, inode, 2721 ret = btrfs_drop_extents(trans, root, inode,
2652 new_key.offset, 2722 new_key.offset,
2653 new_key.offset + datal, 2723 new_key.offset + datal,
2654 &hint_byte, 1); 2724 1);
2655 if (ret) { 2725 if (ret) {
2656 btrfs_abort_transaction(trans, root, 2726 btrfs_abort_transaction(trans, root,
2657 ret); 2727 ret);
@@ -2715,7 +2785,7 @@ next:
2715 ret = 0; 2785 ret = 0;
2716out: 2786out:
2717 btrfs_release_path(path); 2787 btrfs_release_path(path);
2718 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2788 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2719out_unlock: 2789out_unlock:
2720 mutex_unlock(&src->i_mutex); 2790 mutex_unlock(&src->i_mutex);
2721 mutex_unlock(&inode->i_mutex); 2791 mutex_unlock(&inode->i_mutex);
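
The clone hunks above switch the range end from the exclusive off + len to the inclusive off + len - 1, matching the [start, end] convention of lock_extent()/unlock_extent(); the old form locked and tested one byte of the neighbouring extent. The arithmetic, demonstrated:

#include <assert.h>
#include <stdio.h>

/* inclusive [start, end] range, the extent-lock convention */
struct range { unsigned long long start, end; };

static struct range make_range(unsigned long long off, unsigned long long len)
{
        struct range r = { off, off + len - 1 };

        return r;
}

static int overlaps(struct range a, struct range b)
{
        return a.start <= b.end && b.start <= a.end;
}

int main(void)
{
        /* cloning 4096 bytes at offset 0 locks bytes 0..4095 */
        struct range r = make_range(0, 4096);

        assert(r.end == 4095);
        /* with the old off + len endpoint this adjacent range would overlap */
        assert(!overlaps(r, make_range(4096, 4096)));
        printf("[%llu, %llu]\n", r.start, r.end);
        return 0;
}
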
@@ -2796,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2796 struct btrfs_disk_key disk_key; 2866 struct btrfs_disk_key disk_key;
2797 u64 objectid = 0; 2867 u64 objectid = 0;
2798 u64 dir_id; 2868 u64 dir_id;
2869 int ret;
2799 2870
2800 if (!capable(CAP_SYS_ADMIN)) 2871 if (!capable(CAP_SYS_ADMIN))
2801 return -EPERM; 2872 return -EPERM;
2802 2873
2803 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 ret = mnt_want_write_file(file);
2804 return -EFAULT; 2875 if (ret)
2876 return ret;
2877
2878 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2879 ret = -EFAULT;
2880 goto out;
2881 }
2805 2882
2806 if (!objectid) 2883 if (!objectid)
2807 objectid = root->root_key.objectid; 2884 objectid = root->root_key.objectid;
@@ -2811,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2811 location.offset = (u64)-1; 2888 location.offset = (u64)-1;
2812 2889
2813 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2890 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2814 if (IS_ERR(new_root)) 2891 if (IS_ERR(new_root)) {
2815 return PTR_ERR(new_root); 2892 ret = PTR_ERR(new_root);
2893 goto out;
2894 }
2816 2895
2817 if (btrfs_root_refs(&new_root->root_item) == 0) 2896 if (btrfs_root_refs(&new_root->root_item) == 0) {
2818 return -ENOENT; 2897 ret = -ENOENT;
2898 goto out;
2899 }
2819 2900
2820 path = btrfs_alloc_path(); 2901 path = btrfs_alloc_path();
2821 if (!path) 2902 if (!path) {
2822 return -ENOMEM; 2903 ret = -ENOMEM;
2904 goto out;
2905 }
2823 path->leave_spinning = 1; 2906 path->leave_spinning = 1;
2824 2907
2825 trans = btrfs_start_transaction(root, 1); 2908 trans = btrfs_start_transaction(root, 1);
2826 if (IS_ERR(trans)) { 2909 if (IS_ERR(trans)) {
2827 btrfs_free_path(path); 2910 btrfs_free_path(path);
2828 return PTR_ERR(trans); 2911 ret = PTR_ERR(trans);
2912 goto out;
2829 } 2913 }
2830 2914
2831 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2915 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2836,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 btrfs_end_transaction(trans, root); 2920 btrfs_end_transaction(trans, root);
2837 printk(KERN_ERR "Umm, you don't have the default dir item, " 2921 printk(KERN_ERR "Umm, you don't have the default dir item, "
2838 "this isn't going to work\n"); 2922 "this isn't going to work\n");
2839 return -ENOENT; 2923 ret = -ENOENT;
2924 goto out;
2840 } 2925 }
2841 2926
2842 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2927 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2846,12 +2931,13 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2846 2931
2847 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2932 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2848 btrfs_end_transaction(trans, root); 2933 btrfs_end_transaction(trans, root);
2849 2934out:
2850 return 0; 2935 mnt_drop_write_file(file);
2936 return ret;
2851} 2937}
2852 2938
2853static void get_block_group_info(struct list_head *groups_list, 2939void btrfs_get_block_group_info(struct list_head *groups_list,
2854 struct btrfs_ioctl_space_info *space) 2940 struct btrfs_ioctl_space_info *space)
2855{ 2941{
2856 struct btrfs_block_group_cache *block_group; 2942 struct btrfs_block_group_cache *block_group;
2857 2943
@@ -2959,8 +3045,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2959 down_read(&info->groups_sem); 3045 down_read(&info->groups_sem);
2960 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 3046 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2961 if (!list_empty(&info->block_groups[c])) { 3047 if (!list_empty(&info->block_groups[c])) {
2962 get_block_group_info(&info->block_groups[c], 3048 btrfs_get_block_group_info(
2963 &space); 3049 &info->block_groups[c], &space);
2964 memcpy(dest, &space, sizeof(space)); 3050 memcpy(dest, &space, sizeof(space));
2965 dest++; 3051 dest++;
2966 space_args.total_spaces++; 3052 space_args.total_spaces++;
@@ -3011,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3011 return 0; 3097 return 0;
3012} 3098}
3013 3099
3014static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3100static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3101 void __user *argp)
3015{ 3102{
3016 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3017 struct btrfs_trans_handle *trans; 3103 struct btrfs_trans_handle *trans;
3018 u64 transid; 3104 u64 transid;
3019 int ret; 3105 int ret;
3020 3106
3021 trans = btrfs_start_transaction(root, 0); 3107 trans = btrfs_attach_transaction(root);
3022 if (IS_ERR(trans)) 3108 if (IS_ERR(trans)) {
3023 return PTR_ERR(trans); 3109 if (PTR_ERR(trans) != -ENOENT)
3110 return PTR_ERR(trans);
3111
3112 /* No running transaction, don't bother */
3113 transid = root->fs_info->last_trans_committed;
3114 goto out;
3115 }
3024 transid = trans->transid; 3116 transid = trans->transid;
3025 ret = btrfs_commit_transaction_async(trans, root, 0); 3117 ret = btrfs_commit_transaction_async(trans, root, 0);
3026 if (ret) { 3118 if (ret) {
3027 btrfs_end_transaction(trans, root); 3119 btrfs_end_transaction(trans, root);
3028 return ret; 3120 return ret;
3029 } 3121 }
3030 3122out:
3031 if (argp) 3123 if (argp)
3032 if (copy_to_user(argp, &transid, sizeof(transid))) 3124 if (copy_to_user(argp, &transid, sizeof(transid)))
3033 return -EFAULT; 3125 return -EFAULT;
3034 return 0; 3126 return 0;
3035} 3127}
3036 3128
3037static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3129static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3130 void __user *argp)
3038{ 3131{
3039 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3040 u64 transid; 3132 u64 transid;
3041 3133
3042 if (argp) { 3134 if (argp) {
@@ -3048,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3048 return btrfs_wait_for_commit(root, transid); 3140 return btrfs_wait_for_commit(root, transid);
3049} 3141}
3050 3142
3051static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3143static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3052{ 3144{
3053 int ret; 3145 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3054 struct btrfs_ioctl_scrub_args *sa; 3146 struct btrfs_ioctl_scrub_args *sa;
3147 int ret;
3055 3148
3056 if (!capable(CAP_SYS_ADMIN)) 3149 if (!capable(CAP_SYS_ADMIN))
3057 return -EPERM; 3150 return -EPERM;
@@ -3060,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3060 if (IS_ERR(sa)) 3153 if (IS_ERR(sa))
3061 return PTR_ERR(sa); 3154 return PTR_ERR(sa);
3062 3155
3063 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3156 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3064 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3157 ret = mnt_want_write_file(file);
3158 if (ret)
3159 goto out;
3160 }
3161
3162 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3163 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3164 0);
3065 3165
3066 if (copy_to_user(arg, sa, sizeof(*sa))) 3166 if (copy_to_user(arg, sa, sizeof(*sa)))
3067 ret = -EFAULT; 3167 ret = -EFAULT;
3068 3168
3169 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3170 mnt_drop_write_file(file);
3171out:
3069 kfree(sa); 3172 kfree(sa);
3070 return ret; 3173 return ret;
3071} 3174}
@@ -3075,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3075 if (!capable(CAP_SYS_ADMIN)) 3178 if (!capable(CAP_SYS_ADMIN))
3076 return -EPERM; 3179 return -EPERM;
3077 3180
3078 return btrfs_scrub_cancel(root); 3181 return btrfs_scrub_cancel(root->fs_info);
3079} 3182}
3080 3183
3081static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3184static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3124,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3124 return ret; 3227 return ret;
3125} 3228}
3126 3229
3230static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3231{
3232 struct btrfs_ioctl_dev_replace_args *p;
3233 int ret;
3234
3235 if (!capable(CAP_SYS_ADMIN))
3236 return -EPERM;
3237
3238 p = memdup_user(arg, sizeof(*p));
3239 if (IS_ERR(p))
3240 return PTR_ERR(p);
3241
3242 switch (p->cmd) {
3243 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3244 if (atomic_xchg(
3245 &root->fs_info->mutually_exclusive_operation_running,
3246 1)) {
3247 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3248 ret = -EINPROGRESS;
3249 } else {
3250 ret = btrfs_dev_replace_start(root, p);
3251 atomic_set(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 0);
3254 }
3255 break;
3256 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3257 btrfs_dev_replace_status(root->fs_info, p);
3258 ret = 0;
3259 break;
3260 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3261 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3262 break;
3263 default:
3264 ret = -EINVAL;
3265 break;
3266 }
3267
3268 if (copy_to_user(arg, p, sizeof(*p)))
3269 ret = -EFAULT;
3270
3271 kfree(p);
3272 return ret;
3273}
3274
3127static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3275static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3128{ 3276{
3129 int ret = 0; 3277 int ret = 0;
@@ -3208,11 +3356,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3208{ 3356{
3209 int ret = 0; 3357 int ret = 0;
3210 int size; 3358 int size;
3211 u64 extent_item_pos;
3212 struct btrfs_ioctl_logical_ino_args *loi; 3359 struct btrfs_ioctl_logical_ino_args *loi;
3213 struct btrfs_data_container *inodes = NULL; 3360 struct btrfs_data_container *inodes = NULL;
3214 struct btrfs_path *path = NULL; 3361 struct btrfs_path *path = NULL;
3215 struct btrfs_key key;
3216 3362
3217 if (!capable(CAP_SYS_ADMIN)) 3363 if (!capable(CAP_SYS_ADMIN))
3218 return -EPERM; 3364 return -EPERM;
@@ -3230,7 +3376,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3230 goto out; 3376 goto out;
3231 } 3377 }
3232 3378
3233 size = min_t(u32, loi->size, 4096); 3379 size = min_t(u32, loi->size, 64 * 1024);
3234 inodes = init_data_container(size); 3380 inodes = init_data_container(size);
3235 if (IS_ERR(inodes)) { 3381 if (IS_ERR(inodes)) {
3236 ret = PTR_ERR(inodes); 3382 ret = PTR_ERR(inodes);
@@ -3238,22 +3384,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3238 goto out; 3384 goto out;
3239 } 3385 }
3240 3386
3241 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3387 ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
3242 btrfs_release_path(path); 3388 build_ino_list, inodes);
3243 3389 if (ret == -EINVAL)
3244 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3245 ret = -ENOENT; 3390 ret = -ENOENT;
3246 if (ret < 0) 3391 if (ret < 0)
3247 goto out; 3392 goto out;
3248 3393
3249 extent_item_pos = loi->logical - key.objectid;
3250 ret = iterate_extent_inodes(root->fs_info, key.objectid,
3251 extent_item_pos, 0, build_ino_list,
3252 inodes);
3253
3254 if (ret < 0)
3255 goto out;
3256
3257 ret = copy_to_user((void *)(unsigned long)loi->inodes, 3394 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3258 (void *)(unsigned long)inodes, size); 3395 (void *)(unsigned long)inodes, size);
3259 if (ret) 3396 if (ret)
@@ -3261,7 +3398,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3261 3398
3262out: 3399out:
3263 btrfs_free_path(path); 3400 btrfs_free_path(path);
3264 kfree(inodes); 3401 vfree(inodes);
3265 kfree(loi); 3402 kfree(loi);
3266 3403
3267 return ret; 3404 return ret;
@@ -3301,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3301 struct btrfs_ioctl_balance_args *bargs; 3438 struct btrfs_ioctl_balance_args *bargs;
3302 struct btrfs_balance_control *bctl; 3439 struct btrfs_balance_control *bctl;
3303 int ret; 3440 int ret;
3441 int need_to_clear_lock = 0;
3304 3442
3305 if (!capable(CAP_SYS_ADMIN)) 3443 if (!capable(CAP_SYS_ADMIN))
3306 return -EPERM; 3444 return -EPERM;
@@ -3336,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3336 bargs = NULL; 3474 bargs = NULL;
3337 } 3475 }
3338 3476
3339 if (fs_info->balance_ctl) { 3477 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
3478 1)) {
3479 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3340 ret = -EINPROGRESS; 3480 ret = -EINPROGRESS;
3341 goto out_bargs; 3481 goto out_bargs;
3342 } 3482 }
3483 need_to_clear_lock = 1;
3343 3484
3344 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3485 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3345 if (!bctl) { 3486 if (!bctl) {
@@ -3373,6 +3514,9 @@ do_balance:
3373out_bargs: 3514out_bargs:
3374 kfree(bargs); 3515 kfree(bargs);
3375out: 3516out:
3517 if (need_to_clear_lock)
3518 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
3519 0);
3376 mutex_unlock(&fs_info->balance_mutex); 3520 mutex_unlock(&fs_info->balance_mutex);
3377 mutex_unlock(&fs_info->volume_mutex); 3521 mutex_unlock(&fs_info->volume_mutex);
3378 mnt_drop_write_file(file); 3522 mnt_drop_write_file(file);
@@ -3427,8 +3571,9 @@ out:
3427 return ret; 3571 return ret;
3428} 3572}
3429 3573
3430static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3574static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3431{ 3575{
3576 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3432 struct btrfs_ioctl_quota_ctl_args *sa; 3577 struct btrfs_ioctl_quota_ctl_args *sa;
3433 struct btrfs_trans_handle *trans = NULL; 3578 struct btrfs_trans_handle *trans = NULL;
3434 int ret; 3579 int ret;
@@ -3437,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3437 if (!capable(CAP_SYS_ADMIN)) 3582 if (!capable(CAP_SYS_ADMIN))
3438 return -EPERM; 3583 return -EPERM;
3439 3584
3440 if (root->fs_info->sb->s_flags & MS_RDONLY) 3585 ret = mnt_want_write_file(file);
3441 return -EROFS; 3586 if (ret)
3587 return ret;
3442 3588
3443 sa = memdup_user(arg, sizeof(*sa)); 3589 sa = memdup_user(arg, sizeof(*sa));
3444 if (IS_ERR(sa)) 3590 if (IS_ERR(sa)) {
3445 return PTR_ERR(sa); 3591 ret = PTR_ERR(sa);
3592 goto drop_write;
3593 }
3446 3594
3447 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3595 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3448 trans = btrfs_start_transaction(root, 2); 3596 trans = btrfs_start_transaction(root, 2);
@@ -3475,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3475 if (err && !ret) 3623 if (err && !ret)
3476 ret = err; 3624 ret = err;
3477 } 3625 }
3478
3479out: 3626out:
3480 kfree(sa); 3627 kfree(sa);
3628drop_write:
3629 mnt_drop_write_file(file);
3481 return ret; 3630 return ret;
3482} 3631}
3483 3632
3484static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3633static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3485{ 3634{
3635 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3486 struct btrfs_ioctl_qgroup_assign_args *sa; 3636 struct btrfs_ioctl_qgroup_assign_args *sa;
3487 struct btrfs_trans_handle *trans; 3637 struct btrfs_trans_handle *trans;
3488 int ret; 3638 int ret;
@@ -3491,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3491 if (!capable(CAP_SYS_ADMIN)) 3641 if (!capable(CAP_SYS_ADMIN))
3492 return -EPERM; 3642 return -EPERM;
3493 3643
3494 if (root->fs_info->sb->s_flags & MS_RDONLY) 3644 ret = mnt_want_write_file(file);
3495 return -EROFS; 3645 if (ret)
3646 return ret;
3496 3647
3497 sa = memdup_user(arg, sizeof(*sa)); 3648 sa = memdup_user(arg, sizeof(*sa));
3498 if (IS_ERR(sa)) 3649 if (IS_ERR(sa)) {
3499 return PTR_ERR(sa); 3650 ret = PTR_ERR(sa);
3651 goto drop_write;
3652 }
3500 3653
3501 trans = btrfs_join_transaction(root); 3654 trans = btrfs_join_transaction(root);
3502 if (IS_ERR(trans)) { 3655 if (IS_ERR(trans)) {
@@ -3519,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3519 3672
3520out: 3673out:
3521 kfree(sa); 3674 kfree(sa);
3675drop_write:
3676 mnt_drop_write_file(file);
3522 return ret; 3677 return ret;
3523} 3678}
3524 3679
3525static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3680static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3526{ 3681{
3682 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3527 struct btrfs_ioctl_qgroup_create_args *sa; 3683 struct btrfs_ioctl_qgroup_create_args *sa;
3528 struct btrfs_trans_handle *trans; 3684 struct btrfs_trans_handle *trans;
3529 int ret; 3685 int ret;
@@ -3532,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3532 if (!capable(CAP_SYS_ADMIN)) 3688 if (!capable(CAP_SYS_ADMIN))
3533 return -EPERM; 3689 return -EPERM;
3534 3690
3535 if (root->fs_info->sb->s_flags & MS_RDONLY) 3691 ret = mnt_want_write_file(file);
3536 return -EROFS; 3692 if (ret)
3693 return ret;
3537 3694
3538 sa = memdup_user(arg, sizeof(*sa)); 3695 sa = memdup_user(arg, sizeof(*sa));
3539 if (IS_ERR(sa)) 3696 if (IS_ERR(sa)) {
3540 return PTR_ERR(sa); 3697 ret = PTR_ERR(sa);
3698 goto drop_write;
3699 }
3541 3700
3542 trans = btrfs_join_transaction(root); 3701 trans = btrfs_join_transaction(root);
3543 if (IS_ERR(trans)) { 3702 if (IS_ERR(trans)) {
@@ -3559,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3559 3718
3560out: 3719out:
3561 kfree(sa); 3720 kfree(sa);
3721drop_write:
3722 mnt_drop_write_file(file);
3562 return ret; 3723 return ret;
3563} 3724}
3564 3725
3565static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3726static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3566{ 3727{
3728 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3567 struct btrfs_ioctl_qgroup_limit_args *sa; 3729 struct btrfs_ioctl_qgroup_limit_args *sa;
3568 struct btrfs_trans_handle *trans; 3730 struct btrfs_trans_handle *trans;
3569 int ret; 3731 int ret;
@@ -3573,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3573 if (!capable(CAP_SYS_ADMIN)) 3735 if (!capable(CAP_SYS_ADMIN))
3574 return -EPERM; 3736 return -EPERM;
3575 3737
3576 if (root->fs_info->sb->s_flags & MS_RDONLY) 3738 ret = mnt_want_write_file(file);
3577 return -EROFS; 3739 if (ret)
3740 return ret;
3578 3741
3579 sa = memdup_user(arg, sizeof(*sa)); 3742 sa = memdup_user(arg, sizeof(*sa));
3580 if (IS_ERR(sa)) 3743 if (IS_ERR(sa)) {
3581 return PTR_ERR(sa); 3744 ret = PTR_ERR(sa);
3745 goto drop_write;
3746 }
3582 3747
3583 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3584 if (IS_ERR(trans)) { 3749 if (IS_ERR(trans)) {
@@ -3601,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3601 3766
3602out: 3767out:
3603 kfree(sa); 3768 kfree(sa);
3769drop_write:
3770 mnt_drop_write_file(file);
3604 return ret; 3771 return ret;
3605} 3772}
3606 3773
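The four conversions above (quota_ctl and the three qgroup ioctls) all trade the bare MS_RDONLY flag test for mnt_want_write_file()/mnt_drop_write_file(), which both rejects read-only mounts with -EROFS and pins write access on the mount for the whole operation, so a concurrent remount to read-only cannot slip in between the check and the transaction. The common shape they now share, as a hedged sketch (do_qgroup_work() is hypothetical):

    static long example_write_ioctl(struct file *file, void __user *arg)
    {
            int ret;

            ret = mnt_want_write_file(file);   /* -EROFS on read-only mounts */
            if (ret)
                    return ret;

            ret = do_qgroup_work(file, arg);   /* hypothetical body */

            mnt_drop_write_file(file);         /* always paired with the _want_ call */
            return ret;
    }
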
@@ -3721,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3721 case BTRFS_IOC_DEFRAG_RANGE: 3888 case BTRFS_IOC_DEFRAG_RANGE:
3722 return btrfs_ioctl_defrag(file, argp); 3889 return btrfs_ioctl_defrag(file, argp);
3723 case BTRFS_IOC_RESIZE: 3890 case BTRFS_IOC_RESIZE:
3724 return btrfs_ioctl_resize(root, argp); 3891 return btrfs_ioctl_resize(file, argp);
3725 case BTRFS_IOC_ADD_DEV: 3892 case BTRFS_IOC_ADD_DEV:
3726 return btrfs_ioctl_add_dev(root, argp); 3893 return btrfs_ioctl_add_dev(root, argp);
3727 case BTRFS_IOC_RM_DEV: 3894 case BTRFS_IOC_RM_DEV:
3728 return btrfs_ioctl_rm_dev(root, argp); 3895 return btrfs_ioctl_rm_dev(file, argp);
3729 case BTRFS_IOC_FS_INFO: 3896 case BTRFS_IOC_FS_INFO:
3730 return btrfs_ioctl_fs_info(root, argp); 3897 return btrfs_ioctl_fs_info(root, argp);
3731 case BTRFS_IOC_DEV_INFO: 3898 case BTRFS_IOC_DEV_INFO:
@@ -3754,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3754 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3921 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3755 return 0; 3922 return 0;
3756 case BTRFS_IOC_START_SYNC: 3923 case BTRFS_IOC_START_SYNC:
3757 return btrfs_ioctl_start_sync(file, argp); 3924 return btrfs_ioctl_start_sync(root, argp);
3758 case BTRFS_IOC_WAIT_SYNC: 3925 case BTRFS_IOC_WAIT_SYNC:
3759 return btrfs_ioctl_wait_sync(file, argp); 3926 return btrfs_ioctl_wait_sync(root, argp);
3760 case BTRFS_IOC_SCRUB: 3927 case BTRFS_IOC_SCRUB:
3761 return btrfs_ioctl_scrub(root, argp); 3928 return btrfs_ioctl_scrub(file, argp);
3762 case BTRFS_IOC_SCRUB_CANCEL: 3929 case BTRFS_IOC_SCRUB_CANCEL:
3763 return btrfs_ioctl_scrub_cancel(root, argp); 3930 return btrfs_ioctl_scrub_cancel(root, argp);
3764 case BTRFS_IOC_SCRUB_PROGRESS: 3931 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3776,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3776 case BTRFS_IOC_GET_DEV_STATS: 3943 case BTRFS_IOC_GET_DEV_STATS:
3777 return btrfs_ioctl_get_dev_stats(root, argp); 3944 return btrfs_ioctl_get_dev_stats(root, argp);
3778 case BTRFS_IOC_QUOTA_CTL: 3945 case BTRFS_IOC_QUOTA_CTL:
3779 return btrfs_ioctl_quota_ctl(root, argp); 3946 return btrfs_ioctl_quota_ctl(file, argp);
3780 case BTRFS_IOC_QGROUP_ASSIGN: 3947 case BTRFS_IOC_QGROUP_ASSIGN:
3781 return btrfs_ioctl_qgroup_assign(root, argp); 3948 return btrfs_ioctl_qgroup_assign(file, argp);
3782 case BTRFS_IOC_QGROUP_CREATE: 3949 case BTRFS_IOC_QGROUP_CREATE:
3783 return btrfs_ioctl_qgroup_create(root, argp); 3950 return btrfs_ioctl_qgroup_create(file, argp);
3784 case BTRFS_IOC_QGROUP_LIMIT: 3951 case BTRFS_IOC_QGROUP_LIMIT:
3785 return btrfs_ioctl_qgroup_limit(root, argp); 3952 return btrfs_ioctl_qgroup_limit(file, argp);
3953 case BTRFS_IOC_DEV_REPLACE:
3954 return btrfs_ioctl_dev_replace(root, argp);
3786 } 3955 }
3787 3956
3788 return -ENOTTY; 3957 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
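For illustration, a user-space status query against the new interface could look roughly like this (a sketch built only from the definitions above; error handling trimmed):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ioctl.h"              /* the btrfs ioctl definitions above */

    static int print_replace_status(const char *mntpoint)
    {
            struct btrfs_ioctl_dev_replace_args args;
            int fd = open(mntpoint, O_RDONLY);

            if (fd < 0)
                    return -1;
            memset(&args, 0, sizeof(args));
            args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
            if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0) {
                    close(fd);
                    return -1;
            }
            printf("state %llu, progress %llu/1000\n",
                   (unsigned long long)args.status.replace_state,
                   (unsigned long long)args.status.progress_1000);
            close(fd);
            return 0;
    }
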
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
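Both helpers scale a u64 by a small ratio via do_div(), which is required for 64-bit division on 32-bit targets: div_factor() works in tenths and div_factor_fine() in hundredths, so for num = 1000, div_factor(num, 8) yields 800 and div_factor_fine(num, 85) yields 850. A caller might use them like this (illustrative only):

    #include <linux/types.h>
    #include "math.h"

    /* compute 85% and 90% thresholds of a byte count */
    static inline u64 example_thresholds(u64 total, u64 *soft)
    {
            *soft = div_factor(total, 9);           /* 9/10 of total */
            return div_factor_fine(total, 85);      /* 85/100 of total */
    }
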
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551dd..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27 27
28static struct kmem_cache *btrfs_ordered_extent_cache;
29
28static u64 entry_end(struct btrfs_ordered_extent *entry) 30static u64 entry_end(struct btrfs_ordered_extent *entry)
29{ 31{
30 if (entry->file_offset + entry->len < entry->file_offset) 32 if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
187 struct btrfs_ordered_extent *entry; 189 struct btrfs_ordered_extent *entry;
188 190
189 tree = &BTRFS_I(inode)->ordered_tree; 191 tree = &BTRFS_I(inode)->ordered_tree;
190 entry = kzalloc(sizeof(*entry), GFP_NOFS); 192 entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
191 if (!entry) 193 if (!entry)
192 return -ENOMEM; 194 return -ENOMEM;
193 195
@@ -209,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
209 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
210 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
211 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
212 216
213 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
214 218
@@ -421,7 +425,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
421 list_del(&sum->list); 425 list_del(&sum->list);
422 kfree(sum); 426 kfree(sum);
423 } 427 }
424 kfree(entry); 428 kmem_cache_free(btrfs_ordered_extent_cache, entry);
425 } 429 }
426} 430}
427 431
@@ -462,19 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
462 wake_up(&entry->wait); 466 wake_up(&entry->wait);
463} 467}
464 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
465/* 478/*
466 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
467 * space between drives. 480 * space between drives.
468 */ 481 */
469void btrfs_wait_ordered_extents(struct btrfs_root *root, 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
470 int nocow_only, int delay_iput)
471{ 483{
472 struct list_head splice; 484 struct list_head splice, works;
473 struct list_head *cur; 485 struct list_head *cur;
474 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
475 struct inode *inode; 487 struct inode *inode;
476 488
477 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
478 491
479 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
480 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -482,15 +495,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
482 cur = splice.next; 495 cur = splice.next;
483 ordered = list_entry(cur, struct btrfs_ordered_extent, 496 ordered = list_entry(cur, struct btrfs_ordered_extent,
484 root_extent_list); 497 root_extent_list);
485 if (nocow_only &&
486 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
487 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
488 list_move(&ordered->root_extent_list,
489 &root->fs_info->ordered_extents);
490 cond_resched_lock(&root->fs_info->ordered_extent_lock);
491 continue;
492 }
493
494 list_del_init(&ordered->root_extent_list); 498 list_del_init(&ordered->root_extent_list);
495 atomic_inc(&ordered->refs); 499 atomic_inc(&ordered->refs);
496 500
@@ -502,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
502 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
503 507
504 if (inode) { 508 if (inode) {
505 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
506 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
507 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
508 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
509 else
510 iput(inode);
511 } else { 513 } else {
512 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
513 } 515 }
514 516
517 cond_resched();
515 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
516 } 519 }
517 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
518} 535}
519 536
520/* 537/*
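The rework of btrfs_wait_ordered_extents() above splits start-and-wait into two phases: every ordered extent is first queued as a flush_work on the flush_workers pool, then the caller walks its private works list and sleeps in wait_for_completion(), so the flushes proceed in parallel instead of serializing one btrfs_start_ordered_extent() call at a time. The same dispatch-then-wait shape in miniature (a sketch with a hypothetical item type, not the btrfs code):

    #include <linux/completion.h>
    #include <linux/list.h>
    #include "async-thread.h"       /* struct btrfs_work, btrfs_queue_worker() */

    struct item {
            struct btrfs_work work;
            struct completion done;         /* init_completion() done by the producer */
            struct list_head list;
    };

    static void item_worker(struct btrfs_work *work)
    {
            struct item *it = container_of(work, struct item, work);

            /* ... the slow part runs on a pool thread ... */
            complete(&it->done);
    }

    static void run_all(struct btrfs_workers *pool, struct list_head *items)
    {
            struct item *it, *next;

            list_for_each_entry(it, items, list) {          /* phase 1: dispatch */
                    it->work.func = item_worker;
                    btrfs_queue_worker(pool, &it->work);
            }
            list_for_each_entry_safe(it, next, items, list) {   /* phase 2: wait */
                    list_del_init(&it->list);
                    wait_for_completion(&it->done);
            }
    }
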
@@ -527,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
527 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
528 * before we return 545 * before we return
529 */ 546 */
530void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
531{ 548{
532 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
533 struct inode *inode; 550 struct inode *inode;
534 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
535 555
536 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
537 558
538 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
539 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -541,6 +562,7 @@ again:
541 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
542 563
543 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
544 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
545 ordered_operations); 567 ordered_operations);
546 568
@@ -557,15 +579,26 @@ again:
557 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
558 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
559 } 581 }
582
583 if (!inode)
584 continue;
560 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
561 586
562 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
563 if (wait) 588 if (!work) {
564 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
565 else 590 list_add_tail(&btrfs_inode->ordered_operations,
566 filemap_flush(inode->i_mapping); 591 &splice);
567 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
568 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
569 602
570 cond_resched(); 603 cond_resched();
571 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -574,7 +607,13 @@ again:
574 goto again; 607 goto again;
575 608
576 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
577 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
578} 617}
579 618
580/* 619/*
@@ -614,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
614 u64 end; 653 u64 end;
615 u64 orig_end; 654 u64 orig_end;
616 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
617 int found;
618 656
619 if (start + len < start) { 657 if (start + len < start) {
620 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -650,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
650 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
651 689
652 end = orig_end; 690 end = orig_end;
653 found = 0;
654 while (1) { 691 while (1) {
655 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
656 if (!ordered) 693 if (!ordered)
@@ -663,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
663 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
664 break; 701 break;
665 } 702 }
666 found++;
667 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
668 end = ordered->file_offset; 704 end = ordered->file_offset;
669 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -775,7 +811,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
775 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 811 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
776 u64 disk_i_size; 812 u64 disk_i_size;
777 u64 new_i_size; 813 u64 new_i_size;
778 u64 i_size_test;
779 u64 i_size = i_size_read(inode); 814 u64 i_size = i_size_read(inode);
780 struct rb_node *node; 815 struct rb_node *node;
781 struct rb_node *prev = NULL; 816 struct rb_node *prev = NULL;
@@ -835,55 +870,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
835 break; 870 break;
836 if (test->file_offset >= i_size) 871 if (test->file_offset >= i_size)
837 break; 872 break;
838 if (test->file_offset >= disk_i_size) 873 if (test->file_offset >= disk_i_size) {
874 /*
875 * we don't update disk_i_size now, so record this
876 * undealt i_size. Or we will not know the real
877 * i_size.
878 */
879 if (test->outstanding_isize < offset)
880 test->outstanding_isize = offset;
881 if (ordered &&
882 ordered->outstanding_isize >
883 test->outstanding_isize)
884 test->outstanding_isize =
885 ordered->outstanding_isize;
839 goto out; 886 goto out;
840 }
841 new_i_size = min_t(u64, offset, i_size);
842
843 /*
844 * at this point, we know we can safely update i_size to at least
845 * the offset from this ordered extent. But, we need to
846 * walk forward and see if ios from higher up in the file have
847 * finished.
848 */
849 if (ordered) {
850 node = rb_next(&ordered->rb_node);
851 } else {
852 if (prev)
853 node = rb_next(prev);
854 else
855 node = rb_first(&tree->tree);
856 }
857
858 /*
859 * We are looking for an area between our current extent and the next
860 * ordered extent to update the i_size to. There are 3 cases here
861 *
862 * 1) We don't actually have anything and we can update to i_size.
863 * 2) We have stuff but they already did their i_size update so again we
864 * can just update to i_size.
865 * 3) We have an outstanding ordered extent so the most we can update
866 * our disk_i_size to is the start of the next offset.
867 */
868 i_size_test = i_size;
869 for (; node; node = rb_next(node)) {
870 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
871
872 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
873 continue;
874 if (test->file_offset > offset) {
875 i_size_test = test->file_offset;
876 break;
877 } 887 }
878 } 888 }
889 new_i_size = min_t(u64, offset, i_size);
879 890
880 /* 891 /*
 881 * i_size_test is the end of a region after this ordered 892 * Some ordered extents may have completed before the current one, and
882 * extent where there are no ordered extents, we can safely set 893 * we hold the real i_size in ->outstanding_isize.
883 * disk_i_size to this.
884 */ 894 */
885 if (i_size_test > offset) 895 if (ordered && ordered->outstanding_isize > new_i_size)
886 new_i_size = min_t(u64, i_size_test, i_size); 896 new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
887 BTRFS_I(inode)->disk_i_size = new_i_size; 897 BTRFS_I(inode)->disk_i_size = new_i_size;
888 ret = 0; 898 ret = 0;
889out: 899out:
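To see why outstanding_isize is needed: suppose extents A = [0, 4K) and B = [4K, 8K) are both in flight and B completes first. B finds A still pending with A->file_offset >= disk_i_size, records its own end (8K) in A->outstanding_isize, and bails out without touching disk_i_size. When A later completes, the tail of the function takes min(offset, i_size) and then raises it to min(outstanding_isize, i_size), so disk_i_size jumps straight from 0 to 8K. A standalone model of that final computation (illustrative only):

    static u64 model_new_disk_i_size(u64 offset, u64 outstanding_isize, u64 i_size)
    {
            u64 new_i_size = offset < i_size ? offset : i_size;

            if (outstanding_isize > new_i_size)
                    new_i_size = outstanding_isize < i_size ?
                                 outstanding_isize : i_size;
            return new_i_size;      /* 8K in the A/B example above */
    }
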
@@ -968,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
968 if (last_mod < root->fs_info->last_trans_committed) 978 if (last_mod < root->fs_info->last_trans_committed)
969 return; 979 return;
970 980
971 /*
972 * the transaction is already committing. Just start the IO and
973 * don't bother with all of this list nonsense
974 */
975 if (trans && root->fs_info->running_transaction->blocked) {
976 btrfs_wait_ordered_range(inode, 0, (u64)-1);
977 return;
978 }
979
980 spin_lock(&root->fs_info->ordered_extent_lock); 981 spin_lock(&root->fs_info->ordered_extent_lock);
981 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
982 list_add_tail(&BTRFS_I(inode)->ordered_operations, 983 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -984,3 +985,21 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
984 } 985 }
985 spin_unlock(&root->fs_info->ordered_extent_lock); 986 spin_unlock(&root->fs_info->ordered_extent_lock);
986} 987}
988
989int __init ordered_data_init(void)
990{
991 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
992 sizeof(struct btrfs_ordered_extent), 0,
993 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
994 NULL);
995 if (!btrfs_ordered_extent_cache)
996 return -ENOMEM;
997
998 return 0;
999}
1000
1001void ordered_data_exit(void)
1002{
1003 if (btrfs_ordered_extent_cache)
1004 kmem_cache_destroy(btrfs_ordered_extent_cache);
1005}
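Switching btrfs_ordered_extent allocations from kzalloc() to a dedicated kmem_cache lets the slab allocator pack the fixed-size objects tightly, and SLAB_RECLAIM_ACCOUNT marks them as reclaimable for memory accounting; ordered_data_init() and ordered_data_exit() are presumably wired into the filesystem module's init and exit paths alongside the other btrfs caches (super.c is touched elsewhere in this series).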
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d2997..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78 78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82 82
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 96 /* number of bytes that still need writing */
97 u64 bytes_left; 97 u64 bytes_left;
98 98
99 /*
 100 * the end of an ordered extent that is behind this one
 101 * but didn't update disk_i_size; see the comment in
 102 * btrfs_ordered_update_i_size().
103 */
104 u64 outstanding_isize;
105
99 /* flags (described above) */ 106 /* flags (described above) */
100 unsigned long flags; 107 unsigned long flags;
101 108
@@ -121,8 +128,11 @@ struct btrfs_ordered_extent {
121 struct list_head root_extent_list; 128 struct list_head root_extent_list;
122 129
123 struct btrfs_work work; 130 struct btrfs_work work;
124};
125 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
126 136
127/* 137/*
128 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -179,10 +189,11 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
179int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
180 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
181int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
182void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
183void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root, 194 struct btrfs_root *root,
185 struct inode *inode); 195 struct inode *inode);
186void btrfs_wait_ordered_extents(struct btrfs_root *root, 196void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
187 int nocow_only, int delay_iput); 197int __init ordered_data_init(void);
198void ordered_data_exit(void);
188#endif 199#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b65015581744..fe9d02c45f8e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -790,8 +790,10 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
790 } 790 }
791 791
792 path = btrfs_alloc_path(); 792 path = btrfs_alloc_path();
793 if (!path) 793 if (!path) {
794 return -ENOMEM; 794 ret = -ENOMEM;
795 goto out_free_root;
796 }
795 797
796 key.objectid = 0; 798 key.objectid = 0;
797 key.type = BTRFS_QGROUP_STATUS_KEY; 799 key.type = BTRFS_QGROUP_STATUS_KEY;
@@ -800,7 +802,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
800 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 802 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
801 sizeof(*ptr)); 803 sizeof(*ptr));
802 if (ret) 804 if (ret)
803 goto out; 805 goto out_free_path;
804 806
805 leaf = path->nodes[0]; 807 leaf = path->nodes[0];
806 ptr = btrfs_item_ptr(leaf, path->slots[0], 808 ptr = btrfs_item_ptr(leaf, path->slots[0],
@@ -818,8 +820,15 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
818 fs_info->quota_root = quota_root; 820 fs_info->quota_root = quota_root;
819 fs_info->pending_quota_state = 1; 821 fs_info->pending_quota_state = 1;
820 spin_unlock(&fs_info->qgroup_lock); 822 spin_unlock(&fs_info->qgroup_lock);
821out: 823out_free_path:
822 btrfs_free_path(path); 824 btrfs_free_path(path);
825out_free_root:
826 if (ret) {
827 free_extent_buffer(quota_root->node);
828 free_extent_buffer(quota_root->commit_root);
829 kfree(quota_root);
830 }
831out:
823 return ret; 832 return ret;
824} 833}
825 834
@@ -1145,12 +1154,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1145 1154
1146 ulist_reinit(tmp); 1155 ulist_reinit(tmp);
1147 /* XXX id not needed */ 1156 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1157 ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter); 1158 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1159 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist; 1160 struct btrfs_qgroup_list *glist;
1152 1161
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1162 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1154 if (qg->refcnt < seq) 1163 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1; 1164 qg->refcnt = seq + 1;
1156 else 1165 else
@@ -1158,7 +1167,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1158 1167
1159 list_for_each_entry(glist, &qg->groups, next_group) { 1168 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid, 1169 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group, 1170 (u64)(uintptr_t)glist->group,
1162 GFP_ATOMIC); 1171 GFP_ATOMIC);
1163 } 1172 }
1164 } 1173 }
@@ -1168,13 +1177,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1168 * step 2: walk from the new root 1177 * step 2: walk from the new root
1169 */ 1178 */
1170 ulist_reinit(tmp); 1179 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1180 ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter); 1181 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) { 1182 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg; 1183 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist; 1184 struct btrfs_qgroup_list *glist;
1176 1185
1177 qg = (struct btrfs_qgroup *)unode->aux; 1186 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1178 if (qg->refcnt < seq) { 1187 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */ 1188 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes; 1189 qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1199,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1190 1199
1191 list_for_each_entry(glist, &qg->groups, next_group) { 1200 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid, 1201 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC); 1202 (uintptr_t)glist->group, GFP_ATOMIC);
1194 } 1203 }
1195 } 1204 }
1196 1205
@@ -1208,12 +1217,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1208 continue; 1217 continue;
1209 1218
1210 ulist_reinit(tmp); 1219 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1220 ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter); 1221 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1222 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist; 1223 struct btrfs_qgroup_list *glist;
1215 1224
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1225 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1217 if (qg->tag == seq) 1226 if (qg->tag == seq)
1218 continue; 1227 continue;
1219 1228
@@ -1225,7 +1234,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1225 1234
1226 list_for_each_entry(glist, &qg->groups, next_group) { 1235 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid, 1236 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group, 1237 (uintptr_t)glist->group,
1229 GFP_ATOMIC); 1238 GFP_ATOMIC);
1230 } 1239 }
1231 } 1240 }
@@ -1469,13 +1478,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1469 * be exceeded 1478 * be exceeded
1470 */ 1479 */
1471 ulist = ulist_alloc(GFP_ATOMIC); 1480 ulist = ulist_alloc(GFP_ATOMIC);
1472 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1481 if (!ulist) {
1482 ret = -ENOMEM;
1483 goto out;
1484 }
1485 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1473 ULIST_ITER_INIT(&uiter); 1486 ULIST_ITER_INIT(&uiter);
1474 while ((unode = ulist_next(ulist, &uiter))) { 1487 while ((unode = ulist_next(ulist, &uiter))) {
1475 struct btrfs_qgroup *qg; 1488 struct btrfs_qgroup *qg;
1476 struct btrfs_qgroup_list *glist; 1489 struct btrfs_qgroup_list *glist;
1477 1490
1478 qg = (struct btrfs_qgroup *)unode->aux; 1491 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1479 1492
1480 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 1493 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1481 qg->reserved + qg->rfer + num_bytes > 1494 qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1502,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1489 1502
1490 list_for_each_entry(glist, &qg->groups, next_group) { 1503 list_for_each_entry(glist, &qg->groups, next_group) {
1491 ulist_add(ulist, glist->group->qgroupid, 1504 ulist_add(ulist, glist->group->qgroupid,
1492 (unsigned long)glist->group, GFP_ATOMIC); 1505 (uintptr_t)glist->group, GFP_ATOMIC);
1493 } 1506 }
1494 } 1507 }
1495 if (ret) 1508 if (ret)
@@ -1502,7 +1515,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1502 while ((unode = ulist_next(ulist, &uiter))) { 1515 while ((unode = ulist_next(ulist, &uiter))) {
1503 struct btrfs_qgroup *qg; 1516 struct btrfs_qgroup *qg;
1504 1517
1505 qg = (struct btrfs_qgroup *)unode->aux; 1518 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1506 1519
1507 qg->reserved += num_bytes; 1520 qg->reserved += num_bytes;
1508 } 1521 }
@@ -1541,19 +1554,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1541 goto out; 1554 goto out;
1542 1555
1543 ulist = ulist_alloc(GFP_ATOMIC); 1556 ulist = ulist_alloc(GFP_ATOMIC);
1544 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1557 if (!ulist) {
1558 btrfs_std_error(fs_info, -ENOMEM);
1559 goto out;
1560 }
1561 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1545 ULIST_ITER_INIT(&uiter); 1562 ULIST_ITER_INIT(&uiter);
1546 while ((unode = ulist_next(ulist, &uiter))) { 1563 while ((unode = ulist_next(ulist, &uiter))) {
1547 struct btrfs_qgroup *qg; 1564 struct btrfs_qgroup *qg;
1548 struct btrfs_qgroup_list *glist; 1565 struct btrfs_qgroup_list *glist;
1549 1566
1550 qg = (struct btrfs_qgroup *)unode->aux; 1567 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1551 1568
1552 qg->reserved -= num_bytes; 1569 qg->reserved -= num_bytes;
1553 1570
1554 list_for_each_entry(glist, &qg->groups, next_group) { 1571 list_for_each_entry(glist, &qg->groups, next_group) {
1555 ulist_add(ulist, glist->group->qgroupid, 1572 ulist_add(ulist, glist->group->qgroupid,
1556 (unsigned long)glist->group, GFP_ATOMIC); 1573 (uintptr_t)glist->group, GFP_ATOMIC);
1557 } 1574 }
1558 } 1575 }
1559 1576
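The systematic switch from (unsigned long) to (u64)(uintptr_t) casts matters because the ulist aux field is a u64 in this series, while unsigned long is only 32 bits on 32-bit kernels; going through uintptr_t round-trips a pointer without truncation or compiler warnings on either word size. The pattern in isolation (a minimal sketch):

    #include <linux/types.h>

    /* store a kernel pointer in a u64 aux slot without truncation */
    static inline u64 ptr_to_aux(void *p)
    {
            return (u64)(uintptr_t)p;
    }

    static inline void *aux_to_ptr(u64 aux)
    {
            return (void *)(uintptr_t)aux;
    }
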
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
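Note the lock ordering the readahead path now follows: btrfs_dev_replace_lock() is taken before reada_lock and dropped only after the radix-tree inserts (or their error-path rollbacks) are finished, so the device list, and in particular the replace target device that must be skipped, cannot change while an extent is being linked into the per-device trees.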
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004d..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3270,8 +3269,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3270 key.offset = 0; 3269 key.offset = 0;
3271 3270
3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3271 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3273 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { 3272 if (IS_ERR(inode) || is_bad_inode(inode)) {
3274 if (inode && !IS_ERR(inode)) 3273 if (!IS_ERR(inode))
3275 iput(inode); 3274 iput(inode);
3276 return -ENOENT; 3275 return -ENOENT;
3277 } 3276 }
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3621,7 +3619,7 @@ next:
3621 3619
3622 ret = find_first_extent_bit(&rc->processed_blocks, 3620 ret = find_first_extent_bit(&rc->processed_blocks,
3623 key.objectid, &start, &end, 3621 key.objectid, &start, &end,
3624 EXTENT_DIRTY); 3622 EXTENT_DIRTY, NULL);
3625 3623
3626 if (ret == 0 && start <= key.objectid) { 3624 if (ret == 0 && start <= key.objectid) {
3627 btrfs_release_path(path); 3625 btrfs_release_path(path);
@@ -3674,7 +3672,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3674 struct btrfs_trans_handle *trans; 3672 struct btrfs_trans_handle *trans;
3675 int ret; 3673 int ret;
3676 3674
3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); 3675 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3676 BTRFS_BLOCK_RSV_TEMP);
3678 if (!rc->block_rsv) 3677 if (!rc->block_rsv)
3679 return -ENOMEM; 3678 return -ENOMEM;
3680 3679
@@ -3684,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3684 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3685 */ 3684 */
3686 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3687 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3688 if (ret) 3688 if (ret)
3689 return ret; 3689 return ret;
3690 3690
@@ -3710,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3710 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3711 struct btrfs_path *path; 3711 struct btrfs_path *path;
3712 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3713 unsigned long nr;
3714 u64 flags; 3713 u64 flags;
3715 u32 item_size; 3714 u32 item_size;
3716 int ret; 3715 int ret;
@@ -3827,9 +3826,8 @@ restart:
3827 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3828 BUG_ON(ret); 3827 BUG_ON(ret);
3829 } else { 3828 } else {
3830 nr = trans->blocks_used;
3831 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3832 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3833 } 3831 }
3834 trans = NULL; 3832 trans = NULL;
3835 3833
@@ -3859,9 +3857,8 @@ restart:
3859 GFP_NOFS); 3857 GFP_NOFS);
3860 3858
3861 if (trans) { 3859 if (trans) {
3862 nr = trans->blocks_used;
3863 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3864 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3865 } 3862 }
3866 3863
3867 if (!err) { 3864 if (!err) {
@@ -3940,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3940 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3941 struct btrfs_root *root; 3938 struct btrfs_root *root;
3942 struct btrfs_key key; 3939 struct btrfs_key key;
3943 unsigned long nr;
3944 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3945 int err = 0; 3941 int err = 0;
3946 3942
@@ -3968,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3968 3964
3969 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3970out: 3966out:
3971 nr = trans->blocks_used;
3972 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3973 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3974 if (err) { 3969 if (err) {
3975 if (inode) 3970 if (inode)
3976 iput(inode); 3971 iput(inode);
@@ -4056,8 +4051,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4056 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4057 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4058 4053
4059 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4060 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4061 4060
4062 while (1) { 4061 while (1) {
4063 mutex_lock(&fs_info->cleaner_mutex); 4062 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d88071..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
141 return -ENOMEM; 141 return -ENOMEM;
142 142
143 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
144 if (ret < 0) 144 if (ret < 0) {
145 goto out_abort; 145 btrfs_abort_transaction(trans, root, ret);
146 goto out;
147 }
146 148
147 if (ret != 0) { 149 if (ret != 0) {
148 btrfs_print_leaf(root, path->nodes[0]); 150 btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
166 btrfs_release_path(path); 168 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path, 169 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1); 170 -1, 1);
169 if (ret < 0) 171 if (ret < 0) {
170 goto out_abort; 172 btrfs_abort_transaction(trans, root, ret);
173 goto out;
174 }
175
171 ret = btrfs_del_item(trans, root, path); 176 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0) 177 if (ret < 0) {
173 goto out_abort; 178 btrfs_abort_transaction(trans, root, ret);
179 goto out;
180 }
174 btrfs_release_path(path); 181 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path, 182 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item)); 183 key, sizeof(*item));
177 if (ret < 0) 184 if (ret < 0) {
178 goto out_abort; 185 btrfs_abort_transaction(trans, root, ret);
186 goto out;
187 }
179 l = path->nodes[0]; 188 l = path->nodes[0];
180 slot = path->slots[0]; 189 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot); 190 ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
192out: 201out:
193 btrfs_free_path(path); 202 btrfs_free_path(path);
194 return ret; 203 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
199} 204}
200 205
201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 206int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -543,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
543 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
544 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
545 550
546 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
547 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
548 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
549 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
550 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
551} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a6..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 STRATO. All rights reserved. 2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "dev-replace.h"
28#include "check-integrity.h" 29#include "check-integrity.h"
29#include "rcu-string.h" 30#include "rcu-string.h"
30 31
@@ -42,10 +43,23 @@
42 */ 43 */
43 44
44struct scrub_block; 45struct scrub_block;
45struct scrub_dev; 46struct scrub_ctx;
46 47
47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48/*
48#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 49 * the following three values only influence the performance.
50 * The last one configures the number of parallel and outstanding I/O
51 * operations. The first two values configure an upper limit for the number
52 * of (dynamically allocated) pages that are added to a bio.
53 */
54#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
55#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
56#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
57
58/*
59 * the following value times PAGE_SIZE needs to be large enough to match the
60 * largest node/leaf/sector size that shall be supported.
61 * Values larger than BTRFS_STRIPE_LEN are not supported.
62 */
49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 63#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
50 64
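Assuming 4 KiB pages, the sizing comments above work out as follows (old constants shown for comparison):

	old: SCRUB_PAGES_PER_BIO * 4 KiB    = 16 * 4 KiB   =  64 KiB per bio
	     SCRUB_BIOS_PER_DEV * 64 KiB    = 16 * 64 KiB  =   1 MiB in flight per device
	new: SCRUB_PAGES_PER_RD_BIO * 4 KiB = 32 * 4 KiB   = 128 KiB per read bio
	     SCRUB_PAGES_PER_WR_BIO * 4 KiB = 32 * 4 KiB   = 128 KiB per write bio
	     SCRUB_BIOS_PER_SCTX * 128 KiB  = 64 * 128 KiB =   8 MiB in flight per context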
51struct scrub_page { 65struct scrub_page {
@@ -56,6 +70,8 @@ struct scrub_page {
56 u64 generation; 70 u64 generation;
57 u64 logical; 71 u64 logical;
58 u64 physical; 72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t ref_count;
59 struct { 75 struct {
60 unsigned int mirror_num:8; 76 unsigned int mirror_num:8;
61 unsigned int have_csum:1; 77 unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
66 82
67struct scrub_bio { 83struct scrub_bio {
68 int index; 84 int index;
69 struct scrub_dev *sdev; 85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
70 struct bio *bio; 87 struct bio *bio;
71 int err; 88 int err;
72 u64 logical; 89 u64 logical;
73 u64 physical; 90 u64 physical;
74 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
75 int page_count; 96 int page_count;
76 int next_free; 97 int next_free;
77 struct btrfs_work work; 98 struct btrfs_work work;
78}; 99};
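The #if/#else above is a preprocessor-level max() that sizes pagev[] for whichever bio type carries more pages. An equivalent single-expression formulation (SCRUB_MAX_PAGES_PER_BIO is a hypothetical name, not part of the patch):

	#define SCRUB_MAX_PAGES_PER_BIO \
		(SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO ? \
		 SCRUB_PAGES_PER_WR_BIO : SCRUB_PAGES_PER_RD_BIO)

	struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BIO];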
79 100
80struct scrub_block { 101struct scrub_block {
81 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82 int page_count; 103 int page_count;
83 atomic_t outstanding_pages; 104 atomic_t outstanding_pages;
84 atomic_t ref_count; /* free mem on transition to zero */ 105 atomic_t ref_count; /* free mem on transition to zero */
85 struct scrub_dev *sdev; 106 struct scrub_ctx *sctx;
86 struct { 107 struct {
87 unsigned int header_error:1; 108 unsigned int header_error:1;
88 unsigned int checksum_error:1; 109 unsigned int checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
91 }; 112 };
92}; 113};
93 114
94struct scrub_dev { 115struct scrub_wr_ctx {
95 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 116 struct scrub_bio *wr_curr_bio;
96 struct btrfs_device *dev; 117 struct btrfs_device *tgtdev;
118 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119 atomic_t flush_all_writes;
120 struct mutex wr_lock;
121};
122
123struct scrub_ctx {
124 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
125 struct btrfs_root *dev_root;
97 int first_free; 126 int first_free;
98 int curr; 127 int curr;
99 atomic_t in_flight; 128 atomic_t bios_in_flight;
100 atomic_t fixup_cnt; 129 atomic_t workers_pending;
101 spinlock_t list_lock; 130 spinlock_t list_lock;
102 wait_queue_head_t list_wait; 131 wait_queue_head_t list_wait;
103 u16 csum_size; 132 u16 csum_size;
104 struct list_head csum_list; 133 struct list_head csum_list;
105 atomic_t cancel_req; 134 atomic_t cancel_req;
106 int readonly; 135 int readonly;
107 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 136 int pages_per_rd_bio;
108 u32 sectorsize; 137 u32 sectorsize;
109 u32 nodesize; 138 u32 nodesize;
110 u32 leafsize; 139 u32 leafsize;
140
141 int is_dev_replace;
142 struct scrub_wr_ctx wr_ctx;
143
111 /* 144 /*
112 * statistics 145 * statistics
113 */ 146 */
@@ -116,13 +149,23 @@ struct scrub_dev {
116}; 149};
117 150
118struct scrub_fixup_nodatasum { 151struct scrub_fixup_nodatasum {
119 struct scrub_dev *sdev; 152 struct scrub_ctx *sctx;
153 struct btrfs_device *dev;
120 u64 logical; 154 u64 logical;
121 struct btrfs_root *root; 155 struct btrfs_root *root;
122 struct btrfs_work work; 156 struct btrfs_work work;
123 int mirror_num; 157 int mirror_num;
124}; 158};
125 159
160struct scrub_copy_nocow_ctx {
161 struct scrub_ctx *sctx;
162 u64 logical;
163 u64 len;
164 int mirror_num;
165 u64 physical_for_dev_replace;
166 struct btrfs_work work;
167};
168
126struct scrub_warning { 169struct scrub_warning {
127 struct btrfs_path *path; 170 struct btrfs_path *path;
128 u64 extent_item_size; 171 u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
137}; 180};
138 181
139 182
183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev, 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
142 struct btrfs_mapping_tree *map_tree, 189 struct btrfs_fs_info *fs_info,
190 struct scrub_block *original_sblock,
143 u64 length, u64 logical, 191 u64 length, u64 logical,
144 struct scrub_block *sblock); 192 struct scrub_block *sblocks_for_recheck);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
146 struct scrub_block *sblock, int is_metadata, 194 struct scrub_block *sblock, int is_metadata,
147 int have_csum, u8 *csum, u64 generation, 195 int have_csum, u8 *csum, u64 generation,
148 u16 csum_size); 196 u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150 struct scrub_block *sblock, 198 struct scrub_block *sblock,
151 int is_metadata, int have_csum, 199 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159 struct scrub_block *sblock_good, 207 struct scrub_block *sblock_good,
160 int page_num, int force_write); 208 int page_num, int force_write);
209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211 int page_num);
161static int scrub_checksum_data(struct scrub_block *sblock); 212static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock); 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock); 214static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock); 215static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock); 216static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev, 217static void scrub_page_get(struct scrub_page *spage);
167 struct scrub_page *spage); 218static void scrub_page_put(struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
169 u64 physical, u64 flags, u64 gen, int mirror_num, 220 struct scrub_page *spage);
170 u8 *csum, int force); 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222 u64 physical, struct btrfs_device *dev, u64 flags,
223 u64 gen, int mirror_num, u8 *csum, int force,
224 u64 physical_for_dev_replace);
171static void scrub_bio_end_io(struct bio *bio, int err); 225static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work); 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock); 227static void scrub_block_complete(struct scrub_block *sblock);
228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229 u64 extent_logical, u64 extent_len,
230 u64 *extent_physical,
231 struct btrfs_device **extent_dev,
232 int *extent_mirror_num);
233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234 struct scrub_wr_ctx *wr_ctx,
235 struct btrfs_fs_info *fs_info,
236 struct btrfs_device *dev,
237 int is_dev_replace);
238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_page *spage);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio, int err);
243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244static int write_page_nocow(struct scrub_ctx *sctx,
245 u64 physical_for_dev_replace, struct page *page);
246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247 void *ctx);
248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249 int mirror_num, u64 physical_for_dev_replace);
250static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254{
255 atomic_inc(&sctx->bios_in_flight);
256}
174 257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262}
175 263
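bios_in_flight replaces the old in_flight counter. The inc/dec pair brackets the lifetime of each submitted bio, and the wake_up() on list_wait lets a drain point sleep until all bios are done. A usage sketch; the wait_event() drain site is an assumption based on the wakeup above, not part of this hunk:

	scrub_pending_bio_inc(sctx);	/* before handing the bio to the block layer */
	btrfsic_submit_bio(READ, sbio->bio);

	/* in the end_io worker, once the bio is fully processed: */
	scrub_pending_bio_dec(sctx);	/* wakes sctx->list_wait */

	/* at a drain point: */
	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);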
176static void scrub_free_csums(struct scrub_dev *sdev) 264/*
265 * used for workers that require transaction commits (i.e., for the
266 * NOCOW case)
267 */
268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
177{ 269{
178 while (!list_empty(&sdev->csum_list)) { 270 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272 /*
273 * increment scrubs_running to prevent cancel requests from
274 * completing as long as a worker is running. we must also
275 * increment scrubs_paused to prevent deadlocking on pause
 276	 * requests used for transaction commits (as the worker uses a
 277	 * transaction context). it is safe to regard the worker
 278	 * as paused for all practical matters. effectively, we only
279 * avoid cancellation requests from completing.
280 */
281 mutex_lock(&fs_info->scrub_lock);
282 atomic_inc(&fs_info->scrubs_running);
283 atomic_inc(&fs_info->scrubs_paused);
284 mutex_unlock(&fs_info->scrub_lock);
285 atomic_inc(&sctx->workers_pending);
286}
287
288/* used for workers that require transaction commits */
289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290{
291 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
292
293 /*
 294	 * see scrub_pending_trans_workers_inc() for why we're pretending
295 * to be paused in the scrub counters
296 */
297 mutex_lock(&fs_info->scrub_lock);
298 atomic_dec(&fs_info->scrubs_running);
299 atomic_dec(&fs_info->scrubs_paused);
300 mutex_unlock(&fs_info->scrub_lock);
301 atomic_dec(&sctx->workers_pending);
302 wake_up(&fs_info->scrub_pause_wait);
303 wake_up(&sctx->list_wait);
304}
305
306static void scrub_free_csums(struct scrub_ctx *sctx)
307{
308 while (!list_empty(&sctx->csum_list)) {
179 struct btrfs_ordered_sum *sum; 309 struct btrfs_ordered_sum *sum;
180 sum = list_first_entry(&sdev->csum_list, 310 sum = list_first_entry(&sctx->csum_list,
181 struct btrfs_ordered_sum, list); 311 struct btrfs_ordered_sum, list);
182 list_del(&sum->list); 312 list_del(&sum->list);
183 kfree(sum); 313 kfree(sum);
184 } 314 }
185} 315}
186 316
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
188{ 318{
189 int i; 319 int i;
190 320
191 if (!sdev) 321 if (!sctx)
192 return; 322 return;
193 323
324 scrub_free_wr_ctx(&sctx->wr_ctx);
325
194 /* this can happen when scrub is cancelled */ 326 /* this can happen when scrub is cancelled */
195 if (sdev->curr != -1) { 327 if (sctx->curr != -1) {
196 struct scrub_bio *sbio = sdev->bios[sdev->curr]; 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
197 329
198 for (i = 0; i < sbio->page_count; i++) { 330 for (i = 0; i < sbio->page_count; i++) {
199 BUG_ON(!sbio->pagev[i]); 331 WARN_ON(!sbio->pagev[i]->page);
200 BUG_ON(!sbio->pagev[i]->page);
201 scrub_block_put(sbio->pagev[i]->sblock); 332 scrub_block_put(sbio->pagev[i]->sblock);
202 } 333 }
203 bio_put(sbio->bio); 334 bio_put(sbio->bio);
204 } 335 }
205 336
206 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 337 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
207 struct scrub_bio *sbio = sdev->bios[i]; 338 struct scrub_bio *sbio = sctx->bios[i];
208 339
209 if (!sbio) 340 if (!sbio)
210 break; 341 break;
211 kfree(sbio); 342 kfree(sbio);
212 } 343 }
213 344
214 scrub_free_csums(sdev); 345 scrub_free_csums(sctx);
215 kfree(sdev); 346 kfree(sctx);
216} 347}
217 348
218static noinline_for_stack 349static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
220{ 351{
221 struct scrub_dev *sdev; 352 struct scrub_ctx *sctx;
222 int i; 353 int i;
223 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224 int pages_per_bio; 355 int pages_per_rd_bio;
356 int ret;
225 357
226 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 358 /*
227 bio_get_nr_vecs(dev->bdev)); 359 * the setting of pages_per_rd_bio is correct for scrub but might
228 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 360 * be wrong for the dev_replace code where we might read from
229 if (!sdev) 361 * different devices in the initial huge bios. However, that
362 * code is able to correctly handle the case when adding a page
363 * to a bio fails.
364 */
365 if (dev->bdev)
366 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367 bio_get_nr_vecs(dev->bdev));
368 else
369 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371 if (!sctx)
230 goto nomem; 372 goto nomem;
231 sdev->dev = dev; 373 sctx->is_dev_replace = is_dev_replace;
232 sdev->pages_per_bio = pages_per_bio; 374 sctx->pages_per_rd_bio = pages_per_rd_bio;
233 sdev->curr = -1; 375 sctx->curr = -1;
234 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 376 sctx->dev_root = dev->dev_root;
377 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
235 struct scrub_bio *sbio; 378 struct scrub_bio *sbio;
236 379
237 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238 if (!sbio) 381 if (!sbio)
239 goto nomem; 382 goto nomem;
240 sdev->bios[i] = sbio; 383 sctx->bios[i] = sbio;
241 384
242 sbio->index = i; 385 sbio->index = i;
243 sbio->sdev = sdev; 386 sbio->sctx = sctx;
244 sbio->page_count = 0; 387 sbio->page_count = 0;
245 sbio->work.func = scrub_bio_end_io_worker; 388 sbio->work.func = scrub_bio_end_io_worker;
246 389
247 if (i != SCRUB_BIOS_PER_DEV-1) 390 if (i != SCRUB_BIOS_PER_SCTX - 1)
248 sdev->bios[i]->next_free = i + 1; 391 sctx->bios[i]->next_free = i + 1;
249 else 392 else
250 sdev->bios[i]->next_free = -1; 393 sctx->bios[i]->next_free = -1;
251 } 394 }
252 sdev->first_free = 0; 395 sctx->first_free = 0;
253 sdev->nodesize = dev->dev_root->nodesize; 396 sctx->nodesize = dev->dev_root->nodesize;
254 sdev->leafsize = dev->dev_root->leafsize; 397 sctx->leafsize = dev->dev_root->leafsize;
255 sdev->sectorsize = dev->dev_root->sectorsize; 398 sctx->sectorsize = dev->dev_root->sectorsize;
256 atomic_set(&sdev->in_flight, 0); 399 atomic_set(&sctx->bios_in_flight, 0);
257 atomic_set(&sdev->fixup_cnt, 0); 400 atomic_set(&sctx->workers_pending, 0);
258 atomic_set(&sdev->cancel_req, 0); 401 atomic_set(&sctx->cancel_req, 0);
259 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 402 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260 INIT_LIST_HEAD(&sdev->csum_list); 403 INIT_LIST_HEAD(&sctx->csum_list);
261 404
262 spin_lock_init(&sdev->list_lock); 405 spin_lock_init(&sctx->list_lock);
263 spin_lock_init(&sdev->stat_lock); 406 spin_lock_init(&sctx->stat_lock);
264 init_waitqueue_head(&sdev->list_wait); 407 init_waitqueue_head(&sctx->list_wait);
265 return sdev; 408
409 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410 fs_info->dev_replace.tgtdev, is_dev_replace);
411 if (ret) {
412 scrub_free_ctx(sctx);
413 return ERR_PTR(ret);
414 }
415 return sctx;
266 416
267nomem: 417nomem:
268 scrub_free_dev(sdev); 418 scrub_free_ctx(sctx);
269 return ERR_PTR(-ENOMEM); 419 return ERR_PTR(-ENOMEM);
270} 420}
271 421
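scrub_setup_ctx() can now fail with the error from scrub_setup_wr_ctx() as well as -ENOMEM, so callers must follow the ERR_PTR convention. A sketch of the caller side (the call site itself is outside this hunk):

	sctx = scrub_setup_ctx(dev, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);	/* -ENOMEM, or the wr_ctx setup error */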
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423 void *warn_ctx)
273{ 424{
274 u64 isize; 425 u64 isize;
275 u32 nlink; 426 u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
277 int i; 428 int i;
278 struct extent_buffer *eb; 429 struct extent_buffer *eb;
279 struct btrfs_inode_item *inode_item; 430 struct btrfs_inode_item *inode_item;
280 struct scrub_warning *swarn = ctx; 431 struct scrub_warning *swarn = warn_ctx;
281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282 struct inode_fs_paths *ipath = NULL; 433 struct inode_fs_paths *ipath = NULL;
283 struct btrfs_root *local_root; 434 struct btrfs_root *local_root;
@@ -345,37 +496,42 @@ err:
345 496
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{ 498{
348 struct btrfs_device *dev = sblock->sdev->dev; 499 struct btrfs_device *dev;
349 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 500 struct btrfs_fs_info *fs_info;
350 struct btrfs_path *path; 501 struct btrfs_path *path;
351 struct btrfs_key found_key; 502 struct btrfs_key found_key;
352 struct extent_buffer *eb; 503 struct extent_buffer *eb;
353 struct btrfs_extent_item *ei; 504 struct btrfs_extent_item *ei;
354 struct scrub_warning swarn; 505 struct scrub_warning swarn;
355 u32 item_size; 506 unsigned long ptr = 0;
356 int ret; 507 u64 extent_item_pos;
508 u64 flags = 0;
357 u64 ref_root; 509 u64 ref_root;
510 u32 item_size;
358 u8 ref_level; 511 u8 ref_level;
359 unsigned long ptr = 0;
360 const int bufsize = 4096; 512 const int bufsize = 4096;
361 u64 extent_item_pos; 513 int ret;
514
515 WARN_ON(sblock->page_count < 1);
516 dev = sblock->pagev[0]->dev;
517 fs_info = sblock->sctx->dev_root->fs_info;
362 518
363 path = btrfs_alloc_path(); 519 path = btrfs_alloc_path();
364 520
365 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 521 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
366 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 522 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
367 BUG_ON(sblock->page_count < 1); 523 swarn.sector = (sblock->pagev[0]->physical) >> 9;
368 swarn.sector = (sblock->pagev[0].physical) >> 9; 524 swarn.logical = sblock->pagev[0]->logical;
369 swarn.logical = sblock->pagev[0].logical;
370 swarn.errstr = errstr; 525 swarn.errstr = errstr;
371 swarn.dev = dev; 526 swarn.dev = NULL;
372 swarn.msg_bufsize = bufsize; 527 swarn.msg_bufsize = bufsize;
373 swarn.scratch_bufsize = bufsize; 528 swarn.scratch_bufsize = bufsize;
374 529
375 if (!path || !swarn.scratch_buf || !swarn.msg_buf) 530 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
376 goto out; 531 goto out;
377 532
378 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); 533 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
534 &flags);
379 if (ret < 0) 535 if (ret < 0)
380 goto out; 536 goto out;
381 537
@@ -387,7 +543,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
387 item_size = btrfs_item_size_nr(eb, path->slots[0]); 543 item_size = btrfs_item_size_nr(eb, path->slots[0]);
388 btrfs_release_path(path); 544 btrfs_release_path(path);
389 545
390 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 546 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
391 do { 547 do {
392 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 548 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
393 &ref_root, &ref_level); 549 &ref_root, &ref_level);
@@ -403,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
403 } while (ret != 1); 559 } while (ret != 1);
404 } else { 560 } else {
405 swarn.path = path; 561 swarn.path = path;
562 swarn.dev = dev;
406 iterate_extent_inodes(fs_info, found_key.objectid, 563 iterate_extent_inodes(fs_info, found_key.objectid,
407 extent_item_pos, 1, 564 extent_item_pos, 1,
408 scrub_print_warning_inode, &swarn); 565 scrub_print_warning_inode, &swarn);
@@ -414,11 +571,11 @@ out:
414 kfree(swarn.msg_buf); 571 kfree(swarn.msg_buf);
415} 572}
416 573
417static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
418{ 575{
419 struct page *page = NULL; 576 struct page *page = NULL;
420 unsigned long index; 577 unsigned long index;
421 struct scrub_fixup_nodatasum *fixup = ctx; 578 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
422 int ret; 579 int ret;
423 int corrected = 0; 580 int corrected = 0;
424 struct btrfs_key key; 581 struct btrfs_key key;
@@ -449,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
449 } 606 }
450 607
451 if (PageUptodate(page)) { 608 if (PageUptodate(page)) {
452 struct btrfs_mapping_tree *map_tree; 609 struct btrfs_fs_info *fs_info;
453 if (PageDirty(page)) { 610 if (PageDirty(page)) {
454 /* 611 /*
455 * we need to write the data to the defect sector. the 612 * we need to write the data to the defect sector. the
@@ -470,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
470 ret = -EIO; 627 ret = -EIO;
471 goto out; 628 goto out;
472 } 629 }
473 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 630 fs_info = BTRFS_I(inode)->root->fs_info;
474 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 631 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
475 fixup->logical, page, 632 fixup->logical, page,
476 fixup->mirror_num); 633 fixup->mirror_num);
477 unlock_page(page); 634 unlock_page(page);
@@ -528,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
528{ 685{
529 int ret; 686 int ret;
530 struct scrub_fixup_nodatasum *fixup; 687 struct scrub_fixup_nodatasum *fixup;
531 struct scrub_dev *sdev; 688 struct scrub_ctx *sctx;
532 struct btrfs_trans_handle *trans = NULL; 689 struct btrfs_trans_handle *trans = NULL;
533 struct btrfs_fs_info *fs_info; 690 struct btrfs_fs_info *fs_info;
534 struct btrfs_path *path; 691 struct btrfs_path *path;
535 int uncorrectable = 0; 692 int uncorrectable = 0;
536 693
537 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 694 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
538 sdev = fixup->sdev; 695 sctx = fixup->sctx;
539 fs_info = fixup->root->fs_info; 696 fs_info = fixup->root->fs_info;
540 697
541 path = btrfs_alloc_path(); 698 path = btrfs_alloc_path();
542 if (!path) { 699 if (!path) {
543 spin_lock(&sdev->stat_lock); 700 spin_lock(&sctx->stat_lock);
544 ++sdev->stat.malloc_errors; 701 ++sctx->stat.malloc_errors;
545 spin_unlock(&sdev->stat_lock); 702 spin_unlock(&sctx->stat_lock);
546 uncorrectable = 1; 703 uncorrectable = 1;
547 goto out; 704 goto out;
548 } 705 }
@@ -571,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
571 } 728 }
572 WARN_ON(ret != 1); 729 WARN_ON(ret != 1);
573 730
574 spin_lock(&sdev->stat_lock); 731 spin_lock(&sctx->stat_lock);
575 ++sdev->stat.corrected_errors; 732 ++sctx->stat.corrected_errors;
576 spin_unlock(&sdev->stat_lock); 733 spin_unlock(&sctx->stat_lock);
577 734
578out: 735out:
579 if (trans && !IS_ERR(trans)) 736 if (trans && !IS_ERR(trans))
580 btrfs_end_transaction(trans, fixup->root); 737 btrfs_end_transaction(trans, fixup->root);
581 if (uncorrectable) { 738 if (uncorrectable) {
582 spin_lock(&sdev->stat_lock); 739 spin_lock(&sctx->stat_lock);
583 ++sdev->stat.uncorrectable_errors; 740 ++sctx->stat.uncorrectable_errors;
584 spin_unlock(&sdev->stat_lock); 741 spin_unlock(&sctx->stat_lock);
585 742 btrfs_dev_replace_stats_inc(
743 &sctx->dev_root->fs_info->dev_replace.
744 num_uncorrectable_read_errors);
586 printk_ratelimited_in_rcu(KERN_ERR 745 printk_ratelimited_in_rcu(KERN_ERR
587 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 746 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
588 (unsigned long long)fixup->logical, 747 (unsigned long long)fixup->logical,
589 rcu_str_deref(sdev->dev->name)); 748 rcu_str_deref(fixup->dev->name));
590 } 749 }
591 750
592 btrfs_free_path(path); 751 btrfs_free_path(path);
593 kfree(fixup); 752 kfree(fixup);
594 753
595 /* see caller why we're pretending to be paused in the scrub counters */ 754 scrub_pending_trans_workers_dec(sctx);
596 mutex_lock(&fs_info->scrub_lock);
597 atomic_dec(&fs_info->scrubs_running);
598 atomic_dec(&fs_info->scrubs_paused);
599 mutex_unlock(&fs_info->scrub_lock);
600 atomic_dec(&sdev->fixup_cnt);
601 wake_up(&fs_info->scrub_pause_wait);
602 wake_up(&sdev->list_wait);
603} 755}
604 756
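With the helpers introduced above, the fixup worker's lifetime is now symmetric around the queueing site; roughly (both calls appear verbatim elsewhere in this patch):

	scrub_pending_trans_workers_inc(sctx);		/* at queue time */
	fixup_nodatasum->work.func = scrub_fixup_nodatasum;
	btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work);

	/* ... and as the final statement of scrub_fixup_nodatasum(): */
	scrub_pending_trans_workers_dec(sctx);	/* wakes scrub_pause_wait and list_wait */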
 605/* 757/*
@@ -612,7 +764,8 @@ out:
612 */ 764 */
613static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 765static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
614{ 766{
615 struct scrub_dev *sdev = sblock_to_check->sdev; 767 struct scrub_ctx *sctx = sblock_to_check->sctx;
768 struct btrfs_device *dev;
616 struct btrfs_fs_info *fs_info; 769 struct btrfs_fs_info *fs_info;
617 u64 length; 770 u64 length;
618 u64 logical; 771 u64 logical;
@@ -631,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
631 DEFAULT_RATELIMIT_BURST); 784 DEFAULT_RATELIMIT_BURST);
632 785
633 BUG_ON(sblock_to_check->page_count < 1); 786 BUG_ON(sblock_to_check->page_count < 1);
634 fs_info = sdev->dev->dev_root->fs_info; 787 fs_info = sctx->dev_root->fs_info;
788 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
789 /*
 790	 * if we find an error in a super block, we just report it;
 791	 * super blocks get rewritten with the next transaction
 792	 * commit anyway
793 */
794 spin_lock(&sctx->stat_lock);
795 ++sctx->stat.super_errors;
796 spin_unlock(&sctx->stat_lock);
797 return 0;
798 }
635 length = sblock_to_check->page_count * PAGE_SIZE; 799 length = sblock_to_check->page_count * PAGE_SIZE;
636 logical = sblock_to_check->pagev[0].logical; 800 logical = sblock_to_check->pagev[0]->logical;
637 generation = sblock_to_check->pagev[0].generation; 801 generation = sblock_to_check->pagev[0]->generation;
638 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 802 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
639 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 803 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
640 is_metadata = !(sblock_to_check->pagev[0].flags & 804 is_metadata = !(sblock_to_check->pagev[0]->flags &
641 BTRFS_EXTENT_FLAG_DATA); 805 BTRFS_EXTENT_FLAG_DATA);
642 have_csum = sblock_to_check->pagev[0].have_csum; 806 have_csum = sblock_to_check->pagev[0]->have_csum;
643 csum = sblock_to_check->pagev[0].csum; 807 csum = sblock_to_check->pagev[0]->csum;
808 dev = sblock_to_check->pagev[0]->dev;
809
810 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
811 sblocks_for_recheck = NULL;
812 goto nodatasum_case;
813 }
644 814
 645	 /* 815	 /*
 646	 * read all mirrors one after the other. This includes to 816	 * read all mirrors one after the other. This includes to
@@ -675,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
675 sizeof(*sblocks_for_recheck), 845 sizeof(*sblocks_for_recheck),
676 GFP_NOFS); 846 GFP_NOFS);
677 if (!sblocks_for_recheck) { 847 if (!sblocks_for_recheck) {
678 spin_lock(&sdev->stat_lock); 848 spin_lock(&sctx->stat_lock);
679 sdev->stat.malloc_errors++; 849 sctx->stat.malloc_errors++;
680 sdev->stat.read_errors++; 850 sctx->stat.read_errors++;
681 sdev->stat.uncorrectable_errors++; 851 sctx->stat.uncorrectable_errors++;
682 spin_unlock(&sdev->stat_lock); 852 spin_unlock(&sctx->stat_lock);
683 btrfs_dev_stat_inc_and_print(sdev->dev, 853 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
684 BTRFS_DEV_STAT_READ_ERRS);
685 goto out; 854 goto out;
686 } 855 }
687 856
688 /* setup the context, map the logical blocks and alloc the pages */ 857 /* setup the context, map the logical blocks and alloc the pages */
689 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 858 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
690 logical, sblocks_for_recheck); 859 logical, sblocks_for_recheck);
691 if (ret) { 860 if (ret) {
692 spin_lock(&sdev->stat_lock); 861 spin_lock(&sctx->stat_lock);
693 sdev->stat.read_errors++; 862 sctx->stat.read_errors++;
694 sdev->stat.uncorrectable_errors++; 863 sctx->stat.uncorrectable_errors++;
695 spin_unlock(&sdev->stat_lock); 864 spin_unlock(&sctx->stat_lock);
696 btrfs_dev_stat_inc_and_print(sdev->dev, 865 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
697 BTRFS_DEV_STAT_READ_ERRS);
698 goto out; 866 goto out;
699 } 867 }
700 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 868 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
701 sblock_bad = sblocks_for_recheck + failed_mirror_index; 869 sblock_bad = sblocks_for_recheck + failed_mirror_index;
702 870
703 /* build and submit the bios for the failed mirror, check checksums */ 871 /* build and submit the bios for the failed mirror, check checksums */
704 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 872 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
705 csum, generation, sdev->csum_size); 873 csum, generation, sctx->csum_size);
706 if (ret) {
707 spin_lock(&sdev->stat_lock);
708 sdev->stat.read_errors++;
709 sdev->stat.uncorrectable_errors++;
710 spin_unlock(&sdev->stat_lock);
711 btrfs_dev_stat_inc_and_print(sdev->dev,
712 BTRFS_DEV_STAT_READ_ERRS);
713 goto out;
714 }
715 874
716 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 875 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
717 sblock_bad->no_io_error_seen) { 876 sblock_bad->no_io_error_seen) {
@@ -723,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
723 * different bio (usually one of the two latter cases is 882 * different bio (usually one of the two latter cases is
724 * the cause) 883 * the cause)
725 */ 884 */
726 spin_lock(&sdev->stat_lock); 885 spin_lock(&sctx->stat_lock);
727 sdev->stat.unverified_errors++; 886 sctx->stat.unverified_errors++;
728 spin_unlock(&sdev->stat_lock); 887 spin_unlock(&sctx->stat_lock);
729 888
889 if (sctx->is_dev_replace)
890 scrub_write_block_to_dev_replace(sblock_bad);
730 goto out; 891 goto out;
731 } 892 }
732 893
733 if (!sblock_bad->no_io_error_seen) { 894 if (!sblock_bad->no_io_error_seen) {
734 spin_lock(&sdev->stat_lock); 895 spin_lock(&sctx->stat_lock);
735 sdev->stat.read_errors++; 896 sctx->stat.read_errors++;
736 spin_unlock(&sdev->stat_lock); 897 spin_unlock(&sctx->stat_lock);
737 if (__ratelimit(&_rs)) 898 if (__ratelimit(&_rs))
738 scrub_print_warning("i/o error", sblock_to_check); 899 scrub_print_warning("i/o error", sblock_to_check);
739 btrfs_dev_stat_inc_and_print(sdev->dev, 900 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
740 BTRFS_DEV_STAT_READ_ERRS);
741 } else if (sblock_bad->checksum_error) { 901 } else if (sblock_bad->checksum_error) {
742 spin_lock(&sdev->stat_lock); 902 spin_lock(&sctx->stat_lock);
743 sdev->stat.csum_errors++; 903 sctx->stat.csum_errors++;
744 spin_unlock(&sdev->stat_lock); 904 spin_unlock(&sctx->stat_lock);
745 if (__ratelimit(&_rs)) 905 if (__ratelimit(&_rs))
746 scrub_print_warning("checksum error", sblock_to_check); 906 scrub_print_warning("checksum error", sblock_to_check);
747 btrfs_dev_stat_inc_and_print(sdev->dev, 907 btrfs_dev_stat_inc_and_print(dev,
748 BTRFS_DEV_STAT_CORRUPTION_ERRS); 908 BTRFS_DEV_STAT_CORRUPTION_ERRS);
749 } else if (sblock_bad->header_error) { 909 } else if (sblock_bad->header_error) {
750 spin_lock(&sdev->stat_lock); 910 spin_lock(&sctx->stat_lock);
751 sdev->stat.verify_errors++; 911 sctx->stat.verify_errors++;
752 spin_unlock(&sdev->stat_lock); 912 spin_unlock(&sctx->stat_lock);
753 if (__ratelimit(&_rs)) 913 if (__ratelimit(&_rs))
754 scrub_print_warning("checksum/header error", 914 scrub_print_warning("checksum/header error",
755 sblock_to_check); 915 sblock_to_check);
756 if (sblock_bad->generation_error) 916 if (sblock_bad->generation_error)
757 btrfs_dev_stat_inc_and_print(sdev->dev, 917 btrfs_dev_stat_inc_and_print(dev,
758 BTRFS_DEV_STAT_GENERATION_ERRS); 918 BTRFS_DEV_STAT_GENERATION_ERRS);
759 else 919 else
760 btrfs_dev_stat_inc_and_print(sdev->dev, 920 btrfs_dev_stat_inc_and_print(dev,
761 BTRFS_DEV_STAT_CORRUPTION_ERRS); 921 BTRFS_DEV_STAT_CORRUPTION_ERRS);
762 } 922 }
763 923
764 if (sdev->readonly) 924 if (sctx->readonly && !sctx->is_dev_replace)
765 goto did_not_correct_error; 925 goto did_not_correct_error;
766 926
767 if (!is_metadata && !have_csum) { 927 if (!is_metadata && !have_csum) {
768 struct scrub_fixup_nodatasum *fixup_nodatasum; 928 struct scrub_fixup_nodatasum *fixup_nodatasum;
769 929
930nodatasum_case:
931 WARN_ON(sctx->is_dev_replace);
932
770 /* 933 /*
771 * !is_metadata and !have_csum, this means that the data 934 * !is_metadata and !have_csum, this means that the data
772 * might not be COW'ed, that it might be modified 935 * might not be COW'ed, that it might be modified
@@ -777,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
777 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 940 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
778 if (!fixup_nodatasum) 941 if (!fixup_nodatasum)
779 goto did_not_correct_error; 942 goto did_not_correct_error;
780 fixup_nodatasum->sdev = sdev; 943 fixup_nodatasum->sctx = sctx;
944 fixup_nodatasum->dev = dev;
781 fixup_nodatasum->logical = logical; 945 fixup_nodatasum->logical = logical;
782 fixup_nodatasum->root = fs_info->extent_root; 946 fixup_nodatasum->root = fs_info->extent_root;
783 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 947 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
784 /* 948 scrub_pending_trans_workers_inc(sctx);
785 * increment scrubs_running to prevent cancel requests from
786 * completing as long as a fixup worker is running. we must also
787 * increment scrubs_paused to prevent deadlocking on pause
788 * requests used for transactions commits (as the worker uses a
789 * transaction context). it is safe to regard the fixup worker
790 * as paused for all matters practical. effectively, we only
791 * avoid cancellation requests from completing.
792 */
793 mutex_lock(&fs_info->scrub_lock);
794 atomic_inc(&fs_info->scrubs_running);
795 atomic_inc(&fs_info->scrubs_paused);
796 mutex_unlock(&fs_info->scrub_lock);
797 atomic_inc(&sdev->fixup_cnt);
798 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 949 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
799 btrfs_queue_worker(&fs_info->scrub_workers, 950 btrfs_queue_worker(&fs_info->scrub_workers,
800 &fixup_nodatasum->work); 951 &fixup_nodatasum->work);
@@ -803,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
803 954
804 /* 955 /*
805 * now build and submit the bios for the other mirrors, check 956 * now build and submit the bios for the other mirrors, check
806 * checksums 957 * checksums.
807 */ 958 * First try to pick the mirror which is completely without I/O
808 for (mirror_index = 0;
809 mirror_index < BTRFS_MAX_MIRRORS &&
810 sblocks_for_recheck[mirror_index].page_count > 0;
811 mirror_index++) {
812 if (mirror_index == failed_mirror_index)
813 continue;
814
815 /* build and submit the bios, check checksums */
816 ret = scrub_recheck_block(fs_info,
817 sblocks_for_recheck + mirror_index,
818 is_metadata, have_csum, csum,
819 generation, sdev->csum_size);
820 if (ret)
821 goto did_not_correct_error;
822 }
823
824 /*
825 * first try to pick the mirror which is completely without I/O
826 * errors and also does not have a checksum error. 959 * errors and also does not have a checksum error.
827 * If one is found, and if a checksum is present, the full block 960 * If one is found, and if a checksum is present, the full block
828 * that is known to contain an error is rewritten. Afterwards 961 * that is known to contain an error is rewritten. Afterwards
@@ -838,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
838 mirror_index < BTRFS_MAX_MIRRORS && 971 mirror_index < BTRFS_MAX_MIRRORS &&
839 sblocks_for_recheck[mirror_index].page_count > 0; 972 sblocks_for_recheck[mirror_index].page_count > 0;
840 mirror_index++) { 973 mirror_index++) {
841 struct scrub_block *sblock_other = sblocks_for_recheck + 974 struct scrub_block *sblock_other;
842 mirror_index; 975
976 if (mirror_index == failed_mirror_index)
977 continue;
978 sblock_other = sblocks_for_recheck + mirror_index;
979
980 /* build and submit the bios, check checksums */
981 scrub_recheck_block(fs_info, sblock_other, is_metadata,
982 have_csum, csum, generation,
983 sctx->csum_size);
843 984
844 if (!sblock_other->header_error && 985 if (!sblock_other->header_error &&
845 !sblock_other->checksum_error && 986 !sblock_other->checksum_error &&
846 sblock_other->no_io_error_seen) { 987 sblock_other->no_io_error_seen) {
847 int force_write = is_metadata || have_csum; 988 if (sctx->is_dev_replace) {
848 989 scrub_write_block_to_dev_replace(sblock_other);
849 ret = scrub_repair_block_from_good_copy(sblock_bad, 990 } else {
850 sblock_other, 991 int force_write = is_metadata || have_csum;
851 force_write); 992
993 ret = scrub_repair_block_from_good_copy(
994 sblock_bad, sblock_other,
995 force_write);
996 }
852 if (0 == ret) 997 if (0 == ret)
853 goto corrected_error; 998 goto corrected_error;
854 } 999 }
855 } 1000 }
856 1001
857 /* 1002 /*
858 * in case of I/O errors in the area that is supposed to be 1003 * for dev_replace, pick good pages and write to the target device.
1004 */
1005 if (sctx->is_dev_replace) {
1006 success = 1;
1007 for (page_num = 0; page_num < sblock_bad->page_count;
1008 page_num++) {
1009 int sub_success;
1010
1011 sub_success = 0;
1012 for (mirror_index = 0;
1013 mirror_index < BTRFS_MAX_MIRRORS &&
1014 sblocks_for_recheck[mirror_index].page_count > 0;
1015 mirror_index++) {
1016 struct scrub_block *sblock_other =
1017 sblocks_for_recheck + mirror_index;
1018 struct scrub_page *page_other =
1019 sblock_other->pagev[page_num];
1020
1021 if (!page_other->io_error) {
1022 ret = scrub_write_page_to_dev_replace(
1023 sblock_other, page_num);
1024 if (ret == 0) {
1025 /* succeeded for this page */
1026 sub_success = 1;
1027 break;
1028 } else {
1029 btrfs_dev_replace_stats_inc(
1030 &sctx->dev_root->
1031 fs_info->dev_replace.
1032 num_write_errors);
1033 }
1034 }
1035 }
1036
1037 if (!sub_success) {
1038 /*
1039 * did not find a mirror to fetch the page
1040 * from. scrub_write_page_to_dev_replace()
 1041	 * handles this case (page->io_error) by
1042 * filling the block with zeros before
1043 * submitting the write request
1044 */
1045 success = 0;
1046 ret = scrub_write_page_to_dev_replace(
1047 sblock_bad, page_num);
1048 if (ret)
1049 btrfs_dev_replace_stats_inc(
1050 &sctx->dev_root->fs_info->
1051 dev_replace.num_write_errors);
1052 }
1053 }
1054
1055 goto out;
1056 }
1057
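Condensed, the dev_replace branch above applies a simple per-page policy. The sketch below restates it; illustration only, and it omits the num_write_errors accounting the real code does on every failed write:

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int wrote = 0;
		int m;

		/* try every mirror whose copy of this page read cleanly */
		for (m = 0; m < BTRFS_MAX_MIRRORS &&
		     sblocks_for_recheck[m].page_count > 0; m++) {
			if (sblocks_for_recheck[m].pagev[page_num]->io_error)
				continue;
			if (scrub_write_page_to_dev_replace(
					sblocks_for_recheck + m, page_num) == 0) {
				wrote = 1;	/* page is on the target now */
				break;
			}
			/* write failed: fall through to the next mirror */
		}
		if (!wrote) {
			/* no clean copy anywhere: write the bad page, which
			 * scrub_write_page_to_dev_replace() zero-fills on
			 * io_error before submitting */
			success = 0;
			scrub_write_page_to_dev_replace(sblock_bad, page_num);
		}
	}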
1058 /*
1059 * for regular scrub, repair those pages that are errored.
1060 * In case of I/O errors in the area that is supposed to be
859 * repaired, continue by picking good copies of those pages. 1061 * repaired, continue by picking good copies of those pages.
860 * Select the good pages from mirrors to rewrite bad pages from 1062 * Select the good pages from mirrors to rewrite bad pages from
861 * the area to fix. Afterwards verify the checksum of the block 1063 * the area to fix. Afterwards verify the checksum of the block
@@ -885,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
885 1087
886 success = 1; 1088 success = 1;
887 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1089 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
888 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1090 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
889 1091
890 if (!page_bad->io_error) 1092 if (!page_bad->io_error)
891 continue; 1093 continue;
@@ -896,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
896 mirror_index++) { 1098 mirror_index++) {
897 struct scrub_block *sblock_other = sblocks_for_recheck + 1099 struct scrub_block *sblock_other = sblocks_for_recheck +
898 mirror_index; 1100 mirror_index;
899 struct scrub_page *page_other = sblock_other->pagev + 1101 struct scrub_page *page_other = sblock_other->pagev[
900 page_num; 1102 page_num];
901 1103
902 if (!page_other->io_error) { 1104 if (!page_other->io_error) {
903 ret = scrub_repair_page_from_good_copy( 1105 ret = scrub_repair_page_from_good_copy(
@@ -926,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
926 * is verified, but most likely the data comes out 1128 * is verified, but most likely the data comes out
927 * of the page cache. 1129 * of the page cache.
928 */ 1130 */
929 ret = scrub_recheck_block(fs_info, sblock_bad, 1131 scrub_recheck_block(fs_info, sblock_bad,
930 is_metadata, have_csum, csum, 1132 is_metadata, have_csum, csum,
931 generation, sdev->csum_size); 1133 generation, sctx->csum_size);
932 if (!ret && !sblock_bad->header_error && 1134 if (!sblock_bad->header_error &&
933 !sblock_bad->checksum_error && 1135 !sblock_bad->checksum_error &&
934 sblock_bad->no_io_error_seen) 1136 sblock_bad->no_io_error_seen)
935 goto corrected_error; 1137 goto corrected_error;
@@ -937,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
937 goto did_not_correct_error; 1139 goto did_not_correct_error;
938 } else { 1140 } else {
939corrected_error: 1141corrected_error:
940 spin_lock(&sdev->stat_lock); 1142 spin_lock(&sctx->stat_lock);
941 sdev->stat.corrected_errors++; 1143 sctx->stat.corrected_errors++;
942 spin_unlock(&sdev->stat_lock); 1144 spin_unlock(&sctx->stat_lock);
943 printk_ratelimited_in_rcu(KERN_ERR 1145 printk_ratelimited_in_rcu(KERN_ERR
944 "btrfs: fixed up error at logical %llu on dev %s\n", 1146 "btrfs: fixed up error at logical %llu on dev %s\n",
945 (unsigned long long)logical, 1147 (unsigned long long)logical,
946 rcu_str_deref(sdev->dev->name)); 1148 rcu_str_deref(dev->name));
947 } 1149 }
948 } else { 1150 } else {
949did_not_correct_error: 1151did_not_correct_error:
950 spin_lock(&sdev->stat_lock); 1152 spin_lock(&sctx->stat_lock);
951 sdev->stat.uncorrectable_errors++; 1153 sctx->stat.uncorrectable_errors++;
952 spin_unlock(&sdev->stat_lock); 1154 spin_unlock(&sctx->stat_lock);
953 printk_ratelimited_in_rcu(KERN_ERR 1155 printk_ratelimited_in_rcu(KERN_ERR
954 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1156 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
955 (unsigned long long)logical, 1157 (unsigned long long)logical,
956 rcu_str_deref(sdev->dev->name)); 1158 rcu_str_deref(dev->name));
957 } 1159 }
958 1160
959out: 1161out:
@@ -964,11 +1166,11 @@ out:
964 mirror_index; 1166 mirror_index;
965 int page_index; 1167 int page_index;
966 1168
967 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1169 for (page_index = 0; page_index < sblock->page_count;
968 page_index++) 1170 page_index++) {
969 if (sblock->pagev[page_index].page) 1171 sblock->pagev[page_index]->sblock = NULL;
970 __free_page( 1172 scrub_page_put(sblock->pagev[page_index]);
971 sblock->pagev[page_index].page); 1173 }
972 } 1174 }
973 kfree(sblocks_for_recheck); 1175 kfree(sblocks_for_recheck);
974 } 1176 }
@@ -976,8 +1178,9 @@ out:
976 return 0; 1178 return 0;
977} 1179}
978 1180
979static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1181static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
980 struct btrfs_mapping_tree *map_tree, 1182 struct btrfs_fs_info *fs_info,
1183 struct scrub_block *original_sblock,
981 u64 length, u64 logical, 1184 u64 length, u64 logical,
982 struct scrub_block *sblocks_for_recheck) 1185 struct scrub_block *sblocks_for_recheck)
983{ 1186{
@@ -986,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
986 int ret; 1189 int ret;
987 1190
988 /* 1191 /*
989 * note: the three members sdev, ref_count and outstanding_pages 1192 * note: the two members ref_count and outstanding_pages
990 * are not used (and not set) in the blocks that are used for 1193 * are not used (and not set) in the blocks that are used for
991 * the recheck procedure 1194 * the recheck procedure
992 */ 1195 */
@@ -1001,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1001 * with a length of PAGE_SIZE, each returned stripe 1204 * with a length of PAGE_SIZE, each returned stripe
1002 * represents one mirror 1205 * represents one mirror
1003 */ 1206 */
1004 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1207 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1005 &bbio, 0); 1208 &mapped_length, &bbio, 0);
1006 if (ret || !bbio || mapped_length < sublen) { 1209 if (ret || !bbio || mapped_length < sublen) {
1007 kfree(bbio); 1210 kfree(bbio);
1008 return -EIO; 1211 return -EIO;
1009 } 1212 }
1010 1213
1011 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1214 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1012 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1215 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1013 mirror_index++) { 1216 mirror_index++) {
1014 struct scrub_block *sblock; 1217 struct scrub_block *sblock;
@@ -1018,20 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1018 continue; 1221 continue;
1019 1222
1020 sblock = sblocks_for_recheck + mirror_index; 1223 sblock = sblocks_for_recheck + mirror_index;
1021 page = sblock->pagev + page_index; 1224 sblock->sctx = sctx;
1225 page = kzalloc(sizeof(*page), GFP_NOFS);
1226 if (!page) {
1227leave_nomem:
1228 spin_lock(&sctx->stat_lock);
1229 sctx->stat.malloc_errors++;
1230 spin_unlock(&sctx->stat_lock);
1231 kfree(bbio);
1232 return -ENOMEM;
1233 }
1234 scrub_page_get(page);
1235 sblock->pagev[page_index] = page;
1022 page->logical = logical; 1236 page->logical = logical;
1023 page->physical = bbio->stripes[mirror_index].physical; 1237 page->physical = bbio->stripes[mirror_index].physical;
1238 BUG_ON(page_index >= original_sblock->page_count);
1239 page->physical_for_dev_replace =
1240 original_sblock->pagev[page_index]->
1241 physical_for_dev_replace;
1024 /* for missing devices, dev->bdev is NULL */ 1242 /* for missing devices, dev->bdev is NULL */
1025 page->dev = bbio->stripes[mirror_index].dev; 1243 page->dev = bbio->stripes[mirror_index].dev;
1026 page->mirror_num = mirror_index + 1; 1244 page->mirror_num = mirror_index + 1;
1027 page->page = alloc_page(GFP_NOFS);
1028 if (!page->page) {
1029 spin_lock(&sdev->stat_lock);
1030 sdev->stat.malloc_errors++;
1031 spin_unlock(&sdev->stat_lock);
1032 return -ENOMEM;
1033 }
1034 sblock->page_count++; 1245 sblock->page_count++;
1246 page->page = alloc_page(GFP_NOFS);
1247 if (!page->page)
1248 goto leave_nomem;
1035 } 1249 }
1036 kfree(bbio); 1250 kfree(bbio);
1037 length -= sublen; 1251 length -= sublen;
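The recheck pages are now individually allocated and refcounted via scrub_page_get() above. The get/put bodies are outside this hunk; a minimal implementation consistent with the declarations and the new ref_count field would be:

	static void scrub_page_get(struct scrub_page *spage)
	{
		atomic_inc(&spage->ref_count);
	}

	static void scrub_page_put(struct scrub_page *spage)
	{
		if (atomic_dec_and_test(&spage->ref_count)) {
			if (spage->page)
				__free_page(spage->page);
			kfree(spage);
		}
	}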
@@ -1049,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1049 * to take those pages that are not errored from all the mirrors so that 1263 * to take those pages that are not errored from all the mirrors so that
 1050	 * the pages that are errored in the mirror just handled can be repaired. 1264	 * the pages that are errored in the mirror just handled can be repaired.
1051 */ 1265 */
1052static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1266static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1053 struct scrub_block *sblock, int is_metadata, 1267 struct scrub_block *sblock, int is_metadata,
1054 int have_csum, u8 *csum, u64 generation, 1268 int have_csum, u8 *csum, u64 generation,
1055 u16 csum_size) 1269 u16 csum_size)
1056{ 1270{
1057 int page_num; 1271 int page_num;
1058 1272
@@ -1062,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1062 1276
1063 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1277 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1064 struct bio *bio; 1278 struct bio *bio;
1065 int ret; 1279 struct scrub_page *page = sblock->pagev[page_num];
1066 struct scrub_page *page = sblock->pagev + page_num;
1067 DECLARE_COMPLETION_ONSTACK(complete); 1280 DECLARE_COMPLETION_ONSTACK(complete);
1068 1281
1069 if (page->dev->bdev == NULL) { 1282 if (page->dev->bdev == NULL) {
@@ -1072,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1072 continue; 1285 continue;
1073 } 1286 }
1074 1287
1075 BUG_ON(!page->page); 1288 WARN_ON(!page->page);
1076 bio = bio_alloc(GFP_NOFS, 1); 1289 bio = bio_alloc(GFP_NOFS, 1);
1077 if (!bio) 1290 if (!bio) {
1078 return -EIO; 1291 page->io_error = 1;
1292 sblock->no_io_error_seen = 0;
1293 continue;
1294 }
1079 bio->bi_bdev = page->dev->bdev; 1295 bio->bi_bdev = page->dev->bdev;
1080 bio->bi_sector = page->physical >> 9; 1296 bio->bi_sector = page->physical >> 9;
1081 bio->bi_end_io = scrub_complete_bio_end_io; 1297 bio->bi_end_io = scrub_complete_bio_end_io;
1082 bio->bi_private = &complete; 1298 bio->bi_private = &complete;
1083 1299
1084 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1300 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1085 if (PAGE_SIZE != ret) {
1086 bio_put(bio);
1087 return -EIO;
1088 }
1089 btrfsic_submit_bio(READ, bio); 1301 btrfsic_submit_bio(READ, bio);
1090 1302
1091 /* this will also unplug the queue */ 1303 /* this will also unplug the queue */
@@ -1102,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1102 have_csum, csum, generation, 1314 have_csum, csum, generation,
1103 csum_size); 1315 csum_size);
1104 1316
1105 return 0; 1317 return;
1106} 1318}
1107 1319
1108static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1320static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1117,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1117 struct btrfs_root *root = fs_info->extent_root; 1329 struct btrfs_root *root = fs_info->extent_root;
1118 void *mapped_buffer; 1330 void *mapped_buffer;
1119 1331
1120 BUG_ON(!sblock->pagev[0].page); 1332 WARN_ON(!sblock->pagev[0]->page);
1121 if (is_metadata) { 1333 if (is_metadata) {
1122 struct btrfs_header *h; 1334 struct btrfs_header *h;
1123 1335
1124 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1336 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1125 h = (struct btrfs_header *)mapped_buffer; 1337 h = (struct btrfs_header *)mapped_buffer;
1126 1338
1127 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1339 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1128 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1340 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1129 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1341 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1130 BTRFS_UUID_SIZE)) { 1342 BTRFS_UUID_SIZE)) {
@@ -1138,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1138 if (!have_csum) 1350 if (!have_csum)
1139 return; 1351 return;
1140 1352
1141 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1353 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1142 } 1354 }
1143 1355
1144 for (page_num = 0;;) { 1356 for (page_num = 0;;) {
@@ -1154,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1154 page_num++; 1366 page_num++;
1155 if (page_num >= sblock->page_count) 1367 if (page_num >= sblock->page_count)
1156 break; 1368 break;
1157 BUG_ON(!sblock->pagev[page_num].page); 1369 WARN_ON(!sblock->pagev[page_num]->page);
1158 1370
1159 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1371 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1160 } 1372 }
1161 1373
1162 btrfs_csum_final(crc, calculated_csum); 1374 btrfs_csum_final(crc, calculated_csum);
@@ -1194,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1194 struct scrub_block *sblock_good, 1406 struct scrub_block *sblock_good,
1195 int page_num, int force_write) 1407 int page_num, int force_write)
1196{ 1408{
1197 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1409 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1198 struct scrub_page *page_good = sblock_good->pagev + page_num; 1410 struct scrub_page *page_good = sblock_good->pagev[page_num];
1199 1411
1200 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1412 BUG_ON(page_bad->page == NULL);
1201 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1413 BUG_ON(page_good->page == NULL);
1202 if (force_write || sblock_bad->header_error || 1414 if (force_write || sblock_bad->header_error ||
1203 sblock_bad->checksum_error || page_bad->io_error) { 1415 sblock_bad->checksum_error || page_bad->io_error) {
1204 struct bio *bio; 1416 struct bio *bio;
1205 int ret; 1417 int ret;
1206 DECLARE_COMPLETION_ONSTACK(complete); 1418 DECLARE_COMPLETION_ONSTACK(complete);
1207 1419
1420 if (!page_bad->dev->bdev) {
1421 printk_ratelimited(KERN_WARNING
1422 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1423 return -EIO;
1424 }
1425
1208 bio = bio_alloc(GFP_NOFS, 1); 1426 bio = bio_alloc(GFP_NOFS, 1);
1209 if (!bio) 1427 if (!bio)
1210 return -EIO; 1428 return -EIO;
@@ -1225,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1225 if (!bio_flagged(bio, BIO_UPTODATE)) { 1443 if (!bio_flagged(bio, BIO_UPTODATE)) {
1226 btrfs_dev_stat_inc_and_print(page_bad->dev, 1444 btrfs_dev_stat_inc_and_print(page_bad->dev,
1227 BTRFS_DEV_STAT_WRITE_ERRS); 1445 BTRFS_DEV_STAT_WRITE_ERRS);
1446 btrfs_dev_replace_stats_inc(
1447 &sblock_bad->sctx->dev_root->fs_info->
1448 dev_replace.num_write_errors);
1228 bio_put(bio); 1449 bio_put(bio);
1229 return -EIO; 1450 return -EIO;
1230 } 1451 }
@@ -1234,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1234 return 0; 1455 return 0;
1235} 1456}
1236 1457
1237static void scrub_checksum(struct scrub_block *sblock) 1458static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1459{
1460 int page_num;
1461
1462 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1463 int ret;
1464
1465 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1466 if (ret)
1467 btrfs_dev_replace_stats_inc(
1468 &sblock->sctx->dev_root->fs_info->dev_replace.
1469 num_write_errors);
1470 }
1471}
1472
1473static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1474 int page_num)
1475{
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 BUG_ON(spage->page == NULL);
1479 if (spage->io_error) {
1480 void *mapped_buffer = kmap_atomic(spage->page);
1481
1482 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1483 flush_dcache_page(spage->page);
1484 kunmap_atomic(mapped_buffer);
1485 }
1486 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1487}
1488
1489static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1490 struct scrub_page *spage)
1491{
1492 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1493 struct scrub_bio *sbio;
1494 int ret;
1495
1496 mutex_lock(&wr_ctx->wr_lock);
1497again:
1498 if (!wr_ctx->wr_curr_bio) {
1499 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1500 GFP_NOFS);
1501 if (!wr_ctx->wr_curr_bio) {
1502 mutex_unlock(&wr_ctx->wr_lock);
1503 return -ENOMEM;
1504 }
1505 wr_ctx->wr_curr_bio->sctx = sctx;
1506 wr_ctx->wr_curr_bio->page_count = 0;
1507 }
1508 sbio = wr_ctx->wr_curr_bio;
1509 if (sbio->page_count == 0) {
1510 struct bio *bio;
1511
1512 sbio->physical = spage->physical_for_dev_replace;
1513 sbio->logical = spage->logical;
1514 sbio->dev = wr_ctx->tgtdev;
1515 bio = sbio->bio;
1516 if (!bio) {
1517 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1518 if (!bio) {
1519 mutex_unlock(&wr_ctx->wr_lock);
1520 return -ENOMEM;
1521 }
1522 sbio->bio = bio;
1523 }
1524
1525 bio->bi_private = sbio;
1526 bio->bi_end_io = scrub_wr_bio_end_io;
1527 bio->bi_bdev = sbio->dev->bdev;
1528 bio->bi_sector = sbio->physical >> 9;
1529 sbio->err = 0;
1530 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1531 spage->physical_for_dev_replace ||
1532 sbio->logical + sbio->page_count * PAGE_SIZE !=
1533 spage->logical) {
1534 scrub_wr_submit(sctx);
1535 goto again;
1536 }
1537
1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1539 if (ret != PAGE_SIZE) {
1540 if (sbio->page_count < 1) {
1541 bio_put(sbio->bio);
1542 sbio->bio = NULL;
1543 mutex_unlock(&wr_ctx->wr_lock);
1544 return -EIO;
1545 }
1546 scrub_wr_submit(sctx);
1547 goto again;
1548 }
1549
1550 sbio->pagev[sbio->page_count] = spage;
1551 scrub_page_get(spage);
1552 sbio->page_count++;
1553 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1554 scrub_wr_submit(sctx);
1555 mutex_unlock(&wr_ctx->wr_lock);
1556
1557 return 0;
1558}
1559
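The write path above batches pages into one bio only while both the physical and the logical address stay contiguous; any mismatch submits the current bio and retries with a fresh one. A minimal userspace sketch of that accumulate-or-flush logic, with struct batch, PAGE_SZ and MAX_PAGES as made-up stand-ins for the scrub_bio machinery, not kernel APIs:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ   4096u
#define MAX_PAGES 16

struct batch {
	uint64_t physical;	/* disk offset of the first page in the run */
	uint64_t logical;	/* logical offset of the first page */
	int count;		/* pages accumulated so far */
};

static void submit(struct batch *b)
{
	if (b->count)
		printf("submit %d page(s) @ physical %llu\n",
		       b->count, (unsigned long long)b->physical);
	b->count = 0;
}

/* add one page; flush first if it breaks contiguity or fills the bio */
static void add_page(struct batch *b, uint64_t physical, uint64_t logical)
{
again:
	if (b->count == 0) {
		b->physical = physical;
		b->logical = logical;
	} else if (b->physical + (uint64_t)b->count * PAGE_SZ != physical ||
		   b->logical + (uint64_t)b->count * PAGE_SZ != logical) {
		submit(b);		/* non-contiguous: flush, start over */
		goto again;
	}
	b->count++;
	if (b->count == MAX_PAGES)
		submit(b);		/* bio is full: flush eagerly */
}

int main(void)
{
	struct batch b = { 0 };

	add_page(&b, 0, 0);
	add_page(&b, PAGE_SZ, PAGE_SZ);			/* same batch */
	add_page(&b, 10 * PAGE_SZ, 2 * PAGE_SZ);	/* forces a flush */
	submit(&b);					/* drain the tail */
	return 0;
}

The same goto-again shape appears in both the read variant (scrub_add_page_to_rd_bio) and the write variant above.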
1560static void scrub_wr_submit(struct scrub_ctx *sctx)
1561{
1562 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1563 struct scrub_bio *sbio;
1564
1565 if (!wr_ctx->wr_curr_bio)
1566 return;
1567
1568 sbio = wr_ctx->wr_curr_bio;
1569 wr_ctx->wr_curr_bio = NULL;
1570 WARN_ON(!sbio->bio->bi_bdev);
1571 scrub_pending_bio_inc(sctx);
 1572 /* process all writes in a single worker thread, so that the
 1573 * block layer can order the requests before sending them to the
 1574 * driver; this doubled the write performance on spinning disks
 1575 * when measured with Linux 3.5 */
1576 btrfsic_submit_bio(WRITE, sbio->bio);
1577}
1578
1579static void scrub_wr_bio_end_io(struct bio *bio, int err)
1580{
1581 struct scrub_bio *sbio = bio->bi_private;
1582 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1583
1584 sbio->err = err;
1585 sbio->bio = bio;
1586
1587 sbio->work.func = scrub_wr_bio_end_io_worker;
1588 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1589}
1590
1591static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1592{
1593 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1594 struct scrub_ctx *sctx = sbio->sctx;
1595 int i;
1596
1597 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1598 if (sbio->err) {
1599 struct btrfs_dev_replace *dev_replace =
1600 &sbio->sctx->dev_root->fs_info->dev_replace;
1601
1602 for (i = 0; i < sbio->page_count; i++) {
1603 struct scrub_page *spage = sbio->pagev[i];
1604
1605 spage->io_error = 1;
1606 btrfs_dev_replace_stats_inc(&dev_replace->
1607 num_write_errors);
1608 }
1609 }
1610
1611 for (i = 0; i < sbio->page_count; i++)
1612 scrub_page_put(sbio->pagev[i]);
1613
1614 bio_put(sbio->bio);
1615 kfree(sbio);
1616 scrub_pending_bio_dec(sctx);
1617}
1618
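scrub_wr_bio_end_io() runs in bio completion context, so it only records the result and hands the real work to scrub_wr_bio_end_io_worker via the scrub_wr_completion_workers queue. A pthreads sketch of that defer-to-worker idiom, under the assumption that a LIFO list is an acceptable simplification of the kernel's FIFO workqueue; all names here are made up:

#include <pthread.h>
#include <stddef.h>

struct work {
	void (*func)(struct work *w);
	struct work *next;
	int err;		/* result captured in the completion path */
};

static struct work *queue_head;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  queue_cond = PTHREAD_COND_INITIALIZER;

static void queue_work(struct work *w)
{
	pthread_mutex_lock(&queue_lock);
	w->next = queue_head;
	queue_head = w;
	pthread_cond_signal(&queue_cond);
	pthread_mutex_unlock(&queue_lock);
}

/* worker thread: pop items and run them outside completion context */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&queue_lock);
		while (!queue_head)
			pthread_cond_wait(&queue_cond, &queue_lock);
		struct work *w = queue_head;
		queue_head = w->next;
		pthread_mutex_unlock(&queue_lock);
		w->func(w);
	}
	return NULL;
}

/* "end_io": record the error, set the handler, queue the item */
static void end_io(struct work *w, int err, void (*handler)(struct work *))
{
	w->err = err;
	w->func = handler;
	queue_work(w);
}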
1619static int scrub_checksum(struct scrub_block *sblock)
1238{ 1620{
1239 u64 flags; 1621 u64 flags;
1240 int ret; 1622 int ret;
1241 1623
1242 BUG_ON(sblock->page_count < 1); 1624 WARN_ON(sblock->page_count < 1);
1243 flags = sblock->pagev[0].flags; 1625 flags = sblock->pagev[0]->flags;
1244 ret = 0; 1626 ret = 0;
1245 if (flags & BTRFS_EXTENT_FLAG_DATA) 1627 if (flags & BTRFS_EXTENT_FLAG_DATA)
1246 ret = scrub_checksum_data(sblock); 1628 ret = scrub_checksum_data(sblock);
@@ -1252,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1252 WARN_ON(1); 1634 WARN_ON(1);
1253 if (ret) 1635 if (ret)
1254 scrub_handle_errored_block(sblock); 1636 scrub_handle_errored_block(sblock);
1637
1638 return ret;
1255} 1639}
1256 1640
1257static int scrub_checksum_data(struct scrub_block *sblock) 1641static int scrub_checksum_data(struct scrub_block *sblock)
1258{ 1642{
1259 struct scrub_dev *sdev = sblock->sdev; 1643 struct scrub_ctx *sctx = sblock->sctx;
1260 u8 csum[BTRFS_CSUM_SIZE]; 1644 u8 csum[BTRFS_CSUM_SIZE];
1261 u8 *on_disk_csum; 1645 u8 *on_disk_csum;
1262 struct page *page; 1646 struct page *page;
1263 void *buffer; 1647 void *buffer;
1264 u32 crc = ~(u32)0; 1648 u32 crc = ~(u32)0;
1265 int fail = 0; 1649 int fail = 0;
1266 struct btrfs_root *root = sdev->dev->dev_root; 1650 struct btrfs_root *root = sctx->dev_root;
1267 u64 len; 1651 u64 len;
1268 int index; 1652 int index;
1269 1653
1270 BUG_ON(sblock->page_count < 1); 1654 BUG_ON(sblock->page_count < 1);
1271 if (!sblock->pagev[0].have_csum) 1655 if (!sblock->pagev[0]->have_csum)
1272 return 0; 1656 return 0;
1273 1657
1274 on_disk_csum = sblock->pagev[0].csum; 1658 on_disk_csum = sblock->pagev[0]->csum;
1275 page = sblock->pagev[0].page; 1659 page = sblock->pagev[0]->page;
1276 buffer = kmap_atomic(page); 1660 buffer = kmap_atomic(page);
1277 1661
1278 len = sdev->sectorsize; 1662 len = sctx->sectorsize;
1279 index = 0; 1663 index = 0;
1280 for (;;) { 1664 for (;;) {
1281 u64 l = min_t(u64, len, PAGE_SIZE); 1665 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1287,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1287 break; 1671 break;
1288 index++; 1672 index++;
1289 BUG_ON(index >= sblock->page_count); 1673 BUG_ON(index >= sblock->page_count);
1290 BUG_ON(!sblock->pagev[index].page); 1674 BUG_ON(!sblock->pagev[index]->page);
1291 page = sblock->pagev[index].page; 1675 page = sblock->pagev[index]->page;
1292 buffer = kmap_atomic(page); 1676 buffer = kmap_atomic(page);
1293 } 1677 }
1294 1678
1295 btrfs_csum_final(crc, csum); 1679 btrfs_csum_final(crc, csum);
1296 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1680 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1297 fail = 1; 1681 fail = 1;
1298 1682
1299 return fail; 1683 return fail;
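scrub_checksum_data() feeds the block to the checksum in page-sized chunks because the block's pages are not virtually contiguous, seeding the CRC with ~0 and inverting it at the end as btrfs_csum_final() does. A standalone sketch of the chunked accumulation; the bitwise CRC-32C helper is an unoptimized stand-in for the kernel's crc32c, not its implementation:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SZ 4096u

/* bitwise CRC-32C (Castagnoli, reflected); illustration only */
static uint32_t crc32c_update(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}

/* checksum 'len' bytes that are spread over separate page buffers */
static uint32_t csum_pages(uint8_t * const *pages, int page_count,
			   uint64_t len)
{
	uint32_t crc = ~(uint32_t)0;	/* same seed as the scrub code */
	int index = 0;

	while (len) {
		uint64_t l = len < PAGE_SZ ? len : PAGE_SZ;

		assert(index < page_count);	/* mirrors the BUG_ON */
		crc = crc32c_update(crc, pages[index++], (size_t)l);
		len -= l;
	}
	return ~crc;	/* final inversion, as in btrfs_csum_final() */
}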
@@ -1301,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1301 1685
1302static int scrub_checksum_tree_block(struct scrub_block *sblock) 1686static int scrub_checksum_tree_block(struct scrub_block *sblock)
1303{ 1687{
1304 struct scrub_dev *sdev = sblock->sdev; 1688 struct scrub_ctx *sctx = sblock->sctx;
1305 struct btrfs_header *h; 1689 struct btrfs_header *h;
1306 struct btrfs_root *root = sdev->dev->dev_root; 1690 struct btrfs_root *root = sctx->dev_root;
1307 struct btrfs_fs_info *fs_info = root->fs_info; 1691 struct btrfs_fs_info *fs_info = root->fs_info;
1308 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1692 u8 calculated_csum[BTRFS_CSUM_SIZE];
1309 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1693 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1318,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1318 int index; 1702 int index;
1319 1703
1320 BUG_ON(sblock->page_count < 1); 1704 BUG_ON(sblock->page_count < 1);
1321 page = sblock->pagev[0].page; 1705 page = sblock->pagev[0]->page;
1322 mapped_buffer = kmap_atomic(page); 1706 mapped_buffer = kmap_atomic(page);
1323 h = (struct btrfs_header *)mapped_buffer; 1707 h = (struct btrfs_header *)mapped_buffer;
1324 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1708 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1325 1709
1326 /* 1710 /*
1327 * we don't use the getter functions here, as we 1711 * we don't use the getter functions here, as we
@@ -1329,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1329 * b) the page is already kmapped 1713 * b) the page is already kmapped
1330 */ 1714 */
1331 1715
1332 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1716 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1333 ++fail; 1717 ++fail;
1334 1718
1335 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1719 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1336 ++fail; 1720 ++fail;
1337 1721
1338 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1722 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1342,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1342 BTRFS_UUID_SIZE)) 1726 BTRFS_UUID_SIZE))
1343 ++fail; 1727 ++fail;
1344 1728
1345 BUG_ON(sdev->nodesize != sdev->leafsize); 1729 WARN_ON(sctx->nodesize != sctx->leafsize);
1346 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1730 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1347 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1731 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1348 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1732 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1349 index = 0; 1733 index = 0;
@@ -1357,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1357 break; 1741 break;
1358 index++; 1742 index++;
1359 BUG_ON(index >= sblock->page_count); 1743 BUG_ON(index >= sblock->page_count);
1360 BUG_ON(!sblock->pagev[index].page); 1744 BUG_ON(!sblock->pagev[index]->page);
1361 page = sblock->pagev[index].page; 1745 page = sblock->pagev[index]->page;
1362 mapped_buffer = kmap_atomic(page); 1746 mapped_buffer = kmap_atomic(page);
1363 mapped_size = PAGE_SIZE; 1747 mapped_size = PAGE_SIZE;
1364 p = mapped_buffer; 1748 p = mapped_buffer;
1365 } 1749 }
1366 1750
1367 btrfs_csum_final(crc, calculated_csum); 1751 btrfs_csum_final(crc, calculated_csum);
1368 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1752 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1369 ++crc_fail; 1753 ++crc_fail;
1370 1754
1371 return fail || crc_fail; 1755 return fail || crc_fail;
@@ -1374,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1374static int scrub_checksum_super(struct scrub_block *sblock) 1758static int scrub_checksum_super(struct scrub_block *sblock)
1375{ 1759{
1376 struct btrfs_super_block *s; 1760 struct btrfs_super_block *s;
1377 struct scrub_dev *sdev = sblock->sdev; 1761 struct scrub_ctx *sctx = sblock->sctx;
1378 struct btrfs_root *root = sdev->dev->dev_root; 1762 struct btrfs_root *root = sctx->dev_root;
1379 struct btrfs_fs_info *fs_info = root->fs_info; 1763 struct btrfs_fs_info *fs_info = root->fs_info;
1380 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1764 u8 calculated_csum[BTRFS_CSUM_SIZE];
1381 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1765 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1390,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1390 int index; 1774 int index;
1391 1775
1392 BUG_ON(sblock->page_count < 1); 1776 BUG_ON(sblock->page_count < 1);
1393 page = sblock->pagev[0].page; 1777 page = sblock->pagev[0]->page;
1394 mapped_buffer = kmap_atomic(page); 1778 mapped_buffer = kmap_atomic(page);
1395 s = (struct btrfs_super_block *)mapped_buffer; 1779 s = (struct btrfs_super_block *)mapped_buffer;
1396 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1780 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1397 1781
1398 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1782 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1399 ++fail_cor; 1783 ++fail_cor;
1400 1784
1401 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1785 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1402 ++fail_gen; 1786 ++fail_gen;
1403 1787
1404 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1788 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1418,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1418 break; 1802 break;
1419 index++; 1803 index++;
1420 BUG_ON(index >= sblock->page_count); 1804 BUG_ON(index >= sblock->page_count);
1421 BUG_ON(!sblock->pagev[index].page); 1805 BUG_ON(!sblock->pagev[index]->page);
1422 page = sblock->pagev[index].page; 1806 page = sblock->pagev[index]->page;
1423 mapped_buffer = kmap_atomic(page); 1807 mapped_buffer = kmap_atomic(page);
1424 mapped_size = PAGE_SIZE; 1808 mapped_size = PAGE_SIZE;
1425 p = mapped_buffer; 1809 p = mapped_buffer;
1426 } 1810 }
1427 1811
1428 btrfs_csum_final(crc, calculated_csum); 1812 btrfs_csum_final(crc, calculated_csum);
1429 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1813 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1430 ++fail_cor; 1814 ++fail_cor;
1431 1815
1432 if (fail_cor + fail_gen) { 1816 if (fail_cor + fail_gen) {
@@ -1435,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1435 * They will get written with the next transaction commit 1819 * They will get written with the next transaction commit
1436 * anyway 1820 * anyway
1437 */ 1821 */
1438 spin_lock(&sdev->stat_lock); 1822 spin_lock(&sctx->stat_lock);
1439 ++sdev->stat.super_errors; 1823 ++sctx->stat.super_errors;
1440 spin_unlock(&sdev->stat_lock); 1824 spin_unlock(&sctx->stat_lock);
1441 if (fail_cor) 1825 if (fail_cor)
1442 btrfs_dev_stat_inc_and_print(sdev->dev, 1826 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1443 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1827 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1444 else 1828 else
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1829 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_GENERATION_ERRS); 1830 BTRFS_DEV_STAT_GENERATION_ERRS);
1447 } 1831 }
1448 1832
@@ -1460,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1460 int i; 1844 int i;
1461 1845
1462 for (i = 0; i < sblock->page_count; i++) 1846 for (i = 0; i < sblock->page_count; i++)
1463 if (sblock->pagev[i].page) 1847 scrub_page_put(sblock->pagev[i]);
1464 __free_page(sblock->pagev[i].page);
1465 kfree(sblock); 1848 kfree(sblock);
1466 } 1849 }
1467} 1850}
1468 1851
1469static void scrub_submit(struct scrub_dev *sdev) 1852static void scrub_page_get(struct scrub_page *spage)
1853{
1854 atomic_inc(&spage->ref_count);
1855}
1856
1857static void scrub_page_put(struct scrub_page *spage)
1858{
1859 if (atomic_dec_and_test(&spage->ref_count)) {
1860 if (spage->page)
1861 __free_page(spage->page);
1862 kfree(spage);
1863 }
1864}
1865
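scrub_page is now individually reference counted, so one page can be held by its block, a read bio and a write bio at the same time; only the last put frees it. The same idiom in C11, with stdatomic standing in for the kernel's atomic_t and my_page as a made-up payload type:

#include <stdatomic.h>
#include <stdlib.h>

struct my_page {
	atomic_int ref_count;
	void *data;
};

static struct my_page *page_alloc(void)
{
	struct my_page *p = calloc(1, sizeof(*p));

	if (p) {
		atomic_init(&p->ref_count, 1);	/* caller owns one ref */
		p->data = malloc(4096);		/* error handling elided */
	}
	return p;
}

static void page_get(struct my_page *p)
{
	atomic_fetch_add(&p->ref_count, 1);
}

/* like atomic_dec_and_test(): only the final put frees */
static void page_put(struct my_page *p)
{
	if (atomic_fetch_sub(&p->ref_count, 1) == 1) {
		free(p->data);
		free(p);
	}
}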
1866static void scrub_submit(struct scrub_ctx *sctx)
1470{ 1867{
1471 struct scrub_bio *sbio; 1868 struct scrub_bio *sbio;
1472 1869
1473 if (sdev->curr == -1) 1870 if (sctx->curr == -1)
1474 return; 1871 return;
1475 1872
1476 sbio = sdev->bios[sdev->curr]; 1873 sbio = sctx->bios[sctx->curr];
1477 sdev->curr = -1; 1874 sctx->curr = -1;
1478 atomic_inc(&sdev->in_flight); 1875 scrub_pending_bio_inc(sctx);
1479 1876
1480 btrfsic_submit_bio(READ, sbio->bio); 1877 if (!sbio->bio->bi_bdev) {
1878 /*
 1879 * this case should not happen: a wrong btrfs_map_block()
 1880 * could produce it for dev-replace operations on missing
 1881 * devices when no mirrors are available, but then the
 1882 * mount should already have failed.
 1883 * The case is handled correctly here (just _very_ slowly).
1884 */
1885 printk_ratelimited(KERN_WARNING
1886 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1887 bio_endio(sbio->bio, -EIO);
1888 } else {
1889 btrfsic_submit_bio(READ, sbio->bio);
1890 }
1481} 1891}
1482 1892
1483static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1893static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1484 struct scrub_page *spage) 1894 struct scrub_page *spage)
1485{ 1895{
1486 struct scrub_block *sblock = spage->sblock; 1896 struct scrub_block *sblock = spage->sblock;
1487 struct scrub_bio *sbio; 1897 struct scrub_bio *sbio;
@@ -1491,28 +1901,29 @@ again:
1491 /* 1901 /*
1492 * grab a fresh bio or wait for one to become available 1902 * grab a fresh bio or wait for one to become available
1493 */ 1903 */
1494 while (sdev->curr == -1) { 1904 while (sctx->curr == -1) {
1495 spin_lock(&sdev->list_lock); 1905 spin_lock(&sctx->list_lock);
1496 sdev->curr = sdev->first_free; 1906 sctx->curr = sctx->first_free;
1497 if (sdev->curr != -1) { 1907 if (sctx->curr != -1) {
1498 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1908 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1499 sdev->bios[sdev->curr]->next_free = -1; 1909 sctx->bios[sctx->curr]->next_free = -1;
1500 sdev->bios[sdev->curr]->page_count = 0; 1910 sctx->bios[sctx->curr]->page_count = 0;
1501 spin_unlock(&sdev->list_lock); 1911 spin_unlock(&sctx->list_lock);
1502 } else { 1912 } else {
1503 spin_unlock(&sdev->list_lock); 1913 spin_unlock(&sctx->list_lock);
1504 wait_event(sdev->list_wait, sdev->first_free != -1); 1914 wait_event(sctx->list_wait, sctx->first_free != -1);
1505 } 1915 }
1506 } 1916 }
1507 sbio = sdev->bios[sdev->curr]; 1917 sbio = sctx->bios[sctx->curr];
1508 if (sbio->page_count == 0) { 1918 if (sbio->page_count == 0) {
1509 struct bio *bio; 1919 struct bio *bio;
1510 1920
1511 sbio->physical = spage->physical; 1921 sbio->physical = spage->physical;
1512 sbio->logical = spage->logical; 1922 sbio->logical = spage->logical;
1923 sbio->dev = spage->dev;
1513 bio = sbio->bio; 1924 bio = sbio->bio;
1514 if (!bio) { 1925 if (!bio) {
1515 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1926 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1516 if (!bio) 1927 if (!bio)
1517 return -ENOMEM; 1928 return -ENOMEM;
1518 sbio->bio = bio; 1929 sbio->bio = bio;
@@ -1520,14 +1931,15 @@ again:
1520 1931
1521 bio->bi_private = sbio; 1932 bio->bi_private = sbio;
1522 bio->bi_end_io = scrub_bio_end_io; 1933 bio->bi_end_io = scrub_bio_end_io;
1523 bio->bi_bdev = sdev->dev->bdev; 1934 bio->bi_bdev = sbio->dev->bdev;
1524 bio->bi_sector = spage->physical >> 9; 1935 bio->bi_sector = sbio->physical >> 9;
1525 sbio->err = 0; 1936 sbio->err = 0;
1526 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1937 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1527 spage->physical || 1938 spage->physical ||
1528 sbio->logical + sbio->page_count * PAGE_SIZE != 1939 sbio->logical + sbio->page_count * PAGE_SIZE !=
1529 spage->logical) { 1940 spage->logical ||
1530 scrub_submit(sdev); 1941 sbio->dev != spage->dev) {
1942 scrub_submit(sctx);
1531 goto again; 1943 goto again;
1532 } 1944 }
1533 1945
@@ -1539,81 +1951,87 @@ again:
1539 sbio->bio = NULL; 1951 sbio->bio = NULL;
1540 return -EIO; 1952 return -EIO;
1541 } 1953 }
1542 scrub_submit(sdev); 1954 scrub_submit(sctx);
1543 goto again; 1955 goto again;
1544 } 1956 }
1545 1957
1546 scrub_block_get(sblock); /* one for the added page */ 1958 scrub_block_get(sblock); /* one for the page added to the bio */
1547 atomic_inc(&sblock->outstanding_pages); 1959 atomic_inc(&sblock->outstanding_pages);
1548 sbio->page_count++; 1960 sbio->page_count++;
1549 if (sbio->page_count == sdev->pages_per_bio) 1961 if (sbio->page_count == sctx->pages_per_rd_bio)
1550 scrub_submit(sdev); 1962 scrub_submit(sctx);
1551 1963
1552 return 0; 1964 return 0;
1553} 1965}
1554 1966
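The read path draws scrub_bio slots from a fixed pool whose free slots are chained by index through first_free/next_free, sleeping on list_wait when the pool is empty. A single-threaded sketch of that index-linked free list, with the spinlock and waitqueue deliberately elided:

#include <stddef.h>

#define POOL_SIZE 4

struct slot {
	int index;	/* position of this slot in the pool */
	int next_free;	/* index of the next free slot, or -1 */
};

static struct slot pool[POOL_SIZE];
static int first_free = -1;	/* head of the free list */

static void pool_init(void)
{
	for (int i = 0; i < POOL_SIZE; i++) {
		pool[i].index = i;
		pool[i].next_free = (i + 1 < POOL_SIZE) ? i + 1 : -1;
	}
	first_free = 0;
}

static struct slot *slot_get(void)
{
	if (first_free == -1)
		return NULL;	/* scrub would wait_event() here */

	struct slot *s = &pool[first_free];
	first_free = s->next_free;
	s->next_free = -1;
	return s;
}

static void slot_put(struct slot *s)
{
	s->next_free = first_free;
	first_free = s->index;	/* scrub would wake_up() waiters here */
}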
1555static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1967static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1556 u64 physical, u64 flags, u64 gen, int mirror_num, 1968 u64 physical, struct btrfs_device *dev, u64 flags,
1557 u8 *csum, int force) 1969 u64 gen, int mirror_num, u8 *csum, int force,
1970 u64 physical_for_dev_replace)
1558{ 1971{
1559 struct scrub_block *sblock; 1972 struct scrub_block *sblock;
1560 int index; 1973 int index;
1561 1974
1562 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1563 if (!sblock) { 1976 if (!sblock) {
1564 spin_lock(&sdev->stat_lock); 1977 spin_lock(&sctx->stat_lock);
1565 sdev->stat.malloc_errors++; 1978 sctx->stat.malloc_errors++;
1566 spin_unlock(&sdev->stat_lock); 1979 spin_unlock(&sctx->stat_lock);
1567 return -ENOMEM; 1980 return -ENOMEM;
1568 } 1981 }
1569 1982
1570 /* one ref inside this function, plus one for each page later on */ 1983 /* one ref inside this function, plus one for each page added to
1984 * a bio later on */
1571 atomic_set(&sblock->ref_count, 1); 1985 atomic_set(&sblock->ref_count, 1);
1572 sblock->sdev = sdev; 1986 sblock->sctx = sctx;
1573 sblock->no_io_error_seen = 1; 1987 sblock->no_io_error_seen = 1;
1574 1988
1575 for (index = 0; len > 0; index++) { 1989 for (index = 0; len > 0; index++) {
1576 struct scrub_page *spage = sblock->pagev + index; 1990 struct scrub_page *spage;
1577 u64 l = min_t(u64, len, PAGE_SIZE); 1991 u64 l = min_t(u64, len, PAGE_SIZE);
1578 1992
1579 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1993 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1580 spage->page = alloc_page(GFP_NOFS); 1994 if (!spage) {
1581 if (!spage->page) { 1995leave_nomem:
1582 spin_lock(&sdev->stat_lock); 1996 spin_lock(&sctx->stat_lock);
1583 sdev->stat.malloc_errors++; 1997 sctx->stat.malloc_errors++;
1584 spin_unlock(&sdev->stat_lock); 1998 spin_unlock(&sctx->stat_lock);
1585 while (index > 0) { 1999 scrub_block_put(sblock);
1586 index--;
1587 __free_page(sblock->pagev[index].page);
1588 }
1589 kfree(sblock);
1590 return -ENOMEM; 2000 return -ENOMEM;
1591 } 2001 }
2002 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2003 scrub_page_get(spage);
2004 sblock->pagev[index] = spage;
1592 spage->sblock = sblock; 2005 spage->sblock = sblock;
1593 spage->dev = sdev->dev; 2006 spage->dev = dev;
1594 spage->flags = flags; 2007 spage->flags = flags;
1595 spage->generation = gen; 2008 spage->generation = gen;
1596 spage->logical = logical; 2009 spage->logical = logical;
1597 spage->physical = physical; 2010 spage->physical = physical;
2011 spage->physical_for_dev_replace = physical_for_dev_replace;
1598 spage->mirror_num = mirror_num; 2012 spage->mirror_num = mirror_num;
1599 if (csum) { 2013 if (csum) {
1600 spage->have_csum = 1; 2014 spage->have_csum = 1;
1601 memcpy(spage->csum, csum, sdev->csum_size); 2015 memcpy(spage->csum, csum, sctx->csum_size);
1602 } else { 2016 } else {
1603 spage->have_csum = 0; 2017 spage->have_csum = 0;
1604 } 2018 }
1605 sblock->page_count++; 2019 sblock->page_count++;
2020 spage->page = alloc_page(GFP_NOFS);
2021 if (!spage->page)
2022 goto leave_nomem;
1606 len -= l; 2023 len -= l;
1607 logical += l; 2024 logical += l;
1608 physical += l; 2025 physical += l;
2026 physical_for_dev_replace += l;
1609 } 2027 }
1610 2028
1611 BUG_ON(sblock->page_count == 0); 2029 WARN_ON(sblock->page_count == 0);
1612 for (index = 0; index < sblock->page_count; index++) { 2030 for (index = 0; index < sblock->page_count; index++) {
1613 struct scrub_page *spage = sblock->pagev + index; 2031 struct scrub_page *spage = sblock->pagev[index];
1614 int ret; 2032 int ret;
1615 2033
1616 ret = scrub_add_page_to_bio(sdev, spage); 2034 ret = scrub_add_page_to_rd_bio(sctx, spage);
1617 if (ret) { 2035 if (ret) {
1618 scrub_block_put(sblock); 2036 scrub_block_put(sblock);
1619 return ret; 2037 return ret;
@@ -1621,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1621 } 2039 }
1622 2040
1623 if (force) 2041 if (force)
1624 scrub_submit(sdev); 2042 scrub_submit(sctx);
1625 2043
1626 /* last one frees, either here or in bio completion for last page */ 2044 /* last one frees, either here or in bio completion for last page */
1627 scrub_block_put(sblock); 2045 scrub_block_put(sblock);
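On any allocation failure, the rewritten scrub_pages() jumps to the single leave_nomem label and lets scrub_block_put() unwind whatever was already attached, since every added page holds its own reference. A generic sketch of that centralized-cleanup idiom; struct blob and its fields are invented for illustration:

#include <stdlib.h>

struct blob {
	char *a;
	char *b;
	char *c;
};

/* allocate all-or-nothing; one exit path undoes partial progress */
static struct blob *blob_alloc(void)
{
	struct blob *x = calloc(1, sizeof(*x));

	if (!x)
		return NULL;
	x->a = malloc(64);
	if (!x->a)
		goto leave_nomem;
	x->b = malloc(64);
	if (!x->b)
		goto leave_nomem;
	x->c = malloc(64);
	if (!x->c)
		goto leave_nomem;
	return x;

leave_nomem:
	free(x->a);	/* free(NULL) is a no-op, so one label suffices */
	free(x->b);
	free(x->c);
	free(x);
	return NULL;
}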
@@ -1631,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1631static void scrub_bio_end_io(struct bio *bio, int err) 2049static void scrub_bio_end_io(struct bio *bio, int err)
1632{ 2050{
1633 struct scrub_bio *sbio = bio->bi_private; 2051 struct scrub_bio *sbio = bio->bi_private;
1634 struct scrub_dev *sdev = sbio->sdev; 2052 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1635 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1636 2053
1637 sbio->err = err; 2054 sbio->err = err;
1638 sbio->bio = bio; 2055 sbio->bio = bio;
@@ -1643,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1643static void scrub_bio_end_io_worker(struct btrfs_work *work) 2060static void scrub_bio_end_io_worker(struct btrfs_work *work)
1644{ 2061{
1645 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2062 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1646 struct scrub_dev *sdev = sbio->sdev; 2063 struct scrub_ctx *sctx = sbio->sctx;
1647 int i; 2064 int i;
1648 2065
1649 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2066 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1650 if (sbio->err) { 2067 if (sbio->err) {
1651 for (i = 0; i < sbio->page_count; i++) { 2068 for (i = 0; i < sbio->page_count; i++) {
1652 struct scrub_page *spage = sbio->pagev[i]; 2069 struct scrub_page *spage = sbio->pagev[i];
@@ -1666,40 +2083,39 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1666 scrub_block_put(sblock); 2083 scrub_block_put(sblock);
1667 } 2084 }
1668 2085
1669 if (sbio->err) { 2086 bio_put(sbio->bio);
1670 /* what is this good for??? */ 2087 sbio->bio = NULL;
1671 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 2088 spin_lock(&sctx->list_lock);
1672 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 2089 sbio->next_free = sctx->first_free;
1673 sbio->bio->bi_phys_segments = 0; 2090 sctx->first_free = sbio->index;
1674 sbio->bio->bi_idx = 0; 2091 spin_unlock(&sctx->list_lock);
1675 2092
1676 for (i = 0; i < sbio->page_count; i++) { 2093 if (sctx->is_dev_replace &&
1677 struct bio_vec *bi; 2094 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
1678 bi = &sbio->bio->bi_io_vec[i]; 2095 mutex_lock(&sctx->wr_ctx.wr_lock);
1679 bi->bv_offset = 0; 2096 scrub_wr_submit(sctx);
1680 bi->bv_len = PAGE_SIZE; 2097 mutex_unlock(&sctx->wr_ctx.wr_lock);
1681 }
1682 } 2098 }
1683 2099
1684 bio_put(sbio->bio); 2100 scrub_pending_bio_dec(sctx);
1685 sbio->bio = NULL;
1686 spin_lock(&sdev->list_lock);
1687 sbio->next_free = sdev->first_free;
1688 sdev->first_free = sbio->index;
1689 spin_unlock(&sdev->list_lock);
1690 atomic_dec(&sdev->in_flight);
1691 wake_up(&sdev->list_wait);
1692} 2101}
1693 2102
1694static void scrub_block_complete(struct scrub_block *sblock) 2103static void scrub_block_complete(struct scrub_block *sblock)
1695{ 2104{
1696 if (!sblock->no_io_error_seen) 2105 if (!sblock->no_io_error_seen) {
1697 scrub_handle_errored_block(sblock); 2106 scrub_handle_errored_block(sblock);
1698 else 2107 } else {
1699 scrub_checksum(sblock); 2108 /*
 2109 * in the dev-replace case: a block with a checksum error
 2110 * is written via the repair mechanism; a clean block is
 2111 * written here.
2112 */
2113 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2114 scrub_write_block_to_dev_replace(sblock);
2115 }
1700} 2116}
1701 2117
1702static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2118static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1703 u8 *csum) 2119 u8 *csum)
1704{ 2120{
1705 struct btrfs_ordered_sum *sum = NULL; 2121 struct btrfs_ordered_sum *sum = NULL;
@@ -1707,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1707 unsigned long i; 2123 unsigned long i;
1708 unsigned long num_sectors; 2124 unsigned long num_sectors;
1709 2125
1710 while (!list_empty(&sdev->csum_list)) { 2126 while (!list_empty(&sctx->csum_list)) {
1711 sum = list_first_entry(&sdev->csum_list, 2127 sum = list_first_entry(&sctx->csum_list,
1712 struct btrfs_ordered_sum, list); 2128 struct btrfs_ordered_sum, list);
1713 if (sum->bytenr > logical) 2129 if (sum->bytenr > logical)
1714 return 0; 2130 return 0;
1715 if (sum->bytenr + sum->len > logical) 2131 if (sum->bytenr + sum->len > logical)
1716 break; 2132 break;
1717 2133
1718 ++sdev->stat.csum_discards; 2134 ++sctx->stat.csum_discards;
1719 list_del(&sum->list); 2135 list_del(&sum->list);
1720 kfree(sum); 2136 kfree(sum);
1721 sum = NULL; 2137 sum = NULL;
@@ -1723,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1723 if (!sum) 2139 if (!sum)
1724 return 0; 2140 return 0;
1725 2141
1726 num_sectors = sum->len / sdev->sectorsize; 2142 num_sectors = sum->len / sctx->sectorsize;
1727 for (i = 0; i < num_sectors; ++i) { 2143 for (i = 0; i < num_sectors; ++i) {
1728 if (sum->sums[i].bytenr == logical) { 2144 if (sum->sums[i].bytenr == logical) {
1729 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2145 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1730 ret = 1; 2146 ret = 1;
1731 break; 2147 break;
1732 } 2148 }
@@ -1739,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1739} 2155}
1740 2156
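scrub_find_csum() walks the logically ordered csum_list, freeing runs that end before the current address and stopping at the first run that starts beyond it; within a matching run it picks the per-sector sum. A sketch of the lookup against one run, indexing by sector directly, which should be equivalent to the kernel's per-entry scan because a run covers contiguous sectors; the types and sizes below are illustrative:

#include <stdint.h>
#include <string.h>

#define SECTOR    4096u
#define CSUM_SIZE 4

struct sum_run {
	uint64_t bytenr;		/* logical start of the run */
	uint64_t len;			/* bytes covered, sector aligned */
	uint8_t sums[][CSUM_SIZE];	/* one checksum per sector */
};

/* return 1 and fill *csum if 'run' covers 'logical', else 0 */
static int find_csum(const struct sum_run *run, uint64_t logical,
		     uint8_t *csum)
{
	if (!run || run->bytenr > logical)
		return 0;		/* no run reaches this address yet */
	if (run->bytenr + run->len <= logical)
		return 0;		/* run ends before it: stale entry */

	uint64_t i = (logical - run->bytenr) / SECTOR;
	memcpy(csum, run->sums[i], CSUM_SIZE);
	return 1;
}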
1741/* scrub extent tries to collect up to 64 kB for each bio */ 2157/* scrub extent tries to collect up to 64 kB for each bio */
1742static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2158static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1743 u64 physical, u64 flags, u64 gen, int mirror_num) 2159 u64 physical, struct btrfs_device *dev, u64 flags,
2160 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1744{ 2161{
1745 int ret; 2162 int ret;
1746 u8 csum[BTRFS_CSUM_SIZE]; 2163 u8 csum[BTRFS_CSUM_SIZE];
1747 u32 blocksize; 2164 u32 blocksize;
1748 2165
1749 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1750 blocksize = sdev->sectorsize; 2167 blocksize = sctx->sectorsize;
1751 spin_lock(&sdev->stat_lock); 2168 spin_lock(&sctx->stat_lock);
1752 sdev->stat.data_extents_scrubbed++; 2169 sctx->stat.data_extents_scrubbed++;
1753 sdev->stat.data_bytes_scrubbed += len; 2170 sctx->stat.data_bytes_scrubbed += len;
1754 spin_unlock(&sdev->stat_lock); 2171 spin_unlock(&sctx->stat_lock);
1755 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1756 BUG_ON(sdev->nodesize != sdev->leafsize); 2173 WARN_ON(sctx->nodesize != sctx->leafsize);
1757 blocksize = sdev->nodesize; 2174 blocksize = sctx->nodesize;
1758 spin_lock(&sdev->stat_lock); 2175 spin_lock(&sctx->stat_lock);
1759 sdev->stat.tree_extents_scrubbed++; 2176 sctx->stat.tree_extents_scrubbed++;
1760 sdev->stat.tree_bytes_scrubbed += len; 2177 sctx->stat.tree_bytes_scrubbed += len;
1761 spin_unlock(&sdev->stat_lock); 2178 spin_unlock(&sctx->stat_lock);
1762 } else { 2179 } else {
1763 blocksize = sdev->sectorsize; 2180 blocksize = sctx->sectorsize;
1764 BUG_ON(1); 2181 WARN_ON(1);
1765 } 2182 }
1766 2183
1767 while (len) { 2184 while (len) {
@@ -1770,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1770 2187
1771 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2188 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1772 /* push csums to sbio */ 2189 /* push csums to sbio */
1773 have_csum = scrub_find_csum(sdev, logical, l, csum); 2190 have_csum = scrub_find_csum(sctx, logical, l, csum);
1774 if (have_csum == 0) 2191 if (have_csum == 0)
1775 ++sdev->stat.no_csum; 2192 ++sctx->stat.no_csum;
2193 if (sctx->is_dev_replace && !have_csum) {
2194 ret = copy_nocow_pages(sctx, logical, l,
2195 mirror_num,
2196 physical_for_dev_replace);
2197 goto behind_scrub_pages;
2198 }
1776 } 2199 }
1777 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2200 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1778 mirror_num, have_csum ? csum : NULL, 0); 2201 mirror_num, have_csum ? csum : NULL, 0,
2202 physical_for_dev_replace);
2203behind_scrub_pages:
1779 if (ret) 2204 if (ret)
1780 return ret; 2205 return ret;
1781 len -= l; 2206 len -= l;
1782 logical += l; 2207 logical += l;
1783 physical += l; 2208 physical += l;
2209 physical_for_dev_replace += l;
1784 } 2210 }
1785 return 0; 2211 return 0;
1786} 2212}
1787 2213
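scrub_extent() slices the extent into blocksize pieces and advances the logical address, the source physical address and the dev-replace target address in lockstep. The loop shape, isolated into a helper with an assumed callback type:

#include <stdint.h>

typedef int (*block_fn)(uint64_t logical, uint64_t physical,
			uint64_t physical_tgt, uint64_t len);

/* walk [logical, logical + len) in blocksize pieces, keeping the
 * logical, source-physical and target-physical offsets in lockstep */
static int for_each_block(uint64_t logical, uint64_t physical,
			  uint64_t physical_tgt, uint64_t len,
			  uint32_t blocksize, block_fn fn)
{
	while (len) {
		uint64_t l = len < blocksize ? len : blocksize;
		int ret = fn(logical, physical, physical_tgt, l);

		if (ret)
			return ret;	/* abort on the first error */
		len -= l;
		logical += l;
		physical += l;
		physical_tgt += l;
	}
	return 0;
}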
1788static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2214static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1789 struct map_lookup *map, int num, u64 base, u64 length) 2215 struct map_lookup *map,
2216 struct btrfs_device *scrub_dev,
2217 int num, u64 base, u64 length,
2218 int is_dev_replace)
1790{ 2219{
1791 struct btrfs_path *path; 2220 struct btrfs_path *path;
1792 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2221 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1793 struct btrfs_root *root = fs_info->extent_root; 2222 struct btrfs_root *root = fs_info->extent_root;
1794 struct btrfs_root *csum_root = fs_info->csum_root; 2223 struct btrfs_root *csum_root = fs_info->csum_root;
1795 struct btrfs_extent_item *extent; 2224 struct btrfs_extent_item *extent;
@@ -1809,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1809 struct reada_control *reada2; 2238 struct reada_control *reada2;
1810 struct btrfs_key key_start; 2239 struct btrfs_key key_start;
1811 struct btrfs_key key_end; 2240 struct btrfs_key key_end;
1812
1813 u64 increment = map->stripe_len; 2241 u64 increment = map->stripe_len;
1814 u64 offset; 2242 u64 offset;
2243 u64 extent_logical;
2244 u64 extent_physical;
2245 u64 extent_len;
2246 struct btrfs_device *extent_dev;
2247 int extent_mirror_num;
1815 2248
1816 nstripes = length; 2249 nstripes = length;
1817 offset = 0; 2250 offset = 0;
@@ -1855,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1855 */ 2288 */
1856 logical = base + offset; 2289 logical = base + offset;
1857 2290
1858 wait_event(sdev->list_wait, 2291 wait_event(sctx->list_wait,
1859 atomic_read(&sdev->in_flight) == 0); 2292 atomic_read(&sctx->bios_in_flight) == 0);
1860 atomic_inc(&fs_info->scrubs_paused); 2293 atomic_inc(&fs_info->scrubs_paused);
1861 wake_up(&fs_info->scrub_pause_wait); 2294 wake_up(&fs_info->scrub_pause_wait);
1862 2295
@@ -1910,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1910 * canceled? 2343 * canceled?
1911 */ 2344 */
1912 if (atomic_read(&fs_info->scrub_cancel_req) || 2345 if (atomic_read(&fs_info->scrub_cancel_req) ||
1913 atomic_read(&sdev->cancel_req)) { 2346 atomic_read(&sctx->cancel_req)) {
1914 ret = -ECANCELED; 2347 ret = -ECANCELED;
1915 goto out; 2348 goto out;
1916 } 2349 }
@@ -1919,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1919 */ 2352 */
1920 if (atomic_read(&fs_info->scrub_pause_req)) { 2353 if (atomic_read(&fs_info->scrub_pause_req)) {
1921 /* push queued extents */ 2354 /* push queued extents */
1922 scrub_submit(sdev); 2355 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1923 wait_event(sdev->list_wait, 2356 scrub_submit(sctx);
1924 atomic_read(&sdev->in_flight) == 0); 2357 mutex_lock(&sctx->wr_ctx.wr_lock);
2358 scrub_wr_submit(sctx);
2359 mutex_unlock(&sctx->wr_ctx.wr_lock);
2360 wait_event(sctx->list_wait,
2361 atomic_read(&sctx->bios_in_flight) == 0);
2362 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1925 atomic_inc(&fs_info->scrubs_paused); 2363 atomic_inc(&fs_info->scrubs_paused);
1926 wake_up(&fs_info->scrub_pause_wait); 2364 wake_up(&fs_info->scrub_pause_wait);
1927 mutex_lock(&fs_info->scrub_lock); 2365 mutex_lock(&fs_info->scrub_lock);
@@ -1938,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1938 2376
1939 ret = btrfs_lookup_csums_range(csum_root, logical, 2377 ret = btrfs_lookup_csums_range(csum_root, logical,
1940 logical + map->stripe_len - 1, 2378 logical + map->stripe_len - 1,
1941 &sdev->csum_list, 1); 2379 &sctx->csum_list, 1);
1942 if (ret) 2380 if (ret)
1943 goto out; 2381 goto out;
1944 2382
@@ -2016,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2016 key.objectid; 2454 key.objectid;
2017 } 2455 }
2018 2456
2019 ret = scrub_extent(sdev, key.objectid, key.offset, 2457 extent_logical = key.objectid;
2020 key.objectid - logical + physical, 2458 extent_physical = key.objectid - logical + physical;
2021 flags, generation, mirror_num); 2459 extent_len = key.offset;
2460 extent_dev = scrub_dev;
2461 extent_mirror_num = mirror_num;
2462 if (is_dev_replace)
2463 scrub_remap_extent(fs_info, extent_logical,
2464 extent_len, &extent_physical,
2465 &extent_dev,
2466 &extent_mirror_num);
2467 ret = scrub_extent(sctx, extent_logical, extent_len,
2468 extent_physical, extent_dev, flags,
2469 generation, extent_mirror_num,
2470 key.objectid - logical + physical);
2022 if (ret) 2471 if (ret)
2023 goto out; 2472 goto out;
2024 2473
@@ -2028,29 +2477,34 @@ next:
2028 btrfs_release_path(path); 2477 btrfs_release_path(path);
2029 logical += increment; 2478 logical += increment;
2030 physical += map->stripe_len; 2479 physical += map->stripe_len;
2031 spin_lock(&sdev->stat_lock); 2480 spin_lock(&sctx->stat_lock);
2032 sdev->stat.last_physical = physical; 2481 sctx->stat.last_physical = physical;
2033 spin_unlock(&sdev->stat_lock); 2482 spin_unlock(&sctx->stat_lock);
2034 } 2483 }
2484out:
2035 /* push queued extents */ 2485 /* push queued extents */
2036 scrub_submit(sdev); 2486 scrub_submit(sctx);
2487 mutex_lock(&sctx->wr_ctx.wr_lock);
2488 scrub_wr_submit(sctx);
2489 mutex_unlock(&sctx->wr_ctx.wr_lock);
2037 2490
2038out:
2039 blk_finish_plug(&plug); 2491 blk_finish_plug(&plug);
2040 btrfs_free_path(path); 2492 btrfs_free_path(path);
2041 return ret < 0 ? ret : 0; 2493 return ret < 0 ? ret : 0;
2042} 2494}
2043 2495
2044static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2496static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2045 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2497 struct btrfs_device *scrub_dev,
2046 u64 dev_offset) 2498 u64 chunk_tree, u64 chunk_objectid,
2499 u64 chunk_offset, u64 length,
2500 u64 dev_offset, int is_dev_replace)
2047{ 2501{
2048 struct btrfs_mapping_tree *map_tree = 2502 struct btrfs_mapping_tree *map_tree =
2049 &sdev->dev->dev_root->fs_info->mapping_tree; 2503 &sctx->dev_root->fs_info->mapping_tree;
2050 struct map_lookup *map; 2504 struct map_lookup *map;
2051 struct extent_map *em; 2505 struct extent_map *em;
2052 int i; 2506 int i;
2053 int ret = -EINVAL; 2507 int ret = 0;
2054 2508
2055 read_lock(&map_tree->map_tree.lock); 2509 read_lock(&map_tree->map_tree.lock);
2056 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2510 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2067,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2067 goto out; 2521 goto out;
2068 2522
2069 for (i = 0; i < map->num_stripes; ++i) { 2523 for (i = 0; i < map->num_stripes; ++i) {
2070 if (map->stripes[i].dev == sdev->dev && 2524 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2071 map->stripes[i].physical == dev_offset) { 2525 map->stripes[i].physical == dev_offset) {
2072 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2526 ret = scrub_stripe(sctx, map, scrub_dev, i,
2527 chunk_offset, length,
2528 is_dev_replace);
2073 if (ret) 2529 if (ret)
2074 goto out; 2530 goto out;
2075 } 2531 }
@@ -2081,11 +2537,13 @@ out:
2081} 2537}
2082 2538
2083static noinline_for_stack 2539static noinline_for_stack
2084int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2540int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2541 struct btrfs_device *scrub_dev, u64 start, u64 end,
2542 int is_dev_replace)
2085{ 2543{
2086 struct btrfs_dev_extent *dev_extent = NULL; 2544 struct btrfs_dev_extent *dev_extent = NULL;
2087 struct btrfs_path *path; 2545 struct btrfs_path *path;
2088 struct btrfs_root *root = sdev->dev->dev_root; 2546 struct btrfs_root *root = sctx->dev_root;
2089 struct btrfs_fs_info *fs_info = root->fs_info; 2547 struct btrfs_fs_info *fs_info = root->fs_info;
2090 u64 length; 2548 u64 length;
2091 u64 chunk_tree; 2549 u64 chunk_tree;
@@ -2097,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2097 struct btrfs_key key; 2555 struct btrfs_key key;
2098 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2099 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2100 2559
2101 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2102 if (!path) 2561 if (!path)
@@ -2106,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2106 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2107 path->skip_locking = 1; 2566 path->skip_locking = 1;
2108 2567
2109 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2110 key.offset = 0ull; 2569 key.offset = 0ull;
2111 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2112 2571
2113
2114 while (1) { 2572 while (1) {
2115 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2116 if (ret < 0) 2574 if (ret < 0)
@@ -2129,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2129 2587
2130 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2131 2589
2132 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2133 break; 2591 break;
2134 2592
2135 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2163,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2163 ret = -ENOENT; 2621 ret = -ENOENT;
2164 break; 2622 break;
2165 } 2623 }
2166 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2167 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
 2632 * flush and submit all pending read and write bios, then
 2633 * wait for them.
 2634 * Note that in the dev-replace case, a read request causes
 2635 * write requests that are submitted from the read-completion
 2636 * worker. Therefore all write requests must be flushed, so
 2637 * that all read and write requests are really completed when
 2638 * bios_in_flight drops to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2168 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2169 if (ret) 2669 if (ret)
2170 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2171 2680
2172 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2173 btrfs_release_path(path); 2682 btrfs_release_path(path);
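As the comment in this hunk explains, completing a read in dev-replace can submit a write from the completion worker, so a plain wait on bios_in_flight is only meaningful while flush_all_writes forces those writes out immediately instead of letting them batch. A pthreads sketch of that flag-plus-counter drain; the names are assumptions modeled on the patch, not the kernel API:

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static atomic_int bios_in_flight;
static atomic_int flush_all_writes;

static void bio_start(void)
{
	atomic_fetch_add(&bios_in_flight, 1);
}

static void bio_done(void)
{
	if (atomic_fetch_sub(&bios_in_flight, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&idle);	/* wake the drainer */
		pthread_mutex_unlock(&lock);
	}
}

/* read completion: submit the follow-up write at once while draining */
static void read_completed(void (*submit_write)(void))
{
	if (atomic_load(&flush_all_writes))
		submit_write();	/* its bio_start() precedes our bio_done() */
	bio_done();
}

/* force pending writes out, then wait until nothing is in flight */
static void drain_all_io(void)
{
	atomic_store(&flush_all_writes, 1);
	pthread_mutex_lock(&lock);
	while (atomic_load(&bios_in_flight) != 0)
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);
	atomic_store(&flush_all_writes, 0);
}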
@@ -2182,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2182 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2183} 2692}
2184 2693
2185static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2186{ 2696{
2187 int i; 2697 int i;
2188 u64 bytenr; 2698 u64 bytenr;
2189 u64 gen; 2699 u64 gen;
2190 int ret; 2700 int ret;
2191 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2192 struct btrfs_root *root = device->dev_root;
2193 2702
2194 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2195 return -EIO; 2704 return -EIO;
@@ -2198,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2198 2707
2199 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2200 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2201 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2202 break; 2711 break;
2203 2712
2204 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2205 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2206 if (ret) 2716 if (ret)
2207 return ret; 2717 return ret;
2208 } 2718 }
2209 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2210 2720
2211 return 0; 2721 return 0;
2212} 2722}
@@ -2214,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2214/* 2724/*
2215 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2216 */ 2726 */
2217static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2218{ 2729{
2219 struct btrfs_fs_info *fs_info = root->fs_info;
2220 int ret = 0; 2730 int ret = 0;
2221 2731
2222 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2223 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2224 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2225 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2226 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2227 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2228 if (ret) 2743 if (ret)
2229 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2230 } 2759 }
2231 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2232out: 2761out:
@@ -2235,40 +2764,41 @@ out:
2235 return ret; 2764 return ret;
2236} 2765}
2237 2766
2238static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2239{ 2768{
2240 struct btrfs_fs_info *fs_info = root->fs_info;
2241
2242 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2243 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2244 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2245 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2246 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2247} 2777}
2248 2778
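scrub_workers_get()/scrub_workers_put() start the worker pools on the first reference and stop them on the last, all under scrub_lock. The same refcounted start/stop idiom, sketched with a mutex and placeholder start/stop hooks:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int pool_refcnt;
static bool pool_running;	/* stands in for the worker threads */

static int start_pool(void) { pool_running = true; return 0; }
static void stop_pool(void) { pool_running = false; }

static int workers_get(void)
{
	int ret = 0;

	pthread_mutex_lock(&pool_lock);
	if (pool_refcnt == 0)
		ret = start_pool();	/* first ref starts the workers */
	if (!ret)
		pool_refcnt++;
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static void workers_put(void)
{
	pthread_mutex_lock(&pool_lock);
	if (--pool_refcnt == 0)
		stop_pool();		/* last ref stops them */
	pthread_mutex_unlock(&pool_lock);
}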
2249 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2250int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2251 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2252{ 2782{
2253 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2254 struct btrfs_fs_info *fs_info = root->fs_info;
2255 int ret; 2784 int ret;
2256 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2257 2786
2258 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2259 return -EINVAL; 2788 return -EINVAL;
2260 2789
2261 /* 2790 /*
2262 * check some assumptions 2791 * check some assumptions
2263 */ 2792 */
2264 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2265 printk(KERN_ERR 2794 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2267 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2268 return -EINVAL; 2798 return -EINVAL;
2269 } 2799 }
2270 2800
2271 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2272 /* 2802 /*
2273 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2274 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2276,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2276 */ 2806 */
2277 printk(KERN_ERR 2807 printk(KERN_ERR
2278 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2279 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2280 return -EINVAL; 2810 return -EINVAL;
2281 } 2811 }
2282 2812
2283 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2284 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2285 printk(KERN_ERR 2815 printk(KERN_ERR
2286 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2287 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2819 return -EINVAL;
2820 }
2821
2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
 2827 * would exhaust the array bounds of the pagev member in
 2828 * struct scrub_block
2829 */
2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2831 fs_info->chunk_root->nodesize,
2832 SCRUB_MAX_PAGES_PER_BLOCK,
2833 fs_info->chunk_root->sectorsize,
2834 SCRUB_MAX_PAGES_PER_BLOCK);
2288 return -EINVAL; 2835 return -EINVAL;
2289 } 2836 }
2290 2837
2291 ret = scrub_workers_get(root); 2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2292 if (ret) 2839 if (ret)
2293 return ret; 2840 return ret;
2294 2841
2295 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2296 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2297 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2298 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2299 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2300 return -ENODEV; 2847 return -ENODEV;
2301 } 2848 }
2302 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2303 2850
2304 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2305 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2306 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2307 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2308 return -ENODEV; 2855 return -EIO;
2309 } 2856 }
2310 2857
2311 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2312 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2313 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2314 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2315 return -EINPROGRESS; 2866 return -EINPROGRESS;
2316 } 2867 }
2317 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2318 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2319 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2320 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2321 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2322 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2323 } 2875 }
2324 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2325 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2326 2878
2327 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2328 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2329 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2330 2882
2331 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2332 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2333 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2334 2888
2335 if (!ret) 2889 if (!ret)
2336 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2337 2892
2338 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2339 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2340 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2341 2896
2342 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2343 2898
2344 if (progress) 2899 if (progress)
2345 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2346 2901
2347 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2348 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2349 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2350 2905
2351 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2352 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2353 2908
2354 return ret; 2909 return ret;
2355} 2910}
@@ -2389,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2389 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2390} 2945}
2391 2946
2392int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2393{ 2948{
2394
2395 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2396 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2397 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2411,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2411 return 0; 2965 return 0;
2412} 2966}
2413 2967
2414int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2415{ 2970{
2416 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2417}
2418
2419int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2420{
2421 struct btrfs_fs_info *fs_info = root->fs_info;
2422 struct scrub_dev *sdev;
2423 2972
2424 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2425 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2426 if (!sdev) { 2975 if (!sctx) {
2427 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2428 return -ENOTCONN; 2977 return -ENOTCONN;
2429 } 2978 }
2430 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2431 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2432 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2433 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2450,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2450 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2451 */ 3000 */
2452 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2453 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2454 if (!dev) { 3003 if (!dev) {
2455 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2456 return -ENODEV; 3005 return -ENODEV;
2457 } 3006 }
2458 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2459 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2460 3009
2461 return ret; 3010 return ret;
@@ -2465,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2465 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2466{ 3015{
2467 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2468 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2469 3018
2470 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2471 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2472 if (dev) 3021 if (dev)
2473 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2474 if (sdev) 3023 if (sctx)
2475 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2476 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2477 3026
2478 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
3054
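scrub_remap_extent() above delegates the logical-to-physical translation to btrfs_map_block() and simply adopts the first stripe of the answer as the copy target. As a toy model of such a translation, the sketch below remaps a logical byte offset onto a RAID0-style rotation of stripes; the layout and constants are invented for illustration and are not btrfs's real chunk mapping:

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024)
#define NUM_DEVS	3

/* Map a logical byte offset to (device index, physical offset) for a
 * simple RAID0 layout: consecutive stripes rotate across NUM_DEVS devices. */
static void map_block(uint64_t logical, int *dev, uint64_t *physical)
{
	uint64_t stripe_nr = logical / STRIPE_LEN;
	uint64_t stripe_off = logical % STRIPE_LEN;

	*dev = stripe_nr % NUM_DEVS;
	*physical = (stripe_nr / NUM_DEVS) * STRIPE_LEN + stripe_off;
}

int main(void)
{
	int dev;
	uint64_t phys;

	map_block(3 * STRIPE_LEN + 123, &dev, &phys);
	printf("dev=%d physical=%llu\n", dev, (unsigned long long)phys);
	return 0;
}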
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
3111
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
3163 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3164 num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
3218 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3219 io_tree,
3220 page, btrfs_get_extent,
3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
3226 wait_on_page_locked(page);
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
3253
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2479} 3297}
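Taken together, copy_nocow_pages_for_inode() and write_page_nocow() form a per-page pipeline: bring each page of the nocow range uptodate through the source inode's mapping, then synchronously write it to the matching physical offset on the replace target, counting any failed page as an uncorrectable read error. A rough user-space analogue of that loop, with pread/pwrite standing in for the page cache and the raw bio (an illustrative sketch only, not the kernel code path):

#include <sys/types.h>
#include <unistd.h>

#define PAGE_SZ 4096

/*
 * Copy `len` bytes (assumed to be a multiple of PAGE_SZ) from src_off on
 * src_fd to dst_off on dst_fd, one page at a time. Any failed page fails
 * the whole range, mirroring how the kernel path records the error.
 * Short reads/writes are treated as hard errors for brevity.
 */
static int copy_pages(int src_fd, off_t src_off,
		      int dst_fd, off_t dst_off, size_t len)
{
	char page[PAGE_SZ];

	while (len >= PAGE_SZ) {
		if (pread(src_fd, page, PAGE_SZ, src_off) != PAGE_SZ)
			return -1;
		if (pwrite(dst_fd, page, PAGE_SZ, dst_off) != PAGE_SZ)
			return -1;
		src_off += PAGE_SZ;
		dst_off += PAGE_SZ;
		len -= PAGE_SZ;
	}
	return 0;
}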
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f869..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
107 int cur_inode_new; 107 int cur_inode_new;
108 int cur_inode_new_gen; 108 int cur_inode_new_gen;
109 int cur_inode_deleted; 109 int cur_inode_deleted;
110 int cur_inode_first_ref_orphan;
111 u64 cur_inode_size; 110 u64 cur_inode_size;
112 u64 cur_inode_mode; 111 u64 cur_inode_mode;
113 112
@@ -126,7 +125,15 @@ struct send_ctx {
126 125
127struct name_cache_entry { 126struct name_cache_entry {
128 struct list_head list; 127 struct list_head list;
129 struct list_head use_list; 128 /*
129 * radix_tree has only 32bit entries but we need to handle 64bit inums.
130 * We use the lower 32bit of the 64bit inum to store it in the tree. If
131 * more than one inum would fall into the same entry, we use radix_list
132 * to store the additional entries. radix_list is also used to store
133 * entries that have the same inum but different
134 * generations.
135 */
136 struct list_head radix_list;
130 u64 ino; 137 u64 ino;
131 u64 gen; 138 u64 gen;
132 u64 parent_ino; 139 u64 parent_ino;
@@ -328,6 +335,7 @@ out:
328 return ret; 335 return ret;
329} 336}
330 337
338#if 0
331static void fs_path_remove(struct fs_path *p) 339static void fs_path_remove(struct fs_path *p)
332{ 340{
333 BUG_ON(p->reversed); 341 BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
335 p->end--; 343 p->end--;
336 *p->end = 0; 344 *p->end = 0;
337} 345}
346#endif
338 347
339static int fs_path_copy(struct fs_path *p, struct fs_path *from) 348static int fs_path_copy(struct fs_path *p, struct fs_path *from)
340{ 349{
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
377 return path; 386 return path;
378} 387}
379 388
380static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) 389int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
381{ 390{
382 int ret; 391 int ret;
383 mm_segment_t old_fs; 392 mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
387 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
388 397
389 while (pos < len) { 398 while (pos < len) {
390 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, 399 ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
391 &sctx->send_off);
392 /* TODO handle that correctly */ 400 /* TODO handle that correctly */
393 /*if (ret == -ERESTARTSYS) { 401 /*if (ret == -ERESTARTSYS) {
394 continue; 402 continue;
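The reworked write_buf() is the classic short-write loop: call the write primitive repeatedly, advancing by however many bytes it accepted, until the buffer is drained (the commented-out ERESTARTSYS branch hints at the retry case that is still a TODO). A minimal user-space sketch of the same pattern around write(2), with hypothetical names:

#include <errno.h>
#include <unistd.h>

/* Write all `len` bytes of `buf` to `fd`, retrying short writes and EINTR.
 * Returns 0 on success, -1 (with errno set) on error. */
static int write_all(int fd, const void *buf, size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		ssize_t ret = write(fd, (const char *)buf + pos, len - pos);

		if (ret < 0) {
			if (errno == EINTR)
				continue;	/* the retry the kernel TODO alludes to */
			return -1;
		}
		if (ret == 0)
			return -1;	/* should not happen for regular files */
		pos += ret;
	}
	return 0;
}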
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
544 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 552 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
545 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); 553 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
546 554
547 return write_buf(sctx, &hdr, sizeof(hdr)); 555 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
556 &sctx->send_off);
548} 557}
549 558
550/* 559/*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
581 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 590 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
582 hdr->crc = cpu_to_le32(crc); 591 hdr->crc = cpu_to_le32(crc);
583 592
584 ret = write_buf(sctx, sctx->send_buf, sctx->send_size); 593 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
594 &sctx->send_off);
585 595
586 sctx->total_send_size += sctx->send_size; 596 sctx->total_send_size += sctx->send_size;
587 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; 597 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
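send_cmd() seals each command by checksumming the assembled buffer and patching the result into the header before handing everything to write_buf(). The sketch below shows the same seal-then-send framing in user space; it zeroes the crc field itself before the checksum pass, as the stream format requires, and the textbook CRC32C (Castagnoli) variant used here may seed differently than the kernel's crc32c() helper:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	static uint32_t table[256];
	const uint8_t *p = buf;

	if (!table[1]) {	/* build the lookup table on first use */
		for (uint32_t i = 0; i < 256; i++) {
			uint32_t c = i;
			for (int k = 0; k < 8; k++)
				c = (c & 1) ? (c >> 1) ^ 0x82F63B78u
					    : c >> 1;
			table[i] = c;
		}
	}
	crc = ~crc;
	while (len--)
		crc = table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
	return ~crc;
}

/* Seal a command buffer: the checksum is computed with the crc field
 * zeroed, then patched in, the same scheme the send stream uses.
 * The real stream stores it little-endian (cpu_to_le32); a little-endian
 * host is assumed here. */
static void frame_seal(uint8_t *buf, size_t size, size_t crc_off)
{
	uint32_t crc;

	memset(buf + crc_off, 0, sizeof(crc));
	crc = crc32c(0, buf, size);
	memcpy(buf + crc_off, &crc, sizeof(crc));
}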
@@ -687,7 +697,8 @@ out:
687 */ 697 */
688static int get_inode_info(struct btrfs_root *root, 698static int get_inode_info(struct btrfs_root *root,
689 u64 ino, u64 *size, u64 *gen, 699 u64 ino, u64 *size, u64 *gen,
690 u64 *mode, u64 *uid, u64 *gid) 700 u64 *mode, u64 *uid, u64 *gid,
701 u64 *rdev)
691{ 702{
692 int ret; 703 int ret;
693 struct btrfs_inode_item *ii; 704 struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
721 *uid = btrfs_inode_uid(path->nodes[0], ii); 732 *uid = btrfs_inode_uid(path->nodes[0], ii);
722 if (gid) 733 if (gid)
723 *gid = btrfs_inode_gid(path->nodes[0], ii); 734 *gid = btrfs_inode_gid(path->nodes[0], ii);
735 if (rdev)
736 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
724 737
725out: 738out:
726 btrfs_free_path(path); 739 btrfs_free_path(path);
@@ -732,31 +745,36 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
732 void *ctx); 745 void *ctx);
733 746
734/* 747/*
735 * Helper function to iterate the entries in ONE btrfs_inode_ref. 748 * Helper function to iterate the entries in ONE btrfs_inode_ref or
749 * btrfs_inode_extref.
736 * The iterate callback may return a non-zero value to stop iteration. This can 750
737 * be a negative value for error codes or 1 to simply stop it. 751 * be a negative value for error codes or 1 to simply stop it.
738 * 752 *
739 * path must point to the INODE_REF when called. 753 * path must point to the INODE_REF or INODE_EXTREF when called.
740 */ 754 */
741static int iterate_inode_ref(struct send_ctx *sctx, 755static int iterate_inode_ref(struct send_ctx *sctx,
742 struct btrfs_root *root, struct btrfs_path *path, 756 struct btrfs_root *root, struct btrfs_path *path,
743 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
744 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
745{ 759{
746 struct extent_buffer *eb; 760 struct extent_buffer *eb = path->nodes[0];
747 struct btrfs_item *item; 761 struct btrfs_item *item;
748 struct btrfs_inode_ref *iref; 762 struct btrfs_inode_ref *iref;
763 struct btrfs_inode_extref *extref;
749 struct btrfs_path *tmp_path; 764 struct btrfs_path *tmp_path;
750 struct fs_path *p; 765 struct fs_path *p;
751 u32 cur; 766 u32 cur = 0;
752 u32 len;
753 u32 total; 767 u32 total;
754 int slot; 768 int slot = path->slots[0];
755 u32 name_len; 769 u32 name_len;
756 char *start; 770 char *start;
757 int ret = 0; 771 int ret = 0;
758 int num; 772 int num = 0;
759 int index; 773 int index;
774 u64 dir;
775 unsigned long name_off;
776 unsigned long elem_size;
777 unsigned long ptr;
760 778
761 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed(sctx);
762 if (!p) 780 if (!p)
@@ -768,24 +786,40 @@ static int iterate_inode_ref(struct send_ctx *sctx,
768 return -ENOMEM; 786 return -ENOMEM;
769 } 787 }
770 788
771 eb = path->nodes[0];
772 slot = path->slots[0];
773 item = btrfs_item_nr(eb, slot);
774 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
775 cur = 0;
776 len = 0;
777 total = btrfs_item_size(eb, item);
778 789
779 num = 0; 790 if (found_key->type == BTRFS_INODE_REF_KEY) {
791 ptr = (unsigned long)btrfs_item_ptr(eb, slot,
792 struct btrfs_inode_ref);
793 item = btrfs_item_nr(eb, slot);
794 total = btrfs_item_size(eb, item);
795 elem_size = sizeof(*iref);
796 } else {
797 ptr = btrfs_item_ptr_offset(eb, slot);
798 total = btrfs_item_size_nr(eb, slot);
799 elem_size = sizeof(*extref);
800 }
801
780 while (cur < total) { 802 while (cur < total) {
781 fs_path_reset(p); 803 fs_path_reset(p);
782 804
783 name_len = btrfs_inode_ref_name_len(eb, iref); 805 if (found_key->type == BTRFS_INODE_REF_KEY) {
784 index = btrfs_inode_ref_index(eb, iref); 806 iref = (struct btrfs_inode_ref *)(ptr + cur);
807 name_len = btrfs_inode_ref_name_len(eb, iref);
808 name_off = (unsigned long)(iref + 1);
809 index = btrfs_inode_ref_index(eb, iref);
810 dir = found_key->offset;
811 } else {
812 extref = (struct btrfs_inode_extref *)(ptr + cur);
813 name_len = btrfs_inode_extref_name_len(eb, extref);
814 name_off = (unsigned long)&extref->name;
815 index = btrfs_inode_extref_index(eb, extref);
816 dir = btrfs_inode_extref_parent(eb, extref);
817 }
818
785 if (resolve) { 819 if (resolve) {
786 start = btrfs_iref_to_path(root, tmp_path, iref, eb, 820 start = btrfs_ref_to_path(root, tmp_path, name_len,
787 found_key->offset, p->buf, 821 name_off, eb, dir,
788 p->buf_len); 822 p->buf, p->buf_len);
789 if (IS_ERR(start)) { 823 if (IS_ERR(start)) {
790 ret = PTR_ERR(start); 824 ret = PTR_ERR(start);
791 goto out; 825 goto out;
@@ -796,9 +830,10 @@ static int iterate_inode_ref(struct send_ctx *sctx,
796 p->buf_len + p->buf - start); 830 p->buf_len + p->buf - start);
797 if (ret < 0) 831 if (ret < 0)
798 goto out; 832 goto out;
799 start = btrfs_iref_to_path(root, tmp_path, iref, 833 start = btrfs_ref_to_path(root, tmp_path,
800 eb, found_key->offset, p->buf, 834 name_len, name_off,
801 p->buf_len); 835 eb, dir,
836 p->buf, p->buf_len);
802 if (IS_ERR(start)) { 837 if (IS_ERR(start)) {
803 ret = PTR_ERR(start); 838 ret = PTR_ERR(start);
804 goto out; 839 goto out;
@@ -807,21 +842,16 @@ static int iterate_inode_ref(struct send_ctx *sctx,
807 } 842 }
808 p->start = start; 843 p->start = start;
809 } else { 844 } else {
810 ret = fs_path_add_from_extent_buffer(p, eb, 845 ret = fs_path_add_from_extent_buffer(p, eb, name_off,
811 (unsigned long)(iref + 1), name_len); 846 name_len);
812 if (ret < 0) 847 if (ret < 0)
813 goto out; 848 goto out;
814 } 849 }
815 850
816 851 cur += elem_size + name_len;
817 len = sizeof(*iref) + name_len; 852 ret = iterate(num, dir, index, p, ctx);
818 iref = (struct btrfs_inode_ref *)((char *)iref + len);
819 cur += len;
820
821 ret = iterate(num, found_key->offset, index, p, ctx);
822 if (ret) 853 if (ret)
823 goto out; 854 goto out;
824
825 num++; 855 num++;
826 } 856 }
827 857
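The merged loop treats the item payload as back-to-back records, each a fixed header (inode_ref or inode_extref) followed by name_len bytes of name, and advances with cur += elem_size + name_len. The same cursor walk over packed variable-length records, self-contained and with a record layout invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A fixed header followed by name_len bytes of name, packed back to back,
 * loosely modelled on btrfs_inode_ref (layout invented for this sketch). */
struct ref_rec {
	uint16_t name_len;
	uint64_t index;
} __attribute__((packed));

static void walk_refs(const uint8_t *item, uint32_t total)
{
	uint32_t cur = 0;

	while (cur < total) {
		struct ref_rec rec;

		memcpy(&rec, item + cur, sizeof(rec));	/* avoid unaligned loads */
		printf("index=%llu name=%.*s\n",
		       (unsigned long long)rec.index, rec.name_len,
		       (const char *)(item + cur + sizeof(rec)));
		/* advance past the header plus the variable-length tail */
		cur += sizeof(rec) + rec.name_len;
	}
}

int main(void)
{
	uint8_t buf[64];
	struct ref_rec r = { .name_len = 3, .index = 7 };
	uint32_t off = 0;

	memcpy(buf + off, &r, sizeof(r)); off += sizeof(r);
	memcpy(buf + off, "foo", 3); off += 3;
	r.name_len = 4; r.index = 9;
	memcpy(buf + off, &r, sizeof(r)); off += sizeof(r);
	memcpy(buf + off, "barx", 4); off += 4;

	walk_refs(buf, off);
	return 0;
}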
@@ -852,7 +882,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
852 struct extent_buffer *eb; 882 struct extent_buffer *eb;
853 struct btrfs_item *item; 883 struct btrfs_item *item;
854 struct btrfs_dir_item *di; 884 struct btrfs_dir_item *di;
855 struct btrfs_path *tmp_path = NULL;
856 struct btrfs_key di_key; 885 struct btrfs_key di_key;
857 char *buf = NULL; 886 char *buf = NULL;
858 char *buf2 = NULL; 887 char *buf2 = NULL;
@@ -874,12 +903,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
874 goto out; 903 goto out;
875 } 904 }
876 905
877 tmp_path = alloc_path_for_send();
878 if (!tmp_path) {
879 ret = -ENOMEM;
880 goto out;
881 }
882
883 eb = path->nodes[0]; 906 eb = path->nodes[0];
884 slot = path->slots[0]; 907 slot = path->slots[0];
885 item = btrfs_item_nr(eb, slot); 908 item = btrfs_item_nr(eb, slot);
@@ -941,7 +964,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
941 } 964 }
942 965
943out: 966out:
944 btrfs_free_path(tmp_path);
945 if (buf_virtual) 967 if (buf_virtual)
946 vfree(buf); 968 vfree(buf);
947 else 969 else
@@ -993,7 +1015,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
993 } 1015 }
994 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); 1016 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
995 if (found_key.objectid != ino || 1017 if (found_key.objectid != ino ||
996 found_key.type != BTRFS_INODE_REF_KEY) { 1018 (found_key.type != BTRFS_INODE_REF_KEY &&
1019 found_key.type != BTRFS_INODE_EXTREF_KEY)) {
997 ret = -ENOENT; 1020 ret = -ENOENT;
998 goto out; 1021 goto out;
999 } 1022 }
@@ -1026,12 +1049,12 @@ struct backref_ctx {
1026 u64 extent_len; 1049 u64 extent_len;
1027 1050
1028 /* Just to check for bugs in backref resolving */ 1051 /* Just to check for bugs in backref resolving */
1029 int found_in_send_root; 1052 int found_itself;
1030}; 1053};
1031 1054
1032static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1055static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1033{ 1056{
1034 u64 root = (u64)key; 1057 u64 root = (u64)(uintptr_t)key;
1035 struct clone_root *cr = (struct clone_root *)elt; 1058 struct clone_root *cr = (struct clone_root *)elt;
1036 1059
1037 if (root < cr->root->objectid) 1060 if (root < cr->root->objectid)
@@ -1055,6 +1078,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
1055 1078
1056/* 1079/*
1057 * Called for every backref that is found for the current extent. 1080 * Called for every backref that is found for the current extent.
1081 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1058 */ 1082 */
1059static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) 1083static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1060{ 1084{
@@ -1064,7 +1088,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1064 u64 i_size; 1088 u64 i_size;
1065 1089
1066 /* First check if the root is in the list of accepted clone sources */ 1090 /* First check if the root is in the list of accepted clone sources */
1067 found = bsearch((void *)root, bctx->sctx->clone_roots, 1091 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1068 bctx->sctx->clone_roots_cnt, 1092 bctx->sctx->clone_roots_cnt,
1069 sizeof(struct clone_root), 1093 sizeof(struct clone_root),
1070 __clone_root_cmp_bsearch); 1094 __clone_root_cmp_bsearch);
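Because bsearch() only passes pointers around, the u64 root id is smuggled through the key argument via (void *)(uintptr_t); this works as long as the value fits in a pointer, which is exactly why the cast matters on 32-bit builds. A self-contained illustration of the trick:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct clone_root {
	uint64_t objectid;
};

/* The key is the value itself, smuggled through the pointer argument. */
static int cmp_key(const void *key, const void *elt)
{
	uint64_t k = (uint64_t)(uintptr_t)key;
	const struct clone_root *cr = elt;

	if (k < cr->objectid)
		return -1;
	if (k > cr->objectid)
		return 1;
	return 0;
}

int main(void)
{
	struct clone_root roots[] = { {5}, {17}, {42} };	/* must be sorted */
	struct clone_root *found;

	found = bsearch((void *)(uintptr_t)17, roots, 3,
			sizeof(roots[0]), cmp_key);
	printf("found: %s\n", found ? "yes" : "no");
	return 0;
}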
@@ -1074,14 +1098,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1074 if (found->root == bctx->sctx->send_root && 1098 if (found->root == bctx->sctx->send_root &&
1075 ino == bctx->cur_objectid && 1099 ino == bctx->cur_objectid &&
1076 offset == bctx->cur_offset) { 1100 offset == bctx->cur_offset) {
1077 bctx->found_in_send_root = 1; 1101 bctx->found_itself = 1;
1078 } 1102 }
1079 1103
1080 /* 1104 /*
1081 * There are inodes that have extents that lie behind it's i_size. Don't 1105 * There are inodes that have extents that lie behind its i_size. Don't
1082 * accept clones from these extents. 1106 * accept clones from these extents.
1083 */ 1107 */
1084 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); 1108 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
1109 NULL);
1085 if (ret < 0) 1110 if (ret < 0)
1086 return ret; 1111 return ret;
1087 1112
@@ -1101,16 +1126,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1101 */ 1126 */
1102 if (ino >= bctx->cur_objectid) 1127 if (ino >= bctx->cur_objectid)
1103 return 0; 1128 return 0;
1104 /*if (ino > ctx->cur_objectid) 1129#if 0
1130 if (ino > bctx->cur_objectid)
1105 return 0; 1131 return 0;
1106 if (offset + ctx->extent_len > ctx->cur_offset) 1132 if (offset + bctx->extent_len > bctx->cur_offset)
1107 return 0;*/ 1133 return 0;
1108 1134#endif
1109 bctx->found++;
1110 found->found_refs++;
1111 found->ino = ino;
1112 found->offset = offset;
1113 return 0;
1114 } 1135 }
1115 1136
1116 bctx->found++; 1137 bctx->found++;
@@ -1130,6 +1151,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1130} 1151}
1131 1152
1132/* 1153/*
1154 * Given an inode, offset and extent item, it finds a good clone for a clone
1155 * instruction. Returns -ENOENT when none could be found. The function makes
1156 * sure that the returned clone is usable at the point where sending is at the
1157 * moment. This means that no clones are accepted which lie behind the current
1158 * inode+offset.
1159 *
1133 * path must point to the extent item when called. 1160 * path must point to the extent item when called.
1134 */ 1161 */
1135static int find_extent_clone(struct send_ctx *sctx, 1162static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1168,29 @@ static int find_extent_clone(struct send_ctx *sctx,
1141 int ret; 1168 int ret;
1142 int extent_type; 1169 int extent_type;
1143 u64 logical; 1170 u64 logical;
1171 u64 disk_byte;
1144 u64 num_bytes; 1172 u64 num_bytes;
1145 u64 extent_item_pos; 1173 u64 extent_item_pos;
1174 u64 flags = 0;
1146 struct btrfs_file_extent_item *fi; 1175 struct btrfs_file_extent_item *fi;
1147 struct extent_buffer *eb = path->nodes[0]; 1176 struct extent_buffer *eb = path->nodes[0];
1148 struct backref_ctx backref_ctx; 1177 struct backref_ctx *backref_ctx = NULL;
1149 struct clone_root *cur_clone_root; 1178 struct clone_root *cur_clone_root;
1150 struct btrfs_key found_key; 1179 struct btrfs_key found_key;
1151 struct btrfs_path *tmp_path; 1180 struct btrfs_path *tmp_path;
1181 int compressed;
1152 u32 i; 1182 u32 i;
1153 1183
1154 tmp_path = alloc_path_for_send(); 1184 tmp_path = alloc_path_for_send();
1155 if (!tmp_path) 1185 if (!tmp_path)
1156 return -ENOMEM; 1186 return -ENOMEM;
1157 1187
1188 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
1189 if (!backref_ctx) {
1190 ret = -ENOMEM;
1191 goto out;
1192 }
1193
1158 if (data_offset >= ino_size) { 1194 if (data_offset >= ino_size) {
1159 /* 1195 /*
1160 * There may be extents that lie behind the file's size. 1196 * There may be extents that lie behind the file's size.
@@ -1172,22 +1208,23 @@ static int find_extent_clone(struct send_ctx *sctx,
1172 ret = -ENOENT; 1208 ret = -ENOENT;
1173 goto out; 1209 goto out;
1174 } 1210 }
1211 compressed = btrfs_file_extent_compression(eb, fi);
1175 1212
1176 num_bytes = btrfs_file_extent_num_bytes(eb, fi); 1213 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1177 logical = btrfs_file_extent_disk_bytenr(eb, fi); 1214 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1178 if (logical == 0) { 1215 if (disk_byte == 0) {
1179 ret = -ENOENT; 1216 ret = -ENOENT;
1180 goto out; 1217 goto out;
1181 } 1218 }
1182 logical += btrfs_file_extent_offset(eb, fi); 1219 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1183 1220
1184 ret = extent_from_logical(sctx->send_root->fs_info, 1221 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
1185 logical, tmp_path, &found_key); 1222 &found_key, &flags);
1186 btrfs_release_path(tmp_path); 1223 btrfs_release_path(tmp_path);
1187 1224
1188 if (ret < 0) 1225 if (ret < 0)
1189 goto out; 1226 goto out;
1190 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1227 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1191 ret = -EIO; 1228 ret = -EIO;
1192 goto out; 1229 goto out;
1193 } 1230 }
@@ -1202,12 +1239,12 @@ static int find_extent_clone(struct send_ctx *sctx,
1202 cur_clone_root->found_refs = 0; 1239 cur_clone_root->found_refs = 0;
1203 } 1240 }
1204 1241
1205 backref_ctx.sctx = sctx; 1242 backref_ctx->sctx = sctx;
1206 backref_ctx.found = 0; 1243 backref_ctx->found = 0;
1207 backref_ctx.cur_objectid = ino; 1244 backref_ctx->cur_objectid = ino;
1208 backref_ctx.cur_offset = data_offset; 1245 backref_ctx->cur_offset = data_offset;
1209 backref_ctx.found_in_send_root = 0; 1246 backref_ctx->found_itself = 0;
1210 backref_ctx.extent_len = num_bytes; 1247 backref_ctx->extent_len = num_bytes;
1211 1248
1212 /* 1249 /*
1213 * The last extent of a file may be too large due to page alignment. 1250 * The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1252,31 @@ static int find_extent_clone(struct send_ctx *sctx,
1215 * __iterate_backrefs work. 1252 * __iterate_backrefs work.
1216 */ 1253 */
1217 if (data_offset + num_bytes >= ino_size) 1254 if (data_offset + num_bytes >= ino_size)
1218 backref_ctx.extent_len = ino_size - data_offset; 1255 backref_ctx->extent_len = ino_size - data_offset;
1219 1256
1220 /* 1257 /*
1221 * Now collect all backrefs. 1258 * Now collect all backrefs.
1222 */ 1259 */
1260 if (compressed == BTRFS_COMPRESS_NONE)
1261 extent_item_pos = logical - found_key.objectid;
1262 else
1263 extent_item_pos = 0;
1264
1223 extent_item_pos = logical - found_key.objectid; 1265 extent_item_pos = logical - found_key.objectid;
1224 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1266 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1225 found_key.objectid, extent_item_pos, 1, 1267 found_key.objectid, extent_item_pos, 1,
1226 __iterate_backrefs, &backref_ctx); 1268 __iterate_backrefs, backref_ctx);
1269
1227 if (ret < 0) 1270 if (ret < 0)
1228 goto out; 1271 goto out;
1229 1272
1230 if (!backref_ctx.found_in_send_root) { 1273 if (!backref_ctx->found_itself) {
1231 /* found a bug in backref code? */ 1274 /* found a bug in backref code? */
1232 ret = -EIO; 1275 ret = -EIO;
1233 printk(KERN_ERR "btrfs: ERROR did not find backref in " 1276 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1234 "send_root. inode=%llu, offset=%llu, " 1277 "send_root. inode=%llu, offset=%llu, "
1235 "logical=%llu\n", 1278 "disk_byte=%llu found extent=%llu\n",
1236 ino, data_offset, logical); 1279 ino, data_offset, disk_byte, found_key.objectid);
1237 goto out; 1280 goto out;
1238 } 1281 }
1239 1282
@@ -1242,7 +1285,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1242 "num_bytes=%llu, logical=%llu\n", 1285 "num_bytes=%llu, logical=%llu\n",
1243 data_offset, ino, num_bytes, logical); 1286 data_offset, ino, num_bytes, logical);
1244 1287
1245 if (!backref_ctx.found) 1288 if (!backref_ctx->found)
1246 verbose_printk("btrfs: no clones found\n"); 1289 verbose_printk("btrfs: no clones found\n");
1247 1290
1248 cur_clone_root = NULL; 1291 cur_clone_root = NULL;
@@ -1253,7 +1296,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1253 else if (sctx->clone_roots[i].root == sctx->send_root) 1296 else if (sctx->clone_roots[i].root == sctx->send_root)
1254 /* prefer clones from send_root over others */ 1297 /* prefer clones from send_root over others */
1255 cur_clone_root = sctx->clone_roots + i; 1298 cur_clone_root = sctx->clone_roots + i;
1256 break;
1257 } 1299 }
1258 1300
1259 } 1301 }
@@ -1267,6 +1309,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1267 1309
1268out: 1310out:
1269 btrfs_free_path(tmp_path); 1311 btrfs_free_path(tmp_path);
1312 kfree(backref_ctx);
1270 return ret; 1313 return ret;
1271} 1314}
1272 1315
@@ -1307,8 +1350,6 @@ static int read_symlink(struct send_ctx *sctx,
1307 len = btrfs_file_extent_inline_len(path->nodes[0], ei); 1350 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1308 1351
1309 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1352 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1310 if (ret < 0)
1311 goto out;
1312 1353
1313out: 1354out:
1314 btrfs_free_path(path); 1355 btrfs_free_path(path);
@@ -1404,7 +1445,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1404 u64 right_gen; 1445 u64 right_gen;
1405 1446
1406 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, 1447 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1407 NULL); 1448 NULL, NULL);
1408 if (ret < 0 && ret != -ENOENT) 1449 if (ret < 0 && ret != -ENOENT)
1409 goto out; 1450 goto out;
1410 left_ret = ret; 1451 left_ret = ret;
@@ -1413,16 +1454,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1413 right_ret = -ENOENT; 1454 right_ret = -ENOENT;
1414 } else { 1455 } else {
1415 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, 1456 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1416 NULL, NULL, NULL); 1457 NULL, NULL, NULL, NULL);
1417 if (ret < 0 && ret != -ENOENT) 1458 if (ret < 0 && ret != -ENOENT)
1418 goto out; 1459 goto out;
1419 right_ret = ret; 1460 right_ret = ret;
1420 } 1461 }
1421 1462
1422 if (!left_ret && !right_ret) { 1463 if (!left_ret && !right_ret) {
1423 if (left_gen == gen && right_gen == gen) 1464 if (left_gen == gen && right_gen == gen) {
1424 ret = inode_state_no_change; 1465 ret = inode_state_no_change;
1425 else if (left_gen == gen) { 1466 } else if (left_gen == gen) {
1426 if (ino < sctx->send_progress) 1467 if (ino < sctx->send_progress)
1427 ret = inode_state_did_create; 1468 ret = inode_state_did_create;
1428 else 1469 else
@@ -1516,6 +1557,10 @@ out:
1516 return ret; 1557 return ret;
1517} 1558}
1518 1559
1560/*
1561 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1562 * generation of the parent dir and the name of the dir entry.
1563 */
1519static int get_first_ref(struct send_ctx *sctx, 1564static int get_first_ref(struct send_ctx *sctx,
1520 struct btrfs_root *root, u64 ino, 1565 struct btrfs_root *root, u64 ino,
1521 u64 *dir, u64 *dir_gen, struct fs_path *name) 1566 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1524,8 +1569,8 @@ static int get_first_ref(struct send_ctx *sctx,
1524 struct btrfs_key key; 1569 struct btrfs_key key;
1525 struct btrfs_key found_key; 1570 struct btrfs_key found_key;
1526 struct btrfs_path *path; 1571 struct btrfs_path *path;
1527 struct btrfs_inode_ref *iref;
1528 int len; 1572 int len;
1573 u64 parent_dir;
1529 1574
1530 path = alloc_path_for_send(); 1575 path = alloc_path_for_send();
1531 if (!path) 1576 if (!path)
@@ -1541,27 +1586,41 @@ static int get_first_ref(struct send_ctx *sctx,
1541 if (!ret) 1586 if (!ret)
1542 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1587 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1543 path->slots[0]); 1588 path->slots[0]);
1544 if (ret || found_key.objectid != key.objectid || 1589 if (ret || found_key.objectid != ino ||
1545 found_key.type != key.type) { 1590 (found_key.type != BTRFS_INODE_REF_KEY &&
1591 found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1546 ret = -ENOENT; 1592 ret = -ENOENT;
1547 goto out; 1593 goto out;
1548 } 1594 }
1549 1595
1550 iref = btrfs_item_ptr(path->nodes[0], path->slots[0], 1596 if (key.type == BTRFS_INODE_REF_KEY) {
1551 struct btrfs_inode_ref); 1597 struct btrfs_inode_ref *iref;
1552 len = btrfs_inode_ref_name_len(path->nodes[0], iref); 1598 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1553 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 1599 struct btrfs_inode_ref);
1554 (unsigned long)(iref + 1), len); 1600 len = btrfs_inode_ref_name_len(path->nodes[0], iref);
1601 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1602 (unsigned long)(iref + 1),
1603 len);
1604 parent_dir = found_key.offset;
1605 } else {
1606 struct btrfs_inode_extref *extref;
1607 extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1608 struct btrfs_inode_extref);
1609 len = btrfs_inode_extref_name_len(path->nodes[0], extref);
1610 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1611 (unsigned long)&extref->name, len);
1612 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
1613 }
1555 if (ret < 0) 1614 if (ret < 0)
1556 goto out; 1615 goto out;
1557 btrfs_release_path(path); 1616 btrfs_release_path(path);
1558 1617
1559 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, 1618 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL,
1560 NULL); 1619 NULL, NULL);
1561 if (ret < 0) 1620 if (ret < 0)
1562 goto out; 1621 goto out;
1563 1622
1564 *dir = found_key.offset; 1623 *dir = parent_dir;
1565 1624
1566out: 1625out:
1567 btrfs_free_path(path); 1626 btrfs_free_path(path);
@@ -1586,22 +1645,28 @@ static int is_first_ref(struct send_ctx *sctx,
1586 if (ret < 0) 1645 if (ret < 0)
1587 goto out; 1646 goto out;
1588 1647
1589 if (name_len != fs_path_len(tmp_name)) { 1648 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
1590 ret = 0; 1649 ret = 0;
1591 goto out; 1650 goto out;
1592 } 1651 }
1593 1652
1594 ret = memcmp(tmp_name->start, name, name_len); 1653 ret = !memcmp(tmp_name->start, name, name_len);
1595 if (ret)
1596 ret = 0;
1597 else
1598 ret = 1;
1599 1654
1600out: 1655out:
1601 fs_path_free(sctx, tmp_name); 1656 fs_path_free(sctx, tmp_name);
1602 return ret; 1657 return ret;
1603} 1658}
1604 1659
1660/*
1661 * Used by process_recorded_refs to determine if a new ref would overwrite an
1662 * already existing ref. In case it detects an overwrite, it returns the
1663 * inode/gen in who_ino/who_gen.
1664 * When an overwrite is detected, process_recorded_refs does proper orphanizing
1665 * to make sure later references to the overwritten inode are possible.
1666 * Orphanizing is however only required for the first ref of an inode.
1667 * process_recorded_refs does an additional is_first_ref check to see if
1668 * orphanizing is really required.
1669 */
1605static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, 1670static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1606 const char *name, int name_len, 1671 const char *name, int name_len,
1607 u64 *who_ino, u64 *who_gen) 1672 u64 *who_ino, u64 *who_gen)
@@ -1626,9 +1691,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1626 goto out; 1691 goto out;
1627 } 1692 }
1628 1693
1694 /*
1695 * Check if the overwritten ref was already processed. If yes, the ref
1696 * was already unlinked/moved, so we can safely assume that we will not
1697 * overwrite anything at this point in time.
1698 */
1629 if (other_inode > sctx->send_progress) { 1699 if (other_inode > sctx->send_progress) {
1630 ret = get_inode_info(sctx->parent_root, other_inode, NULL, 1700 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1631 who_gen, NULL, NULL, NULL); 1701 who_gen, NULL, NULL, NULL, NULL);
1632 if (ret < 0) 1702 if (ret < 0)
1633 goto out; 1703 goto out;
1634 1704
@@ -1642,6 +1712,13 @@ out:
1642 return ret; 1712 return ret;
1643} 1713}
1644 1714
1715/*
1716 * Checks if the ref was overwritten by an already processed inode. This is
1717 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
1718 * thus the orphan name needs to be used.
1719 * process_recorded_refs also uses it to avoid unlinking of refs that were
1720 * overwritten.
1721 */
1645static int did_overwrite_ref(struct send_ctx *sctx, 1722static int did_overwrite_ref(struct send_ctx *sctx,
1646 u64 dir, u64 dir_gen, 1723 u64 dir, u64 dir_gen,
1647 u64 ino, u64 ino_gen, 1724 u64 ino, u64 ino_gen,
@@ -1671,7 +1748,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
1671 } 1748 }
1672 1749
1673 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, 1750 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1674 NULL); 1751 NULL, NULL);
1675 if (ret < 0) 1752 if (ret < 0)
1676 goto out; 1753 goto out;
1677 1754
@@ -1690,6 +1767,11 @@ out:
1690 return ret; 1767 return ret;
1691} 1768}
1692 1769
1770/*
1771 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
1772 * that got overwritten. This is used by process_recorded_refs to determine
1773 * if it has to use the path as returned by get_cur_path or the orphan name.
1774 */
1693static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) 1775static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1694{ 1776{
1695 int ret = 0; 1777 int ret = 0;
@@ -1710,39 +1792,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1710 1792
1711 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, 1793 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
1712 name->start, fs_path_len(name)); 1794 name->start, fs_path_len(name));
1713 if (ret < 0)
1714 goto out;
1715 1795
1716out: 1796out:
1717 fs_path_free(sctx, name); 1797 fs_path_free(sctx, name);
1718 return ret; 1798 return ret;
1719} 1799}
1720 1800
1801/*
1802 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
1803 * so we need to do some special handling in case we have clashes. This function
1804 * takes care of this with the help of name_cache_entry::radix_list.
1805 * In case of error, nce is kfreed.
1806 */
1721static int name_cache_insert(struct send_ctx *sctx, 1807static int name_cache_insert(struct send_ctx *sctx,
1722 struct name_cache_entry *nce) 1808 struct name_cache_entry *nce)
1723{ 1809{
1724 int ret = 0; 1810 int ret = 0;
1725 struct name_cache_entry **ncea; 1811 struct list_head *nce_head;
1726 1812
1727 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); 1813 nce_head = radix_tree_lookup(&sctx->name_cache,
1728 if (ncea) { 1814 (unsigned long)nce->ino);
1729 if (!ncea[0]) 1815 if (!nce_head) {
1730 ncea[0] = nce; 1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1731 else if (!ncea[1]) 1817 if (!nce_head)
1732 ncea[1] = nce;
1733 else
1734 BUG();
1735 } else {
1736 ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
1737 if (!ncea)
1738 return -ENOMEM; 1818 return -ENOMEM;
1819 INIT_LIST_HEAD(nce_head);
1739 1820
1740 ncea[0] = nce; 1821 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
1741 ncea[1] = NULL; 1822 if (ret < 0) {
1742 ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); 1823 kfree(nce_head);
1743 if (ret < 0) 1824 kfree(nce);
1744 return ret; 1825 return ret;
1826 }
1745 } 1827 }
1828 list_add_tail(&nce->radix_list, nce_head);
1746 list_add_tail(&nce->list, &sctx->name_cache_list); 1829 list_add_tail(&nce->list, &sctx->name_cache_list);
1747 sctx->name_cache_size++; 1830 sctx->name_cache_size++;
1748 1831
@@ -1752,50 +1835,52 @@ static int name_cache_insert(struct send_ctx *sctx,
1752static void name_cache_delete(struct send_ctx *sctx, 1835static void name_cache_delete(struct send_ctx *sctx,
1753 struct name_cache_entry *nce) 1836 struct name_cache_entry *nce)
1754{ 1837{
1755 struct name_cache_entry **ncea; 1838 struct list_head *nce_head;
1756
1757 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1758 BUG_ON(!ncea);
1759
1760 if (ncea[0] == nce)
1761 ncea[0] = NULL;
1762 else if (ncea[1] == nce)
1763 ncea[1] = NULL;
1764 else
1765 BUG();
1766 1839
1767 if (!ncea[0] && !ncea[1]) { 1840 nce_head = radix_tree_lookup(&sctx->name_cache,
1768 radix_tree_delete(&sctx->name_cache, nce->ino); 1841 (unsigned long)nce->ino);
1769 kfree(ncea); 1842 BUG_ON(!nce_head);
1770 }
1771 1843
1844 list_del(&nce->radix_list);
1772 list_del(&nce->list); 1845 list_del(&nce->list);
1773
1774 sctx->name_cache_size--; 1846 sctx->name_cache_size--;
1847
1848 if (list_empty(nce_head)) {
1849 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1850 kfree(nce_head);
1851 }
1775} 1852}
1776 1853
1777static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, 1854static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
1778 u64 ino, u64 gen) 1855 u64 ino, u64 gen)
1779{ 1856{
1780 struct name_cache_entry **ncea; 1857 struct list_head *nce_head;
1858 struct name_cache_entry *cur;
1781 1859
1782 ncea = radix_tree_lookup(&sctx->name_cache, ino); 1860 nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
1783 if (!ncea) 1861 if (!nce_head)
1784 return NULL; 1862 return NULL;
1785 1863
1786 if (ncea[0] && ncea[0]->gen == gen) 1864 list_for_each_entry(cur, nce_head, radix_list) {
1787 return ncea[0]; 1865 if (cur->ino == ino && cur->gen == gen)
1788 else if (ncea[1] && ncea[1]->gen == gen) 1866 return cur;
1789 return ncea[1]; 1867 }
1790 return NULL; 1868 return NULL;
1791} 1869}
1792 1870
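The pattern behind these three functions is worth naming: when the index structure's key is narrower than the real key (a 32-bit radix slot versus a 64-bit inum plus a generation), each slot stores a small list and every lookup re-checks the full key. A compact sketch of the same idea, with an ordinary hash table standing in for the radix tree:

#include <stdint.h>
#include <stdlib.h>

#define NSLOTS 1024

struct nce {
	uint64_t ino;
	uint64_t gen;
	struct nce *next;	/* collision list, playing the role of radix_list */
};

static struct nce *slots[NSLOTS];

/* Only the low bits of ino pick the slot; the full (ino, gen) pair is
 * re-checked on lookup, just like name_cache_search() does. */
static unsigned slot_of(uint64_t ino)
{
	return (uint32_t)ino % NSLOTS;
}

static int cache_insert(uint64_t ino, uint64_t gen)
{
	struct nce *e = malloc(sizeof(*e));

	if (!e)
		return -1;
	e->ino = ino;
	e->gen = gen;
	e->next = slots[slot_of(ino)];
	slots[slot_of(ino)] = e;
	return 0;
}

static struct nce *cache_lookup(uint64_t ino, uint64_t gen)
{
	struct nce *e;

	for (e = slots[slot_of(ino)]; e; e = e->next)
		if (e->ino == ino && e->gen == gen)
			return e;
	return NULL;
}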
1871/*
1872 * Removes the entry from the list and adds it back to the end. This marks the
1873 * entry as recently used so that name_cache_clean_unused does not remove it.
1874 */
1793static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) 1875static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
1794{ 1876{
1795 list_del(&nce->list); 1877 list_del(&nce->list);
1796 list_add_tail(&nce->list, &sctx->name_cache_list); 1878 list_add_tail(&nce->list, &sctx->name_cache_list);
1797} 1879}
1798 1880
1881/*
1882 * Remove some entries from the beginning of name_cache_list.
1883 */
1799static void name_cache_clean_unused(struct send_ctx *sctx) 1884static void name_cache_clean_unused(struct send_ctx *sctx)
1800{ 1885{
1801 struct name_cache_entry *nce; 1886 struct name_cache_entry *nce;
@@ -1814,13 +1899,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
1814static void name_cache_free(struct send_ctx *sctx) 1899static void name_cache_free(struct send_ctx *sctx)
1815{ 1900{
1816 struct name_cache_entry *nce; 1901 struct name_cache_entry *nce;
1817 struct name_cache_entry *tmp;
1818 1902
1819 list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { 1903 while (!list_empty(&sctx->name_cache_list)) {
1904 nce = list_entry(sctx->name_cache_list.next,
1905 struct name_cache_entry, list);
1820 name_cache_delete(sctx, nce); 1906 name_cache_delete(sctx, nce);
1907 kfree(nce);
1821 } 1908 }
1822} 1909}
1823 1910
1911/*
1912 * Used by get_cur_path for each ref up to the root.
1913 * Returns 0 if it succeeded.
1914 * Returns 1 if the inode does not exist or got overwritten. In that case, the
1915 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
1916 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
1917 * Returns <0 in case of error.
1918 */
1824static int __get_cur_name_and_parent(struct send_ctx *sctx, 1919static int __get_cur_name_and_parent(struct send_ctx *sctx,
1825 u64 ino, u64 gen, 1920 u64 ino, u64 gen,
1826 u64 *parent_ino, 1921 u64 *parent_ino,
@@ -1832,6 +1927,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1832 struct btrfs_path *path = NULL; 1927 struct btrfs_path *path = NULL;
1833 struct name_cache_entry *nce = NULL; 1928 struct name_cache_entry *nce = NULL;
1834 1929
1930 /*
1931 * First check if we already did a call to this function with the same
1932 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
1933 * return the cached result.
1934 */
1835 nce = name_cache_search(sctx, ino, gen); 1935 nce = name_cache_search(sctx, ino, gen);
1836 if (nce) { 1936 if (nce) {
1837 if (ino < sctx->send_progress && nce->need_later_update) { 1937 if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1954,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1854 if (!path) 1954 if (!path)
1855 return -ENOMEM; 1955 return -ENOMEM;
1856 1956
1957 /*
1958 * If the inode does not exist yet, add the orphan name and return 1.
1959 * This should only happen for the parent dir that we determine in
1960 * __record_new_ref
1961 */
1857 ret = is_inode_existent(sctx, ino, gen); 1962 ret = is_inode_existent(sctx, ino, gen);
1858 if (ret < 0) 1963 if (ret < 0)
1859 goto out; 1964 goto out;
@@ -1866,6 +1971,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1866 goto out_cache; 1971 goto out_cache;
1867 } 1972 }
1868 1973
1974 /*
1975 * Depending on whether the inode was already processed or not, use
1976 * send_root or parent_root for ref lookup.
1977 */
1869 if (ino < sctx->send_progress) 1978 if (ino < sctx->send_progress)
1870 ret = get_first_ref(sctx, sctx->send_root, ino, 1979 ret = get_first_ref(sctx, sctx->send_root, ino,
1871 parent_ino, parent_gen, dest); 1980 parent_ino, parent_gen, dest);
@@ -1875,6 +1984,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1875 if (ret < 0) 1984 if (ret < 0)
1876 goto out; 1985 goto out;
1877 1986
1987 /*
1988 * Check if the ref was overwritten by an inode's ref that was processed
1989 * earlier. If yes, treat as orphan and return 1.
1990 */
1878 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, 1991 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
1879 dest->start, dest->end - dest->start); 1992 dest->start, dest->end - dest->start);
1880 if (ret < 0) 1993 if (ret < 0)
@@ -1888,6 +2001,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1888 } 2001 }
1889 2002
1890out_cache: 2003out_cache:
2004 /*
2005 * Store the result of the lookup in the name cache.
2006 */
1891 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 2007 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
1892 if (!nce) { 2008 if (!nce) {
1893 ret = -ENOMEM; 2009 ret = -ENOMEM;
@@ -1901,7 +2017,6 @@ out_cache:
1901 nce->name_len = fs_path_len(dest); 2017 nce->name_len = fs_path_len(dest);
1902 nce->ret = ret; 2018 nce->ret = ret;
1903 strcpy(nce->name, dest->start); 2019 strcpy(nce->name, dest->start);
1904 memset(&nce->use_list, 0, sizeof(nce->use_list));
1905 2020
1906 if (ino < sctx->send_progress) 2021 if (ino < sctx->send_progress)
1907 nce->need_later_update = 0; 2022 nce->need_later_update = 0;
@@ -2107,9 +2222,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
2107 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); 2222 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2108 btrfs_release_path(path); 2223 btrfs_release_path(path);
2109 2224
2110 if (ret < 0)
2111 goto out;
2112
2113 if (parent_root) { 2225 if (parent_root) {
2114 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); 2226 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2115 if (ret < 0) 2227 if (ret < 0)
@@ -2276,7 +2388,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2276 btrfs_inode_mtime(ii)); 2388 btrfs_inode_mtime(ii));
2277 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, 2389 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2278 btrfs_inode_ctime(ii)); 2390 btrfs_inode_ctime(ii));
2279 /* TODO otime? */ 2391 /* TODO Add otime support when the otime patches get into upstream */
2280 2392
2281 ret = send_cmd(sctx); 2393 ret = send_cmd(sctx);
2282 2394
@@ -2292,39 +2404,39 @@ out:
2292 * a valid path yet because we did not process the refs yet. So, the inode 2404 * a valid path yet because we did not process the refs yet. So, the inode
2293 * is created as orphan. 2405 * is created as orphan.
2294 */ 2406 */
2295static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, 2407static int send_create_inode(struct send_ctx *sctx, u64 ino)
2296 struct btrfs_key *key)
2297{ 2408{
2298 int ret = 0; 2409 int ret = 0;
2299 struct extent_buffer *eb = path->nodes[0];
2300 struct btrfs_inode_item *ii;
2301 struct fs_path *p; 2410 struct fs_path *p;
2302 int slot = path->slots[0];
2303 int cmd; 2411 int cmd;
2412 u64 gen;
2304 u64 mode; 2413 u64 mode;
2414 u64 rdev;
2305 2415
2306verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); 2416verbose_printk("btrfs: send_create_inode %llu\n", ino);
2307 2417
2308 p = fs_path_alloc(sctx); 2418 p = fs_path_alloc(sctx);
2309 if (!p) 2419 if (!p)
2310 return -ENOMEM; 2420 return -ENOMEM;
2311 2421
2312 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 2422 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
2313 mode = btrfs_inode_mode(eb, ii); 2423 NULL, &rdev);
2424 if (ret < 0)
2425 goto out;
2314 2426
2315 if (S_ISREG(mode)) 2427 if (S_ISREG(mode)) {
2316 cmd = BTRFS_SEND_C_MKFILE; 2428 cmd = BTRFS_SEND_C_MKFILE;
2317 else if (S_ISDIR(mode)) 2429 } else if (S_ISDIR(mode)) {
2318 cmd = BTRFS_SEND_C_MKDIR; 2430 cmd = BTRFS_SEND_C_MKDIR;
2319 else if (S_ISLNK(mode)) 2431 } else if (S_ISLNK(mode)) {
2320 cmd = BTRFS_SEND_C_SYMLINK; 2432 cmd = BTRFS_SEND_C_SYMLINK;
2321 else if (S_ISCHR(mode) || S_ISBLK(mode)) 2433 } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2322 cmd = BTRFS_SEND_C_MKNOD; 2434 cmd = BTRFS_SEND_C_MKNOD;
2323 else if (S_ISFIFO(mode)) 2435 } else if (S_ISFIFO(mode)) {
2324 cmd = BTRFS_SEND_C_MKFIFO; 2436 cmd = BTRFS_SEND_C_MKFIFO;
2325 else if (S_ISSOCK(mode)) 2437 } else if (S_ISSOCK(mode)) {
2326 cmd = BTRFS_SEND_C_MKSOCK; 2438 cmd = BTRFS_SEND_C_MKSOCK;
2327 else { 2439 } else {
2328 printk(KERN_WARNING "btrfs: unexpected inode type %o", 2440 printk(KERN_WARNING "btrfs: unexpected inode type %o",
2329 (int)(mode & S_IFMT)); 2441 (int)(mode & S_IFMT));
2330 ret = -ENOTSUPP; 2442 ret = -ENOTSUPP;
@@ -2335,22 +2447,23 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
2335 if (ret < 0) 2447 if (ret < 0)
2336 goto out; 2448 goto out;
2337 2449
2338 ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 2450 ret = gen_unique_name(sctx, ino, gen, p);
2339 if (ret < 0) 2451 if (ret < 0)
2340 goto out; 2452 goto out;
2341 2453
2342 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2454 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2343 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); 2455 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2344 2456
2345 if (S_ISLNK(mode)) { 2457 if (S_ISLNK(mode)) {
2346 fs_path_reset(p); 2458 fs_path_reset(p);
2347 ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); 2459 ret = read_symlink(sctx, sctx->send_root, ino, p);
2348 if (ret < 0) 2460 if (ret < 0)
2349 goto out; 2461 goto out;
2350 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2462 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2351 } else if (S_ISCHR(mode) || S_ISBLK(mode) || 2463 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2352 S_ISFIFO(mode) || S_ISSOCK(mode)) { 2464 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2353 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); 2465 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2466 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2354 } 2467 }
2355 2468
2356 ret = send_cmd(sctx); 2469 ret = send_cmd(sctx);
@@ -2364,6 +2477,92 @@ out:
2364 return ret; 2477 return ret;
2365} 2478}
2366 2479
2480/*
2481 * We need some special handling for inodes that get processed before the parent
2482 * directory got created. See process_recorded_refs for details.
2483 * This function does the check if we already created the dir out of order.
2484 */
2485static int did_create_dir(struct send_ctx *sctx, u64 dir)
2486{
2487 int ret = 0;
2488 struct btrfs_path *path = NULL;
2489 struct btrfs_key key;
2490 struct btrfs_key found_key;
2491 struct btrfs_key di_key;
2492 struct extent_buffer *eb;
2493 struct btrfs_dir_item *di;
2494 int slot;
2495
2496 path = alloc_path_for_send();
2497 if (!path) {
2498 ret = -ENOMEM;
2499 goto out;
2500 }
2501
2502 key.objectid = dir;
2503 key.type = BTRFS_DIR_INDEX_KEY;
2504 key.offset = 0;
2505 while (1) {
2506 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2507 1, 0);
2508 if (ret < 0)
2509 goto out;
2510 if (!ret) {
2511 eb = path->nodes[0];
2512 slot = path->slots[0];
2513 btrfs_item_key_to_cpu(eb, &found_key, slot);
2514 }
2515 if (ret || found_key.objectid != key.objectid ||
2516 found_key.type != key.type) {
2517 ret = 0;
2518 goto out;
2519 }
2520
2521 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2522 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2523
2524 if (di_key.objectid < sctx->send_progress) {
2525 ret = 1;
2526 goto out;
2527 }
2528
2529 key.offset = found_key.offset + 1;
2530 btrfs_release_path(path);
2531 }
2532
2533out:
2534 btrfs_free_path(path);
2535 return ret;
2536}
2537
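did_create_dir() uses the usual btree cursor idiom: search for (dir, DIR_INDEX, offset), examine the slot, then restart at offset + 1 so the path can be dropped between rounds. The sketch below replays that resumable scan over a sorted array standing in for the tree:

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t objectid; uint64_t offset; };

/* Return the index of the first element >= *k, or n if there is none. */
static size_t search_slot(const struct key *keys, size_t n,
			  const struct key *k)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;
		if (keys[mid].objectid < k->objectid ||
		    (keys[mid].objectid == k->objectid &&
		     keys[mid].offset < k->offset))
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

int main(void)
{
	struct key keys[] = { {5, 1}, {5, 9}, {5, 30}, {7, 2} };
	struct key k = { .objectid = 5, .offset = 0 };

	for (;;) {
		size_t slot = search_slot(keys, 4, &k);

		if (slot == 4 || keys[slot].objectid != 5)
			break;			/* ran past this directory's items */
		printf("dir item at offset %llu\n",
		       (unsigned long long)keys[slot].offset);
		k.offset = keys[slot].offset + 1;	/* resume after it */
	}
	return 0;
}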
2538/*
2539 * Only creates the inode if it is:
2540 * 1. Not a directory
2541 * 2. Or a directory which was not created already due to out of order
2542 * directories. See did_create_dir and process_recorded_refs for details.
2543 */
2544static int send_create_inode_if_needed(struct send_ctx *sctx)
2545{
2546 int ret;
2547
2548 if (S_ISDIR(sctx->cur_inode_mode)) {
2549 ret = did_create_dir(sctx, sctx->cur_ino);
2550 if (ret < 0)
2551 goto out;
2552 if (ret) {
2553 ret = 0;
2554 goto out;
2555 }
2556 }
2557
2558 ret = send_create_inode(sctx, sctx->cur_ino);
2559 if (ret < 0)
2560 goto out;
2561
2562out:
2563 return ret;
2564}
2565
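send_create_inode() now pulls gen, mode and rdev straight from get_inode_info() and dispatches purely on the mode bits, encoding the device number with new_encode_dev() for the MKNOD case. The same classification in user space via lstat(2); the command names mirror the BTRFS_SEND_C_* constants and glibc's major()/minor() stand in for the kernel's device-number encoding:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>	/* major()/minor() on glibc */

static const char *cmd_for_mode(mode_t mode)
{
	if (S_ISREG(mode))  return "MKFILE";
	if (S_ISDIR(mode))  return "MKDIR";
	if (S_ISLNK(mode))  return "SYMLINK";
	if (S_ISCHR(mode) || S_ISBLK(mode)) return "MKNOD";
	if (S_ISFIFO(mode)) return "MKFIFO";
	if (S_ISSOCK(mode)) return "MKSOCK";
	return NULL;	/* unexpected inode type, like the -ENOTSUPP case */
}

int main(int argc, char **argv)
{
	struct stat st;
	const char *cmd;

	if (argc < 2 || lstat(argv[1], &st))
		return 1;
	cmd = cmd_for_mode(st.st_mode);
	if (!cmd)
		return 1;
	printf("%s", cmd);
	if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))
		printf(" rdev=%u:%u", major(st.st_rdev), minor(st.st_rdev));
	printf("\n");
	return 0;
}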
2367struct recorded_ref { 2566struct recorded_ref {
2368 struct list_head list; 2567 struct list_head list;
2369 char *dir_path; 2568 char *dir_path;
@@ -2416,13 +2615,13 @@ static int record_ref(struct list_head *head, u64 dir,
2416static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2615static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2417{ 2616{
2418 struct recorded_ref *cur; 2617 struct recorded_ref *cur;
2419 struct recorded_ref *tmp;
2420 2618
2421 list_for_each_entry_safe(cur, tmp, head, list) { 2619 while (!list_empty(head)) {
2620 cur = list_entry(head->next, struct recorded_ref, list);
2422 fs_path_free(sctx, cur->full_path); 2621 fs_path_free(sctx, cur->full_path);
2622 list_del(&cur->list);
2423 kfree(cur); 2623 kfree(cur);
2424 } 2624 }
2425 INIT_LIST_HEAD(head);
2426} 2625}
2427 2626
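The hunk above swaps list_for_each_entry_safe() for a pop-from-the-head loop, which leaves the list head empty by construction and drops both the tmp cursor and the trailing INIT_LIST_HEAD(). A sketch of the same draining idiom, with a singly linked list as a stand-in for the kernel's list_head:

#include <stdlib.h>

struct ref {
	struct ref *next;
	char *full_path;
};

/* Pop and free entries until the list is empty; the head stays valid. */
static void free_refs(struct ref **head)
{
	while (*head) {
		struct ref *cur = *head;

		*head = cur->next;	/* the kernel code does list_del() */
		free(cur->full_path);
		free(cur);
	}
}

int main(void)
{
	struct ref *head = NULL;

	free_refs(&head);	/* safe on an already-empty list, too */
	return 0;
}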
2428static void free_recorded_refs(struct send_ctx *sctx) 2627static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2631,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
2432} 2631}
2433 2632
2434/* 2633/*
2435 * Renames/moves a file/dir to it's orphan name. Used when the first 2634 * Renames/moves a file/dir to its orphan name. Used when the first
2436 * ref of an unprocessed inode gets overwritten and for all non empty 2635 * ref of an unprocessed inode gets overwritten and for all non empty
2437 * directories. 2636 * directories.
2438 */ 2637 */
@@ -2472,6 +2671,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2472 struct btrfs_key loc; 2671 struct btrfs_key loc;
2473 struct btrfs_dir_item *di; 2672 struct btrfs_dir_item *di;
2474 2673
2674 /*
2675 * Don't try to rmdir the top/root subvolume dir.
2676 */
2677 if (dir == BTRFS_FIRST_FREE_OBJECTID)
2678 return 0;
2679
2475 path = alloc_path_for_send(); 2680 path = alloc_path_for_send();
2476 if (!path) 2681 if (!path)
2477 return -ENOMEM; 2682 return -ENOMEM;
@@ -2513,160 +2718,6 @@ out:
2513 return ret; 2718 return ret;
2514} 2719}
2515 2720
2516struct finish_unordered_dir_ctx {
2517 struct send_ctx *sctx;
2518 struct fs_path *cur_path;
2519 struct fs_path *dir_path;
2520 u64 dir_ino;
2521 int need_delete;
2522 int delete_pass;
2523};
2524
2525int __finish_unordered_dir(int num, struct btrfs_key *di_key,
2526 const char *name, int name_len,
2527 const char *data, int data_len,
2528 u8 type, void *ctx)
2529{
2530 int ret = 0;
2531 struct finish_unordered_dir_ctx *fctx = ctx;
2532 struct send_ctx *sctx = fctx->sctx;
2533 u64 di_gen;
2534 u64 di_mode;
2535 int is_orphan = 0;
2536
2537 if (di_key->objectid >= fctx->dir_ino)
2538 goto out;
2539
2540 fs_path_reset(fctx->cur_path);
2541
2542 ret = get_inode_info(sctx->send_root, di_key->objectid,
2543 NULL, &di_gen, &di_mode, NULL, NULL);
2544 if (ret < 0)
2545 goto out;
2546
2547 ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
2548 fctx->dir_ino, name, name_len);
2549 if (ret < 0)
2550 goto out;
2551 if (ret) {
2552 is_orphan = 1;
2553 ret = gen_unique_name(sctx, di_key->objectid, di_gen,
2554 fctx->cur_path);
2555 } else {
2556 ret = get_cur_path(sctx, di_key->objectid, di_gen,
2557 fctx->cur_path);
2558 }
2559 if (ret < 0)
2560 goto out;
2561
2562 ret = fs_path_add(fctx->dir_path, name, name_len);
2563 if (ret < 0)
2564 goto out;
2565
2566 if (!fctx->delete_pass) {
2567 if (S_ISDIR(di_mode)) {
2568 ret = send_rename(sctx, fctx->cur_path,
2569 fctx->dir_path);
2570 } else {
2571 ret = send_link(sctx, fctx->dir_path,
2572 fctx->cur_path);
2573 if (is_orphan)
2574 fctx->need_delete = 1;
2575 }
2576 } else if (!S_ISDIR(di_mode)) {
2577 ret = send_unlink(sctx, fctx->cur_path);
2578 } else {
2579 ret = 0;
2580 }
2581
2582 fs_path_remove(fctx->dir_path);
2583
2584out:
2585 return ret;
2586}
2587
2588/*
2589 * Go through all dir items and see if we find refs which could not be created
2590 * in the past because the dir did not exist at that time.
2591 */
2592static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
2593{
2594 int ret = 0;
2595 struct btrfs_path *path = NULL;
2596 struct btrfs_key key;
2597 struct btrfs_key found_key;
2598 struct extent_buffer *eb;
2599 struct finish_unordered_dir_ctx fctx;
2600 int slot;
2601
2602 path = alloc_path_for_send();
2603 if (!path) {
2604 ret = -ENOMEM;
2605 goto out;
2606 }
2607
2608 memset(&fctx, 0, sizeof(fctx));
2609 fctx.sctx = sctx;
2610 fctx.cur_path = fs_path_alloc(sctx);
2611 fctx.dir_path = fs_path_alloc(sctx);
2612 if (!fctx.cur_path || !fctx.dir_path) {
2613 ret = -ENOMEM;
2614 goto out;
2615 }
2616 fctx.dir_ino = dir;
2617
2618 ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
2619 if (ret < 0)
2620 goto out;
2621
2622 /*
2623 * We do two passes. The first links in the new refs and the second
2624 * deletes orphans if required. Deletion of orphans is not required for
2625 * directory inodes, as we always have only one ref and use rename
2626 * instead of link for those.
2627 */
2628
2629again:
2630 key.objectid = dir;
2631 key.type = BTRFS_DIR_ITEM_KEY;
2632 key.offset = 0;
2633 while (1) {
2634 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2635 1, 0);
2636 if (ret < 0)
2637 goto out;
2638 eb = path->nodes[0];
2639 slot = path->slots[0];
2640 btrfs_item_key_to_cpu(eb, &found_key, slot);
2641
2642 if (found_key.objectid != key.objectid ||
2643 found_key.type != key.type) {
2644 btrfs_release_path(path);
2645 break;
2646 }
2647
2648 ret = iterate_dir_item(sctx, sctx->send_root, path,
2649 &found_key, __finish_unordered_dir,
2650 &fctx);
2651 if (ret < 0)
2652 goto out;
2653
2654 key.offset = found_key.offset + 1;
2655 btrfs_release_path(path);
2656 }
2657
2658 if (!fctx.delete_pass && fctx.need_delete) {
2659 fctx.delete_pass = 1;
2660 goto again;
2661 }
2662
2663out:
2664 btrfs_free_path(path);
2665 fs_path_free(sctx, fctx.cur_path);
2666 fs_path_free(sctx, fctx.dir_path);
2667 return ret;
2668}
2669
2670/* 2721/*
2671 * This does all the move/link/unlink/rmdir magic. 2722 * This does all the move/link/unlink/rmdir magic.
2672 */ 2723 */
@@ -2674,6 +2725,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
2674{ 2725{
2675 int ret = 0; 2726 int ret = 0;
2676 struct recorded_ref *cur; 2727 struct recorded_ref *cur;
2728 struct recorded_ref *cur2;
2677 struct ulist *check_dirs = NULL; 2729 struct ulist *check_dirs = NULL;
2678 struct ulist_iterator uit; 2730 struct ulist_iterator uit;
2679 struct ulist_node *un; 2731 struct ulist_node *un;
@@ -2685,6 +2737,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
2685 2737
2686verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 2738verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2687 2739
2740 /*
2741 * This should never happen as the root dir always has the same ref
2742 * which is always '..'
2743 */
2744 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2745
2688 valid_path = fs_path_alloc(sctx); 2746 valid_path = fs_path_alloc(sctx);
2689 if (!valid_path) { 2747 if (!valid_path) {
2690 ret = -ENOMEM; 2748 ret = -ENOMEM;
@@ -2731,6 +2789,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2731 2789
2732 list_for_each_entry(cur, &sctx->new_refs, list) { 2790 list_for_each_entry(cur, &sctx->new_refs, list) {
2733 /* 2791 /*
2792 * We may have refs where the parent directory does not exist
2793 * yet. This happens if the parent directory's inum is higher
2794 * than the current inum. To handle this case, we create the
2795 * parent directory out of order. But we need to check if this
2796 * already happened before due to other refs in the same dir.
2797 */
2798 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
2799 if (ret < 0)
2800 goto out;
2801 if (ret == inode_state_will_create) {
2802 ret = 0;
2803 /*
2804 * First check if any of the current inodes refs did
2805 * already create the dir.
2806 */
2807 list_for_each_entry(cur2, &sctx->new_refs, list) {
2808 if (cur == cur2)
2809 break;
2810 if (cur2->dir == cur->dir) {
2811 ret = 1;
2812 break;
2813 }
2814 }
2815
2816 /*
2817 * If that did not happen, check if a previous inode
2818 * already created the dir.
2819 */
2820 if (!ret)
2821 ret = did_create_dir(sctx, cur->dir);
2822 if (ret < 0)
2823 goto out;
2824 if (!ret) {
2825 ret = send_create_inode(sctx, cur->dir);
2826 if (ret < 0)
2827 goto out;
2828 }
2829 }
2830
2831 /*
2734 * Check if this new ref would overwrite the first ref of 2832 * Check if this new ref would overwrite the first ref of
2735 * another unprocessed inode. If yes, orphanize the 2833 * another unprocessed inode. If yes, orphanize the
2736 * overwritten inode. If we find an overwritten ref that is 2834 * overwritten inode. If we find an overwritten ref that is
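Before creating a parent directory out of order, the added block first scans the refs recorded so far for the same inode; only refs strictly before the current one count, which is what the cur == cur2 break implements. A sketch of that scan, with an array of parent inums standing in for sctx->new_refs (an assumption for illustration):

#include <stdio.h>
#include <stdint.h>

static int earlier_ref_created_dir(const uint64_t *dirs, int cur, uint64_t dir)
{
	for (int i = 0; i < cur; i++)	/* stop at the current ref */
		if (dirs[i] == dir)
			return 1;
	return 0;
}

int main(void)
{
	uint64_t dirs[] = { 300, 301, 300 };

	/* ref 2 shares parent 300 with ref 0, so the dir already exists */
	printf("%d\n", earlier_ref_created_dir(dirs, 2, dirs[2]));
	return 0;
}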
@@ -2764,7 +2862,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2764 * inode, move it and update valid_path. If not, link or move 2862 * inode, move it and update valid_path. If not, link or move
2765 * it depending on the inode mode. 2863 * it depending on the inode mode.
2766 */ 2864 */
2767 if (is_orphan && !sctx->cur_inode_first_ref_orphan) { 2865 if (is_orphan) {
2768 ret = send_rename(sctx, valid_path, cur->full_path); 2866 ret = send_rename(sctx, valid_path, cur->full_path);
2769 if (ret < 0) 2867 if (ret < 0)
2770 goto out; 2868 goto out;
@@ -2827,6 +2925,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2827 if (ret < 0) 2925 if (ret < 0)
2828 goto out; 2926 goto out;
2829 } 2927 }
2928 } else if (S_ISDIR(sctx->cur_inode_mode) &&
2929 !list_empty(&sctx->deleted_refs)) {
2930 /*
2931 * We have a moved dir. Add the old parent to check_dirs
2932 */
2933 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
2934 list);
2935 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2936 GFP_NOFS);
2937 if (ret < 0)
2938 goto out;
2830 } else if (!S_ISDIR(sctx->cur_inode_mode)) { 2939 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
2831 /* 2940 /*
2832 * We have a non dir inode. Go through all deleted refs and 2941 * We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2949,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2840 if (ret < 0) 2949 if (ret < 0)
2841 goto out; 2950 goto out;
2842 if (!ret) { 2951 if (!ret) {
2843 /* 2952 ret = send_unlink(sctx, cur->full_path);
2844 * In case the inode was moved to a directory 2953 if (ret < 0)
2845 * that was not created yet (see 2954 goto out;
2846 * __record_new_ref), we can not unlink the ref
2847 * as it will be needed later when the parent
2848 * directory is created, so that we can move in
2849 * the inode to the new dir.
2850 */
2851 if (!is_orphan &&
2852 sctx->cur_inode_first_ref_orphan) {
2853 ret = orphanize_inode(sctx,
2854 sctx->cur_ino,
2855 sctx->cur_inode_gen,
2856 cur->full_path);
2857 if (ret < 0)
2858 goto out;
2859 ret = gen_unique_name(sctx,
2860 sctx->cur_ino,
2861 sctx->cur_inode_gen,
2862 valid_path);
2863 if (ret < 0)
2864 goto out;
2865 is_orphan = 1;
2866
2867 } else {
2868 ret = send_unlink(sctx, cur->full_path);
2869 if (ret < 0)
2870 goto out;
2871 }
2872 } 2955 }
2873 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, 2956 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2874 GFP_NOFS); 2957 GFP_NOFS);
@@ -2880,12 +2963,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2880 * If the inode is still orphan, unlink the orphan. This may 2963 * If the inode is still orphan, unlink the orphan. This may
2881 * happen when a previous inode did overwrite the first ref 2964 * happen when a previous inode did overwrite the first ref
2882 * of this inode and no new refs were added for the current 2965 * of this inode and no new refs were added for the current
2883 * inode. 2966 * inode. Unlinking does not mean that the inode is deleted in
2884 * We can however not delete the orphan in case the inode relies 2967 * all cases. There may still be links to this inode in other
2885 * in a directory that was not created yet (see 2968 * places.
2886 * __record_new_ref)
2887 */ 2969 */
2888 if (is_orphan && !sctx->cur_inode_first_ref_orphan) { 2970 if (is_orphan) {
2889 ret = send_unlink(sctx, valid_path); 2971 ret = send_unlink(sctx, valid_path);
2890 if (ret < 0) 2972 if (ret < 0)
2891 goto out; 2973 goto out;
@@ -2900,6 +2982,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2900 */ 2982 */
2901 ULIST_ITER_INIT(&uit); 2983 ULIST_ITER_INIT(&uit);
2902 while ((un = ulist_next(check_dirs, &uit))) { 2984 while ((un = ulist_next(check_dirs, &uit))) {
2985 /*
2986 * In case we had refs into dirs that were not processed yet,
2987 * we don't need to do the utime and rmdir logic for these dirs.
2988 * The dir will be processed later.
2989 */
2903 if (un->val > sctx->cur_ino) 2990 if (un->val > sctx->cur_ino)
2904 continue; 2991 continue;
2905 2992
@@ -2929,25 +3016,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2929 } 3016 }
2930 } 3017 }
2931 3018
2932 /*
2933 * Current inode is now at it's new position, so we must increase
2934 * send_progress
2935 */
2936 sctx->send_progress = sctx->cur_ino + 1;
2937
2938 /*
2939 * We may have a directory here that has pending refs which could not
2940 * be created before (because the dir did not exist before, see
2941 * __record_new_ref). finish_outoforder_dir will link/move the pending
2942 * refs.
2943 */
2944 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
2945 ret = finish_outoforder_dir(sctx, sctx->cur_ino,
2946 sctx->cur_inode_gen);
2947 if (ret < 0)
2948 goto out;
2949 }
2950
2951 ret = 0; 3019 ret = 0;
2952 3020
2953out: 3021out:
@@ -2971,34 +3039,9 @@ static int __record_new_ref(int num, u64 dir, int index,
2971 return -ENOMEM; 3039 return -ENOMEM;
2972 3040
2973 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3041 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
2974 NULL); 3042 NULL, NULL);
2975 if (ret < 0)
2976 goto out;
2977
2978 /*
2979 * The parent may be non-existent at this point in time. This happens
2980 * if the ino of the parent dir is higher then the current ino. In this
2981 * case, we can not process this ref until the parent dir is finally
2982 * created. If we reach the parent dir later, process_recorded_refs
2983 * will go through all dir items and process the refs that could not be
2984 * processed before. In case this is the first ref, we set
2985 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
2986 * keep an orphan of the inode so that it later can be used for
2987 * link/move
2988 */
2989 ret = is_inode_existent(sctx, dir, gen);
2990 if (ret < 0) 3043 if (ret < 0)
2991 goto out; 3044 goto out;
2992 if (!ret) {
2993 ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
2994 name->start, fs_path_len(name));
2995 if (ret < 0)
2996 goto out;
2997 if (ret)
2998 sctx->cur_inode_first_ref_orphan = 1;
2999 ret = 0;
3000 goto out;
3001 }
3002 3045
3003 ret = get_cur_path(sctx, dir, gen, p); 3046 ret = get_cur_path(sctx, dir, gen, p);
3004 if (ret < 0) 3047 if (ret < 0)
@@ -3029,7 +3072,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3029 return -ENOMEM; 3072 return -ENOMEM;
3030 3073
3031 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, 3074 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3032 NULL); 3075 NULL, NULL);
3033 if (ret < 0) 3076 if (ret < 0)
3034 goto out; 3077 goto out;
3035 3078
@@ -3206,33 +3249,29 @@ static int process_all_refs(struct send_ctx *sctx,
3206 key.offset = 0; 3249 key.offset = 0;
3207 while (1) { 3250 while (1) {
3208 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3251 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3209 if (ret < 0) { 3252 if (ret < 0)
3210 btrfs_release_path(path);
3211 goto out; 3253 goto out;
3212 } 3254 if (ret)
3213 if (ret) {
3214 btrfs_release_path(path);
3215 break; 3255 break;
3216 }
3217 3256
3218 eb = path->nodes[0]; 3257 eb = path->nodes[0];
3219 slot = path->slots[0]; 3258 slot = path->slots[0];
3220 btrfs_item_key_to_cpu(eb, &found_key, slot); 3259 btrfs_item_key_to_cpu(eb, &found_key, slot);
3221 3260
3222 if (found_key.objectid != key.objectid || 3261 if (found_key.objectid != key.objectid ||
3223 found_key.type != key.type) { 3262 (found_key.type != BTRFS_INODE_REF_KEY &&
3224 btrfs_release_path(path); 3263 found_key.type != BTRFS_INODE_EXTREF_KEY))
3225 break; 3264 break;
3226 }
3227 3265
3228 ret = iterate_inode_ref(sctx, sctx->parent_root, path, 3266 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
3229 &found_key, 0, cb, sctx); 3267 sctx);
3230 btrfs_release_path(path); 3268 btrfs_release_path(path);
3231 if (ret < 0) 3269 if (ret < 0)
3232 goto out; 3270 goto out;
3233 3271
3234 key.offset = found_key.offset + 1; 3272 key.offset = found_key.offset + 1;
3235 } 3273 }
3274 btrfs_release_path(path);
3236 3275
3237 ret = process_recorded_refs(sctx); 3276 ret = process_recorded_refs(sctx);
3238 3277
@@ -3555,7 +3594,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3555 int ret = 0; 3594 int ret = 0;
3556 struct fs_path *p; 3595 struct fs_path *p;
3557 loff_t pos = offset; 3596 loff_t pos = offset;
3558 int readed = 0; 3597 int num_read = 0;
3559 mm_segment_t old_fs; 3598 mm_segment_t old_fs;
3560 3599
3561 p = fs_path_alloc(sctx); 3600 p = fs_path_alloc(sctx);
@@ -3580,8 +3619,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3580 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); 3619 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
3581 if (ret < 0) 3620 if (ret < 0)
3582 goto out; 3621 goto out;
3583 readed = ret; 3622 num_read = ret;
3584 if (!readed) 3623 if (!num_read)
3585 goto out; 3624 goto out;
3586 3625
3587 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 3626 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3633,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3594 3633
3595 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 3634 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3596 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 3635 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3597 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); 3636 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
3598 3637
3599 ret = send_cmd(sctx); 3638 ret = send_cmd(sctx);
3600 3639
@@ -3604,7 +3643,7 @@ out:
3604 set_fs(old_fs); 3643 set_fs(old_fs);
3605 if (ret < 0) 3644 if (ret < 0)
3606 return ret; 3645 return ret;
3607 return readed; 3646 return num_read;
3608} 3647}
3609 3648
3610/* 3649/*
@@ -3615,7 +3654,6 @@ static int send_clone(struct send_ctx *sctx,
3615 struct clone_root *clone_root) 3654 struct clone_root *clone_root)
3616{ 3655{
3617 int ret = 0; 3656 int ret = 0;
3618 struct btrfs_root *clone_root2 = clone_root->root;
3619 struct fs_path *p; 3657 struct fs_path *p;
3620 u64 gen; 3658 u64 gen;
3621 3659
@@ -3640,22 +3678,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3640 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); 3678 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
3641 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 3679 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3642 3680
3643 if (clone_root2 == sctx->send_root) { 3681 if (clone_root->root == sctx->send_root) {
3644 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, 3682 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
3645 &gen, NULL, NULL, NULL); 3683 &gen, NULL, NULL, NULL, NULL);
3646 if (ret < 0) 3684 if (ret < 0)
3647 goto out; 3685 goto out;
3648 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3686 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3649 } else { 3687 } else {
3650 ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); 3688 ret = get_inode_path(sctx, clone_root->root,
3689 clone_root->ino, p);
3651 } 3690 }
3652 if (ret < 0) 3691 if (ret < 0)
3653 goto out; 3692 goto out;
3654 3693
3655 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 3694 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3656 clone_root2->root_item.uuid); 3695 clone_root->root->root_item.uuid);
3657 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 3696 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3658 clone_root2->root_item.ctransid); 3697 clone_root->root->root_item.ctransid);
3659 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 3698 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3660 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 3699 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3661 clone_root->offset); 3700 clone_root->offset);
@@ -3684,10 +3723,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
3684 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3723 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3685 struct btrfs_file_extent_item); 3724 struct btrfs_file_extent_item);
3686 type = btrfs_file_extent_type(path->nodes[0], ei); 3725 type = btrfs_file_extent_type(path->nodes[0], ei);
3687 if (type == BTRFS_FILE_EXTENT_INLINE) 3726 if (type == BTRFS_FILE_EXTENT_INLINE) {
3688 len = btrfs_file_extent_inline_len(path->nodes[0], ei); 3727 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
3689 else 3728 /*
3729 * It is possible the inline item won't cover the whole page,
3730 * but there may be items after this page. Make
3731 * sure to send the whole thing.
3732 */
3733 len = PAGE_CACHE_ALIGN(len);
3734 } else {
3690 len = btrfs_file_extent_num_bytes(path->nodes[0], ei); 3735 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3736 }
3691 3737
3692 if (offset + len > sctx->cur_inode_size) 3738 if (offset + len > sctx->cur_inode_size)
3693 len = sctx->cur_inode_size - offset; 3739 len = sctx->cur_inode_size - offset;
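The inline-extent fixup above rounds the length up to a page boundary and then clamps it to the inode size. A sketch of the arithmetic, assuming 4 KiB pages and a hand-rolled stand-in for PAGE_CACHE_ALIGN():

#include <stdio.h>
#include <stdint.h>

#define MY_PAGE_SIZE 4096u
#define PAGE_ALIGN_UP(x) (((x) + MY_PAGE_SIZE - 1) & ~(uint64_t)(MY_PAGE_SIZE - 1))

static uint64_t send_len(uint64_t offset, uint64_t inline_len,
			 uint64_t inode_size)
{
	uint64_t len = PAGE_ALIGN_UP(inline_len);	/* cover the whole page */

	if (offset + len > inode_size)
		len = inode_size - offset;	/* but never past i_size */
	return len;
}

int main(void)
{
	/* a 100-byte inline extent in a 100-byte file: clamped back to 100 */
	printf("%llu\n", (unsigned long long)send_len(0, 100, 100));
	return 0;
}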
@@ -3735,6 +3781,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3735 u64 left_offset_fixed; 3781 u64 left_offset_fixed;
3736 u64 left_len; 3782 u64 left_len;
3737 u64 right_len; 3783 u64 right_len;
3784 u64 left_gen;
3785 u64 right_gen;
3738 u8 left_type; 3786 u8 left_type;
3739 u8 right_type; 3787 u8 right_type;
3740 3788
@@ -3744,17 +3792,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3744 3792
3745 eb = left_path->nodes[0]; 3793 eb = left_path->nodes[0];
3746 slot = left_path->slots[0]; 3794 slot = left_path->slots[0];
3747
3748 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 3795 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3749 left_type = btrfs_file_extent_type(eb, ei); 3796 left_type = btrfs_file_extent_type(eb, ei);
3750 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3751 left_len = btrfs_file_extent_num_bytes(eb, ei);
3752 left_offset = btrfs_file_extent_offset(eb, ei);
3753 3797
3754 if (left_type != BTRFS_FILE_EXTENT_REG) { 3798 if (left_type != BTRFS_FILE_EXTENT_REG) {
3755 ret = 0; 3799 ret = 0;
3756 goto out; 3800 goto out;
3757 } 3801 }
3802 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3803 left_len = btrfs_file_extent_num_bytes(eb, ei);
3804 left_offset = btrfs_file_extent_offset(eb, ei);
3805 left_gen = btrfs_file_extent_generation(eb, ei);
3758 3806
3759 /* 3807 /*
3760 * Following comments will refer to these graphics. L is the left 3808 * Following comments will refer to these graphics. L is the left
@@ -3810,6 +3858,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3810 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 3858 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3811 right_len = btrfs_file_extent_num_bytes(eb, ei); 3859 right_len = btrfs_file_extent_num_bytes(eb, ei);
3812 right_offset = btrfs_file_extent_offset(eb, ei); 3860 right_offset = btrfs_file_extent_offset(eb, ei);
3861 right_gen = btrfs_file_extent_generation(eb, ei);
3813 3862
3814 if (right_type != BTRFS_FILE_EXTENT_REG) { 3863 if (right_type != BTRFS_FILE_EXTENT_REG) {
3815 ret = 0; 3864 ret = 0;
@@ -3820,7 +3869,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3820 * Are we at extent 8? If yes, we know the extent is changed. 3869 * Are we at extent 8? If yes, we know the extent is changed.
3821 * This may only happen on the first iteration. 3870 * This may only happen on the first iteration.
3822 */ 3871 */
3823 if (found_key.offset + right_len < ekey->offset) { 3872 if (found_key.offset + right_len <= ekey->offset) {
3824 ret = 0; 3873 ret = 0;
3825 goto out; 3874 goto out;
3826 } 3875 }
@@ -3837,8 +3886,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3837 /* 3886 /*
3838 * Check if we have the same extent. 3887 * Check if we have the same extent.
3839 */ 3888 */
3840 if (left_disknr + left_offset_fixed != 3889 if (left_disknr != right_disknr ||
3841 right_disknr + right_offset) { 3890 left_offset_fixed != right_offset ||
3891 left_gen != right_gen) {
3842 ret = 0; 3892 ret = 0;
3843 goto out; 3893 goto out;
3844 } 3894 }
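The old check compared left_disknr + left_offset_fixed against right_disknr + right_offset, so two different extents whose sums happened to collide looked unchanged. The fix compares each field on its own and additionally requires matching generations. A standalone sketch of the tightened comparison:

#include <stdint.h>

struct extent_ref {
	uint64_t disknr;	/* disk bytenr of the extent */
	uint64_t offset;	/* offset into the extent */
	uint64_t gen;		/* generation that wrote it */
};

int same_extent(const struct extent_ref *l, const struct extent_ref *r)
{
	return l->disknr == r->disknr &&
	       l->offset == r->offset &&
	       l->gen == r->gen;
}

With the old sum-based check, {disknr=10, offset=4} and {disknr=4, offset=10} would compare equal; same_extent() correctly rejects them.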
@@ -3971,12 +4021,21 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
3971 if (sctx->cur_ino == 0) 4021 if (sctx->cur_ino == 0)
3972 goto out; 4022 goto out;
3973 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && 4023 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
3974 sctx->cmp_key->type <= BTRFS_INODE_REF_KEY) 4024 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
3975 goto out; 4025 goto out;
3976 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 4026 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
3977 goto out; 4027 goto out;
3978 4028
3979 ret = process_recorded_refs(sctx); 4029 ret = process_recorded_refs(sctx);
4030 if (ret < 0)
4031 goto out;
4032
4033 /*
4034 * We have processed the refs and thus need to advance send_progress.
4035 * Now, calls to get_cur_xxx will take the updated refs of the current
4036 * inode into account.
4037 */
4038 sctx->send_progress = sctx->cur_ino + 1;
3980 4039
3981out: 4040out:
3982 return ret; 4041 return ret;
@@ -4004,26 +4063,25 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4004 goto out; 4063 goto out;
4005 4064
4006 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, 4065 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
4007 &left_mode, &left_uid, &left_gid); 4066 &left_mode, &left_uid, &left_gid, NULL);
4008 if (ret < 0) 4067 if (ret < 0)
4009 goto out; 4068 goto out;
4010 4069
4011 if (!S_ISLNK(sctx->cur_inode_mode)) { 4070 if (!sctx->parent_root || sctx->cur_inode_new) {
4012 if (!sctx->parent_root || sctx->cur_inode_new) { 4071 need_chown = 1;
4072 if (!S_ISLNK(sctx->cur_inode_mode))
4013 need_chmod = 1; 4073 need_chmod = 1;
4014 need_chown = 1; 4074 } else {
4015 } else { 4075 ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
4016 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, 4076 NULL, NULL, &right_mode, &right_uid,
4017 NULL, NULL, &right_mode, &right_uid, 4077 &right_gid, NULL);
4018 &right_gid); 4078 if (ret < 0)
4019 if (ret < 0) 4079 goto out;
4020 goto out;
4021 4080
4022 if (left_uid != right_uid || left_gid != right_gid) 4081 if (left_uid != right_uid || left_gid != right_gid)
4023 need_chown = 1; 4082 need_chown = 1;
4024 if (left_mode != right_mode) 4083 if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
4025 need_chmod = 1; 4084 need_chmod = 1;
4026 }
4027 } 4085 }
4028 4086
4029 if (S_ISREG(sctx->cur_inode_mode)) { 4087 if (S_ISREG(sctx->cur_inode_mode)) {
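The restructured block above makes chown independent of the inode type while keeping chmod away from symlinks, whose mode cannot be changed. A sketch of the resulting decision, with plain structs standing in for the left (send) and right (parent) inode items:

#include <sys/stat.h>

struct inode_attrs { mode_t mode; unsigned int uid, gid; };

void decide_chown_chmod(const struct inode_attrs *left,
			const struct inode_attrs *right, int is_new,
			int *need_chown, int *need_chmod)
{
	*need_chown = *need_chmod = 0;
	if (is_new || !right) {		/* new inode or no parent snapshot */
		*need_chown = 1;
		if (!S_ISLNK(left->mode))
			*need_chmod = 1;
		return;
	}
	if (left->uid != right->uid || left->gid != right->gid)
		*need_chown = 1;
	if (!S_ISLNK(left->mode) && left->mode != right->mode)
		*need_chmod = 1;
}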
@@ -4074,7 +4132,12 @@ static int changed_inode(struct send_ctx *sctx,
4074 4132
4075 sctx->cur_ino = key->objectid; 4133 sctx->cur_ino = key->objectid;
4076 sctx->cur_inode_new_gen = 0; 4134 sctx->cur_inode_new_gen = 0;
4077 sctx->cur_inode_first_ref_orphan = 0; 4135
4136 /*
4137 * Set send_progress to current inode. This will tell all get_cur_xxx
4138 * functions that the current inode's refs are not updated yet. Later,
4139 * when process_recorded_refs is finished, it is set to cur_ino + 1.
4140 */
4078 sctx->send_progress = sctx->cur_ino; 4141 sctx->send_progress = sctx->cur_ino;
4079 4142
4080 if (result == BTRFS_COMPARE_TREE_NEW || 4143 if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4161,14 @@ static int changed_inode(struct send_ctx *sctx,
4098 4161
4099 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 4162 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4100 right_ii); 4163 right_ii);
4101 if (left_gen != right_gen) 4164
4165 /*
4166 * The cur_ino = root dir case is special here. We can't treat
4167 * the inode as deleted+reused because it would generate a
4168 * stream that tries to delete/mkdir the root dir.
4169 */
4170 if (left_gen != right_gen &&
4171 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4102 sctx->cur_inode_new_gen = 1; 4172 sctx->cur_inode_new_gen = 1;
4103 } 4173 }
4104 4174
@@ -4111,8 +4181,7 @@ static int changed_inode(struct send_ctx *sctx,
4111 sctx->cur_inode_mode = btrfs_inode_mode( 4181 sctx->cur_inode_mode = btrfs_inode_mode(
4112 sctx->left_path->nodes[0], left_ii); 4182 sctx->left_path->nodes[0], left_ii);
4113 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 4183 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4114 ret = send_create_inode(sctx, sctx->left_path, 4184 ret = send_create_inode_if_needed(sctx);
4115 sctx->cmp_key);
4116 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 4185 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4117 sctx->cur_inode_gen = right_gen; 4186 sctx->cur_inode_gen = right_gen;
4118 sctx->cur_inode_new = 0; 4187 sctx->cur_inode_new = 0;
@@ -4122,7 +4191,17 @@ static int changed_inode(struct send_ctx *sctx,
4122 sctx->cur_inode_mode = btrfs_inode_mode( 4191 sctx->cur_inode_mode = btrfs_inode_mode(
4123 sctx->right_path->nodes[0], right_ii); 4192 sctx->right_path->nodes[0], right_ii);
4124 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 4193 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4194 /*
4195 * We need to do some special handling in case the inode was
4196 * reported as changed with a changed generation number. This
4197 * means that the original inode was deleted and a new inode
4198 * reused the same inum. So we have to treat the old inode as
4199 * deleted and the new one as new.
4200 */
4125 if (sctx->cur_inode_new_gen) { 4201 if (sctx->cur_inode_new_gen) {
4202 /*
4203 * First, process the inode as if it was deleted.
4204 */
4126 sctx->cur_inode_gen = right_gen; 4205 sctx->cur_inode_gen = right_gen;
4127 sctx->cur_inode_new = 0; 4206 sctx->cur_inode_new = 0;
4128 sctx->cur_inode_deleted = 1; 4207 sctx->cur_inode_deleted = 1;
@@ -4135,6 +4214,9 @@ static int changed_inode(struct send_ctx *sctx,
4135 if (ret < 0) 4214 if (ret < 0)
4136 goto out; 4215 goto out;
4137 4216
4217 /*
4218 * Now process the inode as if it was new.
4219 */
4138 sctx->cur_inode_gen = left_gen; 4220 sctx->cur_inode_gen = left_gen;
4139 sctx->cur_inode_new = 1; 4221 sctx->cur_inode_new = 1;
4140 sctx->cur_inode_deleted = 0; 4222 sctx->cur_inode_deleted = 0;
@@ -4142,14 +4224,23 @@ static int changed_inode(struct send_ctx *sctx,
4142 sctx->left_path->nodes[0], left_ii); 4224 sctx->left_path->nodes[0], left_ii);
4143 sctx->cur_inode_mode = btrfs_inode_mode( 4225 sctx->cur_inode_mode = btrfs_inode_mode(
4144 sctx->left_path->nodes[0], left_ii); 4226 sctx->left_path->nodes[0], left_ii);
4145 ret = send_create_inode(sctx, sctx->left_path, 4227 ret = send_create_inode_if_needed(sctx);
4146 sctx->cmp_key);
4147 if (ret < 0) 4228 if (ret < 0)
4148 goto out; 4229 goto out;
4149 4230
4150 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 4231 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4151 if (ret < 0) 4232 if (ret < 0)
4152 goto out; 4233 goto out;
4234 /*
4235 * Advance send_progress now as we did not get into
4236 * process_recorded_refs_if_needed in the new_gen case.
4237 */
4238 sctx->send_progress = sctx->cur_ino + 1;
4239
4240 /*
4241 * Now process all extents and xattrs of the inode as if
4242 * they were all new.
4243 */
4153 ret = process_all_extents(sctx); 4244 ret = process_all_extents(sctx);
4154 if (ret < 0) 4245 if (ret < 0)
4155 goto out; 4246 goto out;
@@ -4172,6 +4263,16 @@ out:
4172 return ret; 4263 return ret;
4173} 4264}
4174 4265
4266/*
4267 * We have to process new refs before deleted refs, but compare_trees gives us
4268 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
4269 * first and later process them in process_recorded_refs.
4270 * For the cur_inode_new_gen case, we skip recording completely because
4271 * changed_inode already initiated processing of refs. The reason for this is
4272 * that in this case, compare_tree actually compares the refs of 2 different
4273 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
4274 * refs of the right tree as deleted and all refs of the left tree as new.
4275 */
4175static int changed_ref(struct send_ctx *sctx, 4276static int changed_ref(struct send_ctx *sctx,
4176 enum btrfs_compare_tree_result result) 4277 enum btrfs_compare_tree_result result)
4177{ 4278{
@@ -4192,6 +4293,11 @@ static int changed_ref(struct send_ctx *sctx,
4192 return ret; 4293 return ret;
4193} 4294}
4194 4295
4296/*
4297 * Process new/deleted/changed xattrs. We skip processing in the
4298 * cur_inode_new_gen case because changed_inode already initiated processing
4299 * of xattrs. The reason is the same as in changed_ref.
4300 */
4195static int changed_xattr(struct send_ctx *sctx, 4301static int changed_xattr(struct send_ctx *sctx,
4196 enum btrfs_compare_tree_result result) 4302 enum btrfs_compare_tree_result result)
4197{ 4303{
@@ -4211,6 +4317,11 @@ static int changed_xattr(struct send_ctx *sctx,
4211 return ret; 4317 return ret;
4212} 4318}
4213 4319
4320/*
4321 * Process new/deleted/changed extents. We skip processing in the
4322 * cur_inode_new_gen case because changed_inode already initiated processing
4323 * of extents. The reason is the same as in changed_ref.
4324 */
4214static int changed_extent(struct send_ctx *sctx, 4325static int changed_extent(struct send_ctx *sctx,
4215 enum btrfs_compare_tree_result result) 4326 enum btrfs_compare_tree_result result)
4216{ 4327{
@@ -4227,7 +4338,10 @@ static int changed_extent(struct send_ctx *sctx,
4227 return ret; 4338 return ret;
4228} 4339}
4229 4340
4230 4341/*
4342 * Updates compare related fields in sctx and simply forwards to the actual
4343 * changed_xxx functions.
4344 */
4231static int changed_cb(struct btrfs_root *left_root, 4345static int changed_cb(struct btrfs_root *left_root,
4232 struct btrfs_root *right_root, 4346 struct btrfs_root *right_root,
4233 struct btrfs_path *left_path, 4347 struct btrfs_path *left_path,
@@ -4247,9 +4361,15 @@ static int changed_cb(struct btrfs_root *left_root,
4247 if (ret < 0) 4361 if (ret < 0)
4248 goto out; 4362 goto out;
4249 4363
4364 /* Ignore non-FS objects */
4365 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
4366 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
4367 goto out;
4368
4250 if (key->type == BTRFS_INODE_ITEM_KEY) 4369 if (key->type == BTRFS_INODE_ITEM_KEY)
4251 ret = changed_inode(sctx, result); 4370 ret = changed_inode(sctx, result);
4252 else if (key->type == BTRFS_INODE_REF_KEY) 4371 else if (key->type == BTRFS_INODE_REF_KEY ||
4372 key->type == BTRFS_INODE_EXTREF_KEY)
4253 ret = changed_ref(sctx, result); 4373 ret = changed_ref(sctx, result);
4254 else if (key->type == BTRFS_XATTR_ITEM_KEY) 4374 else if (key->type == BTRFS_XATTR_ITEM_KEY)
4255 ret = changed_xattr(sctx, result); 4375 ret = changed_xattr(sctx, result);
@@ -4277,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
4277 if (!path) 4397 if (!path)
4278 return -ENOMEM; 4398 return -ENOMEM;
4279 4399
4280 spin_lock(&send_root->root_times_lock); 4400 spin_lock(&send_root->root_item_lock);
4281 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4282 spin_unlock(&send_root->root_times_lock); 4402 spin_unlock(&send_root->root_item_lock);
4283 4403
4284 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4285 key.type = BTRFS_INODE_ITEM_KEY; 4405 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4299,11 +4419,12 @@ join_trans:
4299 } 4419 }
4300 4420
4301 /* 4421 /*
4302 * Make sure the tree has not changed 4422 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match.
4303 */ 4424 */
4304 spin_lock(&send_root->root_times_lock); 4425 spin_lock(&send_root->root_item_lock);
4305 ctransid = btrfs_root_ctransid(&send_root->root_item); 4426 ctransid = btrfs_root_ctransid(&send_root->root_item);
4306 spin_unlock(&send_root->root_times_lock); 4427 spin_unlock(&send_root->root_item_lock);
4307 4428
4308 if (ctransid != start_ctransid) { 4429 if (ctransid != start_ctransid) {
4309 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57f..1bf4f32fd4ef 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
130 130
131#ifdef __KERNEL__ 131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
133#endif 134#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c220..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
124 * therefore the canceling is omitted. The only penalty
125 * is that some I/O remains active until the procedure
126 * completes. The next time the filesystem is
127 * mounted writable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -243,12 +253,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
243 struct btrfs_root *root, const char *function, 253 struct btrfs_root *root, const char *function,
244 unsigned int line, int errno) 254 unsigned int line, int errno)
245{ 255{
246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); 256 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
247 trans->aborted = errno; 257 trans->aborted = errno;
248 /* Nothing used. The other threads that have joined this 258 /* Nothing used. The other threads that have joined this
249 * transaction may be able to continue. */ 259 * transaction may be able to continue. */
250 if (!trans->blocks_used) { 260 if (!trans->blocks_used) {
251 btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); 261 char nbuf[16];
262 const char *errstr;
263
264 errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
265 btrfs_printk(root->fs_info,
266 "%s:%d: Aborting unused transaction(%s).\n",
267 function, line, errstr);
252 return; 268 return;
253 } 269 }
254 trans->transaction->aborted = errno; 270 trans->transaction->aborted = errno;
@@ -407,7 +423,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
407 btrfs_set_opt(info->mount_opt, NODATASUM); 423 btrfs_set_opt(info->mount_opt, NODATASUM);
408 break; 424 break;
409 case Opt_nodatacow: 425 case Opt_nodatacow:
410 printk(KERN_INFO "btrfs: setting nodatacow\n"); 426 if (!btrfs_test_opt(root, COMPRESS) ||
427 !btrfs_test_opt(root, FORCE_COMPRESS)) {
428 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
429 } else {
430 printk(KERN_INFO "btrfs: setting nodatacow\n");
431 }
432 info->compress_type = BTRFS_COMPRESS_NONE;
433 btrfs_clear_opt(info->mount_opt, COMPRESS);
434 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
411 btrfs_set_opt(info->mount_opt, NODATACOW); 435 btrfs_set_opt(info->mount_opt, NODATACOW);
412 btrfs_set_opt(info->mount_opt, NODATASUM); 436 btrfs_set_opt(info->mount_opt, NODATASUM);
413 break; 437 break;
@@ -422,10 +446,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 compress_type = "zlib"; 446 compress_type = "zlib";
423 info->compress_type = BTRFS_COMPRESS_ZLIB; 447 info->compress_type = BTRFS_COMPRESS_ZLIB;
424 btrfs_set_opt(info->mount_opt, COMPRESS); 448 btrfs_set_opt(info->mount_opt, COMPRESS);
449 btrfs_clear_opt(info->mount_opt, NODATACOW);
450 btrfs_clear_opt(info->mount_opt, NODATASUM);
425 } else if (strcmp(args[0].from, "lzo") == 0) { 451 } else if (strcmp(args[0].from, "lzo") == 0) {
426 compress_type = "lzo"; 452 compress_type = "lzo";
427 info->compress_type = BTRFS_COMPRESS_LZO; 453 info->compress_type = BTRFS_COMPRESS_LZO;
428 btrfs_set_opt(info->mount_opt, COMPRESS); 454 btrfs_set_opt(info->mount_opt, COMPRESS);
455 btrfs_clear_opt(info->mount_opt, NODATACOW);
456 btrfs_clear_opt(info->mount_opt, NODATASUM);
429 btrfs_set_fs_incompat(info, COMPRESS_LZO); 457 btrfs_set_fs_incompat(info, COMPRESS_LZO);
430 } else if (strncmp(args[0].from, "no", 2) == 0) { 458 } else if (strncmp(args[0].from, "no", 2) == 0) {
431 compress_type = "no"; 459 compress_type = "no";
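The two hunks above make nodatacow and compression mutually exclusive: selecting nodatacow clears the compress flags, and selecting a compressor clears nodatacow/nodatasum, so whichever option comes last on the mount line wins. A sketch of that flag dance, with hypothetical bit flags standing in for btrfs_set_opt()/btrfs_clear_opt():

#include <stdio.h>

enum {
	OPT_NODATACOW = 1, OPT_NODATASUM = 2,
	OPT_COMPRESS = 4, OPT_FORCE_COMPRESS = 8,
};

static unsigned int set_nodatacow(unsigned int opts)
{
	opts &= ~(OPT_COMPRESS | OPT_FORCE_COMPRESS);
	return opts | OPT_NODATACOW | OPT_NODATASUM;
}

static unsigned int set_compress(unsigned int opts)
{
	opts &= ~(OPT_NODATACOW | OPT_NODATASUM);
	return opts | OPT_COMPRESS;
}

int main(void)
{
	/* "nodatacow,compress": the later compress option wins */
	unsigned int opts = set_compress(set_nodatacow(0));

	printf("compress=%d nodatacow=%d\n",
	       !!(opts & OPT_COMPRESS), !!(opts & OPT_NODATACOW));
	return 0;
}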
@@ -543,11 +571,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
543 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 571 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
544 break; 572 break;
545 case Opt_defrag: 573 case Opt_defrag:
546 printk(KERN_INFO "btrfs: enabling auto defrag"); 574 printk(KERN_INFO "btrfs: enabling auto defrag\n");
547 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 575 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
548 break; 576 break;
549 case Opt_recovery: 577 case Opt_recovery:
550 printk(KERN_INFO "btrfs: enabling auto recovery"); 578 printk(KERN_INFO "btrfs: enabling auto recovery\n");
551 btrfs_set_opt(info->mount_opt, RECOVERY); 579 btrfs_set_opt(info->mount_opt, RECOVERY);
552 break; 580 break;
553 case Opt_skip_balance: 581 case Opt_skip_balance:
@@ -846,18 +874,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
846 return 0; 874 return 0;
847 } 875 }
848 876
849 btrfs_wait_ordered_extents(root, 0, 0); 877 btrfs_wait_ordered_extents(root, 0);
850 878
851 spin_lock(&fs_info->trans_lock); 879 trans = btrfs_attach_transaction(root);
852 if (!fs_info->running_transaction) { 880 if (IS_ERR(trans)) {
853 spin_unlock(&fs_info->trans_lock); 881 /* no transaction, don't bother */
854 return 0; 882 if (PTR_ERR(trans) == -ENOENT)
855 } 883 return 0;
856 spin_unlock(&fs_info->trans_lock);
857
858 trans = btrfs_join_transaction(root);
859 if (IS_ERR(trans))
860 return PTR_ERR(trans); 884 return PTR_ERR(trans);
885 }
861 return btrfs_commit_transaction(trans, root); 886 return btrfs_commit_transaction(trans, root);
862} 887}
863 888
@@ -1171,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1171 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1172 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1173 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1174 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1175} 1201}
1176 1202
1177static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1200,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1200 return 0; 1226 return 0;
1201 1227
1202 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1203 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1204 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1205 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1206 if (ret) 1239 if (ret)
1207 goto restore; 1240 goto restore;
@@ -1211,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1211 goto restore; 1244 goto restore;
1212 } 1245 }
1213 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1214 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1215 ret = -EINVAL; 1257 ret = -EINVAL;
1216 goto restore; 1258 goto restore;
@@ -1229,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1229 if (ret) 1271 if (ret)
1230 goto restore; 1272 goto restore;
1231 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1232 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1233 } 1280 }
1234 1281
@@ -1321,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1321 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1322 1369
1323 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1324 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1325 continue; 1373 continue;
1326 1374
1327 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1508,17 +1556,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1508 1556
1509static int btrfs_freeze(struct super_block *sb) 1557static int btrfs_freeze(struct super_block *sb)
1510{ 1558{
1511 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1559 struct btrfs_trans_handle *trans;
1512 mutex_lock(&fs_info->transaction_kthread_mutex); 1560 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1513 mutex_lock(&fs_info->cleaner_mutex); 1561
1514 return 0; 1562 trans = btrfs_attach_transaction(root);
1563 if (IS_ERR(trans)) {
1564 /* no transaction, don't bother */
1565 if (PTR_ERR(trans) == -ENOENT)
1566 return 0;
1567 return PTR_ERR(trans);
1568 }
1569 return btrfs_commit_transaction(trans, root);
1515} 1570}
1516 1571
1517static int btrfs_unfreeze(struct super_block *sb) 1572static int btrfs_unfreeze(struct super_block *sb)
1518{ 1573{
1519 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1520 mutex_unlock(&fs_info->cleaner_mutex);
1521 mutex_unlock(&fs_info->transaction_kthread_mutex);
1522 return 0; 1574 return 0;
1523} 1575}
1524 1576
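btrfs_freeze() now follows the same pattern as btrfs_sync_fs() above: attach to a transaction if one is in flight and commit it, treating -ENOENT as "nothing to do", instead of holding the kthread mutexes across the freeze. A sketch of the pattern, with hypothetical stubs in place of btrfs_attach_transaction() and btrfs_commit_transaction():

#include <errno.h>
#include <stddef.h>

struct txn { int unused; };

struct txn *attach_transaction(void)
{
	errno = ENOENT;		/* stub: pretend no transaction is running */
	return NULL;
}

int commit_transaction(struct txn *t)
{
	(void)t;
	return 0;
}

int sync_or_freeze(void)
{
	struct txn *t = attach_transaction();

	if (!t)
		return errno == ENOENT ? 0 : -errno;	/* no txn: clean */
	return commit_transaction(t);
}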
@@ -1595,7 +1647,7 @@ static int btrfs_interface_init(void)
1595static void btrfs_interface_exit(void) 1647static void btrfs_interface_exit(void)
1596{ 1648{
1597 if (misc_deregister(&btrfs_misc) < 0) 1649 if (misc_deregister(&btrfs_misc) < 0)
1598 printk(KERN_INFO "misc_deregister failed for control device"); 1650 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1599} 1651}
1600 1652
1601static int __init init_btrfs_fs(void) 1653static int __init init_btrfs_fs(void)
@@ -1620,14 +1672,22 @@ static int __init init_btrfs_fs(void)
1620 if (err) 1672 if (err)
1621 goto free_extent_io; 1673 goto free_extent_io;
1622 1674
1623 err = btrfs_delayed_inode_init(); 1675 err = ordered_data_init();
1624 if (err) 1676 if (err)
1625 goto free_extent_map; 1677 goto free_extent_map;
1626 1678
1627 err = btrfs_interface_init(); 1679 err = btrfs_delayed_inode_init();
1680 if (err)
1681 goto free_ordered_data;
1682
1683 err = btrfs_auto_defrag_init();
1628 if (err) 1684 if (err)
1629 goto free_delayed_inode; 1685 goto free_delayed_inode;
1630 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1631 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1632 if (err) 1692 if (err)
1633 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1639,8 +1699,12 @@ static int __init init_btrfs_fs(void)
1639 1699
1640unregister_ioctl: 1700unregister_ioctl:
1641 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1642free_delayed_inode: 1704free_delayed_inode:
1643 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1706free_ordered_data:
1707 ordered_data_exit();
1644free_extent_map: 1708free_extent_map:
1645 extent_map_exit(); 1709 extent_map_exit();
1646free_extent_io: 1710free_extent_io:
@@ -1656,7 +1720,9 @@ free_compress:
1656static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1657{ 1721{
1658 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1659 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1725 ordered_data_exit();
1660 extent_map_exit(); 1726 extent_map_exit();
1661 extent_io_exit(); 1727 extent_io_exit();
1662 btrfs_interface_exit(); 1728 btrfs_interface_exit();
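The init/exit hunks above keep the usual kernel pattern intact: every init step gets a matching unwind label, a failure frees exactly what already succeeded, and module exit tears down in reverse init order. A generic sketch of the idiom, with hypothetical subsystem names:

int init_a(void) { return 0; }
int init_b(void) { return 0; }
void exit_a(void) { }
void exit_b(void) { }

int init_all(void)
{
	int err;

	err = init_a();
	if (err)
		return err;

	err = init_b();
	if (err)
		goto free_a;	/* unwind only what succeeded */
	return 0;

free_a:
	exit_a();
	return err;
}

void exit_all(void)
{
	exit_b();		/* reverse of init order */
	exit_a();
}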
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e050..87fac9a21ea5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -53,7 +54,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
53/* 54/*
54 * either allocate a new transaction or hop into the existing one 55 * either allocate a new transaction or hop into the existing one
55 */ 56 */
56static noinline int join_transaction(struct btrfs_root *root, int nofail) 57static noinline int join_transaction(struct btrfs_root *root, int type)
57{ 58{
58 struct btrfs_transaction *cur_trans; 59 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info; 60 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +68,13 @@ loop:
67 } 68 }
68 69
69 if (fs_info->trans_no_join) { 70 if (fs_info->trans_no_join) {
70 if (!nofail) { 71 /*
72 * If we are JOIN_NOLOCK we're already committing a current
73 * transaction; we just need a handle to deal with things like
74 * the inode cache and space cache while committing the
75 * transaction. It is a special case.
76 */
77 if (type != TRANS_JOIN_NOLOCK) {
71 spin_unlock(&fs_info->trans_lock); 78 spin_unlock(&fs_info->trans_lock);
72 return -EBUSY; 79 return -EBUSY;
73 } 80 }
@@ -87,6 +94,13 @@ loop:
87 } 94 }
88 spin_unlock(&fs_info->trans_lock); 95 spin_unlock(&fs_info->trans_lock);
89 96
97 /*
98 * If we are ATTACH, we just want to catch the current transaction
99 * and commit it. If there is no transaction, just return ENOENT.
100 */
101 if (type == TRANS_ATTACH)
102 return -ENOENT;
103
90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 104 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
91 if (!cur_trans) 105 if (!cur_trans)
92 return -ENOMEM; 106 return -ENOMEM;
@@ -132,16 +146,12 @@ loop:
132 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
133 */ 147 */
134 smp_mb(); 148 smp_mb();
135 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
136 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
137 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
138 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
139 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
140 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
141 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
142 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
143 WARN_ON(1);
144 }
145 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
146 156
147 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -267,13 +277,6 @@ static void wait_current_trans(struct btrfs_root *root)
267 } 277 }
268} 278}
269 279
270enum btrfs_trans_type {
271 TRANS_START,
272 TRANS_JOIN,
273 TRANS_USERSPACE,
274 TRANS_JOIN_NOLOCK,
275};
276
277static int may_wait_transaction(struct btrfs_root *root, int type) 280static int may_wait_transaction(struct btrfs_root *root, int type)
278{ 281{
279 if (root->fs_info->log_root_recovering) 282 if (root->fs_info->log_root_recovering)
@@ -289,8 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
289 return 0; 292 return 0;
290} 293}
291 294
292static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
293 u64 num_items, int type) 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
297 enum btrfs_reserve_flush_enum flush)
294{ 298{
295 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
296 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -305,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
305 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
306 h = current->journal_info; 310 h = current->journal_info;
307 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
308 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
309 h->block_rsv = NULL; 314 h->block_rsv = NULL;
310 goto got_it; 315 goto got_it;
@@ -326,7 +331,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327 ret = btrfs_block_rsv_add(root, 332 ret = btrfs_block_rsv_add(root,
328 &root->fs_info->trans_block_rsv, 333 &root->fs_info->trans_block_rsv,
329 num_bytes); 334 num_bytes, flush);
330 if (ret) 335 if (ret)
331 return ERR_PTR(ret); 336 return ERR_PTR(ret);
332 } 337 }
@@ -335,19 +340,34 @@ again:
335 if (!h) 340 if (!h)
336 return ERR_PTR(-ENOMEM); 341 return ERR_PTR(-ENOMEM);
337 342
338 sb_start_intwrite(root->fs_info->sb); 343 /*
344 * If we are JOIN_NOLOCK we're already committing a transaction and
345 * waiting on this guy, so we don't need to do the sb_start_intwrite
346 * because we're already holding a ref. We need this because we could
347 * have raced in and done an fsync() on a file which can kick a commit
348 * and then we deadlock with somebody doing a freeze.
349 *
350 * If we are ATTACH, it means we just want to catch the current
351 * transaction and commit it, so we needn't do sb_start_intwrite().
352 */
353 if (type < TRANS_JOIN_NOLOCK)
354 sb_start_intwrite(root->fs_info->sb);
339 355
340 if (may_wait_transaction(root, type)) 356 if (may_wait_transaction(root, type))
341 wait_current_trans(root); 357 wait_current_trans(root);
342 358
343 do { 359 do {
344 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); 360 ret = join_transaction(root, type);
345 if (ret == -EBUSY) 361 if (ret == -EBUSY)
346 wait_current_trans(root); 362 wait_current_trans(root);
347 } while (ret == -EBUSY); 363 } while (ret == -EBUSY);
348 364
349 if (ret < 0) { 365 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb); 366 /* We must get the transaction if we are JOIN_NOLOCK. */
367 BUG_ON(type == TRANS_JOIN_NOLOCK);
368
369 if (type < TRANS_JOIN_NOLOCK)
370 sb_end_intwrite(root->fs_info->sb);
351 kmem_cache_free(btrfs_trans_handle_cachep, h); 371 kmem_cache_free(btrfs_trans_handle_cachep, h);
352 return ERR_PTR(ret); 372 return ERR_PTR(ret);
353 } 373 }
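start_transaction() above relies on the ordering of the transaction-type enum: the types that must take the sb-level intwrite reference sort strictly below TRANS_JOIN_NOLOCK, so a single comparison gates both the start and the error-path release of freeze protection. A sketch of the trick, assuming the enum order the patch moves into transaction.h:

enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
	TRANS_JOIN_NOLOCK,
	TRANS_ATTACH,
};

/* JOIN_NOLOCK already holds a ref; ATTACH only catches a running txn. */
int needs_intwrite(enum btrfs_trans_type type)
{
	return type < TRANS_JOIN_NOLOCK;
}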
@@ -367,7 +387,9 @@ again:
367 h->aborted = 0; 387 h->aborted = 0;
368 h->qgroup_reserved = qgroup_reserved; 388 h->qgroup_reserved = qgroup_reserved;
369 h->delayed_ref_elem.seq = 0; 389 h->delayed_ref_elem.seq = 0;
390 h->type = type;
370 INIT_LIST_HEAD(&h->qgroup_ref_list); 391 INIT_LIST_HEAD(&h->qgroup_ref_list);
392 INIT_LIST_HEAD(&h->new_bgs);
371 393
372 smp_mb(); 394 smp_mb();
373 if (cur_trans->blocked && may_wait_transaction(root, type)) { 395 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +415,35 @@ got_it:
393struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
394 int num_items) 416 int num_items)
395{ 417{
396 return start_transaction(root, num_items, TRANS_START); 418 return start_transaction(root, num_items, TRANS_START,
419 BTRFS_RESERVE_FLUSH_ALL);
420}
421
422struct btrfs_trans_handle *btrfs_start_transaction_lflush(
423 struct btrfs_root *root, int num_items)
424{
425 return start_transaction(root, num_items, TRANS_START,
426 BTRFS_RESERVE_FLUSH_LIMIT);
397} 427}
428
398struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 429struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
399{ 430{
400 return start_transaction(root, 0, TRANS_JOIN); 431 return start_transaction(root, 0, TRANS_JOIN, 0);
401} 432}
402 433
403struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) 434struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
404{ 435{
405 return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 436 return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
406} 437}
407 438
408struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) 439struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
409{ 440{
410 return start_transaction(root, 0, TRANS_USERSPACE); 441 return start_transaction(root, 0, TRANS_USERSPACE, 0);
442}
443
444struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
445{
446 return start_transaction(root, 0, TRANS_ATTACH, 0);
411} 447}
412 448
413/* wait for a transaction commit to be fully complete */ 449/* wait for a transaction commit to be fully complete */
@@ -420,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
420int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 456int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
421{ 457{
422 struct btrfs_transaction *cur_trans = NULL, *t; 458 struct btrfs_transaction *cur_trans = NULL, *t;
423 int ret; 459 int ret = 0;
424 460
425 ret = 0;
426 if (transid) { 461 if (transid) {
427 if (transid <= root->fs_info->last_trans_committed) 462 if (transid <= root->fs_info->last_trans_committed)
428 goto out; 463 goto out;
429 464
465 ret = -EINVAL;
430 /* find specified transaction */ 466 /* find specified transaction */
431 spin_lock(&root->fs_info->trans_lock); 467 spin_lock(&root->fs_info->trans_lock);
432 list_for_each_entry(t, &root->fs_info->trans_list, list) { 468 list_for_each_entry(t, &root->fs_info->trans_list, list) {
433 if (t->transid == transid) { 469 if (t->transid == transid) {
434 cur_trans = t; 470 cur_trans = t;
435 atomic_inc(&cur_trans->use_count); 471 atomic_inc(&cur_trans->use_count);
472 ret = 0;
436 break; 473 break;
437 } 474 }
438 if (t->transid > transid) 475 if (t->transid > transid) {
476 ret = 0;
439 break; 477 break;
478 }
440 } 479 }
441 spin_unlock(&root->fs_info->trans_lock); 480 spin_unlock(&root->fs_info->trans_lock);
442 ret = -EINVAL; 481 /* The specified transaction doesn't exist */
443 if (!cur_trans) 482 if (!cur_trans)
444 goto out; /* bad transid */ 483 goto out;
445 } else { 484 } else {
446 /* find newest transaction that is committing | committed */ 485 /* find newest transaction that is committing | committed */
447 spin_lock(&root->fs_info->trans_lock); 486 spin_lock(&root->fs_info->trans_lock);
@@ -461,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
461 } 500 }
462 501
463 wait_for_commit(root, cur_trans); 502 wait_for_commit(root, cur_trans);
464
465 put_transaction(cur_trans); 503 put_transaction(cur_trans);
466 ret = 0;
467out: 504out:
468 return ret; 505 return ret;
469} 506}
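
The reworked btrfs_wait_for_commit() starts ret at -EINVAL and clears it only on the paths that may legitimately stop searching: an exact match, or a newer transaction in the list, which is treated as proof that the wanted one already committed and was freed. A compact model of that control flow (list contents are arbitrary):

    #include <errno.h>
    #include <stdio.h>

    /* Model of the transid lookup above: the transaction list is sorted
     * by transid; a missing id below an existing one is treated as
     * already committed (0), while an id beyond the list never existed
     * and is a bad transid (-EINVAL). */
    static int wait_for_transid(const long *list, int n, long transid,
                                long last_committed)
    {
            int ret = -EINVAL;

            if (transid <= last_committed)
                    return 0;
            for (int i = 0; i < n; i++) {
                    if (list[i] == transid)
                            return 0;       /* found: would wait on it */
                    if (list[i] > transid)
                            return 0;       /* already committed and freed */
            }
            return ret;
    }

    int main(void)
    {
            long live[] = { 8, 9, 11 };     /* live transactions, sorted */

            printf("%d\n", wait_for_transid(live, 3, 9, 7));   /* 0: found */
            printf("%d\n", wait_for_transid(live, 3, 10, 7));  /* 0: passed */
            printf("%d\n", wait_for_transid(live, 3, 15, 7));  /* bad transid */
            return 0;
    }
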
@@ -506,11 +543,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
506} 543}
507 544
508static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 545static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
509 struct btrfs_root *root, int throttle, int lock) 546 struct btrfs_root *root, int throttle)
510{ 547{
511 struct btrfs_transaction *cur_trans = trans->transaction; 548 struct btrfs_transaction *cur_trans = trans->transaction;
512 struct btrfs_fs_info *info = root->fs_info; 549 struct btrfs_fs_info *info = root->fs_info;
513 int count = 0; 550 int count = 0;
551 int lock = (trans->type != TRANS_JOIN_NOLOCK);
514 int err = 0; 552 int err = 0;
515 553
516 if (--trans->use_count) { 554 if (--trans->use_count) {
@@ -536,6 +574,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
536 trans->qgroup_reserved = 0; 574 trans->qgroup_reserved = 0;
537 } 575 }
538 576
577 if (!list_empty(&trans->new_bgs))
578 btrfs_create_pending_block_groups(trans, root);
579
539 while (count < 2) { 580 while (count < 2) {
540 unsigned long cur = trans->delayed_ref_updates; 581 unsigned long cur = trans->delayed_ref_updates;
541 trans->delayed_ref_updates = 0; 582 trans->delayed_ref_updates = 0;
@@ -551,7 +592,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
551 btrfs_trans_release_metadata(trans, root); 592 btrfs_trans_release_metadata(trans, root);
552 trans->block_rsv = NULL; 593 trans->block_rsv = NULL;
553 594
554 sb_end_intwrite(root->fs_info->sb); 595 if (!list_empty(&trans->new_bgs))
596 btrfs_create_pending_block_groups(trans, root);
555 597
556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 598 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
557 should_end_transaction(trans, root)) { 599 should_end_transaction(trans, root)) {
@@ -573,6 +615,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
573 } 615 }
574 } 616 }
575 617
618 if (trans->type < TRANS_JOIN_NOLOCK)
619 sb_end_intwrite(root->fs_info->sb);
620
576 WARN_ON(cur_trans != info->running_transaction); 621 WARN_ON(cur_trans != info->running_transaction);
577 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 622 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
578 atomic_dec(&cur_trans->num_writers); 623 atomic_dec(&cur_trans->num_writers);
@@ -604,7 +649,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
604{ 649{
605 int ret; 650 int ret;
606 651
607 ret = __btrfs_end_transaction(trans, root, 0, 1); 652 ret = __btrfs_end_transaction(trans, root, 0);
608 if (ret) 653 if (ret)
609 return ret; 654 return ret;
610 return 0; 655 return 0;
@@ -615,18 +660,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
615{ 660{
616 int ret; 661 int ret;
617 662
618 ret = __btrfs_end_transaction(trans, root, 1, 1); 663 ret = __btrfs_end_transaction(trans, root, 1);
619 if (ret)
620 return ret;
621 return 0;
622}
623
624int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root)
626{
627 int ret;
628
629 ret = __btrfs_end_transaction(trans, root, 0, 0);
630 if (ret) 664 if (ret)
631 return ret; 665 return ret;
632 return 0; 666 return 0;
@@ -635,7 +669,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
635int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, 669int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
636 struct btrfs_root *root) 670 struct btrfs_root *root)
637{ 671{
638 return __btrfs_end_transaction(trans, root, 1, 1); 672 return __btrfs_end_transaction(trans, root, 1);
639} 673}
640 674
641/* 675/*
@@ -649,13 +683,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
649 int err = 0; 683 int err = 0;
650 int werr = 0; 684 int werr = 0;
651 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 685 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
686 struct extent_state *cached_state = NULL;
652 u64 start = 0; 687 u64 start = 0;
653 u64 end; 688 u64 end;
654 689
655 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 690 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
656 mark)) { 691 mark, &cached_state)) {
657 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, 692 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
658 GFP_NOFS); 693 mark, &cached_state, GFP_NOFS);
694 cached_state = NULL;
659 err = filemap_fdatawrite_range(mapping, start, end); 695 err = filemap_fdatawrite_range(mapping, start, end);
660 if (err) 696 if (err)
661 werr = err; 697 werr = err;
@@ -679,12 +715,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
679 int err = 0; 715 int err = 0;
680 int werr = 0; 716 int werr = 0;
681 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 717 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
718 struct extent_state *cached_state = NULL;
682 u64 start = 0; 719 u64 start = 0;
683 u64 end; 720 u64 end;
684 721
685 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 722 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
686 EXTENT_NEED_WAIT)) { 723 EXTENT_NEED_WAIT, &cached_state)) {
687 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); 724 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
725 0, 0, &cached_state, GFP_NOFS);
688 err = filemap_fdatawait_range(mapping, start, end); 726 err = filemap_fdatawait_range(mapping, start, end);
689 if (err) 727 if (err)
690 werr = err; 728 werr = err;
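
Both marked-extents hunks walk an extent-bit set one contiguous range at a time, and now thread a cached_state through find_first_extent_bit() so each search can resume from cached extent state instead of starting over. The caching itself is not modeled below; this sketch only shows the shape of the range walk, over a toy page bitmap with invented helpers:

    #include <stdio.h>

    #define NPAGES    16
    #define DIRTY     0x1
    #define NEED_WAIT 0x2

    static unsigned flags[NPAGES];

    /* find_first_extent_bit() analogue: next run of pages with `bit` set */
    static int find_first_run(int from, unsigned bit, int *start, int *end)
    {
            int i = from;

            while (i < NPAGES && !(flags[i] & bit))
                    i++;
            if (i == NPAGES)
                    return 1;               /* nothing left */
            *start = i;
            while (i < NPAGES && (flags[i] & bit))
                    i++;
            *end = i - 1;
            return 0;
    }

    int main(void)
    {
            int start = 0, end;

            flags[3] = flags[4] = flags[9] = DIRTY;

            /* write pass: convert DIRTY -> NEED_WAIT, then "write" the
             * range; the wait pass mirrors this loop over NEED_WAIT */
            while (!find_first_run(start, DIRTY, &start, &end)) {
                    for (int i = start; i <= end; i++)
                            flags[i] = NEED_WAIT;
                    printf("write %d-%d\n", start, end);
                    start = end + 1;
            }
            return 0;
    }
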
@@ -809,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
809 return ret; 847 return ret;
810 848
811 ret = btrfs_run_dev_stats(trans, root->fs_info); 849 ret = btrfs_run_dev_stats(trans, root->fs_info);
812 BUG_ON(ret); 850 WARN_ON(ret);
851 ret = btrfs_run_dev_replace(trans, root->fs_info);
852 WARN_ON(ret);
813 853
814 ret = btrfs_run_qgroups(trans, root->fs_info); 854 ret = btrfs_run_qgroups(trans, root->fs_info);
815 BUG_ON(ret); 855 BUG_ON(ret);
@@ -832,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
832 switch_commit_root(fs_info->extent_root); 872 switch_commit_root(fs_info->extent_root);
833 up_write(&fs_info->extent_commit_sem); 873 up_write(&fs_info->extent_commit_sem);
834 874
875 btrfs_after_dev_replace_commit(fs_info);
876
835 return 0; 877 return 0;
836} 878}
837 879
@@ -916,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
916 struct btrfs_fs_info *info = root->fs_info; 958 struct btrfs_fs_info *info = root->fs_info;
917 struct btrfs_trans_handle *trans; 959 struct btrfs_trans_handle *trans;
918 int ret; 960 int ret;
919 unsigned long nr;
920 961
921 if (xchg(&root->defrag_running, 1)) 962 if (xchg(&root->defrag_running, 1))
922 return 0; 963 return 0;
@@ -928,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
928 969
929 ret = btrfs_defrag_leaves(trans, root, cacheonly); 970 ret = btrfs_defrag_leaves(trans, root, cacheonly);
930 971
931 nr = trans->blocks_used;
932 btrfs_end_transaction(trans, root); 972 btrfs_end_transaction(trans, root);
933 btrfs_btree_balance_dirty(info->tree_root, nr); 973 btrfs_btree_balance_dirty(info->tree_root);
934 cond_resched(); 974 cond_resched();
935 975
936 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -955,6 +995,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
955 struct btrfs_root *parent_root; 995 struct btrfs_root *parent_root;
956 struct btrfs_block_rsv *rsv; 996 struct btrfs_block_rsv *rsv;
957 struct inode *parent_inode; 997 struct inode *parent_inode;
998 struct btrfs_path *path;
999 struct btrfs_dir_item *dir_item;
958 struct dentry *parent; 1000 struct dentry *parent;
959 struct dentry *dentry; 1001 struct dentry *dentry;
960 struct extent_buffer *tmp; 1002 struct extent_buffer *tmp;
@@ -967,43 +1009,48 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 u64 root_flags; 1009 u64 root_flags;
968 uuid_le new_uuid; 1010 uuid_le new_uuid;
969 1011
970 rsv = trans->block_rsv; 1012 path = btrfs_alloc_path();
1013 if (!path) {
1014 ret = pending->error = -ENOMEM;
1015 goto path_alloc_fail;
1016 }
971 1017
972 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1018 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
973 if (!new_root_item) { 1019 if (!new_root_item) {
974 ret = pending->error = -ENOMEM; 1020 ret = pending->error = -ENOMEM;
975 goto fail; 1021 goto root_item_alloc_fail;
976 } 1022 }
977 1023
978 ret = btrfs_find_free_objectid(tree_root, &objectid); 1024 ret = btrfs_find_free_objectid(tree_root, &objectid);
979 if (ret) { 1025 if (ret) {
980 pending->error = ret; 1026 pending->error = ret;
981 goto fail; 1027 goto no_free_objectid;
982 } 1028 }
983 1029
984 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
985 1031
986 if (to_reserve > 0) { 1032 if (to_reserve > 0) {
987 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
988 to_reserve); 1034 to_reserve,
1035 BTRFS_RESERVE_NO_FLUSH);
989 if (ret) { 1036 if (ret) {
990 pending->error = ret; 1037 pending->error = ret;
991 goto fail; 1038 goto no_free_objectid;
992 } 1039 }
993 } 1040 }
994 1041
995 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, 1042 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
996 objectid, pending->inherit); 1043 objectid, pending->inherit);
997 kfree(pending->inherit);
998 if (ret) { 1044 if (ret) {
999 pending->error = ret; 1045 pending->error = ret;
1000 goto fail; 1046 goto no_free_objectid;
1001 } 1047 }
1002 1048
1003 key.objectid = objectid; 1049 key.objectid = objectid;
1004 key.offset = (u64)-1; 1050 key.offset = (u64)-1;
1005 key.type = BTRFS_ROOT_ITEM_KEY; 1051 key.type = BTRFS_ROOT_ITEM_KEY;
1006 1052
1053 rsv = trans->block_rsv;
1007 trans->block_rsv = &pending->block_rsv; 1054 trans->block_rsv = &pending->block_rsv;
1008 1055
1009 dentry = pending->dentry; 1056 dentry = pending->dentry;
@@ -1017,24 +1064,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1017 */ 1064 */
1018 ret = btrfs_set_inode_index(parent_inode, &index); 1065 ret = btrfs_set_inode_index(parent_inode, &index);
1019 BUG_ON(ret); /* -ENOMEM */ 1066 BUG_ON(ret); /* -ENOMEM */
1020 ret = btrfs_insert_dir_item(trans, parent_root, 1067
1021 dentry->d_name.name, dentry->d_name.len, 1068 /* check if there is a file/dir which has the same name. */
1022 parent_inode, &key, 1069 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
1023 BTRFS_FT_DIR, index); 1070 btrfs_ino(parent_inode),
1024 if (ret == -EEXIST) { 1071 dentry->d_name.name,
1072 dentry->d_name.len, 0);
1073 if (dir_item != NULL && !IS_ERR(dir_item)) {
1025 pending->error = -EEXIST; 1074 pending->error = -EEXIST;
1026 dput(parent);
1027 goto fail; 1075 goto fail;
1028 } else if (ret) { 1076 } else if (IS_ERR(dir_item)) {
1029 goto abort_trans_dput; 1077 ret = PTR_ERR(dir_item);
1078 btrfs_abort_transaction(trans, root, ret);
1079 goto fail;
1030 } 1080 }
1031 1081 btrfs_release_path(path);
1032 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1033 dentry->d_name.len * 2);
1034 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1035 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1036 if (ret)
1037 goto abort_trans_dput;
1038 1082
1039 /* 1083 /*
1040 * pull in the delayed directory update 1084 * pull in the delayed directory update
@@ -1043,8 +1087,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1043 * snapshot 1087 * snapshot
1044 */ 1088 */
1045 ret = btrfs_run_delayed_items(trans, root); 1089 ret = btrfs_run_delayed_items(trans, root);
1046 if (ret) { /* Transaction aborted */ 1090 if (ret) { /* Transaction aborted */
1047 dput(parent); 1091 btrfs_abort_transaction(trans, root, ret);
1048 goto fail; 1092 goto fail;
1049 } 1093 }
1050 1094
@@ -1079,7 +1123,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1079 if (ret) { 1123 if (ret) {
1080 btrfs_tree_unlock(old); 1124 btrfs_tree_unlock(old);
1081 free_extent_buffer(old); 1125 free_extent_buffer(old);
1082 goto abort_trans_dput; 1126 btrfs_abort_transaction(trans, root, ret);
1127 goto fail;
1083 } 1128 }
1084 1129
1085 btrfs_set_lock_blocking(old); 1130 btrfs_set_lock_blocking(old);
@@ -1088,8 +1133,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1088 /* clean up in any case */ 1133 /* clean up in any case */
1089 btrfs_tree_unlock(old); 1134 btrfs_tree_unlock(old);
1090 free_extent_buffer(old); 1135 free_extent_buffer(old);
1091 if (ret) 1136 if (ret) {
1092 goto abort_trans_dput; 1137 btrfs_abort_transaction(trans, root, ret);
1138 goto fail;
1139 }
1093 1140
1094 /* see comments in should_cow_block() */ 1141 /* see comments in should_cow_block() */
1095 root->force_cow = 1; 1142 root->force_cow = 1;
@@ -1101,8 +1148,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1101 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1148 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1102 btrfs_tree_unlock(tmp); 1149 btrfs_tree_unlock(tmp);
1103 free_extent_buffer(tmp); 1150 free_extent_buffer(tmp);
1104 if (ret) 1151 if (ret) {
1105 goto abort_trans_dput; 1152 btrfs_abort_transaction(trans, root, ret);
1153 goto fail;
1154 }
1106 1155
1107 /* 1156 /*
1108 * insert root back/forward references 1157 * insert root back/forward references
@@ -1111,32 +1160,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1111 parent_root->root_key.objectid, 1160 parent_root->root_key.objectid,
1112 btrfs_ino(parent_inode), index, 1161 btrfs_ino(parent_inode), index,
1113 dentry->d_name.name, dentry->d_name.len); 1162 dentry->d_name.name, dentry->d_name.len);
1114 dput(parent); 1163 if (ret) {
1115 if (ret) 1164 btrfs_abort_transaction(trans, root, ret);
1116 goto fail; 1165 goto fail;
1166 }
1117 1167
1118 key.offset = (u64)-1; 1168 key.offset = (u64)-1;
1119 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1169 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1120 if (IS_ERR(pending->snap)) { 1170 if (IS_ERR(pending->snap)) {
1121 ret = PTR_ERR(pending->snap); 1171 ret = PTR_ERR(pending->snap);
1122 goto abort_trans; 1172 btrfs_abort_transaction(trans, root, ret);
1173 goto fail;
1123 } 1174 }
1124 1175
1125 ret = btrfs_reloc_post_snapshot(trans, pending); 1176 ret = btrfs_reloc_post_snapshot(trans, pending);
1177 if (ret) {
1178 btrfs_abort_transaction(trans, root, ret);
1179 goto fail;
1180 }
1181
1182 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1183 if (ret) {
1184 btrfs_abort_transaction(trans, root, ret);
1185 goto fail;
1186 }
1187
1188 ret = btrfs_insert_dir_item(trans, parent_root,
1189 dentry->d_name.name, dentry->d_name.len,
1190 parent_inode, &key,
1191 BTRFS_FT_DIR, index);
1192 /* We have checked the name at the beginning, so it is impossible. */
1193 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1194 if (ret) {
1195 btrfs_abort_transaction(trans, root, ret);
1196 goto fail;
1197 }
1198
1199 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1200 dentry->d_name.len * 2);
1201 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1202 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
1126 if (ret) 1203 if (ret)
1127 goto abort_trans; 1204 btrfs_abort_transaction(trans, root, ret);
1128 ret = 0;
1129fail: 1205fail:
1130 kfree(new_root_item); 1206 dput(parent);
1131 trans->block_rsv = rsv; 1207 trans->block_rsv = rsv;
1208no_free_objectid:
1209 kfree(new_root_item);
1210root_item_alloc_fail:
1211 btrfs_free_path(path);
1212path_alloc_fail:
1132 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1213 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1133 return ret; 1214 return ret;
1134
1135abort_trans_dput:
1136 dput(parent);
1137abort_trans:
1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140} 1215}
1141 1216
1142/* 1217/*
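
The error handling above is restructured from the abort_trans* trampolines into a fall-through unwind ladder: each failure jumps to the label that releases everything allocated so far, in reverse order of acquisition. A generic sketch of the idiom with placeholder resources:

    #include <stdio.h>
    #include <stdlib.h>

    /* The unwind-ladder idiom used by create_pending_snapshot(): jump to
     * the label matching how far setup got; the labels then fall through,
     * releasing resources in reverse order of acquisition. */
    static int do_work(void)
    {
            int ret = 0;
            char *path, *item;

            path = malloc(64);
            if (!path) {
                    ret = -1;
                    goto path_alloc_fail;
            }
            item = malloc(64);
            if (!item) {
                    ret = -1;
                    goto item_alloc_fail;
            }

            /* ... real work would go here ... */

            free(item);
    item_alloc_fail:
            free(path);
    path_alloc_fail:
            return ret;
    }

    int main(void)
    {
            printf("do_work -> %d\n", do_work());
            return 0;
    }
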
@@ -1229,6 +1304,17 @@ static void do_async_commit(struct work_struct *work)
1229 struct btrfs_async_commit *ac = 1304 struct btrfs_async_commit *ac =
1230 container_of(work, struct btrfs_async_commit, work.work); 1305 container_of(work, struct btrfs_async_commit, work.work);
1231 1306
1307 /*
1308 * We've got freeze protection passed with the transaction.
1309 * Tell lockdep about it.
1310 */
1311 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1312 rwsem_acquire_read(
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315
1316 current->journal_info = ac->newtrans;
1317
1232 btrfs_commit_transaction(ac->newtrans, ac->root); 1318 btrfs_commit_transaction(ac->newtrans, ac->root);
1233 kfree(ac); 1319 kfree(ac);
1234} 1320}
@@ -1258,6 +1344,16 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1258 atomic_inc(&cur_trans->use_count); 1344 atomic_inc(&cur_trans->use_count);
1259 1345
1260 btrfs_end_transaction(trans, root); 1346 btrfs_end_transaction(trans, root);
1347
1348 /*
1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it.
1351 */
1352 if (trans->type < TRANS_JOIN_NOLOCK)
1353 rwsem_release(
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_);
1356
1261 schedule_delayed_work(&ac->work, 0); 1357 schedule_delayed_work(&ac->work, 0);
1262 1358
1263 /* wait for transaction to start and unblock */ 1359 /* wait for transaction to start and unblock */
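
These two hunks hand the freeze protection taken by sb_start_intwrite() from the submitting task to the async commit worker. The rwsem itself is never touched; only lockdep's per-task bookkeeping is updated, with rwsem_release() in the submitter paired against rwsem_acquire_read() in the worker. A stub model of that pairing (the annotate_* helpers merely stand in for the real lockdep calls):

    #include <stdio.h>

    /* Stand-ins for lockdep's rwsem_release()/rwsem_acquire_read():
     * lockdep tracks who *claims* to hold a lock per task, independent
     * of the lock word itself, which makes this cross-task handoff legal. */
    static int held_by;     /* 1 = submitting task, 2 = async worker */

    static void annotate_release(int task) { (void)task; held_by = 0; }
    static void annotate_acquire(int task) { held_by = task; }

    static void async_commit_worker(void)
    {
            annotate_acquire(2);    /* do_async_commit(): claim the sem */
            printf("worker commits, freeze sem held by task %d\n", held_by);
            /* ...the commit ends with the real sb_end_intwrite()... */
    }

    int main(void)
    {
            held_by = 1;            /* caller did sb_start_intwrite() */
            annotate_release(1);    /* btrfs_commit_transaction_async() */
            async_commit_worker();
            return 0;
    }
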
@@ -1306,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1306 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1402 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1307} 1403}
1308 1404
1405static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1406 struct btrfs_root *root)
1407{
1408 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1409 int snap_pending = 0;
1410 int ret;
1411
1412 if (!flush_on_commit) {
1413 spin_lock(&root->fs_info->trans_lock);
1414 if (!list_empty(&trans->transaction->pending_snapshots))
1415 snap_pending = 1;
1416 spin_unlock(&root->fs_info->trans_lock);
1417 }
1418
1419 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1);
1421 btrfs_wait_ordered_extents(root, 1);
1422 }
1423
1424 ret = btrfs_run_delayed_items(trans, root);
1425 if (ret)
1426 return ret;
1427
1428 /*
1429 * running the delayed items may have added new refs. account
1430 * them now so that they hinder processing of more delayed refs
1431 * as little as possible.
1432 */
1433 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1434
1435 /*
1436 * rename doesn't use btrfs_join_transaction, so, once we
1437 * set the transaction to blocked above, we aren't going
1438 * to get any new ordered operations. We can safely run
1439 * it here and know for sure that nothing new will be added
1440 * to the list
1441 */
1442 btrfs_run_ordered_operations(root, 1);
1443
1444 return 0;
1445}
1446
1309/* 1447/*
1310 * btrfs_transaction state sequence: 1448 * btrfs_transaction state sequence:
1311 * in_commit = 0, blocked = 0 (initial) 1449 * in_commit = 0, blocked = 0 (initial)
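
btrfs_flush_all_pending_stuffs() hoists the per-iteration work out of the commit loop below. Its gating condition is unchanged: delalloc is flushed and ordered extents are waited on when either -o flushoncommit is set or a snapshot is pending on this transaction, since a snapshot must see all dirty data on disk. The decision reduced to a truth table:

    #include <stdio.h>

    /* Gate factored into btrfs_flush_all_pending_stuffs(): flush delalloc
     * and wait on ordered extents iff flushoncommit is set or a snapshot
     * creation is queued on this transaction. */
    static int must_flush(int flush_on_commit, int snap_pending)
    {
            return flush_on_commit || snap_pending;
    }

    int main(void)
    {
            for (int f = 0; f <= 1; f++)
                    for (int s = 0; s <= 1; s++)
                            printf("flushoncommit=%d snap=%d -> flush=%d\n",
                                   f, s, must_flush(f, s));
            return 0;
    }
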
@@ -1320,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1320 struct btrfs_transaction *cur_trans = trans->transaction; 1458 struct btrfs_transaction *cur_trans = trans->transaction;
1321 struct btrfs_transaction *prev_trans = NULL; 1459 struct btrfs_transaction *prev_trans = NULL;
1322 DEFINE_WAIT(wait); 1460 DEFINE_WAIT(wait);
1323 int ret = -EIO; 1461 int ret;
1324 int should_grow = 0; 1462 int should_grow = 0;
1325 unsigned long now = get_seconds(); 1463 unsigned long now = get_seconds();
1326 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1327 1464
1328 btrfs_run_ordered_operations(root, 0); 1465 ret = btrfs_run_ordered_operations(root, 0);
1466 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction;
1469 }
1329 1470
1330 if (cur_trans->aborted) 1471 if (cur_trans->aborted) {
1472 ret = cur_trans->aborted;
1331 goto cleanup_transaction; 1473 goto cleanup_transaction;
1474 }
1332 1475
1333 /* make a pass through all the delayed refs we have so far 1476 /* make a pass through all the delayed refs we have so far
1334 * any runnings procs may add more while we are here 1477 * any runnings procs may add more while we are here
@@ -1348,6 +1491,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1348 */ 1491 */
1349 cur_trans->delayed_refs.flushing = 1; 1492 cur_trans->delayed_refs.flushing = 1;
1350 1493
1494 if (!list_empty(&trans->new_bgs))
1495 btrfs_create_pending_block_groups(trans, root);
1496
1351 ret = btrfs_run_delayed_refs(trans, root, 0); 1497 ret = btrfs_run_delayed_refs(trans, root, 0);
1352 if (ret) 1498 if (ret)
1353 goto cleanup_transaction; 1499 goto cleanup_transaction;
@@ -1393,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1393 should_grow = 1; 1539 should_grow = 1;
1394 1540
1395 do { 1541 do {
1396 int snap_pending = 0;
1397
1398 joined = cur_trans->num_joined; 1542 joined = cur_trans->num_joined;
1399 if (!list_empty(&trans->transaction->pending_snapshots))
1400 snap_pending = 1;
1401 1543
1402 WARN_ON(cur_trans != trans->transaction); 1544 WARN_ON(cur_trans != trans->transaction);
1403 1545
1404 if (flush_on_commit || snap_pending) { 1546 ret = btrfs_flush_all_pending_stuffs(trans, root);
1405 btrfs_start_delalloc_inodes(root, 1);
1406 btrfs_wait_ordered_extents(root, 0, 1);
1407 }
1408
1409 ret = btrfs_run_delayed_items(trans, root);
1410 if (ret) 1547 if (ret)
1411 goto cleanup_transaction; 1548 goto cleanup_transaction;
1412 1549
1413 /*
1414 * running the delayed items may have added new refs. account
1415 * them now so that they hinder processing of more delayed refs
1416 * as little as possible.
1417 */
1418 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1419
1420 /*
1421 * rename don't use btrfs_join_transaction, so, once we
1422 * set the transaction to blocked above, we aren't going
1423 * to get any new ordered operations. We can safely run
1424 * it here and no for sure that nothing new will be added
1425 * to the list
1426 */
1427 btrfs_run_ordered_operations(root, 1);
1428
1429 prepare_to_wait(&cur_trans->writer_wait, &wait, 1550 prepare_to_wait(&cur_trans->writer_wait, &wait,
1430 TASK_UNINTERRUPTIBLE); 1551 TASK_UNINTERRUPTIBLE);
1431 1552
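
With the flushing factored out, the loop above is a pure quiesce loop: snapshot num_joined, flush, then sleep until this task is the only writer and nobody joined during the flush. A C11 model of that loop; the should_grow heuristic is omitted and a busy-wait stands in for the waitqueue:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_int num_writers = 1;      /* the committer itself */
    static atomic_int num_joined;

    static void *late_joiner(void *arg)
    {
            (void)arg;
            atomic_fetch_add(&num_writers, 1);
            atomic_fetch_add(&num_joined, 1);
            usleep(1000);                   /* do a little work */
            atomic_fetch_sub(&num_writers, 1);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;
            int joined;

            pthread_create(&t, NULL, late_joiner, NULL);

            /* btrfs_commit_transaction()'s quiesce loop: repeat the flush
             * until we are the only writer and nobody joined meanwhile. */
            do {
                    joined = atomic_load(&num_joined);
                    /* btrfs_flush_all_pending_stuffs() would run here */
            } while (atomic_load(&num_writers) > 1 ||
                     atomic_load(&num_joined) != joined);

            pthread_join(t, NULL);
            printf("quiesced: writers=%d\n", atomic_load(&num_writers));
            return 0;
    }
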
@@ -1438,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1438 } while (atomic_read(&cur_trans->num_writers) > 1 || 1559 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1439 (should_grow && cur_trans->num_joined != joined)); 1560 (should_grow && cur_trans->num_joined != joined));
1440 1561
1562 ret = btrfs_flush_all_pending_stuffs(trans, root);
1563 if (ret)
1564 goto cleanup_transaction;
1565
1441 /* 1566 /*
1442 * Ok now we need to make sure to block out any other joins while we 1567 * Ok now we need to make sure to block out any other joins while we
1443 * commit the transaction. We could have started a join before setting 1568 * commit the transaction. We could have started a join before setting
@@ -1456,13 +1581,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1456 */ 1581 */
1457 mutex_lock(&root->fs_info->reloc_mutex); 1582 mutex_lock(&root->fs_info->reloc_mutex);
1458 1583
1459 ret = btrfs_run_delayed_items(trans, root); 1584 /*
1585 * We needn't worry about the delayed items because we will
1586 * deal with them in create_pending_snapshot(), which is the
1587 * core function of the snapshot creation.
1588 */
1589 ret = create_pending_snapshots(trans, root->fs_info);
1460 if (ret) { 1590 if (ret) {
1461 mutex_unlock(&root->fs_info->reloc_mutex); 1591 mutex_unlock(&root->fs_info->reloc_mutex);
1462 goto cleanup_transaction; 1592 goto cleanup_transaction;
1463 } 1593 }
1464 1594
1465 ret = create_pending_snapshots(trans, root->fs_info); 1595 /*
1596 * We insert the dir indexes of the snapshots and update the inodes
1597 * of the snapshots' parents after the snapshot creation, so there
1598 * are some delayed items which are not dealt with. Now deal with
1599 * them.
1600 *
1601 * We needn't worry that this operation will corrupt the snapshots,
1602 * because all the trees that are snapshotted will be forced to COW
1603 * the nodes and leaves.
1604 */
1605 ret = btrfs_run_delayed_items(trans, root);
1466 if (ret) { 1606 if (ret) {
1467 mutex_unlock(&root->fs_info->reloc_mutex); 1607 mutex_unlock(&root->fs_info->reloc_mutex);
1468 goto cleanup_transaction; 1608 goto cleanup_transaction;
@@ -1584,7 +1724,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1584 put_transaction(cur_trans); 1724 put_transaction(cur_trans);
1585 put_transaction(cur_trans); 1725 put_transaction(cur_trans);
1586 1726
1587 sb_end_intwrite(root->fs_info->sb); 1727 if (trans->type < TRANS_JOIN_NOLOCK)
1728 sb_end_intwrite(root->fs_info->sb);
1588 1729
1589 trace_btrfs_transaction_commit(root); 1730 trace_btrfs_transaction_commit(root);
1590 1731
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688b..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
47 int aborted; 47 int aborted;
48}; 48};
49 49
50enum btrfs_trans_type {
51 TRANS_START,
52 TRANS_JOIN,
53 TRANS_USERSPACE,
54 TRANS_JOIN_NOLOCK,
55 TRANS_ATTACH,
56};
57
50struct btrfs_trans_handle { 58struct btrfs_trans_handle {
51 u64 transid; 59 u64 transid;
52 u64 bytes_reserved; 60 u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
58 struct btrfs_transaction *transaction; 66 struct btrfs_transaction *transaction;
59 struct btrfs_block_rsv *block_rsv; 67 struct btrfs_block_rsv *block_rsv;
60 struct btrfs_block_rsv *orig_rsv; 68 struct btrfs_block_rsv *orig_rsv;
61 int aborted; 69 short aborted;
62 int adding_csums; 70 short adding_csums;
71 enum btrfs_trans_type type;
63 /* 72 /*
64 * this root is only needed to validate that the root passed to 73 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction. 74 * start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
68 struct btrfs_root *root; 77 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem; 78 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list; 79 struct list_head qgroup_ref_list;
80 struct list_head new_bgs;
71}; 81};
72 82
73struct btrfs_pending_snapshot { 83struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
88{ 98{
89 BTRFS_I(inode)->last_trans = trans->transaction->transid; 99 BTRFS_I(inode)->last_trans = trans->transaction->transid;
90 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 100 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
101 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
91} 102}
92 103
93int btrfs_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_end_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 105 struct btrfs_root *root);
95int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root);
97struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
98 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items);
99struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
100struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
101struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
102int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
103int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..83186c7e45d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,16 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
24#include "locking.h" 25#include "locking.h"
25#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h"
26#include "compat.h" 28#include "compat.h"
27#include "tree-log.h" 29#include "tree-log.h"
30#include "hash.h"
28 31
29/* magic values for the inode_only field in btrfs_log_inode: 32/* magic values for the inode_only field in btrfs_log_inode:
30 * 33 *
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
146 root->log_multiple_pids = true; 149 root->log_multiple_pids = true;
147 } 150 }
148 151
149 root->log_batch++; 152 atomic_inc(&root->log_batch);
150 atomic_inc(&root->log_writers); 153 atomic_inc(&root->log_writers);
151 mutex_unlock(&root->log_mutex); 154 mutex_unlock(&root->log_mutex);
152 return 0; 155 return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
165 err = ret; 168 err = ret;
166 } 169 }
167 mutex_unlock(&root->fs_info->tree_log_mutex); 170 mutex_unlock(&root->fs_info->tree_log_mutex);
168 root->log_batch++; 171 atomic_inc(&root->log_batch);
169 atomic_inc(&root->log_writers); 172 atomic_inc(&root->log_writers);
170 mutex_unlock(&root->log_mutex); 173 mutex_unlock(&root->log_mutex);
171 return err; 174 return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
484 int found_type; 487 int found_type;
485 u64 mask = root->sectorsize - 1; 488 u64 mask = root->sectorsize - 1;
486 u64 extent_end; 489 u64 extent_end;
487 u64 alloc_hint;
488 u64 start = key->offset; 490 u64 start = key->offset;
489 u64 saved_nbytes; 491 u64 saved_nbytes;
490 struct btrfs_file_extent_item *item; 492 struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 552
551 saved_nbytes = inode_get_bytes(inode); 553 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 554 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 555 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
554 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
557 if (found_type == BTRFS_FILE_EXTENT_REG || 558 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
744 */ 745 */
745static noinline int backref_in_log(struct btrfs_root *log, 746static noinline int backref_in_log(struct btrfs_root *log,
746 struct btrfs_key *key, 747 struct btrfs_key *key,
748 u64 ref_objectid,
747 char *name, int namelen) 749 char *name, int namelen)
748{ 750{
749 struct btrfs_path *path; 751 struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
764 if (ret != 0) 766 if (ret != 0)
765 goto out; 767 goto out;
766 768
767 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
768 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 769 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
770
771 if (key->type == BTRFS_INODE_EXTREF_KEY) {
772 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
773 name, namelen, NULL))
774 match = 1;
775
776 goto out;
777 }
778
779 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
769 ptr_end = ptr + item_size; 780 ptr_end = ptr + item_size;
770 while (ptr < ptr_end) { 781 while (ptr < ptr_end) {
771 ref = (struct btrfs_inode_ref *)ptr; 782 ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
786 return match; 797 return match;
787} 798}
788 799
789 800static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
790/*
791 * replay one inode back reference item found in the log tree.
792 * eb, slot and key refer to the buffer and key found in the log tree.
793 * root is the destination we are replaying into, and path is for temp
794 * use by this function. (it should be released on return).
795 */
796static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 struct btrfs_root *root, 801 struct btrfs_root *root,
798 struct btrfs_root *log,
799 struct btrfs_path *path, 802 struct btrfs_path *path,
800 struct extent_buffer *eb, int slot, 803 struct btrfs_root *log_root,
801 struct btrfs_key *key) 804 struct inode *dir, struct inode *inode,
805 struct extent_buffer *eb,
806 u64 inode_objectid, u64 parent_objectid,
807 u64 ref_index, char *name, int namelen,
808 int *search_done)
802{ 809{
803 struct btrfs_inode_ref *ref;
804 struct btrfs_dir_item *di;
805 struct inode *dir;
806 struct inode *inode;
807 unsigned long ref_ptr;
808 unsigned long ref_end;
809 char *name;
810 int namelen;
811 int ret; 810 int ret;
812 int search_done = 0; 811 char *victim_name;
813 812 int victim_name_len;
814 /* 813 struct extent_buffer *leaf;
815 * it is possible that we didn't log all the parent directories 814 struct btrfs_dir_item *di;
816 * for a given inode. If we don't find the dir, just don't 815 struct btrfs_key search_key;
817 * copy the back ref in. The link count fixup code will take 816 struct btrfs_inode_extref *extref;
818 * care of the rest
819 */
820 dir = read_one_inode(root, key->offset);
821 if (!dir)
822 return -ENOENT;
823
824 inode = read_one_inode(root, key->objectid);
825 if (!inode) {
826 iput(dir);
827 return -EIO;
828 }
829
830 ref_ptr = btrfs_item_ptr_offset(eb, slot);
831 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
832 817
833again: 818again:
834 ref = (struct btrfs_inode_ref *)ref_ptr; 819 /* Search old style refs */
835 820 search_key.objectid = inode_objectid;
836 namelen = btrfs_inode_ref_name_len(eb, ref); 821 search_key.type = BTRFS_INODE_REF_KEY;
837 name = kmalloc(namelen, GFP_NOFS); 822 search_key.offset = parent_objectid;
838 BUG_ON(!name); 823 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
839
840 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
841
842 /* if we already have a perfect match, we're done */
843 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
844 btrfs_inode_ref_index(eb, ref),
845 name, namelen)) {
846 goto out;
847 }
848
849 /*
850 * look for a conflicting back reference in the metadata.
851 * if we find one we have to unlink that name of the file
852 * before we add our new link. Later on, we overwrite any
853 * existing back reference, and we don't want to create
854 * dangling pointers in the directory.
855 */
856
857 if (search_done)
858 goto insert;
859
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) { 824 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref; 825 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr; 826 unsigned long ptr;
866 unsigned long ptr_end; 827 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0]; 828
829 leaf = path->nodes[0];
868 830
869 /* are we trying to overwrite a back ref for the root directory 831 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done 832 * if so, just jump out, we're done
871 */ 833 */
872 if (key->objectid == key->offset) 834 if (search_key.objectid == search_key.offset)
873 goto out_nowrite; 835 return 1;
874 836
875 /* check all the names in this back reference to see 837 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay 838 * if they are in the log. if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
889 (unsigned long)(victim_ref + 1), 851 (unsigned long)(victim_ref + 1),
890 victim_name_len); 852 victim_name_len);
891 853
892 if (!backref_in_log(log, key, victim_name, 854 if (!backref_in_log(log_root, &search_key,
855 parent_objectid,
856 victim_name,
893 victim_name_len)) { 857 victim_name_len)) {
894 btrfs_inc_nlink(inode); 858 btrfs_inc_nlink(inode);
895 btrfs_release_path(path); 859 btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
897 ret = btrfs_unlink_inode(trans, root, dir, 861 ret = btrfs_unlink_inode(trans, root, dir,
898 inode, victim_name, 862 inode, victim_name,
899 victim_name_len); 863 victim_name_len);
864 BUG_ON(ret);
900 btrfs_run_delayed_items(trans, root); 865 btrfs_run_delayed_items(trans, root);
866 kfree(victim_name);
867 *search_done = 1;
868 goto again;
901 } 869 }
902 kfree(victim_name); 870 kfree(victim_name);
871
903 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 872 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
904 } 873 }
905 BUG_ON(ret); 874 BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
908 * NOTE: we have searched root tree and checked the 877 * NOTE: we have searched root tree and checked the
909 * corresponding ref, it does not need to check again. 878 * corresponding ref, it does not need to check again.
910 */ 879 */
911 search_done = 1; 880 *search_done = 1;
881 }
882 btrfs_release_path(path);
883
884 /* Same search but for extended refs */
885 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
886 inode_objectid, parent_objectid, 0,
887 0);
888 if (!IS_ERR_OR_NULL(extref)) {
889 u32 item_size;
890 u32 cur_offset = 0;
891 unsigned long base;
892 struct inode *victim_parent;
893
894 leaf = path->nodes[0];
895
896 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
897 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
898
899 while (cur_offset < item_size) {
900 extref = (struct btrfs_inode_extref *)base + cur_offset;
901
902 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
903
904 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
905 goto next;
906
907 victim_name = kmalloc(victim_name_len, GFP_NOFS);
908 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
909 victim_name_len);
910
911 search_key.objectid = inode_objectid;
912 search_key.type = BTRFS_INODE_EXTREF_KEY;
913 search_key.offset = btrfs_extref_hash(parent_objectid,
914 victim_name,
915 victim_name_len);
916 ret = 0;
917 if (!backref_in_log(log_root, &search_key,
918 parent_objectid, victim_name,
919 victim_name_len)) {
920 ret = -ENOENT;
921 victim_parent = read_one_inode(root,
922 parent_objectid);
923 if (victim_parent) {
924 btrfs_inc_nlink(inode);
925 btrfs_release_path(path);
926
927 ret = btrfs_unlink_inode(trans, root,
928 victim_parent,
929 inode,
930 victim_name,
931 victim_name_len);
932 btrfs_run_delayed_items(trans, root);
933 }
934 BUG_ON(ret);
935 iput(victim_parent);
936 kfree(victim_name);
937 *search_done = 1;
938 goto again;
939 }
940 kfree(victim_name);
941 BUG_ON(ret);
942next:
943 cur_offset += victim_name_len + sizeof(*extref);
944 }
945 *search_done = 1;
912 } 946 }
913 btrfs_release_path(path); 947 btrfs_release_path(path);
914 948
915 /* look for a conflicting sequence number */ 949 /* look for a conflicting sequence number */
916 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 950 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
917 btrfs_inode_ref_index(eb, ref), 951 ref_index, name, namelen, 0);
918 name, namelen, 0);
919 if (di && !IS_ERR(di)) { 952 if (di && !IS_ERR(di)) {
920 ret = drop_one_dir_item(trans, root, path, dir, di); 953 ret = drop_one_dir_item(trans, root, path, dir, di);
921 BUG_ON(ret); 954 BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
931 } 964 }
932 btrfs_release_path(path); 965 btrfs_release_path(path);
933 966
934insert: 967 return 0;
935 /* insert our name */ 968}
936 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
937 btrfs_inode_ref_index(eb, ref));
938 BUG_ON(ret);
939 969
940 btrfs_update_inode(trans, root, inode); 970static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
971 u32 *namelen, char **name, u64 *index,
972 u64 *parent_objectid)
973{
974 struct btrfs_inode_extref *extref;
941 975
942out: 976 extref = (struct btrfs_inode_extref *)ref_ptr;
943 ref_ptr = (unsigned long)(ref + 1) + namelen; 977
944 kfree(name); 978 *namelen = btrfs_inode_extref_name_len(eb, extref);
945 if (ref_ptr < ref_end) 979 *name = kmalloc(*namelen, GFP_NOFS);
946 goto again; 980 if (*name == NULL)
981 return -ENOMEM;
982
983 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
984 *namelen);
985
986 *index = btrfs_inode_extref_index(eb, extref);
987 if (parent_objectid)
988 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
989
990 return 0;
991}
992
993static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
994 u32 *namelen, char **name, u64 *index)
995{
996 struct btrfs_inode_ref *ref;
997
998 ref = (struct btrfs_inode_ref *)ref_ptr;
999
1000 *namelen = btrfs_inode_ref_name_len(eb, ref);
1001 *name = kmalloc(*namelen, GFP_NOFS);
1002 if (*name == NULL)
1003 return -ENOMEM;
1004
1005 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1006
1007 *index = btrfs_inode_ref_index(eb, ref);
1008
1009 return 0;
1010}
1011
1012/*
1013 * replay one inode back reference item found in the log tree.
1014 * eb, slot and key refer to the buffer and key found in the log tree.
1015 * root is the destination we are replaying into, and path is for temp
1016 * use by this function. (it should be released on return).
1017 */
1018static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root,
1020 struct btrfs_root *log,
1021 struct btrfs_path *path,
1022 struct extent_buffer *eb, int slot,
1023 struct btrfs_key *key)
1024{
1025 struct inode *dir;
1026 struct inode *inode;
1027 unsigned long ref_ptr;
1028 unsigned long ref_end;
1029 char *name;
1030 int namelen;
1031 int ret;
1032 int search_done = 0;
1033 int log_ref_ver = 0;
1034 u64 parent_objectid;
1035 u64 inode_objectid;
1036 u64 ref_index = 0;
1037 int ref_struct_size;
1038
1039 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1040 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1041
1042 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1043 struct btrfs_inode_extref *r;
1044
1045 ref_struct_size = sizeof(struct btrfs_inode_extref);
1046 log_ref_ver = 1;
1047 r = (struct btrfs_inode_extref *)ref_ptr;
1048 parent_objectid = btrfs_inode_extref_parent(eb, r);
1049 } else {
1050 ref_struct_size = sizeof(struct btrfs_inode_ref);
1051 parent_objectid = key->offset;
1052 }
1053 inode_objectid = key->objectid;
1054
1055 /*
1056 * it is possible that we didn't log all the parent directories
1057 * for a given inode. If we don't find the dir, just don't
1058 * copy the back ref in. The link count fixup code will take
1059 * care of the rest
1060 */
1061 dir = read_one_inode(root, parent_objectid);
1062 if (!dir)
1063 return -ENOENT;
1064
1065 inode = read_one_inode(root, inode_objectid);
1066 if (!inode) {
1067 iput(dir);
1068 return -EIO;
1069 }
1070
1071 while (ref_ptr < ref_end) {
1072 if (log_ref_ver) {
1073 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1074 &ref_index, &parent_objectid);
1075 /*
1076 * parent object can change from one array
1077 * item to another.
1078 */
1079 if (!dir)
1080 dir = read_one_inode(root, parent_objectid);
1081 if (!dir)
1082 return -ENOENT;
1083 } else {
1084 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1085 &ref_index);
1086 }
1087 if (ret)
1088 return ret;
1089
1090 /* if we already have a perfect match, we're done */
1091 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1092 ref_index, name, namelen)) {
1093 /*
1094 * look for a conflicting back reference in the
1095 * metadata. if we find one we have to unlink that name
1096 * of the file before we add our new link. Later on, we
1097 * overwrite any existing back reference, and we don't
1098 * want to create dangling pointers in the directory.
1099 */
1100
1101 if (!search_done) {
1102 ret = __add_inode_ref(trans, root, path, log,
1103 dir, inode, eb,
1104 inode_objectid,
1105 parent_objectid,
1106 ref_index, name, namelen,
1107 &search_done);
1108 if (ret == 1)
1109 goto out;
1110 BUG_ON(ret);
1111 }
1112
1113 /* insert our name */
1114 ret = btrfs_add_link(trans, dir, inode, name, namelen,
1115 0, ref_index);
1116 BUG_ON(ret);
1117
1118 btrfs_update_inode(trans, root, inode);
1119 }
1120
1121 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1122 kfree(name);
1123 if (log_ref_ver) {
1124 iput(dir);
1125 dir = NULL;
1126 }
1127 }
947 1128
948 /* finally write the back reference in the inode */ 1129 /* finally write the back reference in the inode */
949 ret = overwrite_item(trans, root, path, eb, slot, key); 1130 ret = overwrite_item(trans, root, path, eb, slot, key);
950 BUG_ON(ret); 1131 BUG_ON(ret);
951 1132
952out_nowrite: 1133out:
953 btrfs_release_path(path); 1134 btrfs_release_path(path);
954 iput(dir); 1135 iput(dir);
955 iput(inode); 1136 iput(inode);
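
Both ref flavors that add_inode_ref() now handles are packed the same way inside one item: a fixed header followed by an inline name, with the cursor advancing by ref_struct_size + namelen per entry. A self-contained sketch of that walk; the struct layout here is invented for illustration, not btrfs's on-disk format:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Layout sketch of the ref walk in add_inode_ref(): entries pack a
     * fixed header plus an inline name, back to back inside one item. */
    struct ref_hdr {
            uint64_t index;
            uint16_t name_len;
            /* name bytes follow inline */
    } __attribute__((packed));

    int main(void)
    {
            unsigned char item[64];
            unsigned long ptr = 0, end;
            struct ref_hdr h;
            const char *names[] = { "foo", "quux" };

            /* build two packed refs: "foo" then "quux" */
            for (int i = 0; i < 2; i++) {
                    h.index = i + 2;
                    h.name_len = strlen(names[i]);
                    memcpy(item + ptr, &h, sizeof(h));
                    memcpy(item + ptr + sizeof(h), names[i], h.name_len);
                    ptr += sizeof(h) + h.name_len;
            }
            end = ptr;

            /* the walk: mirrors ref_ptr/ref_end in add_inode_ref() */
            for (ptr = 0; ptr < end; ptr += sizeof(h) + h.name_len) {
                    memcpy(&h, item + ptr, sizeof(h));
                    printf("index=%llu name=%.*s\n",
                           (unsigned long long)h.index, (int)h.name_len,
                           (char *)(item + ptr + sizeof(h)));
            }
            return 0;
    }
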
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
966 return ret; 1147 return ret;
967} 1148}
968 1149
1150static int count_inode_extrefs(struct btrfs_root *root,
1151 struct inode *inode, struct btrfs_path *path)
1152{
1153 int ret = 0;
1154 int name_len;
1155 unsigned int nlink = 0;
1156 u32 item_size;
1157 u32 cur_offset = 0;
1158 u64 inode_objectid = btrfs_ino(inode);
1159 u64 offset = 0;
1160 unsigned long ptr;
1161 struct btrfs_inode_extref *extref;
1162 struct extent_buffer *leaf;
969 1163
970/* 1164 while (1) {
971 * There are a few corners where the link count of the file can't 1165 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
972 * be properly maintained during replay. So, instead of adding 1166 &extref, &offset);
973 * lots of complexity to the log code, we just scan the backrefs 1167 if (ret)
974 * for any file that has been through replay. 1168 break;
975 * 1169
976 * The scan will update the link count on the inode to reflect the 1170 leaf = path->nodes[0];
977 * number of back refs found. If it goes down to zero, the iput 1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
978 * will free the inode. 1172 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
979 */ 1173
980static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1174 while (cur_offset < item_size) {
981 struct btrfs_root *root, 1175 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
982 struct inode *inode) 1176 name_len = btrfs_inode_extref_name_len(leaf, extref);
1177
1178 nlink++;
1179
1180 cur_offset += name_len + sizeof(*extref);
1181 }
1182
1183 offset++;
1184 btrfs_release_path(path);
1185 }
1186 btrfs_release_path(path);
1187
1188 if (ret < 0)
1189 return ret;
1190 return nlink;
1191}
1192
1193static int count_inode_refs(struct btrfs_root *root,
1194 struct inode *inode, struct btrfs_path *path)
983{ 1195{
984 struct btrfs_path *path;
985 int ret; 1196 int ret;
986 struct btrfs_key key; 1197 struct btrfs_key key;
987 u64 nlink = 0; 1198 unsigned int nlink = 0;
988 unsigned long ptr; 1199 unsigned long ptr;
989 unsigned long ptr_end; 1200 unsigned long ptr_end;
990 int name_len; 1201 int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
994 key.type = BTRFS_INODE_REF_KEY; 1205 key.type = BTRFS_INODE_REF_KEY;
995 key.offset = (u64)-1; 1206 key.offset = (u64)-1;
996 1207
997 path = btrfs_alloc_path();
998 if (!path)
999 return -ENOMEM;
1000
1001 while (1) { 1208 while (1) {
1002 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1209 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003 if (ret < 0) 1210 if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1031 btrfs_release_path(path); 1238 btrfs_release_path(path);
1032 } 1239 }
1033 btrfs_release_path(path); 1240 btrfs_release_path(path);
1241
1242 return nlink;
1243}
1244
1245/*
1246 * There are a few corners where the link count of the file can't
1247 * be properly maintained during replay. So, instead of adding
1248 * lots of complexity to the log code, we just scan the backrefs
1249 * for any file that has been through replay.
1250 *
1251 * The scan will update the link count on the inode to reflect the
1252 * number of back refs found. If it goes down to zero, the iput
1253 * will free the inode.
1254 */
1255static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1256 struct btrfs_root *root,
1257 struct inode *inode)
1258{
1259 struct btrfs_path *path;
1260 int ret;
1261 u64 nlink = 0;
1262 u64 ino = btrfs_ino(inode);
1263
1264 path = btrfs_alloc_path();
1265 if (!path)
1266 return -ENOMEM;
1267
1268 ret = count_inode_refs(root, inode, path);
1269 if (ret < 0)
1270 goto out;
1271
1272 nlink = ret;
1273
1274 ret = count_inode_extrefs(root, inode, path);
1275 if (ret == -ENOENT)
1276 ret = 0;
1277
1278 if (ret < 0)
1279 goto out;
1280
1281 nlink += ret;
1282
1283 ret = 0;
1284
1034 if (nlink != inode->i_nlink) { 1285 if (nlink != inode->i_nlink) {
1035 set_nlink(inode, nlink); 1286 set_nlink(inode, nlink);
1036 btrfs_update_inode(trans, root, inode); 1287 btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1046 ret = insert_orphan_item(trans, root, ino); 1297 ret = insert_orphan_item(trans, root, ino);
1047 BUG_ON(ret); 1298 BUG_ON(ret);
1048 } 1299 }
1049 btrfs_free_path(path);
1050 1300
1051 return 0; 1301out:
1302 btrfs_free_path(path);
1303 return ret;
1052} 1304}
1053 1305
1054static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1306static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
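
fixup_inode_link_count() now derives nlink from two scans, classic INODE_REF items plus the new INODE_EXTREF items, and treats -ENOENT from the extref scan as "no extended refs" rather than as an error. The combination logic in isolation, with hardcoded counts standing in for the real scans:

    #include <errno.h>
    #include <stdio.h>

    /* Shape of the rewritten fixup_inode_link_count(): the real link
     * count is classic back refs plus extended refs; a missing extref
     * item just means the inode has none. */
    static int count_refs(void)    { return 2; }        /* INODE_REF names */
    static int count_extrefs(void) { return -ENOENT; }  /* no EXTREF item */

    static int fixup_link_count(unsigned int *nlink)
    {
            int ret;

            ret = count_refs();
            if (ret < 0)
                    return ret;
            *nlink = ret;

            ret = count_extrefs();
            if (ret == -ENOENT)
                    ret = 0;                /* no extrefs is not an error */
            if (ret < 0)
                    return ret;
            *nlink += ret;
            return 0;
    }

    int main(void)
    {
            unsigned int nlink;

            if (!fixup_link_count(&nlink))
                    printf("nlink=%u\n", nlink);
            return 0;
    }
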
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1695 ret = add_inode_ref(wc->trans, root, log, path, 1947 ret = add_inode_ref(wc->trans, root, log, path,
1696 eb, i, &key); 1948 eb, i, &key);
1697 BUG_ON(ret && ret != -ENOENT); 1949 BUG_ON(ret && ret != -ENOENT);
1950 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
1951 ret = add_inode_ref(wc->trans, root, log, path,
1952 eb, i, &key);
1953 BUG_ON(ret && ret != -ENOENT);
1698 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1954 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1699 ret = replay_one_extent(wc->trans, root, path, 1955 ret = replay_one_extent(wc->trans, root, path,
1700 eb, i, &key); 1956 eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2037 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2293 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2038 wait_log_commit(trans, root, root->log_transid - 1); 2294 wait_log_commit(trans, root, root->log_transid - 1);
2039 while (1) { 2295 while (1) {
2040 unsigned long batch = root->log_batch; 2296 int batch = atomic_read(&root->log_batch);
2041 /* when we're on an ssd, just kick the log commit out */ 2297 /* when we're on an ssd, just kick the log commit out */
2042 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2298 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2043 mutex_unlock(&root->log_mutex); 2299 mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 mutex_lock(&root->log_mutex); 2301 mutex_lock(&root->log_mutex);
2046 } 2302 }
2047 wait_for_writer(trans, root); 2303 wait_for_writer(trans, root);
2048 if (batch == root->log_batch) 2304 if (batch == atomic_read(&root->log_batch))
2049 break; 2305 break;
2050 } 2306 }
2051 2307
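
log_batch becomes an atomic_t so start_log_trans() can bump it without holding log_mutex for the increment, and the sync loop above snapshots it, waits for writers, and retries while the snapshot went stale. A C11 model of that coalescing loop, with one simulated late joiner:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int log_batch;

    /* Model of btrfs_sync_log()'s coalescing loop: keep absorbing joiners
     * until one full pass sees no new log_batch increments. */
    static int wait_until_stable(void)
    {
            int batch, passes = 0;

            do {
                    batch = atomic_load(&log_batch);
                    passes++;
                    /* wait_for_writer() would sleep here; simulate one
                     * late joiner arriving during the first pass */
                    if (passes == 1)
                            atomic_fetch_add(&log_batch, 1);
            } while (batch != atomic_load(&log_batch));

            return passes;
    }

    int main(void)
    {
            atomic_fetch_add(&log_batch, 1);        /* start_log_trans() */
            printf("stable after %d passes\n", wait_until_stable());
            return 0;
    }
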
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2074 2330
2075 btrfs_set_root_node(&log->root_item, log->node); 2331 btrfs_set_root_node(&log->root_item, log->node);
2076 2332
2077 root->log_batch = 0;
2078 root->log_transid++; 2333 root->log_transid++;
2079 log->log_transid = root->log_transid; 2334 log->log_transid = root->log_transid;
2080 root->log_start_pid = 0; 2335 root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2087 mutex_unlock(&root->log_mutex); 2342 mutex_unlock(&root->log_mutex);
2088 2343
2089 mutex_lock(&log_root_tree->log_mutex); 2344 mutex_lock(&log_root_tree->log_mutex);
2090 log_root_tree->log_batch++; 2345 atomic_inc(&log_root_tree->log_batch);
2091 atomic_inc(&log_root_tree->log_writers); 2346 atomic_inc(&log_root_tree->log_writers);
2092 mutex_unlock(&log_root_tree->log_mutex); 2347 mutex_unlock(&log_root_tree->log_mutex);
2093 2348
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2157 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2412 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2158 btrfs_header_level(log_root_tree->node)); 2413 btrfs_header_level(log_root_tree->node));
2159 2414
2160 log_root_tree->log_batch = 0;
2161 log_root_tree->log_transid++; 2415 log_root_tree->log_transid++;
2162 smp_mb(); 2416 smp_mb();
2163 2417
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2171 * in and cause problems either. 2425 * in and cause problems either.
2172 */ 2426 */
2173 btrfs_scrub_pause_super(root); 2427 btrfs_scrub_pause_super(root);
2174 write_ctree_super(trans, root->fs_info->tree_root, 1); 2428 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2175 btrfs_scrub_continue_super(root); 2429 btrfs_scrub_continue_super(root);
2176 ret = 0; 2430 if (ret) {
2431 btrfs_abort_transaction(trans, root, ret);
2432 goto out_wake_log_root;
2433 }
2177 2434
2178 mutex_lock(&root->log_mutex); 2435 mutex_lock(&root->log_mutex);
2179 if (root->last_log_commit < log_transid) 2436 if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2209 2466
2210 while (1) { 2467 while (1) {
2211 ret = find_first_extent_bit(&log->dirty_log_pages, 2468 ret = find_first_extent_bit(&log->dirty_log_pages,
2212 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2469 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2470 NULL);
2213 if (ret) 2471 if (ret)
2214 break; 2472 break;
2215 2473
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2646 int ret; 2904 int ret;
2647 struct btrfs_key key; 2905 struct btrfs_key key;
2648 struct btrfs_key found_key; 2906 struct btrfs_key found_key;
2907 int start_slot;
2649 2908
2650 key.objectid = objectid; 2909 key.objectid = objectid;
2651 key.type = max_key_type; 2910 key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2667 if (found_key.objectid != objectid) 2926 if (found_key.objectid != objectid)
2668 break; 2927 break;
2669 2928
2670 ret = btrfs_del_item(trans, log, path); 2929 found_key.offset = 0;
2671 if (ret) 2930 found_key.type = 0;
2931 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
2932 &start_slot);
2933
2934 ret = btrfs_del_items(trans, log, path, start_slot,
2935 path->slots[0] - start_slot + 1);
2936 /*
2937 * If start slot isn't 0 then we don't need to re-search; we've
2938 * found the last item with this objectid in the tree.
2939 */
2940 if (ret || start_slot != 0)
2672 break; 2941 break;
2673 btrfs_release_path(path); 2942 btrfs_release_path(path);
2674 } 2943 }
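
The rewritten loop above deletes a whole run of log items with a single btrfs_del_items() call instead of one btrfs_del_item() per key. It works because btree keys sort by (objectid, type, offset): zeroing type and offset and binary-searching again lands on the first slot in the leaf whose objectid matches, so everything from that slot up to the current one can go in one shot. A small userspace sketch of that lower-bound search, with illustrative types rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_cmp(const struct key *a, const struct key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}

/* lower bound: first slot >= *needle, i.e. the start of the run */
static int bin_search(const struct key *leaf, int nritems,
                      const struct key *needle)
{
        int lo = 0, hi = nritems;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (key_cmp(&leaf[mid], needle) < 0)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return lo;
}

int main(void)
{
        struct key leaf[] = {
                { 5, 1, 0 }, { 9, 1, 0 }, { 9, 2, 64 }, { 9, 8, 0 },
        };
        struct key needle = { 9, 0, 0 };        /* type and offset zeroed */

        printf("run starts at slot %d\n", bin_search(leaf, 4, &needle));
        return 0;
}
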
@@ -2678,14 +2947,89 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2678 return ret; 2947 return ret;
2679} 2948}
2680 2949
2950static void fill_inode_item(struct btrfs_trans_handle *trans,
2951 struct extent_buffer *leaf,
2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only)
2954{
2955 struct btrfs_map_token token;
2956
2957 btrfs_init_map_token(&token);
2958
2959 if (log_inode_only) {
2960 /* set the generation to zero so the recovery code
2961 * can tell the difference between logging
2962 * just to say 'this inode exists' and logging
2963 * to say 'update this inode with these values'
2964 */
2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2967 } else {
2968 btrfs_set_token_inode_generation(leaf, item,
2969 BTRFS_I(inode)->generation,
2970 &token);
2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
3022}
3023
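
fill_inode_item() above writes its dozen-plus fields through a btrfs_map_token; the token caches the currently mapped part of the extent buffer so a burst of consecutive set_token_*() calls pays for at most one mapping instead of one per field. A rough userspace sketch of that caching shape, assuming a per-block translation cost — every name here is illustrative, not the kernel's:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE 4096

struct map_token {
        char *kaddr;            /* cached mapping, NULL when invalid */
        size_t block;           /* which block kaddr maps */
};

static char backing[4 * BLOCK_SIZE];
static unsigned long nr_maps;   /* counts the expensive translations */

static char *map_block(size_t block)
{
        nr_maps++;              /* stands in for the real mapping cost */
        return backing + block * BLOCK_SIZE;
}

static void write_u64(struct map_token *tok, size_t off, uint64_t val)
{
        size_t block = off / BLOCK_SIZE;

        if (!tok->kaddr || tok->block != block) {
                tok->kaddr = map_block(block);
                tok->block = block;
        }
        memcpy(tok->kaddr + off % BLOCK_SIZE, &val, sizeof(val));
}

int main(void)
{
        struct map_token tok = { 0 };
        size_t i;

        for (i = 0; i < 10; i++)        /* ten adjacent fields... */
                write_u64(&tok, 128 + 8 * i, i);
        printf("translations: %lu\n", nr_maps);  /* ...one translation */
        return 0;
}
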
2681static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
2682 struct btrfs_root *log, 3025 struct inode *inode,
2683 struct btrfs_path *dst_path, 3026 struct btrfs_path *dst_path,
2684 struct extent_buffer *src, 3027 struct extent_buffer *src,
2685 int start_slot, int nr, int inode_only) 3028 int start_slot, int nr, int inode_only)
2686{ 3029{
2687 unsigned long src_offset; 3030 unsigned long src_offset;
2688 unsigned long dst_offset; 3031 unsigned long dst_offset;
3032 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
2689 struct btrfs_file_extent_item *extent; 3033 struct btrfs_file_extent_item *extent;
2690 struct btrfs_inode_item *inode_item; 3034 struct btrfs_inode_item *inode_item;
2691 int ret; 3035 int ret;
@@ -2694,6 +3038,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2694 char *ins_data; 3038 char *ins_data;
2695 int i; 3039 int i;
2696 struct list_head ordered_sums; 3040 struct list_head ordered_sums;
3041 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2697 3042
2698 INIT_LIST_HEAD(&ordered_sums); 3043 INIT_LIST_HEAD(&ordered_sums);
2699 3044
@@ -2722,29 +3067,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2722 3067
2723 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3068 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2724 3069
2725 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3070 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2726 src_offset, ins_sizes[i]);
2727
2728 if (inode_only == LOG_INODE_EXISTS &&
2729 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2730 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3071 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2731 dst_path->slots[0], 3072 dst_path->slots[0],
2732 struct btrfs_inode_item); 3073 struct btrfs_inode_item);
2733 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 3074 fill_inode_item(trans, dst_path->nodes[0], inode_item,
2734 3075 inode, inode_only == LOG_INODE_EXISTS);
2735 /* set the generation to zero so the recovery code 3076 } else {
2736 * can tell the difference between logging 3077 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2737 * just to say 'this inode exists' and logging 3078 src_offset, ins_sizes[i]);
2738 * to say 'update this inode with these values'
2739 */
2740 btrfs_set_inode_generation(dst_path->nodes[0],
2741 inode_item, 0);
2742 } 3079 }
3080
2743 /* take a reference on file data extents so that truncates 3081 /* take a reference on file data extents so that truncates
2744 * or deletes of this inode don't have to relog the inode 3082 * or deletes of this inode don't have to relog the inode
2745 * again 3083 * again
2746 */ 3084 */
2747 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 3085 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3086 !skip_csum) {
2748 int found_type; 3087 int found_type;
2749 extent = btrfs_item_ptr(src, start_slot + i, 3088 extent = btrfs_item_ptr(src, start_slot + i,
2750 struct btrfs_file_extent_item); 3089 struct btrfs_file_extent_item);
@@ -2753,8 +3092,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2753 continue; 3092 continue;
2754 3093
2755 found_type = btrfs_file_extent_type(src, extent); 3094 found_type = btrfs_file_extent_type(src, extent);
2756 if (found_type == BTRFS_FILE_EXTENT_REG || 3095 if (found_type == BTRFS_FILE_EXTENT_REG) {
2757 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2758 u64 ds, dl, cs, cl; 3096 u64 ds, dl, cs, cl;
2759 ds = btrfs_file_extent_disk_bytenr(src, 3097 ds = btrfs_file_extent_disk_bytenr(src,
2760 extent); 3098 extent);
@@ -2803,6 +3141,299 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 3141 return ret;
2804} 3142}
2805 3143
3144static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3145{
3146 struct extent_map *em1, *em2;
3147
3148 em1 = list_entry(a, struct extent_map, list);
3149 em2 = list_entry(b, struct extent_map, list);
3150
3151 if (em1->start < em2->start)
3152 return -1;
3153 else if (em1->start > em2->start)
3154 return 1;
3155 return 0;
3156}
3157
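
extent_cmp() is the comparator handed to list_sort() further down so that extents are logged in ascending file-offset order. The same three-way rule, shown with qsort() on a plain array:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct em { uint64_t start; };

static int em_cmp(const void *a, const void *b)
{
        const struct em *em1 = a, *em2 = b;

        if (em1->start < em2->start)
                return -1;
        if (em1->start > em2->start)
                return 1;
        return 0;
}

int main(void)
{
        struct em ems[] = { { 8192 }, { 0 }, { 4096 } };
        int i;

        qsort(ems, 3, sizeof(ems[0]), em_cmp);
        for (i = 0; i < 3; i++)
                printf("%llu\n", (unsigned long long)ems[i].start);
        return 0;
}
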
3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root, struct inode *inode,
3160 struct extent_map *em,
3161 struct btrfs_path *path)
3162{
3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * We ignore previous items when we log a new extent, which can
3228 * lead to overlapping extents; so when we have an existing extent
3229 * we want to adjust, we _have_ to check the next item to make sure
3230 * we even still need this extent. This keeps us from panicking in
3231 * set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
3266
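
drop_adjacent_extents() deletes items fully covered by the newly logged range and trims the one item that extends past it: the key moves forward to the end of the new range, the extent offset advances by the clipped amount, and num_bytes shrinks to the surviving tail. The same arithmetic, worked as a standalone sketch with illustrative types (the real code edits btree items in place):

#include <stdio.h>
#include <stdint.h>

struct file_extent {
        uint64_t key_offset;    /* logical file offset of the item */
        uint64_t offset;        /* offset into the on-disk extent  */
        uint64_t num_bytes;     /* length covered by the item      */
};

/* clip against a newly logged [start, end); returns 0 to delete */
static int trim_extent(struct file_extent *fe, uint64_t start, uint64_t end)
{
        uint64_t extent_end = fe->key_offset + fe->num_bytes;

        if (extent_end <= end)
                return 0;               /* fully covered: delete */

        /* keep only the tail that lies past the new range */
        fe->offset += end - fe->key_offset;
        fe->num_bytes = extent_end - end;
        fe->key_offset = end;
        return 1;
}

int main(void)
{
        struct file_extent fe = { 0, 0, 16384 };

        if (trim_extent(&fe, 0, 4096))
                printf("tail: key=%llu offset=%llu len=%llu\n",
                       (unsigned long long)fe.key_offset,
                       (unsigned long long)fe.offset,
                       (unsigned long long)fe.num_bytes);
        return 0;
}
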
3267static int log_one_extent(struct btrfs_trans_handle *trans,
3268 struct inode *inode, struct btrfs_root *root,
3269 struct extent_map *em, struct btrfs_path *path)
3270{
3271 struct btrfs_root *log = root->log_root;
3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3276 struct btrfs_key key;
3277 u64 csum_offset = em->mod_start - em->start;
3278 u64 csum_len = em->mod_len;
3279 u64 extent_offset = em->start - em->orig_start;
3280 u64 block_len;
3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3283
3284 INIT_LIST_HEAD(&ordered_sums);
3285 btrfs_init_map_token(&token);
3286 key.objectid = btrfs_ino(inode);
3287 key.type = BTRFS_EXTENT_DATA_KEY;
3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3343
3344 /*
3345 * Have to check the extent to the right of us to make sure it doesn't
3346 * fall in our current range. We're ok if the previous extent is in our
3347 * range since the recovery stuff will run us in key order and thus just
3348 * drop the part we overwrote.
3349 */
3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3351 btrfs_release_path(path);
3352 path->really_keep_locks = 0;
3353 if (ret) {
3354 return ret;
3355 }
3356
3357 if (skip_csum)
3358 return 0;
3359
3360 /* block start is already adjusted for the file extent offset. */
3361 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3362 em->block_start + csum_offset,
3363 em->block_start + csum_offset +
3364 csum_len - 1, &ordered_sums, 0);
3365 if (ret)
3366 return ret;
3367
3368 while (!list_empty(&ordered_sums)) {
3369 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3370 struct btrfs_ordered_sum,
3371 list);
3372 if (!ret)
3373 ret = btrfs_csum_file_blocks(trans, log, sums);
3374 list_del(&sums->list);
3375 kfree(sums);
3376 }
3377
3378 return ret;
3379}
3380
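
Note that log_one_extent() looks up checksums only for the modified sub-range of the extent (the csum_offset/csum_len computed at the top), not for the whole extent. That interval arithmetic as a standalone worked example, with made-up geometry:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t start = 0, block_start = 1048576;  /* extent geometry */
        uint64_t mod_start = 8192, mod_len = 4096;  /* dirtied portion */

        uint64_t csum_offset = mod_start - start;
        uint64_t first = block_start + csum_offset;
        uint64_t last = block_start + csum_offset + mod_len - 1;

        /* only this inclusive byte range needs checksum items logged */
        printf("csum range: [%llu, %llu]\n",
               (unsigned long long)first, (unsigned long long)last);
        return 0;
}
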
3381static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3382 struct btrfs_root *root,
3383 struct inode *inode,
3384 struct btrfs_path *path)
3385{
3386 struct extent_map *em, *n;
3387 struct list_head extents;
3388 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3389 u64 test_gen;
3390 int ret = 0;
3391
3392 INIT_LIST_HEAD(&extents);
3393
3394 write_lock(&tree->lock);
3395 test_gen = root->fs_info->last_trans_committed;
3396
3397 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3398 list_del_init(&em->list);
3399 if (em->generation <= test_gen)
3400 continue;
3401 /* Need a ref to keep it from getting evicted from cache */
3402 atomic_inc(&em->refs);
3403 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3404 list_add_tail(&em->list, &extents);
3405 }
3406
3407 list_sort(NULL, &extents, extent_cmp);
3408
3409 while (!list_empty(&extents)) {
3410 em = list_entry(extents.next, struct extent_map, list);
3411
3412 list_del_init(&em->list);
3413 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3414
3415 /*
3416 * If we had an error we just need to delete everybody from our
3417 * private list.
3418 */
3419 if (ret) {
3420 free_extent_map(em);
3421 continue;
3422 }
3423
3424 write_unlock(&tree->lock);
3425
3426 ret = log_one_extent(trans, inode, root, em, path);
3427 free_extent_map(em);
3428 write_lock(&tree->lock);
3429 }
3430 WARN_ON(!list_empty(&extents));
3431 write_unlock(&tree->lock);
3432
3433 btrfs_release_path(path);
3434 return ret;
3435}
3436
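
btrfs_log_changed_extents() drains tree->modified_extents onto a private list while holding the write lock (pinning each entry with an extra reference), sorts it, then drops the lock around each log_one_extent() call so concurrent writers aren't stalled during the actual logging. A simplified sketch of that drain-then-process pattern — no refcounting, fixed-size storage, illustrative names:

#include <pthread.h>
#include <stdio.h>

#define MAX 16

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static int modified[MAX], nr_modified;

static void process_one(int v)
{
        printf("logging extent %d\n", v);       /* the slow part */
}

static void log_changed(void)
{
        int mine[MAX], nr = 0;

        pthread_rwlock_wrlock(&tree_lock);
        while (nr_modified) {                   /* drain under the lock */
                nr_modified--;
                mine[nr++] = modified[nr_modified];
        }
        pthread_rwlock_unlock(&tree_lock);

        while (nr)                              /* work with it dropped */
                process_one(mine[--nr]);
}

int main(void)
{
        pthread_rwlock_wrlock(&tree_lock);
        modified[nr_modified++] = 42;
        pthread_rwlock_unlock(&tree_lock);
        log_changed();
        return 0;
}
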
2806/* log a single inode in the tree log. 3437/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 3438 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 3439 * or be logged already.
@@ -2832,6 +3463,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3463 int nritems;
2833 int ins_start_slot = 0; 3464 int ins_start_slot = 0;
2834 int ins_nr; 3465 int ins_nr;
3466 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3467 u64 ino = btrfs_ino(inode);
2836 3468
2837 log = root->log_root; 3469 log = root->log_root;
@@ -2851,21 +3483,26 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3483
2852 max_key.objectid = ino; 3484 max_key.objectid = ino;
2853 3485
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3486
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3487 /* today the code can only do partial logging of directories */
3488 if (S_ISDIR(inode->i_mode) ||
3489 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3490 &BTRFS_I(inode)->runtime_flags) &&
3491 inode_only == LOG_INODE_EXISTS))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3492 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3493 else
2861 max_key.type = (u8)-1; 3494 max_key.type = (u8)-1;
2862 max_key.offset = (u64)-1; 3495 max_key.offset = (u64)-1;
2863 3496
2864 ret = btrfs_commit_inode_delayed_items(trans, inode); 3497 /* Only run delayed items if we are a dir or a new file */
2865 if (ret) { 3498 if (S_ISDIR(inode->i_mode) ||
2866 btrfs_free_path(path); 3499 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
2867 btrfs_free_path(dst_path); 3500 ret = btrfs_commit_inode_delayed_items(trans, inode);
2868 return ret; 3501 if (ret) {
3502 btrfs_free_path(path);
3503 btrfs_free_path(dst_path);
3504 return ret;
3505 }
2869 } 3506 }
2870 3507
2871 mutex_lock(&BTRFS_I(inode)->log_mutex); 3508 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3518,30 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3518 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3519 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3520 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3521 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3522 &BTRFS_I(inode)->runtime_flags)) {
3523 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3524 &BTRFS_I(inode)->runtime_flags);
3525 ret = btrfs_truncate_inode_items(trans, log,
3526 inode, 0, 0);
3527 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3528 &BTRFS_I(inode)->runtime_flags)) {
3529 if (inode_only == LOG_INODE_ALL)
3530 fast_search = true;
3531 max_key.type = BTRFS_XATTR_ITEM_KEY;
3532 ret = drop_objectid_items(trans, log, path, ino,
3533 max_key.type);
3534 } else {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3537 ret = log_inode_item(trans, log, dst_path, inode);
3538 if (ret) {
3539 err = ret;
3540 goto out_unlock;
3541 }
3542 goto log_extents;
3543 }
3544
2885 } 3545 }
2886 if (ret) { 3546 if (ret) {
2887 err = ret; 3547 err = ret;
@@ -2912,7 +3572,7 @@ again:
2912 goto next_slot; 3572 goto next_slot;
2913 } 3573 }
2914 3574
2915 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 3575 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2916 ins_nr, inode_only); 3576 ins_nr, inode_only);
2917 if (ret) { 3577 if (ret) {
2918 err = ret; 3578 err = ret;
@@ -2930,7 +3590,7 @@ next_slot:
2930 goto again; 3590 goto again;
2931 } 3591 }
2932 if (ins_nr) { 3592 if (ins_nr) {
2933 ret = copy_items(trans, log, dst_path, src, 3593 ret = copy_items(trans, inode, dst_path, src,
2934 ins_start_slot, 3594 ins_start_slot,
2935 ins_nr, inode_only); 3595 ins_nr, inode_only);
2936 if (ret) { 3596 if (ret) {
@@ -2951,8 +3611,7 @@ next_slot:
2951 break; 3611 break;
2952 } 3612 }
2953 if (ins_nr) { 3613 if (ins_nr) {
2954 ret = copy_items(trans, log, dst_path, src, 3614 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2955 ins_start_slot,
2956 ins_nr, inode_only); 3615 ins_nr, inode_only);
2957 if (ret) { 3616 if (ret) {
2958 err = ret; 3617 err = ret;
@@ -2960,7 +3619,25 @@ next_slot:
2960 } 3619 }
2961 ins_nr = 0; 3620 ins_nr = 0;
2962 } 3621 }
2963 WARN_ON(ins_nr); 3622
3623log_extents:
3624 if (fast_search) {
3625 btrfs_release_path(dst_path);
3626 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3627 if (ret) {
3628 err = ret;
3629 goto out_unlock;
3630 }
3631 } else {
3632 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3633 struct extent_map *em, *n;
3634
3635 write_lock(&tree->lock);
3636 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3637 list_del_init(&em->list);
3638 write_unlock(&tree->lock);
3639 }
3640
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3641 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3642 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3643 btrfs_release_path(dst_path);
@@ -2971,6 +3648,7 @@ next_slot:
2971 } 3648 }
2972 } 3649 }
2973 BTRFS_I(inode)->logged_trans = trans->transid; 3650 BTRFS_I(inode)->logged_trans = trans->transid;
3651 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
2974out_unlock: 3652out_unlock:
2975 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3653 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2976 3654
@@ -3138,7 +3816,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3138end_trans: 3816end_trans:
3139 dput(old_parent); 3817 dput(old_parent);
3140 if (ret < 0) { 3818 if (ret < 0) {
3141 BUG_ON(ret != -ENOSPC); 3819 WARN_ON(ret != -ENOSPC);
3142 root->fs_info->last_trans_log_full_commit = trans->transid; 3820 root->fs_info->last_trans_log_full_commit = trans->transid;
3143 ret = 1; 3821 ret = 1;
3144 } 3822 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3dd..99be4c138db6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
143 * In case of allocation failure -ENOMEM is returned and the ulist stays 143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
147 gfp_t gfp_mask)
148{ 147{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); 148 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150} 149}
151 150
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 151int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
153 unsigned long *old_aux, gfp_t gfp_mask) 152 u64 *old_aux, gfp_t gfp_mask)
154{ 153{
155 int i; 154 int i;
156 155
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec8130..21a1963439c3 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
33 */ 33 */
34struct ulist_node { 34struct ulist_node {
35 u64 val; /* value to store */ 35 u64 val; /* value to store */
36 unsigned long aux; /* auxiliary value saved along with the val */ 36 u64 aux; /* auxiliary value saved along with the val */
37}; 37};
38 38
39struct ulist { 39struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
65void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
66struct ulist *ulist_alloc(gfp_t gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
67void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
69 gfp_t gfp_mask); 69int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 70 u64 *old_aux, gfp_t gfp_mask);
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist, 71struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter); 72 struct ulist_iterator *uiter);
74 73
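
The ulist change widens the aux member from unsigned long to u64. On 32-bit kernels unsigned long is four bytes, so callers stashing 64-bit values such as bytenrs in aux would silently truncate them. A short userspace demonstration of the truncation, with uint64_t standing in for u64:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t bytenr = 0x100000000ULL;       /* needs 33 bits */
        unsigned long aux = (unsigned long)bytenr;

        /* on a 32-bit build this prints 4 8 and then 0; on 64-bit
         * the value survives because unsigned long is 8 bytes */
        printf("sizeof(unsigned long)=%zu sizeof(uint64_t)=%zu\n",
               sizeof(unsigned long), sizeof(uint64_t));
        printf("stored back: %llu\n", (unsigned long long)aux);
        return 0;
}
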
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb71..5cce6aa74012 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
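
btrfs_get_bdev_and_sb() folds the open / flush / set_blocksize / read-super sequence that used to be duplicated at several call sites into one helper that returns both the block device and the superblock buffer head through out-parameters, unwinding everything on failure. A userspace analog of that out-parameter-plus-unwind shape — a file and a fixed-size header standing in for the bdev and super block, all names hypothetical:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define HDR_SIZE 4096

static int get_file_and_header(const char *path, FILE **fp, char **hdr)
{
        int ret;

        *fp = fopen(path, "rb");
        if (!*fp) {
                ret = -errno;
                fprintf(stderr, "open %s failed\n", path);
                goto error;
        }
        *hdr = malloc(HDR_SIZE);
        if (!*hdr) {
                ret = -ENOMEM;
                fclose(*fp);
                goto error;
        }
        if (fread(*hdr, 1, HDR_SIZE, *fp) != HDR_SIZE) {
                ret = -EINVAL;          /* no valid header: unwind both */
                free(*hdr);
                fclose(*fp);
                goto error;
        }
        return 0;

error:
        *fp = NULL;
        *hdr = NULL;
        return ret;
}

int main(void)
{
        FILE *fp;
        char *hdr;

        /* any file of at least 4 KiB will do here */
        if (get_file_and_header("/bin/sh", &fp, &hdr) == 0) {
                puts("opened and validated");
                free(hdr);
                fclose(fp);
        }
        return 0;
}
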
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree and it is known
553 * whether the procedure is really active or
554 * not, which means whether this device is
555 * used or whether it should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
641 if (IS_ERR(bdev)) { 711 &bdev, &bh);
642 printk(KERN_INFO "open %s failed\n", device->name->str); 712 if (ret)
643 goto error; 713 continue;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096); 808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
756 if (ret) 809 if (ret)
757 goto error_close; 810 goto error;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1433 ret = btrfs_get_bdev_and_sb(device_path,
1375 root->fs_info->bdev_holder); 1434 FMODE_READ | FMODE_EXCL,
1376 if (IS_ERR(bdev)) { 1435 root->fs_info->bdev_holder, 0,
1377 ret = PTR_ERR(bdev); 1436 &bdev, &bh);
1437 if (ret)
1378 goto out; 1438 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1475,11 +1537,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1475 free_fs_devices(cur_devices); 1537 free_fs_devices(cur_devices);
1476 } 1538 }
1477 1539
1540 root->fs_info->num_tolerated_disk_barrier_failures =
1541 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1542
1478 /* 1543 /*
1479 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1480 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1481 */ 1546 */
1482 if (clear_super) { 1547 if (clear_super && disk_super) {
1483 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1484 * the FS anymore 1549 * the FS anymore
1485 */ 1550 */
@@ -1490,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1490 1555
1491 ret = 0; 1556 ret = 0;
1492 1557
1558 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560
1493error_brelse: 1561error_brelse:
1494 brelse(bh); 1562 brelse(bh);
1495error_close:
1496 if (bdev) 1563 if (bdev)
1497 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1564 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1498out: 1565out:
@@ -1509,6 +1576,112 @@ error_undo:
1509 goto error_brelse; 1576 goto error_brelse;
1510} 1577}
1511 1578
1579void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1580 struct btrfs_device *srcdev)
1581{
1582 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1583 list_del_rcu(&srcdev->dev_list);
1584 list_del_rcu(&srcdev->dev_alloc_list);
1585 fs_info->fs_devices->num_devices--;
1586 if (srcdev->missing) {
1587 fs_info->fs_devices->missing_devices--;
1588 fs_info->fs_devices->rw_devices++;
1589 }
1590 if (srcdev->can_discard)
1591 fs_info->fs_devices->num_can_discard--;
1592 if (srcdev->bdev)
1593 fs_info->fs_devices->open_devices--;
1594
1595 call_rcu(&srcdev->rcu, free_device);
1596}
1597
1598void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1599 struct btrfs_device *tgtdev)
1600{
1601 struct btrfs_device *next_device;
1602
1603 WARN_ON(!tgtdev);
1604 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1605 if (tgtdev->bdev) {
1606 btrfs_scratch_superblock(tgtdev);
1607 fs_info->fs_devices->open_devices--;
1608 }
1609 fs_info->fs_devices->num_devices--;
1610 if (tgtdev->can_discard)
1611 fs_info->fs_devices->num_can_discard++;
1612
1613 next_device = list_entry(fs_info->fs_devices->devices.next,
1614 struct btrfs_device, dev_list);
1615 if (tgtdev->bdev == fs_info->sb->s_bdev)
1616 fs_info->sb->s_bdev = next_device->bdev;
1617 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1618 fs_info->fs_devices->latest_bdev = next_device->bdev;
1619 list_del_rcu(&tgtdev->dev_list);
1620
1621 call_rcu(&tgtdev->rcu, free_device);
1622
1623 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1624}
1625
1626int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1627 struct btrfs_device **device)
1628{
1629 int ret = 0;
1630 struct btrfs_super_block *disk_super;
1631 u64 devid;
1632 u8 *dev_uuid;
1633 struct block_device *bdev;
1634 struct buffer_head *bh;
1635
1636 *device = NULL;
1637 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1638 root->fs_info->bdev_holder, 0, &bdev, &bh);
1639 if (ret)
1640 return ret;
1641 disk_super = (struct btrfs_super_block *)bh->b_data;
1642 devid = btrfs_stack_device_id(&disk_super->dev_item);
1643 dev_uuid = disk_super->dev_item.uuid;
1644 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1645 disk_super->fsid);
1646 brelse(bh);
1647 if (!*device)
1648 ret = -ENOENT;
1649 blkdev_put(bdev, FMODE_READ);
1650 return ret;
1651}
1652
1653int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1654 char *device_path,
1655 struct btrfs_device **device)
1656{
1657 *device = NULL;
1658 if (strcmp(device_path, "missing") == 0) {
1659 struct list_head *devices;
1660 struct btrfs_device *tmp;
1661
1662 devices = &root->fs_info->fs_devices->devices;
1663 /*
1664 * It is safe to read the devices since the volume_mutex
1665 * is held by the caller.
1666 */
1667 list_for_each_entry(tmp, devices, dev_list) {
1668 if (tmp->in_fs_metadata && !tmp->bdev) {
1669 *device = tmp;
1670 break;
1671 }
1672 }
1673
1674 if (!*device) {
1675 pr_err("btrfs: no missing device found\n");
1676 return -ENOENT;
1677 }
1678
1679 return 0;
1680 } else {
1681 return btrfs_find_device_by_path(root, device_path, device);
1682 }
1683}
1684
1512/* 1685/*
1513 * does all the dirty work required for changing file system's UUID. 1686 * does all the dirty work required for changing file system's UUID.
1514 */ 1687 */
@@ -1627,7 +1800,8 @@ next_slot:
1627 read_extent_buffer(leaf, fs_uuid, 1800 read_extent_buffer(leaf, fs_uuid,
1628 (unsigned long)btrfs_device_fsid(dev_item), 1801 (unsigned long)btrfs_device_fsid(dev_item),
1629 BTRFS_UUID_SIZE); 1802 BTRFS_UUID_SIZE);
1630 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1803 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1804 fs_uuid);
1631 BUG_ON(!device); /* Logic error */ 1805 BUG_ON(!device); /* Logic error */
1632 1806
1633 if (device->fs_devices->seeding) { 1807 if (device->fs_devices->seeding) {
@@ -1675,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1675 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1849 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1676 1850
1677 devices = &root->fs_info->fs_devices->devices; 1851 devices = &root->fs_info->fs_devices->devices;
1678 /* 1852
1679 * we have the volume lock, so we don't need the extra 1853 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1680 * device list mutex while reading the list here.
1681 */
1682 list_for_each_entry(device, devices, dev_list) { 1854 list_for_each_entry(device, devices, dev_list) {
1683 if (device->bdev == bdev) { 1855 if (device->bdev == bdev) {
1684 ret = -EEXIST; 1856 ret = -EEXIST;
1857 mutex_unlock(
1858 &root->fs_info->fs_devices->device_list_mutex);
1685 goto error; 1859 goto error;
1686 } 1860 }
1687 } 1861 }
1862 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1688 1863
1689 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 device = kzalloc(sizeof(*device), GFP_NOFS);
1690 if (!device) { 1865 if (!device) {
@@ -1734,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1734 device->dev_root = root->fs_info->dev_root; 1909 device->dev_root = root->fs_info->dev_root;
1735 device->bdev = bdev; 1910 device->bdev = bdev;
1736 device->in_fs_metadata = 1; 1911 device->in_fs_metadata = 1;
1912 device->is_tgtdev_for_dev_replace = 0;
1737 device->mode = FMODE_EXCL; 1913 device->mode = FMODE_EXCL;
1738 set_blocksize(device->bdev, 4096); 1914 set_blocksize(device->bdev, 4096);
1739 1915
@@ -1775,15 +1951,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1775 1951
1776 if (seeding_dev) { 1952 if (seeding_dev) {
1777 ret = init_first_rw_device(trans, root, device); 1953 ret = init_first_rw_device(trans, root, device);
1778 if (ret) 1954 if (ret) {
1955 btrfs_abort_transaction(trans, root, ret);
1779 goto error_trans; 1956 goto error_trans;
1957 }
1780 ret = btrfs_finish_sprout(trans, root); 1958 ret = btrfs_finish_sprout(trans, root);
1781 if (ret) 1959 if (ret) {
1960 btrfs_abort_transaction(trans, root, ret);
1782 goto error_trans; 1961 goto error_trans;
1962 }
1783 } else { 1963 } else {
1784 ret = btrfs_add_device(trans, root, device); 1964 ret = btrfs_add_device(trans, root, device);
1785 if (ret) 1965 if (ret) {
1966 btrfs_abort_transaction(trans, root, ret);
1786 goto error_trans; 1967 goto error_trans;
1968 }
1787 } 1969 }
1788 1970
1789 /* 1971 /*
@@ -1793,6 +1975,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1793 btrfs_clear_space_info_full(root->fs_info); 1975 btrfs_clear_space_info_full(root->fs_info);
1794 1976
1795 unlock_chunks(root); 1977 unlock_chunks(root);
1978 root->fs_info->num_tolerated_disk_barrier_failures =
1979 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1796 ret = btrfs_commit_transaction(trans, root); 1980 ret = btrfs_commit_transaction(trans, root);
1797 1981
1798 if (seeding_dev) { 1982 if (seeding_dev) {
@@ -1808,13 +1992,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1808 "Failed to relocate sys chunks after " 1992 "Failed to relocate sys chunks after "
1809 "device initialization. This can be fixed " 1993 "device initialization. This can be fixed "
1810 "using the \"btrfs balance\" command."); 1994 "using the \"btrfs balance\" command.");
1995 trans = btrfs_attach_transaction(root);
1996 if (IS_ERR(trans)) {
1997 if (PTR_ERR(trans) == -ENOENT)
1998 return 0;
1999 return PTR_ERR(trans);
2000 }
2001 ret = btrfs_commit_transaction(trans, root);
1811 } 2002 }
1812 2003
1813 return ret; 2004 return ret;
1814 2005
1815error_trans: 2006error_trans:
1816 unlock_chunks(root); 2007 unlock_chunks(root);
1817 btrfs_abort_transaction(trans, root, ret);
1818 btrfs_end_transaction(trans, root); 2008 btrfs_end_transaction(trans, root);
1819 rcu_string_free(device->name); 2009 rcu_string_free(device->name);
1820 kfree(device); 2010 kfree(device);
@@ -1827,6 +2017,98 @@ error:
1827 return ret; 2017 return ret;
1828} 2018}
1829 2019
2020int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2021 struct btrfs_device **device_out)
2022{
2023 struct request_queue *q;
2024 struct btrfs_device *device;
2025 struct block_device *bdev;
2026 struct btrfs_fs_info *fs_info = root->fs_info;
2027 struct list_head *devices;
2028 struct rcu_string *name;
2029 int ret = 0;
2030
2031 *device_out = NULL;
2032 if (fs_info->fs_devices->seeding)
2033 return -EINVAL;
2034
2035 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2036 fs_info->bdev_holder);
2037 if (IS_ERR(bdev))
2038 return PTR_ERR(bdev);
2039
2040 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2041
2042 devices = &fs_info->fs_devices->devices;
2043 list_for_each_entry(device, devices, dev_list) {
2044 if (device->bdev == bdev) {
2045 ret = -EEXIST;
2046 goto error;
2047 }
2048 }
2049
2050 device = kzalloc(sizeof(*device), GFP_NOFS);
2051 if (!device) {
2052 ret = -ENOMEM;
2053 goto error;
2054 }
2055
2056 name = rcu_string_strdup(device_path, GFP_NOFS);
2057 if (!name) {
2058 kfree(device);
2059 ret = -ENOMEM;
2060 goto error;
2061 }
2062 rcu_assign_pointer(device->name, name);
2063
2064 q = bdev_get_queue(bdev);
2065 if (blk_queue_discard(q))
2066 device->can_discard = 1;
2067 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2068 device->writeable = 1;
2069 device->work.func = pending_bios_fn;
2070 generate_random_uuid(device->uuid);
2071 device->devid = BTRFS_DEV_REPLACE_DEVID;
2072 spin_lock_init(&device->io_lock);
2073 device->generation = 0;
2074 device->io_width = root->sectorsize;
2075 device->io_align = root->sectorsize;
2076 device->sector_size = root->sectorsize;
2077 device->total_bytes = i_size_read(bdev->bd_inode);
2078 device->disk_total_bytes = device->total_bytes;
2079 device->dev_root = fs_info->dev_root;
2080 device->bdev = bdev;
2081 device->in_fs_metadata = 1;
2082 device->is_tgtdev_for_dev_replace = 1;
2083 device->mode = FMODE_EXCL;
2084 set_blocksize(device->bdev, 4096);
2085 device->fs_devices = fs_info->fs_devices;
2086 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2087 fs_info->fs_devices->num_devices++;
2088 fs_info->fs_devices->open_devices++;
2089 if (device->can_discard)
2090 fs_info->fs_devices->num_can_discard++;
2091 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2092
2093 *device_out = device;
2094 return ret;
2095
2096error:
2097 blkdev_put(bdev, FMODE_EXCL);
2098 return ret;
2099}
2100
2101void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2102 struct btrfs_device *tgtdev)
2103{
2104 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2105 tgtdev->io_width = fs_info->dev_root->sectorsize;
2106 tgtdev->io_align = fs_info->dev_root->sectorsize;
2107 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2108 tgtdev->dev_root = fs_info->dev_root;
2109 tgtdev->in_fs_metadata = 1;
2110}
2111
1830static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2112static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1831 struct btrfs_device *device) 2113 struct btrfs_device *device)
1832{ 2114{
@@ -1883,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1883 2165
1884 if (!device->writeable) 2166 if (!device->writeable)
1885 return -EACCES; 2167 return -EACCES;
1886 if (new_size <= device->total_bytes) 2168 if (new_size <= device->total_bytes ||
2169 device->is_tgtdev_for_dev_replace)
1887 return -EINVAL; 2170 return -EINVAL;
1888 2171
1889 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2172 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2321,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2321 return 1; 2604 return 1;
2322} 2605}
2323 2606
2324static u64 div_factor_fine(u64 num, int factor)
2325{
2326 if (factor <= 0)
2327 return 0;
2328 if (factor >= 100)
2329 return num;
2330
2331 num *= factor;
2332 do_div(num, 100);
2333 return num;
2334}
2335
2336static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2607static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2337 struct btrfs_balance_args *bargs) 2608 struct btrfs_balance_args *bargs)
2338{ 2609{
@@ -2497,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2497 return 1; 2768 return 1;
2498} 2769}
2499 2770
2500static u64 div_factor(u64 num, int factor)
2501{
2502 if (factor == 10)
2503 return num;
2504 num *= factor;
2505 do_div(num, 10);
2506 return num;
2507}
2508
2509static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2771static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2510{ 2772{
2511 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2773 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2533,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2533 size_to_free = div_factor(old_size, 1); 2795 size_to_free = div_factor(old_size, 1);
2534 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2796 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2535 if (!device->writeable || 2797 if (!device->writeable ||
2536 device->total_bytes - device->bytes_used > size_to_free) 2798 device->total_bytes - device->bytes_used > size_to_free ||
2799 device->is_tgtdev_for_dev_replace)
2537 continue; 2800 continue;
2538 2801
2539 ret = btrfs_shrink_device(device, old_size - size_to_free); 2802 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2711,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2711 u64 allowed; 2974 u64 allowed;
2712 int mixed = 0; 2975 int mixed = 0;
2713 int ret; 2976 int ret;
2977 u64 num_devices;
2714 2978
2715 if (btrfs_fs_closing(fs_info) || 2979 if (btrfs_fs_closing(fs_info) ||
2716 atomic_read(&fs_info->balance_pause_req) || 2980 atomic_read(&fs_info->balance_pause_req) ||
@@ -2739,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2739 } 3003 }
2740 } 3004 }
2741 3005
3006 num_devices = fs_info->fs_devices->num_devices;
3007 btrfs_dev_replace_lock(&fs_info->dev_replace);
3008 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3009 BUG_ON(num_devices < 1);
3010 num_devices--;
3011 }
3012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2742 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3013 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2743 if (fs_info->fs_devices->num_devices == 1) 3014 if (num_devices == 1)
2744 allowed |= BTRFS_BLOCK_GROUP_DUP; 3015 allowed |= BTRFS_BLOCK_GROUP_DUP;
2745 else if (fs_info->fs_devices->num_devices < 4) 3016 else if (num_devices < 4)
2746 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3017 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2747 else 3018 else
2748 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3019 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2804,6 +3075,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2804 } 3075 }
2805 } 3076 }
2806 3077
3078 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3079 int num_tolerated_disk_barrier_failures;
3080 u64 target = bctl->sys.target;
3081
3082 num_tolerated_disk_barrier_failures =
3083 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3084 if (num_tolerated_disk_barrier_failures > 0 &&
3085 (target &
3086 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3087 BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3088 num_tolerated_disk_barrier_failures = 0;
3089 else if (num_tolerated_disk_barrier_failures > 1 &&
3090 (target &
3091 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3092 num_tolerated_disk_barrier_failures = 1;
3093
3094 fs_info->num_tolerated_disk_barrier_failures =
3095 num_tolerated_disk_barrier_failures;
3096 }
3097
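The conversion target decides how many disks may fail a barrier flush before the filesystem must stop committing: profiles without redundancy (single, RAID0, DUP on one disk) tolerate none, RAID1/RAID10 tolerate one. The same clamp as a standalone helper, a sketch with a hypothetical name rather than patch code:

    static int tolerated_barrier_failures(u64 target, int cur_max)
    {
        if (cur_max > 0 &&
            (target & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
                       BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
            return 0;
        if (cur_max > 1 &&
            (target & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
            return 1;
        return cur_max;
    }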
2807 ret = insert_balance_item(fs_info->tree_root, bctl); 3098 ret = insert_balance_item(fs_info->tree_root, bctl);
2808 if (ret && ret != -EEXIST) 3099 if (ret && ret != -EEXIST)
2809 goto out; 3100 goto out;
@@ -2836,6 +3127,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2836 __cancel_balance(fs_info); 3127 __cancel_balance(fs_info);
2837 } 3128 }
2838 3129
3130 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3131 fs_info->num_tolerated_disk_barrier_failures =
3132 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3133 }
3134
2839 wake_up(&fs_info->balance_wait_q); 3135 wake_up(&fs_info->balance_wait_q);
2840 3136
2841 return ret; 3137 return ret;
@@ -2860,6 +3156,7 @@ static int balance_kthread(void *data)
2860 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3156 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2861 } 3157 }
2862 3158
3159 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2863 mutex_unlock(&fs_info->balance_mutex); 3160 mutex_unlock(&fs_info->balance_mutex);
2864 mutex_unlock(&fs_info->volume_mutex); 3161 mutex_unlock(&fs_info->volume_mutex);
2865 3162
@@ -2882,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2882 return 0; 3179 return 0;
2883 } 3180 }
2884 3181
3182 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
2885 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2886 if (IS_ERR(tsk)) 3184 if (IS_ERR(tsk))
2887 return PTR_ERR(tsk); 3185 return PTR_ERR(tsk);
@@ -3038,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3038 u64 old_size = device->total_bytes; 3336 u64 old_size = device->total_bytes;
3039 u64 diff = device->total_bytes - new_size; 3337 u64 diff = device->total_bytes - new_size;
3040 3338
3041 if (new_size >= device->total_bytes) 3339 if (device->is_tgtdev_for_dev_replace)
3042 return -EINVAL; 3340 return -EINVAL;
3043 3341
3044 path = btrfs_alloc_path(); 3342 path = btrfs_alloc_path();
@@ -3193,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3193 return 0; 3491 return 0;
3194} 3492}
3195 3493
3494struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3495 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3496 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3497 { 1, 2, 1, 1, 1, 2 /* dup */ },
3498 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3499 { 1, 1, 0, 1, 1, 1 /* single */ },
3500};
3501
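The table gives one row per RAID profile so that every allocation parameter comes from a single indexed lookup; it replaces the per-profile if/else chain deleted in the hunk below. An example of the intended use (a sketch; __get_raid_index() is added elsewhere in this series and is assumed to return rows in the order above):

    int index = __get_raid_index(BTRFS_BLOCK_GROUP_RAID1);   /* second row */
    int ncopies  = btrfs_raid_array[index].ncopies;  /* 2: data is mirrored */
    int devs_min = btrfs_raid_array[index].devs_min; /* needs >= 2 devices */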
3196static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3502static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3197 struct btrfs_root *extent_root, 3503 struct btrfs_root *extent_root,
3198 struct map_lookup **map_ret, 3504 struct map_lookup **map_ret,
@@ -3222,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3222 int ndevs; 3528 int ndevs;
3223 int i; 3529 int i;
3224 int j; 3530 int j;
3531 int index;
3225 3532
3226 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 BUG_ON(!alloc_profile_is_valid(type, 0));
3227 3534
3228 if (list_empty(&fs_devices->alloc_list)) 3535 if (list_empty(&fs_devices->alloc_list))
3229 return -ENOSPC; 3536 return -ENOSPC;
3230 3537
3231 sub_stripes = 1; 3538 index = __get_raid_index(type);
3232 dev_stripes = 1;
3233 devs_increment = 1;
3234 ncopies = 1;
3235 devs_max = 0; /* 0 == as many as possible */
3236 devs_min = 1;
3237 3539
3238 /* 3540 sub_stripes = btrfs_raid_array[index].sub_stripes;
3239 * define the properties of each RAID type. 3541 dev_stripes = btrfs_raid_array[index].dev_stripes;
3240 * FIXME: move this to a global table and use it in all RAID 3542 devs_max = btrfs_raid_array[index].devs_max;
3241 * calculation code 3543 devs_min = btrfs_raid_array[index].devs_min;
3242 */ 3544 devs_increment = btrfs_raid_array[index].devs_increment;
3243 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3545 ncopies = btrfs_raid_array[index].ncopies;
3244 dev_stripes = 2;
3245 ncopies = 2;
3246 devs_max = 1;
3247 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3248 devs_min = 2;
3249 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3250 devs_increment = 2;
3251 ncopies = 2;
3252 devs_max = 2;
3253 devs_min = 2;
3254 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3255 sub_stripes = 2;
3256 devs_increment = 2;
3257 ncopies = 2;
3258 devs_min = 4;
3259 } else {
3260 devs_max = 1;
3261 }
3262 3546
3263 if (type & BTRFS_BLOCK_GROUP_DATA) { 3547 if (type & BTRFS_BLOCK_GROUP_DATA) {
3264 max_stripe_size = 1024 * 1024 * 1024; 3548 max_stripe_size = 1024 * 1024 * 1024;
@@ -3305,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3305 cur = cur->next; 3589 cur = cur->next;
3306 3590
3307 if (!device->writeable) { 3591 if (!device->writeable) {
3308 printk(KERN_ERR 3592 WARN(1, KERN_ERR
3309 "btrfs: read-only device in alloc_list\n"); 3593 "btrfs: read-only device in alloc_list\n");
3310 WARN_ON(1);
3311 continue; 3594 continue;
3312 } 3595 }
3313 3596
3314 if (!device->in_fs_metadata) 3597 if (!device->in_fs_metadata ||
3598 device->is_tgtdev_for_dev_replace)
3315 continue; 3599 continue;
3316 3600
3317 if (device->total_bytes > device->bytes_used) 3601 if (device->total_bytes > device->bytes_used)
@@ -3340,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3340 devices_info[ndevs].total_avail = total_avail; 3624 devices_info[ndevs].total_avail = total_avail;
3341 devices_info[ndevs].dev = device; 3625 devices_info[ndevs].dev = device;
3342 ++ndevs; 3626 ++ndevs;
3627 WARN_ON(ndevs > fs_devices->rw_devices);
3343 } 3628 }
3344 3629
3345 /* 3630 /*
@@ -3608,12 +3893,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3608 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3893 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3609 &sys_chunk_size, &sys_stripe_size, 3894 &sys_chunk_size, &sys_stripe_size,
3610 sys_chunk_offset, alloc_profile); 3895 sys_chunk_offset, alloc_profile);
3611 if (ret) 3896 if (ret) {
3612 goto abort; 3897 btrfs_abort_transaction(trans, root, ret);
3898 goto out;
3899 }
3613 3900
3614 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3901 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3615 if (ret) 3902 if (ret) {
3616 goto abort; 3903 btrfs_abort_transaction(trans, root, ret);
3904 goto out;
3905 }
3617 3906
3618 /* 3907 /*
3619 * Modifying chunk tree needs allocating new blocks from both 3908 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3912,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3623 */ 3912 */
3624 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3913 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3625 chunk_size, stripe_size); 3914 chunk_size, stripe_size);
3626 if (ret) 3915 if (ret) {
3627 goto abort; 3916 btrfs_abort_transaction(trans, root, ret);
3917 goto out;
3918 }
3628 3919
3629 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3920 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3630 sys_chunk_offset, sys_chunk_size, 3921 sys_chunk_offset, sys_chunk_size,
3631 sys_stripe_size); 3922 sys_stripe_size);
3632 if (ret) 3923 if (ret)
3633 goto abort; 3924 btrfs_abort_transaction(trans, root, ret);
3634 3925
3635 return 0; 3926out:
3636 3927
3637abort:
3638 btrfs_abort_transaction(trans, root, ret);
3639 return ret; 3928 return ret;
3640} 3929}
3641 3930
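Replacing the shared abort: label with an abort at each failing call site is deliberate: btrfs_abort_transaction() is a macro that records the function and line where it expands, so aborting in place points the log at the exact step that failed. The pattern in isolation (a sketch):

    ret = do_step(trans);                          /* any of the calls above */
    if (ret) {
        btrfs_abort_transaction(trans, root, ret); /* logged at this site */
        goto out;
    }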
@@ -3694,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3694 } 3983 }
3695} 3984}
3696 3985
3697int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3986int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3698{ 3987{
3988 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3699 struct extent_map *em; 3989 struct extent_map *em;
3700 struct map_lookup *map; 3990 struct map_lookup *map;
3701 struct extent_map_tree *em_tree = &map_tree->map_tree; 3991 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3715,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3715 else 4005 else
3716 ret = 1; 4006 ret = 1;
3717 free_extent_map(em); 4007 free_extent_map(em);
4008
4009 btrfs_dev_replace_lock(&fs_info->dev_replace);
4010 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4011 ret++;
4012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4013
3718 return ret; 4014 return ret;
3719} 4015}
3720 4016
3721static int find_live_mirror(struct map_lookup *map, int first, int num, 4017static int find_live_mirror(struct btrfs_fs_info *fs_info,
3722 int optimal) 4018 struct map_lookup *map, int first, int num,
4019 int optimal, int dev_replace_is_ongoing)
3723{ 4020{
3724 int i; 4021 int i;
3725 if (map->stripes[optimal].dev->bdev) 4022 int tolerance;
3726 return optimal; 4023 struct btrfs_device *srcdev;
3727 for (i = first; i < first + num; i++) { 4024
3728 if (map->stripes[i].dev->bdev) 4025 if (dev_replace_is_ongoing &&
3729 return i; 4026 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4027 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4028 srcdev = fs_info->dev_replace.srcdev;
4029 else
4030 srcdev = NULL;
4031
4032 /*
4033 * try to avoid the drive that is the source drive for a
 4034 * dev-replace procedure; only choose it if no other non-missing
4035 * mirror is available
4036 */
4037 for (tolerance = 0; tolerance < 2; tolerance++) {
4038 if (map->stripes[optimal].dev->bdev &&
4039 (tolerance || map->stripes[optimal].dev != srcdev))
4040 return optimal;
4041 for (i = first; i < first + num; i++) {
4042 if (map->stripes[i].dev->bdev &&
4043 (tolerance || map->stripes[i].dev != srcdev))
4044 return i;
4045 }
3730 } 4046 }
4047
3731 /* we couldn't find one that doesn't fail. Just return something 4048 /* we couldn't find one that doesn't fail. Just return something
3732 * and the io error handling code will clean up eventually 4049 * and the io error handling code will clean up eventually
3733 */ 4050 */
3734 return optimal; 4051 return optimal;
3735} 4052}
3736 4053
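The rewritten find_live_mirror() makes two passes: pass 0 accepts only present mirrors that are not the replace source, pass 1 accepts the source as a last resort. The control flow reduced to a self-contained sketch (the callback parameters stand in for the stripe and source-device checks):

    static int pick_mirror(int first, int num, int optimal,
                           int (*present)(int), int (*is_src)(int))
    {
        int tolerance, i;

        for (tolerance = 0; tolerance < 2; tolerance++) {
            if (present(optimal) && (tolerance || !is_src(optimal)))
                return optimal;
            for (i = first; i < first + num; i++)
                if (present(i) && (tolerance || !is_src(i)))
                    return i;
        }
        return optimal; /* nothing usable; the I/O error path cleans up */
    }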
3737static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4054static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3738 u64 logical, u64 *length, 4055 u64 logical, u64 *length,
3739 struct btrfs_bio **bbio_ret, 4056 struct btrfs_bio **bbio_ret,
3740 int mirror_num) 4057 int mirror_num)
3741{ 4058{
3742 struct extent_map *em; 4059 struct extent_map *em;
3743 struct map_lookup *map; 4060 struct map_lookup *map;
4061 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3744 struct extent_map_tree *em_tree = &map_tree->map_tree; 4062 struct extent_map_tree *em_tree = &map_tree->map_tree;
3745 u64 offset; 4063 u64 offset;
3746 u64 stripe_offset; 4064 u64 stripe_offset;
@@ -3754,13 +4072,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3754 int num_stripes; 4072 int num_stripes;
3755 int max_errors = 0; 4073 int max_errors = 0;
3756 struct btrfs_bio *bbio = NULL; 4074 struct btrfs_bio *bbio = NULL;
4075 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4076 int dev_replace_is_ongoing = 0;
4077 int num_alloc_stripes;
4078 int patch_the_first_stripe_for_dev_replace = 0;
4079 u64 physical_to_patch_in_first_stripe = 0;
3757 4080
3758 read_lock(&em_tree->lock); 4081 read_lock(&em_tree->lock);
3759 em = lookup_extent_mapping(em_tree, logical, *length); 4082 em = lookup_extent_mapping(em_tree, logical, *length);
3760 read_unlock(&em_tree->lock); 4083 read_unlock(&em_tree->lock);
3761 4084
3762 if (!em) { 4085 if (!em) {
3763 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 4086 printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
3764 (unsigned long long)logical, 4087 (unsigned long long)logical,
3765 (unsigned long long)*length); 4088 (unsigned long long)*length);
3766 BUG(); 4089 BUG();
@@ -3770,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3770 map = (struct map_lookup *)em->bdev; 4093 map = (struct map_lookup *)em->bdev;
3771 offset = logical - em->start; 4094 offset = logical - em->start;
3772 4095
3773 if (mirror_num > map->num_stripes)
3774 mirror_num = 0;
3775
3776 stripe_nr = offset; 4096 stripe_nr = offset;
3777 /* 4097 /*
3778 * stripe_nr counts the total number of stripes we have to stride 4098 * stripe_nr counts the total number of stripes we have to stride
@@ -3799,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3799 if (!bbio_ret) 4119 if (!bbio_ret)
3800 goto out; 4120 goto out;
3801 4121
4122 btrfs_dev_replace_lock(dev_replace);
4123 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4124 if (!dev_replace_is_ongoing)
4125 btrfs_dev_replace_unlock(dev_replace);
4126
4127 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4128 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4129 dev_replace->tgtdev != NULL) {
4130 /*
 4131 * In the dev-replace case, for repair (the only case where
 4132 * the mirror is selected explicitly when calling
 4133 * btrfs_map_block), blocks left of the left cursor can
 4134 * also be read from the target drive.
4135 * For REQ_GET_READ_MIRRORS, the target drive is added as
4136 * the last one to the array of stripes. For READ, it also
4137 * needs to be supported using the same mirror number.
4138 * If the requested block is not left of the left cursor,
4139 * EIO is returned. This can happen because btrfs_num_copies()
4140 * returns one more in the dev-replace case.
4141 */
4142 u64 tmp_length = *length;
4143 struct btrfs_bio *tmp_bbio = NULL;
4144 int tmp_num_stripes;
4145 u64 srcdev_devid = dev_replace->srcdev->devid;
4146 int index_srcdev = 0;
4147 int found = 0;
4148 u64 physical_of_found = 0;
4149
4150 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4151 logical, &tmp_length, &tmp_bbio, 0);
4152 if (ret) {
4153 WARN_ON(tmp_bbio != NULL);
4154 goto out;
4155 }
4156
4157 tmp_num_stripes = tmp_bbio->num_stripes;
4158 if (mirror_num > tmp_num_stripes) {
4159 /*
4160 * REQ_GET_READ_MIRRORS does not contain this
4161 * mirror, that means that the requested area
4162 * is not left of the left cursor
4163 */
4164 ret = -EIO;
4165 kfree(tmp_bbio);
4166 goto out;
4167 }
4168
4169 /*
4170 * process the rest of the function using the mirror_num
4171 * of the source drive. Therefore look it up first.
4172 * At the end, patch the device pointer to the one of the
4173 * target drive.
4174 */
4175 for (i = 0; i < tmp_num_stripes; i++) {
4176 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4177 /*
4178 * In case of DUP, in order to keep it
4179 * simple, only add the mirror with the
4180 * lowest physical address
4181 */
4182 if (found &&
4183 physical_of_found <=
4184 tmp_bbio->stripes[i].physical)
4185 continue;
4186 index_srcdev = i;
4187 found = 1;
4188 physical_of_found =
4189 tmp_bbio->stripes[i].physical;
4190 }
4191 }
4192
4193 if (found) {
4194 mirror_num = index_srcdev + 1;
4195 patch_the_first_stripe_for_dev_replace = 1;
4196 physical_to_patch_in_first_stripe = physical_of_found;
4197 } else {
4198 WARN_ON(1);
4199 ret = -EIO;
4200 kfree(tmp_bbio);
4201 goto out;
4202 }
4203
4204 kfree(tmp_bbio);
4205 } else if (mirror_num > map->num_stripes) {
4206 mirror_num = 0;
4207 }
4208
3802 num_stripes = 1; 4209 num_stripes = 1;
3803 stripe_index = 0; 4210 stripe_index = 0;
3804 stripe_nr_orig = stripe_nr; 4211 stripe_nr_orig = stripe_nr;
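The block above reserves the mirror number map->num_stripes + 1 for the replace target: btrfs_num_copies() advertises one extra copy while a replace is ongoing, and a read with that mirror number only succeeds for blocks left of the copy cursor, otherwise -EIO comes back. A hedged sketch of the repair-side caller (btrfs_map_block() is from this patch, the surrounding shape is assumed):

    /* try the replace target as one extra mirror for a repair read */
    ret = btrfs_map_block(fs_info, READ, logical, &length, &bbio,
                          num_stripes + 1);
    if (ret == -EIO) {
        /* area not yet copied to the target; fall back to a real mirror */
    }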
@@ -3813,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3813 stripe_nr_end - stripe_nr_orig); 4220 stripe_nr_end - stripe_nr_orig);
3814 stripe_index = do_div(stripe_nr, map->num_stripes); 4221 stripe_index = do_div(stripe_nr, map->num_stripes);
3815 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3816 if (rw & (REQ_WRITE | REQ_DISCARD)) 4223 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3817 num_stripes = map->num_stripes; 4224 num_stripes = map->num_stripes;
3818 else if (mirror_num) 4225 else if (mirror_num)
3819 stripe_index = mirror_num - 1; 4226 stripe_index = mirror_num - 1;
3820 else { 4227 else {
3821 stripe_index = find_live_mirror(map, 0, 4228 stripe_index = find_live_mirror(fs_info, map, 0,
3822 map->num_stripes, 4229 map->num_stripes,
3823 current->pid % map->num_stripes); 4230 current->pid % map->num_stripes,
4231 dev_replace_is_ongoing);
3824 mirror_num = stripe_index + 1; 4232 mirror_num = stripe_index + 1;
3825 } 4233 }
3826 4234
3827 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3828 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4236 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3829 num_stripes = map->num_stripes; 4237 num_stripes = map->num_stripes;
3830 } else if (mirror_num) { 4238 } else if (mirror_num) {
3831 stripe_index = mirror_num - 1; 4239 stripe_index = mirror_num - 1;
@@ -3839,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3839 stripe_index = do_div(stripe_nr, factor); 4247 stripe_index = do_div(stripe_nr, factor);
3840 stripe_index *= map->sub_stripes; 4248 stripe_index *= map->sub_stripes;
3841 4249
3842 if (rw & REQ_WRITE) 4250 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3843 num_stripes = map->sub_stripes; 4251 num_stripes = map->sub_stripes;
3844 else if (rw & REQ_DISCARD) 4252 else if (rw & REQ_DISCARD)
3845 num_stripes = min_t(u64, map->sub_stripes * 4253 num_stripes = min_t(u64, map->sub_stripes *
@@ -3849,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3849 stripe_index += mirror_num - 1; 4257 stripe_index += mirror_num - 1;
3850 else { 4258 else {
3851 int old_stripe_index = stripe_index; 4259 int old_stripe_index = stripe_index;
3852 stripe_index = find_live_mirror(map, stripe_index, 4260 stripe_index = find_live_mirror(fs_info, map,
4261 stripe_index,
3853 map->sub_stripes, stripe_index + 4262 map->sub_stripes, stripe_index +
3854 current->pid % map->sub_stripes); 4263 current->pid % map->sub_stripes,
4264 dev_replace_is_ongoing);
3855 mirror_num = stripe_index - old_stripe_index + 1; 4265 mirror_num = stripe_index - old_stripe_index + 1;
3856 } 4266 }
3857 } else { 4267 } else {
@@ -3865,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3865 } 4275 }
3866 BUG_ON(stripe_index >= map->num_stripes); 4276 BUG_ON(stripe_index >= map->num_stripes);
3867 4277
3868 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4278 num_alloc_stripes = num_stripes;
4279 if (dev_replace_is_ongoing) {
4280 if (rw & (REQ_WRITE | REQ_DISCARD))
4281 num_alloc_stripes <<= 1;
4282 if (rw & REQ_GET_READ_MIRRORS)
4283 num_alloc_stripes++;
4284 }
4285 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3869 if (!bbio) { 4286 if (!bbio) {
3870 ret = -ENOMEM; 4287 ret = -ENOMEM;
3871 goto out; 4288 goto out;
@@ -3952,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3952 } 4369 }
3953 } 4370 }
3954 4371
3955 if (rw & REQ_WRITE) { 4372 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
3956 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4373 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3957 BTRFS_BLOCK_GROUP_RAID10 | 4374 BTRFS_BLOCK_GROUP_RAID10 |
3958 BTRFS_BLOCK_GROUP_DUP)) { 4375 BTRFS_BLOCK_GROUP_DUP)) {
@@ -3960,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3960 } 4377 }
3961 } 4378 }
3962 4379
4380 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4381 dev_replace->tgtdev != NULL) {
4382 int index_where_to_add;
4383 u64 srcdev_devid = dev_replace->srcdev->devid;
4384
4385 /*
4386 * duplicate the write operations while the dev replace
4387 * procedure is running. Since the copying of the old disk
4388 * to the new disk takes place at run time while the
4389 * filesystem is mounted writable, the regular write
4390 * operations to the old disk have to be duplicated to go
4391 * to the new disk as well.
4392 * Note that device->missing is handled by the caller, and
4393 * that the write to the old disk is already set up in the
4394 * stripes array.
4395 */
4396 index_where_to_add = num_stripes;
4397 for (i = 0; i < num_stripes; i++) {
4398 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4399 /* write to new disk, too */
4400 struct btrfs_bio_stripe *new =
4401 bbio->stripes + index_where_to_add;
4402 struct btrfs_bio_stripe *old =
4403 bbio->stripes + i;
4404
4405 new->physical = old->physical;
4406 new->length = old->length;
4407 new->dev = dev_replace->tgtdev;
4408 index_where_to_add++;
4409 max_errors++;
4410 }
4411 }
4412 num_stripes = index_where_to_add;
4413 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4414 dev_replace->tgtdev != NULL) {
4415 u64 srcdev_devid = dev_replace->srcdev->devid;
4416 int index_srcdev = 0;
4417 int found = 0;
4418 u64 physical_of_found = 0;
4419
4420 /*
4421 * During the dev-replace procedure, the target drive can
4422 * also be used to read data in case it is needed to repair
4423 * a corrupt block elsewhere. This is possible if the
4424 * requested area is left of the left cursor. In this area,
4425 * the target drive is a full copy of the source drive.
4426 */
4427 for (i = 0; i < num_stripes; i++) {
4428 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4429 /*
4430 * In case of DUP, in order to keep it
4431 * simple, only add the mirror with the
4432 * lowest physical address
4433 */
4434 if (found &&
4435 physical_of_found <=
4436 bbio->stripes[i].physical)
4437 continue;
4438 index_srcdev = i;
4439 found = 1;
4440 physical_of_found = bbio->stripes[i].physical;
4441 }
4442 }
4443 if (found) {
4444 u64 length = map->stripe_len;
4445
4446 if (physical_of_found + length <=
4447 dev_replace->cursor_left) {
4448 struct btrfs_bio_stripe *tgtdev_stripe =
4449 bbio->stripes + num_stripes;
4450
4451 tgtdev_stripe->physical = physical_of_found;
4452 tgtdev_stripe->length =
4453 bbio->stripes[index_srcdev].length;
4454 tgtdev_stripe->dev = dev_replace->tgtdev;
4455
4456 num_stripes++;
4457 }
4458 }
4459 }
4460
3963 *bbio_ret = bbio; 4461 *bbio_ret = bbio;
3964 bbio->num_stripes = num_stripes; 4462 bbio->num_stripes = num_stripes;
3965 bbio->max_errors = max_errors; 4463 bbio->max_errors = max_errors;
3966 bbio->mirror_num = mirror_num; 4464 bbio->mirror_num = mirror_num;
4465
4466 /*
 4467 * This is the case where REQ_READ && dev_replace_is_ongoing &&
 4468 * mirror_num == num_stripes + 1 && the dev_replace target drive
 4469 * is available as a mirror
4470 */
4471 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4472 WARN_ON(num_stripes > 1);
4473 bbio->stripes[0].dev = dev_replace->tgtdev;
4474 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4475 bbio->mirror_num = map->num_stripes + 1;
4476 }
3967out: 4477out:
4478 if (dev_replace_is_ongoing)
4479 btrfs_dev_replace_unlock(dev_replace);
3968 free_extent_map(em); 4480 free_extent_map(em);
3969 return ret; 4481 return ret;
3970} 4482}
3971 4483
3972int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4484int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3973 u64 logical, u64 *length, 4485 u64 logical, u64 *length,
3974 struct btrfs_bio **bbio_ret, int mirror_num) 4486 struct btrfs_bio **bbio_ret, int mirror_num)
3975{ 4487{
3976 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4488 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
3977 mirror_num); 4489 mirror_num);
3978} 4490}
3979 4491
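Two asymmetric additions sit above: writes and discards that hit the replace source get a duplicated stripe aimed at the target, with max_errors raised so a failure of the not-yet-complete copy does not fail the whole write, while REQ_GET_READ_MIRRORS may append the target as one extra read mirror for blocks left of the cursor. The write-duplication step, reduced to a sketch:

    /* sketch: give every stripe on the source disk a twin on the target */
    int n = num_stripes;
    for (i = 0; i < num_stripes; i++) {
        if (bbio->stripes[i].dev->devid != srcdev_devid)
            continue;
        bbio->stripes[n] = bbio->stripes[i]; /* same physical and length */
        bbio->stripes[n].dev = tgtdev;       /* redirected to the new disk */
        n++;
        max_errors++;            /* the twin may fail while copying runs */
    }
    num_stripes = n;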
@@ -4192,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4192 &device->work); 4704 &device->work);
4193} 4705}
4194 4706
4707static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4708 sector_t sector)
4709{
4710 struct bio_vec *prev;
4711 struct request_queue *q = bdev_get_queue(bdev);
4712 unsigned short max_sectors = queue_max_sectors(q);
4713 struct bvec_merge_data bvm = {
4714 .bi_bdev = bdev,
4715 .bi_sector = sector,
4716 .bi_rw = bio->bi_rw,
4717 };
4718
4719 if (bio->bi_vcnt == 0) {
4720 WARN_ON(1);
4721 return 1;
4722 }
4723
4724 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4725 if ((bio->bi_size >> 9) > max_sectors)
4726 return 0;
4727
4728 if (!q->merge_bvec_fn)
4729 return 1;
4730
4731 bvm.bi_size = bio->bi_size - prev->bv_len;
4732 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4733 return 0;
4734 return 1;
4735}
4736
4737static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4738 struct bio *bio, u64 physical, int dev_nr,
4739 int rw, int async)
4740{
4741 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4742
4743 bio->bi_private = bbio;
4744 bio->bi_private = merge_stripe_index_into_bio_private(
4745 bio->bi_private, (unsigned int)dev_nr);
4746 bio->bi_end_io = btrfs_end_bio;
4747 bio->bi_sector = physical >> 9;
4748#ifdef DEBUG
4749 {
4750 struct rcu_string *name;
4751
4752 rcu_read_lock();
4753 name = rcu_dereference(dev->name);
4754 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4755 "(%s id %llu), size=%u\n", rw,
4756 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4757 name->str, dev->devid, bio->bi_size);
4758 rcu_read_unlock();
4759 }
4760#endif
4761 bio->bi_bdev = dev->bdev;
4762 if (async)
4763 schedule_bio(root, dev, rw, bio);
4764 else
4765 btrfsic_submit_bio(rw, bio);
4766}
4767
4768static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4769 struct bio *first_bio, struct btrfs_device *dev,
4770 int dev_nr, int rw, int async)
4771{
4772 struct bio_vec *bvec = first_bio->bi_io_vec;
4773 struct bio *bio;
4774 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4775 u64 physical = bbio->stripes[dev_nr].physical;
4776
4777again:
4778 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4779 if (!bio)
4780 return -ENOMEM;
4781
4782 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4783 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4784 bvec->bv_offset) < bvec->bv_len) {
4785 u64 len = bio->bi_size;
4786
4787 atomic_inc(&bbio->stripes_pending);
4788 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4789 rw, async);
4790 physical += len;
4791 goto again;
4792 }
4793 bvec++;
4794 }
4795
4796 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4797 return 0;
4798}
4799
4800static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4801{
4802 atomic_inc(&bbio->error);
4803 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4804 bio->bi_private = bbio->private;
4805 bio->bi_end_io = bbio->end_io;
4806 bio->bi_bdev = (struct block_device *)
4807 (unsigned long)bbio->mirror_num;
4808 bio->bi_sector = logical >> 9;
4809 kfree(bbio);
4810 bio_endio(bio, -EIO);
4811 }
4812}
4813
4195int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4814int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4196 int mirror_num, int async_submit) 4815 int mirror_num, int async_submit)
4197{ 4816{
4198 struct btrfs_mapping_tree *map_tree;
4199 struct btrfs_device *dev; 4817 struct btrfs_device *dev;
4200 struct bio *first_bio = bio; 4818 struct bio *first_bio = bio;
4201 u64 logical = (u64)bio->bi_sector << 9; 4819 u64 logical = (u64)bio->bi_sector << 9;
@@ -4207,17 +4825,16 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4207 struct btrfs_bio *bbio = NULL; 4825 struct btrfs_bio *bbio = NULL;
4208 4826
4209 length = bio->bi_size; 4827 length = bio->bi_size;
4210 map_tree = &root->fs_info->mapping_tree;
4211 map_length = length; 4828 map_length = length;
4212 4829
4213 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4830 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4214 mirror_num); 4831 mirror_num);
4215 if (ret) /* -ENOMEM */ 4832 if (ret)
4216 return ret; 4833 return ret;
4217 4834
4218 total_devs = bbio->num_stripes; 4835 total_devs = bbio->num_stripes;
4219 if (map_length < length) { 4836 if (map_length < length) {
4220 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 4837 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4221 "len %llu\n", (unsigned long long)logical, 4838 "len %llu\n", (unsigned long long)logical,
4222 (unsigned long long)length, 4839 (unsigned long long)length,
4223 (unsigned long long)map_length); 4840 (unsigned long long)map_length);
@@ -4230,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4230 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4847 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4231 4848
4232 while (dev_nr < total_devs) { 4849 while (dev_nr < total_devs) {
4850 dev = bbio->stripes[dev_nr].dev;
4851 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4852 bbio_error(bbio, first_bio, logical);
4853 dev_nr++;
4854 continue;
4855 }
4856
4857 /*
 4858 * Check and see if we're ok with this bio based on its size
4859 * and offset with the given device.
4860 */
4861 if (!bio_size_ok(dev->bdev, first_bio,
4862 bbio->stripes[dev_nr].physical >> 9)) {
4863 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4864 dev_nr, rw, async_submit);
4865 BUG_ON(ret);
4866 dev_nr++;
4867 continue;
4868 }
4869
4233 if (dev_nr < total_devs - 1) { 4870 if (dev_nr < total_devs - 1) {
4234 bio = bio_clone(first_bio, GFP_NOFS); 4871 bio = bio_clone(first_bio, GFP_NOFS);
4235 BUG_ON(!bio); /* -ENOMEM */ 4872 BUG_ON(!bio); /* -ENOMEM */
4236 } else { 4873 } else {
4237 bio = first_bio; 4874 bio = first_bio;
4238 } 4875 }
4239 bio->bi_private = bbio; 4876
4240 bio->bi_private = merge_stripe_index_into_bio_private( 4877 submit_stripe_bio(root, bbio, bio,
4241 bio->bi_private, (unsigned int)dev_nr); 4878 bbio->stripes[dev_nr].physical, dev_nr, rw,
4242 bio->bi_end_io = btrfs_end_bio; 4879 async_submit);
4243 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4244 dev = bbio->stripes[dev_nr].dev;
4245 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4246#ifdef DEBUG
4247 struct rcu_string *name;
4248
4249 rcu_read_lock();
4250 name = rcu_dereference(dev->name);
4251 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
4252 "(%s id %llu), size=%u\n", rw,
4253 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4254 name->str, dev->devid, bio->bi_size);
4255 rcu_read_unlock();
4256#endif
4257 bio->bi_bdev = dev->bdev;
4258 if (async_submit)
4259 schedule_bio(root, dev, rw, bio);
4260 else
4261 btrfsic_submit_bio(rw, bio);
4262 } else {
4263 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4264 bio->bi_sector = logical >> 9;
4265 bio_endio(bio, -EIO);
4266 }
4267 dev_nr++; 4880 dev_nr++;
4268 } 4881 }
4269 return 0; 4882 return 0;
4270} 4883}
4271 4884
4272struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4885struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4273 u8 *uuid, u8 *fsid) 4886 u8 *uuid, u8 *fsid)
4274{ 4887{
4275 struct btrfs_device *device; 4888 struct btrfs_device *device;
4276 struct btrfs_fs_devices *cur_devices; 4889 struct btrfs_fs_devices *cur_devices;
4277 4890
4278 cur_devices = root->fs_info->fs_devices; 4891 cur_devices = fs_info->fs_devices;
4279 while (cur_devices) { 4892 while (cur_devices) {
4280 if (!fsid || 4893 if (!fsid ||
4281 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4894 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4356,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4356 em->bdev = (struct block_device *)map; 4969 em->bdev = (struct block_device *)map;
4357 em->start = logical; 4970 em->start = logical;
4358 em->len = length; 4971 em->len = length;
4972 em->orig_start = 0;
4359 em->block_start = 0; 4973 em->block_start = 0;
4360 em->block_len = em->len; 4974 em->block_len = em->len;
4361 4975
@@ -4373,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4373 read_extent_buffer(leaf, uuid, (unsigned long) 4987 read_extent_buffer(leaf, uuid, (unsigned long)
4374 btrfs_stripe_dev_uuid_nr(chunk, i), 4988 btrfs_stripe_dev_uuid_nr(chunk, i),
4375 BTRFS_UUID_SIZE); 4989 BTRFS_UUID_SIZE);
4376 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4990 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4377 NULL); 4991 uuid, NULL);
4378 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4992 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4379 kfree(map); 4993 kfree(map);
4380 free_extent_map(em); 4994 free_extent_map(em);
@@ -4415,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4415 device->io_align = btrfs_device_io_align(leaf, dev_item); 5029 device->io_align = btrfs_device_io_align(leaf, dev_item);
4416 device->io_width = btrfs_device_io_width(leaf, dev_item); 5030 device->io_width = btrfs_device_io_width(leaf, dev_item);
4417 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5031 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5032 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5033 device->is_tgtdev_for_dev_replace = 0;
4418 5034
4419 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5035 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4420 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5036 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4492,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
4492 return ret; 5108 return ret;
4493 } 5109 }
4494 5110
4495 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5111 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4496 if (!device || !device->bdev) { 5112 if (!device || !device->bdev) {
4497 if (!btrfs_test_opt(root, DEGRADED)) 5113 if (!btrfs_test_opt(root, DEGRADED))
4498 return -EIO; 5114 return -EIO;
@@ -4525,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
4525 fill_device_from_item(leaf, dev_item, device); 5141 fill_device_from_item(leaf, dev_item, device);
4526 device->dev_root = root->fs_info->dev_root; 5142 device->dev_root = root->fs_info->dev_root;
4527 device->in_fs_metadata = 1; 5143 device->in_fs_metadata = 1;
4528 if (device->writeable) { 5144 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4529 device->fs_devices->total_rw_bytes += device->total_bytes; 5145 device->fs_devices->total_rw_bytes += device->total_bytes;
4530 spin_lock(&root->fs_info->free_chunk_lock); 5146 spin_lock(&root->fs_info->free_chunk_lock);
4531 root->fs_info->free_chunk_space += device->total_bytes - 5147 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4884,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4884 int i; 5500 int i;
4885 5501
4886 mutex_lock(&fs_devices->device_list_mutex); 5502 mutex_lock(&fs_devices->device_list_mutex);
4887 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5503 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4888 mutex_unlock(&fs_devices->device_list_mutex); 5504 mutex_unlock(&fs_devices->device_list_mutex);
4889 5505
4890 if (!dev) { 5506 if (!dev) {
@@ -4912,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4912 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5528 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4913 return 0; 5529 return 0;
4914} 5530}
5531
5532int btrfs_scratch_superblock(struct btrfs_device *device)
5533{
5534 struct buffer_head *bh;
5535 struct btrfs_super_block *disk_super;
5536
5537 bh = btrfs_read_dev_super(device->bdev);
5538 if (!bh)
5539 return -EINVAL;
5540 disk_super = (struct btrfs_super_block *)bh->b_data;
5541
5542 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5543 set_buffer_dirty(bh);
5544 sync_dirty_buffer(bh);
5545 brelse(bh);
5546
5547 return 0;
5548}
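Zeroing only the magic field is enough: btrfs_read_dev_super() rejects any superblock whose magic does not match, so after the sync the retired disk no longer identifies as btrfs and cannot be scanned back into the array, while the rest of the superblock remains intact on disk. A sketch of the intended call site (the dev-replace and device-removal paths added in this series; exact shape assumed):

    /* retire the replaced source disk so future device scans ignore it */
    ret = btrfs_scratch_superblock(srcdev);
    if (ret)
        pr_warn("btrfs: could not scratch superblock on old device\n");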
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
 189 int ncopies; /* how many copies the data has */
190};
191
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc55..9acb846c3e7f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "btrfs: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "btrfs: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "btrfs: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342