aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-10-09 21:49:20 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-09 21:49:20 -0400
commit 72055425e53540d9d0e59a57ac8c9b8ce77b62d5 (patch)
tree 8033d7d7bfdf8725eed785d02f7121d201052d2e
parent fc81c038c2d61d4fcd8150f383fec1ce23087597 (diff)
parent f46dbe3dee853f8a860f889cb2b7ff4c624f2a7a (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
 "This is a large pull, with the bulk of the updates coming from:

   - Hole punching
   - send/receive fixes
   - fsync performance
   - Disk format extension allowing more hardlinks inside a single directory (btrfs-progs patch required to enable the compat bit for this one)

  I'm cooking more unrelated RAID code, but I wanted to make sure this original batch makes it in. The largest updates here are relatively old and have been in testing for some time."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (121 commits)
  btrfs: init ref_index to zero in add_inode_ref
  Btrfs: remove repeated eb->pages check in, disk-io.c/csum_dirty_buffer
  Btrfs: fix page leakage
  Btrfs: do not warn_on when we cannot alloc a page for an extent buffer
  Btrfs: don't bug on enomem in readpage
  Btrfs: cleanup pages properly when ENOMEM in compression
  Btrfs: make filesystem read-only when submitting barrier fails
  Btrfs: detect corrupted filesystem after write I/O errors
  Btrfs: make compress and nodatacow mount options mutually exclusive
  btrfs: fix message printing
  Btrfs: don't bother committing delayed inode updates when fsyncing
  btrfs: move inline function code to header file
  Btrfs: remove unnecessary IS_ERR in bio_readpage_error()
  btrfs: remove unused function btrfs_insert_some_items()
  Btrfs: don't commit instead of overcommitting
  Btrfs: confirmation of value is added before trace_btrfs_get_extent() is called
  Btrfs: be smarter about dropping things from the tree log
  Btrfs: don't lookup csums for prealloc extents
  Btrfs: cache extent state when writing out dirty metadata pages
  Btrfs: do not hold the file extent leaf locked when adding extent item
  ...
-rw-r--r-- fs/btrfs/backref.c 299
-rw-r--r-- fs/btrfs/backref.h 10
-rw-r--r-- fs/btrfs/btrfs_inode.h 15
-rw-r--r-- fs/btrfs/check-integrity.c 16
-rw-r--r-- fs/btrfs/compression.c 13
-rw-r--r-- fs/btrfs/ctree.c 148
-rw-r--r-- fs/btrfs/ctree.h 109
-rw-r--r-- fs/btrfs/delayed-inode.c 6
-rw-r--r-- fs/btrfs/disk-io.c 230
-rw-r--r-- fs/btrfs/disk-io.h 2
-rw-r--r-- fs/btrfs/extent-tree.c 376
-rw-r--r-- fs/btrfs/extent_io.c 128
-rw-r--r-- fs/btrfs/extent_io.h 23
-rw-r--r-- fs/btrfs/extent_map.c 55
-rw-r--r-- fs/btrfs/extent_map.h 8
-rw-r--r-- fs/btrfs/file-item.c 5
-rw-r--r-- fs/btrfs/file.c 447
-rw-r--r-- fs/btrfs/free-space-cache.c 10
-rw-r--r-- fs/btrfs/hash.h 10
-rw-r--r-- fs/btrfs/inode-item.c 285
-rw-r--r-- fs/btrfs/inode.c 386
-rw-r--r-- fs/btrfs/ioctl.c 100
-rw-r--r-- fs/btrfs/ordered-data.c 97
-rw-r--r-- fs/btrfs/ordered-data.h 12
-rw-r--r-- fs/btrfs/qgroup.c 40
-rw-r--r-- fs/btrfs/relocation.c 11
-rw-r--r-- fs/btrfs/root-tree.c 29
-rw-r--r-- fs/btrfs/scrub.c 30
-rw-r--r-- fs/btrfs/send.c 915
-rw-r--r-- fs/btrfs/send.h 1
-rw-r--r-- fs/btrfs/super.c 74
-rw-r--r-- fs/btrfs/transaction.c 283
-rw-r--r-- fs/btrfs/transaction.h 20
-rw-r--r-- fs/btrfs/tree-log.c 889
-rw-r--r-- fs/btrfs/ulist.c 7
-rw-r--r-- fs/btrfs/ulist.h 9
-rw-r--r-- fs/btrfs/volumes.c 73
-rw-r--r-- fs/btrfs/zlib.c 8
-rw-r--r-- include/trace/events/btrfs.h 14
39 files changed, 3574 insertions, 1619 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d6..f3187938e081 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/vmalloc.h>
19#include "ctree.h" 20#include "ctree.h"
20#include "disk-io.h" 21#include "disk-io.h"
21#include "backref.h" 22#include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
231 } 232 }
232 if (!ret) { 233 if (!ret) {
233 ret = ulist_add(parents, eb->start, 234 ret = ulist_add(parents, eb->start,
234 (unsigned long)eie, GFP_NOFS); 235 (uintptr_t)eie, GFP_NOFS);
235 if (ret < 0) 236 if (ret < 0)
236 break; 237 break;
237 if (!extent_item_pos) { 238 if (!extent_item_pos) {
@@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
363 ULIST_ITER_INIT(&uiter); 364 ULIST_ITER_INIT(&uiter);
364 node = ulist_next(parents, &uiter); 365 node = ulist_next(parents, &uiter);
365 ref->parent = node ? node->val : 0; 366 ref->parent = node ? node->val : 0;
366 ref->inode_list = 367 ref->inode_list = node ?
367 node ? (struct extent_inode_elem *)node->aux : 0; 368 (struct extent_inode_elem *)(uintptr_t)node->aux : 0;
368 369
369 /* additional parents require new refs being added here */ 370 /* additional parents require new refs being added here */
370 while ((node = ulist_next(parents, &uiter))) { 371 while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
375 } 376 }
376 memcpy(new_ref, ref, sizeof(*ref)); 377 memcpy(new_ref, ref, sizeof(*ref));
377 new_ref->parent = node->val; 378 new_ref->parent = node->val;
378 new_ref->inode_list = 379 new_ref->inode_list = (struct extent_inode_elem *)
379 (struct extent_inode_elem *)node->aux; 380 (uintptr_t)node->aux;
380 list_add(&new_ref->list, &ref->list); 381 list_add(&new_ref->list, &ref->list);
381 } 382 }
382 ulist_reinit(parents); 383 ulist_reinit(parents);
@@ -914,8 +915,8 @@ again:
914 free_extent_buffer(eb); 915 free_extent_buffer(eb);
915 } 916 }
916 ret = ulist_add_merge(refs, ref->parent, 917 ret = ulist_add_merge(refs, ref->parent,
917 (unsigned long)ref->inode_list, 918 (uintptr_t)ref->inode_list,
918 (unsigned long *)&eie, GFP_NOFS); 919 (u64 *)&eie, GFP_NOFS);
919 if (!ret && extent_item_pos) { 920 if (!ret && extent_item_pos) {
920 /* 921 /*
921 * we've recorded that parent, so we must extend 922 * we've recorded that parent, so we must extend
@@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)
959 while ((node = ulist_next(blocks, &uiter))) { 960 while ((node = ulist_next(blocks, &uiter))) {
960 if (!node->aux) 961 if (!node->aux)
961 continue; 962 continue;
962 eie = (struct extent_inode_elem *)node->aux; 963 eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
963 for (; eie; eie = eie_next) { 964 for (; eie; eie = eie_next) {
964 eie_next = eie->next; 965 eie_next = eie->next;
965 kfree(eie); 966 kfree(eie);
@@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1108 found_key); 1109 found_key);
1109} 1110}
1110 1111
1111/* 1112int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
1112 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements 1113 u64 start_off, struct btrfs_path *path,
1113 * of the path are separated by '/' and the path is guaranteed to be 1114 struct btrfs_inode_extref **ret_extref,
1114 * 0-terminated. the path is only given within the current file system. 1115 u64 *found_off)
1115 * Therefore, it never starts with a '/'. the caller is responsible to provide 1116{
1116 * "size" bytes in "dest". the dest buffer will be filled backwards. finally, 1117 int ret, slot;
1117 * the start point of the resulting string is returned. this pointer is within 1118 struct btrfs_key key;
1118 * dest, normally. 1119 struct btrfs_key found_key;
1119 * in case the path buffer would overflow, the pointer is decremented further 1120 struct btrfs_inode_extref *extref;
1120 * as if output was written to the buffer, though no more output is actually 1121 struct extent_buffer *leaf;
1121 * generated. that way, the caller can determine how much space would be 1122 unsigned long ptr;
1122 * required for the path to fit into the buffer. in that case, the returned 1123
1123 * value will be smaller than dest. callers must check this! 1124 key.objectid = inode_objectid;
1124 */ 1125 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
1125char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, 1126 key.offset = start_off;
1126 struct btrfs_inode_ref *iref, 1127
1128 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1129 if (ret < 0)
1130 return ret;
1131
1132 while (1) {
1133 leaf = path->nodes[0];
1134 slot = path->slots[0];
1135 if (slot >= btrfs_header_nritems(leaf)) {
1136 /*
1137 * If the item at offset is not found,
1138 * btrfs_search_slot will point us to the slot
1139 * where it should be inserted. In our case
1140 * that will be the slot directly before the
1141 * next INODE_REF_KEY_V2 item. In the case
1142 * that we're pointing to the last slot in a
1143 * leaf, we must move one leaf over.
1144 */
1145 ret = btrfs_next_leaf(root, path);
1146 if (ret) {
1147 if (ret >= 1)
1148 ret = -ENOENT;
1149 break;
1150 }
1151 continue;
1152 }
1153
1154 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1155
1156 /*
1157 * Check that we're still looking at an extended ref key for
1158 * this particular objectid. If we have different
1159 * objectid or type then there are no more to be found
1160 * in the tree and we can exit.
1161 */
1162 ret = -ENOENT;
1163 if (found_key.objectid != inode_objectid)
1164 break;
1165 if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
1166 break;
1167
1168 ret = 0;
1169 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1170 extref = (struct btrfs_inode_extref *)ptr;
1171 *ret_extref = extref;
1172 if (found_off)
1173 *found_off = found_key.offset;
1174 break;
1175 }
1176
1177 return ret;
1178}
1179
1180static char *ref_to_path(struct btrfs_root *fs_root,
1181 struct btrfs_path *path,
1182 u32 name_len, unsigned long name_off,
1127 struct extent_buffer *eb_in, u64 parent, 1183 struct extent_buffer *eb_in, u64 parent,
1128 char *dest, u32 size) 1184 char *dest, u32 size)
1129{ 1185{
1130 u32 len;
1131 int slot; 1186 int slot;
1132 u64 next_inum; 1187 u64 next_inum;
1133 int ret; 1188 int ret;
@@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1135 struct extent_buffer *eb = eb_in; 1190 struct extent_buffer *eb = eb_in;
1136 struct btrfs_key found_key; 1191 struct btrfs_key found_key;
1137 int leave_spinning = path->leave_spinning; 1192 int leave_spinning = path->leave_spinning;
1193 struct btrfs_inode_ref *iref;
1138 1194
1139 if (bytes_left >= 0) 1195 if (bytes_left >= 0)
1140 dest[bytes_left] = '\0'; 1196 dest[bytes_left] = '\0';
1141 1197
1142 path->leave_spinning = 1; 1198 path->leave_spinning = 1;
1143 while (1) { 1199 while (1) {
1144 len = btrfs_inode_ref_name_len(eb, iref); 1200 bytes_left -= name_len;
1145 bytes_left -= len;
1146 if (bytes_left >= 0) 1201 if (bytes_left >= 0)
1147 read_extent_buffer(eb, dest + bytes_left, 1202 read_extent_buffer(eb, dest + bytes_left,
1148 (unsigned long)(iref + 1), len); 1203 name_off, name_len);
1149 if (eb != eb_in) { 1204 if (eb != eb_in) {
1150 btrfs_tree_read_unlock_blocking(eb); 1205 btrfs_tree_read_unlock_blocking(eb);
1151 free_extent_buffer(eb); 1206 free_extent_buffer(eb);
@@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1155 ret = -ENOENT; 1210 ret = -ENOENT;
1156 if (ret) 1211 if (ret)
1157 break; 1212 break;
1213
1158 next_inum = found_key.offset; 1214 next_inum = found_key.offset;
1159 1215
1160 /* regular exit ahead */ 1216 /* regular exit ahead */
@@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1170 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1226 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1171 } 1227 }
1172 btrfs_release_path(path); 1228 btrfs_release_path(path);
1173
1174 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); 1229 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
1230
1231 name_len = btrfs_inode_ref_name_len(eb, iref);
1232 name_off = (unsigned long)(iref + 1);
1233
1175 parent = next_inum; 1234 parent = next_inum;
1176 --bytes_left; 1235 --bytes_left;
1177 if (bytes_left >= 0) 1236 if (bytes_left >= 0)
@@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1188} 1247}
1189 1248
1190/* 1249/*
1250 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
1251 * of the path are separated by '/' and the path is guaranteed to be
1252 * 0-terminated. the path is only given within the current file system.
1253 * Therefore, it never starts with a '/'. the caller is responsible to provide
1254 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
1255 * the start point of the resulting string is returned. this pointer is within
1256 * dest, normally.
1257 * in case the path buffer would overflow, the pointer is decremented further
1258 * as if output was written to the buffer, though no more output is actually
1259 * generated. that way, the caller can determine how much space would be
1260 * required for the path to fit into the buffer. in that case, the returned
1261 * value will be smaller than dest. callers must check this!
1262 */
1263char *btrfs_iref_to_path(struct btrfs_root *fs_root,
1264 struct btrfs_path *path,
1265 struct btrfs_inode_ref *iref,
1266 struct extent_buffer *eb_in, u64 parent,
1267 char *dest, u32 size)
1268{
1269 return ref_to_path(fs_root, path,
1270 btrfs_inode_ref_name_len(eb_in, iref),
1271 (unsigned long)(iref + 1),
1272 eb_in, parent, dest, size);
1273}
1274
1275/*
1191 * this makes the path point to (logical EXTENT_ITEM *) 1276 * this makes the path point to (logical EXTENT_ITEM *)
1192 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for 1277 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
1193 * tree blocks and <0 on error. 1278 * tree blocks and <0 on error.
1194 */ 1279 */
1195int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, 1280int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1196 struct btrfs_path *path, struct btrfs_key *found_key) 1281 struct btrfs_path *path, struct btrfs_key *found_key,
1282 u64 *flags_ret)
1197{ 1283{
1198 int ret; 1284 int ret;
1199 u64 flags; 1285 u64 flags;
@@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1237 (unsigned long long)found_key->objectid, 1323 (unsigned long long)found_key->objectid,
1238 (unsigned long long)found_key->offset, 1324 (unsigned long long)found_key->offset,
1239 (unsigned long long)flags, item_size); 1325 (unsigned long long)flags, item_size);
1240 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1326
1241 return BTRFS_EXTENT_FLAG_TREE_BLOCK; 1327 WARN_ON(!flags_ret);
1242 if (flags & BTRFS_EXTENT_FLAG_DATA) 1328 if (flags_ret) {
1243 return BTRFS_EXTENT_FLAG_DATA; 1329 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1330 *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
1331 else if (flags & BTRFS_EXTENT_FLAG_DATA)
1332 *flags_ret = BTRFS_EXTENT_FLAG_DATA;
1333 else
1334 BUG_ON(1);
1335 return 0;
1336 }
1244 1337
1245 return -EIO; 1338 return -EIO;
1246} 1339}
@@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1404 ULIST_ITER_INIT(&root_uiter); 1497 ULIST_ITER_INIT(&root_uiter);
1405 while (!ret && (root_node = ulist_next(roots, &root_uiter))) { 1498 while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
1406 pr_debug("root %llu references leaf %llu, data list " 1499 pr_debug("root %llu references leaf %llu, data list "
1407 "%#lx\n", root_node->val, ref_node->val, 1500 "%#llx\n", root_node->val, ref_node->val,
1408 ref_node->aux); 1501 (long long)ref_node->aux);
1409 ret = iterate_leaf_refs( 1502 ret = iterate_leaf_refs((struct extent_inode_elem *)
1410 (struct extent_inode_elem *)ref_node->aux, 1503 (uintptr_t)ref_node->aux,
1411 root_node->val, extent_item_objectid, 1504 root_node->val,
1412 iterate, ctx); 1505 extent_item_objectid,
1506 iterate, ctx);
1413 } 1507 }
1414 ulist_free(roots); 1508 ulist_free(roots);
1415 roots = NULL; 1509 roots = NULL;
@@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
1432{ 1526{
1433 int ret; 1527 int ret;
1434 u64 extent_item_pos; 1528 u64 extent_item_pos;
1529 u64 flags = 0;
1435 struct btrfs_key found_key; 1530 struct btrfs_key found_key;
1436 int search_commit_root = path->search_commit_root; 1531 int search_commit_root = path->search_commit_root;
1437 1532
1438 ret = extent_from_logical(fs_info, logical, path, 1533 ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
1439 &found_key);
1440 btrfs_release_path(path); 1534 btrfs_release_path(path);
1441 if (ret < 0) 1535 if (ret < 0)
1442 return ret; 1536 return ret;
1443 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1537 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1444 return -EINVAL; 1538 return -EINVAL;
1445 1539
1446 extent_item_pos = logical - found_key.objectid; 1540 extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
1451 return ret; 1545 return ret;
1452} 1546}
1453 1547
1454static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, 1548typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
1455 struct btrfs_path *path, 1549 struct extent_buffer *eb, void *ctx);
1456 iterate_irefs_t *iterate, void *ctx) 1550
1551static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1552 struct btrfs_path *path,
1553 iterate_irefs_t *iterate, void *ctx)
1457{ 1554{
1458 int ret = 0; 1555 int ret = 0;
1459 int slot; 1556 int slot;
@@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1470 while (!ret) { 1567 while (!ret) {
1471 path->leave_spinning = 1; 1568 path->leave_spinning = 1;
1472 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, 1569 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
1473 &found_key); 1570 &found_key);
1474 if (ret < 0) 1571 if (ret < 0)
1475 break; 1572 break;
1476 if (ret) { 1573 if (ret) {
@@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1498 "tree %llu\n", cur, 1595 "tree %llu\n", cur,
1499 (unsigned long long)found_key.objectid, 1596 (unsigned long long)found_key.objectid,
1500 (unsigned long long)fs_root->objectid); 1597 (unsigned long long)fs_root->objectid);
1501 ret = iterate(parent, iref, eb, ctx); 1598 ret = iterate(parent, name_len,
1599 (unsigned long)(iref + 1), eb, ctx);
1502 if (ret) 1600 if (ret)
1503 break; 1601 break;
1504 len = sizeof(*iref) + name_len; 1602 len = sizeof(*iref) + name_len;
@@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1513 return ret; 1611 return ret;
1514} 1612}
1515 1613
1614static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
1615 struct btrfs_path *path,
1616 iterate_irefs_t *iterate, void *ctx)
1617{
1618 int ret;
1619 int slot;
1620 u64 offset = 0;
1621 u64 parent;
1622 int found = 0;
1623 struct extent_buffer *eb;
1624 struct btrfs_inode_extref *extref;
1625 struct extent_buffer *leaf;
1626 u32 item_size;
1627 u32 cur_offset;
1628 unsigned long ptr;
1629
1630 while (1) {
1631 ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
1632 &offset);
1633 if (ret < 0)
1634 break;
1635 if (ret) {
1636 ret = found ? 0 : -ENOENT;
1637 break;
1638 }
1639 ++found;
1640
1641 slot = path->slots[0];
1642 eb = path->nodes[0];
1643 /* make sure we can use eb after releasing the path */
1644 atomic_inc(&eb->refs);
1645
1646 btrfs_tree_read_lock(eb);
1647 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1648 btrfs_release_path(path);
1649
1650 leaf = path->nodes[0];
1651 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1652 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1653 cur_offset = 0;
1654
1655 while (cur_offset < item_size) {
1656 u32 name_len;
1657
1658 extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
1659 parent = btrfs_inode_extref_parent(eb, extref);
1660 name_len = btrfs_inode_extref_name_len(eb, extref);
1661 ret = iterate(parent, name_len,
1662 (unsigned long)&extref->name, eb, ctx);
1663 if (ret)
1664 break;
1665
1666 cur_offset += btrfs_inode_extref_name_len(leaf, extref);
1667 cur_offset += sizeof(*extref);
1668 }
1669 btrfs_tree_read_unlock_blocking(eb);
1670 free_extent_buffer(eb);
1671
1672 offset++;
1673 }
1674
1675 btrfs_release_path(path);
1676
1677 return ret;
1678}
1679
1680static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
1681 struct btrfs_path *path, iterate_irefs_t *iterate,
1682 void *ctx)
1683{
1684 int ret;
1685 int found_refs = 0;
1686
1687 ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
1688 if (!ret)
1689 ++found_refs;
1690 else if (ret != -ENOENT)
1691 return ret;
1692
1693 ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
1694 if (ret == -ENOENT && found_refs)
1695 return 0;
1696
1697 return ret;
1698}
1699
1516/* 1700/*
1517 * returns 0 if the path could be dumped (probably truncated) 1701 * returns 0 if the path could be dumped (probably truncated)
1518 * returns <0 in case of an error 1702 * returns <0 in case of an error
1519 */ 1703 */
1520static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, 1704static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
1521 struct extent_buffer *eb, void *ctx) 1705 struct extent_buffer *eb, void *ctx)
1522{ 1706{
1523 struct inode_fs_paths *ipath = ctx; 1707 struct inode_fs_paths *ipath = ctx;
1524 char *fspath; 1708 char *fspath;
@@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
1531 ipath->fspath->bytes_left - s_ptr : 0; 1715 ipath->fspath->bytes_left - s_ptr : 0;
1532 1716
1533 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; 1717 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
1534 fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, 1718 fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
1535 inum, fspath_min, bytes_left); 1719 name_off, eb, inum, fspath_min,
1720 bytes_left);
1536 if (IS_ERR(fspath)) 1721 if (IS_ERR(fspath))
1537 return PTR_ERR(fspath); 1722 return PTR_ERR(fspath);
1538 1723
1539 if (fspath > fspath_min) { 1724 if (fspath > fspath_min) {
1540 pr_debug("path resolved: %s\n", fspath);
1541 ipath->fspath->val[i] = (u64)(unsigned long)fspath; 1725 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
1542 ++ipath->fspath->elem_cnt; 1726 ++ipath->fspath->elem_cnt;
1543 ipath->fspath->bytes_left = fspath - fspath_min; 1727 ipath->fspath->bytes_left = fspath - fspath_min;
1544 } else { 1728 } else {
1545 pr_debug("missed path, not enough space. missing bytes: %lu, "
1546 "constructed so far: %s\n",
1547 (unsigned long)(fspath_min - fspath), fspath_min);
1548 ++ipath->fspath->elem_missed; 1729 ++ipath->fspath->elem_missed;
1549 ipath->fspath->bytes_missing += fspath_min - fspath; 1730 ipath->fspath->bytes_missing += fspath_min - fspath;
1550 ipath->fspath->bytes_left = 0; 1731 ipath->fspath->bytes_left = 0;
@@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
1566int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) 1747int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
1567{ 1748{
1568 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, 1749 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
1569 inode_to_path, ipath); 1750 inode_to_path, ipath);
1570} 1751}
1571 1752
1572struct btrfs_data_container *init_data_container(u32 total_bytes) 1753struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
1575 size_t alloc_bytes; 1756 size_t alloc_bytes;
1576 1757
1577 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); 1758 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
1578 data = kmalloc(alloc_bytes, GFP_NOFS); 1759 data = vmalloc(alloc_bytes);
1579 if (!data) 1760 if (!data)
1580 return ERR_PTR(-ENOMEM); 1761 return ERR_PTR(-ENOMEM);
1581 1762
@@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)
1626{ 1807{
1627 if (!ipath) 1808 if (!ipath)
1628 return; 1809 return;
1629 kfree(ipath->fspath); 1810 vfree(ipath->fspath);
1630 kfree(ipath); 1811 kfree(ipath);
1631} 1812}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab8..e75533043a5f 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,14 +33,13 @@ struct inode_fs_paths {
33 33
34typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, 34typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
35 void *ctx); 35 void *ctx);
36typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
37 struct extent_buffer *eb, void *ctx);
38 36
39int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, 37int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
40 struct btrfs_path *path); 38 struct btrfs_path *path);
41 39
42int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, 40int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
43 struct btrfs_path *path, struct btrfs_key *found_key); 41 struct btrfs_path *path, struct btrfs_key *found_key,
42 u64 *flags);
44 43
45int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, 44int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
46 struct btrfs_extent_item *ei, u32 item_size, 45 struct btrfs_extent_item *ei, u32 item_size,
@@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
69 struct btrfs_path *path); 68 struct btrfs_path *path);
70void free_ipath(struct inode_fs_paths *ipath); 69void free_ipath(struct inode_fs_paths *ipath);
71 70
71int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
72 u64 start_off, struct btrfs_path *path,
73 struct btrfs_inode_extref **ret_extref,
74 u64 *found_off);
75
72#endif 76#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe7..ed8ca7ca5eff 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
38#define BTRFS_INODE_DELALLOC_META_RESERVED 4 38#define BTRFS_INODE_DELALLOC_META_RESERVED 4
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
41 42
42/* in memory btrfs inode */ 43/* in memory btrfs inode */
43struct btrfs_inode { 44struct btrfs_inode {
@@ -143,6 +144,9 @@ struct btrfs_inode {
143 /* flags field from the on disk inode */ 144 /* flags field from the on disk inode */
144 u32 flags; 145 u32 flags;
145 146
147 /* a local copy of root's last_log_commit */
148 unsigned long last_log_commit;
149
146 /* 150 /*
147 * Counters to keep track of the number of extent item's we may use due 151 * Counters to keep track of the number of extent item's we may use due
148 * to delalloc and such. outstanding_extents is the number of extent 152 * to delalloc and such. outstanding_extents is the number of extent
@@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
202 206
203static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) 207static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
204{ 208{
205 struct btrfs_root *root = BTRFS_I(inode)->root;
206 int ret = 0;
207
208 mutex_lock(&root->log_mutex);
209 if (BTRFS_I(inode)->logged_trans == generation && 209 if (BTRFS_I(inode)->logged_trans == generation &&
210 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 210 BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
211 ret = 1; 211 return 1;
212 mutex_unlock(&root->log_mutex); 212 return 0;
213 return ret;
214} 213}
215 214
216#endif 215#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e33407..5a3e45db642a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
37 * the file system was mounted, (i.e., they have been 37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been 38 * referenced by the super block) or they have been
39 * written since then and the write completion callback 39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where 40 * was called and no write error was indicated and a
41 * these blocks are located was received and completed. 41 * FLUSH request to the device where these blocks are
42 * located was received and completed.
42 * 2b. All referenced blocks need to have a generation 43 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number. 44 * number which is equal to the parent's number.
44 * 45 *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2601 (unsigned long long)l->block_ref_to->dev_bytenr, 2602 (unsigned long long)l->block_ref_to->dev_bytenr,
2602 l->block_ref_to->mirror_num); 2603 l->block_ref_to->mirror_num);
2603 ret = -1; 2604 ret = -1;
2605 } else if (l->block_ref_to->iodone_w_error) {
2606 printk(KERN_INFO "btrfs: attempt to write superblock"
2607 " which references block %c @%llu (%s/%llu/%d)"
2608 " which has write error!\n",
2609 btrfsic_get_block_type(state, l->block_ref_to),
2610 (unsigned long long)
2611 l->block_ref_to->logical_bytenr,
2612 l->block_ref_to->dev_state->name,
2613 (unsigned long long)l->block_ref_to->dev_bytenr,
2614 l->block_ref_to->mirror_num);
2615 ret = -1;
2604 } else if (l->parent_generation != 2616 } else if (l->parent_generation !=
2605 l->block_ref_to->generation && 2617 l->block_ref_to->generation &&
2606 BTRFSIC_GENERATION_UNKNOWN != 2618 BTRFSIC_GENERATION_UNKNOWN !=
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a030..c6467aa88bee 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 u64 em_start; 577 u64 em_start;
578 struct extent_map *em; 578 struct extent_map *em;
579 int ret = -ENOMEM; 579 int ret = -ENOMEM;
580 int faili = 0;
580 u32 *sums; 581 u32 *sums;
581 582
582 tree = &BTRFS_I(inode)->io_tree; 583 tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
626 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 627 for (pg_index = 0; pg_index < nr_pages; pg_index++) {
627 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | 628 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
628 __GFP_HIGHMEM); 629 __GFP_HIGHMEM);
629 if (!cb->compressed_pages[pg_index]) 630 if (!cb->compressed_pages[pg_index]) {
631 faili = pg_index - 1;
632 ret = -ENOMEM;
630 goto fail2; 633 goto fail2;
634 }
631 } 635 }
636 faili = nr_pages - 1;
632 cb->nr_pages = nr_pages; 637 cb->nr_pages = nr_pages;
633 638
634 add_ra_bio_pages(inode, em_start + em_len, cb); 639 add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
713 return 0; 718 return 0;
714 719
715fail2: 720fail2:
716 for (pg_index = 0; pg_index < nr_pages; pg_index++) 721 while (faili >= 0) {
717 free_page((unsigned long)cb->compressed_pages[pg_index]); 722 __free_page(cb->compressed_pages[faili]);
723 faili--;
724 }
718 725
719 kfree(cb->compressed_pages); 726 kfree(cb->compressed_pages);
720fail1: 727fail1:
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63a..b33436211000 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
4402} 4402}
4403 4403
4404/* 4404/*
4405 * Given a key and some data, insert items into the tree.
4406 * This does all the path init required, making room in the tree if needed.
4407 * Returns the number of keys that were inserted.
4408 */
4409int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4410 struct btrfs_root *root,
4411 struct btrfs_path *path,
4412 struct btrfs_key *cpu_key, u32 *data_size,
4413 int nr)
4414{
4415 struct extent_buffer *leaf;
4416 struct btrfs_item *item;
4417 int ret = 0;
4418 int slot;
4419 int i;
4420 u32 nritems;
4421 u32 total_data = 0;
4422 u32 total_size = 0;
4423 unsigned int data_end;
4424 struct btrfs_disk_key disk_key;
4425 struct btrfs_key found_key;
4426 struct btrfs_map_token token;
4427
4428 btrfs_init_map_token(&token);
4429
4430 for (i = 0; i < nr; i++) {
4431 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4432 BTRFS_LEAF_DATA_SIZE(root)) {
4433 break;
4434 nr = i;
4435 }
4436 total_data += data_size[i];
4437 total_size += data_size[i] + sizeof(struct btrfs_item);
4438 }
4439 BUG_ON(nr == 0);
4440
4441 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4442 if (ret == 0)
4443 return -EEXIST;
4444 if (ret < 0)
4445 goto out;
4446
4447 leaf = path->nodes[0];
4448
4449 nritems = btrfs_header_nritems(leaf);
4450 data_end = leaf_data_end(root, leaf);
4451
4452 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4453 for (i = nr; i >= 0; i--) {
4454 total_data -= data_size[i];
4455 total_size -= data_size[i] + sizeof(struct btrfs_item);
4456 if (total_size < btrfs_leaf_free_space(root, leaf))
4457 break;
4458 }
4459 nr = i;
4460 }
4461
4462 slot = path->slots[0];
4463 BUG_ON(slot < 0);
4464
4465 if (slot != nritems) {
4466 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4467
4468 item = btrfs_item_nr(leaf, slot);
4469 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4470
4471 /* figure out how many keys we can insert in here */
4472 total_data = data_size[0];
4473 for (i = 1; i < nr; i++) {
4474 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4475 break;
4476 total_data += data_size[i];
4477 }
4478 nr = i;
4479
4480 if (old_data < data_end) {
4481 btrfs_print_leaf(root, leaf);
4482 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4483 slot, old_data, data_end);
4484 BUG_ON(1);
4485 }
4486 /*
4487 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4488 */
4489 /* first correct the data pointers */
4490 for (i = slot; i < nritems; i++) {
4491 u32 ioff;
4492
4493 item = btrfs_item_nr(leaf, i);
4494 ioff = btrfs_token_item_offset(leaf, item, &token);
4495 btrfs_set_token_item_offset(leaf, item,
4496 ioff - total_data, &token);
4497 }
4498 /* shift the items */
4499 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4500 btrfs_item_nr_offset(slot),
4501 (nritems - slot) * sizeof(struct btrfs_item));
4502
4503 /* shift the data */
4504 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4505 data_end - total_data, btrfs_leaf_data(leaf) +
4506 data_end, old_data - data_end);
4507 data_end = old_data;
4508 } else {
4509 /*
4510 * this sucks but it has to be done, if we are inserting at
4511 * the end of the leaf only insert 1 of the items, since we
4512 * have no way of knowing whats on the next leaf and we'd have
4513 * to drop our current locks to figure it out
4514 */
4515 nr = 1;
4516 }
4517
4518 /* setup the item for the new data */
4519 for (i = 0; i < nr; i++) {
4520 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4521 btrfs_set_item_key(leaf, &disk_key, slot + i);
4522 item = btrfs_item_nr(leaf, slot + i);
4523 btrfs_set_token_item_offset(leaf, item,
4524 data_end - data_size[i], &token);
4525 data_end -= data_size[i];
4526 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4527 }
4528 btrfs_set_header_nritems(leaf, nritems + nr);
4529 btrfs_mark_buffer_dirty(leaf);
4530
4531 ret = 0;
4532 if (slot == 0) {
4533 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4534 fixup_low_keys(trans, root, path, &disk_key, 1);
4535 }
4536
4537 if (btrfs_leaf_free_space(root, leaf) < 0) {
4538 btrfs_print_leaf(root, leaf);
4539 BUG();
4540 }
4541out:
4542 if (!ret)
4543 ret = nr;
4544 return ret;
4545}
4546
4547/*
4548 * this is a helper for btrfs_insert_empty_items, the main goal here is 4405 * this is a helper for btrfs_insert_empty_items, the main goal here is
4549 * to save stack depth by doing the bulk of the work in a function 4406 * to save stack depth by doing the bulk of the work in a function
4550 * that doesn't call btrfs_search_slot 4407 * that doesn't call btrfs_search_slot
@@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,
5073 struct btrfs_path *path, 4930 struct btrfs_path *path,
5074 int *level, int root_level) 4931 int *level, int root_level)
5075{ 4932{
4933 BUG_ON(*level == 0);
5076 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4934 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5077 path->slots[*level]); 4935 path->slots[*level]);
5078 path->slots[*level - 1] = 0; 4936 path->slots[*level - 1] = 0;
@@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
5089 4947
5090 path->slots[*level]++; 4948 path->slots[*level]++;
5091 4949
5092 while (path->slots[*level] == nritems) { 4950 while (path->slots[*level] >= nritems) {
5093 if (*level == root_level) 4951 if (*level == root_level)
5094 return -1; 4952 return -1;
5095 4953
@@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5433 goto out; 5291 goto out;
5434 advance_right = ADVANCE; 5292 advance_right = ADVANCE;
5435 } else { 5293 } else {
5294 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5436 ret = tree_compare_item(left_root, left_path, 5295 ret = tree_compare_item(left_root, left_path,
5437 right_path, tmp_buf); 5296 right_path, tmp_buf);
5438 if (ret) { 5297 if (ret) {
5298 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5439 ret = changed_cb(left_root, right_root, 5299 ret = changed_cb(left_root, right_root,
5440 left_path, right_path, 5300 left_path, right_path,
5441 &left_key, 5301 &left_key,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9821b672f5a2..926c9ffc66d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
154 */ 154 */
155#define BTRFS_NAME_LEN 255 155#define BTRFS_NAME_LEN 255
156 156
157/*
158 * Theoretical limit is larger, but we keep this down to a sane
159 * value. That should limit greatly the possibility of collisions on
160 * inode ref items.
161 */
162#define BTRFS_LINK_MAX 65535U
163
157/* 32 bytes in various csum fields */ 164/* 32 bytes in various csum fields */
158#define BTRFS_CSUM_SIZE 32 165#define BTRFS_CSUM_SIZE 32
159 166
@@ -489,6 +496,8 @@ struct btrfs_super_block {
489 */ 496 */
490#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 497#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
491 498
499#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
500
492#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 501#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
493#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 502#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
494#define BTRFS_FEATURE_INCOMPAT_SUPP \ 503#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -496,7 +505,8 @@ struct btrfs_super_block {
496 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 505 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
497 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 506 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
498 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 507 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
499 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) 508 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
509 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
500 510
501/* 511/*
502 * A leaf is full of items. offset and size tell us where to find 512 * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
643 /* name goes here */ 653 /* name goes here */
644} __attribute__ ((__packed__)); 654} __attribute__ ((__packed__));
645 655
656struct btrfs_inode_extref {
657 __le64 parent_objectid;
658 __le64 index;
659 __le16 name_len;
660 __u8 name[0];
661 /* name goes here */
662} __attribute__ ((__packed__));
663
646struct btrfs_timespec { 664struct btrfs_timespec {
647 __le64 sec; 665 __le64 sec;
648 __le32 nsec; 666 __le32 nsec;
@@ -1028,12 +1046,22 @@ struct btrfs_space_info {
1028 wait_queue_head_t wait; 1046 wait_queue_head_t wait;
1029}; 1047};
1030 1048
1049#define BTRFS_BLOCK_RSV_GLOBAL 1
1050#define BTRFS_BLOCK_RSV_DELALLOC 2
1051#define BTRFS_BLOCK_RSV_TRANS 3
1052#define BTRFS_BLOCK_RSV_CHUNK 4
1053#define BTRFS_BLOCK_RSV_DELOPS 5
1054#define BTRFS_BLOCK_RSV_EMPTY 6
1055#define BTRFS_BLOCK_RSV_TEMP 7
1056
1031struct btrfs_block_rsv { 1057struct btrfs_block_rsv {
1032 u64 size; 1058 u64 size;
1033 u64 reserved; 1059 u64 reserved;
1034 struct btrfs_space_info *space_info; 1060 struct btrfs_space_info *space_info;
1035 spinlock_t lock; 1061 spinlock_t lock;
1036 unsigned int full; 1062 unsigned short full;
1063 unsigned short type;
1064 unsigned short failfast;
1037}; 1065};
1038 1066
1039/* 1067/*
@@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {
1127 * Today it will only have one thing on it, but that may change 1155 * Today it will only have one thing on it, but that may change
1128 */ 1156 */
1129 struct list_head cluster_list; 1157 struct list_head cluster_list;
1158
1159 /* For delayed block group creation */
1160 struct list_head new_bg_list;
1130}; 1161};
1131 1162
1132/* delayed seq elem */ 1163/* delayed seq elem */
@@ -1240,7 +1271,6 @@ struct btrfs_fs_info {
1240 struct mutex reloc_mutex; 1271 struct mutex reloc_mutex;
1241 1272
1242 struct list_head trans_list; 1273 struct list_head trans_list;
1243 struct list_head hashers;
1244 struct list_head dead_roots; 1274 struct list_head dead_roots;
1245 struct list_head caching_block_groups; 1275 struct list_head caching_block_groups;
1246 1276
@@ -1366,9 +1396,6 @@ struct btrfs_fs_info {
1366 struct rb_root defrag_inodes; 1396 struct rb_root defrag_inodes;
1367 atomic_t defrag_running; 1397 atomic_t defrag_running;
1368 1398
1369 spinlock_t ref_cache_lock;
1370 u64 total_ref_cache_size;
1371
1372 /* 1399 /*
1373 * these three are in extended format (availability of single 1400 * these three are in extended format (availability of single
1374 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other 1401 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1441,6 +1468,8 @@ struct btrfs_fs_info {
1441 1468
1442 /* next backup root to be overwritten */ 1469 /* next backup root to be overwritten */
1443 int backup_root_index; 1470 int backup_root_index;
1471
1472 int num_tolerated_disk_barrier_failures;
1444}; 1473};
1445 1474
1446/* 1475/*
@@ -1481,9 +1510,9 @@ struct btrfs_root {
1481 wait_queue_head_t log_commit_wait[2]; 1510 wait_queue_head_t log_commit_wait[2];
1482 atomic_t log_writers; 1511 atomic_t log_writers;
1483 atomic_t log_commit[2]; 1512 atomic_t log_commit[2];
1513 atomic_t log_batch;
1484 unsigned long log_transid; 1514 unsigned long log_transid;
1485 unsigned long last_log_commit; 1515 unsigned long last_log_commit;
1486 unsigned long log_batch;
1487 pid_t log_start_pid; 1516 pid_t log_start_pid;
1488 bool log_multiple_pids; 1517 bool log_multiple_pids;
1489 1518
@@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {
1592 */ 1621 */
1593#define BTRFS_INODE_ITEM_KEY 1 1622#define BTRFS_INODE_ITEM_KEY 1
1594#define BTRFS_INODE_REF_KEY 12 1623#define BTRFS_INODE_REF_KEY 12
1624#define BTRFS_INODE_EXTREF_KEY 13
1595#define BTRFS_XATTR_ITEM_KEY 24 1625#define BTRFS_XATTR_ITEM_KEY 24
1596#define BTRFS_ORPHAN_ITEM_KEY 48 1626#define BTRFS_ORPHAN_ITEM_KEY 48
1597/* reserve 2-15 close to the inode for later flexibility */ 1627/* reserve 2-15 close to the inode for later flexibility */
@@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1978BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); 2008BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1979BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); 2009BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1980 2010
2011/* struct btrfs_inode_extref */
2012BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
2013 parent_objectid, 64);
2014BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
2015 name_len, 16);
2016BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
2017
1981/* struct btrfs_inode_item */ 2018/* struct btrfs_inode_item */
1982BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); 2019BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1983BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); 2020BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2858 u64 size); 2895 u64 size);
2859int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2896int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2860 struct btrfs_root *root, u64 group_start); 2897 struct btrfs_root *root, u64 group_start);
2898void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2899 struct btrfs_root *root);
2861u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2862u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2863void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2874void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); 2913void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2875int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); 2914int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2876void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); 2915void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2877void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); 2916void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2878struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2917struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2918 unsigned short type);
2879void btrfs_free_block_rsv(struct btrfs_root *root, 2919void btrfs_free_block_rsv(struct btrfs_root *root,
2880 struct btrfs_block_rsv *rsv); 2920 struct btrfs_block_rsv *rsv);
2881int btrfs_block_rsv_add(struct btrfs_root *root, 2921int btrfs_block_rsv_add(struct btrfs_root *root,
@@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3172 struct btrfs_root *root, 3212 struct btrfs_root *root,
3173 const char *name, int name_len, 3213 const char *name, int name_len,
3174 u64 inode_objectid, u64 ref_objectid, u64 *index); 3214 u64 inode_objectid, u64 ref_objectid, u64 *index);
3175struct btrfs_inode_ref * 3215int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3176btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 3216 struct btrfs_root *root,
3177 struct btrfs_root *root, 3217 struct btrfs_path *path,
3178 struct btrfs_path *path, 3218 const char *name, int name_len,
3179 const char *name, int name_len, 3219 u64 inode_objectid, u64 ref_objectid, int mod,
3180 u64 inode_objectid, u64 ref_objectid, int mod); 3220 u64 *ret_index);
3181int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3221int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3182 struct btrfs_root *root, 3222 struct btrfs_root *root,
3183 struct btrfs_path *path, u64 objectid); 3223 struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
3185 *root, struct btrfs_path *path, 3225 *root, struct btrfs_path *path,
3186 struct btrfs_key *location, int mod); 3226 struct btrfs_key *location, int mod);
3187 3227
3228struct btrfs_inode_extref *
3229btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
3230 struct btrfs_root *root,
3231 struct btrfs_path *path,
3232 const char *name, int name_len,
3233 u64 inode_objectid, u64 ref_objectid, int ins_len,
3234 int cow);
3235
3236int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
3237 u64 ref_objectid, const char *name,
3238 int name_len,
3239 struct btrfs_inode_extref **extref_ret);
3240
3188/* file-item.c */ 3241/* file-item.c */
3189int btrfs_del_csums(struct btrfs_trans_handle *trans, 3242int btrfs_del_csums(struct btrfs_trans_handle *trans,
3190 struct btrfs_root *root, u64 bytenr, u64 len); 3243 struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3249 struct btrfs_root *root, 3302 struct btrfs_root *root,
3250 struct inode *dir, u64 objectid, 3303 struct inode *dir, u64 objectid,
3251 const char *name, int name_len); 3304 const char *name, int name_len);
3305int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3306 int front);
3252int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3307int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3253 struct btrfs_root *root, 3308 struct btrfs_root *root,
3254 struct inode *inode, u64 new_size, 3309 struct inode *inode, u64 new_size,
@@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
3308int btrfs_defrag_file(struct inode *inode, struct file *file, 3363int btrfs_defrag_file(struct inode *inode, struct file *file,
3309 struct btrfs_ioctl_defrag_range_args *range, 3364 struct btrfs_ioctl_defrag_range_args *range,
3310 u64 newer_than, unsigned long max_pages); 3365 u64 newer_than, unsigned long max_pages);
3366void btrfs_get_block_group_info(struct list_head *groups_list,
3367 struct btrfs_ioctl_space_info *space);
3368
3311/* file.c */ 3369/* file.c */
3312int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3370int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3313 struct inode *inode); 3371 struct inode *inode);
3314int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3372int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3315int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3373int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3316int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3374void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3317 int skip_pinned); 3375 int skip_pinned);
3376int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
3377 u64 start, u64 end, int skip_pinned,
3378 int modified);
3318extern const struct file_operations btrfs_file_operations; 3379extern const struct file_operations btrfs_file_operations;
3319int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 3380int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3320 u64 start, u64 end, u64 *hint_byte, int drop_cache); 3381 struct btrfs_root *root, struct inode *inode,
3382 struct btrfs_path *path, u64 start, u64 end,
3383 u64 *drop_end, int drop_cache);
3384int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3385 struct btrfs_root *root, struct inode *inode, u64 start,
3386 u64 end, int drop_cache);
3321int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 3387int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
3322 struct inode *inode, u64 start, u64 end); 3388 struct inode *inode, u64 start, u64 end);
3323int btrfs_release_file(struct inode *inode, struct file *file); 3389int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3378 } 3444 }
3379} 3445}
3380 3446
3447/*
3448 * Call btrfs_abort_transaction as early as possible when an error condition is
3449 * detected, that way the exact line number is reported.
3450 */
3451
3381#define btrfs_abort_transaction(trans, root, errno) \ 3452#define btrfs_abort_transaction(trans, root, errno) \
3382do { \ 3453do { \
3383 __btrfs_abort_transaction(trans, root, __func__, \ 3454 __btrfs_abort_transaction(trans, root, __func__, \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 52c85e2b95d0..478f66bdc57b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
29 29
30int __init btrfs_delayed_inode_init(void) 30int __init btrfs_delayed_inode_init(void)
31{ 31{
32 delayed_node_cache = kmem_cache_create("delayed_node", 32 delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
33 sizeof(struct btrfs_delayed_node), 33 sizeof(struct btrfs_delayed_node),
34 0, 34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
650 * we're accounted for. 650 * we're accounted for.
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv != &root->fs_info->delalloc_block_rsv)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
655 /* 655 /*
656 * Since we're under a transaction reserve_metadata_bytes could 656 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
668 num_bytes, 1); 668 num_bytes, 1);
669 } 669 }
670 return ret; 670 return ret;
671 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 671 } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
672 spin_lock(&BTRFS_I(inode)->lock); 672 spin_lock(&BTRFS_I(inode)->lock);
673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
674 &BTRFS_I(inode)->runtime_flags)) { 674 &BTRFS_I(inode)->runtime_flags)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..7cda51995c1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48 48
49#ifdef CONFIG_X86
50#include <asm/cpufeature.h>
51#endif
52
49static struct extent_io_ops btree_extent_io_ops; 53static struct extent_io_ops btree_extent_io_ops;
50static void end_workqueue_fn(struct btrfs_work *work); 54static void end_workqueue_fn(struct btrfs_work *work);
51static void free_fs_root(struct btrfs_root *root); 55static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
217 write_lock(&em_tree->lock); 221 write_lock(&em_tree->lock);
218 ret = add_extent_mapping(em_tree, em); 222 ret = add_extent_mapping(em_tree, em);
219 if (ret == -EEXIST) { 223 if (ret == -EEXIST) {
220 u64 failed_start = em->start;
221 u64 failed_len = em->len;
222
223 free_extent_map(em); 224 free_extent_map(em);
224 em = lookup_extent_mapping(em_tree, start, len); 225 em = lookup_extent_mapping(em_tree, start, len);
225 if (em) { 226 if (!em)
226 ret = 0; 227 em = ERR_PTR(-EIO);
227 } else {
228 em = lookup_extent_mapping(em_tree, failed_start,
229 failed_len);
230 ret = -EIO;
231 }
232 } else if (ret) { 228 } else if (ret) {
233 free_extent_map(em); 229 free_extent_map(em);
234 em = NULL; 230 em = ERR_PTR(ret);
235 } 231 }
236 write_unlock(&em_tree->lock); 232 write_unlock(&em_tree->lock);
237 233
238 if (ret)
239 em = ERR_PTR(ret);
240out: 234out:
241 return em; 235 return em;
242} 236}
@@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
439 WARN_ON(1); 433 WARN_ON(1);
440 return 0; 434 return 0;
441 } 435 }
442 if (eb->pages[0] != page) {
443 WARN_ON(1);
444 return 0;
445 }
446 if (!PageUptodate(page)) { 436 if (!PageUptodate(page)) {
447 WARN_ON(1); 437 WARN_ON(1);
448 return 0; 438 return 0;
@@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
869 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
870} 860}
871 861
862static int check_async_write(struct inode *inode, unsigned long bio_flags)
863{
864 if (bio_flags & EXTENT_BIO_TREE_LOG)
865 return 0;
866#ifdef CONFIG_X86
867 if (cpu_has_xmm4_2)
868 return 0;
869#endif
870 return 1;
871}
872
872static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 873static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
873 int mirror_num, unsigned long bio_flags, 874 int mirror_num, unsigned long bio_flags,
874 u64 bio_offset) 875 u64 bio_offset)
875{ 876{
877 int async = check_async_write(inode, bio_flags);
876 int ret; 878 int ret;
877 879
878 if (!(rw & REQ_WRITE)) { 880 if (!(rw & REQ_WRITE)) {
@@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
887 return ret; 889 return ret;
888 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
889 mirror_num, 0); 891 mirror_num, 0);
892 } else if (!async) {
893 ret = btree_csum_one_bio(bio);
894 if (ret)
895 return ret;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0);
890 } 898 }
891 899
892 /* 900 /*
@@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1168 atomic_set(&root->log_commit[0], 0); 1176 atomic_set(&root->log_commit[0], 0);
1169 atomic_set(&root->log_commit[1], 0); 1177 atomic_set(&root->log_commit[1], 0);
1170 atomic_set(&root->log_writers, 0); 1178 atomic_set(&root->log_writers, 0);
1179 atomic_set(&root->log_batch, 0);
1171 atomic_set(&root->orphan_inodes, 0); 1180 atomic_set(&root->orphan_inodes, 0);
1172 root->log_batch = 0;
1173 root->log_transid = 0; 1181 root->log_transid = 0;
1174 root->last_log_commit = 0; 1182 root->last_log_commit = 0;
1175 extent_io_tree_init(&root->dirty_log_pages, 1183 extent_io_tree_init(&root->dirty_log_pages,
@@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)
1667 spin_unlock(&root->fs_info->trans_lock); 1675 spin_unlock(&root->fs_info->trans_lock);
1668 1676
1669 /* If the file system is aborted, this will always fail. */ 1677 /* If the file system is aborted, this will always fail. */
1670 trans = btrfs_join_transaction(root); 1678 trans = btrfs_attach_transaction(root);
1671 if (IS_ERR(trans)) { 1679 if (IS_ERR(trans)) {
1672 cannot_commit = true; 1680 if (PTR_ERR(trans) != -ENOENT)
1681 cannot_commit = true;
1673 goto sleep; 1682 goto sleep;
1674 } 1683 }
1675 if (transid == trans->transid) { 1684 if (transid == trans->transid) {
@@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,
1994 INIT_LIST_HEAD(&fs_info->trans_list); 2003 INIT_LIST_HEAD(&fs_info->trans_list);
1995 INIT_LIST_HEAD(&fs_info->dead_roots); 2004 INIT_LIST_HEAD(&fs_info->dead_roots);
1996 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2005 INIT_LIST_HEAD(&fs_info->delayed_iputs);
1997 INIT_LIST_HEAD(&fs_info->hashers);
1998 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2006 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1999 INIT_LIST_HEAD(&fs_info->ordered_operations); 2007 INIT_LIST_HEAD(&fs_info->ordered_operations);
2000 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2008 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2001 spin_lock_init(&fs_info->delalloc_lock); 2009 spin_lock_init(&fs_info->delalloc_lock);
2002 spin_lock_init(&fs_info->trans_lock); 2010 spin_lock_init(&fs_info->trans_lock);
2003 spin_lock_init(&fs_info->ref_cache_lock);
2004 spin_lock_init(&fs_info->fs_roots_radix_lock); 2011 spin_lock_init(&fs_info->fs_roots_radix_lock);
2005 spin_lock_init(&fs_info->delayed_iput_lock); 2012 spin_lock_init(&fs_info->delayed_iput_lock);
2006 spin_lock_init(&fs_info->defrag_inodes_lock); 2013 spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,
2014 INIT_LIST_HEAD(&fs_info->space_info); 2021 INIT_LIST_HEAD(&fs_info->space_info);
2015 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2022 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2016 btrfs_mapping_init(&fs_info->mapping_tree); 2023 btrfs_mapping_init(&fs_info->mapping_tree);
2017 btrfs_init_block_rsv(&fs_info->global_block_rsv); 2024 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2018 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 2025 BTRFS_BLOCK_RSV_GLOBAL);
2019 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 2026 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2020 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 2027 BTRFS_BLOCK_RSV_DELALLOC);
2021 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 2028 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2022 btrfs_init_block_rsv(&fs_info->delayed_block_rsv); 2029 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2030 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2031 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2032 BTRFS_BLOCK_RSV_DELOPS);
2023 atomic_set(&fs_info->nr_async_submits, 0); 2033 atomic_set(&fs_info->nr_async_submits, 0);
2024 atomic_set(&fs_info->async_delalloc_pages, 0); 2034 atomic_set(&fs_info->async_delalloc_pages, 0);
2025 atomic_set(&fs_info->async_submit_draining, 0); 2035 atomic_set(&fs_info->async_submit_draining, 0);
@@ -2491,6 +2501,8 @@ retry_root_backup:
2491 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2501 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2492 goto fail_block_groups; 2502 goto fail_block_groups;
2493 } 2503 }
2504 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2494 2506
2495 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2496 "btrfs-cleaner"); 2508 "btrfs-cleaner");
@@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2874 printk_in_rcu("btrfs: disabling barriers on dev %s\n", 2886 printk_in_rcu("btrfs: disabling barriers on dev %s\n",
2875 rcu_str_deref(device->name)); 2887 rcu_str_deref(device->name));
2876 device->nobarriers = 1; 2888 device->nobarriers = 1;
2877 } 2889 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
2878 if (!bio_flagged(bio, BIO_UPTODATE)) {
2879 ret = -EIO; 2890 ret = -EIO;
2880 if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2891 btrfs_dev_stat_inc_and_print(device,
2881 btrfs_dev_stat_inc_and_print(device, 2892 BTRFS_DEV_STAT_FLUSH_ERRS);
2882 BTRFS_DEV_STAT_FLUSH_ERRS);
2883 } 2893 }
2884 2894
2885 /* drop the reference from the wait == 0 run */ 2895 /* drop the reference from the wait == 0 run */
@@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2918{ 2928{
2919 struct list_head *head; 2929 struct list_head *head;
2920 struct btrfs_device *dev; 2930 struct btrfs_device *dev;
2921 int errors = 0; 2931 int errors_send = 0;
2932 int errors_wait = 0;
2922 int ret; 2933 int ret;
2923 2934
2924 /* send down all the barriers */ 2935 /* send down all the barriers */
2925 head = &info->fs_devices->devices; 2936 head = &info->fs_devices->devices;
2926 list_for_each_entry_rcu(dev, head, dev_list) { 2937 list_for_each_entry_rcu(dev, head, dev_list) {
2927 if (!dev->bdev) { 2938 if (!dev->bdev) {
2928 errors++; 2939 errors_send++;
2929 continue; 2940 continue;
2930 } 2941 }
2931 if (!dev->in_fs_metadata || !dev->writeable) 2942 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2933 2944
2934 ret = write_dev_flush(dev, 0); 2945 ret = write_dev_flush(dev, 0);
2935 if (ret) 2946 if (ret)
2936 errors++; 2947 errors_send++;
2937 } 2948 }
2938 2949
2939 /* wait for all the barriers */ 2950 /* wait for all the barriers */
2940 list_for_each_entry_rcu(dev, head, dev_list) { 2951 list_for_each_entry_rcu(dev, head, dev_list) {
2941 if (!dev->bdev) { 2952 if (!dev->bdev) {
2942 errors++; 2953 errors_wait++;
2943 continue; 2954 continue;
2944 } 2955 }
2945 if (!dev->in_fs_metadata || !dev->writeable) 2956 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2947 2958
2948 ret = write_dev_flush(dev, 1); 2959 ret = write_dev_flush(dev, 1);
2949 if (ret) 2960 if (ret)
2950 errors++; 2961 errors_wait++;
2951 } 2962 }
2952 if (errors) 2963 if (errors_send > info->num_tolerated_disk_barrier_failures ||
2964 errors_wait > info->num_tolerated_disk_barrier_failures)
2953 return -EIO; 2965 return -EIO;
2954 return 0; 2966 return 0;
2955} 2967}
2956 2968
2969int btrfs_calc_num_tolerated_disk_barrier_failures(
2970 struct btrfs_fs_info *fs_info)
2971{
2972 struct btrfs_ioctl_space_info space;
2973 struct btrfs_space_info *sinfo;
2974 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2975 BTRFS_BLOCK_GROUP_SYSTEM,
2976 BTRFS_BLOCK_GROUP_METADATA,
2977 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2978 int num_types = 4;
2979 int i;
2980 int c;
2981 int num_tolerated_disk_barrier_failures =
2982 (int)fs_info->fs_devices->num_devices;
2983
2984 for (i = 0; i < num_types; i++) {
2985 struct btrfs_space_info *tmp;
2986
2987 sinfo = NULL;
2988 rcu_read_lock();
2989 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
2990 if (tmp->flags == types[i]) {
2991 sinfo = tmp;
2992 break;
2993 }
2994 }
2995 rcu_read_unlock();
2996
2997 if (!sinfo)
2998 continue;
2999
3000 down_read(&sinfo->groups_sem);
3001 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3002 if (!list_empty(&sinfo->block_groups[c])) {
3003 u64 flags;
3004
3005 btrfs_get_block_group_info(
3006 &sinfo->block_groups[c], &space);
3007 if (space.total_bytes == 0 ||
3008 space.used_bytes == 0)
3009 continue;
3010 flags = space.flags;
3011 /*
3012 * return
3013 * 0: if dup, single or RAID0 is configured for
3014 * any of metadata, system or data, else
3015 * 1: if RAID5 is configured, or if RAID1 or
3016 * RAID10 is configured and only two mirrors
3017 * are used, else
3018 * 2: if RAID6 is configured, else
3019 * num_mirrors - 1: if RAID1 or RAID10 is
3020 * configured and more than
3021 * 2 mirrors are used.
3022 */
3023 if (num_tolerated_disk_barrier_failures > 0 &&
3024 ((flags & (BTRFS_BLOCK_GROUP_DUP |
3025 BTRFS_BLOCK_GROUP_RAID0)) ||
3026 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3027 == 0)))
3028 num_tolerated_disk_barrier_failures = 0;
3029 else if (num_tolerated_disk_barrier_failures > 1
3030 &&
3031 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3032 BTRFS_BLOCK_GROUP_RAID10)))
3033 num_tolerated_disk_barrier_failures = 1;
3034 }
3035 }
3036 up_read(&sinfo->groups_sem);
3037 }
3038
3039 return num_tolerated_disk_barrier_failures;
3040}
3041
2957int write_all_supers(struct btrfs_root *root, int max_mirrors) 3042int write_all_supers(struct btrfs_root *root, int max_mirrors)
2958{ 3043{
2959 struct list_head *head; 3044 struct list_head *head;
@@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2976 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3061 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2977 head = &root->fs_info->fs_devices->devices; 3062 head = &root->fs_info->fs_devices->devices;
2978 3063
2979 if (do_barriers) 3064 if (do_barriers) {
2980 barrier_all_devices(root->fs_info); 3065 ret = barrier_all_devices(root->fs_info);
3066 if (ret) {
3067 mutex_unlock(
3068 &root->fs_info->fs_devices->device_list_mutex);
3069 btrfs_error(root->fs_info, ret,
3070 "errors while submitting device barriers.");
3071 return ret;
3072 }
3073 }
2981 3074
2982 list_for_each_entry_rcu(dev, head, dev_list) { 3075 list_for_each_entry_rcu(dev, head, dev_list) {
2983 if (!dev->bdev) { 3076 if (!dev->bdev) {
@@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
3211 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3304 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3212 (unsigned long long)fs_info->delalloc_bytes); 3305 (unsigned long long)fs_info->delalloc_bytes);
3213 } 3306 }
3214 if (fs_info->total_ref_cache_size) {
3215 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
3216 (unsigned long long)fs_info->total_ref_cache_size);
3217 }
3218 3307
3219 free_extent_buffer(fs_info->extent_root->node); 3308 free_extent_buffer(fs_info->extent_root->node);
3220 free_extent_buffer(fs_info->extent_root->commit_root); 3309 free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3360 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3449 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3361} 3450}
3362 3451
3363int btree_lock_page_hook(struct page *page, void *data,
3364 void (*flush_fn)(void *))
3365{
3366 struct inode *inode = page->mapping->host;
3367 struct btrfs_root *root = BTRFS_I(inode)->root;
3368 struct extent_buffer *eb;
3369
3370 /*
3371 * We culled this eb but the page is still hanging out on the mapping,
3372 * carry on.
3373 */
3374 if (!PagePrivate(page))
3375 goto out;
3376
3377 eb = (struct extent_buffer *)page->private;
3378 if (!eb) {
3379 WARN_ON(1);
3380 goto out;
3381 }
3382 if (page != eb->pages[0])
3383 goto out;
3384
3385 if (!btrfs_try_tree_write_lock(eb)) {
3386 flush_fn(data);
3387 btrfs_tree_lock(eb);
3388 }
3389 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3390
3391 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3392 spin_lock(&root->fs_info->delalloc_lock);
3393 if (root->fs_info->dirty_metadata_bytes >= eb->len)
3394 root->fs_info->dirty_metadata_bytes -= eb->len;
3395 else
3396 WARN_ON(1);
3397 spin_unlock(&root->fs_info->delalloc_lock);
3398 }
3399
3400 btrfs_tree_unlock(eb);
3401out:
3402 if (!trylock_page(page)) {
3403 flush_fn(data);
3404 lock_page(page);
3405 }
3406 return 0;
3407}
3408
3409static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 3452static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3410 int read_only) 3453 int read_only)
3411{ 3454{
@@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3608 3651
3609 while (1) { 3652 while (1) {
3610 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 3653 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3611 mark); 3654 mark, NULL);
3612 if (ret) 3655 if (ret)
3613 break; 3656 break;
3614 3657
@@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3663again: 3706again:
3664 while (1) { 3707 while (1) {
3665 ret = find_first_extent_bit(unpin, 0, &start, &end, 3708 ret = find_first_extent_bit(unpin, 0, &start, &end,
3666 EXTENT_DIRTY); 3709 EXTENT_DIRTY, NULL);
3667 if (ret) 3710 if (ret)
3668 break; 3711 break;
3669 3712
@@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3800} 3843}
3801 3844
3802static struct extent_io_ops btree_extent_io_ops = { 3845static struct extent_io_ops btree_extent_io_ops = {
3803 .write_cache_pages_lock_hook = btree_lock_page_hook,
3804 .readpage_end_io_hook = btree_readpage_end_io_hook, 3846 .readpage_end_io_hook = btree_readpage_end_io_hook,
3805 .readpage_io_failed_hook = btree_io_failed_hook, 3847 .readpage_io_failed_hook = btree_io_failed_hook,
3806 .submit_bio_hook = btree_submit_bio_hook, 3848 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..2025a9132c16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
95 u64 objectid); 95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data, 96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *)); 97 void (*flush_fn)(void *));
98int btrfs_calc_num_tolerated_disk_barrier_failures(
99 struct btrfs_fs_info *fs_info);
98 100
99#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
100void btrfs_init_lockdep(void); 102void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d3..3d3e2c17d8d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 u64 flags, struct btrfs_disk_key *key, 94 u64 flags, struct btrfs_disk_key *key,
95 int level, struct btrfs_key *ins); 95 int level, struct btrfs_key *ins);
96static int do_chunk_alloc(struct btrfs_trans_handle *trans, 96static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97 struct btrfs_root *extent_root, u64 alloc_bytes, 97 struct btrfs_root *extent_root, u64 flags,
98 u64 flags, int force); 98 int force);
99static int find_next_key(struct btrfs_path *path, int level, 99static int find_next_key(struct btrfs_path *path, int level,
100 struct btrfs_key *key); 100 struct btrfs_key *key);
101static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 101static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
312 while (start < end) { 312 while (start < end) {
313 ret = find_first_extent_bit(info->pinned_extents, start, 313 ret = find_first_extent_bit(info->pinned_extents, start,
314 &extent_start, &extent_end, 314 &extent_start, &extent_end,
315 EXTENT_DIRTY | EXTENT_UPTODATE); 315 EXTENT_DIRTY | EXTENT_UPTODATE,
316 NULL);
316 if (ret) 317 if (ret)
317 break; 318 break;
318 319
@@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2361 } 2362 }
2362 2363
2363next: 2364next:
2364 do_chunk_alloc(trans, fs_info->extent_root,
2365 2 * 1024 * 1024,
2366 btrfs_get_alloc_profile(root, 0),
2367 CHUNK_ALLOC_NO_FORCE);
2368 cond_resched(); 2365 cond_resched();
2369 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2370 } 2367 }
@@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2478 if (root == root->fs_info->extent_root) 2475 if (root == root->fs_info->extent_root)
2479 root = root->fs_info->tree_root; 2476 root = root->fs_info->tree_root;
2480 2477
2481 do_chunk_alloc(trans, root->fs_info->extent_root,
2482 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2483 CHUNK_ALLOC_NO_FORCE);
2484
2485 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2478 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2486 2479
2487 delayed_refs = &trans->transaction->delayed_refs; 2480 delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2544,12 @@ again:
2551 } 2544 }
2552 2545
2553 if (run_all) { 2546 if (run_all) {
2547 if (!list_empty(&trans->new_bgs)) {
2548 spin_unlock(&delayed_refs->lock);
2549 btrfs_create_pending_block_groups(trans, root);
2550 spin_lock(&delayed_refs->lock);
2551 }
2552
2554 node = rb_first(&delayed_refs->root); 2553 node = rb_first(&delayed_refs->root);
2555 if (!node) 2554 if (!node)
2556 goto out; 2555 goto out;
@@ -3406,7 +3405,6 @@ alloc:
3406 return PTR_ERR(trans); 3405 return PTR_ERR(trans);
3407 3406
3408 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3407 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3409 bytes + 2 * 1024 * 1024,
3410 alloc_target, 3408 alloc_target,
3411 CHUNK_ALLOC_NO_FORCE); 3409 CHUNK_ALLOC_NO_FORCE);
3412 btrfs_end_transaction(trans, root); 3410 btrfs_end_transaction(trans, root);
@@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3488} 3486}
3489 3487
3490static int should_alloc_chunk(struct btrfs_root *root, 3488static int should_alloc_chunk(struct btrfs_root *root,
3491 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3489 struct btrfs_space_info *sinfo, int force)
3492 int force)
3493{ 3490{
3494 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3491 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3495 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3492 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3504 * and purposes it's used space. Don't worry about locking the 3501 * and purposes it's used space. Don't worry about locking the
3505 * global_rsv, it doesn't change except when the transaction commits. 3502 * global_rsv, it doesn't change except when the transaction commits.
3506 */ 3503 */
3507 num_allocated += global_rsv->size; 3504 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505 num_allocated += global_rsv->size;
3508 3506
3509 /* 3507 /*
3510 * in limited mode, we want to have some free space up to 3508 * in limited mode, we want to have some free space up to
@@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3518 if (num_bytes - num_allocated < thresh) 3516 if (num_bytes - num_allocated < thresh)
3519 return 1; 3517 return 1;
3520 } 3518 }
3521 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3522 3519
3523 /* 256MB or 2% of the FS */ 3520 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3524 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3525 /* system chunks need a much small threshold */
3526 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3527 thresh = 32 * 1024 * 1024;
3528
3529 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3530 return 0; 3521 return 0;
3531 return 1; 3522 return 1;
3532} 3523}
@@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
3576} 3567}
3577 3568
3578static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3569static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3579 struct btrfs_root *extent_root, u64 alloc_bytes, 3570 struct btrfs_root *extent_root, u64 flags, int force)
3580 u64 flags, int force)
3581{ 3571{
3582 struct btrfs_space_info *space_info; 3572 struct btrfs_space_info *space_info;
3583 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3573 struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3591,7 @@ again:
3601 return 0; 3591 return 0;
3602 } 3592 }
3603 3593
3604 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3594 if (!should_alloc_chunk(extent_root, space_info, force)) {
3605 spin_unlock(&space_info->lock); 3595 spin_unlock(&space_info->lock);
3606 return 0; 3596 return 0;
3607 } else if (space_info->chunk_alloc) { 3597 } else if (space_info->chunk_alloc) {
@@ -3669,6 +3659,46 @@ out:
3669 return ret; 3659 return ret;
3670} 3660}
3671 3661
3662static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush)
3665{
3666 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail;
3668 u64 used;
3669
3670 used = space_info->bytes_used + space_info->bytes_reserved +
3671 space_info->bytes_pinned + space_info->bytes_readonly +
3672 space_info->bytes_may_use;
3673
3674 spin_lock(&root->fs_info->free_chunk_lock);
3675 avail = root->fs_info->free_chunk_space;
3676 spin_unlock(&root->fs_info->free_chunk_lock);
3677
3678 /*
3679 * If we have dup, raid1 or raid10 then only half of the free
3680 * space is actually useable.
3681 */
3682 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683 BTRFS_BLOCK_GROUP_RAID1 |
3684 BTRFS_BLOCK_GROUP_RAID10))
3685 avail >>= 1;
3686
3687 /*
3688 * If we aren't flushing don't let us overcommit too much, say
3689 * 1/8th of the space. If we can flush, let it overcommit up to
3690 * 1/2 of the space.
3691 */
3692 if (flush)
3693 avail >>= 3;
3694 else
3695 avail >>= 1;
3696
3697 if (used + bytes < space_info->total_bytes + avail)
3698 return 1;
3699 return 0;
3700}
3701
3672/* 3702/*
3673 * shrink metadata reservation for delalloc 3703 * shrink metadata reservation for delalloc
3674 */ 3704 */
@@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3693 if (delalloc_bytes == 0) { 3723 if (delalloc_bytes == 0) {
3694 if (trans) 3724 if (trans)
3695 return; 3725 return;
3696 btrfs_wait_ordered_extents(root, 0, 0); 3726 btrfs_wait_ordered_extents(root, 0);
3697 return; 3727 return;
3698 } 3728 }
3699 3729
@@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3703 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3704 WB_REASON_FS_FREE_SPACE); 3734 WB_REASON_FS_FREE_SPACE);
3705 3735
3736 /*
3737 * We need to wait for the async pages to actually start before
3738 * we do anything.
3739 */
3740 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages));
3742
3706 spin_lock(&space_info->lock); 3743 spin_lock(&space_info->lock);
3707 if (space_info->bytes_used + space_info->bytes_reserved + 3744 if (can_overcommit(root, space_info, orig, !trans)) {
3708 space_info->bytes_pinned + space_info->bytes_readonly +
3709 space_info->bytes_may_use + orig <=
3710 space_info->total_bytes) {
3711 spin_unlock(&space_info->lock); 3745 spin_unlock(&space_info->lock);
3712 break; 3746 break;
3713 } 3747 }
@@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3715 3749
3716 loops++; 3750 loops++;
3717 if (wait_ordered && !trans) { 3751 if (wait_ordered && !trans) {
3718 btrfs_wait_ordered_extents(root, 0, 0); 3752 btrfs_wait_ordered_extents(root, 0);
3719 } else { 3753 } else {
3720 time_left = schedule_timeout_killable(1); 3754 time_left = schedule_timeout_killable(1);
3721 if (time_left) 3755 if (time_left)
@@ -3784,11 +3818,12 @@ commit:
3784} 3818}
3785 3819
3786enum flush_state { 3820enum flush_state {
3787 FLUSH_DELALLOC = 1, 3821 FLUSH_DELAYED_ITEMS_NR = 1,
3788 FLUSH_DELALLOC_WAIT = 2, 3822 FLUSH_DELAYED_ITEMS = 2,
3789 FLUSH_DELAYED_ITEMS_NR = 3, 3823 FLUSH_DELALLOC = 3,
3790 FLUSH_DELAYED_ITEMS = 4, 3824 FLUSH_DELALLOC_WAIT = 4,
3791 COMMIT_TRANS = 5, 3825 ALLOC_CHUNK = 5,
3826 COMMIT_TRANS = 6,
3792}; 3827};
3793 3828
3794static int flush_space(struct btrfs_root *root, 3829static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,
3800 int ret = 0; 3835 int ret = 0;
3801 3836
3802 switch (state) { 3837 switch (state) {
3803 case FLUSH_DELALLOC:
3804 case FLUSH_DELALLOC_WAIT:
3805 shrink_delalloc(root, num_bytes, orig_bytes,
3806 state == FLUSH_DELALLOC_WAIT);
3807 break;
3808 case FLUSH_DELAYED_ITEMS_NR: 3838 case FLUSH_DELAYED_ITEMS_NR:
3809 case FLUSH_DELAYED_ITEMS: 3839 case FLUSH_DELAYED_ITEMS:
3810 if (state == FLUSH_DELAYED_ITEMS_NR) { 3840 if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,
3825 ret = btrfs_run_delayed_items_nr(trans, root, nr); 3855 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3826 btrfs_end_transaction(trans, root); 3856 btrfs_end_transaction(trans, root);
3827 break; 3857 break;
3858 case FLUSH_DELALLOC:
3859 case FLUSH_DELALLOC_WAIT:
3860 shrink_delalloc(root, num_bytes, orig_bytes,
3861 state == FLUSH_DELALLOC_WAIT);
3862 break;
3863 case ALLOC_CHUNK:
3864 trans = btrfs_join_transaction(root);
3865 if (IS_ERR(trans)) {
3866 ret = PTR_ERR(trans);
3867 break;
3868 }
3869 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870 btrfs_get_alloc_profile(root, 0),
3871 CHUNK_ALLOC_NO_FORCE);
3872 btrfs_end_transaction(trans, root);
3873 if (ret == -ENOSPC)
3874 ret = 0;
3875 break;
3828 case COMMIT_TRANS: 3876 case COMMIT_TRANS:
3829 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 3877 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3830 break; 3878 break;
@@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3856 struct btrfs_space_info *space_info = block_rsv->space_info; 3904 struct btrfs_space_info *space_info = block_rsv->space_info;
3857 u64 used; 3905 u64 used;
3858 u64 num_bytes = orig_bytes; 3906 u64 num_bytes = orig_bytes;
3859 int flush_state = FLUSH_DELALLOC; 3907 int flush_state = FLUSH_DELAYED_ITEMS_NR;
3860 int ret = 0; 3908 int ret = 0;
3861 bool flushing = false; 3909 bool flushing = false;
3862 bool committed = false;
3863 3910
3864again: 3911again:
3865 ret = 0; 3912 ret = 0;
@@ -3922,57 +3969,12 @@ again:
3922 (orig_bytes * 2); 3969 (orig_bytes * 2);
3923 } 3970 }
3924 3971
3925 if (ret) { 3972 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3926 u64 profile = btrfs_get_alloc_profile(root, 0); 3973 space_info->bytes_may_use += orig_bytes;
3927 u64 avail; 3974 trace_btrfs_space_reservation(root->fs_info, "space_info",
3928 3975 space_info->flags, orig_bytes,
3929 /* 3976 1);
3930 * If we have a lot of space that's pinned, don't bother doing 3977 ret = 0;
3931 * the overcommit dance yet and just commit the transaction.
3932 */
3933 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3934 do_div(avail, 10);
3935 if (space_info->bytes_pinned >= avail && flush && !committed) {
3936 space_info->flush = 1;
3937 flushing = true;
3938 spin_unlock(&space_info->lock);
3939 ret = may_commit_transaction(root, space_info,
3940 orig_bytes, 1);
3941 if (ret)
3942 goto out;
3943 committed = true;
3944 goto again;
3945 }
3946
3947 spin_lock(&root->fs_info->free_chunk_lock);
3948 avail = root->fs_info->free_chunk_space;
3949
3950 /*
3951 * If we have dup, raid1 or raid10 then only half of the free
3952 * space is actually useable.
3953 */
3954 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3955 BTRFS_BLOCK_GROUP_RAID1 |
3956 BTRFS_BLOCK_GROUP_RAID10))
3957 avail >>= 1;
3958
3959 /*
3960 * If we aren't flushing don't let us overcommit too much, say
3961 * 1/8th of the space. If we can flush, let it overcommit up to
3962 * 1/2 of the space.
3963 */
3964 if (flush)
3965 avail >>= 3;
3966 else
3967 avail >>= 1;
3968 spin_unlock(&root->fs_info->free_chunk_lock);
3969
3970 if (used + num_bytes < space_info->total_bytes + avail) {
3971 space_info->bytes_may_use += orig_bytes;
3972 trace_btrfs_space_reservation(root->fs_info,
3973 "space_info", space_info->flags, orig_bytes, 1);
3974 ret = 0;
3975 }
3976 } 3978 }
3977 3979
3978 /* 3980 /*
@@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4114 return 0; 4116 return 0;
4115} 4117}
4116 4118
4117void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) 4119void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4118{ 4120{
4119 memset(rsv, 0, sizeof(*rsv)); 4121 memset(rsv, 0, sizeof(*rsv));
4120 spin_lock_init(&rsv->lock); 4122 spin_lock_init(&rsv->lock);
4123 rsv->type = type;
4121} 4124}
4122 4125
4123struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 4126struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127 unsigned short type)
4124{ 4128{
4125 struct btrfs_block_rsv *block_rsv; 4129 struct btrfs_block_rsv *block_rsv;
4126 struct btrfs_fs_info *fs_info = root->fs_info; 4130 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4129 if (!block_rsv) 4133 if (!block_rsv)
4130 return NULL; 4134 return NULL;
4131 4135
4132 btrfs_init_block_rsv(block_rsv); 4136 btrfs_init_block_rsv(block_rsv, type);
4133 block_rsv->space_info = __find_space_info(fs_info, 4137 block_rsv->space_info = __find_space_info(fs_info,
4134 BTRFS_BLOCK_GROUP_METADATA); 4138 BTRFS_BLOCK_GROUP_METADATA);
4135 return block_rsv; 4139 return block_rsv;
@@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4138void btrfs_free_block_rsv(struct btrfs_root *root, 4142void btrfs_free_block_rsv(struct btrfs_root *root,
4139 struct btrfs_block_rsv *rsv) 4143 struct btrfs_block_rsv *rsv)
4140{ 4144{
4145 if (!rsv)
4146 return;
4141 btrfs_block_rsv_release(root, rsv, (u64)-1); 4147 btrfs_block_rsv_release(root, rsv, (u64)-1);
4142 kfree(rsv); 4148 kfree(rsv);
4143} 4149}
@@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4416 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4422 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4417 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4423 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4418 /* 4424 /*
4419 * two for root back/forward refs, two for directory entries 4425 * two for root back/forward refs, two for directory entries,
4420 * and one for root of the snapshot. 4426 * one for root of the snapshot and one for parent inode.
4421 */ 4427 */
4422 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 4428 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4423 dst_rsv->space_info = src_rsv->space_info; 4429 dst_rsv->space_info = src_rsv->space_info;
4424 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4430 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4425} 4431}
@@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5018 5024
5019 while (1) { 5025 while (1) {
5020 ret = find_first_extent_bit(unpin, 0, &start, &end, 5026 ret = find_first_extent_bit(unpin, 0, &start, &end,
5021 EXTENT_DIRTY); 5027 EXTENT_DIRTY, NULL);
5022 if (ret) 5028 if (ret)
5023 break; 5029 break;
5024 5030
@@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5096 ret = remove_extent_backref(trans, extent_root, path, 5102 ret = remove_extent_backref(trans, extent_root, path,
5097 NULL, refs_to_drop, 5103 NULL, refs_to_drop,
5098 is_data); 5104 is_data);
5099 if (ret) 5105 if (ret) {
5100 goto abort; 5106 btrfs_abort_transaction(trans, extent_root, ret);
5107 goto out;
5108 }
5101 btrfs_release_path(path); 5109 btrfs_release_path(path);
5102 path->leave_spinning = 1; 5110 path->leave_spinning = 1;
5103 5111
@@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5115 btrfs_print_leaf(extent_root, 5123 btrfs_print_leaf(extent_root,
5116 path->nodes[0]); 5124 path->nodes[0]);
5117 } 5125 }
5118 if (ret < 0) 5126 if (ret < 0) {
5119 goto abort; 5127 btrfs_abort_transaction(trans, extent_root, ret);
5128 goto out;
5129 }
5120 extent_slot = path->slots[0]; 5130 extent_slot = path->slots[0];
5121 } 5131 }
5122 } else if (ret == -ENOENT) { 5132 } else if (ret == -ENOENT) {
@@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5130 (unsigned long long)owner_objectid, 5140 (unsigned long long)owner_objectid,
5131 (unsigned long long)owner_offset); 5141 (unsigned long long)owner_offset);
5132 } else { 5142 } else {
5133 goto abort; 5143 btrfs_abort_transaction(trans, extent_root, ret);
5144 goto out;
5134 } 5145 }
5135 5146
5136 leaf = path->nodes[0]; 5147 leaf = path->nodes[0];
@@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5140 BUG_ON(found_extent || extent_slot != path->slots[0]); 5151 BUG_ON(found_extent || extent_slot != path->slots[0]);
5141 ret = convert_extent_item_v0(trans, extent_root, path, 5152 ret = convert_extent_item_v0(trans, extent_root, path,
5142 owner_objectid, 0); 5153 owner_objectid, 0);
5143 if (ret < 0) 5154 if (ret < 0) {
5144 goto abort; 5155 btrfs_abort_transaction(trans, extent_root, ret);
5156 goto out;
5157 }
5145 5158
5146 btrfs_release_path(path); 5159 btrfs_release_path(path);
5147 path->leave_spinning = 1; 5160 path->leave_spinning = 1;
@@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5158 (unsigned long long)bytenr); 5171 (unsigned long long)bytenr);
5159 btrfs_print_leaf(extent_root, path->nodes[0]); 5172 btrfs_print_leaf(extent_root, path->nodes[0]);
5160 } 5173 }
5161 if (ret < 0) 5174 if (ret < 0) {
5162 goto abort; 5175 btrfs_abort_transaction(trans, extent_root, ret);
5176 goto out;
5177 }
5178
5163 extent_slot = path->slots[0]; 5179 extent_slot = path->slots[0];
5164 leaf = path->nodes[0]; 5180 leaf = path->nodes[0];
5165 item_size = btrfs_item_size_nr(leaf, extent_slot); 5181 item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5196 ret = remove_extent_backref(trans, extent_root, path, 5212 ret = remove_extent_backref(trans, extent_root, path,
5197 iref, refs_to_drop, 5213 iref, refs_to_drop,
5198 is_data); 5214 is_data);
5199 if (ret) 5215 if (ret) {
5200 goto abort; 5216 btrfs_abort_transaction(trans, extent_root, ret);
5217 goto out;
5218 }
5201 } 5219 }
5202 } else { 5220 } else {
5203 if (found_extent) { 5221 if (found_extent) {
@@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5214 5232
5215 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5233 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5216 num_to_del); 5234 num_to_del);
5217 if (ret) 5235 if (ret) {
5218 goto abort; 5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238 }
5219 btrfs_release_path(path); 5239 btrfs_release_path(path);
5220 5240
5221 if (is_data) { 5241 if (is_data) {
5222 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5242 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5223 if (ret) 5243 if (ret) {
5224 goto abort; 5244 btrfs_abort_transaction(trans, extent_root, ret);
5245 goto out;
5246 }
5225 } 5247 }
5226 5248
5227 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5249 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5228 if (ret) 5250 if (ret) {
5229 goto abort; 5251 btrfs_abort_transaction(trans, extent_root, ret);
5252 goto out;
5253 }
5230 } 5254 }
5231out: 5255out:
5232 btrfs_free_path(path); 5256 btrfs_free_path(path);
5233 return ret; 5257 return ret;
5234
5235abort:
5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238} 5258}
5239 5259
5240/* 5260/*
@@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5497 struct btrfs_block_group_cache *used_block_group; 5517 struct btrfs_block_group_cache *used_block_group;
5498 u64 search_start = 0; 5518 u64 search_start = 0;
5499 int empty_cluster = 2 * 1024 * 1024; 5519 int empty_cluster = 2 * 1024 * 1024;
5500 int allowed_chunk_alloc = 0;
5501 int done_chunk_alloc = 0;
5502 struct btrfs_space_info *space_info; 5520 struct btrfs_space_info *space_info;
5503 int loop = 0; 5521 int loop = 0;
5504 int index = 0; 5522 int index = 0;
@@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5530 if (btrfs_mixed_space_info(space_info)) 5548 if (btrfs_mixed_space_info(space_info))
5531 use_cluster = false; 5549 use_cluster = false;
5532 5550
5533 if (orig_root->ref_cows || empty_size)
5534 allowed_chunk_alloc = 1;
5535
5536 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5551 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5537 last_ptr = &root->fs_info->meta_alloc_cluster; 5552 last_ptr = &root->fs_info->meta_alloc_cluster;
5538 if (!btrfs_test_opt(root, SSD)) 5553 if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5821,6 @@ checks:
5806 5821
5807 trace_btrfs_reserve_extent(orig_root, block_group, 5822 trace_btrfs_reserve_extent(orig_root, block_group,
5808 search_start, num_bytes); 5823 search_start, num_bytes);
5809 if (offset < search_start)
5810 btrfs_add_free_space(used_block_group, offset,
5811 search_start - offset);
5812 BUG_ON(offset > search_start);
5813 if (used_block_group != block_group) 5824 if (used_block_group != block_group)
5814 btrfs_put_block_group(used_block_group); 5825 btrfs_put_block_group(used_block_group);
5815 btrfs_put_block_group(block_group); 5826 btrfs_put_block_group(block_group);
@@ -5842,34 +5853,17 @@ loop:
5842 index = 0; 5853 index = 0;
5843 loop++; 5854 loop++;
5844 if (loop == LOOP_ALLOC_CHUNK) { 5855 if (loop == LOOP_ALLOC_CHUNK) {
5845 if (allowed_chunk_alloc) { 5856 ret = do_chunk_alloc(trans, root, data,
5846 ret = do_chunk_alloc(trans, root, num_bytes + 5857 CHUNK_ALLOC_FORCE);
5847 2 * 1024 * 1024, data, 5858 /*
5848 CHUNK_ALLOC_LIMITED); 5859 * Do not bail out on ENOSPC since we
5849 /* 5860 * can do more things.
5850 * Do not bail out on ENOSPC since we 5861 */
5851 * can do more things. 5862 if (ret < 0 && ret != -ENOSPC) {
5852 */ 5863 btrfs_abort_transaction(trans,
5853 if (ret < 0 && ret != -ENOSPC) { 5864 root, ret);
5854 btrfs_abort_transaction(trans, 5865 goto out;
5855 root, ret);
5856 goto out;
5857 }
5858 allowed_chunk_alloc = 0;
5859 if (ret == 1)
5860 done_chunk_alloc = 1;
5861 } else if (!done_chunk_alloc &&
5862 space_info->force_alloc ==
5863 CHUNK_ALLOC_NO_FORCE) {
5864 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5865 } 5866 }
5866
5867 /*
5868 * We didn't allocate a chunk, go ahead and drop the
5869 * empty size and loop again.
5870 */
5871 if (!done_chunk_alloc)
5872 loop = LOOP_NO_EMPTY_SIZE;
5873 } 5867 }
5874 5868
5875 if (loop == LOOP_NO_EMPTY_SIZE) { 5869 if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5944 5938
5945 data = btrfs_get_alloc_profile(root, data); 5939 data = btrfs_get_alloc_profile(root, data);
5946again: 5940again:
5947 /*
5948 * the only place that sets empty_size is btrfs_realloc_node, which
5949 * is not called recursively on allocations
5950 */
5951 if (empty_size || root->ref_cows) {
5952 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5953 num_bytes + 2 * 1024 * 1024, data,
5954 CHUNK_ALLOC_NO_FORCE);
5955 if (ret < 0 && ret != -ENOSPC) {
5956 btrfs_abort_transaction(trans, root, ret);
5957 return ret;
5958 }
5959 }
5960
5961 WARN_ON(num_bytes < root->sectorsize); 5941 WARN_ON(num_bytes < root->sectorsize);
5962 ret = find_free_extent(trans, root, num_bytes, empty_size, 5942 ret = find_free_extent(trans, root, num_bytes, empty_size,
5963 hint_byte, ins, data); 5943 hint_byte, ins, data);
@@ -5967,12 +5947,6 @@ again:
5967 num_bytes = num_bytes >> 1; 5947 num_bytes = num_bytes >> 1;
5968 num_bytes = num_bytes & ~(root->sectorsize - 1); 5948 num_bytes = num_bytes & ~(root->sectorsize - 1);
5969 num_bytes = max(num_bytes, min_alloc_size); 5949 num_bytes = max(num_bytes, min_alloc_size);
5970 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5971 num_bytes, data, CHUNK_ALLOC_FORCE);
5972 if (ret < 0 && ret != -ENOSPC) {
5973 btrfs_abort_transaction(trans, root, ret);
5974 return ret;
5975 }
5976 if (num_bytes == min_alloc_size) 5950 if (num_bytes == min_alloc_size)
5977 final_tried = true; 5951 final_tried = true;
5978 goto again; 5952 goto again;
@@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6314 ret = block_rsv_use_bytes(block_rsv, blocksize); 6288 ret = block_rsv_use_bytes(block_rsv, blocksize);
6315 if (!ret) 6289 if (!ret)
6316 return block_rsv; 6290 return block_rsv;
6317 if (ret) { 6291 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6292 static DEFINE_RATELIMIT_STATE(_rs,
6319 DEFAULT_RATELIMIT_INTERVAL, 6293 DEFAULT_RATELIMIT_INTERVAL,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6294 /*DEFAULT_RATELIMIT_BURST*/ 2);
@@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7279 7253
7280 alloc_flags = update_block_group_flags(root, cache->flags); 7254 alloc_flags = update_block_group_flags(root, cache->flags);
7281 if (alloc_flags != cache->flags) { 7255 if (alloc_flags != cache->flags) {
7282 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7256 ret = do_chunk_alloc(trans, root, alloc_flags,
7283 CHUNK_ALLOC_FORCE); 7257 CHUNK_ALLOC_FORCE);
7284 if (ret < 0) 7258 if (ret < 0)
7285 goto out; 7259 goto out;
@@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7289 if (!ret) 7263 if (!ret)
7290 goto out; 7264 goto out;
7291 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7265 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7292 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7266 ret = do_chunk_alloc(trans, root, alloc_flags,
7293 CHUNK_ALLOC_FORCE); 7267 CHUNK_ALLOC_FORCE);
7294 if (ret < 0) 7268 if (ret < 0)
7295 goto out; 7269 goto out;
@@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7303 struct btrfs_root *root, u64 type) 7277 struct btrfs_root *root, u64 type)
7304{ 7278{
7305 u64 alloc_flags = get_alloc_profile(root, type); 7279 u64 alloc_flags = get_alloc_profile(root, type);
7306 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7280 return do_chunk_alloc(trans, root, alloc_flags,
7307 CHUNK_ALLOC_FORCE); 7281 CHUNK_ALLOC_FORCE);
7308} 7282}
7309 7283
@@ -7810,6 +7784,34 @@ error:
7810 return ret; 7784 return ret;
7811} 7785}
7812 7786
7787void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7788 struct btrfs_root *root)
7789{
7790 struct btrfs_block_group_cache *block_group, *tmp;
7791 struct btrfs_root *extent_root = root->fs_info->extent_root;
7792 struct btrfs_block_group_item item;
7793 struct btrfs_key key;
7794 int ret = 0;
7795
7796 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7797 new_bg_list) {
7798 list_del_init(&block_group->new_bg_list);
7799
7800 if (ret)
7801 continue;
7802
7803 spin_lock(&block_group->lock);
7804 memcpy(&item, &block_group->item, sizeof(item));
7805 memcpy(&key, &block_group->key, sizeof(key));
7806 spin_unlock(&block_group->lock);
7807
7808 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7809 sizeof(item));
7810 if (ret)
7811 btrfs_abort_transaction(trans, extent_root, ret);
7812 }
7813}
7814
7813int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7815int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7814 struct btrfs_root *root, u64 bytes_used, 7816 struct btrfs_root *root, u64 bytes_used,
7815 u64 type, u64 chunk_objectid, u64 chunk_offset, 7817 u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7843 spin_lock_init(&cache->lock); 7845 spin_lock_init(&cache->lock);
7844 INIT_LIST_HEAD(&cache->list); 7846 INIT_LIST_HEAD(&cache->list);
7845 INIT_LIST_HEAD(&cache->cluster_list); 7847 INIT_LIST_HEAD(&cache->cluster_list);
7848 INIT_LIST_HEAD(&cache->new_bg_list);
7846 7849
7847 btrfs_init_free_space_ctl(cache); 7850 btrfs_init_free_space_ctl(cache);
7848 7851
@@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7874 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7877 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7875 BUG_ON(ret); /* Logic error */ 7878 BUG_ON(ret); /* Logic error */
7876 7879
7877 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7880 list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7878 sizeof(cache->item));
7879 if (ret) {
7880 btrfs_abort_transaction(trans, extent_root, ret);
7881 return ret;
7882 }
7883 7881
7884 set_avail_alloc_bits(extent_root->fs_info, type); 7882 set_avail_alloc_bits(extent_root->fs_info, type);
7885 7883
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b08ea4717e9d..8036d3a84853 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
45 struct bio *bio; 45 struct bio *bio;
46 struct extent_io_tree *tree; 46 struct extent_io_tree *tree;
47 get_extent_t *get_extent; 47 get_extent_t *get_extent;
48 unsigned long bio_flags;
48 49
49 /* tells writepage not to lock the state bits for this range 50 /* tells writepage not to lock the state bits for this range
50 * it still does the unlocking 51 * it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
64 65
65int __init extent_io_init(void) 66int __init extent_io_init(void)
66{ 67{
67 extent_state_cache = kmem_cache_create("extent_state", 68 extent_state_cache = kmem_cache_create("btrfs_extent_state",
68 sizeof(struct extent_state), 0, 69 sizeof(struct extent_state), 0,
69 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
70 if (!extent_state_cache) 71 if (!extent_state_cache)
71 return -ENOMEM; 72 return -ENOMEM;
72 73
73 extent_buffer_cache = kmem_cache_create("extent_buffers", 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
74 sizeof(struct extent_buffer), 0, 75 sizeof(struct extent_buffer), 0,
75 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
76 if (!extent_buffer_cache) 77 if (!extent_buffer_cache)
@@ -942,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
942 * @end: the end offset in bytes (inclusive) 943 * @end: the end offset in bytes (inclusive)
943 * @bits: the bits to set in this range 944 * @bits: the bits to set in this range
944 * @clear_bits: the bits to clear in this range 945 * @clear_bits: the bits to clear in this range
946 * @cached_state: state that we're going to cache
945 * @mask: the allocation mask 947 * @mask: the allocation mask
946 * 948 *
947 * This will go through and set bits for the given range. If any states exist 949 * This will go through and set bits for the given range. If any states exist
@@ -951,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
951 * boundary bits like LOCK. 953 * boundary bits like LOCK.
952 */ 954 */
953int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 955int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
954 int bits, int clear_bits, gfp_t mask) 956 int bits, int clear_bits,
957 struct extent_state **cached_state, gfp_t mask)
955{ 958{
956 struct extent_state *state; 959 struct extent_state *state;
957 struct extent_state *prealloc = NULL; 960 struct extent_state *prealloc = NULL;
@@ -968,6 +971,15 @@ again:
968 } 971 }
969 972
970 spin_lock(&tree->lock); 973 spin_lock(&tree->lock);
974 if (cached_state && *cached_state) {
975 state = *cached_state;
976 if (state->start <= start && state->end > start &&
977 state->tree) {
978 node = &state->rb_node;
979 goto hit_next;
980 }
981 }
982
971 /* 983 /*
972 * this search will find all the extents that end after 984 * this search will find all the extents that end after
973 * our range starts. 985 * our range starts.
@@ -998,6 +1010,7 @@ hit_next:
998 */ 1010 */
999 if (state->start == start && state->end <= end) { 1011 if (state->start == start && state->end <= end) {
1000 set_state_bits(tree, state, &bits); 1012 set_state_bits(tree, state, &bits);
1013 cache_state(state, cached_state);
1001 state = clear_state_bit(tree, state, &clear_bits, 0); 1014 state = clear_state_bit(tree, state, &clear_bits, 0);
1002 if (last_end == (u64)-1) 1015 if (last_end == (u64)-1)
1003 goto out; 1016 goto out;
@@ -1038,6 +1051,7 @@ hit_next:
1038 goto out; 1051 goto out;
1039 if (state->end <= end) { 1052 if (state->end <= end) {
1040 set_state_bits(tree, state, &bits); 1053 set_state_bits(tree, state, &bits);
1054 cache_state(state, cached_state);
1041 state = clear_state_bit(tree, state, &clear_bits, 0); 1055 state = clear_state_bit(tree, state, &clear_bits, 0);
1042 if (last_end == (u64)-1) 1056 if (last_end == (u64)-1)
1043 goto out; 1057 goto out;
@@ -1076,6 +1090,7 @@ hit_next:
1076 &bits); 1090 &bits);
1077 if (err) 1091 if (err)
1078 extent_io_tree_panic(tree, err); 1092 extent_io_tree_panic(tree, err);
1093 cache_state(prealloc, cached_state);
1079 prealloc = NULL; 1094 prealloc = NULL;
1080 start = this_end + 1; 1095 start = this_end + 1;
1081 goto search_again; 1096 goto search_again;
@@ -1098,6 +1113,7 @@ hit_next:
1098 extent_io_tree_panic(tree, err); 1113 extent_io_tree_panic(tree, err);
1099 1114
1100 set_state_bits(tree, prealloc, &bits); 1115 set_state_bits(tree, prealloc, &bits);
1116 cache_state(prealloc, cached_state);
1101 clear_state_bit(tree, prealloc, &clear_bits, 0); 1117 clear_state_bit(tree, prealloc, &clear_bits, 0);
1102 prealloc = NULL; 1118 prealloc = NULL;
1103 goto out; 1119 goto out;
@@ -1150,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1150 NULL, cached_state, mask); 1166 NULL, cached_state, mask);
1151} 1167}
1152 1168
1169int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1170 struct extent_state **cached_state, gfp_t mask)
1171{
1172 return set_extent_bit(tree, start, end,
1173 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1174 NULL, cached_state, mask);
1175}
1176
1153int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1177int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1154 gfp_t mask) 1178 gfp_t mask)
1155{ 1179{
@@ -1294,18 +1318,42 @@ out:
1294 * If nothing was found, 1 is returned. If found something, return 0. 1318 * If nothing was found, 1 is returned. If found something, return 0.
1295 */ 1319 */
1296int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1320int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1297 u64 *start_ret, u64 *end_ret, int bits) 1321 u64 *start_ret, u64 *end_ret, int bits,
1322 struct extent_state **cached_state)
1298{ 1323{
1299 struct extent_state *state; 1324 struct extent_state *state;
1325 struct rb_node *n;
1300 int ret = 1; 1326 int ret = 1;
1301 1327
1302 spin_lock(&tree->lock); 1328 spin_lock(&tree->lock);
1329 if (cached_state && *cached_state) {
1330 state = *cached_state;
1331 if (state->end == start - 1 && state->tree) {
1332 n = rb_next(&state->rb_node);
1333 while (n) {
1334 state = rb_entry(n, struct extent_state,
1335 rb_node);
1336 if (state->state & bits)
1337 goto got_it;
1338 n = rb_next(n);
1339 }
1340 free_extent_state(*cached_state);
1341 *cached_state = NULL;
1342 goto out;
1343 }
1344 free_extent_state(*cached_state);
1345 *cached_state = NULL;
1346 }
1347
1303 state = find_first_extent_bit_state(tree, start, bits); 1348 state = find_first_extent_bit_state(tree, start, bits);
1349got_it:
1304 if (state) { 1350 if (state) {
1351 cache_state(state, cached_state);
1305 *start_ret = state->start; 1352 *start_ret = state->start;
1306 *end_ret = state->end; 1353 *end_ret = state->end;
1307 ret = 0; 1354 ret = 0;
1308 } 1355 }
1356out:
1309 spin_unlock(&tree->lock); 1357 spin_unlock(&tree->lock);
1310 return ret; 1358 return ret;
1311} 1359}
@@ -2068,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2068 } 2116 }
2069 read_unlock(&em_tree->lock); 2117 read_unlock(&em_tree->lock);
2070 2118
2071 if (!em || IS_ERR(em)) { 2119 if (!em) {
2072 kfree(failrec); 2120 kfree(failrec);
2073 return -EIO; 2121 return -EIO;
2074 } 2122 }
@@ -2304,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2304 struct extent_state *cached = NULL; 2352 struct extent_state *cached = NULL;
2305 struct extent_state *state; 2353 struct extent_state *state;
2306 2354
2307 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " 2355 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2308 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, 2356 "mirror=%ld\n", (u64)bio->bi_sector, err,
2309 (long int)bio->bi_bdev); 2357 (long int)bio->bi_bdev);
2310 tree = &BTRFS_I(page->mapping->host)->io_tree; 2358 tree = &BTRFS_I(page->mapping->host)->io_tree;
2311 2359
@@ -2709,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2709 end_bio_extent_readpage, mirror_num, 2757 end_bio_extent_readpage, mirror_num,
2710 *bio_flags, 2758 *bio_flags,
2711 this_bio_flag); 2759 this_bio_flag);
2712 BUG_ON(ret == -ENOMEM); 2760 if (!ret) {
2713 nr++; 2761 nr++;
2714 *bio_flags = this_bio_flag; 2762 *bio_flags = this_bio_flag;
2763 }
2715 } 2764 }
2716 if (ret) 2765 if (ret) {
2717 SetPageError(page); 2766 SetPageError(page);
2767 unlock_extent(tree, cur, cur + iosize - 1);
2768 }
2718 cur = cur + iosize; 2769 cur = cur + iosize;
2719 pg_offset += iosize; 2770 pg_offset += iosize;
2720 } 2771 }
@@ -3161,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
3161 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3212 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3162 u64 offset = eb->start; 3213 u64 offset = eb->start;
3163 unsigned long i, num_pages; 3214 unsigned long i, num_pages;
3215 unsigned long bio_flags = 0;
3164 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3216 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3165 int ret = 0; 3217 int ret = 0;
3166 3218
3167 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3219 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3168 num_pages = num_extent_pages(eb->start, eb->len); 3220 num_pages = num_extent_pages(eb->start, eb->len);
3169 atomic_set(&eb->io_pages, num_pages); 3221 atomic_set(&eb->io_pages, num_pages);
3222 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3223 bio_flags = EXTENT_BIO_TREE_LOG;
3224
3170 for (i = 0; i < num_pages; i++) { 3225 for (i = 0; i < num_pages; i++) {
3171 struct page *p = extent_buffer_page(eb, i); 3226 struct page *p = extent_buffer_page(eb, i);
3172 3227
@@ -3175,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb,
3175 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3230 ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3176 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3231 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3177 -1, end_bio_extent_buffer_writepage, 3232 -1, end_bio_extent_buffer_writepage,
3178 0, 0, 0); 3233 0, epd->bio_flags, bio_flags);
3234 epd->bio_flags = bio_flags;
3179 if (ret) { 3235 if (ret) {
3180 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3236 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3181 SetPageError(p); 3237 SetPageError(p);
@@ -3210,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3210 .tree = tree, 3266 .tree = tree,
3211 .extent_locked = 0, 3267 .extent_locked = 0,
3212 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3268 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3269 .bio_flags = 0,
3213 }; 3270 };
3214 int ret = 0; 3271 int ret = 0;
3215 int done = 0; 3272 int done = 0;
@@ -3254,19 +3311,34 @@ retry:
3254 break; 3311 break;
3255 } 3312 }
3256 3313
3314 spin_lock(&mapping->private_lock);
3315 if (!PagePrivate(page)) {
3316 spin_unlock(&mapping->private_lock);
3317 continue;
3318 }
3319
3257 eb = (struct extent_buffer *)page->private; 3320 eb = (struct extent_buffer *)page->private;
3321
3322 /*
3323 * Shouldn't happen and normally this would be a BUG_ON
3324 * but no sense in crashing the users box for something
3325 * we can survive anyway.
3326 */
3258 if (!eb) { 3327 if (!eb) {
3328 spin_unlock(&mapping->private_lock);
3259 WARN_ON(1); 3329 WARN_ON(1);
3260 continue; 3330 continue;
3261 } 3331 }
3262 3332
3263 if (eb == prev_eb) 3333 if (eb == prev_eb) {
3334 spin_unlock(&mapping->private_lock);
3264 continue; 3335 continue;
3336 }
3265 3337
3266 if (!atomic_inc_not_zero(&eb->refs)) { 3338 ret = atomic_inc_not_zero(&eb->refs);
3267 WARN_ON(1); 3339 spin_unlock(&mapping->private_lock);
3340 if (!ret)
3268 continue; 3341 continue;
3269 }
3270 3342
3271 prev_eb = eb; 3343 prev_eb = eb;
3272 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3344 ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3457,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
3457 if (epd->sync_io) 3529 if (epd->sync_io)
3458 rw = WRITE_SYNC; 3530 rw = WRITE_SYNC;
3459 3531
3460 ret = submit_one_bio(rw, epd->bio, 0, 0); 3532 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3461 BUG_ON(ret < 0); /* -ENOMEM */ 3533 BUG_ON(ret < 0); /* -ENOMEM */
3462 epd->bio = NULL; 3534 epd->bio = NULL;
3463 } 3535 }
@@ -3480,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3480 .get_extent = get_extent, 3552 .get_extent = get_extent,
3481 .extent_locked = 0, 3553 .extent_locked = 0,
3482 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3554 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3555 .bio_flags = 0,
3483 }; 3556 };
3484 3557
3485 ret = __extent_writepage(page, wbc, &epd); 3558 ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3504 .get_extent = get_extent, 3577 .get_extent = get_extent,
3505 .extent_locked = 1, 3578 .extent_locked = 1,
3506 .sync_io = mode == WB_SYNC_ALL, 3579 .sync_io = mode == WB_SYNC_ALL,
3580 .bio_flags = 0,
3507 }; 3581 };
3508 struct writeback_control wbc_writepages = { 3582 struct writeback_control wbc_writepages = {
3509 .sync_mode = mode, 3583 .sync_mode = mode,
@@ -3543,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
3543 .get_extent = get_extent, 3617 .get_extent = get_extent,
3544 .extent_locked = 0, 3618 .extent_locked = 0,
3545 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3619 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3620 .bio_flags = 0,
3546 }; 3621 };
3547 3622
3548 ret = extent_write_cache_pages(tree, mapping, wbc, 3623 ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3995,6 @@ out:
3920 return ret; 3995 return ret;
3921} 3996}
3922 3997
3923inline struct page *extent_buffer_page(struct extent_buffer *eb,
3924 unsigned long i)
3925{
3926 return eb->pages[i];
3927}
3928
3929inline unsigned long num_extent_pages(u64 start, u64 len)
3930{
3931 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3932 (start >> PAGE_CACHE_SHIFT);
3933}
3934
3935static void __free_extent_buffer(struct extent_buffer *eb) 3998static void __free_extent_buffer(struct extent_buffer *eb)
3936{ 3999{
3937#if LEAK_DEBUG 4000#if LEAK_DEBUG
@@ -4047,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4047 4110
4048 return eb; 4111 return eb;
4049err: 4112err:
4050 for (i--; i > 0; i--) 4113 for (i--; i >= 0; i--)
4051 __free_page(eb->pages[i]); 4114 __free_page(eb->pages[i]);
4052 __free_extent_buffer(eb); 4115 __free_extent_buffer(eb);
4053 return NULL; 4116 return NULL;
@@ -4192,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4192 4255
4193 for (i = 0; i < num_pages; i++, index++) { 4256 for (i = 0; i < num_pages; i++, index++) {
4194 p = find_or_create_page(mapping, index, GFP_NOFS); 4257 p = find_or_create_page(mapping, index, GFP_NOFS);
4195 if (!p) { 4258 if (!p)
4196 WARN_ON(1);
4197 goto free_eb; 4259 goto free_eb;
4198 }
4199 4260
4200 spin_lock(&mapping->private_lock); 4261 spin_lock(&mapping->private_lock);
4201 if (PagePrivate(p)) { 4262 if (PagePrivate(p)) {
@@ -4338,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4338 4399
4339 /* Should be safe to release our pages at this point */ 4400 /* Should be safe to release our pages at this point */
4340 btrfs_release_extent_buffer_page(eb, 0); 4401 btrfs_release_extent_buffer_page(eb, 0);
4341
4342 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4402 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4343 return 1; 4403 return 1;
4344 } 4404 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15d..711d12b80028 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
27 * type for this bio 27 * type for this bio
28 */ 28 */
29#define EXTENT_BIO_COMPRESSED 1 29#define EXTENT_BIO_COMPRESSED 1
30#define EXTENT_BIO_TREE_LOG 2
30#define EXTENT_BIO_FLAG_SHIFT 16 31#define EXTENT_BIO_FLAG_SHIFT 16
31 32
32/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
232int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 233int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
233 gfp_t mask); 234 gfp_t mask);
234int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 235int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
235 int bits, int clear_bits, gfp_t mask); 236 int bits, int clear_bits,
237 struct extent_state **cached_state, gfp_t mask);
236int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 238int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 239 struct extent_state **cached_state, gfp_t mask);
240int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
241 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 242int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, int bits); 243 u64 *start_ret, u64 *end_ret, int bits,
244 struct extent_state **cached_state);
240struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 245struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
241 u64 start, int bits); 246 u64 start, int bits);
242int extent_invalidatepage(struct extent_io_tree *tree, 247int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
277int read_extent_buffer_pages(struct extent_io_tree *tree, 282int read_extent_buffer_pages(struct extent_io_tree *tree,
278 struct extent_buffer *eb, u64 start, int wait, 283 struct extent_buffer *eb, u64 start, int wait,
279 get_extent_t *get_extent, int mirror_num); 284 get_extent_t *get_extent, int mirror_num);
280unsigned long num_extent_pages(u64 start, u64 len); 285
281struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); 286static inline unsigned long num_extent_pages(u64 start, u64 len)
287{
288 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
289 (start >> PAGE_CACHE_SHIFT);
290}
291
292static inline struct page *extent_buffer_page(struct extent_buffer *eb,
293 unsigned long i)
294{
295 return eb->pages[i];
296}
282 297
283static inline void extent_buffer_get(struct extent_buffer *eb) 298static inline void extent_buffer_get(struct extent_buffer *eb)
284{ 299{
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..b8cbc8d5c7f7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
11 11
12int __init extent_map_init(void) 12int __init extent_map_init(void)
13{ 13{
14 extent_map_cache = kmem_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("btrfs_extent_map",
15 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
17 if (!extent_map_cache) 17 if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
35void extent_map_tree_init(struct extent_map_tree *tree) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock); 39 rwlock_init(&tree->lock);
39} 40}
40 41
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
54 em->in_tree = 0; 55 em->in_tree = 0;
55 em->flags = 0; 56 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 57 em->compress_type = BTRFS_COMPRESS_NONE;
58 em->generation = 0;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
60 INIT_LIST_HEAD(&em->list);
58 return em; 61 return em;
59} 62}
60 63
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
72 WARN_ON(atomic_read(&em->refs) == 0); 75 WARN_ON(atomic_read(&em->refs) == 0);
73 if (atomic_dec_and_test(&em->refs)) { 76 if (atomic_dec_and_test(&em->refs)) {
74 WARN_ON(em->in_tree); 77 WARN_ON(em->in_tree);
78 WARN_ON(!list_empty(&em->list));
75 kmem_cache_free(extent_map_cache, em); 79 kmem_cache_free(extent_map_cache, em);
76 } 80 }
77} 81}
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 em->block_len += merge->block_len; 202 em->block_len += merge->block_len;
199 em->block_start = merge->block_start; 203 em->block_start = merge->block_start;
200 merge->in_tree = 0; 204 merge->in_tree = 0;
205 if (merge->generation > em->generation) {
206 em->mod_start = em->start;
207 em->mod_len = em->len;
208 em->generation = merge->generation;
209 list_move(&em->list, &tree->modified_extents);
210 }
211
212 list_del_init(&merge->list);
201 rb_erase(&merge->rb_node, &tree->map); 213 rb_erase(&merge->rb_node, &tree->map);
202 free_extent_map(merge); 214 free_extent_map(merge);
203 } 215 }
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
211 em->block_len += merge->len; 223 em->block_len += merge->len;
212 rb_erase(&merge->rb_node, &tree->map); 224 rb_erase(&merge->rb_node, &tree->map);
213 merge->in_tree = 0; 225 merge->in_tree = 0;
226 if (merge->generation > em->generation) {
227 em->mod_len = em->len;
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list);
214 free_extent_map(merge); 232 free_extent_map(merge);
215 } 233 }
216} 234}
217 235
218int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) 236/**
237 * unpint_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file
240 * @len: length of the extent
241 * @gen: generation that this extent has been modified in
242 * @prealloc: if this is set we need to clear the prealloc flag
243 *
244 * Called after an extent has been written to disk properly. Set the generation
245 * to the generation that actually added the file item to the inode so we know
246 * we need to sync this extent when we call fsync().
247 */
248int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
249 u64 gen)
219{ 250{
220 int ret = 0; 251 int ret = 0;
221 struct extent_map *em; 252 struct extent_map *em;
253 bool prealloc = false;
222 254
223 write_lock(&tree->lock); 255 write_lock(&tree->lock);
224 em = lookup_extent_mapping(tree, start, len); 256 em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
228 if (!em) 260 if (!em)
229 goto out; 261 goto out;
230 262
263 list_move(&em->list, &tree->modified_extents);
264 em->generation = gen;
231 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 265 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
266 em->mod_start = em->start;
267 em->mod_len = em->len;
268
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
270 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
272 }
232 273
233 try_merge_map(tree, em); 274 try_merge_map(tree, em);
234 275
276 if (prealloc) {
277 em->mod_start = em->start;
278 em->mod_len = em->len;
279 }
280
235 free_extent_map(em); 281 free_extent_map(em);
236out: 282out:
237 write_unlock(&tree->lock); 283 write_unlock(&tree->lock);
@@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
269 } 315 }
270 atomic_inc(&em->refs); 316 atomic_inc(&em->refs);
271 317
318 em->mod_start = em->start;
319 em->mod_len = em->len;
320
272 try_merge_map(tree, em); 321 try_merge_map(tree, em);
273out: 322out:
274 return ret; 323 return ret;
@@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
358 407
359 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 408 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
360 rb_erase(&em->rb_node, &tree->map); 409 rb_erase(&em->rb_node, &tree->map);
410 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
411 list_del_init(&em->list);
361 em->in_tree = 0; 412 em->in_tree = 0;
362 return ret; 413 return ret;
363} 414}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761fe..679225555f7b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
13#define EXTENT_FLAG_COMPRESSED 1 13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
16 17
17struct extent_map { 18struct extent_map {
18 struct rb_node rb_node; 19 struct rb_node rb_node;
@@ -20,18 +21,23 @@ struct extent_map {
20 /* all of these are in bytes */ 21 /* all of these are in bytes */
21 u64 start; 22 u64 start;
22 u64 len; 23 u64 len;
24 u64 mod_start;
25 u64 mod_len;
23 u64 orig_start; 26 u64 orig_start;
24 u64 block_start; 27 u64 block_start;
25 u64 block_len; 28 u64 block_len;
29 u64 generation;
26 unsigned long flags; 30 unsigned long flags;
27 struct block_device *bdev; 31 struct block_device *bdev;
28 atomic_t refs; 32 atomic_t refs;
29 unsigned int in_tree; 33 unsigned int in_tree;
30 unsigned int compress_type; 34 unsigned int compress_type;
35 struct list_head list;
31}; 36};
32 37
33struct extent_map_tree { 38struct extent_map_tree {
34 struct rb_root map; 39 struct rb_root map;
40 struct list_head modified_extents;
35 rwlock_t lock; 41 rwlock_t lock;
36}; 42};
37 43
@@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);
60void free_extent_map(struct extent_map *em); 66void free_extent_map(struct extent_map *em);
61int __init extent_map_init(void); 67int __init extent_map_init(void);
62void extent_map_exit(void); 68void extent_map_exit(void);
63int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); 69int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
64struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 70struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
65 u64 start, u64 len); 71 u64 start, u64 len);
66#endif 72#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01dc..1ad08e4e4a15 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "print-tree.h" 26#include "print-tree.h"
27 27
28#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ 28#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
29 sizeof(struct btrfs_item) * 2) / \ 29 sizeof(struct btrfs_item) * 2) / \
30 size) - 1)) 30 size) - 1))
31 31
32#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) 32#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
33 PAGE_CACHE_SIZE))
33 34
34#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
35 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6b40e86121b..9ab1bed88116 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h"
42 43
43/* 44/*
44 * when auto defrag is enabled we 45 * when auto defrag is enabled we
@@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
458 * this drops all the extents in the cache that intersect the range 459 * this drops all the extents in the cache that intersect the range
459 * [start, end]. Existing extents are split as required. 460 * [start, end]. Existing extents are split as required.
460 */ 461 */
461int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 462void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
462 int skip_pinned) 463 int skip_pinned)
463{ 464{
464 struct extent_map *em; 465 struct extent_map *em;
465 struct extent_map *split = NULL; 466 struct extent_map *split = NULL;
466 struct extent_map *split2 = NULL; 467 struct extent_map *split2 = NULL;
467 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
468 u64 len = end - start + 1; 469 u64 len = end - start + 1;
470 u64 gen;
469 int ret; 471 int ret;
470 int testend = 1; 472 int testend = 1;
471 unsigned long flags; 473 unsigned long flags;
@@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
477 testend = 0; 479 testend = 0;
478 } 480 }
479 while (1) { 481 while (1) {
482 int no_splits = 0;
483
480 if (!split) 484 if (!split)
481 split = alloc_extent_map(); 485 split = alloc_extent_map();
482 if (!split2) 486 if (!split2)
483 split2 = alloc_extent_map(); 487 split2 = alloc_extent_map();
484 BUG_ON(!split || !split2); /* -ENOMEM */ 488 if (!split || !split2)
489 no_splits = 1;
485 490
486 write_lock(&em_tree->lock); 491 write_lock(&em_tree->lock);
487 em = lookup_extent_mapping(em_tree, start, len); 492 em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
490 break; 495 break;
491 } 496 }
492 flags = em->flags; 497 flags = em->flags;
498 gen = em->generation;
493 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
494 if (testend && em->start + em->len >= start + len) { 500 if (testend && em->start + em->len >= start + len) {
495 free_extent_map(em); 501 free_extent_map(em);
@@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
506 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
507 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 513 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
508 remove_extent_mapping(em_tree, em); 514 remove_extent_mapping(em_tree, em);
515 if (no_splits)
516 goto next;
509 517
510 if (em->block_start < EXTENT_MAP_LAST_BYTE && 518 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
511 em->start < start) { 519 em->start < start) {
@@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
518 split->block_len = em->block_len; 526 split->block_len = em->block_len;
519 else 527 else
520 split->block_len = split->len; 528 split->block_len = split->len;
521 529 split->generation = gen;
522 split->bdev = em->bdev; 530 split->bdev = em->bdev;
523 split->flags = flags; 531 split->flags = flags;
524 split->compress_type = em->compress_type; 532 split->compress_type = em->compress_type;
525 ret = add_extent_mapping(em_tree, split); 533 ret = add_extent_mapping(em_tree, split);
526 BUG_ON(ret); /* Logic error */ 534 BUG_ON(ret); /* Logic error */
535 list_move(&split->list, &em_tree->modified_extents);
527 free_extent_map(split); 536 free_extent_map(split);
528 split = split2; 537 split = split2;
529 split2 = NULL; 538 split2 = NULL;
@@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
537 split->bdev = em->bdev; 546 split->bdev = em->bdev;
538 split->flags = flags; 547 split->flags = flags;
539 split->compress_type = em->compress_type; 548 split->compress_type = em->compress_type;
549 split->generation = gen;
540 550
541 if (compressed) { 551 if (compressed) {
542 split->block_len = em->block_len; 552 split->block_len = em->block_len;
@@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
550 560
551 ret = add_extent_mapping(em_tree, split); 561 ret = add_extent_mapping(em_tree, split);
552 BUG_ON(ret); /* Logic error */ 562 BUG_ON(ret); /* Logic error */
563 list_move(&split->list, &em_tree->modified_extents);
553 free_extent_map(split); 564 free_extent_map(split);
554 split = NULL; 565 split = NULL;
555 } 566 }
567next:
556 write_unlock(&em_tree->lock); 568 write_unlock(&em_tree->lock);
557 569
558 /* once for us */ 570 /* once for us */
@@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
564 free_extent_map(split); 576 free_extent_map(split);
565 if (split2) 577 if (split2)
566 free_extent_map(split2); 578 free_extent_map(split2);
567 return 0;
568} 579}
569 580
570/* 581/*
@@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
576 * it is either truncated or split. Anything entirely inside the range 587 * it is either truncated or split. Anything entirely inside the range
577 * is deleted from the tree. 588 * is deleted from the tree.
578 */ 589 */
579int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 590int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
580 u64 start, u64 end, u64 *hint_byte, int drop_cache) 591 struct btrfs_root *root, struct inode *inode,
592 struct btrfs_path *path, u64 start, u64 end,
593 u64 *drop_end, int drop_cache)
581{ 594{
582 struct btrfs_root *root = BTRFS_I(inode)->root;
583 struct extent_buffer *leaf; 595 struct extent_buffer *leaf;
584 struct btrfs_file_extent_item *fi; 596 struct btrfs_file_extent_item *fi;
585 struct btrfs_path *path;
586 struct btrfs_key key; 597 struct btrfs_key key;
587 struct btrfs_key new_key; 598 struct btrfs_key new_key;
588 u64 ino = btrfs_ino(inode); 599 u64 ino = btrfs_ino(inode);
@@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
597 int recow; 608 int recow;
598 int ret; 609 int ret;
599 int modify_tree = -1; 610 int modify_tree = -1;
611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
612 int found = 0;
600 613
601 if (drop_cache) 614 if (drop_cache)
602 btrfs_drop_extent_cache(inode, start, end - 1, 0); 615 btrfs_drop_extent_cache(inode, start, end - 1, 0);
603 616
604 path = btrfs_alloc_path();
605 if (!path)
606 return -ENOMEM;
607
608 if (start >= BTRFS_I(inode)->disk_i_size) 617 if (start >= BTRFS_I(inode)->disk_i_size)
609 modify_tree = 0; 618 modify_tree = 0;
610 619
@@ -666,6 +675,7 @@ next_slot:
666 goto next_slot; 675 goto next_slot;
667 } 676 }
668 677
678 found = 1;
669 search_start = max(key.offset, start); 679 search_start = max(key.offset, start);
670 if (recow || !modify_tree) { 680 if (recow || !modify_tree) {
671 modify_tree = -1; 681 modify_tree = -1;
@@ -707,14 +717,13 @@ next_slot:
707 extent_end - start); 717 extent_end - start);
708 btrfs_mark_buffer_dirty(leaf); 718 btrfs_mark_buffer_dirty(leaf);
709 719
710 if (disk_bytenr > 0) { 720 if (update_refs && disk_bytenr > 0) {
711 ret = btrfs_inc_extent_ref(trans, root, 721 ret = btrfs_inc_extent_ref(trans, root,
712 disk_bytenr, num_bytes, 0, 722 disk_bytenr, num_bytes, 0,
713 root->root_key.objectid, 723 root->root_key.objectid,
714 new_key.objectid, 724 new_key.objectid,
715 start - extent_offset, 0); 725 start - extent_offset, 0);
716 BUG_ON(ret); /* -ENOMEM */ 726 BUG_ON(ret); /* -ENOMEM */
717 *hint_byte = disk_bytenr;
718 } 727 }
719 key.offset = start; 728 key.offset = start;
720 } 729 }
@@ -734,10 +743,8 @@ next_slot:
734 btrfs_set_file_extent_num_bytes(leaf, fi, 743 btrfs_set_file_extent_num_bytes(leaf, fi,
735 extent_end - end); 744 extent_end - end);
736 btrfs_mark_buffer_dirty(leaf); 745 btrfs_mark_buffer_dirty(leaf);
737 if (disk_bytenr > 0) { 746 if (update_refs && disk_bytenr > 0)
738 inode_sub_bytes(inode, end - key.offset); 747 inode_sub_bytes(inode, end - key.offset);
739 *hint_byte = disk_bytenr;
740 }
741 break; 748 break;
742 } 749 }
743 750
@@ -753,10 +760,8 @@ next_slot:
753 btrfs_set_file_extent_num_bytes(leaf, fi, 760 btrfs_set_file_extent_num_bytes(leaf, fi,
754 start - key.offset); 761 start - key.offset);
755 btrfs_mark_buffer_dirty(leaf); 762 btrfs_mark_buffer_dirty(leaf);
756 if (disk_bytenr > 0) { 763 if (update_refs && disk_bytenr > 0)
757 inode_sub_bytes(inode, extent_end - start); 764 inode_sub_bytes(inode, extent_end - start);
758 *hint_byte = disk_bytenr;
759 }
760 if (end == extent_end) 765 if (end == extent_end)
761 break; 766 break;
762 767
@@ -777,12 +782,13 @@ next_slot:
777 del_nr++; 782 del_nr++;
778 } 783 }
779 784
780 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 785 if (update_refs &&
786 extent_type == BTRFS_FILE_EXTENT_INLINE) {
781 inode_sub_bytes(inode, 787 inode_sub_bytes(inode,
782 extent_end - key.offset); 788 extent_end - key.offset);
783 extent_end = ALIGN(extent_end, 789 extent_end = ALIGN(extent_end,
784 root->sectorsize); 790 root->sectorsize);
785 } else if (disk_bytenr > 0) { 791 } else if (update_refs && disk_bytenr > 0) {
786 ret = btrfs_free_extent(trans, root, 792 ret = btrfs_free_extent(trans, root,
787 disk_bytenr, num_bytes, 0, 793 disk_bytenr, num_bytes, 0,
788 root->root_key.objectid, 794 root->root_key.objectid,
@@ -791,7 +797,6 @@ next_slot:
791 BUG_ON(ret); /* -ENOMEM */ 797 BUG_ON(ret); /* -ENOMEM */
792 inode_sub_bytes(inode, 798 inode_sub_bytes(inode,
793 extent_end - key.offset); 799 extent_end - key.offset);
794 *hint_byte = disk_bytenr;
795 } 800 }
796 801
797 if (end == extent_end) 802 if (end == extent_end)
@@ -806,7 +811,7 @@ next_slot:
806 del_nr); 811 del_nr);
807 if (ret) { 812 if (ret) {
808 btrfs_abort_transaction(trans, root, ret); 813 btrfs_abort_transaction(trans, root, ret);
809 goto out; 814 break;
810 } 815 }
811 816
812 del_nr = 0; 817 del_nr = 0;
@@ -825,7 +830,24 @@ next_slot:
825 btrfs_abort_transaction(trans, root, ret); 830 btrfs_abort_transaction(trans, root, ret);
826 } 831 }
827 832
828out: 833 if (drop_end)
834 *drop_end = found ? min(end, extent_end) : end;
835 btrfs_release_path(path);
836 return ret;
837}
838
839int btrfs_drop_extents(struct btrfs_trans_handle *trans,
840 struct btrfs_root *root, struct inode *inode, u64 start,
841 u64 end, int drop_cache)
842{
843 struct btrfs_path *path;
844 int ret;
845
846 path = btrfs_alloc_path();
847 if (!path)
848 return -ENOMEM;
849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
850 drop_cache);
829 btrfs_free_path(path); 851 btrfs_free_path(path);
830 return ret; 852 return ret;
831} 853}
@@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
892 int ret; 914 int ret;
893 u64 ino = btrfs_ino(inode); 915 u64 ino = btrfs_ino(inode);
894 916
895 btrfs_drop_extent_cache(inode, start, end - 1, 0);
896
897 path = btrfs_alloc_path(); 917 path = btrfs_alloc_path();
898 if (!path) 918 if (!path)
899 return -ENOMEM; 919 return -ENOMEM;
@@ -935,12 +955,16 @@ again:
935 btrfs_set_item_key_safe(trans, root, path, &new_key); 955 btrfs_set_item_key_safe(trans, root, path, &new_key);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 956 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 957 struct btrfs_file_extent_item);
958 btrfs_set_file_extent_generation(leaf, fi,
959 trans->transid);
938 btrfs_set_file_extent_num_bytes(leaf, fi, 960 btrfs_set_file_extent_num_bytes(leaf, fi,
939 extent_end - end); 961 extent_end - end);
940 btrfs_set_file_extent_offset(leaf, fi, 962 btrfs_set_file_extent_offset(leaf, fi,
941 end - orig_offset); 963 end - orig_offset);
942 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
943 struct btrfs_file_extent_item); 965 struct btrfs_file_extent_item);
966 btrfs_set_file_extent_generation(leaf, fi,
967 trans->transid);
944 btrfs_set_file_extent_num_bytes(leaf, fi, 968 btrfs_set_file_extent_num_bytes(leaf, fi,
945 end - other_start); 969 end - other_start);
946 btrfs_mark_buffer_dirty(leaf); 970 btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +982,16 @@ again:
958 struct btrfs_file_extent_item); 982 struct btrfs_file_extent_item);
959 btrfs_set_file_extent_num_bytes(leaf, fi, 983 btrfs_set_file_extent_num_bytes(leaf, fi,
960 start - key.offset); 984 start - key.offset);
985 btrfs_set_file_extent_generation(leaf, fi,
986 trans->transid);
961 path->slots[0]++; 987 path->slots[0]++;
962 new_key.offset = start; 988 new_key.offset = start;
963 btrfs_set_item_key_safe(trans, root, path, &new_key); 989 btrfs_set_item_key_safe(trans, root, path, &new_key);
964 990
965 fi = btrfs_item_ptr(leaf, path->slots[0], 991 fi = btrfs_item_ptr(leaf, path->slots[0],
966 struct btrfs_file_extent_item); 992 struct btrfs_file_extent_item);
993 btrfs_set_file_extent_generation(leaf, fi,
994 trans->transid);
967 btrfs_set_file_extent_num_bytes(leaf, fi, 995 btrfs_set_file_extent_num_bytes(leaf, fi,
968 other_end - start); 996 other_end - start);
969 btrfs_set_file_extent_offset(leaf, fi, 997 btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1019,14 @@ again:
991 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
992 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
993 struct btrfs_file_extent_item); 1021 struct btrfs_file_extent_item);
1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
994 btrfs_set_file_extent_num_bytes(leaf, fi, 1023 btrfs_set_file_extent_num_bytes(leaf, fi,
995 split - key.offset); 1024 split - key.offset);
996 1025
997 fi = btrfs_item_ptr(leaf, path->slots[0], 1026 fi = btrfs_item_ptr(leaf, path->slots[0],
998 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
999 1028
1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1000 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1001 btrfs_set_file_extent_num_bytes(leaf, fi, 1031 btrfs_set_file_extent_num_bytes(leaf, fi,
1002 extent_end - split); 1032 extent_end - split);
@@ -1056,12 +1086,14 @@ again:
1056 struct btrfs_file_extent_item); 1086 struct btrfs_file_extent_item);
1057 btrfs_set_file_extent_type(leaf, fi, 1087 btrfs_set_file_extent_type(leaf, fi,
1058 BTRFS_FILE_EXTENT_REG); 1088 BTRFS_FILE_EXTENT_REG);
1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1059 btrfs_mark_buffer_dirty(leaf); 1090 btrfs_mark_buffer_dirty(leaf);
1060 } else { 1091 } else {
1061 fi = btrfs_item_ptr(leaf, del_slot - 1, 1092 fi = btrfs_item_ptr(leaf, del_slot - 1,
1062 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
1063 btrfs_set_file_extent_type(leaf, fi, 1094 btrfs_set_file_extent_type(leaf, fi,
1064 BTRFS_FILE_EXTENT_REG); 1095 BTRFS_FILE_EXTENT_REG);
1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1065 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1066 extent_end - key.offset); 1098 extent_end - key.offset);
1067 btrfs_mark_buffer_dirty(leaf); 1099 btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1205,8 @@ again:
1173 1205
1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1175 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1176 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1177 GFP_NOFS); 1209 0, 0, &cached_state, GFP_NOFS);
1178 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1179 start_pos, last_pos - 1, &cached_state, 1211 start_pos, last_pos - 1, &cached_state,
1180 GFP_NOFS); 1212 GFP_NOFS);
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1514 1546
1515 trace_btrfs_sync_file(file, datasync); 1547 trace_btrfs_sync_file(file, datasync);
1516 1548
1549 /*
1550 * We write the dirty pages in the range and wait until they complete
1551 * out of the ->i_mutex. If so, we can flush the dirty pages by
1552 * multi-task, and make the performance up.
1553 */
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1555 if (ret)
1556 return ret;
1557
1517 mutex_lock(&inode->i_mutex); 1558 mutex_lock(&inode->i_mutex);
1518 1559
1519 /* 1560 /*
1520 * we wait first, since the writeback may change the inode, also wait 1561 * We flush the dirty pages again to avoid some dirty pages in the
1521 * ordered range does a filemape_write_and_wait_range which is why we 1562 * range being left.
1522 * don't do it above like other file systems.
1523 */ 1563 */
1524 root->log_batch++; 1564 atomic_inc(&root->log_batch);
1525 btrfs_wait_ordered_range(inode, start, end); 1565 btrfs_wait_ordered_range(inode, start, end);
1526 root->log_batch++; 1566 atomic_inc(&root->log_batch);
1527 1567
1528 /* 1568 /*
1529 * check the transaction that last modified this inode 1569 * check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1544 BTRFS_I(inode)->last_trans <= 1584 BTRFS_I(inode)->last_trans <=
1545 root->fs_info->last_trans_committed) { 1585 root->fs_info->last_trans_committed) {
1546 BTRFS_I(inode)->last_trans = 0; 1586 BTRFS_I(inode)->last_trans = 0;
1587
1588 /*
1589 * We'v had everything committed since the last time we were
1590 * modified so clear this flag in case it was set for whatever
1591 * reason, it's no longer relevant.
1592 */
1593 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1594 &BTRFS_I(inode)->runtime_flags);
1547 mutex_unlock(&inode->i_mutex); 1595 mutex_unlock(&inode->i_mutex);
1548 goto out; 1596 goto out;
1549 } 1597 }
@@ -1615,6 +1663,324 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1615 return 0; 1663 return 0;
1616} 1664}
1617 1665
1666static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1667 int slot, u64 start, u64 end)
1668{
1669 struct btrfs_file_extent_item *fi;
1670 struct btrfs_key key;
1671
1672 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1673 return 0;
1674
1675 btrfs_item_key_to_cpu(leaf, &key, slot);
1676 if (key.objectid != btrfs_ino(inode) ||
1677 key.type != BTRFS_EXTENT_DATA_KEY)
1678 return 0;
1679
1680 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1681
1682 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1683 return 0;
1684
1685 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1686 return 0;
1687
1688 if (key.offset == end)
1689 return 1;
1690 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1691 return 1;
1692 return 0;
1693}
1694
1695static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1696 struct btrfs_path *path, u64 offset, u64 end)
1697{
1698 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct extent_buffer *leaf;
1700 struct btrfs_file_extent_item *fi;
1701 struct extent_map *hole_em;
1702 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1703 struct btrfs_key key;
1704 int ret;
1705
1706 key.objectid = btrfs_ino(inode);
1707 key.type = BTRFS_EXTENT_DATA_KEY;
1708 key.offset = offset;
1709
1710
1711 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1712 if (ret < 0)
1713 return ret;
1714 BUG_ON(!ret);
1715
1716 leaf = path->nodes[0];
1717 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1718 u64 num_bytes;
1719
1720 path->slots[0]--;
1721 fi = btrfs_item_ptr(leaf, path->slots[0],
1722 struct btrfs_file_extent_item);
1723 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1724 end - offset;
1725 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1726 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1727 btrfs_set_file_extent_offset(leaf, fi, 0);
1728 btrfs_mark_buffer_dirty(leaf);
1729 goto out;
1730 }
1731
1732 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1733 u64 num_bytes;
1734
1735 path->slots[0]++;
1736 key.offset = offset;
1737 btrfs_set_item_key_safe(trans, root, path, &key);
1738 fi = btrfs_item_ptr(leaf, path->slots[0],
1739 struct btrfs_file_extent_item);
1740 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1741 offset;
1742 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1743 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1744 btrfs_set_file_extent_offset(leaf, fi, 0);
1745 btrfs_mark_buffer_dirty(leaf);
1746 goto out;
1747 }
1748 btrfs_release_path(path);
1749
1750 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1751 0, 0, end - offset, 0, end - offset,
1752 0, 0, 0);
1753 if (ret)
1754 return ret;
1755
1756out:
1757 btrfs_release_path(path);
1758
1759 hole_em = alloc_extent_map();
1760 if (!hole_em) {
1761 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1762 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1763 &BTRFS_I(inode)->runtime_flags);
1764 } else {
1765 hole_em->start = offset;
1766 hole_em->len = end - offset;
1767 hole_em->orig_start = offset;
1768
1769 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid;
1774
1775 do {
1776 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1777 write_lock(&em_tree->lock);
1778 ret = add_extent_mapping(em_tree, hole_em);
1779 if (!ret)
1780 list_move(&hole_em->list,
1781 &em_tree->modified_extents);
1782 write_unlock(&em_tree->lock);
1783 } while (ret == -EEXIST);
1784 free_extent_map(hole_em);
1785 if (ret)
1786 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1787 &BTRFS_I(inode)->runtime_flags);
1788 }
1789
1790 return 0;
1791}
1792
1793static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1794{
1795 struct btrfs_root *root = BTRFS_I(inode)->root;
1796 struct extent_state *cached_state = NULL;
1797 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1801 u64 lockstart = (offset + mask) & ~mask;
1802 u64 lockend = ((offset + len) & ~mask) - 1;
1803 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0;
1808 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT);
1811
1812 btrfs_wait_ordered_range(inode, offset, len);
1813
1814 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) {
1816 mutex_unlock(&inode->i_mutex);
1817 return 0;
1818 }
1819
1820 /*
1821 * Only do this if we are in the same page and we aren't doing the
1822 * entire page.
1823 */
1824 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex);
1827 return ret;
1828 }
1829
1830 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0);
1832 if (ret) {
1833 mutex_unlock(&inode->i_mutex);
1834 return ret;
1835 }
1836
1837 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1839 if (ret) {
1840 mutex_unlock(&inode->i_mutex);
1841 return ret;
1842 }
1843
1844 if (lockend < lockstart) {
1845 mutex_unlock(&inode->i_mutex);
1846 return 0;
1847 }
1848
1849 while (1) {
1850 struct btrfs_ordered_extent *ordered;
1851
1852 truncate_pagecache_range(inode, lockstart, lockend);
1853
1854 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1855 0, &cached_state);
1856 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1857
1858 /*
1859 * We need to make sure we have no ordered extents in this range
1860 * and nobody raced in and read a page in this range, if we did
1861 * we need to try again.
1862 */
1863 if ((!ordered ||
1864 (ordered->file_offset + ordered->len < lockstart ||
1865 ordered->file_offset > lockend)) &&
1866 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1867 lockend, EXTENT_UPTODATE, 0,
1868 cached_state)) {
1869 if (ordered)
1870 btrfs_put_ordered_extent(ordered);
1871 break;
1872 }
1873 if (ordered)
1874 btrfs_put_ordered_extent(ordered);
1875 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1876 lockend, &cached_state, GFP_NOFS);
1877 btrfs_wait_ordered_range(inode, lockstart,
1878 lockend - lockstart + 1);
1879 }
1880
1881 path = btrfs_alloc_path();
1882 if (!path) {
1883 ret = -ENOMEM;
1884 goto out;
1885 }
1886
1887 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1888 if (!rsv) {
1889 ret = -ENOMEM;
1890 goto out_free;
1891 }
1892 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1893 rsv->failfast = 1;
1894
1895 /*
1896 * 1 - update the inode
1897 * 1 - removing the extents in the range
1898 * 1 - adding the hole extent
1899 */
1900 trans = btrfs_start_transaction(root, 3);
1901 if (IS_ERR(trans)) {
1902 err = PTR_ERR(trans);
1903 goto out_free;
1904 }
1905
1906 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
1907 min_size);
1908 BUG_ON(ret);
1909 trans->block_rsv = rsv;
1910
1911 while (cur_offset < lockend) {
1912 ret = __btrfs_drop_extents(trans, root, inode, path,
1913 cur_offset, lockend + 1,
1914 &drop_end, 1);
1915 if (ret != -ENOSPC)
1916 break;
1917
1918 trans->block_rsv = &root->fs_info->trans_block_rsv;
1919
1920 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1921 if (ret) {
1922 err = ret;
1923 break;
1924 }
1925
1926 cur_offset = drop_end;
1927
1928 ret = btrfs_update_inode(trans, root, inode);
1929 if (ret) {
1930 err = ret;
1931 break;
1932 }
1933
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr);
1937
1938 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) {
1940 ret = PTR_ERR(trans);
1941 trans = NULL;
1942 break;
1943 }
1944
1945 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
1946 rsv, min_size);
1947 BUG_ON(ret); /* shouldn't happen */
1948 trans->block_rsv = rsv;
1949 }
1950
1951 if (ret) {
1952 err = ret;
1953 goto out_trans;
1954 }
1955
1956 trans->block_rsv = &root->fs_info->trans_block_rsv;
1957 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1958 if (ret) {
1959 err = ret;
1960 goto out_trans;
1961 }
1962
1963out_trans:
1964 if (!trans)
1965 goto out_free;
1966
1967 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr);
1972out_free:
1973 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv);
1975out:
1976 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1977 &cached_state, GFP_NOFS);
1978 mutex_unlock(&inode->i_mutex);
1979 if (ret && !err)
1980 err = ret;
1981 return err;
1982}
1983
1618static long btrfs_fallocate(struct file *file, int mode, 1984static long btrfs_fallocate(struct file *file, int mode,
1619 loff_t offset, loff_t len) 1985 loff_t offset, loff_t len)
1620{ 1986{
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
1633 alloc_start = offset & ~mask; 1999 alloc_start = offset & ~mask;
1634 alloc_end = (offset + len + mask) & ~mask; 2000 alloc_end = (offset + len + mask) & ~mask;
1635 2001
1636 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 2002 /* Make sure we aren't being give some crap mode */
1637 if (mode & ~FALLOC_FL_KEEP_SIZE) 2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1638 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
1639 2005
2006 if (mode & FALLOC_FL_PUNCH_HOLE)
2007 return btrfs_punch_hole(inode, offset, len);
2008
1640 /* 2009 /*
1641 * Make sure we have enough space before we do the 2010 * Make sure we have enough space before we do the
1642 * allocation. 2011 * allocation.
1643 */ 2012 */
1644 ret = btrfs_check_data_free_space(inode, len); 2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
1645 if (ret) 2014 if (ret)
1646 return ret; 2015 return ret;
1647 2016
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
1748out: 2117out:
1749 mutex_unlock(&inode->i_mutex); 2118 mutex_unlock(&inode->i_mutex);
1750 /* Let go of our reservation. */ 2119 /* Let go of our reservation. */
1751 btrfs_free_reserved_data_space(inode, len); 2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
1752 return ret; 2121 return ret;
1753} 2122}
1754 2123
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5c..1027b854b90c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
966 block_group->key.offset)) { 966 block_group->key.offset)) {
967 ret = find_first_extent_bit(unpin, start, 967 ret = find_first_extent_bit(unpin, start,
968 &extent_start, &extent_end, 968 &extent_start, &extent_end,
969 EXTENT_DIRTY); 969 EXTENT_DIRTY, NULL);
970 if (ret) { 970 if (ret) {
971 ret = 0; 971 ret = 0;
972 break; 972 break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1454 max_t(u64, *offset, bitmap_info->offset)); 1454 max_t(u64, *offset, bitmap_info->offset));
1455 bits = bytes_to_bits(*bytes, ctl->unit); 1455 bits = bytes_to_bits(*bytes, ctl->unit);
1456 1456
1457 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1457 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1458 i < BITS_PER_BITMAP;
1459 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
1460 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1458 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1461 BITS_PER_BITMAP, i); 1459 BITS_PER_BITMAP, i);
1462 if ((next_zero - i) >= bits) { 1460 if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2307 2305
2308again: 2306again:
2309 found_bits = 0; 2307 found_bits = 0;
2310 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); 2308 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2311 i < BITS_PER_BITMAP;
2312 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2313 next_zero = find_next_zero_bit(entry->bitmap, 2309 next_zero = find_next_zero_bit(entry->bitmap,
2314 BITS_PER_BITMAP, i); 2310 BITS_PER_BITMAP, i);
2315 if (next_zero - i >= min_bits) { 2311 if (next_zero - i >= min_bits) {
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b99..1d982812ab67 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27
28/*
29 * Figure the key offset of an extended inode ref
30 */
31static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
32 int len)
33{
34 return (u64) crc32c(parent_objectid, name, len);
35}
36
27#endif 37#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c73..48b8fda93132 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
18 18
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "hash.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
50 return 0; 51 return 0;
51} 52}
52 53
53struct btrfs_inode_ref * 54int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
55 const char *name, int name_len,
56 struct btrfs_inode_extref **extref_ret)
57{
58 struct extent_buffer *leaf;
59 struct btrfs_inode_extref *extref;
60 unsigned long ptr;
61 unsigned long name_ptr;
62 u32 item_size;
63 u32 cur_offset = 0;
64 int ref_name_len;
65
66 leaf = path->nodes[0];
67 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
68 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
69
70 /*
71 * Search all extended backrefs in this item. We're only
72 * looking through any collisions so most of the time this is
73 * just going to compare against one buffer. If all is well,
74 * we'll return success and the inode ref object.
75 */
76 while (cur_offset < item_size) {
77 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
78 name_ptr = (unsigned long)(&extref->name);
79 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
80
81 if (ref_name_len == name_len &&
82 btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
83 (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
84 if (extref_ret)
85 *extref_ret = extref;
86 return 1;
87 }
88
89 cur_offset += ref_name_len + sizeof(*extref);
90 }
91 return 0;
92}
93
94static struct btrfs_inode_ref *
54btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 95btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, 96 struct btrfs_root *root,
56 struct btrfs_path *path, 97 struct btrfs_path *path,
57 const char *name, int name_len, 98 const char *name, int name_len,
58 u64 inode_objectid, u64 ref_objectid, int mod) 99 u64 inode_objectid, u64 ref_objectid, int ins_len,
100 int cow)
59{ 101{
102 int ret;
60 struct btrfs_key key; 103 struct btrfs_key key;
61 struct btrfs_inode_ref *ref; 104 struct btrfs_inode_ref *ref;
62 int ins_len = mod < 0 ? -1 : 0;
63 int cow = mod != 0;
64 int ret;
65 105
66 key.objectid = inode_objectid; 106 key.objectid = inode_objectid;
67 key.type = BTRFS_INODE_REF_KEY; 107 key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
77 return ref; 117 return ref;
78} 118}
79 119
80int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 120/* Returns NULL if no extref found */
121struct btrfs_inode_extref *
122btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
123 struct btrfs_root *root,
124 struct btrfs_path *path,
125 const char *name, int name_len,
126 u64 inode_objectid, u64 ref_objectid, int ins_len,
127 int cow)
128{
129 int ret;
130 struct btrfs_key key;
131 struct btrfs_inode_extref *extref;
132
133 key.objectid = inode_objectid;
134 key.type = BTRFS_INODE_EXTREF_KEY;
135 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
136
137 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
138 if (ret < 0)
139 return ERR_PTR(ret);
140 if (ret > 0)
141 return NULL;
142 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
143 return NULL;
144 return extref;
145}
146
147int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
148 struct btrfs_root *root,
149 struct btrfs_path *path,
150 const char *name, int name_len,
151 u64 inode_objectid, u64 ref_objectid, int mod,
152 u64 *ret_index)
153{
154 struct btrfs_inode_ref *ref;
155 struct btrfs_inode_extref *extref;
156 int ins_len = mod < 0 ? -1 : 0;
157 int cow = mod != 0;
158
159 ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
160 inode_objectid, ref_objectid, ins_len,
161 cow);
162 if (IS_ERR(ref))
163 return PTR_ERR(ref);
164
165 if (ref != NULL) {
166 *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
167 return 0;
168 }
169
170 btrfs_release_path(path);
171
172 extref = btrfs_lookup_inode_extref(trans, root, path, name,
173 name_len, inode_objectid,
174 ref_objectid, ins_len, cow);
175 if (IS_ERR(extref))
176 return PTR_ERR(extref);
177
178 if (extref) {
179 *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
180 return 0;
181 }
182
183 return -ENOENT;
184}
185
186int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root, 187 struct btrfs_root *root,
82 const char *name, int name_len, 188 const char *name, int name_len,
83 u64 inode_objectid, u64 ref_objectid, u64 *index) 189 u64 inode_objectid, u64 ref_objectid, u64 *index)
84{ 190{
85 struct btrfs_path *path; 191 struct btrfs_path *path;
86 struct btrfs_key key; 192 struct btrfs_key key;
193 struct btrfs_inode_extref *extref;
194 struct extent_buffer *leaf;
195 int ret;
196 int del_len = name_len + sizeof(*extref);
197 unsigned long ptr;
198 unsigned long item_start;
199 u32 item_size;
200
201 key.objectid = inode_objectid;
202 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
203 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
204
205 path = btrfs_alloc_path();
206 if (!path)
207 return -ENOMEM;
208
209 path->leave_spinning = 1;
210
211 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
212 if (ret > 0)
213 ret = -ENOENT;
214 if (ret < 0)
215 goto out;
216
217 /*
218 * Sanity check - did we find the right item for this name?
219 * This should always succeed so error here will make the FS
220 * readonly.
221 */
222 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
223 name, name_len, &extref)) {
224 btrfs_std_error(root->fs_info, -ENOENT);
225 ret = -EROFS;
226 goto out;
227 }
228
229 leaf = path->nodes[0];
230 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
231 if (index)
232 *index = btrfs_inode_extref_index(leaf, extref);
233
234 if (del_len == item_size) {
235 /*
236 * Common case only one ref in the item, remove the
237 * whole item.
238 */
239 ret = btrfs_del_item(trans, root, path);
240 goto out;
241 }
242
243 ptr = (unsigned long)extref;
244 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
245
246 memmove_extent_buffer(leaf, ptr, ptr + del_len,
247 item_size - (ptr + del_len - item_start));
248
249 btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
250
251out:
252 btrfs_free_path(path);
253
254 return ret;
255}
256
257int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
258 struct btrfs_root *root,
259 const char *name, int name_len,
260 u64 inode_objectid, u64 ref_objectid, u64 *index)
261{
262 struct btrfs_path *path;
263 struct btrfs_key key;
87 struct btrfs_inode_ref *ref; 264 struct btrfs_inode_ref *ref;
88 struct extent_buffer *leaf; 265 struct extent_buffer *leaf;
89 unsigned long ptr; 266 unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
91 u32 item_size; 268 u32 item_size;
92 u32 sub_item_len; 269 u32 sub_item_len;
93 int ret; 270 int ret;
271 int search_ext_refs = 0;
94 int del_len = name_len + sizeof(*ref); 272 int del_len = name_len + sizeof(*ref);
95 273
96 key.objectid = inode_objectid; 274 key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
106 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 284 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
107 if (ret > 0) { 285 if (ret > 0) {
108 ret = -ENOENT; 286 ret = -ENOENT;
287 search_ext_refs = 1;
109 goto out; 288 goto out;
110 } else if (ret < 0) { 289 } else if (ret < 0) {
111 goto out; 290 goto out;
112 } 291 }
113 if (!find_name_in_backref(path, name, name_len, &ref)) { 292 if (!find_name_in_backref(path, name, name_len, &ref)) {
114 ret = -ENOENT; 293 ret = -ENOENT;
294 search_ext_refs = 1;
115 goto out; 295 goto out;
116 } 296 }
117 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
129 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); 309 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
130 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, 310 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
131 item_size - (ptr + sub_item_len - item_start)); 311 item_size - (ptr + sub_item_len - item_start));
132 btrfs_truncate_item(trans, root, path, 312 btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
133 item_size - sub_item_len, 1); 313out:
314 btrfs_free_path(path);
315
316 if (search_ext_refs) {
317 /*
318 * No refs were found, or we could not find the
319 * name in our ref array. Find and remove the extended
320 * inode ref then.
321 */
322 return btrfs_del_inode_extref(trans, root, name, name_len,
323 inode_objectid, ref_objectid, index);
324 }
325
326 return ret;
327}
328
329/*
330 * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
331 *
332 * The caller must have checked against BTRFS_LINK_MAX already.
333 */
334static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
335 struct btrfs_root *root,
336 const char *name, int name_len,
337 u64 inode_objectid, u64 ref_objectid, u64 index)
338{
339 struct btrfs_inode_extref *extref;
340 int ret;
341 int ins_len = name_len + sizeof(*extref);
342 unsigned long ptr;
343 struct btrfs_path *path;
344 struct btrfs_key key;
345 struct extent_buffer *leaf;
346 struct btrfs_item *item;
347
348 key.objectid = inode_objectid;
349 key.type = BTRFS_INODE_EXTREF_KEY;
350 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
351
352 path = btrfs_alloc_path();
353 if (!path)
354 return -ENOMEM;
355
356 path->leave_spinning = 1;
357 ret = btrfs_insert_empty_item(trans, root, path, &key,
358 ins_len);
359 if (ret == -EEXIST) {
360 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
361 name, name_len, NULL))
362 goto out;
363
364 btrfs_extend_item(trans, root, path, ins_len);
365 ret = 0;
366 }
367 if (ret < 0)
368 goto out;
369
370 leaf = path->nodes[0];
371 item = btrfs_item_nr(leaf, path->slots[0]);
372 ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
373 ptr += btrfs_item_size(leaf, item) - ins_len;
374 extref = (struct btrfs_inode_extref *)ptr;
375
376 btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
377 btrfs_set_inode_extref_index(path->nodes[0], extref, index);
378 btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
379
380 ptr = (unsigned long)&extref->name;
381 write_extent_buffer(path->nodes[0], name, ptr, name_len);
382 btrfs_mark_buffer_dirty(path->nodes[0]);
383
134out: 384out:
135 btrfs_free_path(path); 385 btrfs_free_path(path);
136 return ret; 386 return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
191 441
192out: 442out:
193 btrfs_free_path(path); 443 btrfs_free_path(path);
444
445 if (ret == -EMLINK) {
446 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
447 /* We ran out of space in the ref array. Need to
448 * add an extended ref. */
449 if (btrfs_super_incompat_flags(disk_super)
450 & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
451 ret = btrfs_insert_inode_extref(trans, root, name,
452 name_len,
453 inode_objectid,
454 ref_objectid, index);
455 }
456
194 return ret; 457 return ret;
195} 458}
196 459
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6ed6944e50c..85a1e5053fe6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
230 u64 inline_len = actual_end - start; 230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) & 231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1); 232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len; 233 u64 data_len = inline_len;
235 int ret; 234 int ret;
236 235
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
247 return 1; 246 return 1;
248 } 247 }
249 248
250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 249 ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
251 &hint_byte, 1);
252 if (ret) 250 if (ret)
253 return ret; 251 return ret;
254 252
@@ -664,7 +662,7 @@ retry:
664 async_extent->compressed_size, 662 async_extent->compressed_size,
665 async_extent->compressed_size, 663 async_extent->compressed_size,
666 0, alloc_hint, &ins, 1); 664 0, alloc_hint, &ins, 1);
667 if (ret) 665 if (ret && ret != -ENOSPC)
668 btrfs_abort_transaction(trans, root, ret); 666 btrfs_abort_transaction(trans, root, ret);
669 btrfs_end_transaction(trans, root); 667 btrfs_end_transaction(trans, root);
670 } 668 }
@@ -1308,6 +1306,7 @@ out_check:
1308 em->block_start = disk_bytenr; 1306 em->block_start = disk_bytenr;
1309 em->bdev = root->fs_info->fs_devices->latest_bdev; 1307 em->bdev = root->fs_info->fs_devices->latest_bdev;
1310 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1308 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1309 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
1311 while (1) { 1310 while (1) {
1312 write_lock(&em_tree->lock); 1311 write_lock(&em_tree->lock);
1313 ret = add_extent_mapping(em_tree, em); 1312 ret = add_extent_mapping(em_tree, em);
@@ -1364,11 +1363,7 @@ out_check:
1364 } 1363 }
1365 1364
1366error: 1365error:
1367 if (nolock) { 1366 err = btrfs_end_transaction(trans, root);
1368 err = btrfs_end_transaction_nolock(trans, root);
1369 } else {
1370 err = btrfs_end_transaction(trans, root);
1371 }
1372 if (!ret) 1367 if (!ret)
1373 ret = err; 1368 ret = err;
1374 1369
@@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1785 struct btrfs_path *path; 1780 struct btrfs_path *path;
1786 struct extent_buffer *leaf; 1781 struct extent_buffer *leaf;
1787 struct btrfs_key ins; 1782 struct btrfs_key ins;
1788 u64 hint;
1789 int ret; 1783 int ret;
1790 1784
1791 path = btrfs_alloc_path(); 1785 path = btrfs_alloc_path();
@@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1803 * the caller is expected to unpin it and allow it to be merged 1797 * the caller is expected to unpin it and allow it to be merged
1804 * with the others. 1798 * with the others.
1805 */ 1799 */
1806 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, 1800 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1807 &hint, 0); 1801 file_pos + num_bytes, 0);
1808 if (ret) 1802 if (ret)
1809 goto out; 1803 goto out;
1810 1804
@@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1828 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1822 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1829 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1823 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1830 1824
1831 btrfs_unlock_up_safe(path, 1);
1832 btrfs_set_lock_blocking(leaf);
1833
1834 btrfs_mark_buffer_dirty(leaf); 1825 btrfs_mark_buffer_dirty(leaf);
1826 btrfs_release_path(path);
1835 1827
1836 inode_add_bytes(inode, num_bytes); 1828 inode_add_bytes(inode, num_bytes);
1837 1829
@@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1929 ordered_extent->len, 1921 ordered_extent->len,
1930 compress_type, 0, 0, 1922 compress_type, 0, 0,
1931 BTRFS_FILE_EXTENT_REG); 1923 BTRFS_FILE_EXTENT_REG);
1932 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1933 ordered_extent->file_offset,
1934 ordered_extent->len);
1935 } 1924 }
1936 1925 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1926 ordered_extent->file_offset, ordered_extent->len,
1927 trans->transid);
1937 if (ret < 0) { 1928 if (ret < 0) {
1938 btrfs_abort_transaction(trans, root, ret); 1929 btrfs_abort_transaction(trans, root, ret);
1939 goto out_unlock; 1930 goto out_unlock;
@@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1949 btrfs_abort_transaction(trans, root, ret); 1940 btrfs_abort_transaction(trans, root, ret);
1950 goto out_unlock; 1941 goto out_unlock;
1951 } 1942 }
1943 } else {
1944 btrfs_set_inode_last_trans(trans, inode);
1952 } 1945 }
1953 ret = 0; 1946 ret = 0;
1954out_unlock: 1947out_unlock:
@@ -1958,12 +1951,8 @@ out_unlock:
1958out: 1951out:
1959 if (root != root->fs_info->tree_root) 1952 if (root != root->fs_info->tree_root)
1960 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1953 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1961 if (trans) { 1954 if (trans)
1962 if (nolock) 1955 btrfs_end_transaction(trans, root);
1963 btrfs_end_transaction_nolock(trans, root);
1964 else
1965 btrfs_end_transaction(trans, root);
1966 }
1967 1956
1968 if (ret) 1957 if (ret)
1969 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1958 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2119 if (empty) 2108 if (empty)
2120 return; 2109 return;
2121 2110
2122 down_read(&root->fs_info->cleanup_work_sem);
2123 spin_lock(&fs_info->delayed_iput_lock); 2111 spin_lock(&fs_info->delayed_iput_lock);
2124 list_splice_init(&fs_info->delayed_iputs, &list); 2112 list_splice_init(&fs_info->delayed_iputs, &list);
2125 spin_unlock(&fs_info->delayed_iput_lock); 2113 spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2130 iput(delayed->inode); 2118 iput(delayed->inode);
2131 kfree(delayed); 2119 kfree(delayed);
2132 } 2120 }
2133 up_read(&root->fs_info->cleanup_work_sem);
2134} 2121}
2135 2122
2136enum btrfs_orphan_cleanup_state { 2123enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2198 int ret; 2185 int ret;
2199 2186
2200 if (!root->orphan_block_rsv) { 2187 if (!root->orphan_block_rsv) {
2201 block_rsv = btrfs_alloc_block_rsv(root); 2188 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2202 if (!block_rsv) 2189 if (!block_rsv)
2203 return -ENOMEM; 2190 return -ENOMEM;
2204 } 2191 }
@@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2225 insert = 1; 2212 insert = 1;
2226#endif 2213#endif
2227 insert = 1; 2214 insert = 1;
2228 atomic_dec(&root->orphan_inodes); 2215 atomic_inc(&root->orphan_inodes);
2229 } 2216 }
2230 2217
2231 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2218 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
2590 2577
2591 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2578 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2592 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2579 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2580 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
2581
2582 /*
2583 * If we were modified in the current generation and evicted from memory
2584 * and then re-read we need to do a full sync since we don't have any
2585 * idea about which extents were modified before we were evicted from
2586 * cache.
2587 */
2588 if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
2589 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2590 &BTRFS_I(inode)->runtime_flags);
2591
2593 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2592 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2594 inode->i_generation = BTRFS_I(inode)->generation; 2593 inode->i_generation = BTRFS_I(inode)->generation;
2595 inode->i_rdev = 0; 2594 inode->i_rdev = 0;
@@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2894 struct btrfs_trans_handle *trans; 2893 struct btrfs_trans_handle *trans;
2895 struct btrfs_root *root = BTRFS_I(dir)->root; 2894 struct btrfs_root *root = BTRFS_I(dir)->root;
2896 struct btrfs_path *path; 2895 struct btrfs_path *path;
2897 struct btrfs_inode_ref *ref;
2898 struct btrfs_dir_item *di; 2896 struct btrfs_dir_item *di;
2899 struct inode *inode = dentry->d_inode; 2897 struct inode *inode = dentry->d_inode;
2900 u64 index; 2898 u64 index;
@@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3008 } 3006 }
3009 btrfs_release_path(path); 3007 btrfs_release_path(path);
3010 3008
3011 ref = btrfs_lookup_inode_ref(trans, root, path, 3009 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3012 dentry->d_name.name, dentry->d_name.len, 3010 dentry->d_name.len, ino, dir_ino, 0,
3013 ino, dir_ino, 0); 3011 &index);
3014 if (IS_ERR(ref)) { 3012 if (ret) {
3015 err = PTR_ERR(ref); 3013 err = ret;
3016 goto out; 3014 goto out;
3017 } 3015 }
3018 BUG_ON(!ref); /* Logic error */ 3016
3019 if (check_path_shared(root, path)) 3017 if (check_path_shared(root, path))
3020 goto out; 3018 goto out;
3021 index = btrfs_inode_ref_index(path->nodes[0], ref); 3019
3022 btrfs_release_path(path); 3020 btrfs_release_path(path);
3023 3021
3024 /* 3022 /*
@@ -3061,7 +3059,7 @@ out:
3061static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3059static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3062 struct btrfs_root *root) 3060 struct btrfs_root *root)
3063{ 3061{
3064 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3062 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3065 btrfs_block_rsv_release(root, trans->block_rsv, 3063 btrfs_block_rsv_release(root, trans->block_rsv,
3066 trans->bytes_reserved); 3064 trans->bytes_reserved);
3067 trans->block_rsv = &root->fs_info->trans_block_rsv; 3065 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3191 struct btrfs_trans_handle *trans; 3189 struct btrfs_trans_handle *trans;
3192 unsigned long nr = 0; 3190 unsigned long nr = 0;
3193 3191
3194 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3192 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3195 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3196 return -ENOTEMPTY; 3193 return -ENOTEMPTY;
3194 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3195 return -EPERM;
3197 3196
3198 trans = __unlink_start_trans(dir, dentry); 3197 trans = __unlink_start_trans(dir, dentry);
3199 if (IS_ERR(trans)) 3198 if (IS_ERR(trans))
@@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3267 return -ENOMEM; 3266 return -ENOMEM;
3268 path->reada = -1; 3267 path->reada = -1;
3269 3268
3269 /*
3270 * We want to drop from the next block forward in case this new size is
3271 * not block aligned since we will be keeping the last block of the
3272 * extent just the way it is.
3273 */
3270 if (root->ref_cows || root == root->fs_info->tree_root) 3274 if (root->ref_cows || root == root->fs_info->tree_root)
3271 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3275 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
3272 3276
3273 /* 3277 /*
3274 * This function is also used to drop the items in the log tree before 3278 * This function is also used to drop the items in the log tree before
@@ -3429,12 +3433,6 @@ delete:
3429 3433
3430 if (path->slots[0] == 0 || 3434 if (path->slots[0] == 0 ||
3431 path->slots[0] != pending_del_slot) { 3435 path->slots[0] != pending_del_slot) {
3432 if (root->ref_cows &&
3433 BTRFS_I(inode)->location.objectid !=
3434 BTRFS_FREE_INO_OBJECTID) {
3435 err = -EAGAIN;
3436 goto out;
3437 }
3438 if (pending_del_nr) { 3436 if (pending_del_nr) {
3439 ret = btrfs_del_items(trans, root, path, 3437 ret = btrfs_del_items(trans, root, path,
3440 pending_del_slot, 3438 pending_del_slot,
@@ -3465,12 +3463,20 @@ error:
3465} 3463}
3466 3464
3467/* 3465/*
3468 * taken from block_truncate_page, but does cow as it zeros out 3466 * btrfs_truncate_page - read, zero a chunk and write a page
3469 * any bytes left in the last page in the file. 3467 * @inode - inode that we're zeroing
3468 * @from - the offset to start zeroing
3469 * @len - the length to zero, 0 to zero the entire range respective to the
3470 * offset
3471 * @front - zero up to the offset instead of from the offset on
3472 *
3473 * This will find the page for the "from" offset and cow the page and zero the
3474 * part we want to zero. This is used with truncate and hole punching.
3470 */ 3475 */
3471static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3476int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3477 int front)
3472{ 3478{
3473 struct inode *inode = mapping->host; 3479 struct address_space *mapping = inode->i_mapping;
3474 struct btrfs_root *root = BTRFS_I(inode)->root; 3480 struct btrfs_root *root = BTRFS_I(inode)->root;
3475 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3476 struct btrfs_ordered_extent *ordered; 3482 struct btrfs_ordered_extent *ordered;
@@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3485 u64 page_start; 3491 u64 page_start;
3486 u64 page_end; 3492 u64 page_end;
3487 3493
3488 if ((offset & (blocksize - 1)) == 0) 3494 if ((offset & (blocksize - 1)) == 0 &&
3495 (!len || ((len & (blocksize - 1)) == 0)))
3489 goto out; 3496 goto out;
3490 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3497 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3491 if (ret) 3498 if (ret)
@@ -3532,7 +3539,8 @@ again:
3532 } 3539 }
3533 3540
3534 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3541 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3535 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3542 EXTENT_DIRTY | EXTENT_DELALLOC |
3543 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
3536 0, 0, &cached_state, GFP_NOFS); 3544 0, 0, &cached_state, GFP_NOFS);
3537 3545
3538 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3546 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3545,8 +3553,13 @@ again:
3545 3553
3546 ret = 0; 3554 ret = 0;
3547 if (offset != PAGE_CACHE_SIZE) { 3555 if (offset != PAGE_CACHE_SIZE) {
3556 if (!len)
3557 len = PAGE_CACHE_SIZE - offset;
3548 kaddr = kmap(page); 3558 kaddr = kmap(page);
3549 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3559 if (front)
3560 memset(kaddr, 0, offset);
3561 else
3562 memset(kaddr + offset, 0, len);
3550 flush_dcache_page(page); 3563 flush_dcache_page(page);
3551 kunmap(page); 3564 kunmap(page);
3552 } 3565 }
@@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3577 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3590 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3578 struct extent_map *em = NULL; 3591 struct extent_map *em = NULL;
3579 struct extent_state *cached_state = NULL; 3592 struct extent_state *cached_state = NULL;
3593 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3580 u64 mask = root->sectorsize - 1; 3594 u64 mask = root->sectorsize - 1;
3581 u64 hole_start = (oldsize + mask) & ~mask; 3595 u64 hole_start = (oldsize + mask) & ~mask;
3582 u64 block_end = (size + mask) & ~mask; 3596 u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3613 last_byte = min(extent_map_end(em), block_end); 3627 last_byte = min(extent_map_end(em), block_end);
3614 last_byte = (last_byte + mask) & ~mask; 3628 last_byte = (last_byte + mask) & ~mask;
3615 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3629 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3616 u64 hint_byte = 0; 3630 struct extent_map *hole_em;
3617 hole_size = last_byte - cur_offset; 3631 hole_size = last_byte - cur_offset;
3618 3632
3619 trans = btrfs_start_transaction(root, 3); 3633 trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3622 break; 3636 break;
3623 } 3637 }
3624 3638
3625 err = btrfs_drop_extents(trans, inode, cur_offset, 3639 err = btrfs_drop_extents(trans, root, inode,
3626 cur_offset + hole_size, 3640 cur_offset,
3627 &hint_byte, 1); 3641 cur_offset + hole_size, 1);
3628 if (err) { 3642 if (err) {
3629 btrfs_abort_transaction(trans, root, err); 3643 btrfs_abort_transaction(trans, root, err);
3630 btrfs_end_transaction(trans, root); 3644 btrfs_end_transaction(trans, root);
@@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3641 break; 3655 break;
3642 } 3656 }
3643 3657
3644 btrfs_drop_extent_cache(inode, hole_start, 3658 btrfs_drop_extent_cache(inode, cur_offset,
3645 last_byte - 1, 0); 3659 cur_offset + hole_size - 1, 0);
3660 hole_em = alloc_extent_map();
3661 if (!hole_em) {
3662 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3663 &BTRFS_I(inode)->runtime_flags);
3664 goto next;
3665 }
3666 hole_em->start = cur_offset;
3667 hole_em->len = hole_size;
3668 hole_em->orig_start = cur_offset;
3646 3669
3670 hole_em->block_start = EXTENT_MAP_HOLE;
3671 hole_em->block_len = 0;
3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3673 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3674 hole_em->generation = trans->transid;
3675
3676 while (1) {
3677 write_lock(&em_tree->lock);
3678 err = add_extent_mapping(em_tree, hole_em);
3679 if (!err)
3680 list_move(&hole_em->list,
3681 &em_tree->modified_extents);
3682 write_unlock(&em_tree->lock);
3683 if (err != -EEXIST)
3684 break;
3685 btrfs_drop_extent_cache(inode, cur_offset,
3686 cur_offset +
3687 hole_size - 1, 0);
3688 }
3689 free_extent_map(hole_em);
3690next:
3647 btrfs_update_inode(trans, root, inode); 3691 btrfs_update_inode(trans, root, inode);
3648 btrfs_end_transaction(trans, root); 3692 btrfs_end_transaction(trans, root);
3649 } 3693 }
@@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)
3768 goto no_delete; 3812 goto no_delete;
3769 } 3813 }
3770 3814
3771 rsv = btrfs_alloc_block_rsv(root); 3815 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3772 if (!rsv) { 3816 if (!rsv) {
3773 btrfs_orphan_del(NULL, inode); 3817 btrfs_orphan_del(NULL, inode);
3774 goto no_delete; 3818 goto no_delete;
3775 } 3819 }
3776 rsv->size = min_size; 3820 rsv->size = min_size;
3821 rsv->failfast = 1;
3777 global_rsv = &root->fs_info->global_block_rsv; 3822 global_rsv = &root->fs_info->global_block_rsv;
3778 3823
3779 btrfs_i_size_write(inode, 0); 3824 btrfs_i_size_write(inode, 0);
3780 3825
3781 /* 3826 /*
3782 * This is a bit simpler than btrfs_truncate since 3827 * This is a bit simpler than btrfs_truncate since we've already
3783 * 3828 * reserved our space for our orphan item in the unlink, so we just
3784 * 1) We've already reserved our space for our orphan item in the 3829 * need to reserve some slack space in case we add bytes and update
3785 * unlink. 3830 * inode item when doing the truncate.
3786 * 2) We're going to delete the inode item, so we don't need to update
3787 * it at all.
3788 *
3789 * So we just need to reserve some slack space in case we add bytes when
3790 * doing the truncate.
3791 */ 3831 */
3792 while (1) { 3832 while (1) {
3793 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3833 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)
3808 goto no_delete; 3848 goto no_delete;
3809 } 3849 }
3810 3850
3811 trans = btrfs_start_transaction(root, 0); 3851 trans = btrfs_start_transaction_noflush(root, 1);
3812 if (IS_ERR(trans)) { 3852 if (IS_ERR(trans)) {
3813 btrfs_orphan_del(NULL, inode); 3853 btrfs_orphan_del(NULL, inode);
3814 btrfs_free_block_rsv(root, rsv); 3854 btrfs_free_block_rsv(root, rsv);
@@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)
3818 trans->block_rsv = rsv; 3858 trans->block_rsv = rsv;
3819 3859
3820 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3860 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3821 if (ret != -EAGAIN) 3861 if (ret != -ENOSPC)
3822 break; 3862 break;
3823 3863
3864 trans->block_rsv = &root->fs_info->trans_block_rsv;
3865 ret = btrfs_update_inode(trans, root, inode);
3866 BUG_ON(ret);
3867
3824 nr = trans->blocks_used; 3868 nr = trans->blocks_used;
3825 btrfs_end_transaction(trans, root); 3869 btrfs_end_transaction(trans, root);
3826 trans = NULL; 3870 trans = NULL;
@@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4470 trans = btrfs_join_transaction(root); 4514 trans = btrfs_join_transaction(root);
4471 if (IS_ERR(trans)) 4515 if (IS_ERR(trans))
4472 return PTR_ERR(trans); 4516 return PTR_ERR(trans);
4473 if (nolock) 4517 ret = btrfs_commit_transaction(trans, root);
4474 ret = btrfs_end_transaction_nolock(trans, root);
4475 else
4476 ret = btrfs_commit_transaction(trans, root);
4477 } 4518 }
4478 return ret; 4519 return ret;
4479} 4520}
@@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4671 BTRFS_I(inode)->generation = trans->transid; 4712 BTRFS_I(inode)->generation = trans->transid;
4672 inode->i_generation = BTRFS_I(inode)->generation; 4713 inode->i_generation = BTRFS_I(inode)->generation;
4673 4714
4715 /*
4716 * We could have gotten an inode number from somebody who was fsynced
4717 * and then removed in this same transaction, so let's just set full
4718 * sync since it will be a full sync anyway and this will blow away the
4719 * old info in the log.
4720 */
4721 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
4722
4674 if (S_ISDIR(mode)) 4723 if (S_ISDIR(mode))
4675 owner = 0; 4724 owner = 0;
4676 else 4725 else
@@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4680 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4729 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4681 key[0].offset = 0; 4730 key[0].offset = 0;
4682 4731
4732 /*
4733 * Start new inodes with an inode_ref. This is slightly more
4734 * efficient for small numbers of hard links since they will
4735 * be packed into one item. Extended refs will kick in if we
4736 * add more hard links than can fit in the ref item.
4737 */
4683 key[1].objectid = objectid; 4738 key[1].objectid = objectid;
4684 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4739 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4685 key[1].offset = ref_objectid; 4740 key[1].offset = ref_objectid;
@@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4986 if (root->objectid != BTRFS_I(inode)->root->objectid) 5041 if (root->objectid != BTRFS_I(inode)->root->objectid)
4987 return -EXDEV; 5042 return -EXDEV;
4988 5043
4989 if (inode->i_nlink == ~0U) 5044 if (inode->i_nlink >= BTRFS_LINK_MAX)
4990 return -EMLINK; 5045 return -EMLINK;
4991 5046
4992 err = btrfs_set_inode_index(dir, &index); 5047 err = btrfs_set_inode_index(dir, &index);
@@ -5450,7 +5505,8 @@ insert:
5450 write_unlock(&em_tree->lock); 5505 write_unlock(&em_tree->lock);
5451out: 5506out:
5452 5507
5453 trace_btrfs_get_extent(root, em); 5508 if (em)
5509 trace_btrfs_get_extent(root, em);
5454 5510
5455 if (path) 5511 if (path)
5456 btrfs_free_path(path); 5512 btrfs_free_path(path);
@@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5836 return ret; 5892 return ret;
5837} 5893}
5838 5894
5895static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5896 u64 len, u64 orig_start,
5897 u64 block_start, u64 block_len,
5898 int type)
5899{
5900 struct extent_map_tree *em_tree;
5901 struct extent_map *em;
5902 struct btrfs_root *root = BTRFS_I(inode)->root;
5903 int ret;
5904
5905 em_tree = &BTRFS_I(inode)->extent_tree;
5906 em = alloc_extent_map();
5907 if (!em)
5908 return ERR_PTR(-ENOMEM);
5909
5910 em->start = start;
5911 em->orig_start = orig_start;
5912 em->len = len;
5913 em->block_len = block_len;
5914 em->block_start = block_start;
5915 em->bdev = root->fs_info->fs_devices->latest_bdev;
5916 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5917 if (type == BTRFS_ORDERED_PREALLOC)
5918 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5919
5920 do {
5921 btrfs_drop_extent_cache(inode, em->start,
5922 em->start + em->len - 1, 0);
5923 write_lock(&em_tree->lock);
5924 ret = add_extent_mapping(em_tree, em);
5925 write_unlock(&em_tree->lock);
5926 } while (ret == -EEXIST);
5927
5928 if (ret) {
5929 free_extent_map(em);
5930 return ERR_PTR(ret);
5931 }
5932
5933 return em;
5934}
5935
5936
5839static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5937static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5840 struct buffer_head *bh_result, int create) 5938 struct buffer_head *bh_result, int create)
5841{ 5939{
@@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5950 goto must_cow; 6048 goto must_cow;
5951 6049
5952 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6050 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6051 u64 orig_start = em->start;
6052
6053 if (type == BTRFS_ORDERED_PREALLOC) {
6054 free_extent_map(em);
6055 em = create_pinned_em(inode, start, len,
6056 orig_start,
6057 block_start, len, type);
6058 if (IS_ERR(em)) {
6059 btrfs_end_transaction(trans, root);
6060 goto unlock_err;
6061 }
6062 }
6063
5953 ret = btrfs_add_ordered_extent_dio(inode, start, 6064 ret = btrfs_add_ordered_extent_dio(inode, start,
5954 block_start, len, len, type); 6065 block_start, len, len, type);
5955 btrfs_end_transaction(trans, root); 6066 btrfs_end_transaction(trans, root);
@@ -5999,7 +6110,8 @@ unlock:
5999 if (lockstart < lockend) { 6110 if (lockstart < lockend) {
6000 if (create && len < lockend - lockstart) { 6111 if (create && len < lockend - lockstart) {
6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6112 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6002 lockstart + len - 1, unlock_bits, 1, 0, 6113 lockstart + len - 1,
6114 unlock_bits | EXTENT_DEFRAG, 1, 0,
6003 &cached_state, GFP_NOFS); 6115 &cached_state, GFP_NOFS);
6004 /* 6116 /*
6005 * Beside unlock, we also need to cleanup reserved space 6117 * Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6119,8 @@ unlock:
6007 */ 6119 */
6008 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6120 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6009 lockstart + len, lockend, 6121 lockstart + len, lockend,
6010 unlock_bits | EXTENT_DO_ACCOUNTING, 6122 unlock_bits | EXTENT_DO_ACCOUNTING |
6011 1, 0, NULL, GFP_NOFS); 6123 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6012 } else { 6124 } else {
6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6125 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6014 lockend, unlock_bits, 1, 0, 6126 lockend, unlock_bits, 1, 0,
@@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6573 */ 6685 */
6574 clear_extent_bit(tree, page_start, page_end, 6686 clear_extent_bit(tree, page_start, page_end,
6575 EXTENT_DIRTY | EXTENT_DELALLOC | 6687 EXTENT_DIRTY | EXTENT_DELALLOC |
6576 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6688 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
6577 &cached_state, GFP_NOFS); 6689 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
6578 /* 6690 /*
6579 * whoever cleared the private bit is responsible 6691 * whoever cleared the private bit is responsible
6580 * for the finish_ordered_io 6692 * for the finish_ordered_io
@@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6590 } 6702 }
6591 clear_extent_bit(tree, page_start, page_end, 6703 clear_extent_bit(tree, page_start, page_end,
6592 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6704 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6593 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6705 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
6706 &cached_state, GFP_NOFS);
6594 __btrfs_releasepage(page, GFP_NOFS); 6707 __btrfs_releasepage(page, GFP_NOFS);
6595 6708
6596 ClearPageChecked(page); 6709 ClearPageChecked(page);
@@ -6687,7 +6800,8 @@ again:
6687 * prepare_pages in the normal write path. 6800 * prepare_pages in the normal write path.
6688 */ 6801 */
6689 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6802 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6690 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6803 EXTENT_DIRTY | EXTENT_DELALLOC |
6804 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
6691 0, 0, &cached_state, GFP_NOFS); 6805 0, 0, &cached_state, GFP_NOFS);
6692 6806
6693 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6807 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6832,7 @@ again:
6718 6832
6719 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6833 BTRFS_I(inode)->last_trans = root->fs_info->generation;
6720 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6834 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6835 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
6721 6836
6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6837 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6723 6838
@@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)
6745 u64 mask = root->sectorsize - 1; 6860 u64 mask = root->sectorsize - 1;
6746 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6861 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6747 6862
6748 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6863 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
6749 if (ret) 6864 if (ret)
6750 return ret; 6865 return ret;
6751 6866
@@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)
6788 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6903 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
6789 * updating the inode. 6904 * updating the inode.
6790 */ 6905 */
6791 rsv = btrfs_alloc_block_rsv(root); 6906 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
6792 if (!rsv) 6907 if (!rsv)
6793 return -ENOMEM; 6908 return -ENOMEM;
6794 rsv->size = min_size; 6909 rsv->size = min_size;
6910 rsv->failfast = 1;
6795 6911
6796 /* 6912 /*
6797 * 1 for the truncate slack space 6913 * 1 for the truncate slack space
@@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)
6837 &BTRFS_I(inode)->runtime_flags)) 6953 &BTRFS_I(inode)->runtime_flags))
6838 btrfs_add_ordered_operation(trans, root, inode); 6954 btrfs_add_ordered_operation(trans, root, inode);
6839 6955
6840 while (1) { 6956 /*
6841 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6957 * So if we truncate and then write and fsync we normally would just
6842 if (ret) { 6958 * write the extents that changed, which is a problem if we need to
6843 /* 6959 * first truncate that entire inode. So set this flag so we write out
6844 * This can only happen with the original transaction we 6960 * all of the extents in the inode to the sync log so we're completely
6845 * started above, every other time we shouldn't have a 6961 * safe.
6846 * transaction started yet. 6962 */
6847 */ 6963 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6848 if (ret == -EAGAIN) 6964 trans->block_rsv = rsv;
6849 goto end_trans;
6850 err = ret;
6851 break;
6852 }
6853
6854 if (!trans) {
6855 /* Just need the 1 for updating the inode */
6856 trans = btrfs_start_transaction(root, 1);
6857 if (IS_ERR(trans)) {
6858 ret = err = PTR_ERR(trans);
6859 trans = NULL;
6860 break;
6861 }
6862 }
6863
6864 trans->block_rsv = rsv;
6865 6965
6966 while (1) {
6866 ret = btrfs_truncate_inode_items(trans, root, inode, 6967 ret = btrfs_truncate_inode_items(trans, root, inode,
6867 inode->i_size, 6968 inode->i_size,
6868 BTRFS_EXTENT_DATA_KEY); 6969 BTRFS_EXTENT_DATA_KEY);
6869 if (ret != -EAGAIN) { 6970 if (ret != -ENOSPC) {
6870 err = ret; 6971 err = ret;
6871 break; 6972 break;
6872 } 6973 }
@@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)
6877 err = ret; 6978 err = ret;
6878 break; 6979 break;
6879 } 6980 }
6880end_trans: 6981
6881 nr = trans->blocks_used; 6982 nr = trans->blocks_used;
6882 btrfs_end_transaction(trans, root); 6983 btrfs_end_transaction(trans, root);
6883 trans = NULL;
6884 btrfs_btree_balance_dirty(root, nr); 6984 btrfs_btree_balance_dirty(root, nr);
6985
6986 trans = btrfs_start_transaction(root, 2);
6987 if (IS_ERR(trans)) {
6988 ret = err = PTR_ERR(trans);
6989 trans = NULL;
6990 break;
6991 }
6992
6993 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
6994 rsv, min_size);
6995 BUG_ON(ret); /* shouldn't happen */
6996 trans->block_rsv = rsv;
6885 } 6997 }
6886 6998
6887 if (ret == 0 && inode->i_nlink > 0) { 6999 if (ret == 0 && inode->i_nlink > 0) {
@@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6965 ei->csum_bytes = 0; 7077 ei->csum_bytes = 0;
6966 ei->index_cnt = (u64)-1; 7078 ei->index_cnt = (u64)-1;
6967 ei->last_unlink_trans = 0; 7079 ei->last_unlink_trans = 0;
7080 ei->last_log_commit = 0;
6968 7081
6969 spin_lock_init(&ei->lock); 7082 spin_lock_init(&ei->lock);
6970 ei->outstanding_extents = 0; 7083 ei->outstanding_extents = 0;
@@ -7095,31 +7208,31 @@ void btrfs_destroy_cachep(void)
7095 7208
7096int btrfs_init_cachep(void) 7209int btrfs_init_cachep(void)
7097{ 7210{
7098 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7211 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7099 sizeof(struct btrfs_inode), 0, 7212 sizeof(struct btrfs_inode), 0,
7100 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7213 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7101 if (!btrfs_inode_cachep) 7214 if (!btrfs_inode_cachep)
7102 goto fail; 7215 goto fail;
7103 7216
7104 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7217 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7105 sizeof(struct btrfs_trans_handle), 0, 7218 sizeof(struct btrfs_trans_handle), 0,
7106 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7219 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7107 if (!btrfs_trans_handle_cachep) 7220 if (!btrfs_trans_handle_cachep)
7108 goto fail; 7221 goto fail;
7109 7222
7110 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7223 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7111 sizeof(struct btrfs_transaction), 0, 7224 sizeof(struct btrfs_transaction), 0,
7112 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7225 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7113 if (!btrfs_transaction_cachep) 7226 if (!btrfs_transaction_cachep)
7114 goto fail; 7227 goto fail;
7115 7228
7116 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7229 btrfs_path_cachep = kmem_cache_create("btrfs_path",
7117 sizeof(struct btrfs_path), 0, 7230 sizeof(struct btrfs_path), 0,
7118 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7231 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7119 if (!btrfs_path_cachep) 7232 if (!btrfs_path_cachep)
7120 goto fail; 7233 goto fail;
7121 7234
7122 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7235 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7123 sizeof(struct btrfs_free_space), 0, 7236 sizeof(struct btrfs_free_space), 0,
7124 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7237 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7125 if (!btrfs_free_space_cachep) 7238 if (!btrfs_free_space_cachep)
@@ -7513,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7513 loff_t actual_len, u64 *alloc_hint, 7626 loff_t actual_len, u64 *alloc_hint,
7514 struct btrfs_trans_handle *trans) 7627 struct btrfs_trans_handle *trans)
7515{ 7628{
7629 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
7630 struct extent_map *em;
7516 struct btrfs_root *root = BTRFS_I(inode)->root; 7631 struct btrfs_root *root = BTRFS_I(inode)->root;
7517 struct btrfs_key ins; 7632 struct btrfs_key ins;
7518 u64 cur_offset = start; 7633 u64 cur_offset = start;
@@ -7553,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7553 btrfs_drop_extent_cache(inode, cur_offset, 7668 btrfs_drop_extent_cache(inode, cur_offset,
7554 cur_offset + ins.offset -1, 0); 7669 cur_offset + ins.offset -1, 0);
7555 7670
7671 em = alloc_extent_map();
7672 if (!em) {
7673 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
7674 &BTRFS_I(inode)->runtime_flags);
7675 goto next;
7676 }
7677
7678 em->start = cur_offset;
7679 em->orig_start = cur_offset;
7680 em->len = ins.offset;
7681 em->block_start = ins.objectid;
7682 em->block_len = ins.offset;
7683 em->bdev = root->fs_info->fs_devices->latest_bdev;
7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7685 em->generation = trans->transid;
7686
7687 while (1) {
7688 write_lock(&em_tree->lock);
7689 ret = add_extent_mapping(em_tree, em);
7690 if (!ret)
7691 list_move(&em->list,
7692 &em_tree->modified_extents);
7693 write_unlock(&em_tree->lock);
7694 if (ret != -EEXIST)
7695 break;
7696 btrfs_drop_extent_cache(inode, cur_offset,
7697 cur_offset + ins.offset - 1,
7698 0);
7699 }
7700 free_extent_map(em);
7701next:
7556 num_bytes -= ins.offset; 7702 num_bytes -= ins.offset;
7557 cur_offset += ins.offset; 7703 cur_offset += ins.offset;
7558 *alloc_hint = ins.objectid + ins.offset; 7704 *alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47127c1bd290..e568c472f807 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
181 int ret; 181 int ret;
182 u64 ip_oldflags; 182 u64 ip_oldflags;
183 unsigned int i_oldflags; 183 unsigned int i_oldflags;
184 umode_t mode;
184 185
185 if (btrfs_root_readonly(root)) 186 if (btrfs_root_readonly(root))
186 return -EROFS; 187 return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 204
204 ip_oldflags = ip->flags; 205 ip_oldflags = ip->flags;
205 i_oldflags = inode->i_flags; 206 i_oldflags = inode->i_flags;
207 mode = inode->i_mode;
206 208
207 flags = btrfs_mask_flags(inode->i_mode, flags); 209 flags = btrfs_mask_flags(inode->i_mode, flags);
208 oldflags = btrfs_flags_to_ioctl(ip->flags); 210 oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
237 ip->flags |= BTRFS_INODE_DIRSYNC; 239 ip->flags |= BTRFS_INODE_DIRSYNC;
238 else 240 else
239 ip->flags &= ~BTRFS_INODE_DIRSYNC; 241 ip->flags &= ~BTRFS_INODE_DIRSYNC;
240 if (flags & FS_NOCOW_FL) 242 if (flags & FS_NOCOW_FL) {
241 ip->flags |= BTRFS_INODE_NODATACOW; 243 if (S_ISREG(mode)) {
242 else 244 /*
243 ip->flags &= ~BTRFS_INODE_NODATACOW; 245 * It's safe to turn csums off here, no extents exist.
246 * Otherwise we want the flag to reflect the real COW
247 * status of the file and will not set it.
248 */
249 if (inode->i_size == 0)
250 ip->flags |= BTRFS_INODE_NODATACOW
251 | BTRFS_INODE_NODATASUM;
252 } else {
253 ip->flags |= BTRFS_INODE_NODATACOW;
254 }
255 } else {
256 /*
257 * Revert back under same assuptions as above
258 */
259 if (S_ISREG(mode)) {
260 if (inode->i_size == 0)
261 ip->flags &= ~(BTRFS_INODE_NODATACOW
262 | BTRFS_INODE_NODATASUM);
263 } else {
264 ip->flags &= ~BTRFS_INODE_NODATACOW;
265 }
266 }
244 267
245 /* 268 /*
246 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 269 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
@@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
516 if (!pending_snapshot) 539 if (!pending_snapshot)
517 return -ENOMEM; 540 return -ENOMEM;
518 541
519 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 542 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
543 BTRFS_BLOCK_RSV_TEMP);
520 pending_snapshot->dentry = dentry; 544 pending_snapshot->dentry = dentry;
521 pending_snapshot->root = root; 545 pending_snapshot->root = root;
522 pending_snapshot->readonly = readonly; 546 pending_snapshot->readonly = readonly;
@@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
525 *inherit = NULL; /* take responsibility to free it */ 549 *inherit = NULL; /* take responsibility to free it */
526 } 550 }
527 551
528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 552 trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
529 if (IS_ERR(trans)) { 553 if (IS_ERR(trans)) {
530 ret = PTR_ERR(trans); 554 ret = PTR_ERR(trans);
531 goto fail; 555 goto fail;
@@ -1022,8 +1046,8 @@ again:
1022 page_start, page_end - 1, 0, &cached_state); 1046 page_start, page_end - 1, 0, &cached_state);
1023 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1047 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1024 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1048 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1025 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1049 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1026 GFP_NOFS); 1050 &cached_state, GFP_NOFS);
1027 1051
1028 if (i_done != page_cnt) { 1052 if (i_done != page_cnt) {
1029 spin_lock(&BTRFS_I(inode)->lock); 1053 spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1058,8 @@ again:
1034 } 1058 }
1035 1059
1036 1060
1037 btrfs_set_extent_delalloc(inode, page_start, page_end - 1, 1061 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
1038 &cached_state); 1062 &cached_state, GFP_NOFS);
1039 1063
1040 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1064 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1041 page_start, page_end - 1, &cached_state, 1065 page_start, page_end - 1, &cached_state,
@@ -2351,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2351 int ret; 2375 int ret;
2352 u64 len = olen; 2376 u64 len = olen;
2353 u64 bs = root->fs_info->sb->s_blocksize; 2377 u64 bs = root->fs_info->sb->s_blocksize;
2354 u64 hint_byte;
2355 2378
2356 /* 2379 /*
2357 * TODO: 2380 * TODO:
@@ -2456,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2456 another, and lock file content */ 2479 another, and lock file content */
2457 while (1) { 2480 while (1) {
2458 struct btrfs_ordered_extent *ordered; 2481 struct btrfs_ordered_extent *ordered;
2459 lock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2482 lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2460 ordered = btrfs_lookup_first_ordered_extent(src, off+len); 2483 ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
2461 if (!ordered && 2484 if (!ordered &&
2462 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, 2485 !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
2463 EXTENT_DELALLOC, 0, NULL)) 2486 EXTENT_DELALLOC, 0, NULL))
2464 break; 2487 break;
2465 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2488 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2466 if (ordered) 2489 if (ordered)
2467 btrfs_put_ordered_extent(ordered); 2490 btrfs_put_ordered_extent(ordered);
2468 btrfs_wait_ordered_range(src, off, len); 2491 btrfs_wait_ordered_range(src, off, len);
@@ -2536,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2536 btrfs_release_path(path); 2559 btrfs_release_path(path);
2537 2560
2538 if (key.offset + datal <= off || 2561 if (key.offset + datal <= off ||
2539 key.offset >= off+len) 2562 key.offset >= off + len - 1)
2540 goto next; 2563 goto next;
2541 2564
2542 memcpy(&new_key, &key, sizeof(new_key)); 2565 memcpy(&new_key, &key, sizeof(new_key));
@@ -2574,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2574 datal -= off - key.offset; 2597 datal -= off - key.offset;
2575 } 2598 }
2576 2599
2577 ret = btrfs_drop_extents(trans, inode, 2600 ret = btrfs_drop_extents(trans, root, inode,
2578 new_key.offset, 2601 new_key.offset,
2579 new_key.offset + datal, 2602 new_key.offset + datal,
2580 &hint_byte, 1); 2603 1);
2581 if (ret) { 2604 if (ret) {
2582 btrfs_abort_transaction(trans, root, 2605 btrfs_abort_transaction(trans, root,
2583 ret); 2606 ret);
@@ -2637,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2637 new_key.offset += skip; 2660 new_key.offset += skip;
2638 } 2661 }
2639 2662
2640 if (key.offset + datal > off+len) 2663 if (key.offset + datal > off + len)
2641 trim = key.offset + datal - (off+len); 2664 trim = key.offset + datal - (off + len);
2642 2665
2643 if (comp && (skip || trim)) { 2666 if (comp && (skip || trim)) {
2644 ret = -EINVAL; 2667 ret = -EINVAL;
@@ -2648,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2648 size -= skip + trim; 2671 size -= skip + trim;
2649 datal -= skip + trim; 2672 datal -= skip + trim;
2650 2673
2651 ret = btrfs_drop_extents(trans, inode, 2674 ret = btrfs_drop_extents(trans, root, inode,
2652 new_key.offset, 2675 new_key.offset,
2653 new_key.offset + datal, 2676 new_key.offset + datal,
2654 &hint_byte, 1); 2677 1);
2655 if (ret) { 2678 if (ret) {
2656 btrfs_abort_transaction(trans, root, 2679 btrfs_abort_transaction(trans, root,
2657 ret); 2680 ret);
@@ -2715,7 +2738,7 @@ next:
2715 ret = 0; 2738 ret = 0;
2716out: 2739out:
2717 btrfs_release_path(path); 2740 btrfs_release_path(path);
2718 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2741 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2719out_unlock: 2742out_unlock:
2720 mutex_unlock(&src->i_mutex); 2743 mutex_unlock(&src->i_mutex);
2721 mutex_unlock(&inode->i_mutex); 2744 mutex_unlock(&inode->i_mutex);
@@ -2850,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2850 return 0; 2873 return 0;
2851} 2874}
2852 2875
2853static void get_block_group_info(struct list_head *groups_list, 2876void btrfs_get_block_group_info(struct list_head *groups_list,
2854 struct btrfs_ioctl_space_info *space) 2877 struct btrfs_ioctl_space_info *space)
2855{ 2878{
2856 struct btrfs_block_group_cache *block_group; 2879 struct btrfs_block_group_cache *block_group;
2857 2880
@@ -2959,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2959 down_read(&info->groups_sem); 2982 down_read(&info->groups_sem);
2960 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 2983 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2961 if (!list_empty(&info->block_groups[c])) { 2984 if (!list_empty(&info->block_groups[c])) {
2962 get_block_group_info(&info->block_groups[c], 2985 btrfs_get_block_group_info(
2963 &space); 2986 &info->block_groups[c], &space);
2964 memcpy(dest, &space, sizeof(space)); 2987 memcpy(dest, &space, sizeof(space));
2965 dest++; 2988 dest++;
2966 space_args.total_spaces++; 2989 space_args.total_spaces++;
@@ -3208,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3208{ 3231{
3209 int ret = 0; 3232 int ret = 0;
3210 int size; 3233 int size;
3211 u64 extent_item_pos;
3212 struct btrfs_ioctl_logical_ino_args *loi; 3234 struct btrfs_ioctl_logical_ino_args *loi;
3213 struct btrfs_data_container *inodes = NULL; 3235 struct btrfs_data_container *inodes = NULL;
3214 struct btrfs_path *path = NULL; 3236 struct btrfs_path *path = NULL;
3215 struct btrfs_key key;
3216 3237
3217 if (!capable(CAP_SYS_ADMIN)) 3238 if (!capable(CAP_SYS_ADMIN))
3218 return -EPERM; 3239 return -EPERM;
@@ -3230,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3230 goto out; 3251 goto out;
3231 } 3252 }
3232 3253
3233 size = min_t(u32, loi->size, 4096); 3254 size = min_t(u32, loi->size, 64 * 1024);
3234 inodes = init_data_container(size); 3255 inodes = init_data_container(size);
3235 if (IS_ERR(inodes)) { 3256 if (IS_ERR(inodes)) {
3236 ret = PTR_ERR(inodes); 3257 ret = PTR_ERR(inodes);
@@ -3238,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3238 goto out; 3259 goto out;
3239 } 3260 }
3240 3261
3241 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3262 ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
3242 btrfs_release_path(path); 3263 build_ino_list, inodes);
3243 3264 if (ret == -EINVAL)
3244 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3245 ret = -ENOENT; 3265 ret = -ENOENT;
3246 if (ret < 0) 3266 if (ret < 0)
3247 goto out; 3267 goto out;
3248 3268
3249 extent_item_pos = loi->logical - key.objectid;
3250 ret = iterate_extent_inodes(root->fs_info, key.objectid,
3251 extent_item_pos, 0, build_ino_list,
3252 inodes);
3253
3254 if (ret < 0)
3255 goto out;
3256
3257 ret = copy_to_user((void *)(unsigned long)loi->inodes, 3269 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3258 (void *)(unsigned long)inodes, size); 3270 (void *)(unsigned long)inodes, size);
3259 if (ret) 3271 if (ret)
@@ -3261,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3261 3273
3262out: 3274out:
3263 btrfs_free_path(path); 3275 btrfs_free_path(path);
3264 kfree(inodes); 3276 vfree(inodes);
3265 kfree(loi); 3277 kfree(loi);
3266 3278
3267 return ret; 3279 return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551dd..7772f02ba28e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27 27
28static struct kmem_cache *btrfs_ordered_extent_cache;
29
28static u64 entry_end(struct btrfs_ordered_extent *entry) 30static u64 entry_end(struct btrfs_ordered_extent *entry)
29{ 31{
30 if (entry->file_offset + entry->len < entry->file_offset) 32 if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
187 struct btrfs_ordered_extent *entry; 189 struct btrfs_ordered_extent *entry;
188 190
189 tree = &BTRFS_I(inode)->ordered_tree; 191 tree = &BTRFS_I(inode)->ordered_tree;
190 entry = kzalloc(sizeof(*entry), GFP_NOFS); 192 entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
191 if (!entry) 193 if (!entry)
192 return -ENOMEM; 194 return -ENOMEM;
193 195
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
421 list_del(&sum->list); 423 list_del(&sum->list);
422 kfree(sum); 424 kfree(sum);
423 } 425 }
424 kfree(entry); 426 kmem_cache_free(btrfs_ordered_extent_cache, entry);
425 } 427 }
426} 428}
427 429
@@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
466 * wait for all the ordered extents in a root. This is done when balancing 468 * wait for all the ordered extents in a root. This is done when balancing
467 * space between drives. 469 * space between drives.
468 */ 470 */
469void btrfs_wait_ordered_extents(struct btrfs_root *root, 471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
470 int nocow_only, int delay_iput)
471{ 472{
472 struct list_head splice; 473 struct list_head splice;
473 struct list_head *cur; 474 struct list_head *cur;
@@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
482 cur = splice.next; 483 cur = splice.next;
483 ordered = list_entry(cur, struct btrfs_ordered_extent, 484 ordered = list_entry(cur, struct btrfs_ordered_extent,
484 root_extent_list); 485 root_extent_list);
485 if (nocow_only &&
486 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
487 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
488 list_move(&ordered->root_extent_list,
489 &root->fs_info->ordered_extents);
490 cond_resched_lock(&root->fs_info->ordered_extent_lock);
491 continue;
492 }
493
494 list_del_init(&ordered->root_extent_list); 486 list_del_init(&ordered->root_extent_list);
495 atomic_inc(&ordered->refs); 487 atomic_inc(&ordered->refs);
496 488
@@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
775 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 767 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
776 u64 disk_i_size; 768 u64 disk_i_size;
777 u64 new_i_size; 769 u64 new_i_size;
778 u64 i_size_test;
779 u64 i_size = i_size_read(inode); 770 u64 i_size = i_size_read(inode);
780 struct rb_node *node; 771 struct rb_node *node;
781 struct rb_node *prev = NULL; 772 struct rb_node *prev = NULL;
@@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
835 break; 826 break;
836 if (test->file_offset >= i_size) 827 if (test->file_offset >= i_size)
837 break; 828 break;
838 if (test->file_offset >= disk_i_size) 829 if (test->file_offset >= disk_i_size) {
830 /*
831 * we don't update disk_i_size now, so record this
832 * undealt i_size. Or we will not know the real
833 * i_size.
834 */
835 if (test->outstanding_isize < offset)
836 test->outstanding_isize = offset;
837 if (ordered &&
838 ordered->outstanding_isize >
839 test->outstanding_isize)
840 test->outstanding_isize =
841 ordered->outstanding_isize;
839 goto out; 842 goto out;
840 }
841 new_i_size = min_t(u64, offset, i_size);
842
843 /*
844 * at this point, we know we can safely update i_size to at least
845 * the offset from this ordered extent. But, we need to
846 * walk forward and see if ios from higher up in the file have
847 * finished.
848 */
849 if (ordered) {
850 node = rb_next(&ordered->rb_node);
851 } else {
852 if (prev)
853 node = rb_next(prev);
854 else
855 node = rb_first(&tree->tree);
856 }
857
858 /*
859 * We are looking for an area between our current extent and the next
860 * ordered extent to update the i_size to. There are 3 cases here
861 *
862 * 1) We don't actually have anything and we can update to i_size.
863 * 2) We have stuff but they already did their i_size update so again we
864 * can just update to i_size.
865 * 3) We have an outstanding ordered extent so the most we can update
866 * our disk_i_size to is the start of the next offset.
867 */
868 i_size_test = i_size;
869 for (; node; node = rb_next(node)) {
870 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
871
872 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
873 continue;
874 if (test->file_offset > offset) {
875 i_size_test = test->file_offset;
876 break;
877 } 843 }
878 } 844 }
845 new_i_size = min_t(u64, offset, i_size);
879 846
880 /* 847 /*
881 * i_size_test is the end of a region after this ordered 848 * Some ordered extents may completed before the current one, and
882 * extent where there are no ordered extents, we can safely set 849 * we hold the real i_size in ->outstanding_isize.
883 * disk_i_size to this.
884 */ 850 */
885 if (i_size_test > offset) 851 if (ordered && ordered->outstanding_isize > new_i_size)
886 new_i_size = min_t(u64, i_size_test, i_size); 852 new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
887 BTRFS_I(inode)->disk_i_size = new_i_size; 853 BTRFS_I(inode)->disk_i_size = new_i_size;
888 ret = 0; 854 ret = 0;
889out: 855out:
@@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
984 } 950 }
985 spin_unlock(&root->fs_info->ordered_extent_lock); 951 spin_unlock(&root->fs_info->ordered_extent_lock);
986} 952}
953
954int __init ordered_data_init(void)
955{
956 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
957 sizeof(struct btrfs_ordered_extent), 0,
958 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
959 NULL);
960 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM;
962 return 0;
963}
964
965void ordered_data_exit(void)
966{
967 if (btrfs_ordered_extent_cache)
968 kmem_cache_destroy(btrfs_ordered_extent_cache);
969}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d2997..dd27a0b46a37 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 96 /* number of bytes that still need writing */
97 u64 bytes_left; 97 u64 bytes_left;
98 98
99 /*
100 * the end of the ordered extent which is behind it but
101 * didn't update disk_i_size. Please see the comment of
102 * btrfs_ordered_update_i_size();
103 */
104 u64 outstanding_isize;
105
99 /* flags (described above) */ 106 /* flags (described above) */
100 unsigned long flags; 107 unsigned long flags;
101 108
@@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
183void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root, 191 struct btrfs_root *root,
185 struct inode *inode); 192 struct inode *inode);
186void btrfs_wait_ordered_extents(struct btrfs_root *root, 193void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
187 int nocow_only, int delay_iput); 194int __init ordered_data_init(void);
195void ordered_data_exit(void);
188#endif 196#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b65015581744..5039686df6ae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1145 1145
1146 ulist_reinit(tmp); 1146 ulist_reinit(tmp);
1147 /* XXX id not needed */ 1147 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1148 ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter); 1149 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist; 1151 struct btrfs_qgroup_list *glist;
1152 1152
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1153 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1154 if (qg->refcnt < seq) 1154 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1; 1155 qg->refcnt = seq + 1;
1156 else 1156 else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1158 1158
1159 list_for_each_entry(glist, &qg->groups, next_group) { 1159 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid, 1160 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group, 1161 (u64)(uintptr_t)glist->group,
1162 GFP_ATOMIC); 1162 GFP_ATOMIC);
1163 } 1163 }
1164 } 1164 }
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1168 * step 2: walk from the new root 1168 * step 2: walk from the new root
1169 */ 1169 */
1170 ulist_reinit(tmp); 1170 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1171 ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter); 1172 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) { 1173 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg; 1174 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist; 1175 struct btrfs_qgroup_list *glist;
1176 1176
1177 qg = (struct btrfs_qgroup *)unode->aux; 1177 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1178 if (qg->refcnt < seq) { 1178 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */ 1179 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes; 1180 qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1190 1190
1191 list_for_each_entry(glist, &qg->groups, next_group) { 1191 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid, 1192 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC); 1193 (uintptr_t)glist->group, GFP_ATOMIC);
1194 } 1194 }
1195 } 1195 }
1196 1196
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1208 continue; 1208 continue;
1209 1209
1210 ulist_reinit(tmp); 1210 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1211 ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter); 1212 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist; 1214 struct btrfs_qgroup_list *glist;
1215 1215
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1216 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1217 if (qg->tag == seq) 1217 if (qg->tag == seq)
1218 continue; 1218 continue;
1219 1219
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1225 1225
1226 list_for_each_entry(glist, &qg->groups, next_group) { 1226 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid, 1227 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group, 1228 (uintptr_t)glist->group,
1229 GFP_ATOMIC); 1229 GFP_ATOMIC);
1230 } 1230 }
1231 } 1231 }
@@ -1469,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1469 * be exceeded 1469 * be exceeded
1470 */ 1470 */
1471 ulist = ulist_alloc(GFP_ATOMIC); 1471 ulist = ulist_alloc(GFP_ATOMIC);
1472 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1472 if (!ulist) {
1473 ret = -ENOMEM;
1474 goto out;
1475 }
1476 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1473 ULIST_ITER_INIT(&uiter); 1477 ULIST_ITER_INIT(&uiter);
1474 while ((unode = ulist_next(ulist, &uiter))) { 1478 while ((unode = ulist_next(ulist, &uiter))) {
1475 struct btrfs_qgroup *qg; 1479 struct btrfs_qgroup *qg;
1476 struct btrfs_qgroup_list *glist; 1480 struct btrfs_qgroup_list *glist;
1477 1481
1478 qg = (struct btrfs_qgroup *)unode->aux; 1482 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1479 1483
1480 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 1484 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1481 qg->reserved + qg->rfer + num_bytes > 1485 qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1489 1493
1490 list_for_each_entry(glist, &qg->groups, next_group) { 1494 list_for_each_entry(glist, &qg->groups, next_group) {
1491 ulist_add(ulist, glist->group->qgroupid, 1495 ulist_add(ulist, glist->group->qgroupid,
1492 (unsigned long)glist->group, GFP_ATOMIC); 1496 (uintptr_t)glist->group, GFP_ATOMIC);
1493 } 1497 }
1494 } 1498 }
1495 if (ret) 1499 if (ret)
@@ -1502,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1502 while ((unode = ulist_next(ulist, &uiter))) { 1506 while ((unode = ulist_next(ulist, &uiter))) {
1503 struct btrfs_qgroup *qg; 1507 struct btrfs_qgroup *qg;
1504 1508
1505 qg = (struct btrfs_qgroup *)unode->aux; 1509 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1506 1510
1507 qg->reserved += num_bytes; 1511 qg->reserved += num_bytes;
1508 } 1512 }
@@ -1541,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1541 goto out; 1545 goto out;
1542 1546
1543 ulist = ulist_alloc(GFP_ATOMIC); 1547 ulist = ulist_alloc(GFP_ATOMIC);
1544 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1548 if (!ulist) {
1549 btrfs_std_error(fs_info, -ENOMEM);
1550 goto out;
1551 }
1552 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1545 ULIST_ITER_INIT(&uiter); 1553 ULIST_ITER_INIT(&uiter);
1546 while ((unode = ulist_next(ulist, &uiter))) { 1554 while ((unode = ulist_next(ulist, &uiter))) {
1547 struct btrfs_qgroup *qg; 1555 struct btrfs_qgroup *qg;
1548 struct btrfs_qgroup_list *glist; 1556 struct btrfs_qgroup_list *glist;
1549 1557
1550 qg = (struct btrfs_qgroup *)unode->aux; 1558 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1551 1559
1552 qg->reserved -= num_bytes; 1560 qg->reserved -= num_bytes;
1553 1561
1554 list_for_each_entry(glist, &qg->groups, next_group) { 1562 list_for_each_entry(glist, &qg->groups, next_group) {
1555 ulist_add(ulist, glist->group->qgroupid, 1563 ulist_add(ulist, glist->group->qgroupid,
1556 (unsigned long)glist->group, GFP_ATOMIC); 1564 (uintptr_t)glist->group, GFP_ATOMIC);
1557 } 1565 }
1558 } 1566 }
1559 1567
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004d..776f0aa128fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3270 key.offset = 0; 3270 key.offset = 0;
3271 3271
3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3273 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { 3273 if (IS_ERR(inode) || is_bad_inode(inode)) {
3274 if (inode && !IS_ERR(inode)) 3274 if (!IS_ERR(inode))
3275 iput(inode); 3275 iput(inode);
3276 return -ENOENT; 3276 return -ENOENT;
3277 } 3277 }
@@ -3621,7 +3621,7 @@ next:
3621 3621
3622 ret = find_first_extent_bit(&rc->processed_blocks, 3622 ret = find_first_extent_bit(&rc->processed_blocks,
3623 key.objectid, &start, &end, 3623 key.objectid, &start, &end,
3624 EXTENT_DIRTY); 3624 EXTENT_DIRTY, NULL);
3625 3625
3626 if (ret == 0 && start <= key.objectid) { 3626 if (ret == 0 && start <= key.objectid) {
3627 btrfs_release_path(path); 3627 btrfs_release_path(path);
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3674 struct btrfs_trans_handle *trans; 3674 struct btrfs_trans_handle *trans;
3675 int ret; 3675 int ret;
3676 3676
3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); 3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3678 BTRFS_BLOCK_RSV_TEMP);
3678 if (!rc->block_rsv) 3679 if (!rc->block_rsv)
3679 return -ENOMEM; 3680 return -ENOMEM;
3680 3681
@@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->flags); 4058 (unsigned long long)rc->block_group->flags);
4058 4059
4059 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4060 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4061 4062
4062 while (1) { 4063 while (1) {
4063 mutex_lock(&fs_info->cleaner_mutex); 4064 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d88071..eb923d087da7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
141 return -ENOMEM; 141 return -ENOMEM;
142 142
143 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
144 if (ret < 0) 144 if (ret < 0) {
145 goto out_abort; 145 btrfs_abort_transaction(trans, root, ret);
146 goto out;
147 }
146 148
147 if (ret != 0) { 149 if (ret != 0) {
148 btrfs_print_leaf(root, path->nodes[0]); 150 btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
166 btrfs_release_path(path); 168 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path, 169 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1); 170 -1, 1);
169 if (ret < 0) 171 if (ret < 0) {
170 goto out_abort; 172 btrfs_abort_transaction(trans, root, ret);
173 goto out;
174 }
175
171 ret = btrfs_del_item(trans, root, path); 176 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0) 177 if (ret < 0) {
173 goto out_abort; 178 btrfs_abort_transaction(trans, root, ret);
179 goto out;
180 }
174 btrfs_release_path(path); 181 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path, 182 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item)); 183 key, sizeof(*item));
177 if (ret < 0) 184 if (ret < 0) {
178 goto out_abort; 185 btrfs_abort_transaction(trans, root, ret);
186 goto out;
187 }
179 l = path->nodes[0]; 188 l = path->nodes[0];
180 slot = path->slots[0]; 189 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot); 190 ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
192out: 201out:
193 btrfs_free_path(path); 202 btrfs_free_path(path);
194 return ret; 203 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
199} 204}
200 205
201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 206int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a6..27892f67e69b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
352 struct extent_buffer *eb; 352 struct extent_buffer *eb;
353 struct btrfs_extent_item *ei; 353 struct btrfs_extent_item *ei;
354 struct scrub_warning swarn; 354 struct scrub_warning swarn;
355 u32 item_size; 355 unsigned long ptr = 0;
356 int ret; 356 u64 extent_item_pos;
357 u64 flags = 0;
357 u64 ref_root; 358 u64 ref_root;
359 u32 item_size;
358 u8 ref_level; 360 u8 ref_level;
359 unsigned long ptr = 0;
360 const int bufsize = 4096; 361 const int bufsize = 4096;
361 u64 extent_item_pos; 362 int ret;
362 363
363 path = btrfs_alloc_path(); 364 path = btrfs_alloc_path();
364 365
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
375 if (!path || !swarn.scratch_buf || !swarn.msg_buf) 376 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
376 goto out; 377 goto out;
377 378
378 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); 379 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
380 &flags);
379 if (ret < 0) 381 if (ret < 0)
380 goto out; 382 goto out;
381 383
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
387 item_size = btrfs_item_size_nr(eb, path->slots[0]); 389 item_size = btrfs_item_size_nr(eb, path->slots[0]);
388 btrfs_release_path(path); 390 btrfs_release_path(path);
389 391
390 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 392 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
391 do { 393 do {
392 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 394 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
393 &ref_root, &ref_level); 395 &ref_root, &ref_level);
@@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1029 spin_lock(&sdev->stat_lock); 1031 spin_lock(&sdev->stat_lock);
1030 sdev->stat.malloc_errors++; 1032 sdev->stat.malloc_errors++;
1031 spin_unlock(&sdev->stat_lock); 1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1032 return -ENOMEM; 1035 return -ENOMEM;
1033 } 1036 }
1034 sblock->page_count++; 1037 sblock->page_count++;
@@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1666 scrub_block_put(sblock); 1669 scrub_block_put(sblock);
1667 } 1670 }
1668 1671
1669 if (sbio->err) {
1670 /* what is this good for??? */
1671 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1672 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
1673 sbio->bio->bi_phys_segments = 0;
1674 sbio->bio->bi_idx = 0;
1675
1676 for (i = 0; i < sbio->page_count; i++) {
1677 struct bio_vec *bi;
1678 bi = &sbio->bio->bi_io_vec[i];
1679 bi->bv_offset = 0;
1680 bi->bv_len = PAGE_SIZE;
1681 }
1682 }
1683
1684 bio_put(sbio->bio); 1672 bio_put(sbio->bio);
1685 sbio->bio = NULL; 1673 sbio->bio = NULL;
1686 spin_lock(&sdev->list_lock); 1674 spin_lock(&sdev->list_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f869..c7beb543a4a8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
107 int cur_inode_new; 107 int cur_inode_new;
108 int cur_inode_new_gen; 108 int cur_inode_new_gen;
109 int cur_inode_deleted; 109 int cur_inode_deleted;
110 int cur_inode_first_ref_orphan;
111 u64 cur_inode_size; 110 u64 cur_inode_size;
112 u64 cur_inode_mode; 111 u64 cur_inode_mode;
113 112
@@ -126,7 +125,15 @@ struct send_ctx {
126 125
127struct name_cache_entry { 126struct name_cache_entry {
128 struct list_head list; 127 struct list_head list;
129 struct list_head use_list; 128 /*
129 * radix_tree has only 32bit entries but we need to handle 64bit inums.
130 * We use the lower 32bit of the 64bit inum to store it in the tree. If
131 * more then one inum would fall into the same entry, we use radix_list
132 * to store the additional entries. radix_list is also used to store
133 * entries where two entries have the same inum but different
134 * generations.
135 */
136 struct list_head radix_list;
130 u64 ino; 137 u64 ino;
131 u64 gen; 138 u64 gen;
132 u64 parent_ino; 139 u64 parent_ino;
@@ -328,6 +335,7 @@ out:
328 return ret; 335 return ret;
329} 336}
330 337
338#if 0
331static void fs_path_remove(struct fs_path *p) 339static void fs_path_remove(struct fs_path *p)
332{ 340{
333 BUG_ON(p->reversed); 341 BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
335 p->end--; 343 p->end--;
336 *p->end = 0; 344 *p->end = 0;
337} 345}
346#endif
338 347
339static int fs_path_copy(struct fs_path *p, struct fs_path *from) 348static int fs_path_copy(struct fs_path *p, struct fs_path *from)
340{ 349{
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
377 return path; 386 return path;
378} 387}
379 388
380static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) 389int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
381{ 390{
382 int ret; 391 int ret;
383 mm_segment_t old_fs; 392 mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
387 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
388 397
389 while (pos < len) { 398 while (pos < len) {
390 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, 399 ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
391 &sctx->send_off);
392 /* TODO handle that correctly */ 400 /* TODO handle that correctly */
393 /*if (ret == -ERESTARTSYS) { 401 /*if (ret == -ERESTARTSYS) {
394 continue; 402 continue;
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
544 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 552 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
545 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); 553 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
546 554
547 return write_buf(sctx, &hdr, sizeof(hdr)); 555 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
556 &sctx->send_off);
548} 557}
549 558
550/* 559/*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
581 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 590 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
582 hdr->crc = cpu_to_le32(crc); 591 hdr->crc = cpu_to_le32(crc);
583 592
584 ret = write_buf(sctx, sctx->send_buf, sctx->send_size); 593 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
594 &sctx->send_off);
585 595
586 sctx->total_send_size += sctx->send_size; 596 sctx->total_send_size += sctx->send_size;
587 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; 597 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
@@ -687,7 +697,8 @@ out:
687 */ 697 */
688static int get_inode_info(struct btrfs_root *root, 698static int get_inode_info(struct btrfs_root *root,
689 u64 ino, u64 *size, u64 *gen, 699 u64 ino, u64 *size, u64 *gen,
690 u64 *mode, u64 *uid, u64 *gid) 700 u64 *mode, u64 *uid, u64 *gid,
701 u64 *rdev)
691{ 702{
692 int ret; 703 int ret;
693 struct btrfs_inode_item *ii; 704 struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
721 *uid = btrfs_inode_uid(path->nodes[0], ii); 732 *uid = btrfs_inode_uid(path->nodes[0], ii);
722 if (gid) 733 if (gid)
723 *gid = btrfs_inode_gid(path->nodes[0], ii); 734 *gid = btrfs_inode_gid(path->nodes[0], ii);
735 if (rdev)
736 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
724 737
725out: 738out:
726 btrfs_free_path(path); 739 btrfs_free_path(path);
@@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
852 struct extent_buffer *eb; 865 struct extent_buffer *eb;
853 struct btrfs_item *item; 866 struct btrfs_item *item;
854 struct btrfs_dir_item *di; 867 struct btrfs_dir_item *di;
855 struct btrfs_path *tmp_path = NULL;
856 struct btrfs_key di_key; 868 struct btrfs_key di_key;
857 char *buf = NULL; 869 char *buf = NULL;
858 char *buf2 = NULL; 870 char *buf2 = NULL;
@@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
874 goto out; 886 goto out;
875 } 887 }
876 888
877 tmp_path = alloc_path_for_send();
878 if (!tmp_path) {
879 ret = -ENOMEM;
880 goto out;
881 }
882
883 eb = path->nodes[0]; 889 eb = path->nodes[0];
884 slot = path->slots[0]; 890 slot = path->slots[0];
885 item = btrfs_item_nr(eb, slot); 891 item = btrfs_item_nr(eb, slot);
@@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
941 } 947 }
942 948
943out: 949out:
944 btrfs_free_path(tmp_path);
945 if (buf_virtual) 950 if (buf_virtual)
946 vfree(buf); 951 vfree(buf);
947 else 952 else
@@ -1026,12 +1031,12 @@ struct backref_ctx {
1026 u64 extent_len; 1031 u64 extent_len;
1027 1032
1028 /* Just to check for bugs in backref resolving */ 1033 /* Just to check for bugs in backref resolving */
1029 int found_in_send_root; 1034 int found_itself;
1030}; 1035};
1031 1036
1032static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1037static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1033{ 1038{
1034 u64 root = (u64)key; 1039 u64 root = (u64)(uintptr_t)key;
1035 struct clone_root *cr = (struct clone_root *)elt; 1040 struct clone_root *cr = (struct clone_root *)elt;
1036 1041
1037 if (root < cr->root->objectid) 1042 if (root < cr->root->objectid)
@@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
1055 1060
1056/* 1061/*
1057 * Called for every backref that is found for the current extent. 1062 * Called for every backref that is found for the current extent.
1063 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1058 */ 1064 */
1059static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) 1065static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1060{ 1066{
@@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1064 u64 i_size; 1070 u64 i_size;
1065 1071
1066 /* First check if the root is in the list of accepted clone sources */ 1072 /* First check if the root is in the list of accepted clone sources */
1067 found = bsearch((void *)root, bctx->sctx->clone_roots, 1073 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1068 bctx->sctx->clone_roots_cnt, 1074 bctx->sctx->clone_roots_cnt,
1069 sizeof(struct clone_root), 1075 sizeof(struct clone_root),
1070 __clone_root_cmp_bsearch); 1076 __clone_root_cmp_bsearch);
@@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1074 if (found->root == bctx->sctx->send_root && 1080 if (found->root == bctx->sctx->send_root &&
1075 ino == bctx->cur_objectid && 1081 ino == bctx->cur_objectid &&
1076 offset == bctx->cur_offset) { 1082 offset == bctx->cur_offset) {
1077 bctx->found_in_send_root = 1; 1083 bctx->found_itself = 1;
1078 } 1084 }
1079 1085
1080 /* 1086 /*
1081 * There are inodes that have extents that lie behind it's i_size. Don't 1087 * There are inodes that have extents that lie behind its i_size. Don't
1082 * accept clones from these extents. 1088 * accept clones from these extents.
1083 */ 1089 */
1084 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); 1090 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
1091 NULL);
1085 if (ret < 0) 1092 if (ret < 0)
1086 return ret; 1093 return ret;
1087 1094
@@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1101 */ 1108 */
1102 if (ino >= bctx->cur_objectid) 1109 if (ino >= bctx->cur_objectid)
1103 return 0; 1110 return 0;
1104 /*if (ino > ctx->cur_objectid) 1111#if 0
1112 if (ino > bctx->cur_objectid)
1105 return 0; 1113 return 0;
1106 if (offset + ctx->extent_len > ctx->cur_offset) 1114 if (offset + bctx->extent_len > bctx->cur_offset)
1107 return 0;*/ 1115 return 0;
1108 1116#endif
1109 bctx->found++;
1110 found->found_refs++;
1111 found->ino = ino;
1112 found->offset = offset;
1113 return 0;
1114 } 1117 }
1115 1118
1116 bctx->found++; 1119 bctx->found++;
@@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1130} 1133}
1131 1134
1132/* 1135/*
1136 * Given an inode, offset and extent item, it finds a good clone for a clone
1137 * instruction. Returns -ENOENT when none could be found. The function makes
1138 * sure that the returned clone is usable at the point where sending is at the
1139 * moment. This means, that no clones are accepted which lie behind the current
1140 * inode+offset.
1141 *
1133 * path must point to the extent item when called. 1142 * path must point to the extent item when called.
1134 */ 1143 */
1135static int find_extent_clone(struct send_ctx *sctx, 1144static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,
1141 int ret; 1150 int ret;
1142 int extent_type; 1151 int extent_type;
1143 u64 logical; 1152 u64 logical;
1153 u64 disk_byte;
1144 u64 num_bytes; 1154 u64 num_bytes;
1145 u64 extent_item_pos; 1155 u64 extent_item_pos;
1156 u64 flags = 0;
1146 struct btrfs_file_extent_item *fi; 1157 struct btrfs_file_extent_item *fi;
1147 struct extent_buffer *eb = path->nodes[0]; 1158 struct extent_buffer *eb = path->nodes[0];
1148 struct backref_ctx backref_ctx; 1159 struct backref_ctx *backref_ctx = NULL;
1149 struct clone_root *cur_clone_root; 1160 struct clone_root *cur_clone_root;
1150 struct btrfs_key found_key; 1161 struct btrfs_key found_key;
1151 struct btrfs_path *tmp_path; 1162 struct btrfs_path *tmp_path;
1163 int compressed;
1152 u32 i; 1164 u32 i;
1153 1165
1154 tmp_path = alloc_path_for_send(); 1166 tmp_path = alloc_path_for_send();
1155 if (!tmp_path) 1167 if (!tmp_path)
1156 return -ENOMEM; 1168 return -ENOMEM;
1157 1169
1170 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
1171 if (!backref_ctx) {
1172 ret = -ENOMEM;
1173 goto out;
1174 }
1175
1158 if (data_offset >= ino_size) { 1176 if (data_offset >= ino_size) {
1159 /* 1177 /*
1160 * There may be extents that lie behind the file's size. 1178 * There may be extents that lie behind the file's size.
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,
1172 ret = -ENOENT; 1190 ret = -ENOENT;
1173 goto out; 1191 goto out;
1174 } 1192 }
1193 compressed = btrfs_file_extent_compression(eb, fi);
1175 1194
1176 num_bytes = btrfs_file_extent_num_bytes(eb, fi); 1195 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1177 logical = btrfs_file_extent_disk_bytenr(eb, fi); 1196 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1178 if (logical == 0) { 1197 if (disk_byte == 0) {
1179 ret = -ENOENT; 1198 ret = -ENOENT;
1180 goto out; 1199 goto out;
1181 } 1200 }
1182 logical += btrfs_file_extent_offset(eb, fi); 1201 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1183 1202
1184 ret = extent_from_logical(sctx->send_root->fs_info, 1203 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
1185 logical, tmp_path, &found_key); 1204 &found_key, &flags);
1186 btrfs_release_path(tmp_path); 1205 btrfs_release_path(tmp_path);
1187 1206
1188 if (ret < 0) 1207 if (ret < 0)
1189 goto out; 1208 goto out;
1190 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1209 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1191 ret = -EIO; 1210 ret = -EIO;
1192 goto out; 1211 goto out;
1193 } 1212 }
@@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,
1202 cur_clone_root->found_refs = 0; 1221 cur_clone_root->found_refs = 0;
1203 } 1222 }
1204 1223
1205 backref_ctx.sctx = sctx; 1224 backref_ctx->sctx = sctx;
1206 backref_ctx.found = 0; 1225 backref_ctx->found = 0;
1207 backref_ctx.cur_objectid = ino; 1226 backref_ctx->cur_objectid = ino;
1208 backref_ctx.cur_offset = data_offset; 1227 backref_ctx->cur_offset = data_offset;
1209 backref_ctx.found_in_send_root = 0; 1228 backref_ctx->found_itself = 0;
1210 backref_ctx.extent_len = num_bytes; 1229 backref_ctx->extent_len = num_bytes;
1211 1230
1212 /* 1231 /*
1213 * The last extent of a file may be too large due to page alignment. 1232 * The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx,
1215 * __iterate_backrefs work. 1234 * __iterate_backrefs work.
1216 */ 1235 */
1217 if (data_offset + num_bytes >= ino_size) 1236 if (data_offset + num_bytes >= ino_size)
1218 backref_ctx.extent_len = ino_size - data_offset; 1237 backref_ctx->extent_len = ino_size - data_offset;
1219 1238
1220 /* 1239 /*
1221 * Now collect all backrefs. 1240 * Now collect all backrefs.
1222 */ 1241 */
1242 if (compressed == BTRFS_COMPRESS_NONE)
1243 extent_item_pos = logical - found_key.objectid;
1244 else
1245 extent_item_pos = 0;
1246
1223 extent_item_pos = logical - found_key.objectid; 1247 extent_item_pos = logical - found_key.objectid;
1224 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1248 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1225 found_key.objectid, extent_item_pos, 1, 1249 found_key.objectid, extent_item_pos, 1,
1226 __iterate_backrefs, &backref_ctx); 1250 __iterate_backrefs, backref_ctx);
1251
1227 if (ret < 0) 1252 if (ret < 0)
1228 goto out; 1253 goto out;
1229 1254
1230 if (!backref_ctx.found_in_send_root) { 1255 if (!backref_ctx->found_itself) {
1231 /* found a bug in backref code? */ 1256 /* found a bug in backref code? */
1232 ret = -EIO; 1257 ret = -EIO;
1233 printk(KERN_ERR "btrfs: ERROR did not find backref in " 1258 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1234 "send_root. inode=%llu, offset=%llu, " 1259 "send_root. inode=%llu, offset=%llu, "
1235 "logical=%llu\n", 1260 "disk_byte=%llu found extent=%llu\n",
1236 ino, data_offset, logical); 1261 ino, data_offset, disk_byte, found_key.objectid);
1237 goto out; 1262 goto out;
1238 } 1263 }
1239 1264
@@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1242 "num_bytes=%llu, logical=%llu\n", 1267 "num_bytes=%llu, logical=%llu\n",
1243 data_offset, ino, num_bytes, logical); 1268 data_offset, ino, num_bytes, logical);
1244 1269
1245 if (!backref_ctx.found) 1270 if (!backref_ctx->found)
1246 verbose_printk("btrfs: no clones found\n"); 1271 verbose_printk("btrfs: no clones found\n");
1247 1272
1248 cur_clone_root = NULL; 1273 cur_clone_root = NULL;
@@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1253 else if (sctx->clone_roots[i].root == sctx->send_root) 1278 else if (sctx->clone_roots[i].root == sctx->send_root)
1254 /* prefer clones from send_root over others */ 1279 /* prefer clones from send_root over others */
1255 cur_clone_root = sctx->clone_roots + i; 1280 cur_clone_root = sctx->clone_roots + i;
1256 break;
1257 } 1281 }
1258 1282
1259 } 1283 }
@@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1267 1291
1268out: 1292out:
1269 btrfs_free_path(tmp_path); 1293 btrfs_free_path(tmp_path);
1294 kfree(backref_ctx);
1270 return ret; 1295 return ret;
1271} 1296}
1272 1297
@@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,
1307 len = btrfs_file_extent_inline_len(path->nodes[0], ei); 1332 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1308 1333
1309 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1334 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1310 if (ret < 0)
1311 goto out;
1312 1335
1313out: 1336out:
1314 btrfs_free_path(path); 1337 btrfs_free_path(path);
@@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1404 u64 right_gen; 1427 u64 right_gen;
1405 1428
1406 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, 1429 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1407 NULL); 1430 NULL, NULL);
1408 if (ret < 0 && ret != -ENOENT) 1431 if (ret < 0 && ret != -ENOENT)
1409 goto out; 1432 goto out;
1410 left_ret = ret; 1433 left_ret = ret;
@@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1413 right_ret = -ENOENT; 1436 right_ret = -ENOENT;
1414 } else { 1437 } else {
1415 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, 1438 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1416 NULL, NULL, NULL); 1439 NULL, NULL, NULL, NULL);
1417 if (ret < 0 && ret != -ENOENT) 1440 if (ret < 0 && ret != -ENOENT)
1418 goto out; 1441 goto out;
1419 right_ret = ret; 1442 right_ret = ret;
1420 } 1443 }
1421 1444
1422 if (!left_ret && !right_ret) { 1445 if (!left_ret && !right_ret) {
1423 if (left_gen == gen && right_gen == gen) 1446 if (left_gen == gen && right_gen == gen) {
1424 ret = inode_state_no_change; 1447 ret = inode_state_no_change;
1425 else if (left_gen == gen) { 1448 } else if (left_gen == gen) {
1426 if (ino < sctx->send_progress) 1449 if (ino < sctx->send_progress)
1427 ret = inode_state_did_create; 1450 ret = inode_state_did_create;
1428 else 1451 else
@@ -1516,6 +1539,10 @@ out:
1516 return ret; 1539 return ret;
1517} 1540}
1518 1541
1542/*
1543 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1544 * generation of the parent dir and the name of the dir entry.
1545 */
1519static int get_first_ref(struct send_ctx *sctx, 1546static int get_first_ref(struct send_ctx *sctx,
1520 struct btrfs_root *root, u64 ino, 1547 struct btrfs_root *root, u64 ino,
1521 u64 *dir, u64 *dir_gen, struct fs_path *name) 1548 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,
1557 btrfs_release_path(path); 1584 btrfs_release_path(path);
1558 1585
1559 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, 1586 ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
1560 NULL); 1587 NULL, NULL);
1561 if (ret < 0) 1588 if (ret < 0)
1562 goto out; 1589 goto out;
1563 1590
@@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,
1586 if (ret < 0) 1613 if (ret < 0)
1587 goto out; 1614 goto out;
1588 1615
1589 if (name_len != fs_path_len(tmp_name)) { 1616 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
1590 ret = 0; 1617 ret = 0;
1591 goto out; 1618 goto out;
1592 } 1619 }
1593 1620
1594 ret = memcmp(tmp_name->start, name, name_len); 1621 ret = !memcmp(tmp_name->start, name, name_len);
1595 if (ret)
1596 ret = 0;
1597 else
1598 ret = 1;
1599 1622
1600out: 1623out:
1601 fs_path_free(sctx, tmp_name); 1624 fs_path_free(sctx, tmp_name);
1602 return ret; 1625 return ret;
1603} 1626}
1604 1627
1628/*
1629 * Used by process_recorded_refs to determine if a new ref would overwrite an
1630 * already existing ref. In case it detects an overwrite, it returns the
1631 * inode/gen in who_ino/who_gen.
1632 * When an overwrite is detected, process_recorded_refs does proper orphanizing
1633 * to make sure later references to the overwritten inode are possible.
1634 * Orphanizing is however only required for the first ref of an inode.
1635 * process_recorded_refs does an additional is_first_ref check to see if
1636 * orphanizing is really required.
1637 */
1605static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, 1638static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1606 const char *name, int name_len, 1639 const char *name, int name_len,
1607 u64 *who_ino, u64 *who_gen) 1640 u64 *who_ino, u64 *who_gen)
@@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1626 goto out; 1659 goto out;
1627 } 1660 }
1628 1661
1662 /*
1663 * Check if the overwritten ref was already processed. If yes, the ref
1664 * was already unlinked/moved, so we can safely assume that we will not
1665 * overwrite anything at this point in time.
1666 */
1629 if (other_inode > sctx->send_progress) { 1667 if (other_inode > sctx->send_progress) {
1630 ret = get_inode_info(sctx->parent_root, other_inode, NULL, 1668 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1631 who_gen, NULL, NULL, NULL); 1669 who_gen, NULL, NULL, NULL, NULL);
1632 if (ret < 0) 1670 if (ret < 0)
1633 goto out; 1671 goto out;
1634 1672
@@ -1642,6 +1680,13 @@ out:
1642 return ret; 1680 return ret;
1643} 1681}
1644 1682
1683/*
1684 * Checks if the ref was overwritten by an already processed inode. This is
1685 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
1686 * thus the orphan name needs be used.
1687 * process_recorded_refs also uses it to avoid unlinking of refs that were
1688 * overwritten.
1689 */
1645static int did_overwrite_ref(struct send_ctx *sctx, 1690static int did_overwrite_ref(struct send_ctx *sctx,
1646 u64 dir, u64 dir_gen, 1691 u64 dir, u64 dir_gen,
1647 u64 ino, u64 ino_gen, 1692 u64 ino, u64 ino_gen,
@@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
1671 } 1716 }
1672 1717
1673 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, 1718 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1674 NULL); 1719 NULL, NULL);
1675 if (ret < 0) 1720 if (ret < 0)
1676 goto out; 1721 goto out;
1677 1722
@@ -1690,6 +1735,11 @@ out:
1690 return ret; 1735 return ret;
1691} 1736}
1692 1737
1738/*
1739 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
1740 * that got overwritten. This is used by process_recorded_refs to determine
1741 * if it has to use the path as returned by get_cur_path or the orphan name.
1742 */
1693static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) 1743static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1694{ 1744{
1695 int ret = 0; 1745 int ret = 0;
@@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1710 1760
1711 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, 1761 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
1712 name->start, fs_path_len(name)); 1762 name->start, fs_path_len(name));
1713 if (ret < 0)
1714 goto out;
1715 1763
1716out: 1764out:
1717 fs_path_free(sctx, name); 1765 fs_path_free(sctx, name);
1718 return ret; 1766 return ret;
1719} 1767}
1720 1768
1769/*
1770 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
1771 * so we need to do some special handling in case we have clashes. This function
1772 * takes care of this with the help of name_cache_entry::radix_list.
1773 * In case of error, nce is kfreed.
1774 */
1721static int name_cache_insert(struct send_ctx *sctx, 1775static int name_cache_insert(struct send_ctx *sctx,
1722 struct name_cache_entry *nce) 1776 struct name_cache_entry *nce)
1723{ 1777{
1724 int ret = 0; 1778 int ret = 0;
1725 struct name_cache_entry **ncea; 1779 struct list_head *nce_head;
1726 1780
1727 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); 1781 nce_head = radix_tree_lookup(&sctx->name_cache,
1728 if (ncea) { 1782 (unsigned long)nce->ino);
1729 if (!ncea[0]) 1783 if (!nce_head) {
1730 ncea[0] = nce; 1784 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1731 else if (!ncea[1]) 1785 if (!nce_head)
1732 ncea[1] = nce;
1733 else
1734 BUG();
1735 } else {
1736 ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
1737 if (!ncea)
1738 return -ENOMEM; 1786 return -ENOMEM;
1787 INIT_LIST_HEAD(nce_head);
1739 1788
1740 ncea[0] = nce; 1789 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
1741 ncea[1] = NULL; 1790 if (ret < 0) {
1742 ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); 1791 kfree(nce_head);
1743 if (ret < 0) 1792 kfree(nce);
1744 return ret; 1793 return ret;
1794 }
1745 } 1795 }
1796 list_add_tail(&nce->radix_list, nce_head);
1746 list_add_tail(&nce->list, &sctx->name_cache_list); 1797 list_add_tail(&nce->list, &sctx->name_cache_list);
1747 sctx->name_cache_size++; 1798 sctx->name_cache_size++;
1748 1799
@@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,
1752static void name_cache_delete(struct send_ctx *sctx, 1803static void name_cache_delete(struct send_ctx *sctx,
1753 struct name_cache_entry *nce) 1804 struct name_cache_entry *nce)
1754{ 1805{
1755 struct name_cache_entry **ncea; 1806 struct list_head *nce_head;
1756
1757 ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
1758 BUG_ON(!ncea);
1759
1760 if (ncea[0] == nce)
1761 ncea[0] = NULL;
1762 else if (ncea[1] == nce)
1763 ncea[1] = NULL;
1764 else
1765 BUG();
1766 1807
1767 if (!ncea[0] && !ncea[1]) { 1808 nce_head = radix_tree_lookup(&sctx->name_cache,
1768 radix_tree_delete(&sctx->name_cache, nce->ino); 1809 (unsigned long)nce->ino);
1769 kfree(ncea); 1810 BUG_ON(!nce_head);
1770 }
1771 1811
1812 list_del(&nce->radix_list);
1772 list_del(&nce->list); 1813 list_del(&nce->list);
1773
1774 sctx->name_cache_size--; 1814 sctx->name_cache_size--;
1815
1816 if (list_empty(nce_head)) {
1817 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1818 kfree(nce_head);
1819 }
1775} 1820}
1776 1821
1777static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, 1822static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
1778 u64 ino, u64 gen) 1823 u64 ino, u64 gen)
1779{ 1824{
1780 struct name_cache_entry **ncea; 1825 struct list_head *nce_head;
1826 struct name_cache_entry *cur;
1781 1827
1782 ncea = radix_tree_lookup(&sctx->name_cache, ino); 1828 nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
1783 if (!ncea) 1829 if (!nce_head)
1784 return NULL; 1830 return NULL;
1785 1831
1786 if (ncea[0] && ncea[0]->gen == gen) 1832 list_for_each_entry(cur, nce_head, radix_list) {
1787 return ncea[0]; 1833 if (cur->ino == ino && cur->gen == gen)
1788 else if (ncea[1] && ncea[1]->gen == gen) 1834 return cur;
1789 return ncea[1]; 1835 }
1790 return NULL; 1836 return NULL;
1791} 1837}
1792 1838
1839/*
1840 * Removes the entry from the list and adds it back to the end. This marks the
1841 * entry as recently used so that name_cache_clean_unused does not remove it.
1842 */
1793static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) 1843static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
1794{ 1844{
1795 list_del(&nce->list); 1845 list_del(&nce->list);
1796 list_add_tail(&nce->list, &sctx->name_cache_list); 1846 list_add_tail(&nce->list, &sctx->name_cache_list);
1797} 1847}
1798 1848
1849/*
1850 * Remove some entries from the beginning of name_cache_list.
1851 */
1799static void name_cache_clean_unused(struct send_ctx *sctx) 1852static void name_cache_clean_unused(struct send_ctx *sctx)
1800{ 1853{
1801 struct name_cache_entry *nce; 1854 struct name_cache_entry *nce;
@@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
1814static void name_cache_free(struct send_ctx *sctx) 1867static void name_cache_free(struct send_ctx *sctx)
1815{ 1868{
1816 struct name_cache_entry *nce; 1869 struct name_cache_entry *nce;
1817 struct name_cache_entry *tmp;
1818 1870
1819 list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { 1871 while (!list_empty(&sctx->name_cache_list)) {
1872 nce = list_entry(sctx->name_cache_list.next,
1873 struct name_cache_entry, list);
1820 name_cache_delete(sctx, nce); 1874 name_cache_delete(sctx, nce);
1875 kfree(nce);
1821 } 1876 }
1822} 1877}
1823 1878
1879/*
1880 * Used by get_cur_path for each ref up to the root.
1881 * Returns 0 if it succeeded.
1882 * Returns 1 if the inode is not existent or got overwritten. In that case, the
1883 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
1884 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
1885 * Returns <0 in case of error.
1886 */
1824static int __get_cur_name_and_parent(struct send_ctx *sctx, 1887static int __get_cur_name_and_parent(struct send_ctx *sctx,
1825 u64 ino, u64 gen, 1888 u64 ino, u64 gen,
1826 u64 *parent_ino, 1889 u64 *parent_ino,
@@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1832 struct btrfs_path *path = NULL; 1895 struct btrfs_path *path = NULL;
1833 struct name_cache_entry *nce = NULL; 1896 struct name_cache_entry *nce = NULL;
1834 1897
1898 /*
1899 * First check if we already did a call to this function with the same
1900 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
1901 * return the cached result.
1902 */
1835 nce = name_cache_search(sctx, ino, gen); 1903 nce = name_cache_search(sctx, ino, gen);
1836 if (nce) { 1904 if (nce) {
1837 if (ino < sctx->send_progress && nce->need_later_update) { 1905 if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1854 if (!path) 1922 if (!path)
1855 return -ENOMEM; 1923 return -ENOMEM;
1856 1924
1925 /*
1926 * If the inode is not existent yet, add the orphan name and return 1.
1927 * This should only happen for the parent dir that we determine in
1928 * __record_new_ref
1929 */
1857 ret = is_inode_existent(sctx, ino, gen); 1930 ret = is_inode_existent(sctx, ino, gen);
1858 if (ret < 0) 1931 if (ret < 0)
1859 goto out; 1932 goto out;
@@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1866 goto out_cache; 1939 goto out_cache;
1867 } 1940 }
1868 1941
1942 /*
1943 * Depending on whether the inode was already processed or not, use
1944 * send_root or parent_root for ref lookup.
1945 */
1869 if (ino < sctx->send_progress) 1946 if (ino < sctx->send_progress)
1870 ret = get_first_ref(sctx, sctx->send_root, ino, 1947 ret = get_first_ref(sctx, sctx->send_root, ino,
1871 parent_ino, parent_gen, dest); 1948 parent_ino, parent_gen, dest);
@@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1875 if (ret < 0) 1952 if (ret < 0)
1876 goto out; 1953 goto out;
1877 1954
1955 /*
1956 * Check if the ref was overwritten by an inode's ref that was processed
1957 * earlier. If yes, treat as orphan and return 1.
1958 */
1878 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, 1959 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
1879 dest->start, dest->end - dest->start); 1960 dest->start, dest->end - dest->start);
1880 if (ret < 0) 1961 if (ret < 0)
@@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1888 } 1969 }
1889 1970
1890out_cache: 1971out_cache:
1972 /*
1973 * Store the result of the lookup in the name cache.
1974 */
1891 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 1975 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
1892 if (!nce) { 1976 if (!nce) {
1893 ret = -ENOMEM; 1977 ret = -ENOMEM;
@@ -1901,7 +1985,6 @@ out_cache:
1901 nce->name_len = fs_path_len(dest); 1985 nce->name_len = fs_path_len(dest);
1902 nce->ret = ret; 1986 nce->ret = ret;
1903 strcpy(nce->name, dest->start); 1987 strcpy(nce->name, dest->start);
1904 memset(&nce->use_list, 0, sizeof(nce->use_list));
1905 1988
1906 if (ino < sctx->send_progress) 1989 if (ino < sctx->send_progress)
1907 nce->need_later_update = 0; 1990 nce->need_later_update = 0;
@@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
2107 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); 2190 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2108 btrfs_release_path(path); 2191 btrfs_release_path(path);
2109 2192
2110 if (ret < 0)
2111 goto out;
2112
2113 if (parent_root) { 2193 if (parent_root) {
2114 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); 2194 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2115 if (ret < 0) 2195 if (ret < 0)
@@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2276 btrfs_inode_mtime(ii)); 2356 btrfs_inode_mtime(ii));
2277 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, 2357 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2278 btrfs_inode_ctime(ii)); 2358 btrfs_inode_ctime(ii));
2279 /* TODO otime? */ 2359 /* TODO Add otime support when the otime patches get into upstream */
2280 2360
2281 ret = send_cmd(sctx); 2361 ret = send_cmd(sctx);
2282 2362
@@ -2292,39 +2372,39 @@ out:
2292 * a valid path yet because we did not process the refs yet. So, the inode 2372 * a valid path yet because we did not process the refs yet. So, the inode
2293 * is created as orphan. 2373 * is created as orphan.
2294 */ 2374 */
2295static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, 2375static int send_create_inode(struct send_ctx *sctx, u64 ino)
2296 struct btrfs_key *key)
2297{ 2376{
2298 int ret = 0; 2377 int ret = 0;
2299 struct extent_buffer *eb = path->nodes[0];
2300 struct btrfs_inode_item *ii;
2301 struct fs_path *p; 2378 struct fs_path *p;
2302 int slot = path->slots[0];
2303 int cmd; 2379 int cmd;
2380 u64 gen;
2304 u64 mode; 2381 u64 mode;
2382 u64 rdev;
2305 2383
2306verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); 2384verbose_printk("btrfs: send_create_inode %llu\n", ino);
2307 2385
2308 p = fs_path_alloc(sctx); 2386 p = fs_path_alloc(sctx);
2309 if (!p) 2387 if (!p)
2310 return -ENOMEM; 2388 return -ENOMEM;
2311 2389
2312 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 2390 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
2313 mode = btrfs_inode_mode(eb, ii); 2391 NULL, &rdev);
2392 if (ret < 0)
2393 goto out;
2314 2394
2315 if (S_ISREG(mode)) 2395 if (S_ISREG(mode)) {
2316 cmd = BTRFS_SEND_C_MKFILE; 2396 cmd = BTRFS_SEND_C_MKFILE;
2317 else if (S_ISDIR(mode)) 2397 } else if (S_ISDIR(mode)) {
2318 cmd = BTRFS_SEND_C_MKDIR; 2398 cmd = BTRFS_SEND_C_MKDIR;
2319 else if (S_ISLNK(mode)) 2399 } else if (S_ISLNK(mode)) {
2320 cmd = BTRFS_SEND_C_SYMLINK; 2400 cmd = BTRFS_SEND_C_SYMLINK;
2321 else if (S_ISCHR(mode) || S_ISBLK(mode)) 2401 } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2322 cmd = BTRFS_SEND_C_MKNOD; 2402 cmd = BTRFS_SEND_C_MKNOD;
2323 else if (S_ISFIFO(mode)) 2403 } else if (S_ISFIFO(mode)) {
2324 cmd = BTRFS_SEND_C_MKFIFO; 2404 cmd = BTRFS_SEND_C_MKFIFO;
2325 else if (S_ISSOCK(mode)) 2405 } else if (S_ISSOCK(mode)) {
2326 cmd = BTRFS_SEND_C_MKSOCK; 2406 cmd = BTRFS_SEND_C_MKSOCK;
2327 else { 2407 } else {
2328 printk(KERN_WARNING "btrfs: unexpected inode type %o", 2408 printk(KERN_WARNING "btrfs: unexpected inode type %o",
2329 (int)(mode & S_IFMT)); 2409 (int)(mode & S_IFMT));
2330 ret = -ENOTSUPP; 2410 ret = -ENOTSUPP;
@@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
2335 if (ret < 0) 2415 if (ret < 0)
2336 goto out; 2416 goto out;
2337 2417
2338 ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); 2418 ret = gen_unique_name(sctx, ino, gen, p);
2339 if (ret < 0) 2419 if (ret < 0)
2340 goto out; 2420 goto out;
2341 2421
2342 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2422 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2343 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); 2423 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2344 2424
2345 if (S_ISLNK(mode)) { 2425 if (S_ISLNK(mode)) {
2346 fs_path_reset(p); 2426 fs_path_reset(p);
2347 ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); 2427 ret = read_symlink(sctx, sctx->send_root, ino, p);
2348 if (ret < 0) 2428 if (ret < 0)
2349 goto out; 2429 goto out;
2350 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2430 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2351 } else if (S_ISCHR(mode) || S_ISBLK(mode) || 2431 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2352 S_ISFIFO(mode) || S_ISSOCK(mode)) { 2432 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2353 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); 2433 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
2354 } 2434 }
2355 2435
2356 ret = send_cmd(sctx); 2436 ret = send_cmd(sctx);
@@ -2364,6 +2444,92 @@ out:
2364 return ret; 2444 return ret;
2365} 2445}
2366 2446
2447/*
2448 * We need some special handling for inodes that get processed before the parent
2449 * directory got created. See process_recorded_refs for details.
2450 * This function does the check if we already created the dir out of order.
2451 */
2452static int did_create_dir(struct send_ctx *sctx, u64 dir)
2453{
2454 int ret = 0;
2455 struct btrfs_path *path = NULL;
2456 struct btrfs_key key;
2457 struct btrfs_key found_key;
2458 struct btrfs_key di_key;
2459 struct extent_buffer *eb;
2460 struct btrfs_dir_item *di;
2461 int slot;
2462
2463 path = alloc_path_for_send();
2464 if (!path) {
2465 ret = -ENOMEM;
2466 goto out;
2467 }
2468
2469 key.objectid = dir;
2470 key.type = BTRFS_DIR_INDEX_KEY;
2471 key.offset = 0;
2472 while (1) {
2473 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2474 1, 0);
2475 if (ret < 0)
2476 goto out;
2477 if (!ret) {
2478 eb = path->nodes[0];
2479 slot = path->slots[0];
2480 btrfs_item_key_to_cpu(eb, &found_key, slot);
2481 }
2482 if (ret || found_key.objectid != key.objectid ||
2483 found_key.type != key.type) {
2484 ret = 0;
2485 goto out;
2486 }
2487
2488 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2489 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2490
2491 if (di_key.objectid < sctx->send_progress) {
2492 ret = 1;
2493 goto out;
2494 }
2495
2496 key.offset = found_key.offset + 1;
2497 btrfs_release_path(path);
2498 }
2499
2500out:
2501 btrfs_free_path(path);
2502 return ret;
2503}
2504
2505/*
2506 * Only creates the inode if it is:
2507 * 1. Not a directory
2508 * 2. Or a directory which was not created already due to out of order
2509 * directories. See did_create_dir and process_recorded_refs for details.
2510 */
2511static int send_create_inode_if_needed(struct send_ctx *sctx)
2512{
2513 int ret;
2514
2515 if (S_ISDIR(sctx->cur_inode_mode)) {
2516 ret = did_create_dir(sctx, sctx->cur_ino);
2517 if (ret < 0)
2518 goto out;
2519 if (ret) {
2520 ret = 0;
2521 goto out;
2522 }
2523 }
2524
2525 ret = send_create_inode(sctx, sctx->cur_ino);
2526 if (ret < 0)
2527 goto out;
2528
2529out:
2530 return ret;
2531}
2532
2367struct recorded_ref { 2533struct recorded_ref {
2368 struct list_head list; 2534 struct list_head list;
2369 char *dir_path; 2535 char *dir_path;
@@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,
2416static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2582static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2417{ 2583{
2418 struct recorded_ref *cur; 2584 struct recorded_ref *cur;
2419 struct recorded_ref *tmp;
2420 2585
2421 list_for_each_entry_safe(cur, tmp, head, list) { 2586 while (!list_empty(head)) {
2587 cur = list_entry(head->next, struct recorded_ref, list);
2422 fs_path_free(sctx, cur->full_path); 2588 fs_path_free(sctx, cur->full_path);
2589 list_del(&cur->list);
2423 kfree(cur); 2590 kfree(cur);
2424 } 2591 }
2425 INIT_LIST_HEAD(head);
2426} 2592}
2427 2593
2428static void free_recorded_refs(struct send_ctx *sctx) 2594static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
2432} 2598}
2433 2599
2434/* 2600/*
2435 * Renames/moves a file/dir to it's orphan name. Used when the first 2601 * Renames/moves a file/dir to its orphan name. Used when the first
2436 * ref of an unprocessed inode gets overwritten and for all non empty 2602 * ref of an unprocessed inode gets overwritten and for all non empty
2437 * directories. 2603 * directories.
2438 */ 2604 */
@@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2472 struct btrfs_key loc; 2638 struct btrfs_key loc;
2473 struct btrfs_dir_item *di; 2639 struct btrfs_dir_item *di;
2474 2640
2641 /*
2642 * Don't try to rmdir the top/root subvolume dir.
2643 */
2644 if (dir == BTRFS_FIRST_FREE_OBJECTID)
2645 return 0;
2646
2475 path = alloc_path_for_send(); 2647 path = alloc_path_for_send();
2476 if (!path) 2648 if (!path)
2477 return -ENOMEM; 2649 return -ENOMEM;
@@ -2513,160 +2685,6 @@ out:
2513 return ret; 2685 return ret;
2514} 2686}
2515 2687
2516struct finish_unordered_dir_ctx {
2517 struct send_ctx *sctx;
2518 struct fs_path *cur_path;
2519 struct fs_path *dir_path;
2520 u64 dir_ino;
2521 int need_delete;
2522 int delete_pass;
2523};
2524
2525int __finish_unordered_dir(int num, struct btrfs_key *di_key,
2526 const char *name, int name_len,
2527 const char *data, int data_len,
2528 u8 type, void *ctx)
2529{
2530 int ret = 0;
2531 struct finish_unordered_dir_ctx *fctx = ctx;
2532 struct send_ctx *sctx = fctx->sctx;
2533 u64 di_gen;
2534 u64 di_mode;
2535 int is_orphan = 0;
2536
2537 if (di_key->objectid >= fctx->dir_ino)
2538 goto out;
2539
2540 fs_path_reset(fctx->cur_path);
2541
2542 ret = get_inode_info(sctx->send_root, di_key->objectid,
2543 NULL, &di_gen, &di_mode, NULL, NULL);
2544 if (ret < 0)
2545 goto out;
2546
2547 ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
2548 fctx->dir_ino, name, name_len);
2549 if (ret < 0)
2550 goto out;
2551 if (ret) {
2552 is_orphan = 1;
2553 ret = gen_unique_name(sctx, di_key->objectid, di_gen,
2554 fctx->cur_path);
2555 } else {
2556 ret = get_cur_path(sctx, di_key->objectid, di_gen,
2557 fctx->cur_path);
2558 }
2559 if (ret < 0)
2560 goto out;
2561
2562 ret = fs_path_add(fctx->dir_path, name, name_len);
2563 if (ret < 0)
2564 goto out;
2565
2566 if (!fctx->delete_pass) {
2567 if (S_ISDIR(di_mode)) {
2568 ret = send_rename(sctx, fctx->cur_path,
2569 fctx->dir_path);
2570 } else {
2571 ret = send_link(sctx, fctx->dir_path,
2572 fctx->cur_path);
2573 if (is_orphan)
2574 fctx->need_delete = 1;
2575 }
2576 } else if (!S_ISDIR(di_mode)) {
2577 ret = send_unlink(sctx, fctx->cur_path);
2578 } else {
2579 ret = 0;
2580 }
2581
2582 fs_path_remove(fctx->dir_path);
2583
2584out:
2585 return ret;
2586}
2587
2588/*
2589 * Go through all dir items and see if we find refs which could not be created
2590 * in the past because the dir did not exist at that time.
2591 */
2592static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
2593{
2594 int ret = 0;
2595 struct btrfs_path *path = NULL;
2596 struct btrfs_key key;
2597 struct btrfs_key found_key;
2598 struct extent_buffer *eb;
2599 struct finish_unordered_dir_ctx fctx;
2600 int slot;
2601
2602 path = alloc_path_for_send();
2603 if (!path) {
2604 ret = -ENOMEM;
2605 goto out;
2606 }
2607
2608 memset(&fctx, 0, sizeof(fctx));
2609 fctx.sctx = sctx;
2610 fctx.cur_path = fs_path_alloc(sctx);
2611 fctx.dir_path = fs_path_alloc(sctx);
2612 if (!fctx.cur_path || !fctx.dir_path) {
2613 ret = -ENOMEM;
2614 goto out;
2615 }
2616 fctx.dir_ino = dir;
2617
2618 ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
2619 if (ret < 0)
2620 goto out;
2621
2622 /*
2623 * We do two passes. The first links in the new refs and the second
2624 * deletes orphans if required. Deletion of orphans is not required for
2625 * directory inodes, as we always have only one ref and use rename
2626 * instead of link for those.
2627 */
2628
2629again:
2630 key.objectid = dir;
2631 key.type = BTRFS_DIR_ITEM_KEY;
2632 key.offset = 0;
2633 while (1) {
2634 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
2635 1, 0);
2636 if (ret < 0)
2637 goto out;
2638 eb = path->nodes[0];
2639 slot = path->slots[0];
2640 btrfs_item_key_to_cpu(eb, &found_key, slot);
2641
2642 if (found_key.objectid != key.objectid ||
2643 found_key.type != key.type) {
2644 btrfs_release_path(path);
2645 break;
2646 }
2647
2648 ret = iterate_dir_item(sctx, sctx->send_root, path,
2649 &found_key, __finish_unordered_dir,
2650 &fctx);
2651 if (ret < 0)
2652 goto out;
2653
2654 key.offset = found_key.offset + 1;
2655 btrfs_release_path(path);
2656 }
2657
2658 if (!fctx.delete_pass && fctx.need_delete) {
2659 fctx.delete_pass = 1;
2660 goto again;
2661 }
2662
2663out:
2664 btrfs_free_path(path);
2665 fs_path_free(sctx, fctx.cur_path);
2666 fs_path_free(sctx, fctx.dir_path);
2667 return ret;
2668}
2669
2670/* 2688/*
2671 * This does all the move/link/unlink/rmdir magic. 2689 * This does all the move/link/unlink/rmdir magic.
2672 */ 2690 */
@@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
2674{ 2692{
2675 int ret = 0; 2693 int ret = 0;
2676 struct recorded_ref *cur; 2694 struct recorded_ref *cur;
2695 struct recorded_ref *cur2;
2677 struct ulist *check_dirs = NULL; 2696 struct ulist *check_dirs = NULL;
2678 struct ulist_iterator uit; 2697 struct ulist_iterator uit;
2679 struct ulist_node *un; 2698 struct ulist_node *un;
@@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
2685 2704
2686verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 2705verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2687 2706
2707 /*
2708 * This should never happen as the root dir always has the same ref
2709 * which is always '..'
2710 */
2711 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2712
2688 valid_path = fs_path_alloc(sctx); 2713 valid_path = fs_path_alloc(sctx);
2689 if (!valid_path) { 2714 if (!valid_path) {
2690 ret = -ENOMEM; 2715 ret = -ENOMEM;
@@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2731 2756
2732 list_for_each_entry(cur, &sctx->new_refs, list) { 2757 list_for_each_entry(cur, &sctx->new_refs, list) {
2733 /* 2758 /*
2759 * We may have refs where the parent directory does not exist
2760 * yet. This happens if the parent directory's inum is higher
2761 * than the current inum. To handle this case, we create the
2762 * parent directory out of order. But we need to check if this
2763 * did already happen before due to other refs in the same dir.
2764 */
2765 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
2766 if (ret < 0)
2767 goto out;
2768 if (ret == inode_state_will_create) {
2769 ret = 0;
2770 /*
2771 * First check if any of the current inode's refs did
2772 * already create the dir.
2773 */
2774 list_for_each_entry(cur2, &sctx->new_refs, list) {
2775 if (cur == cur2)
2776 break;
2777 if (cur2->dir == cur->dir) {
2778 ret = 1;
2779 break;
2780 }
2781 }
2782
2783 /*
2784 * If that did not happen, check if a previous inode
2785 * did already create the dir.
2786 */
2787 if (!ret)
2788 ret = did_create_dir(sctx, cur->dir);
2789 if (ret < 0)
2790 goto out;
2791 if (!ret) {
2792 ret = send_create_inode(sctx, cur->dir);
2793 if (ret < 0)
2794 goto out;
2795 }
2796 }
2797
2798 /*
2734 * Check if this new ref would overwrite the first ref of 2799 * Check if this new ref would overwrite the first ref of
2735 * another unprocessed inode. If yes, orphanize the 2800 * another unprocessed inode. If yes, orphanize the
2736 * overwritten inode. If we find an overwritten ref that is 2801 * overwritten inode. If we find an overwritten ref that is
@@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2764 * inode, move it and update valid_path. If not, link or move 2829 * inode, move it and update valid_path. If not, link or move
2765 * it depending on the inode mode. 2830 * it depending on the inode mode.
2766 */ 2831 */
2767 if (is_orphan && !sctx->cur_inode_first_ref_orphan) { 2832 if (is_orphan) {
2768 ret = send_rename(sctx, valid_path, cur->full_path); 2833 ret = send_rename(sctx, valid_path, cur->full_path);
2769 if (ret < 0) 2834 if (ret < 0)
2770 goto out; 2835 goto out;
@@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2827 if (ret < 0) 2892 if (ret < 0)
2828 goto out; 2893 goto out;
2829 } 2894 }
2895 } else if (S_ISDIR(sctx->cur_inode_mode) &&
2896 !list_empty(&sctx->deleted_refs)) {
2897 /*
2898 * We have a moved dir. Add the old parent to check_dirs
2899 */
2900 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
2901 list);
2902 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2903 GFP_NOFS);
2904 if (ret < 0)
2905 goto out;
2830 } else if (!S_ISDIR(sctx->cur_inode_mode)) { 2906 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
2831 /* 2907 /*
2832 * We have a non dir inode. Go through all deleted refs and 2908 * We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2840 if (ret < 0) 2916 if (ret < 0)
2841 goto out; 2917 goto out;
2842 if (!ret) { 2918 if (!ret) {
2843 /* 2919 ret = send_unlink(sctx, cur->full_path);
2844 * In case the inode was moved to a directory 2920 if (ret < 0)
2845 * that was not created yet (see 2921 goto out;
2846 * __record_new_ref), we can not unlink the ref
2847 * as it will be needed later when the parent
2848 * directory is created, so that we can move in
2849 * the inode to the new dir.
2850 */
2851 if (!is_orphan &&
2852 sctx->cur_inode_first_ref_orphan) {
2853 ret = orphanize_inode(sctx,
2854 sctx->cur_ino,
2855 sctx->cur_inode_gen,
2856 cur->full_path);
2857 if (ret < 0)
2858 goto out;
2859 ret = gen_unique_name(sctx,
2860 sctx->cur_ino,
2861 sctx->cur_inode_gen,
2862 valid_path);
2863 if (ret < 0)
2864 goto out;
2865 is_orphan = 1;
2866
2867 } else {
2868 ret = send_unlink(sctx, cur->full_path);
2869 if (ret < 0)
2870 goto out;
2871 }
2872 } 2922 }
2873 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, 2923 ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
2874 GFP_NOFS); 2924 GFP_NOFS);
@@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2880 * If the inode is still orphan, unlink the orphan. This may 2930 * If the inode is still orphan, unlink the orphan. This may
2881 * happen when a previous inode did overwrite the first ref 2931 * happen when a previous inode did overwrite the first ref
2882 * of this inode and no new refs were added for the current 2932 * of this inode and no new refs were added for the current
2883 * inode. 2933 * inode. Unlinking does not mean that the inode is deleted in
2884 * We can however not delete the orphan in case the inode relies 2934 * all cases. There may still be links to this inode in other
2885 * in a directory that was not created yet (see 2935 * places.
2886 * __record_new_ref)
2887 */ 2936 */
2888 if (is_orphan && !sctx->cur_inode_first_ref_orphan) { 2937 if (is_orphan) {
2889 ret = send_unlink(sctx, valid_path); 2938 ret = send_unlink(sctx, valid_path);
2890 if (ret < 0) 2939 if (ret < 0)
2891 goto out; 2940 goto out;
@@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2900 */ 2949 */
2901 ULIST_ITER_INIT(&uit); 2950 ULIST_ITER_INIT(&uit);
2902 while ((un = ulist_next(check_dirs, &uit))) { 2951 while ((un = ulist_next(check_dirs, &uit))) {
2952 /*
2953 * In case we had refs into dirs that were not processed yet,
2954 * we don't need to do the utime and rmdir logic for these dirs.
2955 * The dir will be processed later.
2956 */
2903 if (un->val > sctx->cur_ino) 2957 if (un->val > sctx->cur_ino)
2904 continue; 2958 continue;
2905 2959
@@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2929 } 2983 }
2930 } 2984 }
2931 2985
2932 /*
2933 * Current inode is now at it's new position, so we must increase
2934 * send_progress
2935 */
2936 sctx->send_progress = sctx->cur_ino + 1;
2937
2938 /*
2939 * We may have a directory here that has pending refs which could not
2940 * be created before (because the dir did not exist before, see
2941 * __record_new_ref). finish_outoforder_dir will link/move the pending
2942 * refs.
2943 */
2944 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
2945 ret = finish_outoforder_dir(sctx, sctx->cur_ino,
2946 sctx->cur_inode_gen);
2947 if (ret < 0)
2948 goto out;
2949 }
2950
2951 ret = 0; 2986 ret = 0;
2952 2987
2953out: 2988out:
@@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,
2971 return -ENOMEM; 3006 return -ENOMEM;
2972 3007
2973 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3008 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
2974 NULL); 3009 NULL, NULL);
2975 if (ret < 0)
2976 goto out;
2977
2978 /*
2979 * The parent may be non-existent at this point in time. This happens
2980 * if the ino of the parent dir is higher then the current ino. In this
2981 * case, we can not process this ref until the parent dir is finally
2982 * created. If we reach the parent dir later, process_recorded_refs
2983 * will go through all dir items and process the refs that could not be
2984 * processed before. In case this is the first ref, we set
2985 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
2986 * keep an orphan of the inode so that it later can be used for
2987 * link/move
2988 */
2989 ret = is_inode_existent(sctx, dir, gen);
2990 if (ret < 0) 3010 if (ret < 0)
2991 goto out; 3011 goto out;
2992 if (!ret) {
2993 ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
2994 name->start, fs_path_len(name));
2995 if (ret < 0)
2996 goto out;
2997 if (ret)
2998 sctx->cur_inode_first_ref_orphan = 1;
2999 ret = 0;
3000 goto out;
3001 }
3002 3012
3003 ret = get_cur_path(sctx, dir, gen, p); 3013 ret = get_cur_path(sctx, dir, gen, p);
3004 if (ret < 0) 3014 if (ret < 0)
@@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3029 return -ENOMEM; 3039 return -ENOMEM;
3030 3040
3031 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, 3041 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3032 NULL); 3042 NULL, NULL);
3033 if (ret < 0) 3043 if (ret < 0)
3034 goto out; 3044 goto out;
3035 3045
@@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,
3206 key.offset = 0; 3216 key.offset = 0;
3207 while (1) { 3217 while (1) {
3208 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3218 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
3209 if (ret < 0) { 3219 if (ret < 0)
3210 btrfs_release_path(path);
3211 goto out; 3220 goto out;
3212 } 3221 if (ret)
3213 if (ret) {
3214 btrfs_release_path(path);
3215 break; 3222 break;
3216 }
3217 3223
3218 eb = path->nodes[0]; 3224 eb = path->nodes[0];
3219 slot = path->slots[0]; 3225 slot = path->slots[0];
3220 btrfs_item_key_to_cpu(eb, &found_key, slot); 3226 btrfs_item_key_to_cpu(eb, &found_key, slot);
3221 3227
3222 if (found_key.objectid != key.objectid || 3228 if (found_key.objectid != key.objectid ||
3223 found_key.type != key.type) { 3229 found_key.type != key.type)
3224 btrfs_release_path(path);
3225 break; 3230 break;
3226 }
3227 3231
3228 ret = iterate_inode_ref(sctx, sctx->parent_root, path, 3232 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
3229 &found_key, 0, cb, sctx); 3233 sctx);
3230 btrfs_release_path(path); 3234 btrfs_release_path(path);
3231 if (ret < 0) 3235 if (ret < 0)
3232 goto out; 3236 goto out;
3233 3237
3234 key.offset = found_key.offset + 1; 3238 key.offset = found_key.offset + 1;
3235 } 3239 }
3240 btrfs_release_path(path);
3236 3241
3237 ret = process_recorded_refs(sctx); 3242 ret = process_recorded_refs(sctx);
3238 3243
@@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3555 int ret = 0; 3560 int ret = 0;
3556 struct fs_path *p; 3561 struct fs_path *p;
3557 loff_t pos = offset; 3562 loff_t pos = offset;
3558 int readed = 0; 3563 int num_read = 0;
3559 mm_segment_t old_fs; 3564 mm_segment_t old_fs;
3560 3565
3561 p = fs_path_alloc(sctx); 3566 p = fs_path_alloc(sctx);
@@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3580 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); 3585 ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
3581 if (ret < 0) 3586 if (ret < 0)
3582 goto out; 3587 goto out;
3583 readed = ret; 3588 num_read = ret;
3584 if (!readed) 3589 if (!num_read)
3585 goto out; 3590 goto out;
3586 3591
3587 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 3592 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3594 3599
3595 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 3600 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3596 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 3601 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
3597 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); 3602 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
3598 3603
3599 ret = send_cmd(sctx); 3604 ret = send_cmd(sctx);
3600 3605
@@ -3604,7 +3609,7 @@ out:
3604 set_fs(old_fs); 3609 set_fs(old_fs);
3605 if (ret < 0) 3610 if (ret < 0)
3606 return ret; 3611 return ret;
3607 return readed; 3612 return num_read;
3608} 3613}
3609 3614
3610/* 3615/*
@@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx,
3615 struct clone_root *clone_root) 3620 struct clone_root *clone_root)
3616{ 3621{
3617 int ret = 0; 3622 int ret = 0;
3618 struct btrfs_root *clone_root2 = clone_root->root;
3619 struct fs_path *p; 3623 struct fs_path *p;
3620 u64 gen; 3624 u64 gen;
3621 3625
@@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3640 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); 3644 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
3641 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 3645 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
3642 3646
3643 if (clone_root2 == sctx->send_root) { 3647 if (clone_root->root == sctx->send_root) {
3644 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, 3648 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
3645 &gen, NULL, NULL, NULL); 3649 &gen, NULL, NULL, NULL, NULL);
3646 if (ret < 0) 3650 if (ret < 0)
3647 goto out; 3651 goto out;
3648 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3652 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3649 } else { 3653 } else {
3650 ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); 3654 ret = get_inode_path(sctx, clone_root->root,
3655 clone_root->ino, p);
3651 } 3656 }
3652 if (ret < 0) 3657 if (ret < 0)
3653 goto out; 3658 goto out;
3654 3659
3655 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 3660 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3656 clone_root2->root_item.uuid); 3661 clone_root->root->root_item.uuid);
3657 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 3662 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3658 clone_root2->root_item.ctransid); 3663 clone_root->root->root_item.ctransid);
3659 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 3664 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3660 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 3665 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3661 clone_root->offset); 3666 clone_root->offset);
@@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
3684 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3689 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3685 struct btrfs_file_extent_item); 3690 struct btrfs_file_extent_item);
3686 type = btrfs_file_extent_type(path->nodes[0], ei); 3691 type = btrfs_file_extent_type(path->nodes[0], ei);
3687 if (type == BTRFS_FILE_EXTENT_INLINE) 3692 if (type == BTRFS_FILE_EXTENT_INLINE) {
3688 len = btrfs_file_extent_inline_len(path->nodes[0], ei); 3693 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
3689 else 3694 /*
3695 * it is possible the inline item won't cover the whole page,
3696 * but there may be items after this page. Make
3697 * sure to send the whole thing
3698 */
3699 len = PAGE_CACHE_ALIGN(len);
3700 } else {
3690 len = btrfs_file_extent_num_bytes(path->nodes[0], ei); 3701 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3702 }
3691 3703
3692 if (offset + len > sctx->cur_inode_size) 3704 if (offset + len > sctx->cur_inode_size)
3693 len = sctx->cur_inode_size - offset; 3705 len = sctx->cur_inode_size - offset;
@@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3735 u64 left_offset_fixed; 3747 u64 left_offset_fixed;
3736 u64 left_len; 3748 u64 left_len;
3737 u64 right_len; 3749 u64 right_len;
3750 u64 left_gen;
3751 u64 right_gen;
3738 u8 left_type; 3752 u8 left_type;
3739 u8 right_type; 3753 u8 right_type;
3740 3754
@@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3744 3758
3745 eb = left_path->nodes[0]; 3759 eb = left_path->nodes[0];
3746 slot = left_path->slots[0]; 3760 slot = left_path->slots[0];
3747
3748 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 3761 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
3749 left_type = btrfs_file_extent_type(eb, ei); 3762 left_type = btrfs_file_extent_type(eb, ei);
3750 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3751 left_len = btrfs_file_extent_num_bytes(eb, ei);
3752 left_offset = btrfs_file_extent_offset(eb, ei);
3753 3763
3754 if (left_type != BTRFS_FILE_EXTENT_REG) { 3764 if (left_type != BTRFS_FILE_EXTENT_REG) {
3755 ret = 0; 3765 ret = 0;
3756 goto out; 3766 goto out;
3757 } 3767 }
3768 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3769 left_len = btrfs_file_extent_num_bytes(eb, ei);
3770 left_offset = btrfs_file_extent_offset(eb, ei);
3771 left_gen = btrfs_file_extent_generation(eb, ei);
3758 3772
3759 /* 3773 /*
3760 * Following comments will refer to these graphics. L is the left 3774 * Following comments will refer to these graphics. L is the left
@@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3810 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 3824 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
3811 right_len = btrfs_file_extent_num_bytes(eb, ei); 3825 right_len = btrfs_file_extent_num_bytes(eb, ei);
3812 right_offset = btrfs_file_extent_offset(eb, ei); 3826 right_offset = btrfs_file_extent_offset(eb, ei);
3827 right_gen = btrfs_file_extent_generation(eb, ei);
3813 3828
3814 if (right_type != BTRFS_FILE_EXTENT_REG) { 3829 if (right_type != BTRFS_FILE_EXTENT_REG) {
3815 ret = 0; 3830 ret = 0;
@@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3820 * Are we at extent 8? If yes, we know the extent is changed. 3835 * Are we at extent 8? If yes, we know the extent is changed.
3821 * This may only happen on the first iteration. 3836 * This may only happen on the first iteration.
3822 */ 3837 */
3823 if (found_key.offset + right_len < ekey->offset) { 3838 if (found_key.offset + right_len <= ekey->offset) {
3824 ret = 0; 3839 ret = 0;
3825 goto out; 3840 goto out;
3826 } 3841 }
@@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
3837 /* 3852 /*
3838 * Check if we have the same extent. 3853 * Check if we have the same extent.
3839 */ 3854 */
3840 if (left_disknr + left_offset_fixed != 3855 if (left_disknr != right_disknr ||
3841 right_disknr + right_offset) { 3856 left_offset_fixed != right_offset ||
3857 left_gen != right_gen) {
3842 ret = 0; 3858 ret = 0;
3843 goto out; 3859 goto out;
3844 } 3860 }
@@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
3977 goto out; 3993 goto out;
3978 3994
3979 ret = process_recorded_refs(sctx); 3995 ret = process_recorded_refs(sctx);
3996 if (ret < 0)
3997 goto out;
3998
3999 /*
4000 * We have processed the refs and thus need to advance send_progress.
4001 * Now, calls to get_cur_xxx will take the updated refs of the current
4002 * inode into account.
4003 */
4004 sctx->send_progress = sctx->cur_ino + 1;
3980 4005
3981out: 4006out:
3982 return ret; 4007 return ret;
@@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4004 goto out; 4029 goto out;
4005 4030
4006 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, 4031 ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
4007 &left_mode, &left_uid, &left_gid); 4032 &left_mode, &left_uid, &left_gid, NULL);
4008 if (ret < 0) 4033 if (ret < 0)
4009 goto out; 4034 goto out;
4010 4035
@@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4015 } else { 4040 } else {
4016 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, 4041 ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
4017 NULL, NULL, &right_mode, &right_uid, 4042 NULL, NULL, &right_mode, &right_uid,
4018 &right_gid); 4043 &right_gid, NULL);
4019 if (ret < 0) 4044 if (ret < 0)
4020 goto out; 4045 goto out;
4021 4046
@@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,
4074 4099
4075 sctx->cur_ino = key->objectid; 4100 sctx->cur_ino = key->objectid;
4076 sctx->cur_inode_new_gen = 0; 4101 sctx->cur_inode_new_gen = 0;
4077 sctx->cur_inode_first_ref_orphan = 0; 4102
4103 /*
4104 * Set send_progress to current inode. This will tell all get_cur_xxx
4105 * functions that the current inode's refs are not updated yet. Later,
4106 * when process_recorded_refs is finished, it is set to cur_ino + 1.
4107 */
4078 sctx->send_progress = sctx->cur_ino; 4108 sctx->send_progress = sctx->cur_ino;
4079 4109
4080 if (result == BTRFS_COMPARE_TREE_NEW || 4110 if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,
4098 4128
4099 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 4129 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4100 right_ii); 4130 right_ii);
4101 if (left_gen != right_gen) 4131
4132 /*
4133 * The cur_ino = root dir case is special here. We can't treat
4134 * the inode as deleted+reused because it would generate a
4135 * stream that tries to delete/mkdir the root dir.
4136 */
4137 if (left_gen != right_gen &&
4138 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4102 sctx->cur_inode_new_gen = 1; 4139 sctx->cur_inode_new_gen = 1;
4103 } 4140 }
4104 4141
@@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,
4111 sctx->cur_inode_mode = btrfs_inode_mode( 4148 sctx->cur_inode_mode = btrfs_inode_mode(
4112 sctx->left_path->nodes[0], left_ii); 4149 sctx->left_path->nodes[0], left_ii);
4113 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 4150 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4114 ret = send_create_inode(sctx, sctx->left_path, 4151 ret = send_create_inode_if_needed(sctx);
4115 sctx->cmp_key);
4116 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 4152 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4117 sctx->cur_inode_gen = right_gen; 4153 sctx->cur_inode_gen = right_gen;
4118 sctx->cur_inode_new = 0; 4154 sctx->cur_inode_new = 0;
@@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,
4122 sctx->cur_inode_mode = btrfs_inode_mode( 4158 sctx->cur_inode_mode = btrfs_inode_mode(
4123 sctx->right_path->nodes[0], right_ii); 4159 sctx->right_path->nodes[0], right_ii);
4124 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 4160 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4161 /*
4162 * We need to do some special handling in case the inode was
4163 * reported as changed with a changed generation number. This
4164 * means that the original inode was deleted and a new inode
4165 * reused the same inum. So we have to treat the old inode as
4166 * deleted and the new one as new.
4167 */
4125 if (sctx->cur_inode_new_gen) { 4168 if (sctx->cur_inode_new_gen) {
4169 /*
4170 * First, process the inode as if it was deleted.
4171 */
4126 sctx->cur_inode_gen = right_gen; 4172 sctx->cur_inode_gen = right_gen;
4127 sctx->cur_inode_new = 0; 4173 sctx->cur_inode_new = 0;
4128 sctx->cur_inode_deleted = 1; 4174 sctx->cur_inode_deleted = 1;
@@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,
4135 if (ret < 0) 4181 if (ret < 0)
4136 goto out; 4182 goto out;
4137 4183
4184 /*
4185 * Now process the inode as if it was new.
4186 */
4138 sctx->cur_inode_gen = left_gen; 4187 sctx->cur_inode_gen = left_gen;
4139 sctx->cur_inode_new = 1; 4188 sctx->cur_inode_new = 1;
4140 sctx->cur_inode_deleted = 0; 4189 sctx->cur_inode_deleted = 0;
@@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,
4142 sctx->left_path->nodes[0], left_ii); 4191 sctx->left_path->nodes[0], left_ii);
4143 sctx->cur_inode_mode = btrfs_inode_mode( 4192 sctx->cur_inode_mode = btrfs_inode_mode(
4144 sctx->left_path->nodes[0], left_ii); 4193 sctx->left_path->nodes[0], left_ii);
4145 ret = send_create_inode(sctx, sctx->left_path, 4194 ret = send_create_inode_if_needed(sctx);
4146 sctx->cmp_key);
4147 if (ret < 0) 4195 if (ret < 0)
4148 goto out; 4196 goto out;
4149 4197
4150 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 4198 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4151 if (ret < 0) 4199 if (ret < 0)
4152 goto out; 4200 goto out;
4201 /*
4202 * Advance send_progress now as we did not get into
4203 * process_recorded_refs_if_needed in the new_gen case.
4204 */
4205 sctx->send_progress = sctx->cur_ino + 1;
4206
4207 /*
4208 * Now process all extents and xattrs of the inode as if
4209 * they were all new.
4210 */
4153 ret = process_all_extents(sctx); 4211 ret = process_all_extents(sctx);
4154 if (ret < 0) 4212 if (ret < 0)
4155 goto out; 4213 goto out;
@@ -4172,6 +4230,16 @@ out:
4172 return ret; 4230 return ret;
4173} 4231}
4174 4232
4233/*
4234 * We have to process new refs before deleted refs, but compare_trees gives us
4235 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
4236 * first and later process them in process_recorded_refs.
4237 * For the cur_inode_new_gen case, we skip recording completely because
4238 * changed_inode did already initiate processing of refs. The reason for this is
4239 * that in this case, compare_tree actually compares the refs of 2 different
4240 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
4241 * refs of the right tree as deleted and all refs of the left tree as new.
4242 */
4175static int changed_ref(struct send_ctx *sctx, 4243static int changed_ref(struct send_ctx *sctx,
4176 enum btrfs_compare_tree_result result) 4244 enum btrfs_compare_tree_result result)
4177{ 4245{
@@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,
4192 return ret; 4260 return ret;
4193} 4261}
4194 4262
4263/*
4264 * Process new/deleted/changed xattrs. We skip processing in the
4265 * cur_inode_new_gen case because changed_inode did already initiate processing
4266 * of xattrs. The reason is the same as in changed_ref
4267 */
4195static int changed_xattr(struct send_ctx *sctx, 4268static int changed_xattr(struct send_ctx *sctx,
4196 enum btrfs_compare_tree_result result) 4269 enum btrfs_compare_tree_result result)
4197{ 4270{
@@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,
4211 return ret; 4284 return ret;
4212} 4285}
4213 4286
4287/*
4288 * Process new/deleted/changed extents. We skip processing in the
4289 * cur_inode_new_gen case because changed_inode did already initiate processing
4290 * of extents. The reason is the same as in changed_ref
4291 */
4214static int changed_extent(struct send_ctx *sctx, 4292static int changed_extent(struct send_ctx *sctx,
4215 enum btrfs_compare_tree_result result) 4293 enum btrfs_compare_tree_result result)
4216{ 4294{
@@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,
4227 return ret; 4305 return ret;
4228} 4306}
4229 4307
4230 4308/*
4309 * Updates compare related fields in sctx and simply forwards to the actual
4310 * changed_xxx functions.
4311 */
4231static int changed_cb(struct btrfs_root *left_root, 4312static int changed_cb(struct btrfs_root *left_root,
4232 struct btrfs_root *right_root, 4313 struct btrfs_root *right_root,
4233 struct btrfs_path *left_path, 4314 struct btrfs_path *left_path,
@@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,
4247 if (ret < 0) 4328 if (ret < 0)
4248 goto out; 4329 goto out;
4249 4330
4331 /* Ignore non-FS objects */
4332 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
4333 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
4334 goto out;
4335
4250 if (key->type == BTRFS_INODE_ITEM_KEY) 4336 if (key->type == BTRFS_INODE_ITEM_KEY)
4251 ret = changed_inode(sctx, result); 4337 ret = changed_inode(sctx, result);
4252 else if (key->type == BTRFS_INODE_REF_KEY) 4338 else if (key->type == BTRFS_INODE_REF_KEY)
@@ -4299,7 +4385,8 @@ join_trans:
4299 } 4385 }
4300 4386
4301 /* 4387 /*
4302 * Make sure the tree has not changed 4388 * Make sure the tree has not changed after re-joining. We detect this
4389 * by comparing start_ctransid and ctransid. They should always match.
4303 */ 4390 */
4304 spin_lock(&send_root->root_times_lock); 4391 spin_lock(&send_root->root_times_lock);
4305 ctransid = btrfs_root_ctransid(&send_root->root_item); 4392 ctransid = btrfs_root_ctransid(&send_root->root_item);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57f..1bf4f32fd4ef 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
130 130
131#ifdef __KERNEL__ 131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
133#endif 134#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c220..915ac14c2064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
243 struct btrfs_root *root, const char *function, 243 struct btrfs_root *root, const char *function,
244 unsigned int line, int errno) 244 unsigned int line, int errno)
245{ 245{
246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); 246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
247 trans->aborted = errno; 247 trans->aborted = errno;
248 /* Nothing used. The other threads that have joined this 248 /* Nothing used. The other threads that have joined this
249 * transaction may be able to continue. */ 249 * transaction may be able to continue. */
250 if (!trans->blocks_used) { 250 if (!trans->blocks_used) {
251 btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); 251 char nbuf[16];
252 const char *errstr;
253
254 errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
255 btrfs_printk(root->fs_info,
256 "%s:%d: Aborting unused transaction(%s).\n",
257 function, line, errstr);
252 return; 258 return;
253 } 259 }
254 trans->transaction->aborted = errno; 260 trans->transaction->aborted = errno;
@@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
407 btrfs_set_opt(info->mount_opt, NODATASUM); 413 btrfs_set_opt(info->mount_opt, NODATASUM);
408 break; 414 break;
409 case Opt_nodatacow: 415 case Opt_nodatacow:
410 printk(KERN_INFO "btrfs: setting nodatacow\n"); 416 if (!btrfs_test_opt(root, COMPRESS) ||
417 !btrfs_test_opt(root, FORCE_COMPRESS)) {
418 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
419 } else {
420 printk(KERN_INFO "btrfs: setting nodatacow\n");
421 }
422 info->compress_type = BTRFS_COMPRESS_NONE;
423 btrfs_clear_opt(info->mount_opt, COMPRESS);
424 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
411 btrfs_set_opt(info->mount_opt, NODATACOW); 425 btrfs_set_opt(info->mount_opt, NODATACOW);
412 btrfs_set_opt(info->mount_opt, NODATASUM); 426 btrfs_set_opt(info->mount_opt, NODATASUM);
413 break; 427 break;
@@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 compress_type = "zlib"; 436 compress_type = "zlib";
423 info->compress_type = BTRFS_COMPRESS_ZLIB; 437 info->compress_type = BTRFS_COMPRESS_ZLIB;
424 btrfs_set_opt(info->mount_opt, COMPRESS); 438 btrfs_set_opt(info->mount_opt, COMPRESS);
439 btrfs_clear_opt(info->mount_opt, NODATACOW);
440 btrfs_clear_opt(info->mount_opt, NODATASUM);
425 } else if (strcmp(args[0].from, "lzo") == 0) { 441 } else if (strcmp(args[0].from, "lzo") == 0) {
426 compress_type = "lzo"; 442 compress_type = "lzo";
427 info->compress_type = BTRFS_COMPRESS_LZO; 443 info->compress_type = BTRFS_COMPRESS_LZO;
428 btrfs_set_opt(info->mount_opt, COMPRESS); 444 btrfs_set_opt(info->mount_opt, COMPRESS);
445 btrfs_clear_opt(info->mount_opt, NODATACOW);
446 btrfs_clear_opt(info->mount_opt, NODATASUM);
429 btrfs_set_fs_incompat(info, COMPRESS_LZO); 447 btrfs_set_fs_incompat(info, COMPRESS_LZO);
430 } else if (strncmp(args[0].from, "no", 2) == 0) { 448 } else if (strncmp(args[0].from, "no", 2) == 0) {
431 compress_type = "no"; 449 compress_type = "no";
@@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
543 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 561 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
544 break; 562 break;
545 case Opt_defrag: 563 case Opt_defrag:
546 printk(KERN_INFO "btrfs: enabling auto defrag"); 564 printk(KERN_INFO "btrfs: enabling auto defrag\n");
547 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 565 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
548 break; 566 break;
549 case Opt_recovery: 567 case Opt_recovery:
550 printk(KERN_INFO "btrfs: enabling auto recovery"); 568 printk(KERN_INFO "btrfs: enabling auto recovery\n");
551 btrfs_set_opt(info->mount_opt, RECOVERY); 569 btrfs_set_opt(info->mount_opt, RECOVERY);
552 break; 570 break;
553 case Opt_skip_balance: 571 case Opt_skip_balance:
@@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
846 return 0; 864 return 0;
847 } 865 }
848 866
849 btrfs_wait_ordered_extents(root, 0, 0); 867 btrfs_wait_ordered_extents(root, 0);
850
851 spin_lock(&fs_info->trans_lock);
852 if (!fs_info->running_transaction) {
853 spin_unlock(&fs_info->trans_lock);
854 return 0;
855 }
856 spin_unlock(&fs_info->trans_lock);
857 868
858 trans = btrfs_join_transaction(root); 869 trans = btrfs_attach_transaction(root);
859 if (IS_ERR(trans)) 870 if (IS_ERR(trans)) {
871 /* no transaction, don't bother */
872 if (PTR_ERR(trans) == -ENOENT)
873 return 0;
860 return PTR_ERR(trans); 874 return PTR_ERR(trans);
875 }
861 return btrfs_commit_transaction(trans, root); 876 return btrfs_commit_transaction(trans, root);
862} 877}
863 878
@@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1508 1523
1509static int btrfs_freeze(struct super_block *sb) 1524static int btrfs_freeze(struct super_block *sb)
1510{ 1525{
1511 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1526 struct btrfs_trans_handle *trans;
1512 mutex_lock(&fs_info->transaction_kthread_mutex); 1527 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1513 mutex_lock(&fs_info->cleaner_mutex); 1528
1514 return 0; 1529 trans = btrfs_attach_transaction(root);
1530 if (IS_ERR(trans)) {
1531 /* no transaction, don't bother */
1532 if (PTR_ERR(trans) == -ENOENT)
1533 return 0;
1534 return PTR_ERR(trans);
1535 }
1536 return btrfs_commit_transaction(trans, root);
1515} 1537}
1516 1538
1517static int btrfs_unfreeze(struct super_block *sb) 1539static int btrfs_unfreeze(struct super_block *sb)
1518{ 1540{
1519 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1520 mutex_unlock(&fs_info->cleaner_mutex);
1521 mutex_unlock(&fs_info->transaction_kthread_mutex);
1522 return 0; 1541 return 0;
1523} 1542}
1524 1543
@@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)
1595static void btrfs_interface_exit(void) 1614static void btrfs_interface_exit(void)
1596{ 1615{
1597 if (misc_deregister(&btrfs_misc) < 0) 1616 if (misc_deregister(&btrfs_misc) < 0)
1598 printk(KERN_INFO "misc_deregister failed for control device"); 1617 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1599} 1618}
1600 1619
1601static int __init init_btrfs_fs(void) 1620static int __init init_btrfs_fs(void)
@@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)
1620 if (err) 1639 if (err)
1621 goto free_extent_io; 1640 goto free_extent_io;
1622 1641
1623 err = btrfs_delayed_inode_init(); 1642 err = ordered_data_init();
1624 if (err) 1643 if (err)
1625 goto free_extent_map; 1644 goto free_extent_map;
1626 1645
1646 err = btrfs_delayed_inode_init();
1647 if (err)
1648 goto free_ordered_data;
1649
1627 err = btrfs_interface_init(); 1650 err = btrfs_interface_init();
1628 if (err) 1651 if (err)
1629 goto free_delayed_inode; 1652 goto free_delayed_inode;
@@ -1641,6 +1664,8 @@ unregister_ioctl:
1641 btrfs_interface_exit(); 1664 btrfs_interface_exit();
1642free_delayed_inode: 1665free_delayed_inode:
1643 btrfs_delayed_inode_exit(); 1666 btrfs_delayed_inode_exit();
1667free_ordered_data:
1668 ordered_data_exit();
1644free_extent_map: 1669free_extent_map:
1645 extent_map_exit(); 1670 extent_map_exit();
1646free_extent_io: 1671free_extent_io:
@@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)
1657{ 1682{
1658 btrfs_destroy_cachep(); 1683 btrfs_destroy_cachep();
1659 btrfs_delayed_inode_exit(); 1684 btrfs_delayed_inode_exit();
1685 ordered_data_exit();
1660 extent_map_exit(); 1686 extent_map_exit();
1661 extent_io_exit(); 1687 extent_io_exit();
1662 btrfs_interface_exit(); 1688 btrfs_interface_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e050..77db875b5116 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
53/* 53/*
54 * either allocate a new transaction or hop into the existing one 54 * either allocate a new transaction or hop into the existing one
55 */ 55 */
56static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int type)
57{ 57{
58 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info; 59 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
67 } 67 }
68 68
69 if (fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
70 if (!nofail) { 70 /*
71 * If we are JOIN_NOLOCK we're already committing a current
72 * transaction, we just need a handle to deal with something
73 * when committing the transaction, such as inode cache and
74 * space cache. It is a special case.
75 */
76 if (type != TRANS_JOIN_NOLOCK) {
71 spin_unlock(&fs_info->trans_lock); 77 spin_unlock(&fs_info->trans_lock);
72 return -EBUSY; 78 return -EBUSY;
73 } 79 }
@@ -87,6 +93,13 @@ loop:
87 } 93 }
88 spin_unlock(&fs_info->trans_lock); 94 spin_unlock(&fs_info->trans_lock);
89 95
96 /*
97 * If we are ATTACH, we just want to catch the current transaction,
98 * and commit it. If there is no transaction, just return ENOENT.
99 */
100 if (type == TRANS_ATTACH)
101 return -ENOENT;
102
90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 103 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
91 if (!cur_trans) 104 if (!cur_trans)
92 return -ENOMEM; 105 return -ENOMEM;
@@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)
267 } 280 }
268} 281}
269 282
270enum btrfs_trans_type {
271 TRANS_START,
272 TRANS_JOIN,
273 TRANS_USERSPACE,
274 TRANS_JOIN_NOLOCK,
275};
276
277static int may_wait_transaction(struct btrfs_root *root, int type) 283static int may_wait_transaction(struct btrfs_root *root, int type)
278{ 284{
279 if (root->fs_info->log_root_recovering) 285 if (root->fs_info->log_root_recovering)
@@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
290} 296}
291 297
292static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
293 u64 num_items, int type) 299 u64 num_items, int type,
300 int noflush)
294{ 301{
295 struct btrfs_trans_handle *h; 302 struct btrfs_trans_handle *h;
296 struct btrfs_transaction *cur_trans; 303 struct btrfs_transaction *cur_trans;
@@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
324 } 331 }
325 332
326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327 ret = btrfs_block_rsv_add(root, 334 if (noflush)
328 &root->fs_info->trans_block_rsv, 335 ret = btrfs_block_rsv_add_noflush(root,
329 num_bytes); 336 &root->fs_info->trans_block_rsv,
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
330 if (ret) 342 if (ret)
331 return ERR_PTR(ret); 343 return ERR_PTR(ret);
332 } 344 }
@@ -335,19 +347,34 @@ again:
335 if (!h) 347 if (!h)
336 return ERR_PTR(-ENOMEM); 348 return ERR_PTR(-ENOMEM);
337 349
338 sb_start_intwrite(root->fs_info->sb); 350 /*
351 * If we are JOIN_NOLOCK we're already committing a transaction and
352 * waiting on this guy, so we don't need to do the sb_start_intwrite
353 * because we're already holding a ref. We need this because we could
354 * have raced in and done an fsync() on a file which can kick a commit
355 * and then we deadlock with somebody doing a freeze.
356 *
357 * If we are ATTACH, it means we just want to catch the current
358 * transaction and commit it, so we needn't do sb_start_intwrite().
359 */
360 if (type < TRANS_JOIN_NOLOCK)
361 sb_start_intwrite(root->fs_info->sb);
339 362
340 if (may_wait_transaction(root, type)) 363 if (may_wait_transaction(root, type))
341 wait_current_trans(root); 364 wait_current_trans(root);
342 365
343 do { 366 do {
344 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); 367 ret = join_transaction(root, type);
345 if (ret == -EBUSY) 368 if (ret == -EBUSY)
346 wait_current_trans(root); 369 wait_current_trans(root);
347 } while (ret == -EBUSY); 370 } while (ret == -EBUSY);
348 371
349 if (ret < 0) { 372 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb); 373 /* We must get the transaction if we are JOIN_NOLOCK. */
374 BUG_ON(type == TRANS_JOIN_NOLOCK);
375
376 if (type < TRANS_JOIN_NOLOCK)
377 sb_end_intwrite(root->fs_info->sb);
351 kmem_cache_free(btrfs_trans_handle_cachep, h); 378 kmem_cache_free(btrfs_trans_handle_cachep, h);
352 return ERR_PTR(ret); 379 return ERR_PTR(ret);
353 } 380 }
@@ -367,7 +394,9 @@ again:
367 h->aborted = 0; 394 h->aborted = 0;
368 h->qgroup_reserved = qgroup_reserved; 395 h->qgroup_reserved = qgroup_reserved;
369 h->delayed_ref_elem.seq = 0; 396 h->delayed_ref_elem.seq = 0;
397 h->type = type;
370 INIT_LIST_HEAD(&h->qgroup_ref_list); 398 INIT_LIST_HEAD(&h->qgroup_ref_list);
399 INIT_LIST_HEAD(&h->new_bgs);
371 400
372 smp_mb(); 401 smp_mb();
373 if (cur_trans->blocked && may_wait_transaction(root, type)) { 402 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +422,33 @@ got_it:
393struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
394 int num_items) 423 int num_items)
395{ 424{
396 return start_transaction(root, num_items, TRANS_START); 425 return start_transaction(root, num_items, TRANS_START, 0);
426}
427
428struct btrfs_trans_handle *btrfs_start_transaction_noflush(
429 struct btrfs_root *root, int num_items)
430{
431 return start_transaction(root, num_items, TRANS_START, 1);
397} 432}
433
398struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
399{ 435{
400 return start_transaction(root, 0, TRANS_JOIN); 436 return start_transaction(root, 0, TRANS_JOIN, 0);
401} 437}
402 438
403struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) 439struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
404{ 440{
405 return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 441 return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
406} 442}
407 443
408struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) 444struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
409{ 445{
410 return start_transaction(root, 0, TRANS_USERSPACE); 446 return start_transaction(root, 0, TRANS_USERSPACE, 0);
447}
448
449struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
450{
451 return start_transaction(root, 0, TRANS_ATTACH, 0);
411} 452}
412 453
413/* wait for a transaction commit to be fully complete */ 454/* wait for a transaction commit to be fully complete */
@@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
506} 547}
507 548
508static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 549static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
509 struct btrfs_root *root, int throttle, int lock) 550 struct btrfs_root *root, int throttle)
510{ 551{
511 struct btrfs_transaction *cur_trans = trans->transaction; 552 struct btrfs_transaction *cur_trans = trans->transaction;
512 struct btrfs_fs_info *info = root->fs_info; 553 struct btrfs_fs_info *info = root->fs_info;
513 int count = 0; 554 int count = 0;
555 int lock = (trans->type != TRANS_JOIN_NOLOCK);
514 int err = 0; 556 int err = 0;
515 557
516 if (--trans->use_count) { 558 if (--trans->use_count) {
@@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
536 trans->qgroup_reserved = 0; 578 trans->qgroup_reserved = 0;
537 } 579 }
538 580
581 if (!list_empty(&trans->new_bgs))
582 btrfs_create_pending_block_groups(trans, root);
583
539 while (count < 2) { 584 while (count < 2) {
540 unsigned long cur = trans->delayed_ref_updates; 585 unsigned long cur = trans->delayed_ref_updates;
541 trans->delayed_ref_updates = 0; 586 trans->delayed_ref_updates = 0;
@@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
551 btrfs_trans_release_metadata(trans, root); 596 btrfs_trans_release_metadata(trans, root);
552 trans->block_rsv = NULL; 597 trans->block_rsv = NULL;
553 598
554 sb_end_intwrite(root->fs_info->sb); 599 if (!list_empty(&trans->new_bgs))
600 btrfs_create_pending_block_groups(trans, root);
555 601
556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 602 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
557 should_end_transaction(trans, root)) { 603 should_end_transaction(trans, root)) {
@@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
573 } 619 }
574 } 620 }
575 621
622 if (trans->type < TRANS_JOIN_NOLOCK)
623 sb_end_intwrite(root->fs_info->sb);
624
576 WARN_ON(cur_trans != info->running_transaction); 625 WARN_ON(cur_trans != info->running_transaction);
577 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 626 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
578 atomic_dec(&cur_trans->num_writers); 627 atomic_dec(&cur_trans->num_writers);
@@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
604{ 653{
605 int ret; 654 int ret;
606 655
607 ret = __btrfs_end_transaction(trans, root, 0, 1); 656 ret = __btrfs_end_transaction(trans, root, 0);
608 if (ret) 657 if (ret)
609 return ret; 658 return ret;
610 return 0; 659 return 0;
@@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
615{ 664{
616 int ret; 665 int ret;
617 666
618 ret = __btrfs_end_transaction(trans, root, 1, 1); 667 ret = __btrfs_end_transaction(trans, root, 1);
619 if (ret)
620 return ret;
621 return 0;
622}
623
624int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root)
626{
627 int ret;
628
629 ret = __btrfs_end_transaction(trans, root, 0, 0);
630 if (ret) 668 if (ret)
631 return ret; 669 return ret;
632 return 0; 670 return 0;
@@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
635int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, 673int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
636 struct btrfs_root *root) 674 struct btrfs_root *root)
637{ 675{
638 return __btrfs_end_transaction(trans, root, 1, 1); 676 return __btrfs_end_transaction(trans, root, 1);
639} 677}
640 678
641/* 679/*
@@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
649 int err = 0; 687 int err = 0;
650 int werr = 0; 688 int werr = 0;
651 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 689 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
690 struct extent_state *cached_state = NULL;
652 u64 start = 0; 691 u64 start = 0;
653 u64 end; 692 u64 end;
654 693
655 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 694 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
656 mark)) { 695 mark, &cached_state)) {
657 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, 696 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
658 GFP_NOFS); 697 mark, &cached_state, GFP_NOFS);
698 cached_state = NULL;
659 err = filemap_fdatawrite_range(mapping, start, end); 699 err = filemap_fdatawrite_range(mapping, start, end);
660 if (err) 700 if (err)
661 werr = err; 701 werr = err;
@@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
679 int err = 0; 719 int err = 0;
680 int werr = 0; 720 int werr = 0;
681 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 721 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
722 struct extent_state *cached_state = NULL;
682 u64 start = 0; 723 u64 start = 0;
683 u64 end; 724 u64 end;
684 725
685 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 726 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
686 EXTENT_NEED_WAIT)) { 727 EXTENT_NEED_WAIT, &cached_state)) {
687 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); 728 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
729 0, 0, &cached_state, GFP_NOFS);
688 err = filemap_fdatawait_range(mapping, start, end); 730 err = filemap_fdatawait_range(mapping, start, end);
689 if (err) 731 if (err)
690 werr = err; 732 werr = err;
@@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
955 struct btrfs_root *parent_root; 997 struct btrfs_root *parent_root;
956 struct btrfs_block_rsv *rsv; 998 struct btrfs_block_rsv *rsv;
957 struct inode *parent_inode; 999 struct inode *parent_inode;
1000 struct btrfs_path *path;
1001 struct btrfs_dir_item *dir_item;
958 struct dentry *parent; 1002 struct dentry *parent;
959 struct dentry *dentry; 1003 struct dentry *dentry;
960 struct extent_buffer *tmp; 1004 struct extent_buffer *tmp;
@@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 u64 root_flags; 1011 u64 root_flags;
968 uuid_le new_uuid; 1012 uuid_le new_uuid;
969 1013
970 rsv = trans->block_rsv; 1014 path = btrfs_alloc_path();
1015 if (!path) {
1016 ret = pending->error = -ENOMEM;
1017 goto path_alloc_fail;
1018 }
971 1019
972 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1020 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
973 if (!new_root_item) { 1021 if (!new_root_item) {
974 ret = pending->error = -ENOMEM; 1022 ret = pending->error = -ENOMEM;
975 goto fail; 1023 goto root_item_alloc_fail;
976 } 1024 }
977 1025
978 ret = btrfs_find_free_objectid(tree_root, &objectid); 1026 ret = btrfs_find_free_objectid(tree_root, &objectid);
979 if (ret) { 1027 if (ret) {
980 pending->error = ret; 1028 pending->error = ret;
981 goto fail; 1029 goto no_free_objectid;
982 } 1030 }
983 1031
984 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
988 to_reserve); 1036 to_reserve);
989 if (ret) { 1037 if (ret) {
990 pending->error = ret; 1038 pending->error = ret;
991 goto fail; 1039 goto no_free_objectid;
992 } 1040 }
993 } 1041 }
994 1042
995 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, 1043 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
996 objectid, pending->inherit); 1044 objectid, pending->inherit);
997 kfree(pending->inherit);
998 if (ret) { 1045 if (ret) {
999 pending->error = ret; 1046 pending->error = ret;
1000 goto fail; 1047 goto no_free_objectid;
1001 } 1048 }
1002 1049
1003 key.objectid = objectid; 1050 key.objectid = objectid;
1004 key.offset = (u64)-1; 1051 key.offset = (u64)-1;
1005 key.type = BTRFS_ROOT_ITEM_KEY; 1052 key.type = BTRFS_ROOT_ITEM_KEY;
1006 1053
1054 rsv = trans->block_rsv;
1007 trans->block_rsv = &pending->block_rsv; 1055 trans->block_rsv = &pending->block_rsv;
1008 1056
1009 dentry = pending->dentry; 1057 dentry = pending->dentry;
@@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1017 */ 1065 */
1018 ret = btrfs_set_inode_index(parent_inode, &index); 1066 ret = btrfs_set_inode_index(parent_inode, &index);
1019 BUG_ON(ret); /* -ENOMEM */ 1067 BUG_ON(ret); /* -ENOMEM */
1020 ret = btrfs_insert_dir_item(trans, parent_root, 1068
1021 dentry->d_name.name, dentry->d_name.len, 1069 /* check if there is a file/dir which has the same name. */
1022 parent_inode, &key, 1070 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
1023 BTRFS_FT_DIR, index); 1071 btrfs_ino(parent_inode),
1024 if (ret == -EEXIST) { 1072 dentry->d_name.name,
1073 dentry->d_name.len, 0);
1074 if (dir_item != NULL && !IS_ERR(dir_item)) {
1025 pending->error = -EEXIST; 1075 pending->error = -EEXIST;
1026 dput(parent);
1027 goto fail; 1076 goto fail;
1028 } else if (ret) { 1077 } else if (IS_ERR(dir_item)) {
1029 goto abort_trans_dput; 1078 ret = PTR_ERR(dir_item);
1079 btrfs_abort_transaction(trans, root, ret);
1080 goto fail;
1030 } 1081 }
1031 1082 btrfs_release_path(path);
1032 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1033 dentry->d_name.len * 2);
1034 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1035 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1036 if (ret)
1037 goto abort_trans_dput;
1038 1083
1039 /* 1084 /*
1040 * pull in the delayed directory update 1085 * pull in the delayed directory update
@@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1043 * snapshot 1088 * snapshot
1044 */ 1089 */
1045 ret = btrfs_run_delayed_items(trans, root); 1090 ret = btrfs_run_delayed_items(trans, root);
1046 if (ret) { /* Transaction aborted */ 1091 if (ret) { /* Transaction aborted */
1047 dput(parent); 1092 btrfs_abort_transaction(trans, root, ret);
1048 goto fail; 1093 goto fail;
1049 } 1094 }
1050 1095
@@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1079 if (ret) { 1124 if (ret) {
1080 btrfs_tree_unlock(old); 1125 btrfs_tree_unlock(old);
1081 free_extent_buffer(old); 1126 free_extent_buffer(old);
1082 goto abort_trans_dput; 1127 btrfs_abort_transaction(trans, root, ret);
1128 goto fail;
1083 } 1129 }
1084 1130
1085 btrfs_set_lock_blocking(old); 1131 btrfs_set_lock_blocking(old);
@@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1088 /* clean up in any case */ 1134 /* clean up in any case */
1089 btrfs_tree_unlock(old); 1135 btrfs_tree_unlock(old);
1090 free_extent_buffer(old); 1136 free_extent_buffer(old);
1091 if (ret) 1137 if (ret) {
1092 goto abort_trans_dput; 1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140 }
1093 1141
1094 /* see comments in should_cow_block() */ 1142 /* see comments in should_cow_block() */
1095 root->force_cow = 1; 1143 root->force_cow = 1;
@@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1101 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1149 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1102 btrfs_tree_unlock(tmp); 1150 btrfs_tree_unlock(tmp);
1103 free_extent_buffer(tmp); 1151 free_extent_buffer(tmp);
1104 if (ret) 1152 if (ret) {
1105 goto abort_trans_dput; 1153 btrfs_abort_transaction(trans, root, ret);
1154 goto fail;
1155 }
1106 1156
1107 /* 1157 /*
1108 * insert root back/forward references 1158 * insert root back/forward references
@@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1111 parent_root->root_key.objectid, 1161 parent_root->root_key.objectid,
1112 btrfs_ino(parent_inode), index, 1162 btrfs_ino(parent_inode), index,
1113 dentry->d_name.name, dentry->d_name.len); 1163 dentry->d_name.name, dentry->d_name.len);
1114 dput(parent); 1164 if (ret) {
1115 if (ret) 1165 btrfs_abort_transaction(trans, root, ret);
1116 goto fail; 1166 goto fail;
1167 }
1117 1168
1118 key.offset = (u64)-1; 1169 key.offset = (u64)-1;
1119 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1170 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1120 if (IS_ERR(pending->snap)) { 1171 if (IS_ERR(pending->snap)) {
1121 ret = PTR_ERR(pending->snap); 1172 ret = PTR_ERR(pending->snap);
1122 goto abort_trans; 1173 btrfs_abort_transaction(trans, root, ret);
1174 goto fail;
1123 } 1175 }
1124 1176
1125 ret = btrfs_reloc_post_snapshot(trans, pending); 1177 ret = btrfs_reloc_post_snapshot(trans, pending);
1178 if (ret) {
1179 btrfs_abort_transaction(trans, root, ret);
1180 goto fail;
1181 }
1182
1183 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1184 if (ret) {
1185 btrfs_abort_transaction(trans, root, ret);
1186 goto fail;
1187 }
1188
1189 ret = btrfs_insert_dir_item(trans, parent_root,
1190 dentry->d_name.name, dentry->d_name.len,
1191 parent_inode, &key,
1192 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so -EEXIST is impossible. */
1194 BUG_ON(ret == -EEXIST);
1195 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret);
1197 goto fail;
1198 }
1199
1200 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1201 dentry->d_name.len * 2);
1202 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1203 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1126 if (ret) 1204 if (ret)
1127 goto abort_trans; 1205 btrfs_abort_transaction(trans, root, ret);
1128 ret = 0;
1129fail: 1206fail:
1130 kfree(new_root_item); 1207 dput(parent);
1131 trans->block_rsv = rsv; 1208 trans->block_rsv = rsv;
1209no_free_objectid:
1210 kfree(new_root_item);
1211root_item_alloc_fail:
1212 btrfs_free_path(path);
1213path_alloc_fail:
1132 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1214 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1133 return ret; 1215 return ret;
1134
1135abort_trans_dput:
1136 dput(parent);
1137abort_trans:
1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140} 1216}
1141 1217
1142/* 1218/*
@@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)
1229 struct btrfs_async_commit *ac = 1305 struct btrfs_async_commit *ac =
1230 container_of(work, struct btrfs_async_commit, work.work); 1306 container_of(work, struct btrfs_async_commit, work.work);
1231 1307
1308 /*
1309 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it.
1311 */
1312 rwsem_acquire_read(
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315
1316 current->journal_info = ac->newtrans;
1317
1232 btrfs_commit_transaction(ac->newtrans, ac->root); 1318 btrfs_commit_transaction(ac->newtrans, ac->root);
1233 kfree(ac); 1319 kfree(ac);
1234} 1320}
@@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1258 atomic_inc(&cur_trans->use_count); 1344 atomic_inc(&cur_trans->use_count);
1259 1345
1260 btrfs_end_transaction(trans, root); 1346 btrfs_end_transaction(trans, root);
1347
1348 /*
1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it.
1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1353 1, _THIS_IP_);
1354
1261 schedule_delayed_work(&ac->work, 0); 1355 schedule_delayed_work(&ac->work, 0);
1262 1356
1263 /* wait for transaction to start and unblock */ 1357 /* wait for transaction to start and unblock */
@@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1348 */ 1442 */
1349 cur_trans->delayed_refs.flushing = 1; 1443 cur_trans->delayed_refs.flushing = 1;
1350 1444
1445 if (!list_empty(&trans->new_bgs))
1446 btrfs_create_pending_block_groups(trans, root);
1447
1351 ret = btrfs_run_delayed_refs(trans, root, 0); 1448 ret = btrfs_run_delayed_refs(trans, root, 0);
1352 if (ret) 1449 if (ret)
1353 goto cleanup_transaction; 1450 goto cleanup_transaction;
@@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1403 1500
1404 if (flush_on_commit || snap_pending) { 1501 if (flush_on_commit || snap_pending) {
1405 btrfs_start_delalloc_inodes(root, 1); 1502 btrfs_start_delalloc_inodes(root, 1);
1406 btrfs_wait_ordered_extents(root, 0, 1); 1503 btrfs_wait_ordered_extents(root, 1);
1407 } 1504 }
1408 1505
1409 ret = btrfs_run_delayed_items(trans, root); 1506 ret = btrfs_run_delayed_items(trans, root);
@@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1456 */ 1553 */
1457 mutex_lock(&root->fs_info->reloc_mutex); 1554 mutex_lock(&root->fs_info->reloc_mutex);
1458 1555
1459 ret = btrfs_run_delayed_items(trans, root); 1556 /*
1557 * We needn't worry about the delayed items because we will
1558 * deal with them in create_pending_snapshot(), which is the
1559 * core function of the snapshot creation.
1560 */
1561 ret = create_pending_snapshots(trans, root->fs_info);
1460 if (ret) { 1562 if (ret) {
1461 mutex_unlock(&root->fs_info->reloc_mutex); 1563 mutex_unlock(&root->fs_info->reloc_mutex);
1462 goto cleanup_transaction; 1564 goto cleanup_transaction;
1463 } 1565 }
1464 1566
1465 ret = create_pending_snapshots(trans, root->fs_info); 1567 /*
1568 * We insert the dir indexes of the snapshots and update the inode
1569 * of the snapshots' parents after the snapshot creation, so there
1570 * are some delayed items which are not dealt with. Now deal with
1571 * them.
1572 *
1573 * We needn't worry that this operation will corrupt the snapshots,
1574 * because all the tree which are snapshoted will be forced to COW
1575 * the nodes and leaves.
1576 */
1577 ret = btrfs_run_delayed_items(trans, root);
1466 if (ret) { 1578 if (ret) {
1467 mutex_unlock(&root->fs_info->reloc_mutex); 1579 mutex_unlock(&root->fs_info->reloc_mutex);
1468 goto cleanup_transaction; 1580 goto cleanup_transaction;
@@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1584 put_transaction(cur_trans); 1696 put_transaction(cur_trans);
1585 put_transaction(cur_trans); 1697 put_transaction(cur_trans);
1586 1698
1587 sb_end_intwrite(root->fs_info->sb); 1699 if (trans->type < TRANS_JOIN_NOLOCK)
1700 sb_end_intwrite(root->fs_info->sb);
1588 1701
1589 trace_btrfs_transaction_commit(root); 1702 trace_btrfs_transaction_commit(root);
1590 1703
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688b..80961947a6b2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
47 int aborted; 47 int aborted;
48}; 48};
49 49
50enum btrfs_trans_type {
51 TRANS_START,
52 TRANS_JOIN,
53 TRANS_USERSPACE,
54 TRANS_JOIN_NOLOCK,
55 TRANS_ATTACH,
56};
57
50struct btrfs_trans_handle { 58struct btrfs_trans_handle {
51 u64 transid; 59 u64 transid;
52 u64 bytes_reserved; 60 u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
58 struct btrfs_transaction *transaction; 66 struct btrfs_transaction *transaction;
59 struct btrfs_block_rsv *block_rsv; 67 struct btrfs_block_rsv *block_rsv;
60 struct btrfs_block_rsv *orig_rsv; 68 struct btrfs_block_rsv *orig_rsv;
61 int aborted; 69 short aborted;
62 int adding_csums; 70 short adding_csums;
71 enum btrfs_trans_type type;
63 /* 72 /*
64 * this root is only needed to validate that the root passed to 73 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction. 74 * start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
68 struct btrfs_root *root; 77 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem; 78 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list; 79 struct list_head qgroup_ref_list;
80 struct list_head new_bgs;
71}; 81};
72 82
73struct btrfs_pending_snapshot { 83struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
88{ 98{
89 BTRFS_I(inode)->last_trans = trans->transaction->transid; 99 BTRFS_I(inode)->last_trans = trans->transaction->transid;
90 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 100 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
101 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
91} 102}
92 103
93int btrfs_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_end_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 105 struct btrfs_root *root);
95int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root);
97struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
98 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush(
109 struct btrfs_root *root, int num_items);
99struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
100struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
101struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
102int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
103int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..e9ebb472b28b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,16 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
24#include "locking.h" 25#include "locking.h"
25#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h"
26#include "compat.h" 28#include "compat.h"
27#include "tree-log.h" 29#include "tree-log.h"
30#include "hash.h"
28 31
29/* magic values for the inode_only field in btrfs_log_inode: 32/* magic values for the inode_only field in btrfs_log_inode:
30 * 33 *
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
146 root->log_multiple_pids = true; 149 root->log_multiple_pids = true;
147 } 150 }
148 151
149 root->log_batch++; 152 atomic_inc(&root->log_batch);
150 atomic_inc(&root->log_writers); 153 atomic_inc(&root->log_writers);
151 mutex_unlock(&root->log_mutex); 154 mutex_unlock(&root->log_mutex);
152 return 0; 155 return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
165 err = ret; 168 err = ret;
166 } 169 }
167 mutex_unlock(&root->fs_info->tree_log_mutex); 170 mutex_unlock(&root->fs_info->tree_log_mutex);
168 root->log_batch++; 171 atomic_inc(&root->log_batch);
169 atomic_inc(&root->log_writers); 172 atomic_inc(&root->log_writers);
170 mutex_unlock(&root->log_mutex); 173 mutex_unlock(&root->log_mutex);
171 return err; 174 return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
484 int found_type; 487 int found_type;
485 u64 mask = root->sectorsize - 1; 488 u64 mask = root->sectorsize - 1;
486 u64 extent_end; 489 u64 extent_end;
487 u64 alloc_hint;
488 u64 start = key->offset; 490 u64 start = key->offset;
489 u64 saved_nbytes; 491 u64 saved_nbytes;
490 struct btrfs_file_extent_item *item; 492 struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 552
551 saved_nbytes = inode_get_bytes(inode); 553 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 554 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 555 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
554 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
557 if (found_type == BTRFS_FILE_EXTENT_REG || 558 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
744 */ 745 */
745static noinline int backref_in_log(struct btrfs_root *log, 746static noinline int backref_in_log(struct btrfs_root *log,
746 struct btrfs_key *key, 747 struct btrfs_key *key,
748 u64 ref_objectid,
747 char *name, int namelen) 749 char *name, int namelen)
748{ 750{
749 struct btrfs_path *path; 751 struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
764 if (ret != 0) 766 if (ret != 0)
765 goto out; 767 goto out;
766 768
767 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
768 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 769 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
770
771 if (key->type == BTRFS_INODE_EXTREF_KEY) {
772 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
773 name, namelen, NULL))
774 match = 1;
775
776 goto out;
777 }
778
779 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
769 ptr_end = ptr + item_size; 780 ptr_end = ptr + item_size;
770 while (ptr < ptr_end) { 781 while (ptr < ptr_end) {
771 ref = (struct btrfs_inode_ref *)ptr; 782 ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
786 return match; 797 return match;
787} 798}
788 799
789 800static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
790/*
791 * replay one inode back reference item found in the log tree.
792 * eb, slot and key refer to the buffer and key found in the log tree.
793 * root is the destination we are replaying into, and path is for temp
794 * use by this function. (it should be released on return).
795 */
796static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 struct btrfs_root *root, 801 struct btrfs_root *root,
798 struct btrfs_root *log,
799 struct btrfs_path *path, 802 struct btrfs_path *path,
800 struct extent_buffer *eb, int slot, 803 struct btrfs_root *log_root,
801 struct btrfs_key *key) 804 struct inode *dir, struct inode *inode,
805 struct extent_buffer *eb,
806 u64 inode_objectid, u64 parent_objectid,
807 u64 ref_index, char *name, int namelen,
808 int *search_done)
802{ 809{
803 struct btrfs_inode_ref *ref;
804 struct btrfs_dir_item *di;
805 struct inode *dir;
806 struct inode *inode;
807 unsigned long ref_ptr;
808 unsigned long ref_end;
809 char *name;
810 int namelen;
811 int ret; 810 int ret;
812 int search_done = 0; 811 char *victim_name;
813 812 int victim_name_len;
814 /* 813 struct extent_buffer *leaf;
815 * it is possible that we didn't log all the parent directories 814 struct btrfs_dir_item *di;
816 * for a given inode. If we don't find the dir, just don't 815 struct btrfs_key search_key;
817 * copy the back ref in. The link count fixup code will take 816 struct btrfs_inode_extref *extref;
818 * care of the rest
819 */
820 dir = read_one_inode(root, key->offset);
821 if (!dir)
822 return -ENOENT;
823
824 inode = read_one_inode(root, key->objectid);
825 if (!inode) {
826 iput(dir);
827 return -EIO;
828 }
829
830 ref_ptr = btrfs_item_ptr_offset(eb, slot);
831 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
832 817
833again: 818again:
834 ref = (struct btrfs_inode_ref *)ref_ptr; 819 /* Search old style refs */
835 820 search_key.objectid = inode_objectid;
836 namelen = btrfs_inode_ref_name_len(eb, ref); 821 search_key.type = BTRFS_INODE_REF_KEY;
837 name = kmalloc(namelen, GFP_NOFS); 822 search_key.offset = parent_objectid;
838 BUG_ON(!name); 823 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
839
840 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
841
842 /* if we already have a perfect match, we're done */
843 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
844 btrfs_inode_ref_index(eb, ref),
845 name, namelen)) {
846 goto out;
847 }
848
849 /*
850 * look for a conflicting back reference in the metadata.
851 * if we find one we have to unlink that name of the file
852 * before we add our new link. Later on, we overwrite any
853 * existing back reference, and we don't want to create
854 * dangling pointers in the directory.
855 */
856
857 if (search_done)
858 goto insert;
859
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) { 824 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref; 825 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr; 826 unsigned long ptr;
866 unsigned long ptr_end; 827 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0]; 828
829 leaf = path->nodes[0];
868 830
869 /* are we trying to overwrite a back ref for the root directory 831 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done 832 * if so, just jump out, we're done
871 */ 833 */
872 if (key->objectid == key->offset) 834 if (search_key.objectid == search_key.offset)
873 goto out_nowrite; 835 return 1;
874 836
875 /* check all the names in this back reference to see 837 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay 838 * if they are in the log. if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
889 (unsigned long)(victim_ref + 1), 851 (unsigned long)(victim_ref + 1),
890 victim_name_len); 852 victim_name_len);
891 853
892 if (!backref_in_log(log, key, victim_name, 854 if (!backref_in_log(log_root, &search_key,
855 parent_objectid,
856 victim_name,
893 victim_name_len)) { 857 victim_name_len)) {
894 btrfs_inc_nlink(inode); 858 btrfs_inc_nlink(inode);
895 btrfs_release_path(path); 859 btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
897 ret = btrfs_unlink_inode(trans, root, dir, 861 ret = btrfs_unlink_inode(trans, root, dir,
898 inode, victim_name, 862 inode, victim_name,
899 victim_name_len); 863 victim_name_len);
864 BUG_ON(ret);
900 btrfs_run_delayed_items(trans, root); 865 btrfs_run_delayed_items(trans, root);
866 kfree(victim_name);
867 *search_done = 1;
868 goto again;
901 } 869 }
902 kfree(victim_name); 870 kfree(victim_name);
871
903 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 872 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
904 } 873 }
905 BUG_ON(ret); 874 BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
908 * NOTE: we have searched root tree and checked the 877 * NOTE: we have searched root tree and checked the
909 * coresponding ref, it does not need to check again. 878 * coresponding ref, it does not need to check again.
910 */ 879 */
911 search_done = 1; 880 *search_done = 1;
881 }
882 btrfs_release_path(path);
883
884 /* Same search but for extended refs */
885 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
886 inode_objectid, parent_objectid, 0,
887 0);
888 if (!IS_ERR_OR_NULL(extref)) {
889 u32 item_size;
890 u32 cur_offset = 0;
891 unsigned long base;
892 struct inode *victim_parent;
893
894 leaf = path->nodes[0];
895
896 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
897 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
898
899 while (cur_offset < item_size) {
900 extref = (struct btrfs_inode_extref *)base + cur_offset;
901
902 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
903
904 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
905 goto next;
906
907 victim_name = kmalloc(victim_name_len, GFP_NOFS);
908 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
909 victim_name_len);
910
911 search_key.objectid = inode_objectid;
912 search_key.type = BTRFS_INODE_EXTREF_KEY;
913 search_key.offset = btrfs_extref_hash(parent_objectid,
914 victim_name,
915 victim_name_len);
916 ret = 0;
917 if (!backref_in_log(log_root, &search_key,
918 parent_objectid, victim_name,
919 victim_name_len)) {
920 ret = -ENOENT;
921 victim_parent = read_one_inode(root,
922 parent_objectid);
923 if (victim_parent) {
924 btrfs_inc_nlink(inode);
925 btrfs_release_path(path);
926
927 ret = btrfs_unlink_inode(trans, root,
928 victim_parent,
929 inode,
930 victim_name,
931 victim_name_len);
932 btrfs_run_delayed_items(trans, root);
933 }
934 BUG_ON(ret);
935 iput(victim_parent);
936 kfree(victim_name);
937 *search_done = 1;
938 goto again;
939 }
940 kfree(victim_name);
941 BUG_ON(ret);
942next:
943 cur_offset += victim_name_len + sizeof(*extref);
944 }
945 *search_done = 1;
912 } 946 }
913 btrfs_release_path(path); 947 btrfs_release_path(path);
914 948
915 /* look for a conflicting sequence number */ 949 /* look for a conflicting sequence number */
916 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 950 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
917 btrfs_inode_ref_index(eb, ref), 951 ref_index, name, namelen, 0);
918 name, namelen, 0);
919 if (di && !IS_ERR(di)) { 952 if (di && !IS_ERR(di)) {
920 ret = drop_one_dir_item(trans, root, path, dir, di); 953 ret = drop_one_dir_item(trans, root, path, dir, di);
921 BUG_ON(ret); 954 BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
931 } 964 }
932 btrfs_release_path(path); 965 btrfs_release_path(path);
933 966
934insert: 967 return 0;
935 /* insert our name */ 968}
936 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
937 btrfs_inode_ref_index(eb, ref));
938 BUG_ON(ret);
939 969
940 btrfs_update_inode(trans, root, inode); 970static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
971 u32 *namelen, char **name, u64 *index,
972 u64 *parent_objectid)
973{
974 struct btrfs_inode_extref *extref;
941 975
942out: 976 extref = (struct btrfs_inode_extref *)ref_ptr;
943 ref_ptr = (unsigned long)(ref + 1) + namelen; 977
944 kfree(name); 978 *namelen = btrfs_inode_extref_name_len(eb, extref);
945 if (ref_ptr < ref_end) 979 *name = kmalloc(*namelen, GFP_NOFS);
946 goto again; 980 if (*name == NULL)
981 return -ENOMEM;
982
983 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
984 *namelen);
985
986 *index = btrfs_inode_extref_index(eb, extref);
987 if (parent_objectid)
988 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
989
990 return 0;
991}
992
993static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
994 u32 *namelen, char **name, u64 *index)
995{
996 struct btrfs_inode_ref *ref;
997
998 ref = (struct btrfs_inode_ref *)ref_ptr;
999
1000 *namelen = btrfs_inode_ref_name_len(eb, ref);
1001 *name = kmalloc(*namelen, GFP_NOFS);
1002 if (*name == NULL)
1003 return -ENOMEM;
1004
1005 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1006
1007 *index = btrfs_inode_ref_index(eb, ref);
1008
1009 return 0;
1010}
1011
1012/*
1013 * replay one inode back reference item found in the log tree.
1014 * eb, slot and key refer to the buffer and key found in the log tree.
1015 * root is the destination we are replaying into, and path is for temp
1016 * use by this function. (it should be released on return).
1017 */
1018static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root,
1020 struct btrfs_root *log,
1021 struct btrfs_path *path,
1022 struct extent_buffer *eb, int slot,
1023 struct btrfs_key *key)
1024{
1025 struct inode *dir;
1026 struct inode *inode;
1027 unsigned long ref_ptr;
1028 unsigned long ref_end;
1029 char *name;
1030 int namelen;
1031 int ret;
1032 int search_done = 0;
1033 int log_ref_ver = 0;
1034 u64 parent_objectid;
1035 u64 inode_objectid;
1036 u64 ref_index = 0;
1037 int ref_struct_size;
1038
1039 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1040 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1041
1042 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1043 struct btrfs_inode_extref *r;
1044
1045 ref_struct_size = sizeof(struct btrfs_inode_extref);
1046 log_ref_ver = 1;
1047 r = (struct btrfs_inode_extref *)ref_ptr;
1048 parent_objectid = btrfs_inode_extref_parent(eb, r);
1049 } else {
1050 ref_struct_size = sizeof(struct btrfs_inode_ref);
1051 parent_objectid = key->offset;
1052 }
1053 inode_objectid = key->objectid;
1054
1055 /*
1056 * it is possible that we didn't log all the parent directories
1057 * for a given inode. If we don't find the dir, just don't
1058 * copy the back ref in. The link count fixup code will take
1059 * care of the rest
1060 */
1061 dir = read_one_inode(root, parent_objectid);
1062 if (!dir)
1063 return -ENOENT;
1064
1065 inode = read_one_inode(root, inode_objectid);
1066 if (!inode) {
1067 iput(dir);
1068 return -EIO;
1069 }
1070
1071 while (ref_ptr < ref_end) {
1072 if (log_ref_ver) {
1073 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1074 &ref_index, &parent_objectid);
1075 /*
1076 * parent object can change from one array
1077 * item to another.
1078 */
1079 if (!dir)
1080 dir = read_one_inode(root, parent_objectid);
1081 if (!dir)
1082 return -ENOENT;
1083 } else {
1084 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1085 &ref_index);
1086 }
1087 if (ret)
1088 return ret;
1089
1090 /* if we already have a perfect match, we're done */
1091 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1092 ref_index, name, namelen)) {
1093 /*
1094 * look for a conflicting back reference in the
1095 * metadata. if we find one we have to unlink that name
1096 * of the file before we add our new link. Later on, we
1097 * overwrite any existing back reference, and we don't
1098 * want to create dangling pointers in the directory.
1099 */
1100
1101 if (!search_done) {
1102 ret = __add_inode_ref(trans, root, path, log,
1103 dir, inode, eb,
1104 inode_objectid,
1105 parent_objectid,
1106 ref_index, name, namelen,
1107 &search_done);
1108 if (ret == 1)
1109 goto out;
1110 BUG_ON(ret);
1111 }
1112
1113 /* insert our name */
1114 ret = btrfs_add_link(trans, dir, inode, name, namelen,
1115 0, ref_index);
1116 BUG_ON(ret);
1117
1118 btrfs_update_inode(trans, root, inode);
1119 }
1120
1121 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1122 kfree(name);
1123 if (log_ref_ver) {
1124 iput(dir);
1125 dir = NULL;
1126 }
1127 }
947 1128
948 /* finally write the back reference in the inode */ 1129 /* finally write the back reference in the inode */
949 ret = overwrite_item(trans, root, path, eb, slot, key); 1130 ret = overwrite_item(trans, root, path, eb, slot, key);
950 BUG_ON(ret); 1131 BUG_ON(ret);
951 1132
952out_nowrite: 1133out:
953 btrfs_release_path(path); 1134 btrfs_release_path(path);
954 iput(dir); 1135 iput(dir);
955 iput(inode); 1136 iput(inode);
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
966 return ret; 1147 return ret;
967} 1148}
968 1149
1150static int count_inode_extrefs(struct btrfs_root *root,
1151 struct inode *inode, struct btrfs_path *path)
1152{
1153 int ret = 0;
1154 int name_len;
1155 unsigned int nlink = 0;
1156 u32 item_size;
1157 u32 cur_offset = 0;
1158 u64 inode_objectid = btrfs_ino(inode);
1159 u64 offset = 0;
1160 unsigned long ptr;
1161 struct btrfs_inode_extref *extref;
1162 struct extent_buffer *leaf;
969 1163
970/* 1164 while (1) {
971 * There are a few corners where the link count of the file can't 1165 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
972 * be properly maintained during replay. So, instead of adding 1166 &extref, &offset);
973 * lots of complexity to the log code, we just scan the backrefs 1167 if (ret)
974 * for any file that has been through replay. 1168 break;
975 * 1169
976 * The scan will update the link count on the inode to reflect the 1170 leaf = path->nodes[0];
977 * number of back refs found. If it goes down to zero, the iput 1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
978 * will free the inode. 1172 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
979 */ 1173
980static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1174 while (cur_offset < item_size) {
981 struct btrfs_root *root, 1175 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
982 struct inode *inode) 1176 name_len = btrfs_inode_extref_name_len(leaf, extref);
1177
1178 nlink++;
1179
1180 cur_offset += name_len + sizeof(*extref);
1181 }
1182
1183 offset++;
1184 btrfs_release_path(path);
1185 }
1186 btrfs_release_path(path);
1187
1188 if (ret < 0)
1189 return ret;
1190 return nlink;
1191}
1192
1193static int count_inode_refs(struct btrfs_root *root,
1194 struct inode *inode, struct btrfs_path *path)
983{ 1195{
984 struct btrfs_path *path;
985 int ret; 1196 int ret;
986 struct btrfs_key key; 1197 struct btrfs_key key;
987 u64 nlink = 0; 1198 unsigned int nlink = 0;
988 unsigned long ptr; 1199 unsigned long ptr;
989 unsigned long ptr_end; 1200 unsigned long ptr_end;
990 int name_len; 1201 int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
994 key.type = BTRFS_INODE_REF_KEY; 1205 key.type = BTRFS_INODE_REF_KEY;
995 key.offset = (u64)-1; 1206 key.offset = (u64)-1;
996 1207
997 path = btrfs_alloc_path();
998 if (!path)
999 return -ENOMEM;
1000
1001 while (1) { 1208 while (1) {
1002 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1209 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003 if (ret < 0) 1210 if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1031 btrfs_release_path(path); 1238 btrfs_release_path(path);
1032 } 1239 }
1033 btrfs_release_path(path); 1240 btrfs_release_path(path);
1241
1242 return nlink;
1243}
1244
1245/*
1246 * There are a few corners where the link count of the file can't
1247 * be properly maintained during replay. So, instead of adding
1248 * lots of complexity to the log code, we just scan the backrefs
1249 * for any file that has been through replay.
1250 *
1251 * The scan will update the link count on the inode to reflect the
1252 * number of back refs found. If it goes down to zero, the iput
1253 * will free the inode.
1254 */
1255static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1256 struct btrfs_root *root,
1257 struct inode *inode)
1258{
1259 struct btrfs_path *path;
1260 int ret;
1261 u64 nlink = 0;
1262 u64 ino = btrfs_ino(inode);
1263
1264 path = btrfs_alloc_path();
1265 if (!path)
1266 return -ENOMEM;
1267
1268 ret = count_inode_refs(root, inode, path);
1269 if (ret < 0)
1270 goto out;
1271
1272 nlink = ret;
1273
1274 ret = count_inode_extrefs(root, inode, path);
1275 if (ret == -ENOENT)
1276 ret = 0;
1277
1278 if (ret < 0)
1279 goto out;
1280
1281 nlink += ret;
1282
1283 ret = 0;
1284
1034 if (nlink != inode->i_nlink) { 1285 if (nlink != inode->i_nlink) {
1035 set_nlink(inode, nlink); 1286 set_nlink(inode, nlink);
1036 btrfs_update_inode(trans, root, inode); 1287 btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1046 ret = insert_orphan_item(trans, root, ino); 1297 ret = insert_orphan_item(trans, root, ino);
1047 BUG_ON(ret); 1298 BUG_ON(ret);
1048 } 1299 }
1049 btrfs_free_path(path);
1050 1300
1051 return 0; 1301out:
1302 btrfs_free_path(path);
1303 return ret;
1052} 1304}
1053 1305
1054static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1306static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1695 ret = add_inode_ref(wc->trans, root, log, path, 1947 ret = add_inode_ref(wc->trans, root, log, path,
1696 eb, i, &key); 1948 eb, i, &key);
1697 BUG_ON(ret && ret != -ENOENT); 1949 BUG_ON(ret && ret != -ENOENT);
1950 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
1951 ret = add_inode_ref(wc->trans, root, log, path,
1952 eb, i, &key);
1953 BUG_ON(ret && ret != -ENOENT);
1698 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1954 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1699 ret = replay_one_extent(wc->trans, root, path, 1955 ret = replay_one_extent(wc->trans, root, path,
1700 eb, i, &key); 1956 eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2037 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2293 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2038 wait_log_commit(trans, root, root->log_transid - 1); 2294 wait_log_commit(trans, root, root->log_transid - 1);
2039 while (1) { 2295 while (1) {
2040 unsigned long batch = root->log_batch; 2296 int batch = atomic_read(&root->log_batch);
2041 /* when we're on an ssd, just kick the log commit out */ 2297 /* when we're on an ssd, just kick the log commit out */
2042 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2298 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2043 mutex_unlock(&root->log_mutex); 2299 mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 mutex_lock(&root->log_mutex); 2301 mutex_lock(&root->log_mutex);
2046 } 2302 }
2047 wait_for_writer(trans, root); 2303 wait_for_writer(trans, root);
2048 if (batch == root->log_batch) 2304 if (batch == atomic_read(&root->log_batch))
2049 break; 2305 break;
2050 } 2306 }
2051 2307
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2074 2330
2075 btrfs_set_root_node(&log->root_item, log->node); 2331 btrfs_set_root_node(&log->root_item, log->node);
2076 2332
2077 root->log_batch = 0;
2078 root->log_transid++; 2333 root->log_transid++;
2079 log->log_transid = root->log_transid; 2334 log->log_transid = root->log_transid;
2080 root->log_start_pid = 0; 2335 root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2087 mutex_unlock(&root->log_mutex); 2342 mutex_unlock(&root->log_mutex);
2088 2343
2089 mutex_lock(&log_root_tree->log_mutex); 2344 mutex_lock(&log_root_tree->log_mutex);
2090 log_root_tree->log_batch++; 2345 atomic_inc(&log_root_tree->log_batch);
2091 atomic_inc(&log_root_tree->log_writers); 2346 atomic_inc(&log_root_tree->log_writers);
2092 mutex_unlock(&log_root_tree->log_mutex); 2347 mutex_unlock(&log_root_tree->log_mutex);
2093 2348
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2157 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2412 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2158 btrfs_header_level(log_root_tree->node)); 2413 btrfs_header_level(log_root_tree->node));
2159 2414
2160 log_root_tree->log_batch = 0;
2161 log_root_tree->log_transid++; 2415 log_root_tree->log_transid++;
2162 smp_mb(); 2416 smp_mb();
2163 2417
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2171 * in and cause problems either. 2425 * in and cause problems either.
2172 */ 2426 */
2173 btrfs_scrub_pause_super(root); 2427 btrfs_scrub_pause_super(root);
2174 write_ctree_super(trans, root->fs_info->tree_root, 1); 2428 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2175 btrfs_scrub_continue_super(root); 2429 btrfs_scrub_continue_super(root);
2176 ret = 0; 2430 if (ret) {
2431 btrfs_abort_transaction(trans, root, ret);
2432 goto out_wake_log_root;
2433 }
2177 2434
2178 mutex_lock(&root->log_mutex); 2435 mutex_lock(&root->log_mutex);
2179 if (root->last_log_commit < log_transid) 2436 if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2209 2466
2210 while (1) { 2467 while (1) {
2211 ret = find_first_extent_bit(&log->dirty_log_pages, 2468 ret = find_first_extent_bit(&log->dirty_log_pages,
2212 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2469 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2470 NULL);
2213 if (ret) 2471 if (ret)
2214 break; 2472 break;
2215 2473
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2646 int ret; 2904 int ret;
2647 struct btrfs_key key; 2905 struct btrfs_key key;
2648 struct btrfs_key found_key; 2906 struct btrfs_key found_key;
2907 int start_slot;
2649 2908
2650 key.objectid = objectid; 2909 key.objectid = objectid;
2651 key.type = max_key_type; 2910 key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2667 if (found_key.objectid != objectid) 2926 if (found_key.objectid != objectid)
2668 break; 2927 break;
2669 2928
2670 ret = btrfs_del_item(trans, log, path); 2929 found_key.offset = 0;
2671 if (ret) 2930 found_key.type = 0;
2931 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
2932 &start_slot);
2933
2934 ret = btrfs_del_items(trans, log, path, start_slot,
2935 path->slots[0] - start_slot + 1);
2936 /*
2937 * If start slot isn't 0 then we don't need to re-search, we've
2938 * found the last guy with the objectid in this tree.
2939 */
2940 if (ret || start_slot != 0)
2672 break; 2941 break;
2673 btrfs_release_path(path); 2942 btrfs_release_path(path);
2674 } 2943 }
@@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2678 return ret; 2947 return ret;
2679} 2948}
2680 2949
/*
 * fill_inode_item - write the fields of the inode item @item, which lives in
 * the extent buffer @leaf, from the in-memory @inode.
 *
 * @trans:          transaction handle; trans->transid is stored as the
 *                  item's transid
 * @leaf:           extent buffer containing @item
 * @item:           inode item to fill in
 * @inode:          source of uid, gid, mode, nlink, atime/mtime/ctime,
 *                  byte count, i_version, rdev and the btrfs inode flags
 * @log_inode_only: when non-zero the item's generation and size are both
 *                  written as 0 instead of the inode's real values
 *
 * The block group field is always written as 0.  Nothing is returned.
 */
2950static void fill_inode_item(struct btrfs_trans_handle *trans,
2951 struct extent_buffer *leaf,
2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only)
2954{
2955 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2956 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
/* timestamps: seconds and nanoseconds are stored separately */
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982
2983 if (log_inode_only) {
2984 /* set the generation to zero so the recover code
2985 * can tell the difference between an logging
2986 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values'
2988 */
2989 btrfs_set_inode_generation(leaf, item, 0);
2990 btrfs_set_inode_size(leaf, item, 0);
2991 } else {
2992 btrfs_set_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation);
2994 btrfs_set_inode_size(leaf, item, inode->i_size);
2995 }
2996
2997}
2998
2681static noinline int copy_items(struct btrfs_trans_handle *trans, 2999static noinline int copy_items(struct btrfs_trans_handle *trans,
2682 struct btrfs_root *log, 3000 struct inode *inode,
2683 struct btrfs_path *dst_path, 3001 struct btrfs_path *dst_path,
2684 struct extent_buffer *src, 3002 struct extent_buffer *src,
2685 int start_slot, int nr, int inode_only) 3003 int start_slot, int nr, int inode_only)
2686{ 3004{
2687 unsigned long src_offset; 3005 unsigned long src_offset;
2688 unsigned long dst_offset; 3006 unsigned long dst_offset;
3007 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
2689 struct btrfs_file_extent_item *extent; 3008 struct btrfs_file_extent_item *extent;
2690 struct btrfs_inode_item *inode_item; 3009 struct btrfs_inode_item *inode_item;
2691 int ret; 3010 int ret;
@@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2694 char *ins_data; 3013 char *ins_data;
2695 int i; 3014 int i;
2696 struct list_head ordered_sums; 3015 struct list_head ordered_sums;
3016 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2697 3017
2698 INIT_LIST_HEAD(&ordered_sums); 3018 INIT_LIST_HEAD(&ordered_sums);
2699 3019
@@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2722 3042
2723 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3043 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2724 3044
2725 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3045 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2726 src_offset, ins_sizes[i]);
2727
2728 if (inode_only == LOG_INODE_EXISTS &&
2729 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2730 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3046 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2731 dst_path->slots[0], 3047 dst_path->slots[0],
2732 struct btrfs_inode_item); 3048 struct btrfs_inode_item);
2733 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 3049 fill_inode_item(trans, dst_path->nodes[0], inode_item,
2734 3050 inode, inode_only == LOG_INODE_EXISTS);
2735 /* set the generation to zero so the recover code 3051 } else {
2736 * can tell the difference between an logging 3052 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2737 * just to say 'this inode exists' and a logging 3053 src_offset, ins_sizes[i]);
2738 * to say 'update this inode with these values'
2739 */
2740 btrfs_set_inode_generation(dst_path->nodes[0],
2741 inode_item, 0);
2742 } 3054 }
3055
2743 /* take a reference on file data extents so that truncates 3056 /* take a reference on file data extents so that truncates
2744 * or deletes of this inode don't have to relog the inode 3057 * or deletes of this inode don't have to relog the inode
2745 * again 3058 * again
2746 */ 3059 */
2747 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 3060 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3061 !skip_csum) {
2748 int found_type; 3062 int found_type;
2749 extent = btrfs_item_ptr(src, start_slot + i, 3063 extent = btrfs_item_ptr(src, start_slot + i,
2750 struct btrfs_file_extent_item); 3064 struct btrfs_file_extent_item);
@@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2753 continue; 3067 continue;
2754 3068
2755 found_type = btrfs_file_extent_type(src, extent); 3069 found_type = btrfs_file_extent_type(src, extent);
2756 if (found_type == BTRFS_FILE_EXTENT_REG || 3070 if (found_type == BTRFS_FILE_EXTENT_REG) {
2757 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2758 u64 ds, dl, cs, cl; 3071 u64 ds, dl, cs, cl;
2759 ds = btrfs_file_extent_disk_bytenr(src, 3072 ds = btrfs_file_extent_disk_bytenr(src,
2760 extent); 3073 extent);
@@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 3116 return ret;
2804} 3117}
2805 3118
/*
 * extent_cmp - comparison callback handed to list_sort() to order extent
 * maps by ascending ->start.
 *
 * @priv: unused
 * @a:    list node embedded in the first extent_map (its ->list member)
 * @b:    list node embedded in the second extent_map (its ->list member)
 *
 * Returns -1 if @a starts before @b, 1 if it starts after, 0 if both have
 * the same start.
 */
3119static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3120{
3121 struct extent_map *em1, *em2;
3122
3123 em1 = list_entry(a, struct extent_map, list);
3124 em2 = list_entry(b, struct extent_map, list);
3125
3126 if (em1->start < em2->start)
3127 return -1;
3128 else if (em1->start > em2->start)
3129 return 1;
3130 return 0;
3131}
3132
/*
 * State carried across successive log_one_extent() calls so that runs of
 * adjacent leaf slots can be handed to copy_items() in a single call.
 */
3133struct log_args {
/* leaf that start_slot/nr refer to; passed to copy_items() as the source */
3134 struct extent_buffer *src;
/* file offset just past the last extent item visited (key offset + num_bytes) */
3135 u64 next_offset;
/* first slot in src of the batch that has not been copied yet */
3136 int start_slot;
/* number of consecutive slots in the pending batch; 0 means nothing pending */
3137 int nr;
3138};
3139
/*
 * log_one_extent - queue (and, where needed, copy) the file extent items of
 * @root that cover the modified range of @em, i.e. em->mod_start for
 * em->mod_len bytes.
 *
 * @trans:    transaction handle
 * @inode:    inode the extent belongs to
 * @root:     root that is searched for the extent items; its ->log_root is
 *            the log that existing extents are dropped from
 * @em:       extent map describing the modified range
 * @path:     path used for the read-only searches in @root
 * @dst_path: path handed to __btrfs_drop_extents() and copy_items()
 * @args:     batching state shared with the caller; on return it may still
 *            describe slots that have not been copied, which the caller is
 *            expected to flush with copy_items()
 *
 * Returns 0 on success, a negative value propagated from
 * __btrfs_drop_extents(), btrfs_search_slot() or copy_items(), or -ENOENT
 * when no extent item covering the start of the range can be found.
 */
3140static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path,
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{
3145 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi;
3147 struct btrfs_key key;
3148 u64 start = em->mod_start;
3149 u64 search_start = start;
3150 u64 len = em->mod_len;
3151 u64 num_bytes;
3152 int nritems;
3153 int ret;
3154
/*
 * The inode was already logged in this transaction, so drop whatever the
 * log holds for this range before re-adding it.
 */
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) {
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
3157 start + len, NULL, 0);
3158 if (ret)
3159 return ret;
3160 }
3161
3162 while (len) {
/* a batch is pending: continue from the slot path already points at */
3163 if (args->nr)
3164 goto next_slot;
3165again:
3166 key.objectid = btrfs_ino(inode);
3167 key.type = BTRFS_EXTENT_DATA_KEY;
3168 key.offset = search_start;
3169
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3171 if (ret < 0)
3172 return ret;
3173
3174 if (ret) {
3175 /*
3176 * A rare case were we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
/* ran off the front of the leaf: retry the search one byte earlier */
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201
/* the item found must actually reach past 'start' */
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3203 struct btrfs_file_extent_item);
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
/*
 * NOTE(review): the key is read from path->nodes[0] while the item pointer
 * comes from args->src; presumably these are the same buffer whenever a
 * batch is pending -- confirm.
 */
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
/*
 * Extend the pending batch if this slot directly follows it; otherwise
 * flush the batch and start a new one at this slot.
 */
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
/* NOTE(review): the !args->nr test is always true once the branches above failed */
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
/* advance past the item just queued; the caller reads next_offset to test contiguity */
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250
/* still inside the leaf: keep walking slots, or stop with the batch pending */
3251 if (path->slots[0] < nritems) {
3252 if (len)
3253 goto next_slot;
3254 break;
3255 }
3256
/* end of the leaf reached: flush the batch so the next pass searches again */
3257 if (args->nr) {
3258 ret = copy_items(trans, inode, dst_path, args->src,
3259 args->start_slot, args->nr,
3260 LOG_INODE_ALL);
3261 if (ret)
3262 return ret;
3263 args->nr = 0;
3264 btrfs_release_path(path);
3265 }
3266 }
3267
3268 return 0;
3269}
3270
/*
 * btrfs_log_changed_extents - log the extents on the inode's
 * modified_extents list that are newer than the last committed transaction.
 *
 * @trans:    transaction handle
 * @root:     root the inode belongs to
 * @inode:    inode whose extent map tree is walked
 * @path:     path used by log_one_extent() for searching @root; released
 *            before returning
 * @dst_path: path handed to log_one_extent() and copy_items()
 *
 * Every entry is removed from modified_extents.  Entries whose generation is
 * above fs_info->last_trans_committed are referenced, flagged
 * EXTENT_FLAG_LOGGING, collected on a private list, sorted by start offset
 * and passed one by one to log_one_extent().  After the first error the
 * remaining entries are only unreferenced.
 *
 * Returns 0 on success or the first error returned by copy_items() or
 * log_one_extent().
 */
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root,
3273 struct inode *inode,
3274 struct btrfs_path *path,
3275 struct btrfs_path *dst_path)
3276{
3277 struct log_args args;
3278 struct extent_map *em, *n;
3279 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3281 u64 test_gen;
3282 int ret = 0;
3283
3284 INIT_LIST_HEAD(&extents);
3285
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed;
3290
/* empty modified_extents, keeping only entries newer than test_gen */
3291 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3292 list_del_init(&em->list);
3293 if (em->generation <= test_gen)
3294 continue;
3295 /* Need a ref to keep it from getting evicted from cache */
3296 atomic_inc(&em->refs);
3297 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3298 list_add_tail(&em->list, &extents);
3299 }
3300
3301 list_sort(NULL, &extents, extent_cmp);
3302
/* tree->lock is held at the top of every iteration */
3303 while (!list_empty(&extents)) {
3304 em = list_entry(extents.next, struct extent_map, list);
3305
3306 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308
3309 /*
3310 * If we had an error we just need to delete everybody from our
3311 * private list.
3312 */
3313 if (ret) {
3314 free_extent_map(em);
3315 continue;
3316 }
3317
/* drop the lock while copy_items()/log_one_extent() run */
3318 write_unlock(&tree->lock);
3319
3320 /*
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock);
3341 }
3342 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock);
3344
/* flush whatever batch log_one_extent() left pending */
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path);
3349 return ret;
3350}
3351
2806/* log a single inode in the tree log. 3352/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 3353 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 3354 * or be logged already.
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3378 int nritems;
2833 int ins_start_slot = 0; 3379 int ins_start_slot = 0;
2834 int ins_nr; 3380 int ins_nr;
3381 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3382 u64 ino = btrfs_ino(inode);
2836 3383
2837 log = root->log_root; 3384 log = root->log_root;
@@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3398
2852 max_key.objectid = ino; 3399 max_key.objectid = ino;
2853 3400
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3401
3402 /* today the code can only do partial logging of directories */
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3404 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3405 else
2861 max_key.type = (u8)-1; 3406 max_key.type = (u8)-1;
2862 max_key.offset = (u64)-1; 3407 max_key.offset = (u64)-1;
2863 3408
2864 ret = btrfs_commit_inode_delayed_items(trans, inode); 3409 /* Only run delayed items if we are a dir or a new file */
2865 if (ret) { 3410 if (S_ISDIR(inode->i_mode) ||
2866 btrfs_free_path(path); 3411 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
2867 btrfs_free_path(dst_path); 3412 ret = btrfs_commit_inode_delayed_items(trans, inode);
2868 return ret; 3413 if (ret) {
3414 btrfs_free_path(path);
3415 btrfs_free_path(dst_path);
3416 return ret;
3417 }
2869 } 3418 }
2870 3419
2871 mutex_lock(&BTRFS_I(inode)->log_mutex); 3420 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3430 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3431 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3432 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) {
3435 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0);
3437 } else {
3438 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY);
3442 }
2885 } 3443 }
2886 if (ret) { 3444 if (ret) {
2887 err = ret; 3445 err = ret;
@@ -2912,7 +3470,7 @@ again:
2912 goto next_slot; 3470 goto next_slot;
2913 } 3471 }
2914 3472
2915 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 3473 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2916 ins_nr, inode_only); 3474 ins_nr, inode_only);
2917 if (ret) { 3475 if (ret) {
2918 err = ret; 3476 err = ret;
@@ -2930,7 +3488,7 @@ next_slot:
2930 goto again; 3488 goto again;
2931 } 3489 }
2932 if (ins_nr) { 3490 if (ins_nr) {
2933 ret = copy_items(trans, log, dst_path, src, 3491 ret = copy_items(trans, inode, dst_path, src,
2934 ins_start_slot, 3492 ins_start_slot,
2935 ins_nr, inode_only); 3493 ins_nr, inode_only);
2936 if (ret) { 3494 if (ret) {
@@ -2951,8 +3509,7 @@ next_slot:
2951 break; 3509 break;
2952 } 3510 }
2953 if (ins_nr) { 3511 if (ins_nr) {
2954 ret = copy_items(trans, log, dst_path, src, 3512 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2955 ins_start_slot,
2956 ins_nr, inode_only); 3513 ins_nr, inode_only);
2957 if (ret) { 3514 if (ret) {
2958 err = ret; 3515 err = ret;
@@ -2960,7 +3517,24 @@ next_slot:
2960 } 3517 }
2961 ins_nr = 0; 3518 ins_nr = 0;
2962 } 3519 }
2963 WARN_ON(ins_nr); 3520
3521 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path,
3525 dst_path);
3526 if (ret) {
3527 err = ret;
3528 goto out_unlock;
3529 }
3530 } else {
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n;
3533
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list);
3536 }
3537
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3539 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3540 btrfs_release_path(dst_path);
@@ -2971,6 +3545,7 @@ next_slot:
2971 } 3545 }
2972 } 3546 }
2973 BTRFS_I(inode)->logged_trans = trans->transid; 3547 BTRFS_I(inode)->logged_trans = trans->transid;
3548 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
2974out_unlock: 3549out_unlock:
2975 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3550 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2976 3551
@@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3138end_trans: 3713end_trans:
3139 dput(old_parent); 3714 dput(old_parent);
3140 if (ret < 0) { 3715 if (ret < 0) {
3141 BUG_ON(ret != -ENOSPC); 3716 WARN_ON(ret != -ENOSPC);
3142 root->fs_info->last_trans_log_full_commit = trans->transid; 3717 root->fs_info->last_trans_log_full_commit = trans->transid;
3143 ret = 1; 3718 ret = 1;
3144 } 3719 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3dd..99be4c138db6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
143 * In case of allocation failure -ENOMEM is returned and the ulist stays 143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
147 gfp_t gfp_mask)
148{ 147{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); 148 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150} 149}
151 150
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 151int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
153 unsigned long *old_aux, gfp_t gfp_mask) 152 u64 *old_aux, gfp_t gfp_mask)
154{ 153{
155 int i; 154 int i;
156 155
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec8130..21a1963439c3 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
33 */ 33 */
34struct ulist_node { 34struct ulist_node {
35 u64 val; /* value to store */ 35 u64 val; /* value to store */
36 unsigned long aux; /* auxiliary value saved along with the val */ 36 u64 aux; /* auxiliary value saved along with the val */
37}; 37};
38 38
39struct ulist { 39struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
65void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
66struct ulist *ulist_alloc(gfp_t gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
67void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
69 gfp_t gfp_mask); 69int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 70 u64 *old_aux, gfp_t gfp_mask);
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist, 71struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter); 72 struct ulist_iterator *uiter);
74 73
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb71..029b903a4ae3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
639 639
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 640 bdev = blkdev_get_by_path(device->name->str, flags, holder);
641 if (IS_ERR(bdev)) { 641 if (IS_ERR(bdev)) {
642 printk(KERN_INFO "open %s failed\n", device->name->str); 642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
643 goto error; 643 goto error;
644 } 644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping); 645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1475 free_fs_devices(cur_devices); 1475 free_fs_devices(cur_devices);
1476 } 1476 }
1477 1477
1478 root->fs_info->num_tolerated_disk_barrier_failures =
1479 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1480
1478 /* 1481 /*
1479 * at this point, the device is zero sized. We want to 1482 * at this point, the device is zero sized. We want to
1480 * remove it from the devices list and zero out the old super 1483 * remove it from the devices list and zero out the old super
@@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1775 1778
1776 if (seeding_dev) { 1779 if (seeding_dev) {
1777 ret = init_first_rw_device(trans, root, device); 1780 ret = init_first_rw_device(trans, root, device);
1778 if (ret) 1781 if (ret) {
1782 btrfs_abort_transaction(trans, root, ret);
1779 goto error_trans; 1783 goto error_trans;
1784 }
1780 ret = btrfs_finish_sprout(trans, root); 1785 ret = btrfs_finish_sprout(trans, root);
1781 if (ret) 1786 if (ret) {
1787 btrfs_abort_transaction(trans, root, ret);
1782 goto error_trans; 1788 goto error_trans;
1789 }
1783 } else { 1790 } else {
1784 ret = btrfs_add_device(trans, root, device); 1791 ret = btrfs_add_device(trans, root, device);
1785 if (ret) 1792 if (ret) {
1793 btrfs_abort_transaction(trans, root, ret);
1786 goto error_trans; 1794 goto error_trans;
1795 }
1787 } 1796 }
1788 1797
1789 /* 1798 /*
@@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1793 btrfs_clear_space_info_full(root->fs_info); 1802 btrfs_clear_space_info_full(root->fs_info);
1794 1803
1795 unlock_chunks(root); 1804 unlock_chunks(root);
1805 root->fs_info->num_tolerated_disk_barrier_failures =
1806 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1796 ret = btrfs_commit_transaction(trans, root); 1807 ret = btrfs_commit_transaction(trans, root);
1797 1808
1798 if (seeding_dev) { 1809 if (seeding_dev) {
@@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1814 1825
1815error_trans: 1826error_trans:
1816 unlock_chunks(root); 1827 unlock_chunks(root);
1817 btrfs_abort_transaction(trans, root, ret);
1818 btrfs_end_transaction(trans, root); 1828 btrfs_end_transaction(trans, root);
1819 rcu_string_free(device->name); 1829 rcu_string_free(device->name);
1820 kfree(device); 1830 kfree(device);
@@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2804 } 2814 }
2805 } 2815 }
2806 2816
2817 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2818 int num_tolerated_disk_barrier_failures;
2819 u64 target = bctl->sys.target;
2820
2821 num_tolerated_disk_barrier_failures =
2822 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2823 if (num_tolerated_disk_barrier_failures > 0 &&
2824 (target &
2825 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
2826 BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
2827 num_tolerated_disk_barrier_failures = 0;
2828 else if (num_tolerated_disk_barrier_failures > 1 &&
2829 (target &
2830 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
2831 num_tolerated_disk_barrier_failures = 1;
2832
2833 fs_info->num_tolerated_disk_barrier_failures =
2834 num_tolerated_disk_barrier_failures;
2835 }
2836
2807 ret = insert_balance_item(fs_info->tree_root, bctl); 2837 ret = insert_balance_item(fs_info->tree_root, bctl);
2808 if (ret && ret != -EEXIST) 2838 if (ret && ret != -EEXIST)
2809 goto out; 2839 goto out;
@@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2836 __cancel_balance(fs_info); 2866 __cancel_balance(fs_info);
2837 } 2867 }
2838 2868
2869 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2870 fs_info->num_tolerated_disk_barrier_failures =
2871 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2872 }
2873
2839 wake_up(&fs_info->balance_wait_q); 2874 wake_up(&fs_info->balance_wait_q);
2840 2875
2841 return ret; 2876 return ret;
@@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3608 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3643 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3609 &sys_chunk_size, &sys_stripe_size, 3644 &sys_chunk_size, &sys_stripe_size,
3610 sys_chunk_offset, alloc_profile); 3645 sys_chunk_offset, alloc_profile);
3611 if (ret) 3646 if (ret) {
3612 goto abort; 3647 btrfs_abort_transaction(trans, root, ret);
3648 goto out;
3649 }
3613 3650
3614 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3651 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3615 if (ret) 3652 if (ret) {
3616 goto abort; 3653 btrfs_abort_transaction(trans, root, ret);
3654 goto out;
3655 }
3617 3656
3618 /* 3657 /*
3619 * Modifying chunk tree needs allocating new blocks from both 3658 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3623 */ 3662 */
3624 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3663 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3625 chunk_size, stripe_size); 3664 chunk_size, stripe_size);
3626 if (ret) 3665 if (ret) {
3627 goto abort; 3666 btrfs_abort_transaction(trans, root, ret);
3667 goto out;
3668 }
3628 3669
3629 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3670 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3630 sys_chunk_offset, sys_chunk_size, 3671 sys_chunk_offset, sys_chunk_size,
3631 sys_stripe_size); 3672 sys_stripe_size);
3632 if (ret) 3673 if (ret)
3633 goto abort; 3674 btrfs_abort_transaction(trans, root, ret);
3634 3675
3635 return 0; 3676out:
3636 3677
3637abort:
3638 btrfs_abort_transaction(trans, root, ret);
3639 return ret; 3678 return ret;
3640} 3679}
3641 3680
@@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3760 read_unlock(&em_tree->lock); 3799 read_unlock(&em_tree->lock);
3761 3800
3762 if (!em) { 3801 if (!em) {
3763 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3802 printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
3764 (unsigned long long)logical, 3803 (unsigned long long)logical,
3765 (unsigned long long)*length); 3804 (unsigned long long)*length);
3766 BUG(); 3805 BUG();
@@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4217 4256
4218 total_devs = bbio->num_stripes; 4257 total_devs = bbio->num_stripes;
4219 if (map_length < length) { 4258 if (map_length < length) {
4220 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 4259 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4221 "len %llu\n", (unsigned long long)logical, 4260 "len %llu\n", (unsigned long long)logical,
4222 (unsigned long long)length, 4261 (unsigned long long)length,
4223 (unsigned long long)map_length); 4262 (unsigned long long)map_length);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc55..9acb846c3e7f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "btrfs: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "btrfs: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "btrfs: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 91b91e805673..54fab041b22a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -445,6 +445,7 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
445 __field( u64, ref_root ) 445 __field( u64, ref_root )
446 __field( int, level ) 446 __field( int, level )
447 __field( int, type ) 447 __field( int, type )
448 __field( u64, seq )
448 ), 449 ),
449 450
450 TP_fast_assign( 451 TP_fast_assign(
@@ -455,17 +456,19 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
455 __entry->ref_root = full_ref->root; 456 __entry->ref_root = full_ref->root;
456 __entry->level = full_ref->level; 457 __entry->level = full_ref->level;
457 __entry->type = ref->type; 458 __entry->type = ref->type;
459 __entry->seq = ref->seq;
458 ), 460 ),
459 461
460 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " 462 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
461 "parent = %llu(%s), ref_root = %llu(%s), level = %d, " 463 "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
462 "type = %s", 464 "type = %s, seq = %llu",
463 (unsigned long long)__entry->bytenr, 465 (unsigned long long)__entry->bytenr,
464 (unsigned long long)__entry->num_bytes, 466 (unsigned long long)__entry->num_bytes,
465 show_ref_action(__entry->action), 467 show_ref_action(__entry->action),
466 show_root_type(__entry->parent), 468 show_root_type(__entry->parent),
467 show_root_type(__entry->ref_root), 469 show_root_type(__entry->ref_root),
468 __entry->level, show_ref_type(__entry->type)) 470 __entry->level, show_ref_type(__entry->type),
471 (unsigned long long)__entry->seq)
469); 472);
470 473
471TRACE_EVENT(btrfs_delayed_data_ref, 474TRACE_EVENT(btrfs_delayed_data_ref,
@@ -485,6 +488,7 @@ TRACE_EVENT(btrfs_delayed_data_ref,
485 __field( u64, owner ) 488 __field( u64, owner )
486 __field( u64, offset ) 489 __field( u64, offset )
487 __field( int, type ) 490 __field( int, type )
491 __field( u64, seq )
488 ), 492 ),
489 493
490 TP_fast_assign( 494 TP_fast_assign(
@@ -496,11 +500,12 @@ TRACE_EVENT(btrfs_delayed_data_ref,
496 __entry->owner = full_ref->objectid; 500 __entry->owner = full_ref->objectid;
497 __entry->offset = full_ref->offset; 501 __entry->offset = full_ref->offset;
498 __entry->type = ref->type; 502 __entry->type = ref->type;
503 __entry->seq = ref->seq;
499 ), 504 ),
500 505
501 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " 506 TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
502 "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " 507 "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
503 "offset = %llu, type = %s", 508 "offset = %llu, type = %s, seq = %llu",
504 (unsigned long long)__entry->bytenr, 509 (unsigned long long)__entry->bytenr,
505 (unsigned long long)__entry->num_bytes, 510 (unsigned long long)__entry->num_bytes,
506 show_ref_action(__entry->action), 511 show_ref_action(__entry->action),
@@ -508,7 +513,8 @@ TRACE_EVENT(btrfs_delayed_data_ref,
508 show_root_type(__entry->ref_root), 513 show_root_type(__entry->ref_root),
509 (unsigned long long)__entry->owner, 514 (unsigned long long)__entry->owner,
510 (unsigned long long)__entry->offset, 515 (unsigned long long)__entry->offset,
511 show_ref_type(__entry->type)) 516 show_ref_type(__entry->type),
517 (unsigned long long)__entry->seq)
512); 518);
513 519
514TRACE_EVENT(btrfs_delayed_ref_head, 520TRACE_EVENT(btrfs_delayed_ref_head,