Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/acl.c               |   13
-rw-r--r--  fs/btrfs/async-thread.c      |    2
-rw-r--r--  fs/btrfs/btrfs_inode.h       |    8
-rw-r--r--  fs/btrfs/compression.c       |   23
-rw-r--r--  fs/btrfs/ctree.c             |  243
-rw-r--r--  fs/btrfs/ctree.h             |  186
-rw-r--r--  fs/btrfs/delayed-ref.c       |  102
-rw-r--r--  fs/btrfs/delayed-ref.h       |    3
-rw-r--r--  fs/btrfs/disk-io.c           |  231
-rw-r--r--  fs/btrfs/disk-io.h           |    4
-rw-r--r--  fs/btrfs/export.c            |    4
-rw-r--r--  fs/btrfs/extent-tree.c       | 2283
-rw-r--r--  fs/btrfs/extent_io.c         |  189
-rw-r--r--  fs/btrfs/extent_io.h         |   24
-rw-r--r--  fs/btrfs/extent_map.c        |    3
-rw-r--r--  fs/btrfs/file-item.c         |   29
-rw-r--r--  fs/btrfs/file.c              |  213
-rw-r--r--  fs/btrfs/free-space-cache.c  |    5
-rw-r--r--  fs/btrfs/inode-item.c        |   27
-rw-r--r--  fs/btrfs/inode.c             | 1997
-rw-r--r--  fs/btrfs/ioctl.c             |  897
-rw-r--r--  fs/btrfs/ioctl.h             |  111
-rw-r--r--  fs/btrfs/locking.c           |    1
-rw-r--r--  fs/btrfs/ordered-data.c      |  128
-rw-r--r--  fs/btrfs/ordered-data.h      |   18
-rw-r--r--  fs/btrfs/ref-cache.c         |    1
-rw-r--r--  fs/btrfs/ref-cache.h         |    2
-rw-r--r--  fs/btrfs/relocation.c        | 1984
-rw-r--r--  fs/btrfs/root-tree.c         |   26
-rw-r--r--  fs/btrfs/super.c             |  258
-rw-r--r--  fs/btrfs/sysfs.c             |    4
-rw-r--r--  fs/btrfs/transaction.c       |  322
-rw-r--r--  fs/btrfs/transaction.h       |   24
-rw-r--r--  fs/btrfs/tree-defrag.c       |    7
-rw-r--r--  fs/btrfs/tree-log.c          |  244
-rw-r--r--  fs/btrfs/tree-log.h          |    2
-rw-r--r--  fs/btrfs/volumes.c           |   91
-rw-r--r--  fs/btrfs/xattr.c             |   14
-rw-r--r--  fs/btrfs/xattr.h             |    6
39 files changed, 6489 insertions, 3240 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
@@ -59,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				return acl;
 			set_cached_acl(inode, type, acl);
 		}
 		kfree(value);
@@ -159,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
+	if (!is_owner_or_cap(dentry->d_inode))
+		return -EPERM;
+
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
@@ -281,14 +290,14 @@ int btrfs_acl_chmod(struct inode *inode)
 	return ret;
 }
 
-struct xattr_handler btrfs_xattr_acl_default_handler = {
+const struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.get	= btrfs_xattr_acl_get,
 	.set	= btrfs_xattr_acl_set,
 };
 
-struct xattr_handler btrfs_xattr_acl_access_handler = {
+const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.get	= btrfs_xattr_acl_get,
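The two checks added to btrfs_xattr_acl_set() above are the standard VFS gate for ACL setters: only the file owner or a privileged task may change the ACL, and the filesystem must be mounted with ACL support. A minimal sketch of that gate, using the same is_owner_or_cap() and IS_POSIXACL() helpers the hunk uses (acl_set_allowed() is a hypothetical wrapper for illustration, not part of the patch):

static int acl_set_allowed(struct inode *inode)
{
	/* only the file owner or a CAP_FOWNER-capable task may set ACLs */
	if (!is_owner_or_cap(inode))
		return -EPERM;

	/* the mount must have POSIX ACL support enabled */
	if (!IS_POSIXACL(inode))
		return -EOPNOTSUPP;

	return 0;
}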
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
@@ -376,6 +377,7 @@ again:
 		if (!list_empty(&worker->pending) ||
 		    !list_empty(&worker->prio_pending)) {
 			spin_unlock_irq(&worker->lock);
+			set_current_state(TASK_RUNNING);
 			goto again;
 		}
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
 	 * of extent items we've reserved metadata for.
 	 */
 	spinlock_t accounting_lock;
+	atomic_t outstanding_extents;
 	int reserved_extents;
-	int outstanding_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -151,8 +151,14 @@ struct btrfs_inode {
 	 * of these.
 	 */
 	unsigned ordered_data_close:1;
+	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
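Switching outstanding_extents from a plain int to an atomic_t lets hot paths adjust the per-inode extent count without taking accounting_lock. A sketch of the intended access pattern (the helper names are illustrative, not from the patch; BTRFS_I() is btrfs's usual VFS-inode-to-btrfs_inode accessor):

static void count_outstanding_extent(struct inode *inode)
{
	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
}

static void uncount_outstanding_extent(struct inode *inode)
{
	BUG_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) == 0);
	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
}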
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-				      page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+								GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
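The add_ra_bio_pages() cleanup above works because add_to_page_cache_lru() is now exported: it inserts the page into the page cache and onto the file LRU in one call, so the private pagevec and the open-coded lru_cache_add() copy can go away. Condensed before/after, taken directly from the hunks above:

	/* before: open-coded insert plus a private pagevec for the LRU */
	page->index = page_index;
	if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
		page_cache_release(page);
		goto next;
	}
	page_cache_get(page);
	if (!pagevec_add(&pvec, page))
		__pagevec_lru_add_file(&pvec);

	/* after: one exported helper does both steps */
	if (add_to_page_cache_lru(page, mapping, page_index, GFP_NOFS)) {
		page_cache_release(page);
		goto next;
	}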
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -279,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct extent_buffer *buf,
-				       struct extent_buffer *cow)
+				       struct extent_buffer *cow,
+				       int *last_ref)
 {
 	u64 refs;
 	u64 owner;
@@ -365,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		clean_tree_block(trans, root, buf);
+		*last_ref = 1;
 	}
 	return 0;
 }
@@ -391,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
 	int level;
+	int last_ref = 0;
 	int unlock_orig = 0;
 	u64 parent_start;
 
@@ -441,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	update_ref_for_cow(trans, root, buf, cow);
+	update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+	if (root->ref_cows)
+		btrfs_reloc_cow_block(trans, root, buf, cow);
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
@@ -456,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				      parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
@@ -472,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				      parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -948,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return bin_search(eb, key, level, slot);
 }
 
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) + size);
+	spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) - size);
+	spin_unlock(&root->accounting_lock);
+}
+
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  * NULL is returned on error.
@@ -1018,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_tree_unlock(child);
+			free_extent_buffer(child);
+			goto enospc;
+		}
 
 		spin_lock(&root->node_lock);
 		root->node = child;
@@ -1033,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
-		ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
-					    0, root->root_key.objectid, level);
+
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
-		return ret;
+		return 0;
 	}
 	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1087,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			u64 bytenr = right->start;
-			u32 blocksize = right->len;
-
 			clean_tree_block(trans, root, right);
 			btrfs_tree_unlock(right);
-			free_extent_buffer(right);
-			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
 				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_tree_block(trans, root,
-						     bytenr, blocksize, 0,
-						     root->root_key.objectid,
-						     level);
-			if (wret)
-				ret = wret;
+			root_sub_used(root, right->len);
+			btrfs_free_tree_block(trans, root, right, 0, 1);
+			free_extent_buffer(right);
+			right = NULL;
 		} else {
 			struct btrfs_disk_key right_key;
 			btrfs_node_key(right, &right_key, 0);
@@ -1135,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		BUG_ON(wret == 1);
 	}
 	if (btrfs_header_nritems(mid) == 0) {
-		/* we've managed to empty the middle node, drop it */
-		u64 bytenr = mid->start;
-		u32 blocksize = mid->len;
-
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		free_extent_buffer(mid);
-		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
-					     0, root->root_key.objectid, level);
-		if (wret)
-			ret = wret;
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
+		free_extent_buffer(mid);
+		mid = NULL;
 	} else {
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
@@ -1589,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_release_path(NULL, p);
 
 	ret = -EAGAIN;
-	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	tmp = read_tree_block(root, blocknr, blocksize, 0);
 	if (tmp) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
@@ -1739,7 +1754,6 @@ again:
 					  p->nodes[level + 1],
 					  p->slots[level + 1], &b);
 			if (err) {
-				free_extent_buffer(b);
 				ret = err;
 				goto done;
 			}
@@ -2075,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
@@ -2133,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
+	btrfs_assert_tree_locked(path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
@@ -2201,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_level(split, btrfs_header_level(c));
 	btrfs_set_header_bytenr(split, split->start);
@@ -2285,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 	return ret;
 }
 
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right.  We'll push up to and including min_slot, but no lower
+ */
 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      int data_size, int empty,
 				      struct extent_buffer *right,
-				      int free_space, u32 left_nritems)
+				      int free_space, u32 left_nritems,
+				      u32 min_slot)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *upper = path->nodes[1];
@@ -2308,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (empty)
 		nr = 0;
 	else
-		nr = 1;
+		nr = max_t(u32, 1, min_slot);
 
 	if (path->slots[0] >= left_nritems)
 		push_space += data_size;
@@ -2414,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
+	else
+		clean_tree_block(trans, root, left);
+
 	btrfs_mark_buffer_dirty(right);
 
 	btrfs_item_key(right, &disk_key, 0);
@@ -2447,10 +2474,14 @@ out_unlock:
  *
  * returns 1 if the push failed because the other node didn't have enough
  * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
  */
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size,
-			   int empty)
+			   *root, struct btrfs_path *path,
+			   int min_data_size, int data_size,
+			   int empty, u32 min_slot)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *right;
@@ -2492,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (left_nritems == 0)
 		goto out_unlock;
 
-	return __push_leaf_right(trans, root, path, data_size, empty,
-				 right, free_space, left_nritems);
+	return __push_leaf_right(trans, root, path, min_data_size, empty,
+				 right, free_space, left_nritems, min_slot);
 out_unlock:
 	btrfs_tree_unlock(right);
 	free_extent_buffer(right);
@@ -2503,12 +2534,17 @@ out_unlock:
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
+ * items
  */
 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct btrfs_path *path, int data_size,
 				     int empty, struct extent_buffer *left,
-				     int free_space, int right_nritems)
+				     int free_space, u32 right_nritems,
+				     u32 max_slot)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
@@ -2527,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	slot = path->slots[1];
 
 	if (empty)
-		nr = right_nritems;
+		nr = min(right_nritems, max_slot);
 	else
-		nr = right_nritems - 1;
+		nr = min(right_nritems - 1, max_slot);
 
 	for (i = 0; i < nr; i++) {
 		item = btrfs_item_nr(right, i);
@@ -2659,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(left);
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
+	else
+		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2668,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = left;
@@ -2690,10 +2726,14 @@ out:
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
  */
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int data_size,
-			  int empty)
+			  *root, struct btrfs_path *path, int min_data_size,
+			  int data_size, int empty, u32 max_slot)
 {
 	struct extent_buffer *right = path->nodes[0];
 	struct extent_buffer *left;
@@ -2739,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out;
 	}
 
-	return __push_leaf_left(trans, root, path, data_size,
-				empty, left, free_space, right_nritems);
+	return __push_leaf_left(trans, root, path, min_data_size,
+				empty, left, free_space, right_nritems,
+				max_slot);
 out:
 	btrfs_tree_unlock(left);
 	free_extent_buffer(left);
@@ -2833,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 }
 
 /*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf.  A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ *          A                 B                 C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves.  If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  int data_size)
+{
+	int ret;
+	int progress = 0;
+	int slot;
+	u32 nritems;
+
+	slot = path->slots[0];
+
+	/*
+	 * try to push all the items after our slot into the
+	 * right leaf
+	 */
+	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	/*
+	 * our goal is to get our slot at the start or end of a leaf.  If
+	 * we've done so we're done
+	 */
+	if (path->slots[0] == 0 || path->slots[0] == nritems)
+		return 0;
+
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+
+	/* try to push all the items before our slot into the next leaf */
+	slot = path->slots[0];
+	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	if (progress)
+		return 0;
+	return 1;
+}
+
+/*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
  *
@@ -2854,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int wret;
 	int split;
 	int num_doubles = 0;
+	int tried_avoid_double = 0;
 
 	l = path->nodes[0];
 	slot = path->slots[0];
@@ -2862,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
-		wret = push_leaf_right(trans, root, path, data_size, 0);
+	if (data_size) {
+		wret = push_leaf_right(trans, root, path, data_size,
+				       data_size, 0, 0);
 		if (wret < 0)
 			return wret;
 		if (wret) {
-			wret = push_leaf_left(trans, root, path, data_size, 0);
+			wret = push_leaf_left(trans, root, path, data_size,
+					      data_size, 0, (u32)-1);
 			if (wret < 0)
 				return wret;
 		}
@@ -2901,6 +3003,8 @@ again:
 		if (mid != nritems &&
 		    leaf_space_used(l, mid, nritems - mid) +
 		    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+			if (data_size && !tried_avoid_double)
+				goto push_for_double;
 			split = 2;
 		}
 	}
@@ -2917,6 +3021,8 @@ again:
 		if (mid != nritems &&
 		    leaf_space_used(l, mid, nritems - mid) +
 		    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+			if (data_size && !tried_avoid_double)
+				goto push_for_double;
 			split = 2;
 		}
 	}
@@ -2931,10 +3037,10 @@ again:
 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
 					&disk_key, 0, l->start, 0);
-	if (IS_ERR(right)) {
-		BUG_ON(1);
+	if (IS_ERR(right))
 		return PTR_ERR(right);
-	}
+
+	root_add_used(root, root->leafsize);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
@@ -2997,6 +3103,13 @@ again:
 	}
 
 	return ret;
+
+push_for_double:
+	push_for_double_split(trans, root, path, data_size);
+	tried_avoid_double = 1;
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+	goto again;
 }
 
 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3040,6 +3153,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
@@ -3049,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 
 	btrfs_set_path_blocking(path);
 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto err;
 
 	path->keep_locks = 0;
 	btrfs_unlock_up_safe(path, 1);
@@ -3791,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
-				    0, root->root_key.objectid, 0);
-	return ret;
+	root_sub_used(root, leaf->len);
+
+	btrfs_free_tree_block(trans, root, leaf, 0, 1);
+	return 0;
 }
 /*
  * delete the item at the leaf level in path.  If that empties
@@ -3860,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
+			btrfs_set_path_blocking(path);
+			clean_tree_block(trans, root, leaf);
 			ret = btrfs_del_leaf(trans, root, path, leaf);
 			BUG_ON(ret);
 		}
@@ -3885,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			extent_buffer_get(leaf);
 
 			btrfs_set_path_blocking(path);
-			wret = push_leaf_left(trans, root, path, 1, 1);
+			wret = push_leaf_left(trans, root, path, 1, 1,
+					      1, (u32)-1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 
 			if (path->nodes[0] == leaf &&
 			    btrfs_header_nritems(leaf)) {
-				wret = push_leaf_right(trans, root, path, 1, 1);
+				wret = push_leaf_right(trans, root, path, 1,
+						       1, 1, 0);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
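Two themes run through the ctree.c changes: every tree-block allocation or free is now mirrored into the root item through root_add_used()/root_sub_used(), and the leaf-push helpers take min_slot/max_slot bounds so push_for_double_split() can move only the items on one side of the insertion point. A sketch of the pairing convention for the accounting helpers, condensed from the hunks above (not a verbatim call site):

	cow = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				     root->root_key.objectid, &disk_key,
				     level, hint, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);
	root_add_used(root, root->nodesize);	/* count the new block */

	/* ... later, when a block is dropped ... */
	root_sub_used(root, buf->len);		/* uncount before freeing */
	btrfs_free_tree_block(trans, root, buf, parent_start, last_ref);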
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..eaf286abad17 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -33,6 +34,7 @@
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -373,11 +375,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -660,6 +664,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_NR_RAID_TYPES	   5
 
 struct btrfs_block_group_item {
 	__le64 used;
@@ -671,42 +676,46 @@ struct btrfs_space_info {
 	u64 flags;
 
 	u64 total_bytes;	/* total bytes in the space */
-	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_used;		/* total bytes used,
+				   this doesn't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
 				   transaction finishes */
 	u64 bytes_reserved;	/* total bytes the allocator has reserved for
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
-	u64 bytes_super;	/* total bytes reserved for the super blocks */
-	u64 bytes_root;		/* the number of bytes needed to commit a
-				   transaction */
+
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
-	u64 bytes_delalloc;	/* number of bytes currently reserved for
-				   delayed allocation */
+	u64 disk_used;		/* total bytes used on disk */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
-	int force_delalloc;	/* make people start doing filemap_flush until
-				   we're under a threshold */
 
 	struct list_head list;
 
-	/* for controlling how we free up space for allocations */
-	wait_queue_head_t allocate_wait;
-	wait_queue_head_t flush_wait;
-	int allocating_chunk;
-	int flushing;
-
 	/* for block groups in our same type */
-	struct list_head block_groups;
+	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
 };
 
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	u64 freed[2];
+	struct btrfs_space_info *space_info;
+	struct list_head list;
+	spinlock_t lock;
+	atomic_t usage;
+	unsigned int priority:8;
+	unsigned int durable:1;
+	unsigned int refill_used:1;
+	unsigned int full:1;
+};
+
 /*
  * free clusters are used to claim free space in relatively large chunks,
  * allowing us to do less seeky writes.  They are used for all metadata
@@ -757,6 +766,7 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
+	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
@@ -822,6 +832,22 @@ struct btrfs_fs_info {
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
 
+	/* block reservation for extent, checksum and root tree */
+	struct btrfs_block_rsv global_block_rsv;
+	/* block reservation for delay allocation */
+	struct btrfs_block_rsv delalloc_block_rsv;
+	/* block reservation for metadata operations */
+	struct btrfs_block_rsv trans_block_rsv;
+	/* block reservation for chunk tree */
+	struct btrfs_block_rsv chunk_block_rsv;
+
+	struct btrfs_block_rsv empty_block_rsv;
+
+	/* list of block reservations that cross multiple transactions */
+	struct list_head durable_block_rsv_list;
+
+	struct mutex durable_block_rsv_mutex;
+
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -832,7 +858,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -925,7 +950,6 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
-	struct btrfs_workers enospc_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write.  It happens
@@ -941,6 +965,7 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	int enospc_unlink;
 
 	u64 total_pinned;
 
@@ -1010,6 +1035,9 @@ struct btrfs_root {
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
 
+	spinlock_t accounting_lock;
+	struct btrfs_block_rsv *block_rsv;
+
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
@@ -1041,7 +1069,6 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-	int clean_orphans;
 
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
@@ -1055,8 +1082,11 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
-	spinlock_t list_lock;
+	spinlock_t orphan_lock;
 	struct list_head orphan_list;
+	struct btrfs_block_rsv *orphan_block_rsv;
+	int orphan_item_inserted;
+	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
@@ -1182,7 +1212,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1871,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -1964,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1983,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    u64 parent, u64 root_objectid,
 					    struct btrfs_disk_key *key, int level,
 					    u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 bytenr, u32 blocksize,
-			  u64 parent, u64 root_objectid, int level);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *buf,
+			   u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
@@ -2040,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-				struct btrfs_block_group_cache *group);
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				  u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			       u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv,
+			  u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2151,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct extent_buffer *node,
@@ -2244,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       const char *name, int name_len,
+		       u64 inode_objectid, u64 ref_objectid, int mod);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -2256,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -2310,7 +2373,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2324,18 +2389,18 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 			       pgoff_t offset, pgoff_t last_index);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
-void btrfs_delete_inode(struct inode *inode);
+void btrfs_evict_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, int wait);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2347,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2347int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2348int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2349void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2350int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2351int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2352void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2353void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2354extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2355 2430
2356/* ioctl.c */ 2431/* ioctl.c */
@@ -2359,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
2359void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2360 2435
2361/* file.c */ 2436/* file.c */
2362int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2437int btrfs_sync_file(struct file *file, int datasync);
2363int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2364 int skip_pinned); 2439 int skip_pinned);
2365int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2386,7 +2461,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2386ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2461ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2387 2462
2388/* super.c */ 2463/* super.c */
2389u64 btrfs_parse_size(char *str);
2390int btrfs_parse_options(struct btrfs_root *root, char *options); 2464int btrfs_parse_options(struct btrfs_root *root, char *options);
2391int btrfs_sync_fs(struct super_block *sb, int wait); 2465int btrfs_sync_fs(struct super_block *sb, int wait);
2392 2466
@@ -2408,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2408 struct btrfs_root *root); 2482 struct btrfs_root *root);
2409int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2410int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2411#endif 2493#endif
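
The prototype churn in this header tracks VFS interface changes from the same kernel cycle: btrfs_sync_file loses its dentry argument and btrfs_write_inode now takes the writeback_control that the VFS hands to ->write_inode, in place of the old "int wait" flag. A minimal sketch of how a callback of the new shape recovers the wait semantics (the function name is hypothetical):

        static int example_write_inode(struct inode *inode,
                                       struct writeback_control *wbc)
        {
                /* WB_SYNC_ALL is the moral equivalent of the old wait != 0 */
                if (wbc->sync_mode == WB_SYNC_ALL) {
                        /* write the inode item and wait for the I/O here */
                }
                return 0;
        }
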
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
@@ -318,107 +319,6 @@ out:
318} 319}
319 320
320/* 321/*
321 * helper function to lookup reference count and flags of extent.
322 *
323 * the head node for delayed ref is used to store the sum of all the
324 * reference count modifications queued up in the rbtree. the head
325 * node may also store the extent flags to set. This way you can check
326 * to see what the reference count and extent flags would be if all of
327 * the delayed refs are not processed.
328 */
329int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
330 struct btrfs_root *root, u64 bytenr,
331 u64 num_bytes, u64 *refs, u64 *flags)
332{
333 struct btrfs_delayed_ref_node *ref;
334 struct btrfs_delayed_ref_head *head;
335 struct btrfs_delayed_ref_root *delayed_refs;
336 struct btrfs_path *path;
337 struct btrfs_extent_item *ei;
338 struct extent_buffer *leaf;
339 struct btrfs_key key;
340 u32 item_size;
341 u64 num_refs;
342 u64 extent_flags;
343 int ret;
344
345 path = btrfs_alloc_path();
346 if (!path)
347 return -ENOMEM;
348
349 key.objectid = bytenr;
350 key.type = BTRFS_EXTENT_ITEM_KEY;
351 key.offset = num_bytes;
352 delayed_refs = &trans->transaction->delayed_refs;
353again:
354 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
355 &key, path, 0, 0);
356 if (ret < 0)
357 goto out;
358
359 if (ret == 0) {
360 leaf = path->nodes[0];
361 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
362 if (item_size >= sizeof(*ei)) {
363 ei = btrfs_item_ptr(leaf, path->slots[0],
364 struct btrfs_extent_item);
365 num_refs = btrfs_extent_refs(leaf, ei);
366 extent_flags = btrfs_extent_flags(leaf, ei);
367 } else {
368#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
369 struct btrfs_extent_item_v0 *ei0;
370 BUG_ON(item_size != sizeof(*ei0));
371 ei0 = btrfs_item_ptr(leaf, path->slots[0],
372 struct btrfs_extent_item_v0);
373 num_refs = btrfs_extent_refs_v0(leaf, ei0);
374 /* FIXME: this isn't correct for data */
375 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
376#else
377 BUG();
378#endif
379 }
380 BUG_ON(num_refs == 0);
381 } else {
382 num_refs = 0;
383 extent_flags = 0;
384 ret = 0;
385 }
386
387 spin_lock(&delayed_refs->lock);
388 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
389 if (ref) {
390 head = btrfs_delayed_node_to_head(ref);
391 if (!mutex_trylock(&head->mutex)) {
392 atomic_inc(&ref->refs);
393 spin_unlock(&delayed_refs->lock);
394
395 btrfs_release_path(root->fs_info->extent_root, path);
396
397 mutex_lock(&head->mutex);
398 mutex_unlock(&head->mutex);
399 btrfs_put_delayed_ref(ref);
400 goto again;
401 }
402 if (head->extent_op && head->extent_op->update_flags)
403 extent_flags |= head->extent_op->flags_to_set;
404 else
405 BUG_ON(num_refs == 0);
406
407 num_refs += ref->ref_mod;
408 mutex_unlock(&head->mutex);
409 }
410 WARN_ON(num_refs == 0);
411 if (refs)
412 *refs = num_refs;
413 if (flags)
414 *flags = extent_flags;
415out:
416 spin_unlock(&delayed_refs->lock);
417 btrfs_free_path(path);
418 return ret;
419}
420
421/*
422 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
423 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
424 * bytenr and parent 324 * bytenr and parent
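
The hundred-line block deleted here is not dropped from the filesystem: btrfs_lookup_extent_info reappears in extent-tree.c later in this patch, extended so that a NULL transaction handle is legal and triggers a lockless search of the commit root. A hypothetical read-only caller under the new contract:

        u64 refs = 0;
        u64 flags = 0;
        int ret;

        /* trans == NULL: skip_locking + search_commit_root inside */
        ret = btrfs_lookup_extent_info(NULL, root, bytenr, num_bytes,
                                       &refs, &flags);
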
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87b25543d7d1..64f10082f048 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root); 45static void free_fs_root(struct btrfs_root *root);
45 46
46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
47
48/* 47/*
49 * end_io_wq structs are used to do processing in task context when an IO is 48 * end_io_wq structs are used to do processing in task context when an IO is
50 * complete. This is used during reads to verify checksums, and it is used 49 * complete. This is used during reads to verify checksums, and it is used
@@ -75,6 +74,11 @@ struct async_submit_bio {
75 int rw; 74 int rw;
76 int mirror_num; 75 int mirror_num;
77 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
78 struct btrfs_work work; 82 struct btrfs_work work;
79}; 83};
80 84
@@ -263,13 +267,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
263static int verify_parent_transid(struct extent_io_tree *io_tree, 267static int verify_parent_transid(struct extent_io_tree *io_tree,
264 struct extent_buffer *eb, u64 parent_transid) 268 struct extent_buffer *eb, u64 parent_transid)
265{ 269{
270 struct extent_state *cached_state = NULL;
266 int ret; 271 int ret;
267 272
268 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 273 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
269 return 0; 274 return 0;
270 275
271 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); 276 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
272 if (extent_buffer_uptodate(io_tree, eb) && 277 0, &cached_state, GFP_NOFS);
278 if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
273 btrfs_header_generation(eb) == parent_transid) { 279 btrfs_header_generation(eb) == parent_transid) {
274 ret = 0; 280 ret = 0;
275 goto out; 281 goto out;
@@ -282,10 +288,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
282 (unsigned long long)btrfs_header_generation(eb)); 288 (unsigned long long)btrfs_header_generation(eb));
283 } 289 }
284 ret = 1; 290 ret = 1;
285 clear_extent_buffer_uptodate(io_tree, eb); 291 clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
286out: 292out:
287 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, 293 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
288 GFP_NOFS); 294 &cached_state, GFP_NOFS);
289 return ret; 295 return ret;
290} 296}
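
verify_parent_transid now threads one extent_state cache through the lock, the uptodate check, and the unlock, so unlock_extent_cached can go straight to the cached state instead of walking the io_tree a second time. The pattern in isolation, a sketch assuming an initialized io_tree and a valid [start, end] range:

        struct extent_state *cached_state = NULL;

        lock_extent_bits(io_tree, start, end, 0, &cached_state, GFP_NOFS);
        /* ... inspect or modify the range, passing cached_state along ... */
        unlock_extent_cached(io_tree, start, end, &cached_state, GFP_NOFS);
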
291 297
@@ -474,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
474 end_io_wq->work.func = end_workqueue_fn; 480 end_io_wq->work.func = end_workqueue_fn;
475 end_io_wq->work.flags = 0; 481 end_io_wq->work.flags = 0;
476 482
477 if (bio->bi_rw & (1 << BIO_RW)) { 483 if (bio->bi_rw & REQ_WRITE) {
478 if (end_io_wq->metadata) 484 if (end_io_wq->metadata)
479 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 485 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
480 &end_io_wq->work); 486 &end_io_wq->work);
@@ -533,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
533 async = container_of(work, struct async_submit_bio, work); 539 async = container_of(work, struct async_submit_bio, work);
534 fs_info = BTRFS_I(async->inode)->root->fs_info; 540 fs_info = BTRFS_I(async->inode)->root->fs_info;
535 async->submit_bio_start(async->inode, async->rw, async->bio, 541 async->submit_bio_start(async->inode, async->rw, async->bio,
536 async->mirror_num, async->bio_flags); 542 async->mirror_num, async->bio_flags,
543 async->bio_offset);
537} 544}
538 545
539static void run_one_async_done(struct btrfs_work *work) 546static void run_one_async_done(struct btrfs_work *work)
@@ -555,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
555 wake_up(&fs_info->async_submit_wait); 562 wake_up(&fs_info->async_submit_wait);
556 563
557 async->submit_bio_done(async->inode, async->rw, async->bio, 564 async->submit_bio_done(async->inode, async->rw, async->bio,
558 async->mirror_num, async->bio_flags); 565 async->mirror_num, async->bio_flags,
566 async->bio_offset);
559} 567}
560 568
561static void run_one_async_free(struct btrfs_work *work) 569static void run_one_async_free(struct btrfs_work *work)
@@ -569,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
569int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 577int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
570 int rw, struct bio *bio, int mirror_num, 578 int rw, struct bio *bio, int mirror_num,
571 unsigned long bio_flags, 579 unsigned long bio_flags,
580 u64 bio_offset,
572 extent_submit_bio_hook_t *submit_bio_start, 581 extent_submit_bio_hook_t *submit_bio_start,
573 extent_submit_bio_hook_t *submit_bio_done) 582 extent_submit_bio_hook_t *submit_bio_done)
574{ 583{
@@ -591,10 +600,11 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
591 600
592 async->work.flags = 0; 601 async->work.flags = 0;
593 async->bio_flags = bio_flags; 602 async->bio_flags = bio_flags;
603 async->bio_offset = bio_offset;
594 604
595 atomic_inc(&fs_info->nr_async_submits); 605 atomic_inc(&fs_info->nr_async_submits);
596 606
597 if (rw & (1 << BIO_RW_SYNCIO)) 607 if (rw & REQ_SYNC)
598 btrfs_set_work_high_prio(&async->work); 608 btrfs_set_work_high_prio(&async->work);
599 609
600 btrfs_queue_worker(&fs_info->workers, &async->work); 610 btrfs_queue_worker(&fs_info->workers, &async->work);
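
The (1 << BIO_RW) and (1 << BIO_RW_SYNCIO) tests disappear because the block layer unified bio and request flags into a single REQ_* namespace; the new names are masks rather than bit indices, so the shift goes away. The test collapses to:

        if (bio->bi_rw & REQ_WRITE)
                handle_write(bio);      /* hypothetical helper */
        else
                handle_read(bio);       /* hypothetical helper */
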
@@ -626,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
626 636
627static int __btree_submit_bio_start(struct inode *inode, int rw, 637static int __btree_submit_bio_start(struct inode *inode, int rw,
628 struct bio *bio, int mirror_num, 638 struct bio *bio, int mirror_num,
629 unsigned long bio_flags) 639 unsigned long bio_flags,
640 u64 bio_offset)
630{ 641{
631 /* 642 /*
632 * when we're called for a write, we're already in the async 643 * when we're called for a write, we're already in the async
@@ -637,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
637} 648}
638 649
639static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 650static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
640 int mirror_num, unsigned long bio_flags) 651 int mirror_num, unsigned long bio_flags,
652 u64 bio_offset)
641{ 653{
642 /* 654 /*
643 * when we're called for a write, we're already in the async 655 * when we're called for a write, we're already in the async
@@ -647,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
647} 659}
648 660
649static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 661static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
650 int mirror_num, unsigned long bio_flags) 662 int mirror_num, unsigned long bio_flags,
663 u64 bio_offset)
651{ 664{
652 int ret; 665 int ret;
653 666
@@ -655,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
655 bio, 1); 668 bio, 1);
656 BUG_ON(ret); 669 BUG_ON(ret);
657 670
658 if (!(rw & (1 << BIO_RW))) { 671 if (!(rw & REQ_WRITE)) {
659 /* 672 /*
660 * called for a read, do the setup so that checksum validation 673 * called for a read, do the setup so that checksum validation
661 * can happen in the async kernel threads 674 * can happen in the async kernel threads
@@ -670,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
670 */ 683 */
671 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 684 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
672 inode, rw, bio, mirror_num, 0, 685 inode, rw, bio, mirror_num, 0,
686 bio_offset,
673 __btree_submit_bio_start, 687 __btree_submit_bio_start,
674 __btree_submit_bio_done); 688 __btree_submit_bio_done);
675} 689}
@@ -893,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
893 root->ref_cows = 0; 907 root->ref_cows = 0;
894 root->track_dirty = 0; 908 root->track_dirty = 0;
895 root->in_radix = 0; 909 root->in_radix = 0;
896 root->clean_orphans = 0; 910 root->orphan_item_inserted = 0;
911 root->orphan_cleanup_state = 0;
897 912
898 root->fs_info = fs_info; 913 root->fs_info = fs_info;
899 root->objectid = objectid; 914 root->objectid = objectid;
@@ -901,14 +916,17 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
901 root->highest_objectid = 0; 916 root->highest_objectid = 0;
902 root->name = NULL; 917 root->name = NULL;
903 root->in_sysfs = 0; 918 root->in_sysfs = 0;
904 root->inode_tree.rb_node = NULL; 919 root->inode_tree = RB_ROOT;
920 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL;
905 922
906 INIT_LIST_HEAD(&root->dirty_list); 923 INIT_LIST_HEAD(&root->dirty_list);
907 INIT_LIST_HEAD(&root->orphan_list); 924 INIT_LIST_HEAD(&root->orphan_list);
908 INIT_LIST_HEAD(&root->root_list); 925 INIT_LIST_HEAD(&root->root_list);
909 spin_lock_init(&root->node_lock); 926 spin_lock_init(&root->node_lock);
910 spin_lock_init(&root->list_lock); 927 spin_lock_init(&root->orphan_lock);
911 spin_lock_init(&root->inode_lock); 928 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock);
912 mutex_init(&root->objectid_mutex); 930 mutex_init(&root->objectid_mutex);
913 mutex_init(&root->log_mutex); 931 mutex_init(&root->log_mutex);
914 init_waitqueue_head(&root->log_writer_wait); 932 init_waitqueue_head(&root->log_writer_wait);
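
root->inode_tree = RB_ROOT replaces the hand-rolled rb_node = NULL. RB_ROOT is the canonical empty-tree initializer from <linux/rbtree.h>, so the assignment stays correct even if struct rb_root grows fields:

        #include <linux/rbtree.h>

        struct rb_root inode_tree = RB_ROOT;    /* i.e. { .rb_node = NULL, } */
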
@@ -967,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
967 return 0; 985 return 0;
968} 986}
969 987
970int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
971 struct btrfs_fs_info *fs_info)
972{
973 struct extent_buffer *eb;
974 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
975 u64 start = 0;
976 u64 end = 0;
977 int ret;
978
979 if (!log_root_tree)
980 return 0;
981
982 while (1) {
983 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
984 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
985 if (ret)
986 break;
987
988 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
989 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
990 }
991 eb = fs_info->log_root_tree->node;
992
993 WARN_ON(btrfs_header_level(eb) != 0);
994 WARN_ON(btrfs_header_nritems(eb) != 0);
995
996 ret = btrfs_free_reserved_extent(fs_info->tree_root,
997 eb->start, eb->len);
998 BUG_ON(ret);
999
1000 free_extent_buffer(eb);
1001 kfree(fs_info->log_root_tree);
1002 fs_info->log_root_tree = NULL;
1003 return 0;
1004}
1005
1006static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 988static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1007 struct btrfs_fs_info *fs_info) 989 struct btrfs_fs_info *fs_info)
1008{ 990{
@@ -1190,19 +1172,23 @@ again:
1190 if (root) 1172 if (root)
1191 return root; 1173 return root;
1192 1174
1193 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1194 if (ret == 0)
1195 ret = -ENOENT;
1196 if (ret < 0)
1197 return ERR_PTR(ret);
1198
1199 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1175 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1200 if (IS_ERR(root)) 1176 if (IS_ERR(root))
1201 return root; 1177 return root;
1202 1178
1203 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1204 set_anon_super(&root->anon_super, NULL); 1179 set_anon_super(&root->anon_super, NULL);
1205 1180
1181 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT;
1183 goto fail;
1184 }
1185
1186 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1187 if (ret < 0)
1188 goto fail;
1189 if (ret == 0)
1190 root->orphan_item_inserted = 1;
1191
1206 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1192 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1207 if (ret) 1193 if (ret)
1208 goto fail; 1194 goto fail;
@@ -1211,10 +1197,9 @@ again:
1211 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1197 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1212 (unsigned long)root->root_key.objectid, 1198 (unsigned long)root->root_key.objectid,
1213 root); 1199 root);
1214 if (ret == 0) { 1200 if (ret == 0)
1215 root->in_radix = 1; 1201 root->in_radix = 1;
1216 root->clean_orphans = 1; 1202
1217 }
1218 spin_unlock(&fs_info->fs_roots_radix_lock); 1203 spin_unlock(&fs_info->fs_roots_radix_lock);
1219 radix_tree_preload_end(); 1204 radix_tree_preload_end();
1220 if (ret) { 1205 if (ret) {
@@ -1372,19 +1357,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1372{ 1357{
1373 int err; 1358 int err;
1374 1359
1375 bdi->name = "btrfs";
1376 bdi->capabilities = BDI_CAP_MAP_COPY; 1360 bdi->capabilities = BDI_CAP_MAP_COPY;
1377 err = bdi_init(bdi); 1361 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1378 if (err) 1362 if (err)
1379 return err; 1363 return err;
1380 1364
1381 err = bdi_register(bdi, NULL, "btrfs-%d",
1382 atomic_inc_return(&btrfs_bdi_num));
1383 if (err) {
1384 bdi_destroy(bdi);
1385 return err;
1386 }
1387
1388 bdi->ra_pages = default_backing_dev_info.ra_pages; 1365 bdi->ra_pages = default_backing_dev_info.ra_pages;
1389 bdi->unplug_io_fn = btrfs_unplug_io_fn; 1366 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1390 bdi->unplug_io_data = info; 1367 bdi->unplug_io_data = info;
@@ -1450,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
1450 * ram and up to date before trying to verify things. For 1427 * ram and up to date before trying to verify things. For
1451 * blocksize <= pagesize, it is basically a noop 1428 * blocksize <= pagesize, it is basically a noop
1452 */ 1429 */
1453 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && 1430 if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
1454 !bio_ready_for_csum(bio)) { 1431 !bio_ready_for_csum(bio)) {
1455 btrfs_queue_worker(&fs_info->endio_meta_workers, 1432 btrfs_queue_worker(&fs_info->endio_meta_workers,
1456 &end_io_wq->work); 1433 &end_io_wq->work);
@@ -1468,10 +1445,6 @@ static int cleaner_kthread(void *arg)
1468 struct btrfs_root *root = arg; 1445 struct btrfs_root *root = arg;
1469 1446
1470 do { 1447 do {
1471 smp_mb();
1472 if (root->fs_info->closing)
1473 break;
1474
1475 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1448 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1476 1449
1477 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1450 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1484,11 +1457,9 @@ static int cleaner_kthread(void *arg)
1484 if (freezing(current)) { 1457 if (freezing(current)) {
1485 refrigerator(); 1458 refrigerator();
1486 } else { 1459 } else {
1487 smp_mb();
1488 if (root->fs_info->closing)
1489 break;
1490 set_current_state(TASK_INTERRUPTIBLE); 1460 set_current_state(TASK_INTERRUPTIBLE);
1491 schedule(); 1461 if (!kthread_should_stop())
1462 schedule();
1492 __set_current_state(TASK_RUNNING); 1463 __set_current_state(TASK_RUNNING);
1493 } 1464 }
1494 } while (!kthread_should_stop()); 1465 } while (!kthread_should_stop());
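
The reordering in cleaner_kthread closes a shutdown race: the task publishes TASK_INTERRUPTIBLE first and re-checks kthread_should_stop() only afterwards, so a stop request arriving in the gap makes the following schedule() return immediately instead of being lost. Previously the closing check ran before the state change and the thread could sleep through its own stop signal. The idiom reduced to a skeleton, with the work step abstracted away:

        while (!kthread_should_stop()) {
                do_periodic_work();             /* placeholder */

                set_current_state(TASK_INTERRUPTIBLE);
                if (!kthread_should_stop())
                        schedule();             /* kthread_stop() wakes us */
                __set_current_state(TASK_RUNNING);
        }
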
@@ -1500,36 +1471,40 @@ static int transaction_kthread(void *arg)
1500 struct btrfs_root *root = arg; 1471 struct btrfs_root *root = arg;
1501 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1502 struct btrfs_transaction *cur; 1473 struct btrfs_transaction *cur;
1474 u64 transid;
1503 unsigned long now; 1475 unsigned long now;
1504 unsigned long delay; 1476 unsigned long delay;
1505 int ret; 1477 int ret;
1506 1478
1507 do { 1479 do {
1508 smp_mb();
1509 if (root->fs_info->closing)
1510 break;
1511
1512 delay = HZ * 30; 1480 delay = HZ * 30;
1513 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1514 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1482 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1515 1483
1516 mutex_lock(&root->fs_info->trans_mutex); 1484 spin_lock(&root->fs_info->new_trans_lock);
1517 cur = root->fs_info->running_transaction; 1485 cur = root->fs_info->running_transaction;
1518 if (!cur) { 1486 if (!cur) {
1519 mutex_unlock(&root->fs_info->trans_mutex); 1487 spin_unlock(&root->fs_info->new_trans_lock);
1520 goto sleep; 1488 goto sleep;
1521 } 1489 }
1522 1490
1523 now = get_seconds(); 1491 now = get_seconds();
1524 if (now < cur->start_time || now - cur->start_time < 30) { 1492 if (!cur->blocked &&
1525 mutex_unlock(&root->fs_info->trans_mutex); 1493 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock);
1526 delay = HZ * 5; 1495 delay = HZ * 5;
1527 goto sleep; 1496 goto sleep;
1528 } 1497 }
1529 mutex_unlock(&root->fs_info->trans_mutex); 1498 transid = cur->transid;
1530 trans = btrfs_start_transaction(root, 1); 1499 spin_unlock(&root->fs_info->new_trans_lock);
1531 ret = btrfs_commit_transaction(trans, root);
1532 1500
1501 trans = btrfs_join_transaction(root, 1);
1502 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret);
1505 } else {
1506 btrfs_end_transaction(trans, root);
1507 }
1533sleep: 1508sleep:
1534 wake_up_process(root->fs_info->cleaner_kthread); 1509 wake_up_process(root->fs_info->cleaner_kthread);
1535 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1510 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1537,10 +1512,10 @@ sleep:
1537 if (freezing(current)) { 1512 if (freezing(current)) {
1538 refrigerator(); 1513 refrigerator();
1539 } else { 1514 } else {
1540 if (root->fs_info->closing)
1541 break;
1542 set_current_state(TASK_INTERRUPTIBLE); 1515 set_current_state(TASK_INTERRUPTIBLE);
1543 schedule_timeout(delay); 1516 if (!kthread_should_stop() &&
1517 !btrfs_transaction_blocked(root->fs_info))
1518 schedule_timeout(delay);
1544 __set_current_state(TASK_RUNNING); 1519 __set_current_state(TASK_RUNNING);
1545 } 1520 }
1546 } while (!kthread_should_stop()); 1521 } while (!kthread_should_stop());
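
transaction_kthread makes the same state-before-check change and also stops holding trans_mutex across the commit decision: it samples the running transaction's id under the new_trans_lock spinlock, then joins, and commits only if the joined handle still belongs to that same transaction. The skeleton (the real loop additionally handles a NULL running_transaction and the blocked flag):

        spin_lock(&fs_info->new_trans_lock);
        transid = fs_info->running_transaction->transid;
        spin_unlock(&fs_info->new_trans_lock);

        trans = btrfs_join_transaction(root, 1);
        if (transid == trans->transid)
                btrfs_commit_transaction(trans, root); /* still the old one */
        else
                btrfs_end_transaction(trans, root);    /* already committed */
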
@@ -1627,12 +1602,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1602 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1628 INIT_LIST_HEAD(&fs_info->space_info); 1603 INIT_LIST_HEAD(&fs_info->space_info);
1629 btrfs_mapping_init(&fs_info->mapping_tree); 1604 btrfs_mapping_init(&fs_info->mapping_tree);
1605 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1606 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1607 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1608 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1609 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1610 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
1611 mutex_init(&fs_info->durable_block_rsv_mutex);
1630 atomic_set(&fs_info->nr_async_submits, 0); 1612 atomic_set(&fs_info->nr_async_submits, 0);
1631 atomic_set(&fs_info->async_delalloc_pages, 0); 1613 atomic_set(&fs_info->async_delalloc_pages, 0);
1632 atomic_set(&fs_info->async_submit_draining, 0); 1614 atomic_set(&fs_info->async_submit_draining, 0);
1633 atomic_set(&fs_info->nr_async_bios, 0); 1615 atomic_set(&fs_info->nr_async_bios, 0);
1634 fs_info->sb = sb; 1616 fs_info->sb = sb;
1635 fs_info->max_extent = (u64)-1;
1636 fs_info->max_inline = 8192 * 1024; 1617 fs_info->max_inline = 8192 * 1024;
1637 fs_info->metadata_ratio = 0; 1618 fs_info->metadata_ratio = 0;
1638 1619
@@ -1673,7 +1654,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1673 insert_inode_hash(fs_info->btree_inode); 1654 insert_inode_hash(fs_info->btree_inode);
1674 1655
1675 spin_lock_init(&fs_info->block_group_cache_lock); 1656 spin_lock_init(&fs_info->block_group_cache_lock);
1676 fs_info->block_group_cache_tree.rb_node = NULL; 1657 fs_info->block_group_cache_tree = RB_ROOT;
1677 1658
1678 extent_io_tree_init(&fs_info->freed_extents[0], 1659 extent_io_tree_init(&fs_info->freed_extents[0],
1679 fs_info->btree_inode->i_mapping, GFP_NOFS); 1660 fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1767,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1767 min_t(u64, fs_devices->num_devices, 1748 min_t(u64, fs_devices->num_devices,
1768 fs_info->thread_pool_size), 1749 fs_info->thread_pool_size),
1769 &fs_info->generic_worker); 1750 &fs_info->generic_worker);
1770 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1771 fs_info->thread_pool_size,
1772 &fs_info->generic_worker);
1773 1751
1774 /* a higher idle thresh on the submit workers makes it much more 1752 /* a higher idle thresh on the submit workers makes it much more
1775 * likely that bios will be send down in a sane order to the 1753 * likely that bios will be send down in a sane order to the
@@ -1817,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1817 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1818 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1819 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1797 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1820 btrfs_start_workers(&fs_info->enospc_workers, 1);
1821 1798
1822 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1823 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1920,17 +1897,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1920 1897
1921 csum_root->track_dirty = 1; 1898 csum_root->track_dirty = 1;
1922 1899
1923 btrfs_read_block_groups(extent_root);
1924
1925 fs_info->generation = generation; 1900 fs_info->generation = generation;
1926 fs_info->last_trans_committed = generation; 1901 fs_info->last_trans_committed = generation;
1927 fs_info->data_alloc_profile = (u64)-1; 1902 fs_info->data_alloc_profile = (u64)-1;
1928 fs_info->metadata_alloc_profile = (u64)-1; 1903 fs_info->metadata_alloc_profile = (u64)-1;
1929 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905
1906 ret = btrfs_read_block_groups(extent_root);
1907 if (ret) {
1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1909 goto fail_block_groups;
1910 }
1911
1930 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1912 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1931 "btrfs-cleaner"); 1913 "btrfs-cleaner");
1932 if (IS_ERR(fs_info->cleaner_kthread)) 1914 if (IS_ERR(fs_info->cleaner_kthread))
1933 goto fail_csum_root; 1915 goto fail_block_groups;
1934 1916
1935 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1917 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1936 tree_root, 1918 tree_root,
@@ -1959,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1959 btrfs_level_size(tree_root, 1941 btrfs_level_size(tree_root,
1960 btrfs_super_log_root_level(disk_super)); 1942 btrfs_super_log_root_level(disk_super));
1961 1943
1962 log_tree_root = kzalloc(sizeof(struct btrfs_root), 1944 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1963 GFP_NOFS); 1945 if (!log_tree_root) {
1946 err = -ENOMEM;
1947 goto fail_trans_kthread;
1948 }
1964 1949
1965 __setup_root(nodesize, leafsize, sectorsize, stripesize, 1950 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1966 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); 1951 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1981,8 +1966,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1981 BUG_ON(ret); 1966 BUG_ON(ret);
1982 1967
1983 if (!(sb->s_flags & MS_RDONLY)) { 1968 if (!(sb->s_flags & MS_RDONLY)) {
1984 ret = btrfs_recover_relocation(tree_root); 1969 ret = btrfs_cleanup_fs_roots(fs_info);
1985 BUG_ON(ret); 1970 BUG_ON(ret);
1971
1972 ret = btrfs_recover_relocation(tree_root);
1973 if (ret < 0) {
1974 printk(KERN_WARNING
1975 "btrfs: failed to recover relocation\n");
1976 err = -EINVAL;
1977 goto fail_trans_kthread;
1978 }
1986 } 1979 }
1987 1980
1988 location.objectid = BTRFS_FS_TREE_OBJECTID; 1981 location.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -1992,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1992 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 1985 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1993 if (!fs_info->fs_root) 1986 if (!fs_info->fs_root)
1994 goto fail_trans_kthread; 1987 goto fail_trans_kthread;
1988 if (IS_ERR(fs_info->fs_root)) {
1989 err = PTR_ERR(fs_info->fs_root);
1990 goto fail_trans_kthread;
1991 }
1995 1992
1996 if (!(sb->s_flags & MS_RDONLY)) { 1993 if (!(sb->s_flags & MS_RDONLY)) {
1997 down_read(&fs_info->cleanup_work_sem); 1994 down_read(&fs_info->cleanup_work_sem);
@@ -2013,7 +2010,8 @@ fail_cleaner:
2013 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2010 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2014 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2011 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2015 2012
2016fail_csum_root: 2013fail_block_groups:
2014 btrfs_free_block_groups(fs_info);
2017 free_extent_buffer(csum_root->node); 2015 free_extent_buffer(csum_root->node);
2018 free_extent_buffer(csum_root->commit_root); 2016 free_extent_buffer(csum_root->commit_root);
2019fail_dev_root: 2017fail_dev_root:
@@ -2038,7 +2036,6 @@ fail_sb_buffer:
2038 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2039 btrfs_stop_workers(&fs_info->endio_write_workers); 2037 btrfs_stop_workers(&fs_info->endio_write_workers);
2040 btrfs_stop_workers(&fs_info->submit_workers); 2038 btrfs_stop_workers(&fs_info->submit_workers);
2041 btrfs_stop_workers(&fs_info->enospc_workers);
2042fail_iput: 2039fail_iput:
2043 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2044 iput(fs_info->btree_inode); 2041 iput(fs_info->btree_inode);
@@ -2403,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
2403 down_write(&root->fs_info->cleanup_work_sem); 2400 down_write(&root->fs_info->cleanup_work_sem);
2404 up_write(&root->fs_info->cleanup_work_sem); 2401 up_write(&root->fs_info->cleanup_work_sem);
2405 2402
2406 trans = btrfs_start_transaction(root, 1); 2403 trans = btrfs_join_transaction(root, 1);
2407 ret = btrfs_commit_transaction(trans, root); 2404 ret = btrfs_commit_transaction(trans, root);
2408 BUG_ON(ret); 2405 BUG_ON(ret);
2409 /* run commit again to drop the original snapshot */ 2406 /* run commit again to drop the original snapshot */
2410 trans = btrfs_start_transaction(root, 1); 2407 trans = btrfs_join_transaction(root, 1);
2411 btrfs_commit_transaction(trans, root); 2408 btrfs_commit_transaction(trans, root);
2412 ret = btrfs_write_and_wait_transaction(NULL, root); 2409 ret = btrfs_write_and_wait_transaction(NULL, root);
2413 BUG_ON(ret); 2410 BUG_ON(ret);
@@ -2424,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
2424 fs_info->closing = 1; 2421 fs_info->closing = 1;
2425 smp_mb(); 2422 smp_mb();
2426 2423
2427 kthread_stop(root->fs_info->transaction_kthread);
2428 kthread_stop(root->fs_info->cleaner_kthread);
2429
2430 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2431 ret = btrfs_commit_super(root); 2425 ret = btrfs_commit_super(root);
2432 if (ret) 2426 if (ret)
2433 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2427 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2434 } 2428 }
2435 2429
2430 kthread_stop(root->fs_info->transaction_kthread);
2431 kthread_stop(root->fs_info->cleaner_kthread);
2432
2436 fs_info->closing = 2; 2433 fs_info->closing = 2;
2437 smp_mb(); 2434 smp_mb();
2438 2435
@@ -2471,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
2471 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2472 btrfs_stop_workers(&fs_info->endio_write_workers); 2469 btrfs_stop_workers(&fs_info->endio_write_workers);
2473 btrfs_stop_workers(&fs_info->submit_workers); 2470 btrfs_stop_workers(&fs_info->submit_workers);
2474 btrfs_stop_workers(&fs_info->enospc_workers);
2475 2471
2476 btrfs_close_devices(fs_info->fs_devices); 2472 btrfs_close_devices(fs_info->fs_devices);
2477 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2473 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2492,7 +2488,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2492 int ret; 2488 int ret;
2493 struct inode *btree_inode = buf->first_page->mapping->host; 2489 struct inode *btree_inode = buf->first_page->mapping->host;
2494 2490
2495 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); 2491 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
2492 NULL);
2496 if (!ret) 2493 if (!ret)
2497 return ret; 2494 return ret;
2498 2495
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 87 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
89 int rw, struct bio *bio, int mirror_num, 89 int rw, struct bio *bio, int mirror_num,
90 unsigned long bio_flags, 90 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 91 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 92 extent_submit_bio_hook_t *submit_bio_done);
93 93
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 96int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
98int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
99 struct btrfs_fs_info *fs_info);
100int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 98int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101 struct btrfs_fs_info *fs_info); 99 struct btrfs_fs_info *fs_info);
102int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 100int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
95 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 95 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
96 key.offset = 0; 96 key.offset = 0;
97 97
98 inode = btrfs_iget(sb, &key, root); 98 inode = btrfs_iget(sb, &key, root, NULL);
99 if (IS_ERR(inode)) { 99 if (IS_ERR(inode)) {
100 err = PTR_ERR(inode); 100 err = PTR_ERR(inode);
101 goto fail; 101 goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 223
224 key.type = BTRFS_INODE_ITEM_KEY; 224 key.type = BTRFS_INODE_ITEM_KEY;
225 key.offset = 0; 225 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); 226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry)) 227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations; 228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry; 229 return dentry;
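
Both export.c call sites pass NULL for btrfs_iget's new out-parameter, which appears to report whether the inode had to be instantiated fresh rather than found in the inode cache. A caller that cares might, speculatively, use it like this:

        int was_new = 0;
        struct inode *inode;

        inode = btrfs_iget(sb, &key, root, &was_new);
        if (IS_ERR(inode))
                return PTR_ERR(inode);
        if (was_new) {
                /* first instantiation: inode was just read from disk */
        }
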
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 432a2da4641e..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h>
25#include "compat.h" 26#include "compat.h"
26#include "hash.h" 27#include "hash.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -34,10 +35,9 @@
34 35
35static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
36 struct btrfs_root *root, 37 struct btrfs_root *root,
37 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc);
38 int mark_free); 39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
39static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve, int sinfo);
40 u64 num_bytes, int reserve);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 42 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -60,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
60static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 62 u64 flags, int force);
63static int pin_down_bytes(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 struct btrfs_path *path,
66 u64 bytenr, u64 num_bytes,
67 int is_data, int reserved,
68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 63static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 64 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -90,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
90 84
91void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
92{ 86{
93 if (atomic_dec_and_test(&cache->count)) 87 if (atomic_dec_and_test(&cache->count)) {
88 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0);
94 kfree(cache); 91 kfree(cache);
92 }
95} 93}
96 94
97/* 95/*
@@ -318,7 +316,7 @@ static int caching_kthread(void *data)
318 316
319 exclude_super_stripes(extent_root, block_group); 317 exclude_super_stripes(extent_root, block_group);
320 spin_lock(&block_group->space_info->lock); 318 spin_lock(&block_group->space_info->lock);
321 block_group->space_info->bytes_super += block_group->bytes_super; 319 block_group->space_info->bytes_readonly += block_group->bytes_super;
322 spin_unlock(&block_group->space_info->lock); 320 spin_unlock(&block_group->space_info->lock);
323 321
324 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -506,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
506 struct list_head *head = &info->space_info; 504 struct list_head *head = &info->space_info;
507 struct btrfs_space_info *found; 505 struct btrfs_space_info *found;
508 506
507 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
508 BTRFS_BLOCK_GROUP_METADATA;
509
509 rcu_read_lock(); 510 rcu_read_lock();
510 list_for_each_entry_rcu(found, head, list) { 511 list_for_each_entry_rcu(found, head, list) {
511 if (found->flags == flags) { 512 if (found->flags == flags) {
@@ -609,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
609} 610}
610 611
611/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
719/*
612 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
613 * 721 *
614 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
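
Besides the new commit-root path, the relocated btrfs_lookup_extent_info keeps the lock-ordering dance from its old home: the delayed_refs spinlock cannot be held while sleeping on the head mutex, so on contention it pins the head with a reference, drops the spinlock, waits the current holder out, and restarts the whole lookup. The retry skeleton, extracted from the function above:

again:
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (head && !mutex_trylock(&head->mutex)) {
                atomic_inc(&head->node.refs);
                spin_unlock(&delayed_refs->lock);

                mutex_lock(&head->mutex);       /* sleep out the holder */
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref(&head->node);
                goto again;                     /* state may have changed */
        }
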
@@ -1588,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1588 u64 start, u64 len) 1696 u64 start, u64 len)
1589{ 1697{
1590 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1591 DISCARD_FL_BARRIER); 1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1592} 1700}
1593 1701
1594static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1870,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1870 return ret; 1978 return ret;
1871} 1979}
1872 1980
1873
1874/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1875static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1876 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1890,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1890 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1891 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1892 if (insert_reserved) { 1999 if (insert_reserved) {
1893 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1894 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1895
1896 ret = pin_down_bytes(trans, root, NULL,
1897 node->bytenr, node->num_bytes,
1898 head->is_data, 1, &must_clean);
1899 if (ret > 0)
1900 mark_free = 1;
1901
1902 if (must_clean) {
1903 clean_tree_block(NULL, root, must_clean);
1904 btrfs_tree_unlock(must_clean);
1905 free_extent_buffer(must_clean);
1906 }
1907 if (head->is_data) { 2002 if (head->is_data) {
1908 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1909 node->bytenr, 2004 node->bytenr,
1910 node->num_bytes); 2005 node->num_bytes);
1911 BUG_ON(ret); 2006 BUG_ON(ret);
1912 } 2007 }
1913 if (mark_free) {
1914 ret = btrfs_free_reserved_extent(root,
1915 node->bytenr,
1916 node->num_bytes);
1917 BUG_ON(ret);
1918 }
1919 } 2008 }
1920 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1921 return 0; 2010 return 0;
@@ -2346,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2346 ret = 0; 2435 ret = 0;
2347out: 2436out:
2348 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2349 return ret; 2440 return ret;
2350} 2441}
2351 2442
@@ -2659,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2659 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2660{ 2751{
2661 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2662 2761
2663 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2664 if (found) { 2763 if (found) {
2665 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2666 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2667 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2668 found->full = 0; 2768 found->full = 0;
2669 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2670 *space_info = found; 2770 *space_info = found;
@@ -2674,16 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2674 if (!found) 2774 if (!found)
2675 return -ENOMEM; 2775 return -ENOMEM;
2676 2776
2677 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2678 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2679 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2680 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2681 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2682 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2683 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2684 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2685 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2686 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2687 found->full = 0; 2791 found->full = 0;
2688 found->force_alloc = 0; 2792 found->force_alloc = 0;
2689 *space_info = found; 2793 *space_info = found;
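
update_space_info now accounts disk_used separately from bytes_used: profiles that keep two copies (DUP, RAID1, RAID10) consume two raw bytes per logical byte, so on-disk usage is scaled by a per-profile factor. The accounting in isolation:

        int factor;

        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                     BTRFS_BLOCK_GROUP_RAID10))
                factor = 2;                     /* two copies on disk */
        else
                factor = 1;

        found->bytes_used += bytes_used;            /* logical bytes */
        found->disk_used += bytes_used * factor;    /* raw device bytes */
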
@@ -2708,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2708 } 2812 }
2709} 2813}
2710 2814
2711static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2712{
2713 spin_lock(&cache->space_info->lock);
2714 spin_lock(&cache->lock);
2715 if (!cache->ro) {
2716 cache->space_info->bytes_readonly += cache->key.offset -
2717 btrfs_block_group_used(&cache->item);
2718 cache->ro = 1;
2719 }
2720 spin_unlock(&cache->lock);
2721 spin_unlock(&cache->space_info->lock);
2722}
2723
2724u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2725{ 2816{
2726 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2749,492 +2840,49 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2749 return flags; 2840 return flags;
2750} 2841}
2751 2842
2752static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2753{ 2844{
2754 struct btrfs_fs_info *info = root->fs_info; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2755 u64 alloc_profile; 2846 flags |= root->fs_info->avail_data_alloc_bits &
2756 2847 root->fs_info->data_alloc_profile;
2757 if (data) { 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2758 alloc_profile = info->avail_data_alloc_bits & 2849 flags |= root->fs_info->avail_system_alloc_bits &
2759 info->data_alloc_profile; 2850 root->fs_info->system_alloc_profile;
2760 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2761 } else if (root == root->fs_info->chunk_root) { 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2762 alloc_profile = info->avail_system_alloc_bits & 2853 root->fs_info->metadata_alloc_profile;
2763 info->system_alloc_profile; 2854 return btrfs_reduce_alloc_profile(root, flags);
2764 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2765 } else {
2766 alloc_profile = info->avail_metadata_alloc_bits &
2767 info->metadata_alloc_profile;
2768 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2769 }
2770
2771 return btrfs_reduce_alloc_profile(root, data);
2772} 2855}
2773 2856
2774void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2775{ 2858{
2776 u64 alloc_target; 2859 u64 flags;
2777
2778 alloc_target = btrfs_get_alloc_profile(root, 1);
2779 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2780 alloc_target);
2781}
2782
2783static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2784{
2785 u64 num_bytes;
2786 int level;
2787
2788 level = BTRFS_MAX_LEVEL - 2;
2789 /*
2790 * NOTE: these calculations are absolutely the worst possible case.
2791 * This assumes that _every_ item we insert will require a new leaf, and
2792 * that the tree has grown to its maximum level size.
2793 */
2794
2795 /*
2796 * for every item we insert we could insert both an extent item and a
2797 * extent ref item. Then for ever item we insert, we will need to cow
2798 * both the original leaf, plus the leaf to the left and right of it.
2799 *
2800 * Unless we are talking about the extent root, then we just want the
2801 * number of items * 2, since we just need the extent item plus its ref.
2802 */
2803 if (root == root->fs_info->extent_root)
2804 num_bytes = num_items * 2;
2805 else
2806 num_bytes = (num_items + (2 * num_items)) * 3;
2807
2808 /*
2809 * num_bytes is total number of leaves we could need times the leaf
2810 * size, and then for every leaf we could end up cow'ing 2 nodes per
2811 * level, down to the leaf level.
2812 */
2813 num_bytes = (num_bytes * root->leafsize) +
2814 (num_bytes * (level * 2)) * root->nodesize;
2815
2816 return num_bytes;
2817}
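
To make the worst case above concrete, here is a small userspace re-derivation of the same arithmetic; it assumes BTRFS_MAX_LEVEL is 8 (so level is 6) and the then-common 4 KiB leaf and node sizes:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

/* mirrors calculate_bytes_needed() for a tree other than the extent root */
static uint64_t worst_case_bytes(uint64_t num_items, uint64_t leafsize,
                                 uint64_t nodesize)
{
        int level = BTRFS_MAX_LEVEL - 2;
        /* extent item + ref item, each cowing the leaf and both neighbors */
        uint64_t leaves = (num_items + 2 * num_items) * 3;

        return leaves * leafsize + leaves * (level * 2) * nodesize;
}

int main(void)
{
        /* one item, 4 KiB leaves/nodes: 9 leaves -> 479232 bytes (~468 KiB) */
        printf("%llu\n", (unsigned long long)worst_case_bytes(1, 4096, 4096));
        return 0;
}

So a single inserted item can pin down close to half a megabyte of reservation under this accounting.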
2818
2819/*
2820 * Unreserve metadata space for delalloc. If we have fewer reserved credits than
2821 * we have extents, this function does nothing.
2822 */
2823int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2824 struct inode *inode, int num_items)
2825{
2826 struct btrfs_fs_info *info = root->fs_info;
2827 struct btrfs_space_info *meta_sinfo;
2828 u64 num_bytes;
2829 u64 alloc_target;
2830 bool bug = false;
2831
2832 /* get the space info for where the metadata will live */
2833 alloc_target = btrfs_get_alloc_profile(root, 0);
2834 meta_sinfo = __find_space_info(info, alloc_target);
2835
2836 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2837 num_items);
2838
2839 spin_lock(&meta_sinfo->lock);
2840 spin_lock(&BTRFS_I(inode)->accounting_lock);
2841 if (BTRFS_I(inode)->reserved_extents <=
2842 BTRFS_I(inode)->outstanding_extents) {
2843 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2844 spin_unlock(&meta_sinfo->lock);
2845 return 0;
2846 }
2847 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2848
2849 BTRFS_I(inode)->reserved_extents--;
2850 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2851
2852 if (meta_sinfo->bytes_delalloc < num_bytes) {
2853 bug = true;
2854 meta_sinfo->bytes_delalloc = 0;
2855 } else {
2856 meta_sinfo->bytes_delalloc -= num_bytes;
2857 }
2858 spin_unlock(&meta_sinfo->lock);
2859
2860 BUG_ON(bug);
2861
2862 return 0;
2863}
2864
2865static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2866{
2867 u64 thresh;
2868
2869 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2870 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2871 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2872 meta_sinfo->bytes_may_use;
2873 2860
2874 thresh = meta_sinfo->total_bytes - thresh; 2861 if (data)
2875 thresh *= 80; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2876 do_div(thresh, 100); 2863 else if (root == root->fs_info->chunk_root)
2877 if (thresh <= meta_sinfo->bytes_delalloc) 2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2878 meta_sinfo->force_delalloc = 1;
2879 else 2865 else
2880 meta_sinfo->force_delalloc = 0; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2881}
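
Read together, the hunks above compute one threshold: once delalloc reservations reach 80% of the space not already consumed by the other counters, force_delalloc flips on and subsequent reservers flush before proceeding. A minimal restatement of that test (plain C; the three parameters stand in for the summed counters above):

/* returns 1 when delalloc has eaten 80% of the remaining metadata space */
static int should_force_delalloc(unsigned long long total,
                                 unsigned long long other_used,
                                 unsigned long long delalloc_bytes)
{
        unsigned long long thresh = (total - other_used) * 80 / 100;

        return thresh <= delalloc_bytes;
}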
2882
2883struct async_flush {
2884 struct btrfs_root *root;
2885 struct btrfs_space_info *info;
2886 struct btrfs_work work;
2887};
2888
2889static noinline void flush_delalloc_async(struct btrfs_work *work)
2890{
2891 struct async_flush *async;
2892 struct btrfs_root *root;
2893 struct btrfs_space_info *info;
2894
2895 async = container_of(work, struct async_flush, work);
2896 root = async->root;
2897 info = async->info;
2898
2899 btrfs_start_delalloc_inodes(root, 0);
2900 wake_up(&info->flush_wait);
2901 btrfs_wait_ordered_extents(root, 0, 0);
2902
2903 spin_lock(&info->lock);
2904 info->flushing = 0;
2905 spin_unlock(&info->lock);
2906 wake_up(&info->flush_wait);
2907
2908 kfree(async);
2909}
2910
2911static void wait_on_flush(struct btrfs_space_info *info)
2912{
2913 DEFINE_WAIT(wait);
2914 u64 used;
2915
2916 while (1) {
2917 prepare_to_wait(&info->flush_wait, &wait,
2918 TASK_UNINTERRUPTIBLE);
2919 spin_lock(&info->lock);
2920 if (!info->flushing) {
2921 spin_unlock(&info->lock);
2922 break;
2923 }
2924
2925 used = info->bytes_used + info->bytes_reserved +
2926 info->bytes_pinned + info->bytes_readonly +
2927 info->bytes_super + info->bytes_root +
2928 info->bytes_may_use + info->bytes_delalloc;
2929 if (used < info->total_bytes) {
2930 spin_unlock(&info->lock);
2931 break;
2932 }
2933 spin_unlock(&info->lock);
2934 schedule();
2935 }
2936 finish_wait(&info->flush_wait, &wait);
2937}
2938
2939static void flush_delalloc(struct btrfs_root *root,
2940 struct btrfs_space_info *info)
2941{
2942 struct async_flush *async;
2943 bool wait = false;
2944
2945 spin_lock(&info->lock);
2946
2947 if (!info->flushing) {
2948 info->flushing = 1;
2949 init_waitqueue_head(&info->flush_wait);
2950 } else {
2951 wait = true;
2952 }
2953
2954 spin_unlock(&info->lock);
2955
2956 if (wait) {
2957 wait_on_flush(info);
2958 return;
2959 }
2960
2961 async = kzalloc(sizeof(*async), GFP_NOFS);
2962 if (!async)
2963 goto flush;
2964
2965 async->root = root;
2966 async->info = info;
2967 async->work.func = flush_delalloc_async;
2968
2969 btrfs_queue_worker(&root->fs_info->enospc_workers,
2970 &async->work);
2971 wait_on_flush(info);
2972 return;
2973
2974flush:
2975 btrfs_start_delalloc_inodes(root, 0);
2976 btrfs_wait_ordered_extents(root, 0, 0);
2977
2978 spin_lock(&info->lock);
2979 info->flushing = 0;
2980 spin_unlock(&info->lock);
2981 wake_up(&info->flush_wait);
2982}
2983
2984static int maybe_allocate_chunk(struct btrfs_root *root,
2985 struct btrfs_space_info *info)
2986{
2987 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2988 struct btrfs_trans_handle *trans;
2989 bool wait = false;
2990 int ret = 0;
2991 u64 min_metadata;
2992 u64 free_space;
2993
2994 free_space = btrfs_super_total_bytes(disk_super);
2995 /*
2996	 * we allow the metadata to grow to a max of either 10GB or 5% of the
2997 * space in the volume.
2998 */
2999 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3000 div64_u64(free_space * 5, 100));
3001 if (info->total_bytes >= min_metadata) {
3002 spin_unlock(&info->lock);
3003 return 0;
3004 }
3005
3006 if (info->full) {
3007 spin_unlock(&info->lock);
3008 return 0;
3009 }
3010
3011 if (!info->allocating_chunk) {
3012 info->force_alloc = 1;
3013 info->allocating_chunk = 1;
3014 init_waitqueue_head(&info->allocate_wait);
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
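
The growth cap in the deleted maybe_allocate_chunk() is min(10 GiB, 5% of the volume): a 100 GiB filesystem lets metadata grow to 5 GiB, while anything over 200 GiB hits the 10 GiB ceiling. As a standalone sketch:

#include <stdint.h>

/* mirrors the old min_metadata computation above */
static uint64_t metadata_growth_cap(uint64_t volume_bytes)
{
        uint64_t ten_gib = 10ULL * 1024 * 1024 * 1024;
        uint64_t five_pct = volume_bytes * 5 / 100;

        return five_pct < ten_gib ? five_pct : ten_gib;
}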
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113
3114 BTRFS_I(inode)->reserved_extents++;
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120 2867
3121 return 0; 2868 return get_alloc_profile(root, flags);
3122} 2869}
3123 2870
3124/* 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * operations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worst-case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. The only time we don't do this is if we're reserving space
3171 * inside of a transaction, in which case we will just return -ENOSPC and it is the
3172 * caller's responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
2885 u64 used;
3238 int ret = 0, committed = 0; 2886 int ret = 0, committed = 0;
3239 2887
3240 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
@@ -3247,10 +2895,11 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3247again: 2895again:
3248 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3249 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3250 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3251 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3252 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 2900 data_sinfo->bytes_may_use;
3253 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { 2901
2902 if (used + bytes > data_sinfo->total_bytes) {
3254 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3255 2904
3256 /* 2905 /*
@@ -3264,15 +2913,15 @@ again:
3264 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3265alloc: 2914alloc:
3266 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3267 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3268 if (!trans) 2917 if (IS_ERR(trans))
3269 return -ENOMEM; 2918 return PTR_ERR(trans);
3270 2919
3271 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3272 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3273 alloc_target, 0); 2922 alloc_target, 0);
3274 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3275 if (ret) 2924 if (ret < 0)
3276 return ret; 2925 return ret;
3277 2926
3278 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3287,25 +2936,26 @@ alloc:
3287 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3288 committed = 1; 2937 committed = 1;
3289 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3290 if (!trans) 2939 if (IS_ERR(trans))
3291 return -ENOMEM; 2940 return PTR_ERR(trans);
3292 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3293 if (ret) 2942 if (ret)
3294 return ret; 2943 return ret;
3295 goto again; 2944 goto again;
3296 } 2945 }
3297 2946
3298 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3299 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3300 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3301 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3302 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3303 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3304 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3305 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3306 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3307 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3308 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3309 return -ENOSPC; 2959 return -ENOSPC;
3310 } 2960 }
3311 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3316,12 +2966,13 @@ alloc:
3316} 2966}
3317 2967
3318/* 2968/*
3319 * if there was an error for whatever reason after calling 2969 * called when we are clearing a delalloc extent from the
3320 * btrfs_check_data_free_space, call this so we can clean up the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3321 */ 2972 */
3322void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3323 struct inode *inode, u64 bytes)
3324{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3325 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3326 2977
3327 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3334,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3334 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3335} 2986}
3336 2987
3337/* called when we are adding a delalloc extent to the inode's io_tree */
3338void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3339 u64 bytes)
3340{
3341 struct btrfs_space_info *data_sinfo;
3342
3343 /* get the space info for where this inode will be storing its data */
3344 data_sinfo = BTRFS_I(inode)->space_info;
3345
3346 /* make sure we have enough space to handle the data first */
3347 spin_lock(&data_sinfo->lock);
3348 data_sinfo->bytes_delalloc += bytes;
3349
3350 /*
3351 * we are adding a delalloc extent without calling
3352 * btrfs_check_data_free_space first. This happens on a weird
3353 * writepage condition, but shouldn't hurt our accounting
3354 */
3355 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3356 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3357 BTRFS_I(inode)->reserved_bytes = 0;
3358 } else {
3359 data_sinfo->bytes_may_use -= bytes;
3360 BTRFS_I(inode)->reserved_bytes -= bytes;
3361 }
3362
3363 spin_unlock(&data_sinfo->lock);
3364}
3365
3366/* called when we are clearing a delalloc extent from the inode's io_tree */
3367void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3368 u64 bytes)
3369{
3370 struct btrfs_space_info *info;
3371
3372 info = BTRFS_I(inode)->space_info;
3373
3374 spin_lock(&info->lock);
3375 info->bytes_delalloc -= bytes;
3376 spin_unlock(&info->lock);
3377}
3378
3379static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3380{ 2989{
3381 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3389,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3389 rcu_read_unlock(); 2998 rcu_read_unlock();
3390} 2999}
3391 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3392static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3393 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3394 u64 flags, int force) 3019 u64 flags, int force)
3395{ 3020{
3396 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3397 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3398 u64 thresh;
3399 int ret = 0; 3023 int ret = 0;
3400 3024
3401 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3418,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3418 goto out; 3042 goto out;
3419 } 3043 }
3420 3044
3421 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3422 thresh = div_factor(thresh, 8);
3423 if (!force &&
3424 (space_info->bytes_used + space_info->bytes_pinned +
3425 space_info->bytes_reserved + alloc_bytes) < thresh) {
3426 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3427 goto out; 3047 goto out;
3428 } 3048 }
@@ -3444,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3444 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3445 if (ret) 3065 if (ret)
3446 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3447 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3448 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3449out: 3071out:
@@ -3451,13 +3073,713 @@ out:
3451 return ret; 3073 return ret;
3452} 3074}
3453 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
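
When btrfs_start_one_delalloc_inode() returns 0, the loop above sleeps with exponential backoff: the pause doubles from one jiffy per iteration and is capped at HZ/10 (100 ms at HZ=1000, the value assumed below):

#include <stdio.h>

#define HZ 1000 /* illustrative tick rate */

int main(void)
{
        int pause = 1;
        int i;

        /* prints 1 2 4 8 16 32 64 100 100 100, matching the loop above */
        for (i = 0; i < 10; i++) {
                printf("%d ", pause);
                pause <<= 1;
                if (pause > HZ / 10)
                        pause = HZ / 10;
        }
        printf("\n");
        return 0;
}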
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
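
The grant rule above is proportional: a reservation of priority p (out of 10) succeeds outright when p >= 10, and otherwise only while its share of the visible free space stays within p tenths, since (unused + reserved) * p >= (num_bytes + reserved) * 10 rearranges to (num_bytes + reserved) / (unused + reserved) <= p / 10. For example, with priority 6, 1024 MiB unused and 100 MiB already reserved, a 500 MiB request passes (1124 * 6 = 6744 >= 600 * 10 = 6000). The predicate on its own, as a hedged restatement:

/* restates the priority check in reserve_metadata_bytes() above */
static int rsv_may_reserve(unsigned long long unused,
                           unsigned long long reserved,
                           unsigned long long num_bytes, int priority)
{
        if (priority >= 10)
                return 1;
        return (unused + reserved) * priority >=
               (num_bytes + reserved) * 10;
}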
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
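
block_rsv_release_bytes() shrinks the target size first and only then spills whatever the reservation now holds beyond that size, either into the destination rsv or back to the space_info. A tiny userspace model of that spill:

#include <stdint.h>

struct rsv { uint64_t size, reserved; };

/* shrink size by num_bytes; return the excess that must be spilled */
static uint64_t rsv_release(struct rsv *r, uint64_t num_bytes)
{
        if (num_bytes == (uint64_t)-1)
                num_bytes = r->size;
        r->size -= num_bytes;
        if (r->reserved >= r->size) {
                uint64_t excess = r->reserved - r->size;

                r->reserved = r->size;
                return excess;  /* dest rsv if given, else space_info */
        }
        return 0;
}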
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481	 * per-tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
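
Worked through with plausible numbers (4 KiB blocks, 4-byte crc32c checksums, 100 GiB of data and 3 GiB of metadata in use): the checksum term is (100 GiB / 4096) * 4 * 2 = 200 MiB, the 2% term adds 103 GiB / 50, roughly 2.06 GiB, and since three times that total exceeds meta_used, the cap takes over and the global reserve sizes itself at meta_used / 3 = 1 GiB. The same math, minus the locking and alignment:

#include <stdint.h>

/* mirrors calc_global_metadata_size(); assumes 4 KiB blocks and crc32c */
static uint64_t global_rsv_size(uint64_t data_used, uint64_t meta_used)
{
        int csum_size = 4, blocksize_bits = 12;
        uint64_t num_bytes = (data_used >> blocksize_bits) * csum_size * 2;

        num_bytes += (data_used + meta_used) / 50;
        if (num_bytes * 3 > meta_used)
                num_bytes = meta_used / 3;
        return num_bytes;       /* caller aligns up to a leaf multiple */
}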
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
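
With 4 KiB leaves and nodes and BTRFS_MAX_LEVEL = 8, this charges (4096 + 4096 * 7) * 3 = 96 KiB per item: one full root-to-leaf path of blocks, tripled as a safety factor. Standalone:

#include <stdint.h>

#define BTRFS_MAX_LEVEL 8

/* one leaf plus one node per remaining level, tripled, per item */
static uint64_t trans_metadata_size(uint64_t leafsize, uint64_t nodesize,
                                    int num_items)
{
        return (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items;
}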
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639	 * one for deleting the orphan item, one for updating the inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit
3646 * transaction and use space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
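
Despite the odd num_bytes >>= 3 spelling, the checksum reserve is simply one eighth of the data being written, so a 1 MiB delalloc range sets aside 128 KiB of metadata headroom for its csum items:

#include <stdint.h>

/* 1/8 of the data bytes, e.g. a 1 MiB write reserves 128 KiB for csums */
static uint64_t csum_metadata_size(uint64_t num_bytes)
{
        return num_bytes >> 3;
}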
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
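
The two wrappers above compose the data and metadata halves so callers get all-or-nothing semantics: if the metadata reservation fails, the data reservation is rolled back before the error is returned. A userspace model, with stubs standing in for the real btrfs helpers:

static int reserve_data(unsigned long long bytes) { (void)bytes; return 0; }
static int reserve_meta(unsigned long long bytes) { (void)bytes; return 0; }
static void release_data(unsigned long long bytes) { (void)bytes; }

/* mirrors btrfs_delalloc_reserve_space(): undo the data half on failure */
static int delalloc_reserve(unsigned long long bytes)
{
        int ret = reserve_data(bytes);

        if (ret)
                return ret;
        ret = reserve_meta(bytes);
        if (ret) {
                release_data(bytes);
                return ret;
        }
        return 0;
}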
3775
3454static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3455 struct btrfs_root *root, 3777 struct btrfs_root *root,
3456 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3457 int mark_free)
3458{ 3779{
3459 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3460 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3461 u64 total = num_bytes; 3783 u64 total = num_bytes;
3462 u64 old_val; 3784 u64 old_val;
3463 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3476,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3476 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3477 if (!cache) 3799 if (!cache)
3478 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3479 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3480 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3481 3809
@@ -3488,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3488 old_val += num_bytes; 3816 old_val += num_bytes;
3489 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3490 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3491 cache->space_info->bytes_used += num_bytes;
3492 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3493 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3494 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3495 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3496 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3497 } else { 3824 } else {
3498 old_val -= num_bytes; 3825 old_val -= num_bytes;
3499 cache->space_info->bytes_used -= num_bytes;
3500 if (cache->ro)
3501 cache->space_info->bytes_readonly += num_bytes;
3502 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3503 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3504 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3505 if (mark_free) {
3506 int ret;
3507
3508 ret = btrfs_discard_extent(root, bytenr,
3509 num_bytes);
3510 WARN_ON(ret);
3511 3833
3512 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3513 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3514 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3515 }
3516 } 3837 }
3517 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3518 total -= num_bytes; 3839 total -= num_bytes;
@@ -3536,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3536 return bytenr; 3857 return bytenr;
3537} 3858}
3538 3859
3539/* 3860static int pin_down_extent(struct btrfs_root *root,
3540 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3541 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3542int btrfs_pin_extent(struct btrfs_root *root,
3543 u64 bytenr, u64 num_bytes, int reserved)
3544{ 3863{
3545 struct btrfs_fs_info *fs_info = root->fs_info;
3546 struct btrfs_block_group_cache *cache;
3547
3548 cache = btrfs_lookup_block_group(fs_info, bytenr);
3549 BUG_ON(!cache);
3550
3551 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3552 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3553 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3559,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3559 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3560 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3561 3874
3562 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3879
3880/*
3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3563 3890
3564 set_extent_dirty(fs_info->pinned_extents, 3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3565 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3892
3893 btrfs_put_block_group(cache);
3566 return 0; 3894 return 0;
3567} 3895}
3568 3896
3569static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3570 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3571{ 3903{
3572 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3573 spin_lock(&cache->lock); 3905 if (sinfo) {
3574 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3575 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3576 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3577 } else { 3924 } else {
3578 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3579 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3580 } 3935 }
3581 spin_unlock(&cache->lock); 3936 return ret;
3582 spin_unlock(&cache->space_info->lock);
3583 return 0;
3584} 3937}
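
The reserve/read-only interaction above is easy to mis-read, so in truth-table form: with a space_info in hand, reserving on a read-only group fails with -EAGAIN while unreserving shifts the bytes into bytes_readonly; without one, any touch of a read-only group is -EAGAIN. A compact model:

/* 1 when update_reserved_bytes() would return -EAGAIN for this case */
static int would_eagain(int ro, int reserve, int have_sinfo)
{
        if (!ro)
                return 0;
        return have_sinfo ? reserve : 1;
}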
3585 3938
3586int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3611,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3611 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3612 3965
3613 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3614 return 0; 3969 return 0;
3615} 3970}
3616 3971
@@ -3637,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3637 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3638 } 3993 }
3639 3994
3995 start += len;
3996
3640 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3641 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3642 cache->pinned -= len; 3999 cache->pinned -= len;
3643 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3644 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3645 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3646
3647 start += len;
3648 } 4010 }
3649 4011
3650 if (cache) 4012 if (cache)
@@ -3657,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3657{ 4019{
3658 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3659 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3660 u64 start; 4024 u64 start;
3661 u64 end; 4025 u64 end;
4026 int idx;
3662 int ret; 4027 int ret;
3663 4028
3664 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3679,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3679 cond_resched(); 4044 cond_resched();
3680 } 4045 }
3681 4046
3682 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3684 4049 &fs_info->durable_block_rsv_list, list) {
3685static int pin_down_bytes(struct btrfs_trans_handle *trans,
3686 struct btrfs_root *root,
3687 struct btrfs_path *path,
3688 u64 bytenr, u64 num_bytes,
3689 int is_data, int reserved,
3690 struct extent_buffer **must_clean)
3691{
3692 int err = 0;
3693 struct extent_buffer *buf;
3694
3695 if (is_data)
3696 goto pinit;
3697
3698 /*
3699 * discard is sloooow, and so triggering discards on
3700 * individual btree blocks isn't a good plan. Just
3701 * pin everything in discard mode.
3702 */
3703 if (btrfs_test_opt(root, DISCARD))
3704 goto pinit;
3705 4050
3706 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 4051 idx = trans->transid & 0x1;
3707 if (!buf) 4052 if (block_rsv->freed[idx] > 0) {
3708 goto pinit; 4053 block_rsv_add_bytes(block_rsv,
4054 block_rsv->freed[idx], 0);
4055 block_rsv->freed[idx] = 0;
4056 }
4057 if (atomic_read(&block_rsv->usage) == 0) {
4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3709 4059
3710 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3711 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3712 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3713 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3714 */ 4064 }
3715 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3716 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3717 u64 header_owner = btrfs_header_owner(buf);
3718 u64 header_transid = btrfs_header_generation(buf);
3719 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3720 header_transid == trans->transid &&
3721 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3722 *must_clean = buf;
3723 return 1;
3724 } 4067 }
3725 btrfs_tree_unlock(buf);
3726 } 4068 }
3727 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3728pinit:
3729 if (path)
3730 btrfs_set_path_blocking(path);
3731 /* unlocks the pinned mutex */
3732 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3733 4070
3734 BUG_ON(err < 0);
3735 return 0; 4071 return 0;
3736} 4072}
3737 4073
@@ -3892,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3892 BUG_ON(ret); 4228 BUG_ON(ret);
3893 } 4229 }
3894 } else { 4230 } else {
3895 int mark_free = 0;
3896 struct extent_buffer *must_clean = NULL;
3897
3898 if (found_extent) { 4231 if (found_extent) {
3899 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3900 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3907,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3907 } 4240 }
3908 } 4241 }
3909 4242
3910 ret = pin_down_bytes(trans, root, path, bytenr,
3911 num_bytes, is_data, 0, &must_clean);
3912 if (ret > 0)
3913 mark_free = 1;
3914 BUG_ON(ret < 0);
3915 /*
3916 * it is going to be very rare for someone to be waiting
3917 * on the block we're freeing. del_items might need to
3918 * schedule, so rather than get fancy, just force it
3919 * to blocking here
3920 */
3921 if (must_clean)
3922 btrfs_set_lock_blocking(must_clean);
3923
3924 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3925 num_to_del); 4244 num_to_del);
3926 BUG_ON(ret); 4245 BUG_ON(ret);
3927 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3928 4247
3929 if (must_clean) {
3930 clean_tree_block(NULL, root, must_clean);
3931 btrfs_tree_unlock(must_clean);
3932 free_extent_buffer(must_clean);
3933 }
3934
3935 if (is_data) { 4248 if (is_data) {
3936 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3937 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3941,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3941 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3942 } 4255 }
3943 4256
3944 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3945 mark_free);
3946 BUG_ON(ret); 4258 BUG_ON(ret);
3947 } 4259 }
3948 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3950,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3950} 4262}
3951 4263
3952/* 4264/*
3953 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3954 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3955 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3956 * removes it from the tree. 4268 * removes it from the tree.
@@ -3962,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3962 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3963 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3964 struct rb_node *node; 4276 struct rb_node *node;
3965 int ret; 4277 int ret = 0;
3966 4278
3967 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3968 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4014,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4014 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4015 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4016 4328
4017 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4018 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4019 head->must_insert_reserved); 4331 ret = 1;
4020 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4021 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4022 return 0; 4335 return ret;
4023out: 4336out:
4024 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4025 return 0; 4338 return 0;
4026} 4339}
4027 4340
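As the hunk above shows, check_ref_cleanup() no longer runs the final delayed ref itself; it only detaches the head and returns 1 when must_insert_reserved is set, leaving the caller (now btrfs_free_tree_block) to decide whether to recycle or pin the space. A minimal userspace sketch of that contract, using simplified illustrative structures rather than the kernel's:

#include <stdio.h>

struct ref_head {
    int pending_refs;          /* refs still queued behind this head (assumed field) */
    int must_insert_reserved;  /* extent was allocated in this transaction */
};

/* Returns 1 when no other refs are queued and the extent was reserved by
 * this transaction, i.e. the caller may recycle the space immediately;
 * returns 0 when normal delayed-ref processing still has to run. */
static int check_ref_cleanup_model(struct ref_head *head)
{
    if (head->pending_refs > 0)
        return 0;
    return head->must_insert_reserved ? 1 : 0;
}

int main(void)
{
    struct ref_head h = { .pending_refs = 0, .must_insert_reserved = 1 };
    printf("caller may reuse extent: %d\n", check_ref_cleanup_model(&h));
    return 0;
}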
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
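The interesting case in btrfs_free_tree_block() above is a block that is allocated and freed within the same transaction and never written: its space goes straight back to the free-space cache, and the freed bytes top up the block reserve if it is below target. A compilable sketch of just that accounting decision (field names are assumptions for illustration):

#include <stdio.h>

struct block_rsv_model {
    unsigned long long size;      /* target size of the reserve */
    unsigned long long reserved;  /* bytes currently backing it */
};

/* If the reserve is below target, the freed bytes top it up and stay
 * accounted as reserved; otherwise they are handed back to the
 * space_info (modelled here by the return value). */
static unsigned long long refill_or_release(struct block_rsv_model *rsv,
                                            unsigned long long freed)
{
    if (rsv->reserved < rsv->size) {
        rsv->reserved += freed;
        return 0;
    }
    return freed;
}

int main(void)
{
    struct block_rsv_model rsv = { .size = 4096, .reserved = 0 };
    printf("released: %llu\n", refill_or_release(&rsv, 4096)); /* 0: absorbed */
    printf("released: %llu\n", refill_or_release(&rsv, 4096)); /* 4096: returned */
    return 0;
}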
4028int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4029 struct btrfs_root *root, 4424 struct btrfs_root *root,
4030 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4046,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4046 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4047 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4048 BUG_ON(ret); 4443 BUG_ON(ret);
4049 ret = check_ref_cleanup(trans, root, bytenr);
4050 BUG_ON(ret);
4051 } else { 4444 } else {
4052 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4053 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4057,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4057 return ret; 4450 return ret;
4058} 4451}
4059 4452
4060int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4061 struct btrfs_root *root,
4062 u64 bytenr, u32 blocksize,
4063 u64 parent, u64 root_objectid, int level)
4064{
4065 u64 used;
4066 spin_lock(&root->node_lock);
4067 used = btrfs_root_used(&root->root_item) - blocksize;
4068 btrfs_set_root_used(&root->root_item, used);
4069 spin_unlock(&root->node_lock);
4070
4071 return btrfs_free_extent(trans, root, bytenr, blocksize,
4072 parent, root_objectid, level, 0);
4073}
4074
4075static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4076{ 4454{
4077 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4124,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4124 return 0; 4502 return 0;
4125} 4503}
4126 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
4520
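Block groups are now kept on one list per redundancy profile, ordered RAID10, RAID1, DUP, RAID0, single, so find_free_extent() tries more redundant profiles first and falls through index by index; this ordering is also why lists 3 and 4 are forced read-only later in this patch when mirrored profiles exist. A standalone sketch of the mapping, with made-up flag values (the kernel's BTRFS_BLOCK_GROUP_* constants differ):

#include <stdio.h>

enum { IDX_RAID10, IDX_RAID1, IDX_DUP, IDX_RAID0, IDX_SINGLE, NR_RAID_TYPES };

/* Flag bits chosen for this sketch only. */
#define BG_RAID0  (1u << 0)
#define BG_RAID1  (1u << 1)
#define BG_DUP    (1u << 2)
#define BG_RAID10 (1u << 3)

static int block_group_index(unsigned int flags)
{
    if (flags & BG_RAID10) return IDX_RAID10;
    if (flags & BG_RAID1)  return IDX_RAID1;
    if (flags & BG_DUP)    return IDX_DUP;
    if (flags & BG_RAID0)  return IDX_RAID0;
    return IDX_SINGLE;
}

int main(void)
{
    /* most-redundant profiles sort first, so the allocator prefers them */
    printf("%d %d\n", block_group_index(BG_RAID10), block_group_index(0));
    return 0;
}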
4127enum btrfs_loop_type { 4521enum btrfs_loop_type {
4128 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4129 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4145,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4145 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4146 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4147 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4148 u64 exclude_start, u64 exclude_nr,
4149 int data) 4542 int data)
4150{ 4543{
4151 int ret = 0; 4544 int ret = 0;
@@ -4158,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4158 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4159 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4160 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4161 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4162 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4163 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4170,6 +4564,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4170 ins->offset = 0; 4564 ins->offset = 0;
4171 4565
4172 space_info = __find_space_info(root->fs_info, data); 4566 space_info = __find_space_info(root->fs_info, data);
4567 if (!space_info) {
4568 printk(KERN_ERR "No space info for %d\n", data);
4569 return -ENOSPC;
4570 }
4173 4571
4174 if (orig_root->ref_cows || empty_size) 4572 if (orig_root->ref_cows || empty_size)
4175 allowed_chunk_alloc = 1; 4573 allowed_chunk_alloc = 1;
@@ -4223,6 +4621,7 @@ ideal_cache:
4223 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4224 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4225 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4226 goto have_block_group; 4625 goto have_block_group;
4227 } 4626 }
4228 } else if (block_group) { 4627 } else if (block_group) {
@@ -4231,7 +4630,8 @@ ideal_cache:
4231 } 4630 }
4232search: 4631search:
4233 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4234 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4235 u64 offset; 4635 u64 offset;
4236 int cached; 4636 int cached;
4237 4637
@@ -4422,23 +4822,22 @@ checks:
4422 goto loop; 4822 goto loop;
4423 } 4823 }
4424 4824
4425 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4426 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4427 search_start < exclude_start + exclude_nr)) { 4827
4428 search_start = exclude_start + exclude_nr; 4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4429 4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4430 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4431 /*
4432 * if search_start is still in this block group
4433 * then we just re-search this block group
4434 */
4435 if (search_start >= block_group->key.objectid &&
4436 search_start < (block_group->key.objectid +
4437 block_group->key.offset))
4438 goto have_block_group;
4439 goto loop; 4837 goto loop;
4440 } 4838 }
4441 4839
4840 /* we are all good, let's return */
4442 ins->objectid = search_start; 4841 ins->objectid = search_start;
4443 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4444 4843
@@ -4446,18 +4845,18 @@ checks:
4446 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4447 search_start - offset); 4846 search_start - offset);
4448 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4449
4450 update_reserved_extents(block_group, num_bytes, 1);
4451
4452 /* we are all good, let's return */
4453 break; 4848 break;
4454loop: 4849loop:
4455 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4456 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4457 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4458 } 4854 }
4459 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4460 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4461 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4462 * them to make caching progress. Also 4861
4463 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4471,6 +4870,7 @@ loop:
4471 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4472 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4473 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4474 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4475 found_uncached_bg = false; 4875 found_uncached_bg = false;
4476 loop++; 4876 loop++;
@@ -4553,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4553 int dump_block_groups) 4953 int dump_block_groups)
4554{ 4954{
4555 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4556 4957
4557 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4558 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4559 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4560 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4561 info->bytes_super), 4962 info->bytes_readonly),
4562 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4563 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4564 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4565 "\n",
4566 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4567 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4568 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4569 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4570 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4571 (unsigned long long)info->bytes_root,
4572 (unsigned long long)info->bytes_super,
4573 (unsigned long long)info->bytes_reserved);
4574 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4575 4973
4576 if (!dump_block_groups) 4974 if (!dump_block_groups)
4577 return; 4975 return;
4578 4976
4579 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4580 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4581 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4582 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4583 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4589,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4589 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4590 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4591 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4592 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4593} 4994}
4594 4995
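dump_space_info() walks all the per-profile lists with an index counter and a goto again; under the assumption of BTRFS_NR_RAID_TYPES == 5, its control flow is equivalent to the plain loop below:

#include <stdio.h>

#define NR_RAID_TYPES 5

int main(void)
{
    static const char *names[NR_RAID_TYPES] = {
        "raid10", "raid1", "dup", "raid0", "single"
    };
    int index;

    /* equivalent to the label-and-goto form in dump_space_info() */
    for (index = 0; index < NR_RAID_TYPES; index++)
        printf("dumping block groups on list %d (%s)\n", index, names[index]);
    return 0;
}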
@@ -4614,9 +5015,8 @@ again:
4614 5015
4615 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4616 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4617 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4618 trans->alloc_exclude_start, 5019 ins, data);
4619 trans->alloc_exclude_nr, data);
4620 5020
4621 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4622 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4654,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4654 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4655 5055
4656 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4657 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4658 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4659 5059
4660 return ret; 5060 return ret;
@@ -4717,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4717 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4718 btrfs_free_path(path); 5118 btrfs_free_path(path);
4719 5119
4720 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4721 1, 0);
4722 if (ret) { 5121 if (ret) {
4723 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4724 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4778,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4778 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4779 btrfs_free_path(path); 5178 btrfs_free_path(path);
4780 5179
4781 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4782 1, 0);
4783 if (ret) { 5181 if (ret) {
4784 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4785 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4855,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4855 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4856 } 5254 }
4857 5255
4858 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4859 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4860 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4861 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4862 return ret; 5261 return ret;
4863} 5262}
4864 5263
4865/*
4866 * finds a free extent and does all the dirty work required for allocation
4867 * returns the key for the extent through ins, and a tree buffer for
4868 * the first block of the extent through buf.
4869 *
4870 * returns 0 if everything worked, non-zero otherwise.
4871 */
4872static int alloc_tree_block(struct btrfs_trans_handle *trans,
4873 struct btrfs_root *root,
4874 u64 num_bytes, u64 parent, u64 root_objectid,
4875 struct btrfs_disk_key *key, int level,
4876 u64 empty_size, u64 hint_byte, u64 search_end,
4877 struct btrfs_key *ins)
4878{
4879 int ret;
4880 u64 flags = 0;
4881
4882 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4883 empty_size, hint_byte, search_end,
4884 ins, 0);
4885 if (ret)
4886 return ret;
4887
4888 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4889 if (parent == 0)
4890 parent = ins->objectid;
4891 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4892 } else
4893 BUG_ON(parent > 0);
4894
4895 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4896 struct btrfs_delayed_extent_op *extent_op;
4897 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4898 BUG_ON(!extent_op);
4899 if (key)
4900 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4901 else
4902 memset(&extent_op->key, 0, sizeof(extent_op->key));
4903 extent_op->flags_to_set = flags;
4904 extent_op->update_key = 1;
4905 extent_op->update_flags = 1;
4906 extent_op->is_data = 0;
4907
4908 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4909 ins->offset, parent, root_objectid,
4910 level, BTRFS_ADD_DELAYED_EXTENT,
4911 extent_op);
4912 BUG_ON(ret);
4913 }
4914
4915 if (root_objectid == root->root_key.objectid) {
4916 u64 used;
4917 spin_lock(&root->node_lock);
4918 used = btrfs_root_used(&root->root_item) + num_bytes;
4919 btrfs_set_root_used(&root->root_item, used);
4920 spin_unlock(&root->node_lock);
4921 }
4922 return ret;
4923}
4924
4925struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4926 struct btrfs_root *root, 5265 struct btrfs_root *root,
4927 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4960,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4960 return buf; 5299 return buf;
4961} 5300}
4962 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
5323 printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 (unsigned long long)block_rsv->size, (unsigned long long)block_rsv->reserved,
5325 (unsigned long long)block_rsv->freed[0], (unsigned long long)block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
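use_block_rsv() consumes metadata space from the transaction's reserve and treats a shortfall in an already-sized reserve as hard -ENOSPC rather than silently borrowing elsewhere; only a size == 0 reserve falls back to reserving on demand. A small model of the consume step, assuming a two-field reserve:

#include <errno.h>
#include <stdio.h>

struct rsv { unsigned long long size, reserved; };

/* Consume 'bytes' from the reserve; -ENOSPC if it cannot cover them. */
static int rsv_use_bytes(struct rsv *r, unsigned long long bytes)
{
    if (r->reserved < bytes)
        return -ENOSPC;
    r->reserved -= bytes;
    return 0;
}

int main(void)
{
    struct rsv r = { .size = 8192, .reserved = 4096 };
    printf("%d\n", rsv_use_bytes(&r, 4096)); /* 0 */
    printf("%d\n", rsv_use_bytes(&r, 4096)); /* -ENOSPC */
    return 0;
}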
4963/* 5336/*
4964 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4965 * returns the tree buffer or NULL. 5341 * returns the tree buffer or NULL.
4966 */ 5342 */
4967struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4971,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4971 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4972{ 5348{
4973 struct btrfs_key ins; 5349 struct btrfs_key ins;
4974 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4975 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
4976 5354
4977 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5355
4978 key, level, empty_size, hint, (u64)-1, &ins); 5356 block_rsv = use_block_rsv(trans, root, blocksize);
5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4979 if (ret) { 5362 if (ret) {
4980 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4981 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4982 } 5365 }
4983 5366
4984 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4985 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
4986 return buf; 5397 return buf;
4987} 5398}
4988 5399
@@ -5205,6 +5616,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5205 next = btrfs_find_tree_block(root, bytenr, blocksize); 5616 next = btrfs_find_tree_block(root, bytenr, blocksize);
5206 if (!next) { 5617 if (!next) {
5207 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5618 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5619 if (!next)
5620 return -ENOMEM;
5208 reada = 1; 5621 reada = 1;
5209 } 5622 }
5210 btrfs_tree_lock(next); 5623 btrfs_tree_lock(next);
@@ -5305,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5305 struct btrfs_path *path, 5718 struct btrfs_path *path,
5306 struct walk_control *wc) 5719 struct walk_control *wc)
5307{ 5720{
5308 int ret = 0; 5721 int ret;
5309 int level = wc->level; 5722 int level = wc->level;
5310 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5311 u64 parent = 0; 5724 u64 parent = 0;
@@ -5383,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5383 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5384 } 5797 }
5385 5798
5386 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5387 root->root_key.objectid, level, 0);
5388 BUG_ON(ret);
5389out: 5800out:
5390 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5391 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5392 return ret; 5803 return 0;
5393} 5804}
5394 5805
5395static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5402,10 +5813,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5402 int ret; 5813 int ret;
5403 5814
5404 while (level >= 0) { 5815 while (level >= 0) {
5405 if (path->slots[level] >=
5406 btrfs_header_nritems(path->nodes[level]))
5407 break;
5408
5409 ret = walk_down_proc(trans, root, path, wc, lookup_info); 5816 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5410 if (ret > 0) 5817 if (ret > 0)
5411 break; 5818 break;
@@ -5413,11 +5820,16 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5413 if (level == 0) 5820 if (level == 0)
5414 break; 5821 break;
5415 5822
5823 if (path->slots[level] >=
5824 btrfs_header_nritems(path->nodes[level]))
5825 break;
5826
5416 ret = do_walk_down(trans, root, path, wc, &lookup_info); 5827 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5417 if (ret > 0) { 5828 if (ret > 0) {
5418 path->slots[level]++; 5829 path->slots[level]++;
5419 continue; 5830 continue;
5420 } 5831 } else if (ret < 0)
5832 return ret;
5421 level = wc->level; 5833 level = wc->level;
5422 } 5834 }
5423 return 0; 5835 return 0;
@@ -5466,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5466 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5467 * blocks are properly updated. 5879 * blocks are properly updated.
5468 */ 5880 */
5469int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5470{ 5883{
5471 struct btrfs_path *path; 5884 struct btrfs_path *path;
5472 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5484,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5484 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5485 BUG_ON(!wc); 5898 BUG_ON(!wc);
5486 5899
5487 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5488 5903
5489 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5490 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5572,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5572 } 5987 }
5573 5988
5574 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5575 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5576 trans->transaction->delayed_refs.flushing) {
5577 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5578 &root->root_key, 5992 &root->root_key,
5579 root_item); 5993 root_item);
5580 BUG_ON(ret); 5994 BUG_ON(ret);
5581 5995
5582 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5583 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5584 } else { 5998 if (block_rsv)
5585 unsigned long update; 5999 trans->block_rsv = block_rsv;
5586 update = trans->delayed_ref_updates;
5587 trans->delayed_ref_updates = 0;
5588 if (update)
5589 btrfs_run_delayed_refs(trans, tree_root,
5590 update);
5591 } 6000 }
5592 } 6001 }
5593 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5615,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5615 kfree(root); 6024 kfree(root);
5616 } 6025 }
5617out: 6026out:
5618 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5619 kfree(wc); 6028 kfree(wc);
5620 btrfs_free_path(path); 6029 btrfs_free_path(path);
5621 return err; 6030 return err;
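The rework above lets a snapshot deletion span many transactions: the walk periodically asks btrfs_should_end_transaction(), persists drop_progress via btrfs_update_root(), and reopens a handle with the same block reserve. A toy model of that checkpoint-and-restart loop (the fixed BATCH stands in for the kernel's heuristic):

#include <stdio.h>

/* Illustrative batch size; the kernel decides via btrfs_should_end_transaction(). */
#define BATCH 3

int main(void)
{
    int work = 10, done = 0, in_this_trans = 0;

    while (done < work) {
        done++;                  /* drop one subtree unit */
        if (++in_this_trans >= BATCH) {
            /* persist drop_progress, end the handle, start a new one */
            printf("checkpoint at %d, restarting transaction\n", done);
            in_this_trans = 0;
        }
    }
    printf("snapshot fully dropped after %d units\n", done);
    return 0;
}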
@@ -6561,6 +6970,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
6561 struct btrfs_key key; 6970 struct btrfs_key key;
6562 struct inode *inode = NULL; 6971 struct inode *inode = NULL;
6563 struct btrfs_file_extent_item *fi; 6972 struct btrfs_file_extent_item *fi;
6973 struct extent_state *cached_state = NULL;
6564 u64 num_bytes; 6974 u64 num_bytes;
6565 u64 skip_objectid = 0; 6975 u64 skip_objectid = 0;
6566 u32 nritems; 6976 u32 nritems;
@@ -6589,12 +6999,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
6589 } 6999 }
6590 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 7000 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6591 7001
6592 lock_extent(&BTRFS_I(inode)->io_tree, key.offset, 7002 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
6593 key.offset + num_bytes - 1, GFP_NOFS); 7003 key.offset + num_bytes - 1, 0, &cached_state,
7004 GFP_NOFS);
6594 btrfs_drop_extent_cache(inode, key.offset, 7005 btrfs_drop_extent_cache(inode, key.offset,
6595 key.offset + num_bytes - 1, 1); 7006 key.offset + num_bytes - 1, 1);
6596 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, 7007 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
6597 key.offset + num_bytes - 1, GFP_NOFS); 7008 key.offset + num_bytes - 1, &cached_state,
7009 GFP_NOFS);
6598 cond_resched(); 7010 cond_resched();
6599 } 7011 }
6600 iput(inode); 7012 iput(inode);
@@ -7208,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7208 return flags; 7620 return flags;
7209} 7621}
7210 7622
7211static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7212 struct btrfs_block_group_cache *shrink_block_group,
7213 int force)
7214{ 7624{
7215 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7216 u64 new_alloc_flags; 7626 u64 num_bytes;
7217 u64 calc; 7627 int ret = -ENOSPC;
7218 7628
7219 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7220 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7221 shrink_block_group->reserved > 0) {
7222 spin_unlock(&shrink_block_group->lock);
7223 7631
7224 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7225 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7226 7650
7227 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7228 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7229 if (new_alloc_flags != shrink_block_group->flags) {
7230 calc =
7231 btrfs_block_group_used(&shrink_block_group->item);
7232 } else {
7233 calc = shrink_block_group->key.offset;
7234 }
7235 spin_unlock(&shrink_block_group->lock);
7236 7653
7237 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7238 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7239 7658
7240 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7241 } else
7242 spin_unlock(&shrink_block_group->lock);
7243 return 0;
7244}
7245 7660
7661 trans = btrfs_join_transaction(root, 1);
7662 BUG_ON(IS_ERR(trans));
7246 7663
7247int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7664 alloc_flags = update_block_group_flags(root, cache->flags);
7248 struct btrfs_block_group_cache *group) 7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7249 7667
7668 ret = set_block_group_ro(cache);
7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7250{ 7683{
7251 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7252 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7253 return 0; 7697 return 0;
7254} 7698}
7255 7699
@@ -7366,7 +7810,6 @@ static int find_first_block_group(struct btrfs_root *root,
7366 } 7810 }
7367 path->slots[0]++; 7811 path->slots[0]++;
7368 } 7812 }
7369 ret = -ENOENT;
7370out: 7813out:
7371 return ret; 7814 return ret;
7372} 7815}
@@ -7417,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7417 */ 7860 */
7418 synchronize_rcu(); 7861 synchronize_rcu();
7419 7862
7863 release_global_block_rsv(info);
7864
7420 while(!list_empty(&info->space_info)) { 7865 while(!list_empty(&info->space_info)) {
7421 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7422 struct btrfs_space_info, 7867 struct btrfs_space_info,
7423 list); 7868 list);
7424 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7425 list_del(&space_info->list); 7874 list_del(&space_info->list);
7426 kfree(space_info); 7875 kfree(space_info);
7427 } 7876 }
7428 return 0; 7877 return 0;
7429} 7878}
7430 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
7431int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7432{ 7891{
7433 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7449,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7449 7908
7450 while (1) { 7909 while (1) {
7451 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7452 if (ret > 0) { 7911 if (ret > 0)
7453 ret = 0; 7912 break;
7454 goto error;
7455 }
7456 if (ret != 0) 7913 if (ret != 0)
7457 goto error; 7914 goto error;
7458 7915
@@ -7461,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7461 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7462 if (!cache) { 7919 if (!cache) {
7463 ret = -ENOMEM; 7920 ret = -ENOMEM;
7464 break; 7921 goto error;
7465 } 7922 }
7466 7923
7467 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7518,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7518 BUG_ON(ret); 7975 BUG_ON(ret);
7519 cache->space_info = space_info; 7976 cache->space_info = space_info;
7520 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7521 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7522 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7523 7980
7524 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7525 list_add_tail(&cache->list, &space_info->block_groups);
7526 up_write(&space_info->groups_sem);
7527 7982
7528 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7529 BUG_ON(ret); 7984 BUG_ON(ret);
7530 7985
7531 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7532 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7533 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7534 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
7998 * avoid allocating from un-mirrored block groups if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7535 ret = 0; 8008 ret = 0;
7536error: 8009error:
7537 btrfs_free_path(path); 8010 btrfs_free_path(path);
@@ -7592,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7592 BUG_ON(ret); 8065 BUG_ON(ret);
7593 8066
7594 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7595 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7596 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7597 8070
7598 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7599 list_add_tail(&cache->list, &cache->space_info->block_groups);
7600 up_write(&cache->space_info->groups_sem);
7601 8072
7602 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7603 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96577e8bf9fd..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -104,8 +103,8 @@ void extent_io_exit(void)
104void extent_io_tree_init(struct extent_io_tree *tree, 103void extent_io_tree_init(struct extent_io_tree *tree,
105 struct address_space *mapping, gfp_t mask) 104 struct address_space *mapping, gfp_t mask)
106{ 105{
107 tree->state.rb_node = NULL; 106 tree->state = RB_ROOT;
108 tree->buffer.rb_node = NULL; 107 tree->buffer = RB_ROOT;
109 tree->ops = NULL; 108 tree->ops = NULL;
110 tree->dirty_bytes = 0; 109 tree->dirty_bytes = 0;
111 spin_lock_init(&tree->lock); 110 spin_lock_init(&tree->lock);
@@ -136,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
136 return state; 135 return state;
137} 136}
138 137
139static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
140{ 139{
141 if (!state) 140 if (!state)
142 return; 141 return;
@@ -336,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
336} 335}
337 336
338static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
339 struct extent_state *state, 338 struct extent_state *state, int *bits)
340 unsigned long bits)
341{ 339{
342 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
343 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
344 state->start, state->end, 342 state, bits);
345 state->state, bits);
346 } 343 }
347 344
348 return 0; 345 return 0;
349} 346}
350 347
351static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
352 struct extent_state *state, 349 struct extent_state *state, int *bits)
353 unsigned long bits)
354{ 350{
355 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -368,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
368 */ 364 */
369static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
370 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
371 int bits) 367 int *bits)
372{ 368{
373 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
374 int ret; 371 int ret;
375 372
376 if (end < start) { 373 if (end < start) {
@@ -385,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
385 if (ret) 382 if (ret)
386 return ret; 383 return ret;
387 384
388 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
390 state->state |= bits; 387 state->state |= bits_to_set;
391 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
392 if (node) { 389 if (node) {
393 struct extent_state *found; 390 struct extent_state *found;
@@ -457,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
457 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
458 */ 455 */
459static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
460 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
461 int delete) 458 int *bits, int wake)
462{ 459{
463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
464 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
465 462
466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
467 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
468 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
469 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -472,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
472 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
473 if (wake) 470 if (wake)
474 wake_up(&state->wq); 471 wake_up(&state->wq);
475 if (delete || state->state == 0) { 472 if (state->state == 0) {
476 if (state->tree) { 473 if (state->tree) {
477 clear_state_cb(tree, state, state->state);
478 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
479 state->tree = NULL; 475 state->tree = NULL;
480 free_extent_state(state); 476 free_extent_state(state);
@@ -513,7 +509,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
513 u64 last_end; 509 u64 last_end;
514 int err; 510 int err;
515 int set = 0; 511 int set = 0;
512 int clear = 0;
513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
516 517
518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
519 clear = 1;
517again: 520again:
518 if (!prealloc && (mask & __GFP_WAIT)) { 521 if (!prealloc && (mask & __GFP_WAIT)) {
519 prealloc = alloc_extent_state(mask); 522 prealloc = alloc_extent_state(mask);
@@ -524,14 +527,20 @@ again:
524 spin_lock(&tree->lock); 527 spin_lock(&tree->lock);
525 if (cached_state) { 528 if (cached_state) {
526 cached = *cached_state; 529 cached = *cached_state;
527 *cached_state = NULL; 530
528 cached_state = NULL; 531 if (clear) {
532 *cached_state = NULL;
533 cached_state = NULL;
534 }
535
529 if (cached && cached->tree && cached->start == start) { 536 if (cached && cached->tree && cached->start == start) {
530 atomic_dec(&cached->refs); 537 if (clear)
538 atomic_dec(&cached->refs);
531 state = cached; 539 state = cached;
532 goto hit_next; 540 goto hit_next;
533 } 541 }
534 free_extent_state(cached); 542 if (clear)
543 free_extent_state(cached);
535 } 544 }
536 /* 545 /*
537 * this search will find the extents that end after 546 * this search will find the extents that end after
@@ -572,8 +581,7 @@ hit_next:
572 if (err) 581 if (err)
573 goto out; 582 goto out;
574 if (state->end <= end) { 583 if (state->end <= end) {
575 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
576 delete);
577 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
578 goto out; 586 goto out;
579 start = last_end + 1; 587 start = last_end + 1;
@@ -594,7 +602,7 @@ hit_next:
594 if (wake) 602 if (wake)
595 wake_up(&state->wq); 603 wake_up(&state->wq);
596 604
597 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
598 606
599 prealloc = NULL; 607 prealloc = NULL;
600 goto out; 608 goto out;
@@ -605,7 +613,7 @@ hit_next:
605 else 613 else
606 next_node = NULL; 614 next_node = NULL;
607 615
608 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
609 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
610 goto out; 618 goto out;
611 start = last_end + 1; 619 start = last_end + 1;
@@ -698,19 +706,19 @@ out:
698 706
699static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
700 struct extent_state *state, 708 struct extent_state *state,
701 int bits) 709 int *bits)
702{ 710{
703 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
704 713
705 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
706 if (ret) 715 if (ret)
707 return ret; 716 return ret;
708 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
709 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
710 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
711 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
712 } 720 }
713 state->state |= bits; 721 state->state |= bits_to_set;
714 722
715 return 0; 723 return 0;
716} 724}
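Passing bits as int * lets the set/clear hooks observe and strip control flags such as EXTENT_FIRST_DELALLOC, while masking with ~EXTENT_CTLBITS keeps those flags out of the stored state. A sketch with assumed bit values (the real ones live in extent_io.h):

#include <stdio.h>

/* Values are illustrative; the kernel defines these in extent_io.h. */
#define EXTENT_DIRTY           (1u << 0)
#define EXTENT_FIRST_DELALLOC  (1u << 9)
#define EXTENT_CTLBITS         (EXTENT_FIRST_DELALLOC)

static unsigned int set_state_bits_model(unsigned int state, int *bits)
{
    unsigned int bits_to_set = (unsigned int)*bits & ~EXTENT_CTLBITS;

    /* a hook could clear EXTENT_FIRST_DELALLOC in *bits at this point */
    return state | bits_to_set;
}

int main(void)
{
    int bits = EXTENT_DIRTY | EXTENT_FIRST_DELALLOC;
    unsigned int state = set_state_bits_model(0, &bits);

    printf("stored state: %#x (ctl bit filtered out)\n", state);
    return 0;
}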
@@ -737,10 +745,9 @@ static void cache_state(struct extent_state *state,
737 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
738 */ 746 */
739 747
740static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
741 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
742 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
743 gfp_t mask)
744{ 751{
745 struct extent_state *state; 752 struct extent_state *state;
746 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -749,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 u64 last_start; 756 u64 last_start;
750 u64 last_end; 757 u64 last_end;
751 758
759 bits |= EXTENT_FIRST_DELALLOC;
752again: 760again:
753 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
754 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -770,7 +778,7 @@ again:
770 */ 778 */
771 node = tree_search(tree, start); 779 node = tree_search(tree, start);
772 if (!node) { 780 if (!node) {
773 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
774 prealloc = NULL; 782 prealloc = NULL;
775 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
776 goto out; 784 goto out;
@@ -794,7 +802,7 @@ hit_next:
794 goto out; 802 goto out;
795 } 803 }
796 804
797 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
798 if (err) 806 if (err)
799 goto out; 807 goto out;
800 808
@@ -844,7 +852,7 @@ hit_next:
844 if (err) 852 if (err)
845 goto out; 853 goto out;
846 if (state->end <= end) { 854 if (state->end <= end) {
847 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
848 if (err) 856 if (err)
849 goto out; 857 goto out;
850 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -869,7 +877,7 @@ hit_next:
869 else 877 else
870 this_end = last_start - 1; 878 this_end = last_start - 1;
871 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
872 bits); 880 &bits);
873 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
874 if (err) { 882 if (err) {
875 prealloc = NULL; 883 prealloc = NULL;
@@ -895,7 +903,7 @@ hit_next:
895 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
897 905
898 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
899 if (err) { 907 if (err) {
900 prealloc = NULL; 908 prealloc = NULL;
901 goto out; 909 goto out;
@@ -946,11 +954,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
946} 954}
947 955
948int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 956int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949 gfp_t mask) 957 struct extent_state **cached_state, gfp_t mask)
950{ 958{
951 return set_extent_bit(tree, start, end, 959 return set_extent_bit(tree, start, end,
952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 960 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
953 0, NULL, NULL, mask); 961 0, NULL, cached_state, mask);
954} 962}
955 963
956int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 964int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -958,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
958{ 966{
959 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
960 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
962 NULL, mask);
963} 970}
964 971
965int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +991,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
984} 991}
985 992
986static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 993static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
987 u64 end, gfp_t mask) 994 u64 end, struct extent_state **cached_state,
995 gfp_t mask)
988{ 996{
989 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 997 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
990 NULL, mask); 998 cached_state, mask);
991} 999}
992 1000
993int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1001int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1179,8 @@ out:
1171 * 1 is returned if we find something, 0 if nothing was in the tree 1179 * 1 is returned if we find something, 0 if nothing was in the tree
1172 */ 1180 */
1173static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1181static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1174 u64 *start, u64 *end, u64 max_bytes) 1182 u64 *start, u64 *end, u64 max_bytes,
1183 struct extent_state **cached_state)
1175{ 1184{
1176 struct rb_node *node; 1185 struct rb_node *node;
1177 struct extent_state *state; 1186 struct extent_state *state;
@@ -1203,8 +1212,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1203 *end = state->end; 1212 *end = state->end;
1204 goto out; 1213 goto out;
1205 } 1214 }
1206 if (!found) 1215 if (!found) {
1207 *start = state->start; 1216 *start = state->start;
1217 *cached_state = state;
1218 atomic_inc(&state->refs);
1219 }
1208 found++; 1220 found++;
1209 *end = state->end; 1221 *end = state->end;
1210 cur_start = state->end + 1; 1222 cur_start = state->end + 1;
@@ -1336,10 +1348,11 @@ again:
1336 delalloc_start = *start; 1348 delalloc_start = *start;
1337 delalloc_end = 0; 1349 delalloc_end = 0;
1338 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1350 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1339 max_bytes); 1351 max_bytes, &cached_state);
1340 if (!found || delalloc_end <= *start) { 1352 if (!found || delalloc_end <= *start) {
1341 *start = delalloc_start; 1353 *start = delalloc_start;
1342 *end = delalloc_end; 1354 *end = delalloc_end;
1355 free_extent_state(cached_state);
1343 return found; 1356 return found;
1344 } 1357 }
1345 1358
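find_delalloc_range() now hands back the first extent_state it touched with an extra reference, so the caller can lock the range via the cached state without re-searching the tree, and must drop that reference when done. A refcount sketch of the ownership handoff (not the kernel API):

#include <stdio.h>

struct state { int refs; };

static void get_state(struct state *s)  { s->refs++; }
static void put_state(struct state *s)  { if (s && --s->refs == 0) printf("state freed\n"); }

int main(void)
{
    struct state s = { .refs = 1 };   /* ref held by the tree */
    struct state *cached = &s;

    get_state(cached);   /* models find_delalloc_range's atomic_inc(&state->refs) */

    /* ... caller locks the range using 'cached' ... */

    put_state(cached);   /* caller drops its ref */
    put_state(&s);       /* tree's ref dropped on teardown */
    return 0;
}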
@@ -1421,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1421 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1422 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1423 1436
1424 if (op & EXTENT_CLEAR_ACCOUNTING)
1425 clear_bits |= EXTENT_DO_ACCOUNTING;
1426
1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1428 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1429 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1722,7 +1732,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1722 } 1732 }
1723 1733
1724 if (!uptodate) { 1734 if (!uptodate) {
1725 clear_extent_uptodate(tree, start, end, GFP_NOFS); 1735 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1726 ClearPageUptodate(page); 1736 ClearPageUptodate(page);
1727 SetPageError(page); 1737 SetPageError(page);
1728 } 1738 }
@@ -1750,7 +1760,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1750static void end_bio_extent_readpage(struct bio *bio, int err) 1760static void end_bio_extent_readpage(struct bio *bio, int err)
1751{ 1761{
1752 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1762 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1753 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1763 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1764 struct bio_vec *bvec = bio->bi_io_vec;
1754 struct extent_io_tree *tree; 1765 struct extent_io_tree *tree;
1755 u64 start; 1766 u64 start;
1756 u64 end; 1767 u64 end;
@@ -1773,7 +1784,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1773 else 1784 else
1774 whole_page = 0; 1785 whole_page = 0;
1775 1786
1776 if (--bvec >= bio->bi_io_vec) 1787 if (++bvec <= bvec_end)
1777 prefetchw(&bvec->bv_page->flags); 1788 prefetchw(&bvec->bv_page->flags);
1778 1789
1779 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1790 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1829,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1818 } 1829 }
1819 check_page_locked(tree, page); 1830 check_page_locked(tree, page);
1820 } 1831 }
1821 } while (bvec >= bio->bi_io_vec); 1832 } while (bvec <= bvec_end);
1822 1833
1823 bio_put(bio); 1834 bio_put(bio);
1824} 1835}
@@ -1901,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901 1912
1902 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1903 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1904 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1905 else 1916 else
1906 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1907 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2005,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2005 sector_t sector; 2016 sector_t sector;
2006 struct extent_map *em; 2017 struct extent_map *em;
2007 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2008 int ret; 2020 int ret;
2009 int nr = 0; 2021 int nr = 0;
2010 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2016,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2016 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2017 2029
2018 end = page_end; 2030 end = page_end;
2019 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2020 2040
2021 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2022 char *userpage; 2042 char *userpage;
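
__extent_read_full_page() now refuses to read a range that still has an ordered (in-flight dirty) extent over it: lock, look up, and if an ordered extent exists, drop the lock, wait out the ordered IO, and retry. The lock has to be released before waiting because the ordered-extent completion path takes the same range lock, so holding it across the wait could deadlock. The loop above, restated with comments:

        while (1) {
                lock_extent(tree, start, end, GFP_NOFS);
                ordered = btrfs_lookup_ordered_extent(inode, start);
                if (!ordered)
                        break;          /* range is clean; keep the lock */
                /* completion needs the range lock we hold, so drop it first */
                unlock_extent(tree, start, end, GFP_NOFS);
                btrfs_start_ordered_extent(inode, ordered, 1);  /* 1 == wait */
                btrfs_put_ordered_extent(ordered);
        }
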
@@ -2574,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2574 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2594 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2575 }; 2595 };
2576 struct writeback_control wbc_writepages = { 2596 struct writeback_control wbc_writepages = {
2577 .bdi = wbc->bdi,
2578 .sync_mode = wbc->sync_mode, 2597 .sync_mode = wbc->sync_mode,
2579 .older_than_this = NULL, 2598 .older_than_this = NULL,
2580 .nr_to_write = 64, 2599 .nr_to_write = 64,
@@ -2608,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2608 .sync_io = mode == WB_SYNC_ALL, 2627 .sync_io = mode == WB_SYNC_ALL,
2609 }; 2628 };
2610 struct writeback_control wbc_writepages = { 2629 struct writeback_control wbc_writepages = {
2611 .bdi = inode->i_mapping->backing_dev_info,
2612 .sync_mode = mode, 2630 .sync_mode = mode,
2613 .older_than_this = NULL, 2631 .older_than_this = NULL,
2614 .nr_to_write = nr_pages * 2, 2632 .nr_to_write = nr_pages * 2,
@@ -2663,33 +2681,20 @@ int extent_readpages(struct extent_io_tree *tree,
2663{ 2681{
2664 struct bio *bio = NULL; 2682 struct bio *bio = NULL;
2665 unsigned page_idx; 2683 unsigned page_idx;
2666 struct pagevec pvec;
2667 unsigned long bio_flags = 0; 2684 unsigned long bio_flags = 0;
2668 2685
2669 pagevec_init(&pvec, 0);
2670 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2686 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2671 struct page *page = list_entry(pages->prev, struct page, lru); 2687 struct page *page = list_entry(pages->prev, struct page, lru);
2672 2688
2673 prefetchw(&page->flags); 2689 prefetchw(&page->flags);
2674 list_del(&page->lru); 2690 list_del(&page->lru);
2675 /* 2691 if (!add_to_page_cache_lru(page, mapping,
2676 * what we want to do here is call add_to_page_cache_lru,
2677 * but that isn't exported, so we reproduce it here
2678 */
2679 if (!add_to_page_cache(page, mapping,
2680 page->index, GFP_KERNEL)) { 2692 page->index, GFP_KERNEL)) {
2681
2682 /* open coding of lru_cache_add, also not exported */
2683 page_cache_get(page);
2684 if (!pagevec_add(&pvec, page))
2685 __pagevec_lru_add_file(&pvec);
2686 __extent_read_full_page(tree, page, get_extent, 2693 __extent_read_full_page(tree, page, get_extent,
2687 &bio, 0, &bio_flags); 2694 &bio, 0, &bio_flags);
2688 } 2695 }
2689 page_cache_release(page); 2696 page_cache_release(page);
2690 } 2697 }
2691 if (pagevec_count(&pvec))
2692 __pagevec_lru_add_file(&pvec);
2693 BUG_ON(!list_empty(pages)); 2698 BUG_ON(!list_empty(pages));
2694 if (bio) 2699 if (bio)
2695 submit_one_bio(READ, bio, 0, bio_flags); 2700 submit_one_bio(READ, bio, 0, bio_flags);
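
With add_to_page_cache_lru() exported to modules, the open-coded pagevec batching in extent_readpages() disappears: one call inserts the page into both the page cache and the LRU. The surviving loop, annotated:

        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                struct page *page = list_entry(pages->prev, struct page, lru);

                prefetchw(&page->flags);
                list_del(&page->lru);
                /* radix-tree insert + LRU add in one exported call */
                if (!add_to_page_cache_lru(page, mapping,
                                           page->index, GFP_KERNEL))
                        __extent_read_full_page(tree, page, get_extent,
                                                &bio, 0, &bio_flags);
                /* drop the readahead reference in either case */
                page_cache_release(page);
        }
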
@@ -2704,6 +2709,7 @@ int extent_readpages(struct extent_io_tree *tree,
2704int extent_invalidatepage(struct extent_io_tree *tree, 2709int extent_invalidatepage(struct extent_io_tree *tree,
2705 struct page *page, unsigned long offset) 2710 struct page *page, unsigned long offset)
2706{ 2711{
2712 struct extent_state *cached_state = NULL;
2707 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2713 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2708 u64 end = start + PAGE_CACHE_SIZE - 1; 2714 u64 end = start + PAGE_CACHE_SIZE - 1;
2709 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2715 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2718,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2712 if (start > end) 2718 if (start > end)
2713 return 0; 2719 return 0;
2714 2720
2715 lock_extent(tree, start, end, GFP_NOFS); 2721 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2716 wait_on_page_writeback(page); 2722 wait_on_page_writeback(page);
2717 clear_extent_bit(tree, start, end, 2723 clear_extent_bit(tree, start, end,
2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2724 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING, 2725 EXTENT_DO_ACCOUNTING,
2720 1, 1, NULL, GFP_NOFS); 2726 1, 1, &cached_state, GFP_NOFS);
2721 return 0; 2727 return 0;
2722} 2728}
2723 2729
@@ -2920,16 +2926,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2920 get_extent_t *get_extent) 2926 get_extent_t *get_extent)
2921{ 2927{
2922 struct inode *inode = mapping->host; 2928 struct inode *inode = mapping->host;
2929 struct extent_state *cached_state = NULL;
2923 u64 start = iblock << inode->i_blkbits; 2930 u64 start = iblock << inode->i_blkbits;
2924 sector_t sector = 0; 2931 sector_t sector = 0;
2925 size_t blksize = (1 << inode->i_blkbits); 2932 size_t blksize = (1 << inode->i_blkbits);
2926 struct extent_map *em; 2933 struct extent_map *em;
2927 2934
2928 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2935 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2929 GFP_NOFS); 2936 0, &cached_state, GFP_NOFS);
2930 em = get_extent(inode, NULL, 0, start, blksize, 0); 2937 em = get_extent(inode, NULL, 0, start, blksize, 0);
2931 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2938 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2932 GFP_NOFS); 2939 start + blksize - 1, &cached_state, GFP_NOFS);
2933 if (!em || IS_ERR(em)) 2940 if (!em || IS_ERR(em))
2934 return 0; 2941 return 0;
2935 2942
@@ -2951,6 +2958,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2951 u32 flags = 0; 2958 u32 flags = 0;
2952 u64 disko = 0; 2959 u64 disko = 0;
2953 struct extent_map *em = NULL; 2960 struct extent_map *em = NULL;
2961 struct extent_state *cached_state = NULL;
2954 int end = 0; 2962 int end = 0;
2955 u64 em_start = 0, em_len = 0; 2963 u64 em_start = 0, em_len = 0;
2956 unsigned long emflags; 2964 unsigned long emflags;
@@ -2959,8 +2967,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2959 if (len == 0) 2967 if (len == 0)
2960 return -EINVAL; 2968 return -EINVAL;
2961 2969
2962 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 2970 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2963 GFP_NOFS); 2971 &cached_state, GFP_NOFS);
2964 em = get_extent(inode, NULL, 0, off, max - off, 0); 2972 em = get_extent(inode, NULL, 0, off, max - off, 0);
2965 if (!em) 2973 if (!em)
2966 goto out; 2974 goto out;
@@ -3023,8 +3031,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3023out_free: 3031out_free:
3024 free_extent_map(em); 3032 free_extent_map(em);
3025out: 3033out:
3026 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 3034 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3027 GFP_NOFS); 3035 &cached_state, GFP_NOFS);
3028 return ret; 3036 return ret;
3029} 3037}
3030 3038
@@ -3165,10 +3173,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3165 spin_unlock(&tree->buffer_lock); 3173 spin_unlock(&tree->buffer_lock);
3166 goto free_eb; 3174 goto free_eb;
3167 } 3175 }
3168 spin_unlock(&tree->buffer_lock);
3169
3170 /* add one reference for the tree */ 3176 /* add one reference for the tree */
3171 atomic_inc(&eb->refs); 3177 atomic_inc(&eb->refs);
3178 spin_unlock(&tree->buffer_lock);
3172 return eb; 3179 return eb;
3173 3180
3174free_eb: 3181free_eb:
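
Moving atomic_inc(&eb->refs) in front of spin_unlock(&tree->buffer_lock) closes a small race in alloc_extent_buffer(): once the buffer is visible in the tree and the lock is dropped, another CPU can find it and put the last reference before this thread pins it. The fixed ordering, reduced:

        spin_lock(&tree->buffer_lock);
        /* ... publish eb in the tree, handle the already-present case ... */

        /* take the tree's reference while the lock still excludes lookups,
         * so no concurrent release can free eb before it is pinned */
        atomic_inc(&eb->refs);
        spin_unlock(&tree->buffer_lock);
        return eb;
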
@@ -3265,7 +3272,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3265} 3272}
3266 3273
3267int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3274int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3268 struct extent_buffer *eb) 3275 struct extent_buffer *eb,
3276 struct extent_state **cached_state)
3269{ 3277{
3270 unsigned long i; 3278 unsigned long i;
3271 struct page *page; 3279 struct page *page;
@@ -3275,7 +3283,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3275 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3283 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3276 3284
3277 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3285 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3278 GFP_NOFS); 3286 cached_state, GFP_NOFS);
3279 for (i = 0; i < num_pages; i++) { 3287 for (i = 0; i < num_pages; i++) {
3280 page = extent_buffer_page(eb, i); 3288 page = extent_buffer_page(eb, i);
3281 if (page) 3289 if (page)
@@ -3335,7 +3343,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3335} 3343}
3336 3344
3337int extent_buffer_uptodate(struct extent_io_tree *tree, 3345int extent_buffer_uptodate(struct extent_io_tree *tree,
3338 struct extent_buffer *eb) 3346 struct extent_buffer *eb,
3347 struct extent_state *cached_state)
3339{ 3348{
3340 int ret = 0; 3349 int ret = 0;
3341 unsigned long num_pages; 3350 unsigned long num_pages;
@@ -3347,7 +3356,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3347 return 1; 3356 return 1;
3348 3357
3349 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3358 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3350 EXTENT_UPTODATE, 1, NULL); 3359 EXTENT_UPTODATE, 1, cached_state);
3351 if (ret) 3360 if (ret)
3352 return ret; 3361 return ret;
3353 3362
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
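
EXTENT_FIRST_DELALLOC is a control bit rather than stored range state: judging from the inode.c hooks later in this diff, the tree code ORs it into the bit mask once per delalloc set operation, and the hook strips it on the first extent_state it touches (that state was already counted when the space was reserved) while further states created by the same operation do bump the counter. EXTENT_CTLBITS then appears to name the bits shown to the hooks but never stored in state->state. The consumer side, as it reads in btrfs_set_bit_hook below:

        if (*bits & EXTENT_FIRST_DELALLOC)
                *bits &= ~EXTENT_FIRST_DELALLOC; /* first set: already counted */
        else
                atomic_inc(&BTRFS_I(inode)->outstanding_extents);
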
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -163,6 +165,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
163int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 165int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
164 int bits, struct extent_state **cached, gfp_t mask); 166 int bits, struct extent_state **cached, gfp_t mask);
165int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 167int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
168int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
169 struct extent_state **cached, gfp_t mask);
166int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 170int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask); 171 gfp_t mask);
168int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 172int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -174,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
174 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
175 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
176 180
181void free_extent_state(struct extent_state *state);
177int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
178 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
179int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -183,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
183 gfp_t mask); 188 gfp_t mask);
184int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
185 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
186int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
187 gfp_t mask); 195 gfp_t mask);
188int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -196,7 +204,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
196int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, 204int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
197 u64 end, gfp_t mask); 205 u64 end, gfp_t mask);
198int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 206int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
199 gfp_t mask); 207 struct extent_state **cached_state, gfp_t mask);
200int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, 208int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
201 gfp_t mask); 209 gfp_t mask);
202int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 210int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +289,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
281int set_extent_buffer_uptodate(struct extent_io_tree *tree, 289int set_extent_buffer_uptodate(struct extent_io_tree *tree,
282 struct extent_buffer *eb); 290 struct extent_buffer *eb);
283int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 291int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
284 struct extent_buffer *eb); 292 struct extent_buffer *eb,
293 struct extent_state **cached_state);
285int extent_buffer_uptodate(struct extent_io_tree *tree, 294int extent_buffer_uptodate(struct extent_io_tree *tree,
286 struct extent_buffer *eb); 295 struct extent_buffer *eb,
296 struct extent_state *cached_state);
287int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, 297int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
288 unsigned long min_len, char **token, char **map, 298 unsigned long min_len, char **token, char **map,
289 unsigned long *map_start, 299 unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f90..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
@@ -35,7 +34,7 @@ void extent_map_exit(void)
35 */ 34 */
36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 35void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
37{ 36{
38 tree->map.rb_node = NULL; 37 tree->map = RB_ROOT;
39 rwlock_init(&tree->lock); 38 rwlock_init(&tree->lock);
40} 39}
41 40
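
RB_ROOT is the canonical empty-rbtree initializer, so the assignment replaces reaching into the rb_node member by hand and stays correct if struct rb_root ever grows fields. A trivial illustration:

        struct rb_root map = RB_ROOT;   /* expands to (struct rb_root) { NULL, } */

        tree->map = RB_ROOT;            /* reset an existing tree to empty */
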
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
@@ -148,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148} 149}
149 150
150 151
151int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
152 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
153{ 155{
154 u32 sum; 156 u32 sum;
155 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
156 int bio_index = 0; 158 int bio_index = 0;
157 u64 offset; 159 u64 offset = 0;
158 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
159 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
160 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -173,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
173 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
174 176
175 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
176 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
177 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
178 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
179 if (ret == 0) 184 if (ret == 0)
180 goto found; 185 goto found;
@@ -237,6 +242,7 @@ found:
237 else 242 else
238 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
239 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
240 bio_index++; 246 bio_index++;
241 bvec++; 247 bvec++;
242 } 248 }
@@ -244,6 +250,18 @@ found:
244 return 0; 250 return 0;
245} 251}
246 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
247int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
248 struct list_head *list) 266 struct list_head *list)
249{ 267{
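
A buffered read can recover each block's logical file offset from its page (page_offset() + bv_offset), but direct-IO bio pages are user memory with no meaningful mapping offset, so the shared helper seeds offset from the caller's logical_offset when dio is set and advances it by bv_len per segment. The caller's view of the two wrappers (dio_file_offset is a hypothetical variable standing for the file position the DIO was issued at):

        /* buffered read: per-block offsets derived from the page cache */
        ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);

        /* direct IO: the caller must supply the starting file offset */
        ret = btrfs_lookup_bio_sums_dio(root, inode, bio, dio_file_offset, NULL);
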
@@ -656,6 +674,9 @@ again:
656 goto found; 674 goto found;
657 } 675 }
658 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
659 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
660 u32 item_size; 681 u32 item_size;
661 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c02033596f02..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -45,32 +46,42 @@
45static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
46 int write_bytes, 47 int write_bytes,
47 struct page **prepared_pages, 48 struct page **prepared_pages,
48 const char __user *buf) 49 struct iov_iter *i)
49{ 50{
50 long page_fault = 0; 51 size_t copied;
51 int i; 52 int pg = 0;
52 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
53 54
54 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
55 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
56 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
57 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
58 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
59 62
60 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
61 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
62 page_fault = __copy_from_user(page_address(page) + offset, 65
63 buf, count);
64 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
65 flush_dcache_page(page); 67 flush_dcache_page(page);
66 kunmap(page); 68 iov_iter_advance(i, copied);
67 buf += count; 69 write_bytes -= copied;
68 write_bytes -= count;
69 70
70 if (page_fault) 71 if (unlikely(copied == 0)) {
71 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
72 } 83 }
73 return page_fault ? -EFAULT : 0; 84 return 0;
74} 85}
75 86
76/* 87/*
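
btrfs_copy_from_user() now consumes an iov_iter instead of a bare user pointer, which buys it multi-segment writev support and a correct short-copy story: iov_iter_fault_in_readable() pre-faults the source, iov_iter_copy_from_user() may copy fewer bytes than asked, and a zero-byte copy retries against just the current segment instead of erroring out. The loop's contract, condensed (page selection and dcache flushing elided; see the hunk above for the full version):

        while (write_bytes > 0) {
                size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset,
                                     write_bytes);
        again:
                if (unlikely(iov_iter_fault_in_readable(i, count)))
                        return -EFAULT;         /* source truly unreadable */

                copied = iov_iter_copy_from_user(page, i, offset, count);
                iov_iter_advance(i, copied);    /* consume what actually copied */
                write_bytes -= copied;

                if (unlikely(copied == 0)) {
                        /* faulted mid-copy: retry within the current segment */
                        count = min_t(size_t, PAGE_CACHE_SIZE - offset,
                                      iov_iter_single_seg_count(i));
                        goto again;
                }
                /* a short copy stays on this page; a full one advances */
        }
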
@@ -123,9 +134,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 134 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 135
125 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
127 if (err) 138 NULL);
128 return err; 139 BUG_ON(err);
129 140
130 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
131 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -140,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
140 * at this time. 151 * at this time.
141 */ 152 */
142 } 153 }
143 return err; 154 return 0;
144} 155}
145 156
146/* 157/*
@@ -720,13 +731,15 @@ again:
720 inode->i_ino, orig_offset); 731 inode->i_ino, orig_offset);
721 BUG_ON(ret); 732 BUG_ON(ret);
722 } 733 }
723 fi = btrfs_item_ptr(leaf, path->slots[0],
724 struct btrfs_file_extent_item);
725 if (del_nr == 0) { 734 if (del_nr == 0) {
735 fi = btrfs_item_ptr(leaf, path->slots[0],
736 struct btrfs_file_extent_item);
726 btrfs_set_file_extent_type(leaf, fi, 737 btrfs_set_file_extent_type(leaf, fi,
727 BTRFS_FILE_EXTENT_REG); 738 BTRFS_FILE_EXTENT_REG);
728 btrfs_mark_buffer_dirty(leaf); 739 btrfs_mark_buffer_dirty(leaf);
729 } else { 740 } else {
741 fi = btrfs_item_ptr(leaf, del_slot - 1,
742 struct btrfs_file_extent_item);
730 btrfs_set_file_extent_type(leaf, fi, 743 btrfs_set_file_extent_type(leaf, fi,
731 BTRFS_FILE_EXTENT_REG); 744 BTRFS_FILE_EXTENT_REG);
732 btrfs_set_file_extent_num_bytes(leaf, fi, 745 btrfs_set_file_extent_num_bytes(leaf, fi,
@@ -751,6 +764,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
751 loff_t pos, unsigned long first_index, 764 loff_t pos, unsigned long first_index,
752 unsigned long last_index, size_t write_bytes) 765 unsigned long last_index, size_t write_bytes)
753{ 766{
767 struct extent_state *cached_state = NULL;
754 int i; 768 int i;
755 unsigned long index = pos >> PAGE_CACHE_SHIFT; 769 unsigned long index = pos >> PAGE_CACHE_SHIFT;
756 struct inode *inode = fdentry(file)->d_inode; 770 struct inode *inode = fdentry(file)->d_inode;
@@ -779,16 +793,18 @@ again:
779 } 793 }
780 if (start_pos < inode->i_size) { 794 if (start_pos < inode->i_size) {
781 struct btrfs_ordered_extent *ordered; 795 struct btrfs_ordered_extent *ordered;
782 lock_extent(&BTRFS_I(inode)->io_tree, 796 lock_extent_bits(&BTRFS_I(inode)->io_tree,
783 start_pos, last_pos - 1, GFP_NOFS); 797 start_pos, last_pos - 1, 0, &cached_state,
798 GFP_NOFS);
784 ordered = btrfs_lookup_first_ordered_extent(inode, 799 ordered = btrfs_lookup_first_ordered_extent(inode,
785 last_pos - 1); 800 last_pos - 1);
786 if (ordered && 801 if (ordered &&
787 ordered->file_offset + ordered->len > start_pos && 802 ordered->file_offset + ordered->len > start_pos &&
788 ordered->file_offset < last_pos) { 803 ordered->file_offset < last_pos) {
789 btrfs_put_ordered_extent(ordered); 804 btrfs_put_ordered_extent(ordered);
790 unlock_extent(&BTRFS_I(inode)->io_tree, 805 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
791 start_pos, last_pos - 1, GFP_NOFS); 806 start_pos, last_pos - 1,
807 &cached_state, GFP_NOFS);
792 for (i = 0; i < num_pages; i++) { 808 for (i = 0; i < num_pages; i++) {
793 unlock_page(pages[i]); 809 unlock_page(pages[i]);
794 page_cache_release(pages[i]); 810 page_cache_release(pages[i]);
@@ -800,12 +816,13 @@ again:
800 if (ordered) 816 if (ordered)
801 btrfs_put_ordered_extent(ordered); 817 btrfs_put_ordered_extent(ordered);
802 818
803 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 819 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
804 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 820 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
805 EXTENT_DO_ACCOUNTING, 821 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
806 GFP_NOFS); 822 GFP_NOFS);
807 unlock_extent(&BTRFS_I(inode)->io_tree, 823 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
808 start_pos, last_pos - 1, GFP_NOFS); 824 start_pos, last_pos - 1, &cached_state,
825 GFP_NOFS);
809 } 826 }
810 for (i = 0; i < num_pages; i++) { 827 for (i = 0; i < num_pages; i++) {
811 clear_page_dirty_for_io(pages[i]); 828 clear_page_dirty_for_io(pages[i]);
@@ -815,45 +832,46 @@ again:
815 return 0; 832 return 0;
816} 833}
817 834
818static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
819 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
820{ 838{
821 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
822 loff_t start_pos; 846 loff_t start_pos;
823 ssize_t num_written = 0; 847 ssize_t num_written = 0;
824 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
825 int ret = 0; 851 int ret = 0;
826 struct inode *inode = fdentry(file)->d_inode;
827 struct btrfs_root *root = BTRFS_I(inode)->root;
828 struct page **pages = NULL;
829 int nrptrs; 852 int nrptrs;
830 struct page *pinned[2];
831 unsigned long first_index; 853 unsigned long first_index;
832 unsigned long last_index; 854 unsigned long last_index;
833 int will_write; 855 int will_write;
856 int buffered = 0;
834 857
835 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
836 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
837 860
838 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
839 PAGE_CACHE_SIZE / (sizeof(struct page *)));
840 pinned[0] = NULL; 861 pinned[0] = NULL;
841 pinned[1] = NULL; 862 pinned[1] = NULL;
842 863
843 pos = *ppos;
844 start_pos = pos; 864 start_pos = pos;
845 865
846 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
847 867
848 /* do the reserve before the mutex lock in case we have to do some
849 * flushing. We wouldn't deadlock, but this is more polite.
850 */
851 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
852 if (err)
853 goto out_nolock;
854
855 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
856 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
857 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
858 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
859 if (err) 877 if (err)
@@ -867,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
867 goto out; 885 goto out;
868 886
869 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
870 889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
871 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
872 929
873 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
874 start_pos = pos; 931 start_pos = pos;
875 932
876 BTRFS_I(inode)->sequence++;
877 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
878 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
879 935
880 /* 936 /*
881 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
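
The rewritten btrfs_file_aio_write() fronts O_DIRECT with generic_file_direct_write() and only falls back to the buffered loop for whatever the DIO path left unwritten, setting buffered = 1 so that tail of the page cache is invalidated afterwards. It also pushes i_size into disk_i_size by hand, since the btrfs endio handlers never advance the on-disk size past the in-memory size they saw at submission. Control flow of the hunk, compressed (a sketch, not a drop-in; the real code jumps to the out label instead of returning):

        if (unlikely(file->f_flags & O_DIRECT)) {
                num_written = generic_file_direct_write(iocb, iov, &nr_segs,
                                                        pos, ppos, count,
                                                        ocount);
                /* catch size extensions the DIO endio path will not record */
                if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
                        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
                        mark_inode_dirty(inode);
                }
                if (num_written < 0)
                        return num_written;     /* error from the DIO path */
                if (num_written == count)
                        return num_written;     /* everything went direct */
                buffered = 1;                   /* finish the tail buffered */
                pos += num_written;
        }
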
@@ -892,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
892 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
893 } 949 }
894 } 950 }
895 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
896 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
897 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
898 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -903,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
903 } 959 }
904 } 960 }
905 961
906 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
907 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
908 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
909 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
910 offset); 966 offset);
911 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
912 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -914,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
914 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
915 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
916 972
917 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
918 if (ret) 974 if (ret)
919 goto out; 975 goto out;
920 976
@@ -922,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 pos, first_index, last_index, 978 pos, first_index, last_index,
923 write_bytes); 979 write_bytes);
924 if (ret) { 980 if (ret) {
925 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
926 write_bytes);
927 goto out; 982 goto out;
928 } 983 }
929 984
930 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
931 write_bytes, pages, buf); 986 write_bytes, pages, &i);
932 if (ret) { 987 if (ret == 0) {
933 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
934 write_bytes); 989 num_pages, pos, write_bytes);
935 btrfs_drop_pages(pages, num_pages);
936 goto out;
937 } 990 }
938 991
939 ret = dirty_and_release_pages(NULL, root, file, pages,
940 num_pages, pos, write_bytes);
941 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
942 if (ret) { 993 if (ret) {
943 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
944 write_bytes);
945 goto out; 995 goto out;
946 } 996 }
947 997
@@ -957,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
957 btrfs_throttle(root); 1007 btrfs_throttle(root);
958 } 1008 }
959 1009
960 buf += write_bytes;
961 count -= write_bytes;
962 pos += write_bytes; 1010 pos += write_bytes;
963 num_written += write_bytes; 1011 num_written += write_bytes;
964 1012
@@ -968,9 +1016,7 @@ out:
968 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
969 if (ret) 1017 if (ret)
970 err = ret; 1018 err = ret;
971 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
972 1019
973out_nolock:
974 kfree(pages); 1020 kfree(pages);
975 if (pinned[0]) 1021 if (pinned[0])
976 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1000,7 +1046,7 @@ out_nolock:
1000 num_written = err; 1046 num_written = err;
1001 1047
1002 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1003 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1004 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1005 file->f_dentry); 1051 file->f_dentry);
1006 if (ret == 0) { 1052 if (ret == 0) {
@@ -1015,7 +1061,7 @@ out_nolock:
1015 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1016 } 1062 }
1017 } 1063 }
1018 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1019 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1020 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1021 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1055,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1055 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1056 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1057 */ 1103 */
1058int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1059{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1060 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1061 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1062 int ret = 0; 1109 int ret = 0;
@@ -1093,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1093 /* 1140 /*
1094 * ok we haven't committed the transaction yet, lets do a commit 1141 * ok we haven't committed the transaction yet, lets do a commit
1095 */ 1142 */
1096 if (file && file->private_data) 1143 if (file->private_data)
1097 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1098 1145
1099 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1100 if (!trans) { 1147 if (IS_ERR(trans)) {
1101 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1102 goto out; 1149 goto out;
1103 } 1150 }
1104 1151
@@ -1133,7 +1180,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1133 } 1180 }
1134 mutex_lock(&dentry->d_inode->i_mutex); 1181 mutex_lock(&dentry->d_inode->i_mutex);
1135out: 1182out:
1136 return ret > 0 ? EIO : ret; 1183 return ret > 0 ? -EIO : ret;
1137} 1184}
1138 1185
1139static const struct vm_operations_struct btrfs_file_vm_ops = { 1186static const struct vm_operations_struct btrfs_file_vm_ops = {
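
Two small correctness fixes land in btrfs_sync_file(): btrfs_start_transaction() can now return an ERR_PTR, so the NULL/-ENOMEM check becomes IS_ERR()/PTR_ERR(), and the old return of a positive EIO is negated, matching the kernel convention that errors are negative errnos (a positive value here would have looked like success to most callers). Condensed:

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);          /* was: if (!trans) -ENOMEM */
        /* ... log the dentry, commit or wait on the transaction ... */
        return ret > 0 ? -EIO : ret;            /* errnos must be negative */
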
@@ -1143,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
1143 1190
1144static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1191static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1145{ 1192{
1146 vma->vm_ops = &btrfs_file_vm_ops; 1193 struct address_space *mapping = filp->f_mapping;
1194
1195 if (!mapping->a_ops->readpage)
1196 return -ENOEXEC;
1197
1147 file_accessed(filp); 1198 file_accessed(filp);
1199 vma->vm_ops = &btrfs_file_vm_ops;
1200 vma->vm_flags |= VM_CAN_NONLINEAR;
1201
1148 return 0; 1202 return 0;
1149} 1203}
1150 1204
1151const struct file_operations btrfs_file_operations = { 1205const struct file_operations btrfs_file_operations = {
1152 .llseek = generic_file_llseek, 1206 .llseek = generic_file_llseek,
1153 .read = do_sync_read, 1207 .read = do_sync_read,
1208 .write = do_sync_write,
1154 .aio_read = generic_file_aio_read, 1209 .aio_read = generic_file_aio_read,
1155 .splice_read = generic_file_splice_read, 1210 .splice_read = generic_file_splice_read,
1156 .write = btrfs_file_write, 1211 .aio_write = btrfs_file_aio_write,
1157 .mmap = btrfs_file_mmap, 1212 .mmap = btrfs_file_mmap,
1158 .open = generic_file_open, 1213 .open = generic_file_open,
1159 .release = btrfs_release_file, 1214 .release = btrfs_release_file,
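
Pointing .write at the generic do_sync_write wrapper is what completes the aio conversion: plain write(2) calls now build a kiocb plus a single-segment iovec and funnel into btrfs_file_aio_write(), so one implementation serves both entry points. The relevant rows of the ops table:

        /* write(2) -> do_sync_write() -> f_op->aio_write() */
        .write          = do_sync_write,        /* generic sync-over-aio bridge */
        .aio_write      = btrfs_file_aio_write, /* the real implementation */
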
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
@@ -870,7 +871,7 @@ __btrfs_return_cluster_to_free_space(
870 tree_insert_offset(&block_group->free_space_offset, 871 tree_insert_offset(&block_group->free_space_offset,
871 entry->offset, &entry->offset_index, 0); 872 entry->offset, &entry->offset_index, 0);
872 } 873 }
873 cluster->root.rb_node = NULL; 874 cluster->root = RB_ROOT;
874 875
875out: 876out:
876 spin_unlock(&cluster->lock); 877 spin_unlock(&cluster->lock);
@@ -1355,7 +1356,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355{ 1356{
1356 spin_lock_init(&cluster->lock); 1357 spin_lock_init(&cluster->lock);
1357 spin_lock_init(&cluster->refill_lock); 1358 spin_lock_init(&cluster->refill_lock);
1358 cluster->root.rb_node = NULL; 1359 cluster->root = RB_ROOT;
1359 cluster->max_size = 0; 1360 cluster->max_size = 0;
1360 cluster->points_to_bitmap = false; 1361 cluster->points_to_bitmap = false;
1361 INIT_LIST_HEAD(&cluster->block_group_list); 1362 INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
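
btrfs_lookup_inode_ref() packages the search-slot-plus-name-scan needed to find an INODE_REF back reference: the key is (inode_objectid, BTRFS_INODE_REF_KEY, ref_objectid), a non-zero mod COWs the path and a negative mod additionally prepares it for deletion, and the return is three-valued — ERR_PTR on search failure, NULL when no ref with that name exists, otherwise a pointer into the leaf. A hedged usage sketch (parent_objectid is a hypothetical caller variable; the ref pointer is only valid while path pins the leaf):

        struct btrfs_inode_ref *ref;

        ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
                                     inode_objectid, parent_objectid, 0);
        if (IS_ERR(ref))
                return PTR_ERR(ref);    /* btrfs_search_slot failed */
        if (!ref)
                return -ENOENT;         /* no back ref with that name */
        /* ... read fields out of 'ref' before releasing the path ... */
        btrfs_release_path(root, path);
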
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cd109972fa6..c03864406af3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -251,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
251 inline_len, compressed_size, 252 inline_len, compressed_size,
252 compressed_pages); 253 compressed_pages);
253 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
254 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
255 return 0; 257 return 0;
256} 258}
@@ -379,7 +381,8 @@ again:
379 * change at any time if we discover bad compression ratios. 381 * change at any time if we discover bad compression ratios.
380 */ 382 */
381 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 383 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
382 btrfs_test_opt(root, COMPRESS)) { 384 (btrfs_test_opt(root, COMPRESS) ||
385 (BTRFS_I(inode)->force_compress))) {
383 WARN_ON(pages); 386 WARN_ON(pages);
384 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 387 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
385 388
@@ -412,6 +415,7 @@ again:
412 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
413 BUG_ON(!trans); 416 BUG_ON(!trans);
414 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
415 419
416 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
417 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -437,7 +441,6 @@ again:
437 start, end, NULL, 441 start, end, NULL,
438 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
439 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
440 EXTENT_CLEAR_ACCOUNTING |
441 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
442 445
443 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -483,8 +486,10 @@ again:
483 nr_pages_ret = 0; 486 nr_pages_ret = 0;
484 487
485 /* flag the file so we don't compress in the future */ 488 /* flag the file so we don't compress in the future */
486 if (!btrfs_test_opt(root, FORCE_COMPRESS)) 489 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
490 !(BTRFS_I(inode)->force_compress)) {
487 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 491 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
492 }
488 } 493 }
489 if (will_compress) { 494 if (will_compress) {
490 *num_added += 1; 495 *num_added += 1;
@@ -570,8 +575,8 @@ retry:
570 unsigned long nr_written = 0; 575 unsigned long nr_written = 0;
571 576
572 lock_extent(io_tree, async_extent->start, 577 lock_extent(io_tree, async_extent->start,
573 async_extent->start + 578 async_extent->start +
574 async_extent->ram_size - 1, GFP_NOFS); 579 async_extent->ram_size - 1, GFP_NOFS);
575 580
576 /* allocate blocks */ 581 /* allocate blocks */
577 ret = cow_file_range(inode, async_cow->locked_page, 582 ret = cow_file_range(inode, async_cow->locked_page,
@@ -693,6 +698,38 @@ retry:
693 return 0; 698 return 0;
694} 699}
695 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
696/* 733/*
697 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
698 * the call backs end up in this code. The basic idea is to 735 * the call backs end up in this code. The basic idea is to
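
The allocation-hint derivation that used to sit inline in cow_file_range() becomes get_extent_allocation_hint() so other write paths in this series can share it; the policy is: prefer the block that already maps this range, fall back to the inode's first real block, otherwise no hint. The caller side, as it appears further down in this diff:

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
        /* the hint steers btrfs_reserve_extent() toward nearby free space */
        ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
                                   root->sectorsize, 0, alloc_hint,
                                   (u64)-1, &ins, 1);
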
@@ -730,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
730 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
731 BUG_ON(!trans); 768 BUG_ON(!trans);
732 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
733 771
734 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
735 773
@@ -749,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
749 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
750 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
751 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
752 EXTENT_CLEAR_ACCOUNTING |
753 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
754 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
755 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -765,35 +802,13 @@ static noinline int cow_file_range(struct inode *inode,
765 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
766 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
767 804
768 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
769 read_lock(&BTRFS_I(inode)->extent_tree.lock);
770 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
771 start, num_bytes);
772 if (em) {
773 /*
774 * if block start isn't an actual block number then find the
775 * first block in this inode and use that as a hint. If that
776 * block is also bogus then just don't worry about it.
777 */
778 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
779 free_extent_map(em);
780 em = search_extent_mapping(em_tree, 0, 0);
781 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
782 alloc_hint = em->block_start;
783 if (em)
784 free_extent_map(em);
785 } else {
786 alloc_hint = em->block_start;
787 free_extent_map(em);
788 }
789 }
790 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
791 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
792 807
793 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
794 unsigned long op; 809 unsigned long op;
795 810
796 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 811 cur_alloc_size = disk_num_bytes;
797 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 812 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
798 root->sectorsize, 0, alloc_hint, 813 root->sectorsize, 0, alloc_hint,
799 (u64)-1, &ins, 1); 814 (u64)-1, &ins, 1);
@@ -1170,6 +1185,13 @@ out_check:
1170 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1171 BUG_ON(ret); 1186 BUG_ON(ret);
1172 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1173 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1174 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1175 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1211,7 +1233,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1211 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) 1233 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1212 ret = run_delalloc_nocow(inode, locked_page, start, end, 1234 ret = run_delalloc_nocow(inode, locked_page, start, end,
1213 page_started, 0, nr_written); 1235 page_started, 0, nr_written);
1214 else if (!btrfs_test_opt(root, COMPRESS)) 1236 else if (!btrfs_test_opt(root, COMPRESS) &&
1237 !(BTRFS_I(inode)->force_compress))
1215 ret = cow_file_range(inode, locked_page, start, end, 1238 ret = cow_file_range(inode, locked_page, start, end,
1216 page_started, nr_written, 1); 1239 page_started, nr_written, 1);
1217 else 1240 else
@@ -1221,36 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1221} 1244}
1222 1245
1223static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1224 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1225{ 1248{
1226 struct btrfs_root *root = BTRFS_I(inode)->root; 1249 /* not delalloc, ignore it */
1227 u64 size;
1228
1229 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1230 return 0; 1251 return 0;
1231 1252
1232 size = orig->end - orig->start + 1; 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1233 if (size > root->fs_info->max_extent) {
1234 u64 num_extents;
1235 u64 new_size;
1236
1237 new_size = orig->end - split + 1;
1238 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1239 root->fs_info->max_extent);
1240
1241 /*
1242 * if we break a large extent up then leave outstanding_extents
1243 * be, since we've already accounted for the large extent.
1244 */
1245 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1246 root->fs_info->max_extent) < num_extents)
1247 return 0;
1248 }
1249
1250 spin_lock(&BTRFS_I(inode)->accounting_lock);
1251 BTRFS_I(inode)->outstanding_extents++;
1252 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1253
1254 return 0; 1254 return 0;
1255} 1255}
1256 1256
@@ -1264,42 +1264,11 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 struct extent_state *new, 1264 struct extent_state *new,
1265 struct extent_state *other) 1265 struct extent_state *other)
1266{ 1266{
1267 struct btrfs_root *root = BTRFS_I(inode)->root;
1268 u64 new_size, old_size;
1269 u64 num_extents;
1270
1271 /* not delalloc, ignore it */ 1267 /* not delalloc, ignore it */
1272 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1273 return 0; 1269 return 0;
1274 1270
1275 old_size = other->end - other->start + 1; 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1276 if (new->start < other->start)
1277 new_size = other->end - new->start + 1;
1278 else
1279 new_size = new->end - other->start + 1;
1280
1281 /* we're not bigger than the max, unreserve the space and go */
1282 if (new_size <= root->fs_info->max_extent) {
1283 spin_lock(&BTRFS_I(inode)->accounting_lock);
1284 BTRFS_I(inode)->outstanding_extents--;
1285 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1286 return 0;
1287 }
1288
1289 /*
1290 * If we grew by another max_extent, just return, we want to keep that
1291 * reserved amount.
1292 */
1293 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1294 root->fs_info->max_extent);
1295 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1296 root->fs_info->max_extent) > num_extents)
1297 return 0;
1298
1299 spin_lock(&BTRFS_I(inode)->accounting_lock);
1300 BTRFS_I(inode)->outstanding_extents--;
1301 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1302
1303 return 0; 1272 return 0;
1304} 1273}
1305 1274
@@ -1308,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1308 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1309 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1310 */ 1279 */
1311static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1312 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1313{ 1282{
1314 1283
1315 /* 1284 /*
@@ -1317,16 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1317 * but in this case, we are only testing for the DELALLOC 1286 * but in this case, we are only testing for the DELALLOC
1318 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1319 */ 1288 */
1320 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1321 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1292
1293 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1295 else
1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1322 1297
1323 spin_lock(&BTRFS_I(inode)->accounting_lock);
1324 BTRFS_I(inode)->outstanding_extents++;
1325 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1326 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1327 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1328 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1329 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1330 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1331 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1332 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1340,44 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1340 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1341 */ 1312 */
1342static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1343 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1344{ 1315{
1345 /* 1316 /*
1346 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1347 * but in this case, we are only testing for the DELALLOC 1318 * but in this case, we are only testing for the DELALLOC
1348 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1349 */ 1320 */
1350 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1351 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1352 1324
1353 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1354 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1355 BTRFS_I(inode)->outstanding_extents--; 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1356 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1357 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1329
1358 } 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1359 1335
1360 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1361 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1362 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1363 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1364 "%llu %llu\n",
1365 (unsigned long long)
1366 state->end - state->start + 1,
1367 (unsigned long long)
1368 root->fs_info->delalloc_bytes);
1369 btrfs_delalloc_free_space(root, inode, (u64)-1);
1370 root->fs_info->delalloc_bytes = 0;
1371 BTRFS_I(inode)->delalloc_bytes = 0;
1372 } else {
1373 btrfs_delalloc_free_space(root, inode,
1374 state->end -
1375 state->start + 1);
1376 root->fs_info->delalloc_bytes -= state->end -
1377 state->start + 1;
1378 BTRFS_I(inode)->delalloc_bytes -= state->end -
1379 state->start + 1;
1380 }
1381 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1382 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1383 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
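Both hooks now pivot on EXTENT_FIRST_DELALLOC: the reservation path counts the first extent before the bits are set, so the set hook consumes the marker once instead of incrementing outstanding_extents again, and the clear hook applies the mirror rule on teardown. A compact model of the set side; the flag values and function names are illustrative, not the kernel API:

#include <stdio.h>

#define EXTENT_DELALLOC         0x1
#define EXTENT_FIRST_DELALLOC   0x2     /* set by the path that reserved */

static int outstanding_extents;

/* consume the first-delalloc marker, or count a new extent */
static void set_bit_hook(int *bits)
{
        if (*bits & EXTENT_FIRST_DELALLOC)
                *bits &= ~EXTENT_FIRST_DELALLOC;
        else
                outstanding_extents++;
}

int main(void)
{
        int bits;

        bits = EXTENT_DELALLOC | EXTENT_FIRST_DELALLOC;
        set_bit_hook(&bits);    /* first range: counted at reserve time */

        bits = EXTENT_DELALLOC;
        set_bit_hook(&bits);    /* later split range: counted here */

        printf("outstanding extents: %d\n", outstanding_extents);      /* 1 */
        return 0;
}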
@@ -1426,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1426 */ 1385 */
1427static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1428 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1429 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1430{ 1390{
1431 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1432 int ret = 0; 1392 int ret = 0;
@@ -1445,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1445 * are inserted into the btree 1405 * are inserted into the btree
1446 */ 1406 */
1447static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1448 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1449{ 1410{
1450 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1451 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1456,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1456 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1457 */ 1418 */
1458static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1459 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1460{ 1422{
1461 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1462 int ret = 0; 1424 int ret = 0;
@@ -1467,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1467 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1468 BUG_ON(ret); 1430 BUG_ON(ret);
1469 1431
1470 if (!(rw & (1 << BIO_RW))) { 1432 if (!(rw & REQ_WRITE)) {
1471 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1433 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1472 return btrfs_submit_compressed_read(inode, bio, 1434 return btrfs_submit_compressed_read(inode, bio,
1473 mirror_num, bio_flags); 1435 mirror_num, bio_flags);
@@ -1481,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1481 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1482 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1483 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1484 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1485 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1486 } 1449 }
1487 1450
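All three submit helpers grow a bio_offset argument that btrfs_wq_submit_bio threads through to both callbacks. The split itself is a two-phase handoff: a start hook does the checksumming before submission and a done hook performs the mapping afterwards, so csum work can run in a helper thread. A reduced sketch of the handoff with stand-in types, not the kernel bio API:

#include <stdio.h>

struct fake_bio { const char *data; };

typedef int (*bio_hook_t)(struct fake_bio *bio, unsigned long flags,
                          unsigned long long offset);

static int submit_start(struct fake_bio *bio, unsigned long flags,
                        unsigned long long offset)
{
        /* csum work that may block: kept off the critical path */
        printf("csum %s at offset %llu\n", bio->data, offset);
        return 0;
}

static int submit_done(struct fake_bio *bio, unsigned long flags,
                       unsigned long long offset)
{
        /* maps and submits the bio once csums are in place */
        printf("map and submit %s\n", bio->data);
        return 0;
}

/* stands in for btrfs_wq_submit_bio: run start, then done */
static int wq_submit(struct fake_bio *bio, unsigned long flags,
                     unsigned long long offset,
                     bio_hook_t start, bio_hook_t done)
{
        int ret = start(bio, flags, offset);
        return ret ? ret : done(bio, flags, offset);
}

int main(void)
{
        struct fake_bio bio = { "extent-0" };
        return wq_submit(&bio, 0, 4096, submit_start, submit_done);
}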
@@ -1508,12 +1471,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1508 return 0; 1471 return 0;
1509} 1472}
1510 1473
1511int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1474int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1475 struct extent_state **cached_state)
1512{ 1476{
1513 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1477 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1514 WARN_ON(1); 1478 WARN_ON(1);
1515 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1479 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1516 GFP_NOFS); 1480 cached_state, GFP_NOFS);
1517} 1481}
1518 1482
1519/* see btrfs_writepage_start_hook for details on why this is required */ 1483/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1490,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1526{ 1490{
1527 struct btrfs_writepage_fixup *fixup; 1491 struct btrfs_writepage_fixup *fixup;
1528 struct btrfs_ordered_extent *ordered; 1492 struct btrfs_ordered_extent *ordered;
1493 struct extent_state *cached_state = NULL;
1529 struct page *page; 1494 struct page *page;
1530 struct inode *inode; 1495 struct inode *inode;
1531 u64 page_start; 1496 u64 page_start;
@@ -1544,7 +1509,8 @@ again:
1544 page_start = page_offset(page); 1509 page_start = page_offset(page);
1545 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1510 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1546 1511
1547 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1512 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1513 &cached_state, GFP_NOFS);
1548 1514
1549 /* already ordered? We're done */ 1515 /* already ordered? We're done */
1550 if (PagePrivate2(page)) 1516 if (PagePrivate2(page))
@@ -1552,17 +1518,19 @@ again:
1552 1518
1553 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1519 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1554 if (ordered) { 1520 if (ordered) {
1555 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, 1521 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1556 page_end, GFP_NOFS); 1522 page_end, &cached_state, GFP_NOFS);
1557 unlock_page(page); 1523 unlock_page(page);
1558 btrfs_start_ordered_extent(inode, ordered, 1); 1524 btrfs_start_ordered_extent(inode, ordered, 1);
1559 goto again; 1525 goto again;
1560 } 1526 }
1561 1527
1562 btrfs_set_extent_delalloc(inode, page_start, page_end); 1528 BUG();
1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1563 ClearPageChecked(page); 1530 ClearPageChecked(page);
1564out: 1531out:
1565 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1532 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1533 &cached_state, GFP_NOFS);
1566out_page: 1534out_page:
1567 unlock_page(page); 1535 unlock_page(page);
1568 page_cache_release(page); 1536 page_cache_release(page);
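lock_extent_bits hands back a cached extent_state here so that the later unlock_extent_cached can skip the rbtree search. The caching idiom reduced to reference counting; plain C with toy types, not the extent_io API:

#include <stdio.h>
#include <stdlib.h>

struct state { int refs; };

/* stands in for the rbtree walk the cache avoids */
static struct state *expensive_lookup(void)
{
        struct state *s = malloc(sizeof(*s));

        printf("tree lookup\n");
        s->refs = 1;            /* the lock path's reference */
        return s;
}

static void lock_range(struct state **cached)
{
        struct state *s = expensive_lookup();

        s->refs++;              /* extra reference owned by the cache */
        *cached = s;
}

static void unlock_range(struct state **cached)
{
        struct state *s = *cached;

        *cached = NULL;
        if (!s) {
                printf("tree lookup\n");        /* cache miss: search again */
                return;
        }
        s->refs -= 2;           /* drop the lock's ref and the cache's ref */
        if (s->refs == 0)
                free(s);
}

int main(void)
{
        struct state *cached = NULL;

        lock_range(&cached);    /* one lookup here... */
        unlock_range(&cached);  /* ...and none here */
        return 0;
}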
@@ -1681,24 +1649,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1681 * before we start the transaction. It limits the amount of btree 1649 * before we start the transaction. It limits the amount of btree
1682 * reads required while inside the transaction. 1650 * reads required while inside the transaction.
1683 */ 1651 */
1684static noinline void reada_csum(struct btrfs_root *root,
1685 struct btrfs_path *path,
1686 struct btrfs_ordered_extent *ordered_extent)
1687{
1688 struct btrfs_ordered_sum *sum;
1689 u64 bytenr;
1690
1691 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1692 list);
1693 bytenr = sum->sums[0].bytenr;
1694
1695 /*
1696 * we don't care about the results, the point of this search is
1697 * just to get the btree leaves into ram
1698 */
1699 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1700}
1701
1702/* as ordered data IO finishes, this gets called so we can finish 1652/* as ordered data IO finishes, this gets called so we can finish
1703 * an ordered extent if the range of bytes in the file it covers are 1653 * an ordered extent if the range of bytes in the file it covers are
1704 * fully written. 1654 * fully written.
@@ -1706,60 +1656,39 @@ static noinline void reada_csum(struct btrfs_root *root,
1706static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1707{ 1657{
1708 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1709 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1710 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1711 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1712 struct btrfs_path *path; 1662 struct extent_state *cached_state = NULL;
1713 int compressed = 0; 1663 int compressed = 0;
1714 int ret; 1664 int ret;
1715 1665
1716 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); 1666 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1667 end - start + 1);
1717 if (!ret) 1668 if (!ret)
1718 return 0; 1669 return 0;
1719
1720 /*
1721 * before we join the transaction, try to do some of our IO.
1722 * This will limit the amount of IO that we have to do with
1723 * the transaction running. We're unlikely to need to do any
1724 * IO if the file extents are new, the disk_i_size checks
1725 * covers the most common case.
1726 */
1727 if (start < BTRFS_I(inode)->disk_i_size) {
1728 path = btrfs_alloc_path();
1729 if (path) {
1730 ret = btrfs_lookup_file_extent(NULL, root, path,
1731 inode->i_ino,
1732 start, 0);
1733 ordered_extent = btrfs_lookup_ordered_extent(inode,
1734 start);
1735 if (!list_empty(&ordered_extent->list)) {
1736 btrfs_release_path(root, path);
1737 reada_csum(root, path, ordered_extent);
1738 }
1739 btrfs_free_path(path);
1740 }
1741 }
1742
1743 if (!ordered_extent)
1744 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1745 BUG_ON(!ordered_extent); 1670 BUG_ON(!ordered_extent);
1671
1746 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1747 BUG_ON(!list_empty(&ordered_extent->list)); 1673 BUG_ON(!list_empty(&ordered_extent->list));
1748 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1749 if (!ret) { 1675 if (!ret) {
1750 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1751 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1752 BUG_ON(ret); 1680 BUG_ON(ret);
1753 btrfs_end_transaction(trans, root);
1754 } 1681 }
1755 goto out; 1682 goto out;
1756 } 1683 }
1757 1684
1758 lock_extent(io_tree, ordered_extent->file_offset, 1685 lock_extent_bits(io_tree, ordered_extent->file_offset,
1759 ordered_extent->file_offset + ordered_extent->len - 1, 1686 ordered_extent->file_offset + ordered_extent->len - 1,
1760 GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1761 1688
1762 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1763 1692
1764 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1765 compressed = 1; 1694 compressed = 1;
@@ -1784,18 +1713,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1784 ordered_extent->len); 1713 ordered_extent->len);
1785 BUG_ON(ret); 1714 BUG_ON(ret);
1786 } 1715 }
1787 unlock_extent(io_tree, ordered_extent->file_offset, 1716 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1788 ordered_extent->file_offset + ordered_extent->len - 1, 1717 ordered_extent->file_offset +
1789 GFP_NOFS); 1718 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1719
1790 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1791 &ordered_extent->list); 1721 &ordered_extent->list);
1792 1722
1793 /* this also removes the ordered extent from the tree */
1794 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1795 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1796 BUG_ON(ret); 1725 BUG_ON(ret);
1797 btrfs_end_transaction(trans, root);
1798out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1799 /* once for us */ 1730 /* once for us */
1800 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1801 /* once for the tree */ 1732 /* once for the tree */
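btrfs_dec_test_ordered_pending now returns the ordered extent it found, which is why the function ends with two puts: one reference came from the lookup, one belonged to the tree. The refcount discipline in miniature, with illustrative types:

#include <stdio.h>
#include <stdlib.h>

struct ordered { int refs; };

static void put_ordered(struct ordered *o)
{
        if (--o->refs == 0) {
                printf("freed\n");
                free(o);
        }
}

int main(void)
{
        /* one ref held by the tree, one handed to the caller on lookup */
        struct ordered *o = malloc(sizeof(*o));

        o->refs = 2;
        put_ordered(o);         /* once for us */
        put_ordered(o);         /* once for the tree */
        return 0;
}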
@@ -1910,14 +1841,14 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1910 bio->bi_size = 0; 1841 bio->bi_size = 0;
1911 1842
1912 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1843 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1913 if (failed_bio->bi_rw & (1 << BIO_RW)) 1844 if (failed_bio->bi_rw & REQ_WRITE)
1914 rw = WRITE; 1845 rw = WRITE;
1915 else 1846 else
1916 rw = READ; 1847 rw = READ;
1917 1848
1918 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1919 failrec->last_mirror, 1850 failrec->last_mirror,
1920 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1921 return 0; 1852 return 0;
1922} 1853}
1923 1854
@@ -2072,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2072} 2003}
2073 2004
2074/* 2005/*
2006 * calculate the extra metadata reservation needed when snapshotting a
2007 * subvolume that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
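Both snapshot helpers index freed[] with trans->transid & 0x1, so bytes freed during a transaction accumulate in one of two per-parity buckets and count toward the reserve shortfall only for that transaction. A toy model of the indexing; block_rsv locking and the real field layout are ignored:

#include <stdio.h>
#include <stdint.h>

struct block_rsv {
        uint64_t size;
        uint64_t reserved;
        uint64_t freed[2];      /* per-transaction-parity buckets */
};

/* bytes still missing from the rsv, counting what this transid freed */
static uint64_t shortfall(struct block_rsv *rsv, uint64_t transid)
{
        int index = transid & 0x1;
        uint64_t have = rsv->reserved + rsv->freed[index];

        return have < rsv->size ? rsv->size - have : 0;
}

int main(void)
{
        struct block_rsv rsv = { .size = 1 << 20, .reserved = 1 << 19 };

        rsv.freed[1] = 1 << 18; /* freed during an odd transid */
        printf("transid 41 shortfall: %llu\n",
               (unsigned long long)shortfall(&rsv, 41));
        printf("transid 42 shortfall: %llu\n",
               (unsigned long long)shortfall(&rsv, 42));
        return 0;
}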
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
2094 * This is called at transaction commit time. If there are no orphan
2095 * files in the subvolume, it removes the orphan item and frees the
2096 * block_rsv structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
2075 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
2076 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
2126 * NOTE: the caller of this function should reserve 5 units of metadata
2127 * before calling it.
2077 */ 2128 */
2078int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2079{ 2130{
2080 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2081 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2082 2136
2083 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2084 2141
2085 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2086 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2087 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2088 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2089 } 2148 }
2090 2149
2091 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2166 }
2092 2167
2093 spin_unlock(&root->list_lock); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2094 2173
2095 /* 2174 if (block_rsv)
2096 * insert an orphan item to track this unlinked/truncated file 2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2097 */
2098 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2099 2176
2100 return ret; 2177 /* grab metadata reservation from transaction handle */
2178 if (reserve) {
2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2180 BUG_ON(ret);
2181 }
2182
2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
2189 /* insert an orphan item to record that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2101} 2196}
2102 2197
2103/* 2198/*
@@ -2107,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2107int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2108{ 2203{
2109 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2110 int ret = 0; 2207 int ret = 0;
2111 2208
2112 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2113 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2114 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2115 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2116 return 0;
2117 } 2213 }
2118 2214
2119 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2120 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2121 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2122 return 0;
2123 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2124 2220
2125 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2126 2225
2127 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2128 2228
2129 return ret; 2229 return 0;
2130} 2230}
2131 2231
2132/* 2232/*
@@ -2143,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2143 struct inode *inode; 2243 struct inode *inode;
2144 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2145 2245
2146 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2147 return; 2247 return;
2148 2248
2149 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2195,17 +2295,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2195 found_key.objectid = found_key.offset; 2295 found_key.objectid = found_key.offset;
2196 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2197 found_key.offset = 0; 2297 found_key.offset = 0;
2198 inode = btrfs_iget(root->fs_info->sb, &found_key, root); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2199 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2200 break;
2201 2300
2202 /* 2301 /*
2203 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2204 * the proper thing when we hit it 2303 * the proper thing when we hit it
2205 */ 2304 */
2206 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2207 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2208 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2209 2308
2210 /* 2309 /*
2211 * if this is a bad inode, means we actually succeeded in 2310 * if this is a bad inode, means we actually succeeded in
@@ -2214,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2214 * do a destroy_inode 2313 * do a destroy_inode
2215 */ 2314 */
2216 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2217 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2218 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2219 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2220 iput(inode); 2319 iput(inode);
@@ -2232,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2232 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2233 iput(inode); 2332 iput(inode);
2234 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2235 2346
2236 if (nr_unlink) 2347 if (nr_unlink)
2237 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2238 if (nr_truncate) 2349 if (nr_truncate)
2239 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2240
2241 btrfs_free_path(path);
2242} 2351}
2243 2352
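Replacing xchg(&root->clean_orphans, 0) with a cmpxchg against an explicit state enum turns cleanup into a one-shot state machine: only the caller that moves the state from 0 to ORPHAN_CLEANUP_STARTED proceeds, and ORPHAN_CLEANUP_DONE is what btrfs_orphan_commit_root later tests. The same guard in portable C11, standing in for the kernel primitive:

#include <stdio.h>
#include <stdatomic.h>

enum { CLEANUP_IDLE = 0, CLEANUP_STARTED = 1, CLEANUP_DONE = 2 };

static _Atomic int cleanup_state;

static void orphan_cleanup(const char *who)
{
        int expected = CLEANUP_IDLE;

        /* cmpxchg(&state, 0, STARTED): exactly one caller wins */
        if (!atomic_compare_exchange_strong(&cleanup_state, &expected,
                                            CLEANUP_STARTED)) {
                printf("%s: cleanup already started or done\n", who);
                return;
        }

        printf("%s: doing cleanup\n", who);
        atomic_store(&cleanup_state, CLEANUP_DONE);
}

int main(void)
{
        orphan_cleanup("mount");
        orphan_cleanup("lookup");       /* second caller backs off */
        return 0;
}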
2244/* 2353/*
@@ -2557,29 +2666,201 @@ out:
2557 return ret; 2666 return ret;
2558} 2667}
2559 2668
2560static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
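check_path_shared walks every level of the search path and reports sharing as soon as one buffer's extent refcount exceeds 1, since a shared block means the unlink may COW instead of freeing space. A minimal model over an array of per-level refcounts; a max level of 8 is assumed here:

#include <stdio.h>

#define MAX_LEVEL 8

/* refs[i] models the extent refcount of the path node at level i;
 * 0 marks a missing level, mirroring path->nodes[level] == NULL */
static int path_is_shared(const unsigned refs[MAX_LEVEL])
{
        for (int level = 0; level < MAX_LEVEL; level++) {
                if (!refs[level])
                        break;
                if (refs[level] > 1)
                        return 1;
        }
        return 0;
}

int main(void)
{
        unsigned private_path[MAX_LEVEL] = { 1, 1, 1 };
        unsigned snapshotted[MAX_LEVEL]  = { 1, 2, 1 };  /* level 1 shared */

        printf("private: %d\n", path_is_shared(private_path));
        printf("snapshotted: %d\n", path_is_shared(snapshotted));
        return 0;
}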
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs: they do not always free space,
2696 * so in the enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2561{ 2701{
2562 struct btrfs_root *root;
2563 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2564 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2565 int ret; 2711 int ret;
2566 unsigned long nr = 0;
2567 2712
2568 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2569 2716
2570 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2571 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2572 * 1 for orphan 2719
2573 */ 2720 /* check if someone else holds a reference */
2574 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2575 if (ret) 2722 return ERR_PTR(-ENOSPC);
2576 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2577 2726
2578 trans = btrfs_start_transaction(root, 1); 2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2579 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2580 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2581 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2582 } 2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
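The shape of __unlink_start_trans is: try an ordinary reservation first, and fall back to the global reserve only after the lookups above prove the unlink will not need to COW shared blocks; the enospc_unlink flag admits one fallback user at a time. A condensed sketch of that control flow, with the verification reduced to a callback:

#include <stdio.h>
#include <stdbool.h>

enum { OK = 0, ENOSPC_ERR = 1 };

static bool global_rsv_busy;    /* models fs_info->enospc_unlink */

static int try_reserve(int units)
{
        (void)units;
        return ENOSPC_ERR;      /* simulate a full filesystem */
}

static int start_unlink_trans(bool (*will_free_space)(void))
{
        if (try_reserve(10) == OK)
                return OK;                      /* the common path */

        if (!will_free_space())
                return ENOSPC_ERR;              /* might COW shared blocks */

        if (global_rsv_busy)
                return ENOSPC_ERR;              /* one fallback user at a time */
        global_rsv_busy = true;                 /* use the global reserve */
        return OK;
}

static bool nothing_shared(void) { return true; }

int main(void)
{
        printf("unlink trans: %s\n",
               start_unlink_trans(nothing_shared) == OK ? "ok" : "ENOSPC");
        return 0;
}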
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2583 2864
2584 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2585 2866
@@ -2587,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2587 2868
2588 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2589 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2590 2872
2591 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2592 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2593 2877
2594 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2595 2879 __unlink_end_trans(trans, root);
2596 btrfs_end_transaction_throttle(trans, root);
2597 btrfs_unreserve_metadata_space(root, 6);
2598 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2599 return ret; 2881 return ret;
2600} 2882}
@@ -2656,7 +2938,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2656 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2938 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2657 ret = btrfs_update_inode(trans, root, dir); 2939 ret = btrfs_update_inode(trans, root, dir);
2658 BUG_ON(ret); 2940 BUG_ON(ret);
2659 dir->i_sb->s_dirt = 1;
2660 2941
2661 btrfs_free_path(path); 2942 btrfs_free_path(path);
2662 return 0; 2943 return 0;
@@ -2666,7 +2947,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2666{ 2947{
2667 struct inode *inode = dentry->d_inode; 2948 struct inode *inode = dentry->d_inode;
2668 int err = 0; 2949 int err = 0;
2669 int ret;
2670 struct btrfs_root *root = BTRFS_I(dir)->root; 2950 struct btrfs_root *root = BTRFS_I(dir)->root;
2671 struct btrfs_trans_handle *trans; 2951 struct btrfs_trans_handle *trans;
2672 unsigned long nr = 0; 2952 unsigned long nr = 0;
@@ -2675,15 +2955,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2675 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2955 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2676 return -ENOTEMPTY; 2956 return -ENOTEMPTY;
2677 2957
2678 ret = btrfs_reserve_metadata_space(root, 5); 2958 trans = __unlink_start_trans(dir, dentry);
2679 if (ret) 2959 if (IS_ERR(trans))
2680 return ret;
2681
2682 trans = btrfs_start_transaction(root, 1);
2683 if (IS_ERR(trans)) {
2684 btrfs_unreserve_metadata_space(root, 5);
2685 return PTR_ERR(trans); 2960 return PTR_ERR(trans);
2686 }
2687 2961
2688 btrfs_set_trans_block_group(trans, dir); 2962 btrfs_set_trans_block_group(trans, dir);
2689 2963
@@ -2706,12 +2980,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2706 btrfs_i_size_write(inode, 0); 2980 btrfs_i_size_write(inode, 0);
2707out: 2981out:
2708 nr = trans->blocks_used; 2982 nr = trans->blocks_used;
2709 ret = btrfs_end_transaction_throttle(trans, root); 2983 __unlink_end_trans(trans, root);
2710 btrfs_unreserve_metadata_space(root, 5);
2711 btrfs_btree_balance_dirty(root, nr); 2984 btrfs_btree_balance_dirty(root, nr);
2712 2985
2713 if (ret && !err)
2714 err = ret;
2715 return err; 2986 return err;
2716} 2987}
2717 2988
@@ -3108,6 +3379,7 @@ out:
3108 if (pending_del_nr) { 3379 if (pending_del_nr) {
3109 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3380 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3110 pending_del_nr); 3381 pending_del_nr);
3382 BUG_ON(ret);
3111 } 3383 }
3112 btrfs_free_path(path); 3384 btrfs_free_path(path);
3113 return err; 3385 return err;
@@ -3123,6 +3395,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3123 struct btrfs_root *root = BTRFS_I(inode)->root; 3395 struct btrfs_root *root = BTRFS_I(inode)->root;
3124 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3396 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3125 struct btrfs_ordered_extent *ordered; 3397 struct btrfs_ordered_extent *ordered;
3398 struct extent_state *cached_state = NULL;
3126 char *kaddr; 3399 char *kaddr;
3127 u32 blocksize = root->sectorsize; 3400 u32 blocksize = root->sectorsize;
3128 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3401 pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3134,11 +3407,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3134 3407
3135 if ((offset & (blocksize - 1)) == 0) 3408 if ((offset & (blocksize - 1)) == 0)
3136 goto out; 3409 goto out;
3137 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3410 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3138 if (ret)
3139 goto out;
3140
3141 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3142 if (ret) 3411 if (ret)
3143 goto out; 3412 goto out;
3144 3413
@@ -3146,8 +3415,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3146again: 3415again:
3147 page = grab_cache_page(mapping, index); 3416 page = grab_cache_page(mapping, index);
3148 if (!page) { 3417 if (!page) {
3149 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3418 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3150 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3151 goto out; 3419 goto out;
3152 } 3420 }
3153 3421
@@ -3169,12 +3437,14 @@ again:
3169 } 3437 }
3170 wait_on_page_writeback(page); 3438 wait_on_page_writeback(page);
3171 3439
3172 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3440 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
3441 GFP_NOFS);
3173 set_page_extent_mapped(page); 3442 set_page_extent_mapped(page);
3174 3443
3175 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3444 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3176 if (ordered) { 3445 if (ordered) {
3177 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3446 unlock_extent_cached(io_tree, page_start, page_end,
3447 &cached_state, GFP_NOFS);
3178 unlock_page(page); 3448 unlock_page(page);
3179 page_cache_release(page); 3449 page_cache_release(page);
3180 btrfs_start_ordered_extent(inode, ordered, 1); 3450 btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3182,13 +3452,15 @@ again:
3182 goto again; 3452 goto again;
3183 } 3453 }
3184 3454
3185 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3455 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3186 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3456 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3187 GFP_NOFS); 3457 0, 0, &cached_state, GFP_NOFS);
3188 3458
3189 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3459 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3460 &cached_state);
3190 if (ret) { 3461 if (ret) {
3191 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3462 unlock_extent_cached(io_tree, page_start, page_end,
3463 &cached_state, GFP_NOFS);
3192 goto out_unlock; 3464 goto out_unlock;
3193 } 3465 }
3194 3466
@@ -3201,12 +3473,12 @@ again:
3201 } 3473 }
3202 ClearPageChecked(page); 3474 ClearPageChecked(page);
3203 set_page_dirty(page); 3475 set_page_dirty(page);
3204 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3476 unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3477 GFP_NOFS);
3205 3478
3206out_unlock: 3479out_unlock:
3207 if (ret) 3480 if (ret)
3208 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3481 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3209 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3210 unlock_page(page); 3482 unlock_page(page);
3211 page_cache_release(page); 3483 page_cache_release(page);
3212out: 3484out:
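The separate data-space and metadata reservations in btrfs_truncate_page collapse into a single btrfs_delalloc_reserve_space / btrfs_delalloc_release_space pair, so the error path needs one matching release instead of two. The pairing as a pattern, with stand-in byte counters and an assumed fixed per-range metadata cost:

#include <stdio.h>

static long long data_bytes = 1 << 20, meta_bytes = 1 << 16;

static int delalloc_reserve(long long bytes)
{
        long long meta = 4096;  /* assumed per-range metadata cost */

        if (data_bytes < bytes || meta_bytes < meta)
                return -1;
        data_bytes -= bytes;    /* both reservations, one call */
        meta_bytes -= meta;
        return 0;
}

static void delalloc_release(long long bytes)
{
        data_bytes += bytes;    /* the single matching release */
        meta_bytes += 4096;
}

int main(void)
{
        if (delalloc_reserve(4096) == 0) {
                /* ...page setup failed somewhere, unwind once... */
                delalloc_release(4096);
        }
        printf("data=%lld meta=%lld\n", data_bytes, meta_bytes);
        return 0;
}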
@@ -3218,7 +3490,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3218 struct btrfs_trans_handle *trans; 3490 struct btrfs_trans_handle *trans;
3219 struct btrfs_root *root = BTRFS_I(inode)->root; 3491 struct btrfs_root *root = BTRFS_I(inode)->root;
3220 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3492 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3221 struct extent_map *em; 3493 struct extent_map *em = NULL;
3494 struct extent_state *cached_state = NULL;
3222 u64 mask = root->sectorsize - 1; 3495 u64 mask = root->sectorsize - 1;
3223 u64 hole_start = (inode->i_size + mask) & ~mask; 3496 u64 hole_start = (inode->i_size + mask) & ~mask;
3224 u64 block_end = (size + mask) & ~mask; 3497 u64 block_end = (size + mask) & ~mask;
@@ -3234,11 +3507,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3234 struct btrfs_ordered_extent *ordered; 3507 struct btrfs_ordered_extent *ordered;
3235 btrfs_wait_ordered_range(inode, hole_start, 3508 btrfs_wait_ordered_range(inode, hole_start,
3236 block_end - hole_start); 3509 block_end - hole_start);
3237 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3510 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3511 &cached_state, GFP_NOFS);
3238 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3512 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3239 if (!ordered) 3513 if (!ordered)
3240 break; 3514 break;
3241 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3515 unlock_extent_cached(io_tree, hole_start, block_end - 1,
3516 &cached_state, GFP_NOFS);
3242 btrfs_put_ordered_extent(ordered); 3517 btrfs_put_ordered_extent(ordered);
3243 } 3518 }
3244 3519
@@ -3253,11 +3528,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3253 u64 hint_byte = 0; 3528 u64 hint_byte = 0;
3254 hole_size = last_byte - cur_offset; 3529 hole_size = last_byte - cur_offset;
3255 3530
3256 err = btrfs_reserve_metadata_space(root, 2); 3531 trans = btrfs_start_transaction(root, 2);
3257 if (err) 3532 if (IS_ERR(trans)) {
3533 err = PTR_ERR(trans);
3258 break; 3534 break;
3259 3535 }
3260 trans = btrfs_start_transaction(root, 1);
3261 btrfs_set_trans_block_group(trans, inode); 3536 btrfs_set_trans_block_group(trans, inode);
3262 3537
3263 err = btrfs_drop_extents(trans, inode, cur_offset, 3538 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3275,15 +3550,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3275 last_byte - 1, 0); 3550 last_byte - 1, 0);
3276 3551
3277 btrfs_end_transaction(trans, root); 3552 btrfs_end_transaction(trans, root);
3278 btrfs_unreserve_metadata_space(root, 2);
3279 } 3553 }
3280 free_extent_map(em); 3554 free_extent_map(em);
3555 em = NULL;
3281 cur_offset = last_byte; 3556 cur_offset = last_byte;
3282 if (cur_offset >= block_end) 3557 if (cur_offset >= block_end)
3283 break; 3558 break;
3284 } 3559 }
3285 3560
3286 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3561 free_extent_map(em);
3562 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3563 GFP_NOFS);
3287 return err; 3564 return err;
3288} 3565}
3289 3566
@@ -3308,11 +3585,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3308 } 3585 }
3309 } 3586 }
3310 3587
3311 ret = btrfs_reserve_metadata_space(root, 1); 3588 trans = btrfs_start_transaction(root, 5);
3312 if (ret) 3589 if (IS_ERR(trans))
3313 return ret; 3590 return PTR_ERR(trans);
3314 3591
3315 trans = btrfs_start_transaction(root, 1);
3316 btrfs_set_trans_block_group(trans, inode); 3592 btrfs_set_trans_block_group(trans, inode);
3317 3593
3318 ret = btrfs_orphan_add(trans, inode); 3594 ret = btrfs_orphan_add(trans, inode);
@@ -3320,7 +3596,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3320 3596
3321 nr = trans->blocks_used; 3597 nr = trans->blocks_used;
3322 btrfs_end_transaction(trans, root); 3598 btrfs_end_transaction(trans, root);
3323 btrfs_unreserve_metadata_space(root, 1);
3324 btrfs_btree_balance_dirty(root, nr); 3599 btrfs_btree_balance_dirty(root, nr);
3325 3600
3326 if (attr->ia_size > inode->i_size) { 3601 if (attr->ia_size > inode->i_size) {
@@ -3333,8 +3608,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3333 i_size_write(inode, attr->ia_size); 3608 i_size_write(inode, attr->ia_size);
3334 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3609 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3335 3610
3336 trans = btrfs_start_transaction(root, 1); 3611 trans = btrfs_start_transaction(root, 0);
3612 BUG_ON(IS_ERR(trans));
3337 btrfs_set_trans_block_group(trans, inode); 3613 btrfs_set_trans_block_group(trans, inode);
3614 trans->block_rsv = root->orphan_block_rsv;
3615 BUG_ON(!trans->block_rsv);
3338 3616
3339 ret = btrfs_update_inode(trans, root, inode); 3617 ret = btrfs_update_inode(trans, root, inode);
3340 BUG_ON(ret); 3618 BUG_ON(ret);
@@ -3377,17 +3655,19 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3377 if (err) 3655 if (err)
3378 return err; 3656 return err;
3379 } 3657 }
3380 attr->ia_valid &= ~ATTR_SIZE;
3381 3658
3382 if (attr->ia_valid) 3659 if (attr->ia_valid) {
3383 err = inode_setattr(inode, attr); 3660 setattr_copy(inode, attr);
3661 mark_inode_dirty(inode);
3662
3663 if (attr->ia_valid & ATTR_MODE)
3664 err = btrfs_acl_chmod(inode);
3665 }
3384 3666
3385 if (!err && ((attr->ia_valid & ATTR_MODE)))
3386 err = btrfs_acl_chmod(inode);
3387 return err; 3667 return err;
3388} 3668}
3389 3669
3390void btrfs_delete_inode(struct inode *inode) 3670void btrfs_evict_inode(struct inode *inode)
3391{ 3671{
3392 struct btrfs_trans_handle *trans; 3672 struct btrfs_trans_handle *trans;
3393 struct btrfs_root *root = BTRFS_I(inode)->root; 3673 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3395,10 +3675,14 @@ void btrfs_delete_inode(struct inode *inode)
3395 int ret; 3675 int ret;
3396 3676
3397 truncate_inode_pages(&inode->i_data, 0); 3677 truncate_inode_pages(&inode->i_data, 0);
3678 if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
3679 goto no_delete;
3680
3398 if (is_bad_inode(inode)) { 3681 if (is_bad_inode(inode)) {
3399 btrfs_orphan_del(NULL, inode); 3682 btrfs_orphan_del(NULL, inode);
3400 goto no_delete; 3683 goto no_delete;
3401 } 3684 }
3685 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3402 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3686 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3403 3687
3404 if (root->fs_info->log_root_recovering) { 3688 if (root->fs_info->log_root_recovering) {
@@ -3414,10 +3698,21 @@ void btrfs_delete_inode(struct inode *inode)
3414 btrfs_i_size_write(inode, 0); 3698 btrfs_i_size_write(inode, 0);
3415 3699
3416 while (1) { 3700 while (1) {
3417 trans = btrfs_start_transaction(root, 1); 3701 trans = btrfs_start_transaction(root, 0);
3702 BUG_ON(IS_ERR(trans));
3418 btrfs_set_trans_block_group(trans, inode); 3703 btrfs_set_trans_block_group(trans, inode);
3419 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3704 trans->block_rsv = root->orphan_block_rsv;
3420 3705
3706 ret = btrfs_block_rsv_check(trans, root,
3707 root->orphan_block_rsv, 0, 5);
3708 if (ret) {
3709 BUG_ON(ret != -EAGAIN);
3710 ret = btrfs_commit_transaction(trans, root);
3711 BUG_ON(ret);
3712 continue;
3713 }
3714
3715 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3421 if (ret != -EAGAIN) 3716 if (ret != -EAGAIN)
3422 break; 3717 break;
3423 3718
@@ -3425,6 +3720,7 @@ void btrfs_delete_inode(struct inode *inode)
3425 btrfs_end_transaction(trans, root); 3720 btrfs_end_transaction(trans, root);
3426 trans = NULL; 3721 trans = NULL;
3427 btrfs_btree_balance_dirty(root, nr); 3722 btrfs_btree_balance_dirty(root, nr);
3723
3428 } 3724 }
3429 3725
3430 if (ret == 0) { 3726 if (ret == 0) {
@@ -3436,7 +3732,7 @@ void btrfs_delete_inode(struct inode *inode)
3436 btrfs_end_transaction(trans, root); 3732 btrfs_end_transaction(trans, root);
3437 btrfs_btree_balance_dirty(root, nr); 3733 btrfs_btree_balance_dirty(root, nr);
3438no_delete: 3734no_delete:
3439 clear_inode(inode); 3735 end_writeback(inode);
3440 return; 3736 return;
3441} 3737}
3442 3738
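The truncate loop in btrfs_evict_inode now checks the orphan reserve before each pass; when btrfs_block_rsv_check returns -EAGAIN, it commits the transaction to refill the reserve and retries. The retry loop abstracted into userspace; the reserve numbers are arbitrary:

#include <stdio.h>

enum { OK = 0, EAGAIN_ERR = -11 };

static int rsv_check(const int *reserve)
{
        return *reserve < 5 ? EAGAIN_ERR : OK;
}

static void commit_transaction(int *reserve)
{
        *reserve = 8;   /* committing returns pinned space to the rsv */
}

int main(void)
{
        int reserve = 2;
        int passes = 0;

        for (;;) {
                if (rsv_check(&reserve) == EAGAIN_ERR) {
                        commit_transaction(&reserve);
                        continue;       /* retry with a refilled reserve */
                }
                reserve -= 5;           /* one truncate_inode_items pass */
                if (++passes == 1)
                        break;          /* pretend the truncate finished */
        }
        printf("done after %d pass(es), reserve=%d\n", passes, reserve);
        return 0;
}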
@@ -3567,7 +3863,7 @@ again:
3567 p = &parent->rb_right; 3863 p = &parent->rb_right;
3568 else { 3864 else {
3569 WARN_ON(!(entry->vfs_inode.i_state & 3865 WARN_ON(!(entry->vfs_inode.i_state &
3570 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3866 (I_WILL_FREE | I_FREEING)));
3571 rb_erase(parent, &root->inode_tree); 3867 rb_erase(parent, &root->inode_tree);
3572 RB_CLEAR_NODE(parent); 3868 RB_CLEAR_NODE(parent);
3573 spin_unlock(&root->inode_lock); 3869 spin_unlock(&root->inode_lock);
@@ -3646,7 +3942,7 @@ again:
3646 if (atomic_read(&inode->i_count) > 1) 3942 if (atomic_read(&inode->i_count) > 1)
3647 d_prune_aliases(inode); 3943 d_prune_aliases(inode);
3648 /* 3944 /*
3649 * btrfs_drop_inode will remove it from 3945 * btrfs_drop_inode will have it removed from
3650 * the inode cache when its usage count 3946 * the inode cache when its usage count
3651 * hits zero. 3947 * hits zero.
3652 */ 3948 */
@@ -3665,39 +3961,10 @@ again:
3665 return 0; 3961 return 0;
3666} 3962}
3667 3963
3668static noinline void init_btrfs_i(struct inode *inode)
3669{
3670 struct btrfs_inode *bi = BTRFS_I(inode);
3671
3672 bi->generation = 0;
3673 bi->sequence = 0;
3674 bi->last_trans = 0;
3675 bi->last_sub_trans = 0;
3676 bi->logged_trans = 0;
3677 bi->delalloc_bytes = 0;
3678 bi->reserved_bytes = 0;
3679 bi->disk_i_size = 0;
3680 bi->flags = 0;
3681 bi->index_cnt = (u64)-1;
3682 bi->last_unlink_trans = 0;
3683 bi->ordered_data_close = 0;
3684 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3685 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3686 inode->i_mapping, GFP_NOFS);
3687 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3688 inode->i_mapping, GFP_NOFS);
3689 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3690 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3691 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3692 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3693 mutex_init(&BTRFS_I(inode)->log_mutex);
3694}
3695
3696static int btrfs_init_locked_inode(struct inode *inode, void *p) 3964static int btrfs_init_locked_inode(struct inode *inode, void *p)
3697{ 3965{
3698 struct btrfs_iget_args *args = p; 3966 struct btrfs_iget_args *args = p;
3699 inode->i_ino = args->ino; 3967 inode->i_ino = args->ino;
3700 init_btrfs_i(inode);
3701 BTRFS_I(inode)->root = args->root; 3968 BTRFS_I(inode)->root = args->root;
3702 btrfs_set_inode_space_info(args->root, inode); 3969 btrfs_set_inode_space_info(args->root, inode);
3703 return 0; 3970 return 0;
@@ -3729,7 +3996,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
3729 * Returns in *is_new if the inode was read from disk 3996 * Returns in *is_new if the inode was read from disk
3730 */ 3997 */
3731struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3998struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3732 struct btrfs_root *root) 3999 struct btrfs_root *root, int *new)
3733{ 4000{
3734 struct inode *inode; 4001 struct inode *inode;
3735 4002
@@ -3744,6 +4011,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3744 4011
3745 inode_tree_add(inode); 4012 inode_tree_add(inode);
3746 unlock_new_inode(inode); 4013 unlock_new_inode(inode);
4014 if (new)
4015 *new = 1;
3747 } 4016 }
3748 4017
3749 return inode; 4018 return inode;
@@ -3758,8 +4027,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3758 if (!inode) 4027 if (!inode)
3759 return ERR_PTR(-ENOMEM); 4028 return ERR_PTR(-ENOMEM);
3760 4029
3761 init_btrfs_i(inode);
3762
3763 BTRFS_I(inode)->root = root; 4030 BTRFS_I(inode)->root = root;
3764 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4031 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3765 BTRFS_I(inode)->dummy_inode = 1; 4032 BTRFS_I(inode)->dummy_inode = 1;
@@ -3796,7 +4063,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3796 return NULL; 4063 return NULL;
3797 4064
3798 if (location.type == BTRFS_INODE_ITEM_KEY) { 4065 if (location.type == BTRFS_INODE_ITEM_KEY) {
3799 inode = btrfs_iget(dir->i_sb, &location, root); 4066 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3800 return inode; 4067 return inode;
3801 } 4068 }
3802 4069
@@ -3811,7 +4078,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3811 else 4078 else
3812 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4079 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3813 } else { 4080 } else {
3814 inode = btrfs_iget(dir->i_sb, &location, sub_root); 4081 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3815 } 4082 }
3816 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4083 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3817 4084
@@ -4010,16 +4277,16 @@ err:
4010 return ret; 4277 return ret;
4011} 4278}
4012 4279
4013int btrfs_write_inode(struct inode *inode, int wait) 4280int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4014{ 4281{
4015 struct btrfs_root *root = BTRFS_I(inode)->root; 4282 struct btrfs_root *root = BTRFS_I(inode)->root;
4016 struct btrfs_trans_handle *trans; 4283 struct btrfs_trans_handle *trans;
4017 int ret = 0; 4284 int ret = 0;
4018 4285
4019 if (root->fs_info->btree_inode == inode) 4286 if (BTRFS_I(inode)->dummy_inode)
4020 return 0; 4287 return 0;
4021 4288
4022 if (wait) { 4289 if (wbc->sync_mode == WB_SYNC_ALL) {
4023 trans = btrfs_join_transaction(root, 1); 4290 trans = btrfs_join_transaction(root, 1);
4024 btrfs_set_trans_block_group(trans, inode); 4291 btrfs_set_trans_block_group(trans, inode);
4025 ret = btrfs_commit_transaction(trans, root); 4292 ret = btrfs_commit_transaction(trans, root);
@@ -4037,10 +4304,38 @@ void btrfs_dirty_inode(struct inode *inode)
4037{ 4304{
4038 struct btrfs_root *root = BTRFS_I(inode)->root; 4305 struct btrfs_root *root = BTRFS_I(inode)->root;
4039 struct btrfs_trans_handle *trans; 4306 struct btrfs_trans_handle *trans;
4307 int ret;
4308
4309 if (BTRFS_I(inode)->dummy_inode)
4310 return;
4040 4311
4041 trans = btrfs_join_transaction(root, 1); 4312 trans = btrfs_join_transaction(root, 1);
4042 btrfs_set_trans_block_group(trans, inode); 4313 btrfs_set_trans_block_group(trans, inode);
4043 btrfs_update_inode(trans, root, inode); 4314
4315 ret = btrfs_update_inode(trans, root, inode);
4316 if (ret && ret == -ENOSPC) {
4317 /* whoops, lets try again with the full transaction */
4318 btrfs_end_transaction(trans, root);
4319 trans = btrfs_start_transaction(root, 1);
4320 if (IS_ERR(trans)) {
4321 if (printk_ratelimit()) {
4322 printk(KERN_ERR "btrfs: failed to "
4323 "dirty inode %lu error %ld\n",
4324 inode->i_ino, PTR_ERR(trans));
4325 }
4326 return;
4327 }
4328 btrfs_set_trans_block_group(trans, inode);
4329
4330 ret = btrfs_update_inode(trans, root, inode);
4331 if (ret) {
4332 if (printk_ratelimit()) {
4333 printk(KERN_ERR "btrfs: failed to "
4334 "dirty inode %lu error %d\n",
4335 inode->i_ino, ret);
4336 }
4337 }
4338 }
4044 btrfs_end_transaction(trans, root); 4339 btrfs_end_transaction(trans, root);
4045} 4340}
4046 4341
@@ -4158,7 +4453,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4158 * btrfs_get_inode_index_count has an explanation for the magic 4453 * btrfs_get_inode_index_count has an explanation for the magic
4159 * number 4454 * number
4160 */ 4455 */
4161 init_btrfs_i(inode);
4162 BTRFS_I(inode)->index_cnt = 2; 4456 BTRFS_I(inode)->index_cnt = 2;
4163 BTRFS_I(inode)->root = root; 4457 BTRFS_I(inode)->root = root;
4164 BTRFS_I(inode)->generation = trans->transid; 4458 BTRFS_I(inode)->generation = trans->transid;
@@ -4187,16 +4481,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4187 if (ret != 0) 4481 if (ret != 0)
4188 goto fail; 4482 goto fail;
4189 4483
4190 inode->i_uid = current_fsuid(); 4484 inode_init_owner(inode, dir, mode);
4191
4192 if (dir && (dir->i_mode & S_ISGID)) {
4193 inode->i_gid = dir->i_gid;
4194 if (S_ISDIR(mode))
4195 mode |= S_ISGID;
4196 } else
4197 inode->i_gid = current_fsgid();
4198
4199 inode->i_mode = mode;
4200 inode->i_ino = objectid; 4485 inode->i_ino = objectid;
4201 inode_set_bytes(inode, 0); 4486 inode_set_bytes(inode, 0);
4202 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4487 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4322,26 +4607,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4322 if (!new_valid_dev(rdev)) 4607 if (!new_valid_dev(rdev))
4323 return -EINVAL; 4608 return -EINVAL;
4324 4609
4610 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4611 if (err)
4612 return err;
4613
4325 /* 4614 /*
4326 * 2 for inode item and ref 4615 * 2 for inode item and ref
4327 * 2 for dir items 4616 * 2 for dir items
4328 * 1 for xattr if selinux is on 4617 * 1 for xattr if selinux is on
4329 */ 4618 */
4330 err = btrfs_reserve_metadata_space(root, 5); 4619 trans = btrfs_start_transaction(root, 5);
4331 if (err) 4620 if (IS_ERR(trans))
4332 return err; 4621 return PTR_ERR(trans);
4333 4622
4334 trans = btrfs_start_transaction(root, 1);
4335 if (!trans)
4336 goto fail;
4337 btrfs_set_trans_block_group(trans, dir); 4623 btrfs_set_trans_block_group(trans, dir);
4338 4624
4339 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4340 if (err) {
4341 err = -ENOSPC;
4342 goto out_unlock;
4343 }
4344
4345 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4625 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4346 dentry->d_name.len, 4626 dentry->d_name.len,
4347 dentry->d_parent->d_inode->i_ino, objectid, 4627 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4370,13 +4650,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4370out_unlock: 4650out_unlock:
4371 nr = trans->blocks_used; 4651 nr = trans->blocks_used;
4372 btrfs_end_transaction_throttle(trans, root); 4652 btrfs_end_transaction_throttle(trans, root);
4373fail: 4653 btrfs_btree_balance_dirty(root, nr);
4374 btrfs_unreserve_metadata_space(root, 5);
4375 if (drop_inode) { 4654 if (drop_inode) {
4376 inode_dec_link_count(inode); 4655 inode_dec_link_count(inode);
4377 iput(inode); 4656 iput(inode);
4378 } 4657 }
4379 btrfs_btree_balance_dirty(root, nr);
4380 return err; 4658 return err;
4381} 4659}
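This hunk is the template for the btrfs_create/btrfs_link/btrfs_mkdir/btrfs_symlink conversions that follow: the separate btrfs_reserve_metadata_space(root, N) call disappears, and the item count moves into btrfs_start_transaction(root, N), which now reserves up front and reports failure through ERR_PTR() instead of returning NULL. Note too that btrfs_find_free_objectid() is now called with a NULL transaction before the handle is taken, so a failed lookup costs nothing. A condensed before/after for reference (illustrative only):

	/* before: reserve, start, and unreserve on every exit path */
	err = btrfs_reserve_metadata_space(root, 5);
	if (err)
		return err;
	trans = btrfs_start_transaction(root, 1);

	/* after: one call, with the reservation tied to the handle */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);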
4382 4660
@@ -4386,32 +4664,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4386 struct btrfs_trans_handle *trans; 4664 struct btrfs_trans_handle *trans;
4387 struct btrfs_root *root = BTRFS_I(dir)->root; 4665 struct btrfs_root *root = BTRFS_I(dir)->root;
4388 struct inode *inode = NULL; 4666 struct inode *inode = NULL;
4389 int err;
4390 int drop_inode = 0; 4667 int drop_inode = 0;
4668 int err;
4391 unsigned long nr = 0; 4669 unsigned long nr = 0;
4392 u64 objectid; 4670 u64 objectid;
4393 u64 index = 0; 4671 u64 index = 0;
4394 4672
4673 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4674 if (err)
4675 return err;
4395 /* 4676 /*
4396 * 2 for inode item and ref 4677 * 2 for inode item and ref
4397 * 2 for dir items 4678 * 2 for dir items
4398 * 1 for xattr if selinux is on 4679 * 1 for xattr if selinux is on
4399 */ 4680 */
4400 err = btrfs_reserve_metadata_space(root, 5); 4681 trans = btrfs_start_transaction(root, 5);
4401 if (err) 4682 if (IS_ERR(trans))
4402 return err; 4683 return PTR_ERR(trans);
4403 4684
4404 trans = btrfs_start_transaction(root, 1);
4405 if (!trans)
4406 goto fail;
4407 btrfs_set_trans_block_group(trans, dir); 4685 btrfs_set_trans_block_group(trans, dir);
4408 4686
4409 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4410 if (err) {
4411 err = -ENOSPC;
4412 goto out_unlock;
4413 }
4414
4415 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4687 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4416 dentry->d_name.len, 4688 dentry->d_name.len,
4417 dentry->d_parent->d_inode->i_ino, 4689 dentry->d_parent->d_inode->i_ino,
@@ -4443,8 +4715,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4443out_unlock: 4715out_unlock:
4444 nr = trans->blocks_used; 4716 nr = trans->blocks_used;
4445 btrfs_end_transaction_throttle(trans, root); 4717 btrfs_end_transaction_throttle(trans, root);
4446fail:
4447 btrfs_unreserve_metadata_space(root, 5);
4448 if (drop_inode) { 4718 if (drop_inode) {
4449 inode_dec_link_count(inode); 4719 inode_dec_link_count(inode);
4450 iput(inode); 4720 iput(inode);
@@ -4471,21 +4741,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4471 if (root->objectid != BTRFS_I(inode)->root->objectid) 4741 if (root->objectid != BTRFS_I(inode)->root->objectid)
4472 return -EPERM; 4742 return -EPERM;
4473 4743
4474 /*
4475 * 1 item for inode ref
4476 * 2 items for dir items
4477 */
4478 err = btrfs_reserve_metadata_space(root, 3);
4479 if (err)
4480 return err;
4481
4482 btrfs_inc_nlink(inode); 4744 btrfs_inc_nlink(inode);
4483 4745
4484 err = btrfs_set_inode_index(dir, &index); 4746 err = btrfs_set_inode_index(dir, &index);
4485 if (err) 4747 if (err)
4486 goto fail; 4748 goto fail;
4487 4749
4488 trans = btrfs_start_transaction(root, 1); 4750 /*
4751 * 1 item for inode ref
4752 * 2 items for dir items
4753 */
4754 trans = btrfs_start_transaction(root, 3);
4755 if (IS_ERR(trans)) {
4756 err = PTR_ERR(trans);
4757 goto fail;
4758 }
4489 4759
4490 btrfs_set_trans_block_group(trans, dir); 4760 btrfs_set_trans_block_group(trans, dir);
4491 atomic_inc(&inode->i_count); 4761 atomic_inc(&inode->i_count);
@@ -4504,7 +4774,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4504 nr = trans->blocks_used; 4774 nr = trans->blocks_used;
4505 btrfs_end_transaction_throttle(trans, root); 4775 btrfs_end_transaction_throttle(trans, root);
4506fail: 4776fail:
4507 btrfs_unreserve_metadata_space(root, 3);
4508 if (drop_inode) { 4777 if (drop_inode) {
4509 inode_dec_link_count(inode); 4778 inode_dec_link_count(inode);
4510 iput(inode); 4779 iput(inode);
@@ -4524,28 +4793,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4524 u64 index = 0; 4793 u64 index = 0;
4525 unsigned long nr = 1; 4794 unsigned long nr = 1;
4526 4795
4796 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4797 if (err)
4798 return err;
4799
4527 /* 4800 /*
4528 * 2 items for inode and ref 4801 * 2 items for inode and ref
4529 * 2 items for dir items 4802 * 2 items for dir items
4530 * 1 for xattr if selinux is on 4803 * 1 for xattr if selinux is on
4531 */ 4804 */
4532 err = btrfs_reserve_metadata_space(root, 5); 4805 trans = btrfs_start_transaction(root, 5);
4533 if (err) 4806 if (IS_ERR(trans))
4534 return err; 4807 return PTR_ERR(trans);
4535
4536 trans = btrfs_start_transaction(root, 1);
4537 if (!trans) {
4538 err = -ENOMEM;
4539 goto out_unlock;
4540 }
4541 btrfs_set_trans_block_group(trans, dir); 4808 btrfs_set_trans_block_group(trans, dir);
4542 4809
4543 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4544 if (err) {
4545 err = -ENOSPC;
4546 goto out_unlock;
4547 }
4548
4549 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4810 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4550 dentry->d_name.len, 4811 dentry->d_name.len,
4551 dentry->d_parent->d_inode->i_ino, objectid, 4812 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4585,9 +4846,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4585out_fail: 4846out_fail:
4586 nr = trans->blocks_used; 4847 nr = trans->blocks_used;
4587 btrfs_end_transaction_throttle(trans, root); 4848 btrfs_end_transaction_throttle(trans, root);
4588
4589out_unlock:
4590 btrfs_unreserve_metadata_space(root, 5);
4591 if (drop_on_err) 4849 if (drop_on_err)
4592 iput(inode); 4850 iput(inode);
4593 btrfs_btree_balance_dirty(root, nr); 4851 btrfs_btree_balance_dirty(root, nr);
@@ -4845,6 +5103,7 @@ again:
4845 } 5103 }
4846 flush_dcache_page(page); 5104 flush_dcache_page(page);
4847 } else if (create && PageUptodate(page)) { 5105 } else if (create && PageUptodate(page)) {
5106 WARN_ON(1);
4848 if (!trans) { 5107 if (!trans) {
4849 kunmap(page); 5108 kunmap(page);
4850 free_extent_map(em); 5109 free_extent_map(em);
@@ -4941,11 +5200,651 @@ out:
4941 return em; 5200 return em;
4942} 5201}
4943 5202
5203static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5204 u64 start, u64 len)
5205{
5206 struct btrfs_root *root = BTRFS_I(inode)->root;
5207 struct btrfs_trans_handle *trans;
5208 struct extent_map *em;
5209 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5210 struct btrfs_key ins;
5211 u64 alloc_hint;
5212 int ret;
5213
5214 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5215
5216 trans = btrfs_join_transaction(root, 0);
5217 if (!trans)
5218 return ERR_PTR(-ENOMEM);
5219
5220 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5221
5222 alloc_hint = get_extent_allocation_hint(inode, start, len);
5223 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5224 alloc_hint, (u64)-1, &ins, 1);
5225 if (ret) {
5226 em = ERR_PTR(ret);
5227 goto out;
5228 }
5229
5230 em = alloc_extent_map(GFP_NOFS);
5231 if (!em) {
5232 em = ERR_PTR(-ENOMEM);
5233 goto out;
5234 }
5235
5236 em->start = start;
5237 em->orig_start = em->start;
5238 em->len = ins.offset;
5239
5240 em->block_start = ins.objectid;
5241 em->block_len = ins.offset;
5242 em->bdev = root->fs_info->fs_devices->latest_bdev;
5243 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5244
5245 while (1) {
5246 write_lock(&em_tree->lock);
5247 ret = add_extent_mapping(em_tree, em);
5248 write_unlock(&em_tree->lock);
5249 if (ret != -EEXIST)
5250 break;
5251 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5252 }
5253
5254 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5255 ins.offset, ins.offset, 0);
5256 if (ret) {
5257 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5258 em = ERR_PTR(ret);
5259 }
5260out:
5261 btrfs_end_transaction(trans, root);
5262 return em;
5263}
5264
5265/*
5266 * returns 1 when the nocow is safe, < 0 on error, 0 if the
5267 * block must be cow'd
5268 */
5269static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5270 struct inode *inode, u64 offset, u64 len)
5271{
5272 struct btrfs_path *path;
5273 int ret;
5274 struct extent_buffer *leaf;
5275 struct btrfs_root *root = BTRFS_I(inode)->root;
5276 struct btrfs_file_extent_item *fi;
5277 struct btrfs_key key;
5278 u64 disk_bytenr;
5279 u64 backref_offset;
5280 u64 extent_end;
5281 u64 num_bytes;
5282 int slot;
5283 int found_type;
5284
5285 path = btrfs_alloc_path();
5286 if (!path)
5287 return -ENOMEM;
5288
5289 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5290 offset, 0);
5291 if (ret < 0)
5292 goto out;
5293
5294 slot = path->slots[0];
5295 if (ret == 1) {
5296 if (slot == 0) {
5297 /* can't find the item, must cow */
5298 ret = 0;
5299 goto out;
5300 }
5301 slot--;
5302 }
5303 ret = 0;
5304 leaf = path->nodes[0];
5305 btrfs_item_key_to_cpu(leaf, &key, slot);
5306 if (key.objectid != inode->i_ino ||
5307 key.type != BTRFS_EXTENT_DATA_KEY) {
5308 /* not our file or wrong item type, must cow */
5309 goto out;
5310 }
5311
5312 if (key.offset > offset) {
5313 /* Wrong offset, must cow */
5314 goto out;
5315 }
5316
5317 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5318 found_type = btrfs_file_extent_type(leaf, fi);
5319 if (found_type != BTRFS_FILE_EXTENT_REG &&
5320 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5321 /* not a regular extent, must cow */
5322 goto out;
5323 }
5324 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5325 backref_offset = btrfs_file_extent_offset(leaf, fi);
5326
5327 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5328 if (extent_end < offset + len) {
5329 /* extent doesn't include our full range, must cow */
5330 goto out;
5331 }
5332
5333 if (btrfs_extent_readonly(root, disk_bytenr))
5334 goto out;
5335
5336 /*
5337 * look for other files referencing this extent, if we
5338 * find any we must cow
5339 */
5340 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5341 key.offset - backref_offset, disk_bytenr))
5342 goto out;
5343
5344 /*
5345 * adjust disk_bytenr and num_bytes to cover just the bytes
5346 * in this extent we are about to write. If there
5347 * are any csums in that range we have to cow in order
5348 * to keep the csums correct
5349 */
5350 disk_bytenr += backref_offset;
5351 disk_bytenr += offset - key.offset;
5352 num_bytes = min(offset + len, extent_end) - offset;
5353 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5354 goto out;
5355 /*
5356 * all of the above have passed, it is safe to overwrite this extent
5357 * without cow
5358 */
5359 ret = 1;
5360out:
5361 btrfs_free_path(path);
5362 return ret;
5363}
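To make the final disk_bytenr/num_bytes adjustment concrete, here is a worked example with made-up numbers (none of these values appear in the patch): an extent item at key.offset = 0 with backref_offset = 0, stored at 1MB on disk and covering file range [0, 128K), hit by an 8K direct write at file offset 16K.

	/* illustrative values only */
	u64 disk_bytenr = 1048576;	/* extent lives at 1MB on disk */
	u64 key_offset = 0;		/* extent item starts at file pos 0 */
	u64 backref_offset = 0;
	u64 extent_end = 131072;	/* extent covers [0, 128K) */
	u64 offset = 16384;		/* 8K direct write at 16K */
	u64 len = 8192;
	u64 num_bytes;

	disk_bytenr += backref_offset;
	disk_bytenr += offset - key_offset;			/* 1MB + 16K */
	num_bytes = min(offset + len, extent_end) - offset;	/* 8K */
	/* csum_exist_in_range() is then asked about the 8K at 1MB + 16K;
	 * any checksum found there forces the COW path instead */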
5364
5365static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5366 struct buffer_head *bh_result, int create)
5367{
5368 struct extent_map *em;
5369 struct btrfs_root *root = BTRFS_I(inode)->root;
5370 u64 start = iblock << inode->i_blkbits;
5371 u64 len = bh_result->b_size;
5372 struct btrfs_trans_handle *trans;
5373
5374 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5375 if (IS_ERR(em))
5376 return PTR_ERR(em);
5377
5378 /*
5379 * Ok, for INLINE and COMPRESSED extents we need to fall back on buffered
5380 * io. INLINE is special, and we could probably kludge it in here, but
5381 * it's still buffered so for safety let's just fall back to the generic
5382 * buffered path.
5383 *
5384 * For COMPRESSED we _have_ to read the entire extent in so we can
5385 * decompress it, so there will be buffering required no matter what we
5386 * do, so go ahead and fallback to buffered.
5387 *
5388 * We return -ENOTBLK because that's what makes DIO go ahead and go back
5389 * to buffered IO. Don't blame me, this is the price we pay for using
5390 * the generic code.
5391 */
5392 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5393 em->block_start == EXTENT_MAP_INLINE) {
5394 free_extent_map(em);
5395 return -ENOTBLK;
5396 }
5397
5398 /* Just a good old-fashioned hole, return */
5399 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5400 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5401 free_extent_map(em);
5402 /* DIO will do one hole at a time, so just unlock a sector */
5403 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5404 start + root->sectorsize - 1, GFP_NOFS);
5405 return 0;
5406 }
5407
5408 /*
5409 * We don't allocate a new extent in the following cases
5410 *
5411 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5412 * existing extent.
5413 * 2) The extent is marked as PREALLOC. We're good to go here and can
5414 * just use the extent.
5415 *
5416 */
5417 if (!create) {
5418 len = em->len - (start - em->start);
5419 goto map;
5420 }
5421
5422 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5423 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5424 em->block_start != EXTENT_MAP_HOLE)) {
5425 int type;
5426 int ret;
5427 u64 block_start;
5428
5429 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5430 type = BTRFS_ORDERED_PREALLOC;
5431 else
5432 type = BTRFS_ORDERED_NOCOW;
5433 len = min(len, em->len - (start - em->start));
5434 block_start = em->block_start + (start - em->start);
5435
5436 /*
5437 * we're not going to log anything, but we do need
5438 * to make sure the current transaction stays open
5439 * while we look for nocow cross refs
5440 */
5441 trans = btrfs_join_transaction(root, 0);
5442 if (!trans)
5443 goto must_cow;
5444
5445 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5446 ret = btrfs_add_ordered_extent_dio(inode, start,
5447 block_start, len, len, type);
5448 btrfs_end_transaction(trans, root);
5449 if (ret) {
5450 free_extent_map(em);
5451 return ret;
5452 }
5453 goto unlock;
5454 }
5455 btrfs_end_transaction(trans, root);
5456 }
5457must_cow:
5458 /*
5459 * this will cow the extent, reset the len in case we changed
5460 * it above
5461 */
5462 len = bh_result->b_size;
5463 free_extent_map(em);
5464 em = btrfs_new_extent_direct(inode, start, len);
5465 if (IS_ERR(em))
5466 return PTR_ERR(em);
5467 len = min(len, em->len - (start - em->start));
5468unlock:
5469 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5470 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5471 0, NULL, GFP_NOFS);
5472map:
5473 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5474 inode->i_blkbits;
5475 bh_result->b_size = len;
5476 bh_result->b_bdev = em->bdev;
5477 set_buffer_mapped(bh_result);
5478 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5479 set_buffer_new(bh_result);
5480
5481 free_extent_map(em);
5482
5483 return 0;
5484}
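The map: label converts the byte address back into the block units the buffer_head layer expects. Reusing the illustrative numbers from above (an extent at 1MB on disk covering file range [0, 128K), a request at start = 16K, and 4K blocks, i.e. i_blkbits = 12):

	/* illustrative: em->block_start = 1048576, em->start = 0,
	 * start = 16384, inode->i_blkbits = 12 */
	bh_result->b_blocknr = (1048576 + (16384 - 0)) >> 12;	/* == 260 */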
5485
5486struct btrfs_dio_private {
5487 struct inode *inode;
5488 u64 logical_offset;
5489 u64 disk_bytenr;
5490 u64 bytes;
5491 u32 *csums;
5492 void *private;
5493};
5494
5495static void btrfs_endio_direct_read(struct bio *bio, int err)
5496{
5497 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5498 struct bio_vec *bvec = bio->bi_io_vec;
5499 struct btrfs_dio_private *dip = bio->bi_private;
5500 struct inode *inode = dip->inode;
5501 struct btrfs_root *root = BTRFS_I(inode)->root;
5502 u64 start;
5503 u32 *private = dip->csums;
5504
5505 start = dip->logical_offset;
5506 do {
5507 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5508 struct page *page = bvec->bv_page;
5509 char *kaddr;
5510 u32 csum = ~(u32)0;
5511 unsigned long flags;
5512
5513 local_irq_save(flags);
5514 kaddr = kmap_atomic(page, KM_IRQ0);
5515 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5516 csum, bvec->bv_len);
5517 btrfs_csum_final(csum, (char *)&csum);
5518 kunmap_atomic(kaddr, KM_IRQ0);
5519 local_irq_restore(flags);
5520
5521 flush_dcache_page(bvec->bv_page);
5522 if (csum != *private) {
5523 printk(KERN_ERR "btrfs csum failed ino %lu off"
5524 " %llu csum %u private %u\n",
5525 inode->i_ino, (unsigned long long)start,
5526 csum, *private);
5527 err = -EIO;
5528 }
5529 }
5530
5531 start += bvec->bv_len;
5532 private++;
5533 bvec++;
5534 } while (bvec <= bvec_end);
5535
5536 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5537 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5538 bio->bi_private = dip->private;
5539
5540 kfree(dip->csums);
5541 kfree(dip);
5542 dio_end_io(bio, err);
5543}
5544
5545static void btrfs_endio_direct_write(struct bio *bio, int err)
5546{
5547 struct btrfs_dio_private *dip = bio->bi_private;
5548 struct inode *inode = dip->inode;
5549 struct btrfs_root *root = BTRFS_I(inode)->root;
5550 struct btrfs_trans_handle *trans;
5551 struct btrfs_ordered_extent *ordered = NULL;
5552 struct extent_state *cached_state = NULL;
5553 int ret;
5554
5555 if (err)
5556 goto out_done;
5557
5558 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5559 dip->logical_offset, dip->bytes);
5560 if (!ret)
5561 goto out_done;
5562
5563 BUG_ON(!ordered);
5564
5565 trans = btrfs_join_transaction(root, 1);
5566 if (!trans) {
5567 err = -ENOMEM;
5568 goto out;
5569 }
5570 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5571
5572 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5573 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5574 if (!ret)
5575 ret = btrfs_update_inode(trans, root, inode);
5576 err = ret;
5577 goto out;
5578 }
5579
5580 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5581 ordered->file_offset + ordered->len - 1, 0,
5582 &cached_state, GFP_NOFS);
5583
5584 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5585 ret = btrfs_mark_extent_written(trans, inode,
5586 ordered->file_offset,
5587 ordered->file_offset +
5588 ordered->len);
5589 if (ret) {
5590 err = ret;
5591 goto out_unlock;
5592 }
5593 } else {
5594 ret = insert_reserved_file_extent(trans, inode,
5595 ordered->file_offset,
5596 ordered->start,
5597 ordered->disk_len,
5598 ordered->len,
5599 ordered->len,
5600 0, 0, 0,
5601 BTRFS_FILE_EXTENT_REG);
5602 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5603 ordered->file_offset, ordered->len);
5604 if (ret) {
5605 err = ret;
5606 WARN_ON(1);
5607 goto out_unlock;
5608 }
5609 }
5610
5611 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5612 btrfs_ordered_update_i_size(inode, 0, ordered);
5613 btrfs_update_inode(trans, root, inode);
5614out_unlock:
5615 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5616 ordered->file_offset + ordered->len - 1,
5617 &cached_state, GFP_NOFS);
5618out:
5619 btrfs_delalloc_release_metadata(inode, ordered->len);
5620 btrfs_end_transaction(trans, root);
5621 btrfs_put_ordered_extent(ordered);
5622 btrfs_put_ordered_extent(ordered);
5623out_done:
5624 bio->bi_private = dip->private;
5625
5626 kfree(dip->csums);
5627 kfree(dip);
5628 dio_end_io(bio, err);
5629}
5630
5631static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5632 struct bio *bio, int mirror_num,
5633 unsigned long bio_flags, u64 offset)
5634{
5635 int ret;
5636 struct btrfs_root *root = BTRFS_I(inode)->root;
5637 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5638 BUG_ON(ret);
5639 return 0;
5640}
5641
5642static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5643 loff_t file_offset)
5644{
5645 struct btrfs_root *root = BTRFS_I(inode)->root;
5646 struct btrfs_dio_private *dip;
5647 struct bio_vec *bvec = bio->bi_io_vec;
5648 u64 start;
5649 int skip_sum;
5650 int write = rw & REQ_WRITE;
5651 int ret = 0;
5652
5653 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5654
5655 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5656 if (!dip) {
5657 ret = -ENOMEM;
5658 goto free_ordered;
5659 }
5660 dip->csums = NULL;
5661
5662 if (!skip_sum) {
5663 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5664 if (!dip->csums) {
5665 ret = -ENOMEM;
5666 goto free_ordered;
5667 }
5668 }
5669
5670 dip->private = bio->bi_private;
5671 dip->inode = inode;
5672 dip->logical_offset = file_offset;
5673
5674 start = dip->logical_offset;
5675 dip->bytes = 0;
5676 do {
5677 dip->bytes += bvec->bv_len;
5678 bvec++;
5679 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5680
5681 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5682 bio->bi_private = dip;
5683
5684 if (write)
5685 bio->bi_end_io = btrfs_endio_direct_write;
5686 else
5687 bio->bi_end_io = btrfs_endio_direct_read;
5688
5689 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5690 if (ret)
5691 goto out_err;
5692
5693 if (write && !skip_sum) {
5694 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5695 inode, rw, bio, 0, 0,
5696 dip->logical_offset,
5697 __btrfs_submit_bio_start_direct_io,
5698 __btrfs_submit_bio_done);
5699 if (ret)
5700 goto out_err;
5701 return;
5702 } else if (!skip_sum)
5703 btrfs_lookup_bio_sums_dio(root, inode, bio,
5704 dip->logical_offset, dip->csums);
5705
5706 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5707 if (ret)
5708 goto out_err;
5709 return;
5710out_err:
5711 kfree(dip->csums);
5712 kfree(dip);
5713free_ordered:
5714 /*
5715 * If this is a write, we need to clean up the reserved space and kill
5716 * the ordered extent.
5717 */
5718 if (write) {
5719 struct btrfs_ordered_extent *ordered;
5720 ordered = btrfs_lookup_ordered_extent(inode,
5721 dip->logical_offset);
5722 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5723 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5724 btrfs_free_reserved_extent(root, ordered->start,
5725 ordered->disk_len);
5726 btrfs_put_ordered_extent(ordered);
5727 btrfs_put_ordered_extent(ordered);
5728 }
5729 bio_endio(bio, ret);
5730}
5731
5732static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5733 const struct iovec *iov, loff_t offset,
5734 unsigned long nr_segs)
5735{
5736 int seg;
5737 size_t size;
5738 unsigned long addr;
5739 unsigned blocksize_mask = root->sectorsize - 1;
5740 ssize_t retval = -EINVAL;
5741 loff_t end = offset;
5742
5743 if (offset & blocksize_mask)
5744 goto out;
5745
5746 /* Check the memory alignment. Blocks cannot straddle pages */
5747 for (seg = 0; seg < nr_segs; seg++) {
5748 addr = (unsigned long)iov[seg].iov_base;
5749 size = iov[seg].iov_len;
5750 end += size;
5751 if ((addr & blocksize_mask) || (size & blocksize_mask))
5752 goto out;
5753 }
5754 retval = 0;
5755out:
5756 return retval;
5757}
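check_direct_IO() encodes the DIO contract: the file offset and every iovec's base address and length must be sectorsize-aligned, so no block ever straddles a page. A passing and a failing layout, assuming a 4096-byte sectorsize and a page-aligned char buffer named buf (both assumptions, not from the patch):

	struct iovec ok[2] = {
		{ .iov_base = buf,        .iov_len = 8192 },	/* passes */
		{ .iov_base = buf + 8192, .iov_len = 4096 },	/* passes */
	};
	struct iovec bad = {
		.iov_base = buf + 512, .iov_len = 2048		/* -EINVAL */
	};

When the check fails, btrfs_direct_IO() below returns 0, which is what tells the generic DIO layer to fall back to buffered IO.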
4944static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5758static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4945 const struct iovec *iov, loff_t offset, 5759 const struct iovec *iov, loff_t offset,
4946 unsigned long nr_segs) 5760 unsigned long nr_segs)
4947{ 5761{
4948 return -EINVAL; 5762 struct file *file = iocb->ki_filp;
5763 struct inode *inode = file->f_mapping->host;
5764 struct btrfs_ordered_extent *ordered;
5765 struct extent_state *cached_state = NULL;
5766 u64 lockstart, lockend;
5767 ssize_t ret;
5768 int writing = rw & WRITE;
5769 int write_bits = 0;
5770 size_t count = iov_length(iov, nr_segs);
5771
5772 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5773 offset, nr_segs)) {
5774 return 0;
5775 }
5776
5777 lockstart = offset;
5778 lockend = offset + count - 1;
5779
5780 if (writing) {
5781 ret = btrfs_delalloc_reserve_space(inode, count);
5782 if (ret)
5783 goto out;
5784 }
5785
5786 while (1) {
5787 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5788 0, &cached_state, GFP_NOFS);
5789 /*
5790 * We're concerned with the entire range that we're going to be
5792 * doing DIO to, so we need to make sure there are no ordered
5792 * extents in this range.
5793 */
5794 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5795 lockend - lockstart + 1);
5796 if (!ordered)
5797 break;
5798 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5799 &cached_state, GFP_NOFS);
5800 btrfs_start_ordered_extent(inode, ordered, 1);
5801 btrfs_put_ordered_extent(ordered);
5802 cond_resched();
5803 }
5804
5805 /*
5806 * we don't use btrfs_set_extent_delalloc because we don't want
5807 * the dirty or uptodate bits
5808 */
5809 if (writing) {
5810 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5811 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5812 EXTENT_DELALLOC, 0, NULL, &cached_state,
5813 GFP_NOFS);
5814 if (ret) {
5815 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5816 lockend, EXTENT_LOCKED | write_bits,
5817 1, 0, &cached_state, GFP_NOFS);
5818 goto out;
5819 }
5820 }
5821
5822 free_extent_state(cached_state);
5823 cached_state = NULL;
5824
5825 ret = __blockdev_direct_IO(rw, iocb, inode,
5826 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5827 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5828 btrfs_submit_direct, 0);
5829
5830 if (ret < 0 && ret != -EIOCBQUEUED) {
5831 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5832 offset + iov_length(iov, nr_segs) - 1,
5833 EXTENT_LOCKED | write_bits, 1, 0,
5834 &cached_state, GFP_NOFS);
5835 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5836 /*
5837 * We're falling back to buffered, unlock the section we didn't
5838 * do IO on.
5839 */
5840 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5841 offset + iov_length(iov, nr_segs) - 1,
5842 EXTENT_LOCKED | write_bits, 1, 0,
5843 &cached_state, GFP_NOFS);
5844 }
5845out:
5846 free_extent_state(cached_state);
5847 return ret;
4949} 5848}
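The two clear_extent_bit() calls at the end distinguish total failure from a short DIO. Worked numbers (illustrative): for a 1MB write at offset 0, a short completion leaves the tail still carrying the EXTENT_LOCKED and write bits set earlier, and it is cleared here so the buffered fallback can relock it.

	/* illustrative: offset = 0, iov_length(iov, nr_segs) = 1048576 */
	ssize_t ret = 524288;	/* short completion from __blockdev_direct_IO */
	/* tail handed back to buffered IO and cleared here:
	 *   [offset + ret, offset + 1048576 - 1] = [524288, 1048575] */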
4950 5849
4951static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5850static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5021,6 +5920,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5021{ 5920{
5022 struct extent_io_tree *tree; 5921 struct extent_io_tree *tree;
5023 struct btrfs_ordered_extent *ordered; 5922 struct btrfs_ordered_extent *ordered;
5923 struct extent_state *cached_state = NULL;
5024 u64 page_start = page_offset(page); 5924 u64 page_start = page_offset(page);
5025 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 5925 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
5026 5926
@@ -5039,7 +5939,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5039 btrfs_releasepage(page, GFP_NOFS); 5939 btrfs_releasepage(page, GFP_NOFS);
5040 return; 5940 return;
5041 } 5941 }
5042 lock_extent(tree, page_start, page_end, GFP_NOFS); 5942 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5943 GFP_NOFS);
5043 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 5944 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5044 page_offset(page)); 5945 page_offset(page));
5045 if (ordered) { 5946 if (ordered) {
@@ -5050,7 +5951,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5050 clear_extent_bit(tree, page_start, page_end, 5951 clear_extent_bit(tree, page_start, page_end,
5051 EXTENT_DIRTY | EXTENT_DELALLOC | 5952 EXTENT_DIRTY | EXTENT_DELALLOC |
5052 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 5953 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5053 NULL, GFP_NOFS); 5954 &cached_state, GFP_NOFS);
5054 /* 5955 /*
5055 * whoever cleared the private bit is responsible 5956 * whoever cleared the private bit is responsible
5056 * for the finish_ordered_io 5957 * for the finish_ordered_io
@@ -5060,11 +5961,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5060 page_start, page_end); 5961 page_start, page_end);
5061 } 5962 }
5062 btrfs_put_ordered_extent(ordered); 5963 btrfs_put_ordered_extent(ordered);
5063 lock_extent(tree, page_start, page_end, GFP_NOFS); 5964 cached_state = NULL;
5965 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5966 GFP_NOFS);
5064 } 5967 }
5065 clear_extent_bit(tree, page_start, page_end, 5968 clear_extent_bit(tree, page_start, page_end,
5066 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 5969 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5067 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 5970 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5068 __btrfs_releasepage(page, GFP_NOFS); 5971 __btrfs_releasepage(page, GFP_NOFS);
5069 5972
5070 ClearPageChecked(page); 5973 ClearPageChecked(page);
@@ -5097,6 +6000,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5097 struct btrfs_root *root = BTRFS_I(inode)->root; 6000 struct btrfs_root *root = BTRFS_I(inode)->root;
5098 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6001 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5099 struct btrfs_ordered_extent *ordered; 6002 struct btrfs_ordered_extent *ordered;
6003 struct extent_state *cached_state = NULL;
5100 char *kaddr; 6004 char *kaddr;
5101 unsigned long zero_start; 6005 unsigned long zero_start;
5102 loff_t size; 6006 loff_t size;
@@ -5104,7 +6008,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5104 u64 page_start; 6008 u64 page_start;
5105 u64 page_end; 6009 u64 page_end;
5106 6010
5107 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6011 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5108 if (ret) { 6012 if (ret) {
5109 if (ret == -ENOMEM) 6013 if (ret == -ENOMEM)
5110 ret = VM_FAULT_OOM; 6014 ret = VM_FAULT_OOM;
@@ -5113,13 +6017,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5113 goto out; 6017 goto out;
5114 } 6018 }
5115 6019
5116 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5117 if (ret) {
5118 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5119 ret = VM_FAULT_SIGBUS;
5120 goto out;
5121 }
5122
5123 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6020 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5124again: 6021again:
5125 lock_page(page); 6022 lock_page(page);
@@ -5129,13 +6026,13 @@ again:
5129 6026
5130 if ((page->mapping != inode->i_mapping) || 6027 if ((page->mapping != inode->i_mapping) ||
5131 (page_start >= size)) { 6028 (page_start >= size)) {
5132 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5133 /* page got truncated out from underneath us */ 6029 /* page got truncated out from underneath us */
5134 goto out_unlock; 6030 goto out_unlock;
5135 } 6031 }
5136 wait_on_page_writeback(page); 6032 wait_on_page_writeback(page);
5137 6033
5138 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 6034 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
6035 GFP_NOFS);
5139 set_page_extent_mapped(page); 6036 set_page_extent_mapped(page);
5140 6037
5141 /* 6038 /*
@@ -5144,7 +6041,8 @@ again:
5144 */ 6041 */
5145 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6042 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5146 if (ordered) { 6043 if (ordered) {
5147 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6044 unlock_extent_cached(io_tree, page_start, page_end,
6045 &cached_state, GFP_NOFS);
5148 unlock_page(page); 6046 unlock_page(page);
5149 btrfs_start_ordered_extent(inode, ordered, 1); 6047 btrfs_start_ordered_extent(inode, ordered, 1);
5150 btrfs_put_ordered_extent(ordered); 6048 btrfs_put_ordered_extent(ordered);
@@ -5158,15 +6056,16 @@ again:
5158 * is probably a better way to do this, but for now keep consistent with 6056 * is probably a better way to do this, but for now keep consistent with
5159 * prepare_pages in the normal write path. 6057 * prepare_pages in the normal write path.
5160 */ 6058 */
5161 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 6059 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5162 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6060 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5163 GFP_NOFS); 6061 0, 0, &cached_state, GFP_NOFS);
5164 6062
5165 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 6063 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
6064 &cached_state);
5166 if (ret) { 6065 if (ret) {
5167 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6066 unlock_extent_cached(io_tree, page_start, page_end,
6067 &cached_state, GFP_NOFS);
5168 ret = VM_FAULT_SIGBUS; 6068 ret = VM_FAULT_SIGBUS;
5169 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5170 goto out_unlock; 6069 goto out_unlock;
5171 } 6070 }
5172 ret = 0; 6071 ret = 0;
@@ -5190,13 +6089,13 @@ again:
5190 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6089 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5191 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6090 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5192 6091
5193 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6092 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5194 6093
5195out_unlock: 6094out_unlock:
5196 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5197 if (!ret) 6095 if (!ret)
5198 return VM_FAULT_LOCKED; 6096 return VM_FAULT_LOCKED;
5199 unlock_page(page); 6097 unlock_page(page);
6098 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5200out: 6099out:
5201 return ret; 6100 return ret;
5202} 6101}
@@ -5221,8 +6120,10 @@ static void btrfs_truncate(struct inode *inode)
5221 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6120 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5222 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6121 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5223 6122
5224 trans = btrfs_start_transaction(root, 1); 6123 trans = btrfs_start_transaction(root, 0);
6124 BUG_ON(IS_ERR(trans));
5225 btrfs_set_trans_block_group(trans, inode); 6125 btrfs_set_trans_block_group(trans, inode);
6126 trans->block_rsv = root->orphan_block_rsv;
5226 6127
5227 /* 6128 /*
5228 * setattr is responsible for setting the ordered_data_close flag, 6129 * setattr is responsible for setting the ordered_data_close flag,
@@ -5245,6 +6146,23 @@ static void btrfs_truncate(struct inode *inode)
5245 btrfs_add_ordered_operation(trans, root, inode); 6146 btrfs_add_ordered_operation(trans, root, inode);
5246 6147
5247 while (1) { 6148 while (1) {
6149 if (!trans) {
6150 trans = btrfs_start_transaction(root, 0);
6151 BUG_ON(IS_ERR(trans));
6152 btrfs_set_trans_block_group(trans, inode);
6153 trans->block_rsv = root->orphan_block_rsv;
6154 }
6155
6156 ret = btrfs_block_rsv_check(trans, root,
6157 root->orphan_block_rsv, 0, 5);
6158 if (ret) {
6159 BUG_ON(ret != -EAGAIN);
6160 ret = btrfs_commit_transaction(trans, root);
6161 BUG_ON(ret);
6162 trans = NULL;
6163 continue;
6164 }
6165
5248 ret = btrfs_truncate_inode_items(trans, root, inode, 6166 ret = btrfs_truncate_inode_items(trans, root, inode,
5249 inode->i_size, 6167 inode->i_size,
5250 BTRFS_EXTENT_DATA_KEY); 6168 BTRFS_EXTENT_DATA_KEY);
@@ -5256,10 +6174,8 @@ static void btrfs_truncate(struct inode *inode)
5256 6174
5257 nr = trans->blocks_used; 6175 nr = trans->blocks_used;
5258 btrfs_end_transaction(trans, root); 6176 btrfs_end_transaction(trans, root);
6177 trans = NULL;
5259 btrfs_btree_balance_dirty(root, nr); 6178 btrfs_btree_balance_dirty(root, nr);
5260
5261 trans = btrfs_start_transaction(root, 1);
5262 btrfs_set_trans_block_group(trans, inode);
5263 } 6179 }
5264 6180
5265 if (ret == 0 && inode->i_nlink > 0) { 6181 if (ret == 0 && inode->i_nlink > 0) {
@@ -5320,21 +6236,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5320struct inode *btrfs_alloc_inode(struct super_block *sb) 6236struct inode *btrfs_alloc_inode(struct super_block *sb)
5321{ 6237{
5322 struct btrfs_inode *ei; 6238 struct btrfs_inode *ei;
6239 struct inode *inode;
5323 6240
5324 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6241 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5325 if (!ei) 6242 if (!ei)
5326 return NULL; 6243 return NULL;
6244
6245 ei->root = NULL;
6246 ei->space_info = NULL;
6247 ei->generation = 0;
6248 ei->sequence = 0;
5327 ei->last_trans = 0; 6249 ei->last_trans = 0;
5328 ei->last_sub_trans = 0; 6250 ei->last_sub_trans = 0;
5329 ei->logged_trans = 0; 6251 ei->logged_trans = 0;
5330 ei->outstanding_extents = 0; 6252 ei->delalloc_bytes = 0;
5331 ei->reserved_extents = 0; 6253 ei->reserved_bytes = 0;
5332 ei->root = NULL; 6254 ei->disk_i_size = 0;
6255 ei->flags = 0;
6256 ei->index_cnt = (u64)-1;
6257 ei->last_unlink_trans = 0;
6258
5333 spin_lock_init(&ei->accounting_lock); 6259 spin_lock_init(&ei->accounting_lock);
6260 atomic_set(&ei->outstanding_extents, 0);
6261 ei->reserved_extents = 0;
6262
6263 ei->ordered_data_close = 0;
6264 ei->orphan_meta_reserved = 0;
6265 ei->dummy_inode = 0;
6266 ei->force_compress = 0;
6267
6268 inode = &ei->vfs_inode;
6269 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6270 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6271 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6272 mutex_init(&ei->log_mutex);
5334 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6273 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5335 INIT_LIST_HEAD(&ei->i_orphan); 6274 INIT_LIST_HEAD(&ei->i_orphan);
6275 INIT_LIST_HEAD(&ei->delalloc_inodes);
5336 INIT_LIST_HEAD(&ei->ordered_operations); 6276 INIT_LIST_HEAD(&ei->ordered_operations);
5337 return &ei->vfs_inode; 6277 RB_CLEAR_NODE(&ei->rb_node);
6278
6279 return inode;
5338} 6280}
5339 6281
5340void btrfs_destroy_inode(struct inode *inode) 6282void btrfs_destroy_inode(struct inode *inode)
@@ -5344,6 +6286,8 @@ void btrfs_destroy_inode(struct inode *inode)
5344 6286
5345 WARN_ON(!list_empty(&inode->i_dentry)); 6287 WARN_ON(!list_empty(&inode->i_dentry));
5346 WARN_ON(inode->i_data.nrpages); 6288 WARN_ON(inode->i_data.nrpages);
6289 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6290 WARN_ON(BTRFS_I(inode)->reserved_extents);
5347 6291
5348 /* 6292 /*
5349 * This can happen where we create an inode, but somebody else also 6293 * This can happen where we create an inode, but somebody else also
@@ -5364,13 +6308,13 @@ void btrfs_destroy_inode(struct inode *inode)
5364 spin_unlock(&root->fs_info->ordered_extent_lock); 6308 spin_unlock(&root->fs_info->ordered_extent_lock);
5365 } 6309 }
5366 6310
5367 spin_lock(&root->list_lock); 6311 spin_lock(&root->orphan_lock);
5368 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6312 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5369 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6313 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5370 inode->i_ino); 6314 inode->i_ino);
5371 list_del_init(&BTRFS_I(inode)->i_orphan); 6315 list_del_init(&BTRFS_I(inode)->i_orphan);
5372 } 6316 }
5373 spin_unlock(&root->list_lock); 6317 spin_unlock(&root->orphan_lock);
5374 6318
5375 while (1) { 6319 while (1) {
5376 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6320 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5392,14 +6336,14 @@ free:
5392 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6336 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5393} 6337}
5394 6338
5395void btrfs_drop_inode(struct inode *inode) 6339int btrfs_drop_inode(struct inode *inode)
5396{ 6340{
5397 struct btrfs_root *root = BTRFS_I(inode)->root; 6341 struct btrfs_root *root = BTRFS_I(inode)->root;
5398 6342
5399 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 6343 if (btrfs_root_refs(&root->root_item) == 0)
5400 generic_delete_inode(inode); 6344 return 1;
5401 else 6345 else
5402 generic_drop_inode(inode); 6346 return generic_drop_inode(inode);
5403} 6347}
5404 6348
5405static void init_once(void *foo) 6349static void init_once(void *foo)
@@ -5492,19 +6436,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5492 if (S_ISDIR(old_inode->i_mode) && new_inode && 6436 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5493 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6437 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5494 return -ENOTEMPTY; 6438 return -ENOTEMPTY;
5495
5496 /*
5497 * We want to reserve the absolute worst case amount of items. So if
5498 * both inodes are subvols and we need to unlink them then that would
5499 * require 4 item modifications, but if they are both normal inodes it
5500 * would require 5 item modifications, so we'll assume they're normal
5501 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5502 * should cover the worst case number of items we'll modify.
5503 */
5504 ret = btrfs_reserve_metadata_space(root, 11);
5505 if (ret)
5506 return ret;
5507
5508 /* 6439 /*
5509 * we're using rename to replace one file with another. 6440 * we're using rename to replace one file with another.
5510 * and the replacement file is large. Start IO on it now so 6441 * and the replacement file is large. Start IO on it now so
@@ -5517,8 +6448,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5517 /* close the racy window with snapshot create/destroy ioctl */ 6448 /* close the racy window with snapshot create/destroy ioctl */
5518 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6449 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5519 down_read(&root->fs_info->subvol_sem); 6450 down_read(&root->fs_info->subvol_sem);
6451 /*
6452 * We want to reserve the absolute worst case amount of items. So if
6453 * both inodes are subvols and we need to unlink them then that would
6454 * require 4 item modifications, but if they are both normal inodes it
6455 * would require 5 item modifications, so we'll assume they're normal
6456 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6457 * should cover the worst case number of items we'll modify.
6458 */
6459 trans = btrfs_start_transaction(root, 20);
6460 if (IS_ERR(trans))
6461 return PTR_ERR(trans);
5520 6462
5521 trans = btrfs_start_transaction(root, 1);
5522 btrfs_set_trans_block_group(trans, new_dir); 6463 btrfs_set_trans_block_group(trans, new_dir);
5523 6464
5524 if (dest != root) 6465 if (dest != root)
@@ -5617,7 +6558,6 @@ out_fail:
5617 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6558 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5618 up_read(&root->fs_info->subvol_sem); 6559 up_read(&root->fs_info->subvol_sem);
5619 6560
5620 btrfs_unreserve_metadata_space(root, 11);
5621 return ret; 6561 return ret;
5622} 6562}
5623 6563
@@ -5669,6 +6609,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5669 return 0; 6609 return 0;
5670} 6610}
5671 6611
6612int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6613{
6614 struct btrfs_inode *binode;
6615 struct inode *inode = NULL;
6616
6617 spin_lock(&root->fs_info->delalloc_lock);
6618 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6619 binode = list_entry(root->fs_info->delalloc_inodes.next,
6620 struct btrfs_inode, delalloc_inodes);
6621 inode = igrab(&binode->vfs_inode);
6622 if (inode) {
6623 list_move_tail(&binode->delalloc_inodes,
6624 &root->fs_info->delalloc_inodes);
6625 break;
6626 }
6627
6628 list_del_init(&binode->delalloc_inodes);
6629 cond_resched_lock(&root->fs_info->delalloc_lock);
6630 }
6631 spin_unlock(&root->fs_info->delalloc_lock);
6632
6633 if (inode) {
6634 write_inode_now(inode, 0);
6635 if (delay_iput)
6636 btrfs_add_delayed_iput(inode);
6637 else
6638 iput(inode);
6639 return 1;
6640 }
6641 return 0;
6642}
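A plausible caller for the new helper (a sketch under assumptions, not code from this patch: the helper is real, the surrounding loop and space check are hypothetical) is an ENOSPC flusher that writes back one delalloc inode at a time until the pending reservation fits:

	/* hypothetical pressure loop -- reservation_now_fits() is made up */
	while (btrfs_start_one_delalloc_inode(root, 1)) {
		if (reservation_now_fits(root))
			break;
	}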
6643
5672static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6644static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5673 const char *symname) 6645 const char *symname)
5674{ 6646{
@@ -5692,26 +6664,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5692 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6664 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5693 return -ENAMETOOLONG; 6665 return -ENAMETOOLONG;
5694 6666
6667 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6668 if (err)
6669 return err;
5695 /* 6670 /*
5696 * 2 items for inode item and ref 6671 * 2 items for inode item and ref
5697 * 2 items for dir items 6672 * 2 items for dir items
5698 * 1 item for xattr if selinux is on 6673 * 1 item for xattr if selinux is on
5699 */ 6674 */
5700 err = btrfs_reserve_metadata_space(root, 5); 6675 trans = btrfs_start_transaction(root, 5);
5701 if (err) 6676 if (IS_ERR(trans))
5702 return err; 6677 return PTR_ERR(trans);
5703 6678
5704 trans = btrfs_start_transaction(root, 1);
5705 if (!trans)
5706 goto out_fail;
5707 btrfs_set_trans_block_group(trans, dir); 6679 btrfs_set_trans_block_group(trans, dir);
5708 6680
5709 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5710 if (err) {
5711 err = -ENOSPC;
5712 goto out_unlock;
5713 }
5714
5715 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6681 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5716 dentry->d_name.len, 6682 dentry->d_name.len,
5717 dentry->d_parent->d_inode->i_ino, objectid, 6683 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5783,8 +6749,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5783out_unlock: 6749out_unlock:
5784 nr = trans->blocks_used; 6750 nr = trans->blocks_used;
5785 btrfs_end_transaction_throttle(trans, root); 6751 btrfs_end_transaction_throttle(trans, root);
5786out_fail:
5787 btrfs_unreserve_metadata_space(root, 5);
5788 if (drop_inode) { 6752 if (drop_inode) {
5789 inode_dec_link_count(inode); 6753 inode_dec_link_count(inode);
5790 iput(inode); 6754 iput(inode);
@@ -5793,36 +6757,28 @@ out_fail:
5793 return err; 6757 return err;
5794} 6758}
5795 6759
5796static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6760int btrfs_prealloc_file_range(struct inode *inode, int mode,
5797 u64 alloc_hint, int mode, loff_t actual_len) 6761 u64 start, u64 num_bytes, u64 min_size,
6762 loff_t actual_len, u64 *alloc_hint)
5798{ 6763{
5799 struct btrfs_trans_handle *trans; 6764 struct btrfs_trans_handle *trans;
5800 struct btrfs_root *root = BTRFS_I(inode)->root; 6765 struct btrfs_root *root = BTRFS_I(inode)->root;
5801 struct btrfs_key ins; 6766 struct btrfs_key ins;
5802 u64 alloc_size;
5803 u64 cur_offset = start; 6767 u64 cur_offset = start;
5804 u64 num_bytes = end - start;
5805 int ret = 0; 6768 int ret = 0;
5806 u64 i_size;
5807 6769
5808 while (num_bytes > 0) { 6770 while (num_bytes > 0) {
5809 alloc_size = min(num_bytes, root->fs_info->max_extent); 6771 trans = btrfs_start_transaction(root, 3);
5810 6772 if (IS_ERR(trans)) {
5811 trans = btrfs_start_transaction(root, 1); 6773 ret = PTR_ERR(trans);
5812 6774 break;
5813 ret = btrfs_reserve_extent(trans, root, alloc_size,
5814 root->sectorsize, 0, alloc_hint,
5815 (u64)-1, &ins, 1);
5816 if (ret) {
5817 WARN_ON(1);
5818 goto stop_trans;
5819 } 6775 }
5820 6776
5821 ret = btrfs_reserve_metadata_space(root, 3); 6777 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6778 0, *alloc_hint, (u64)-1, &ins, 1);
5822 if (ret) { 6779 if (ret) {
5823 btrfs_free_reserved_extent(root, ins.objectid, 6780 btrfs_end_transaction(trans, root);
5824 ins.offset); 6781 break;
5825 goto stop_trans;
5826 } 6782 }
5827 6783
5828 ret = insert_reserved_file_extent(trans, inode, 6784 ret = insert_reserved_file_extent(trans, inode,
@@ -5836,37 +6792,33 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5836 6792
5837 num_bytes -= ins.offset; 6793 num_bytes -= ins.offset;
5838 cur_offset += ins.offset; 6794 cur_offset += ins.offset;
5839 alloc_hint = ins.objectid + ins.offset; 6795 *alloc_hint = ins.objectid + ins.offset;
5840 6796
5841 inode->i_ctime = CURRENT_TIME; 6797 inode->i_ctime = CURRENT_TIME;
5842 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6798 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5843 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6799 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5844 cur_offset > inode->i_size) { 6800 (actual_len > inode->i_size) &&
6801 (cur_offset > inode->i_size)) {
5845 if (cur_offset > actual_len) 6802 if (cur_offset > actual_len)
5846 i_size = actual_len; 6803 i_size_write(inode, actual_len);
5847 else 6804 else
5848 i_size = cur_offset; 6805 i_size_write(inode, cur_offset);
5849 i_size_write(inode, i_size); 6806 i_size_write(inode, cur_offset);
5850 btrfs_ordered_update_i_size(inode, i_size, NULL); 6807 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5851 } 6808 }
5852 6809
5853 ret = btrfs_update_inode(trans, root, inode); 6810 ret = btrfs_update_inode(trans, root, inode);
5854 BUG_ON(ret); 6811 BUG_ON(ret);
5855 6812
5856 btrfs_end_transaction(trans, root); 6813 btrfs_end_transaction(trans, root);
5857 btrfs_unreserve_metadata_space(root, 3);
5858 } 6814 }
5859 return ret; 6815 return ret;
5860
5861stop_trans:
5862 btrfs_end_transaction(trans, root);
5863 return ret;
5864
5865} 6816}
5866 6817
5867static long btrfs_fallocate(struct inode *inode, int mode, 6818static long btrfs_fallocate(struct inode *inode, int mode,
5868 loff_t offset, loff_t len) 6819 loff_t offset, loff_t len)
5869{ 6820{
6821 struct extent_state *cached_state = NULL;
5870 u64 cur_offset; 6822 u64 cur_offset;
5871 u64 last_byte; 6823 u64 last_byte;
5872 u64 alloc_start; 6824 u64 alloc_start;
@@ -5893,8 +6845,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5893 goto out; 6845 goto out;
5894 } 6846 }
5895 6847
5896 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6848 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5897 alloc_end - alloc_start);
5898 if (ret) 6849 if (ret)
5899 goto out; 6850 goto out;
5900 6851
@@ -5905,16 +6856,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5905 /* the extent lock is ordered inside the running 6856 /* the extent lock is ordered inside the running
5906 * transaction 6857 * transaction
5907 */ 6858 */
5908 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6859 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
5909 GFP_NOFS); 6860 locked_end, 0, &cached_state, GFP_NOFS);
5910 ordered = btrfs_lookup_first_ordered_extent(inode, 6861 ordered = btrfs_lookup_first_ordered_extent(inode,
5911 alloc_end - 1); 6862 alloc_end - 1);
5912 if (ordered && 6863 if (ordered &&
5913 ordered->file_offset + ordered->len > alloc_start && 6864 ordered->file_offset + ordered->len > alloc_start &&
5914 ordered->file_offset < alloc_end) { 6865 ordered->file_offset < alloc_end) {
5915 btrfs_put_ordered_extent(ordered); 6866 btrfs_put_ordered_extent(ordered);
5916 unlock_extent(&BTRFS_I(inode)->io_tree, 6867 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
5917 alloc_start, locked_end, GFP_NOFS); 6868 alloc_start, locked_end,
6869 &cached_state, GFP_NOFS);
5918 /* 6870 /*
5919 * we can't wait on the range with the transaction 6871 * we can't wait on the range with the transaction
5920 * running or with the extent lock held 6872 * running or with the extent lock held
@@ -5938,16 +6890,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5938 if (em->block_start == EXTENT_MAP_HOLE || 6890 if (em->block_start == EXTENT_MAP_HOLE ||
5939 (cur_offset >= inode->i_size && 6891 (cur_offset >= inode->i_size &&
5940 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6892 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5941 ret = prealloc_file_range(inode, 6893 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
5942 cur_offset, last_byte, 6894 last_byte - cur_offset,
5943 alloc_hint, mode, offset+len); 6895 1 << inode->i_blkbits,
6896 offset + len,
6897 &alloc_hint);
5944 if (ret < 0) { 6898 if (ret < 0) {
5945 free_extent_map(em); 6899 free_extent_map(em);
5946 break; 6900 break;
5947 } 6901 }
5948 } 6902 }
5949 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5950 alloc_hint = em->block_start;
5951 free_extent_map(em); 6903 free_extent_map(em);
5952 6904
5953 cur_offset = last_byte; 6905 cur_offset = last_byte;
@@ -5956,11 +6908,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5956 break; 6908 break;
5957 } 6909 }
5958 } 6910 }
5959 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6911 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5960 GFP_NOFS); 6912 &cached_state, GFP_NOFS);
5961 6913
5962 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6914 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5963 alloc_end - alloc_start);
5964out: 6915out:
5965 mutex_unlock(&inode->i_mutex); 6916 mutex_unlock(&inode->i_mutex);
5966 return ret; 6917 return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -238,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
238 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
239 u64 index = 0; 240 u64 index = 0;
240 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
241 /* 246 /*
242 * 1 - inode item 247 * 1 - inode item
243 * 2 - refs 248 * 2 - refs
244 * 1 - root item 249 * 1 - root item
245 * 2 - dir items 250 * 2 - dir items
246 */ 251 */
247 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
248 if (ret) 253 if (IS_ERR(trans))
249 return ret; 254 return PTR_ERR(trans);
250
251 trans = btrfs_start_transaction(root, 1);
252 BUG_ON(!trans);
253
254 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
255 0, &objectid);
256 if (ret)
257 goto fail;
258 255
259 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
260 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -344,13 +341,10 @@ fail:
344 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
345 if (err && !ret) 342 if (err && !ret)
346 ret = err; 343 ret = err;
347
348 btrfs_unreserve_metadata_space(root, 6);
349 return ret; 344 return ret;
350} 345}
351 346
352static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
353 char *name, int namelen)
354{ 348{
355 struct inode *inode; 349 struct inode *inode;
356 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -360,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
360 if (!root->ref_cows) 354 if (!root->ref_cows)
361 return -EINVAL; 355 return -EINVAL;
362 356
363 /*
364 * 1 - inode item
365 * 2 - refs
366 * 1 - root item
367 * 2 - dir items
368 */
369 ret = btrfs_reserve_metadata_space(root, 6);
370 if (ret)
371 goto fail;
372
373 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
374 if (!pending_snapshot) { 358 if (!pending_snapshot)
375 ret = -ENOMEM; 359 return -ENOMEM;
376 btrfs_unreserve_metadata_space(root, 6); 360
377 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
378 }
379 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
380 if (!pending_snapshot->name) {
381 ret = -ENOMEM;
382 kfree(pending_snapshot);
383 btrfs_unreserve_metadata_space(root, 6);
384 goto fail;
385 }
386 memcpy(pending_snapshot->name, name, namelen);
387 pending_snapshot->name[namelen] = '\0';
388 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
389 trans = btrfs_start_transaction(root, 1);
390 BUG_ON(!trans);
391 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
392 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
393 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
394 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
395 BUG_ON(ret); 377 BUG_ON(ret);
396 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
397 384
398 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
399 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -404,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
404 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
405 ret = 0; 392 ret = 0;
406fail: 393fail:
394 kfree(pending_snapshot);
407 return ret; 395 return ret;
408} 396}
409 397
@@ -455,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
455 goto out_up_read; 443 goto out_up_read;
456 444
457 if (snap_src) { 445 if (snap_src) {
458 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
459 name, namelen);
460 } else { 447 } else {
461 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
462 name, namelen); 449 name, namelen);
@@ -474,7 +461,79 @@ out_unlock:
474 return error; 461 return error;
475} 462}
476 463
477static int btrfs_defrag_file(struct file *file) 464static int should_defrag_range(struct inode *inode, u64 start, u64 len,
465 int thresh, u64 *last_len, u64 *skip,
466 u64 *defrag_end)
467{
468 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
469 struct extent_map *em = NULL;
470 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
471 int ret = 1;
472
473
474 if (thresh == 0)
475 thresh = 256 * 1024;
476
477 /*
 478	 * make sure that once we start defragging an extent, we keep on
 479	 * defragging it
480 */
481 if (start < *defrag_end)
482 return 1;
483
484 *skip = 0;
485
486 /*
487 * hopefully we have this extent in the tree already, try without
488 * the full extent lock
489 */
490 read_lock(&em_tree->lock);
491 em = lookup_extent_mapping(em_tree, start, len);
492 read_unlock(&em_tree->lock);
493
494 if (!em) {
495 /* get the big lock and read metadata off disk */
496 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
497 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
498 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
499
500 if (IS_ERR(em))
501 return 0;
502 }
503
504 /* this will cover holes, and inline extents */
505 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
506 ret = 0;
507
508 /*
509 * we hit a real extent, if it is big don't bother defragging it again
510 */
511 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
512 ret = 0;
513
514 /*
515 * last_len ends up being a counter of how many bytes we've defragged.
516 * every time we choose not to defrag an extent, we reset *last_len
517 * so that the next tiny extent will force a defrag.
518 *
519 * The end result of this is that tiny extents before a single big
520 * extent will force at least part of that big extent to be defragged.
521 */
522 if (ret) {
523 *last_len += len;
524 *defrag_end = extent_map_end(em);
525 } else {
526 *last_len = 0;
527 *skip = extent_map_end(em);
528 *defrag_end = 0;
529 }
530
531 free_extent_map(em);
532 return ret;
533}
534
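A worked example may help here (an editorial sketch, not part of the commit): with the default 256KiB threshold, the last_len counter is what lets runs of small extents drag a following large extent into the rewrite.

	/*
	 * Model of the decision sequence for extents of 8K, 8K, 1M with
	 * thresh = 256K (illustration only, byte counts simplified):
	 *
	 *   8K extent -> defragged; last_len becomes nonzero
	 *   8K extent -> defragged; last_len still below thresh
	 *   1M extent -> also rewritten: last_len is nonzero and below
	 *                thresh, so the preceding small extents pull the
	 *                big one in. Had the 1M extent come first
	 *                (last_len == 0), it would have been skipped.
	 */
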
535static int btrfs_defrag_file(struct file *file,
536 struct btrfs_ioctl_defrag_range_args *range)
478{ 537{
479 struct inode *inode = fdentry(file)->d_inode; 538 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 539 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +545,88 @@ static int btrfs_defrag_file(struct file *file)
486 unsigned long total_read = 0; 545 unsigned long total_read = 0;
487 u64 page_start; 546 u64 page_start;
488 u64 page_end; 547 u64 page_end;
548 u64 last_len = 0;
549 u64 skip = 0;
550 u64 defrag_end = 0;
489 unsigned long i; 551 unsigned long i;
490 int ret; 552 int ret;
491 553
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 554 if (inode->i_size == 0)
493 if (ret) 555 return 0;
494 return -ENOSPC; 556
557 if (range->start + range->len > range->start) {
558 last_index = min_t(u64, inode->i_size - 1,
559 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
560 } else {
561 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
562 }
563
564 i = range->start >> PAGE_CACHE_SHIFT;
565 while (i <= last_index) {
566 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
567 PAGE_CACHE_SIZE,
568 range->extent_thresh,
569 &last_len, &skip,
570 &defrag_end)) {
571 unsigned long next;
572 /*
 573	 * should_defrag_range tells us how much to skip;
 574	 * bump our counter by the suggested amount
575 */
576 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
577 i = max(i + 1, next);
578 continue;
579 }
495 580
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 581 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 582 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 583 min(last_index, i + ra_pages - 1));
502 } 584 }
503 total_read++; 585 total_read++;
586 mutex_lock(&inode->i_mutex);
587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
588 BTRFS_I(inode)->force_compress = 1;
589
590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
591 if (ret)
592 goto err_unlock;
504again: 593again:
594 if (inode->i_size == 0 ||
595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
596 ret = 0;
597 goto err_reservations;
598 }
599
505 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 601 if (!page) {
507 goto out_unlock; 602 ret = -ENOMEM;
603 goto err_reservations;
604 }
605
508 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
510 lock_page(page); 608 lock_page(page);
511 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
512 unlock_page(page); 610 unlock_page(page);
513 page_cache_release(page); 611 page_cache_release(page);
514 goto out_unlock; 612 ret = -EIO;
613 goto err_reservations;
515 } 614 }
516 } 615 }
517 616
617 if (page->mapping != inode->i_mapping) {
618 unlock_page(page);
619 page_cache_release(page);
620 goto again;
621 }
622
518 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
519 624
625 if (PageDirty(page)) {
626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
627 goto loop_unlock;
628 }
629
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 630 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 631 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 632 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +647,53 @@ again:
537 * page if it is dirtied again later 647 * page if it is dirtied again later
538 */ 648 */
539 clear_page_dirty_for_io(page); 649 clear_page_dirty_for_io(page);
650 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
651 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
652 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 653
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 654 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
655 ClearPageChecked(page);
542 set_page_dirty(page); 656 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 657 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
658
659loop_unlock:
544 unlock_page(page); 660 unlock_page(page);
545 page_cache_release(page); 661 page_cache_release(page);
662 mutex_unlock(&inode->i_mutex);
663
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
665 i++;
666 }
667
668 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
669 filemap_flush(inode->i_mapping);
670
671 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
672 /* the filemap_flush will queue IO into the worker threads, but
673 * we have to make sure the IO is actually started and that
674 * ordered extents get created before we return
675 */
676 atomic_inc(&root->fs_info->async_submit_draining);
677 while (atomic_read(&root->fs_info->nr_async_submits) ||
678 atomic_read(&root->fs_info->async_delalloc_pages)) {
679 wait_event(root->fs_info->async_submit_wait,
680 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
681 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
682 }
683 atomic_dec(&root->fs_info->async_submit_draining);
684
685 mutex_lock(&inode->i_mutex);
686 BTRFS_I(inode)->force_compress = 0;
687 mutex_unlock(&inode->i_mutex);
547 } 688 }
548 689
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 690 return 0;
691
692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
695 mutex_unlock(&inode->i_mutex);
696 return ret;
552} 697}
553 698
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 699static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +753,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 753 mod = 1;
609 sizestr++; 754 sizestr++;
610 } 755 }
611 new_size = btrfs_parse_size(sizestr); 756 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 757 if (new_size == 0) {
613 ret = -EINVAL; 758 ret = -EINVAL;
614 goto out_unlock; 759 goto out_unlock;
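
memparse() is the generic helper from lib/cmdline.c that replaces the btrfs-local btrfs_parse_size() here; it parses a number with an optional size suffix. A quick sketch of the semantics this hunk relies on (editorial example, not part of the commit):

	unsigned long long bytes;

	bytes = memparse("10G", NULL);	/* 10 << 30 */
	bytes = memparse("512k", NULL);	/* 512 << 10 */
	bytes = memparse("4096", NULL);	/* plain byte count */
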
@@ -643,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
643 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
644 789
645 if (new_size > old_size) { 790 if (new_size > old_size) {
646 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
647 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
648 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
649 } else { 794 } else {
@@ -743,6 +888,330 @@ out:
743 return ret; 888 return ret;
744} 889}
745 890
891static noinline int key_in_sk(struct btrfs_key *key,
892 struct btrfs_ioctl_search_key *sk)
893{
894 struct btrfs_key test;
895 int ret;
896
897 test.objectid = sk->min_objectid;
898 test.type = sk->min_type;
899 test.offset = sk->min_offset;
900
901 ret = btrfs_comp_cpu_keys(key, &test);
902 if (ret < 0)
903 return 0;
904
905 test.objectid = sk->max_objectid;
906 test.type = sk->max_type;
907 test.offset = sk->max_offset;
908
909 ret = btrfs_comp_cpu_keys(key, &test);
910 if (ret > 0)
911 return 0;
912 return 1;
913}
914
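key_in_sk() leans on btrfs_comp_cpu_keys() ordering keys lexicographically by (objectid, type, offset). A minimal sketch of that comparison (editorial, assuming the standard btrfs_key layout):

	static int cmp_keys_sketch(const struct btrfs_key *a,
				   const struct btrfs_key *b)
	{
		if (a->objectid != b->objectid)
			return a->objectid < b->objectid ? -1 : 1;
		if (a->type != b->type)
			return a->type < b->type ? -1 : 1;
		if (a->offset != b->offset)
			return a->offset < b->offset ? -1 : 1;
		return 0;
	}
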
915static noinline int copy_to_sk(struct btrfs_root *root,
916 struct btrfs_path *path,
917 struct btrfs_key *key,
918 struct btrfs_ioctl_search_key *sk,
919 char *buf,
920 unsigned long *sk_offset,
921 int *num_found)
922{
923 u64 found_transid;
924 struct extent_buffer *leaf;
925 struct btrfs_ioctl_search_header sh;
926 unsigned long item_off;
927 unsigned long item_len;
928 int nritems;
929 int i;
930 int slot;
931 int found = 0;
932 int ret = 0;
933
934 leaf = path->nodes[0];
935 slot = path->slots[0];
936 nritems = btrfs_header_nritems(leaf);
937
938 if (btrfs_header_generation(leaf) > sk->max_transid) {
939 i = nritems;
940 goto advance_key;
941 }
942 found_transid = btrfs_header_generation(leaf);
943
944 for (i = slot; i < nritems; i++) {
945 item_off = btrfs_item_ptr_offset(leaf, i);
946 item_len = btrfs_item_size_nr(leaf, i);
947
948 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
949 item_len = 0;
950
951 if (sizeof(sh) + item_len + *sk_offset >
952 BTRFS_SEARCH_ARGS_BUFSIZE) {
953 ret = 1;
954 goto overflow;
955 }
956
957 btrfs_item_key_to_cpu(leaf, key, i);
958 if (!key_in_sk(key, sk))
959 continue;
960
961 sh.objectid = key->objectid;
962 sh.offset = key->offset;
963 sh.type = key->type;
964 sh.len = item_len;
965 sh.transid = found_transid;
966
967 /* copy search result header */
968 memcpy(buf + *sk_offset, &sh, sizeof(sh));
969 *sk_offset += sizeof(sh);
970
971 if (item_len) {
972 char *p = buf + *sk_offset;
973 /* copy the item */
974 read_extent_buffer(leaf, p,
975 item_off, item_len);
976 *sk_offset += item_len;
977 }
978 found++;
979
980 if (*num_found >= sk->nr_items)
981 break;
982 }
983advance_key:
984 ret = 0;
985 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
986 key->offset++;
987 else if (key->type < (u8)-1 && key->type < sk->max_type) {
988 key->offset = 0;
989 key->type++;
990 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
991 key->offset = 0;
992 key->type = 0;
993 key->objectid++;
994 } else
995 ret = 1;
996overflow:
997 *num_found += found;
998 return ret;
999}
1000
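For context, user space is expected to walk the buffer copy_to_sk() fills as a sequence of (header, item) pairs; sh->len can be 0 when the item itself was too large for the buffer. A hypothetical consumer (handle_item and args are illustrative names):

	__u32 n;
	unsigned long off = 0;

	for (n = 0; n < args.key.nr_items; n++) {
		struct btrfs_ioctl_search_header *sh;

		sh = (struct btrfs_ioctl_search_header *)(args.buf + off);
		/* sh->len bytes of raw item follow the header */
		handle_item(sh, (char *)(sh + 1), sh->len);
		off += sizeof(*sh) + sh->len;
	}
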
1001static noinline int search_ioctl(struct inode *inode,
1002 struct btrfs_ioctl_search_args *args)
1003{
1004 struct btrfs_root *root;
1005 struct btrfs_key key;
1006 struct btrfs_key max_key;
1007 struct btrfs_path *path;
1008 struct btrfs_ioctl_search_key *sk = &args->key;
1009 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1010 int ret;
1011 int num_found = 0;
1012 unsigned long sk_offset = 0;
1013
1014 path = btrfs_alloc_path();
1015 if (!path)
1016 return -ENOMEM;
1017
1018 if (sk->tree_id == 0) {
1019 /* search the root of the inode that was passed */
1020 root = BTRFS_I(inode)->root;
1021 } else {
1022 key.objectid = sk->tree_id;
1023 key.type = BTRFS_ROOT_ITEM_KEY;
1024 key.offset = (u64)-1;
1025 root = btrfs_read_fs_root_no_name(info, &key);
1026 if (IS_ERR(root)) {
1027 printk(KERN_ERR "could not find root %llu\n",
1028 sk->tree_id);
1029 btrfs_free_path(path);
1030 return -ENOENT;
1031 }
1032 }
1033
1034 key.objectid = sk->min_objectid;
1035 key.type = sk->min_type;
1036 key.offset = sk->min_offset;
1037
1038 max_key.objectid = sk->max_objectid;
1039 max_key.type = sk->max_type;
1040 max_key.offset = sk->max_offset;
1041
1042 path->keep_locks = 1;
1043
1044	while (1) {
1045 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1046 sk->min_transid);
1047 if (ret != 0) {
1048 if (ret > 0)
1049 ret = 0;
1050 goto err;
1051 }
1052 ret = copy_to_sk(root, path, &key, sk, args->buf,
1053 &sk_offset, &num_found);
1054 btrfs_release_path(root, path);
1055 if (ret || num_found >= sk->nr_items)
1056 break;
1057
1058 }
1059 ret = 0;
1060err:
1061 sk->nr_items = num_found;
1062 btrfs_free_path(path);
1063 return ret;
1064}
1065
1066static noinline int btrfs_ioctl_tree_search(struct file *file,
1067 void __user *argp)
1068{
1069 struct btrfs_ioctl_search_args *args;
1070 struct inode *inode;
1071 int ret;
1072
1073 if (!capable(CAP_SYS_ADMIN))
1074 return -EPERM;
1075
1076 args = kmalloc(sizeof(*args), GFP_KERNEL);
1077 if (!args)
1078 return -ENOMEM;
1079
1080 if (copy_from_user(args, argp, sizeof(*args))) {
1081 kfree(args);
1082 return -EFAULT;
1083 }
1084 inode = fdentry(file)->d_inode;
1085 ret = search_ioctl(inode, args);
1086 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1087 ret = -EFAULT;
1088 kfree(args);
1089 return ret;
1090}
1091
1092/*
1093 * Search INODE_REFs to identify path name of 'dirid' directory
1094 * in a 'tree_id' tree. and sets path name to 'name'.
1095 */
1096static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1097 u64 tree_id, u64 dirid, char *name)
1098{
1099 struct btrfs_root *root;
1100 struct btrfs_key key;
1101 char *ptr;
1102 int ret = -1;
1103 int slot;
1104 int len;
1105 int total_len = 0;
1106 struct btrfs_inode_ref *iref;
1107 struct extent_buffer *l;
1108 struct btrfs_path *path;
1109
1110 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1111		name[0] = '\0';
1112 return 0;
1113 }
1114
1115 path = btrfs_alloc_path();
1116 if (!path)
1117 return -ENOMEM;
1118
1119 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1120
1121 key.objectid = tree_id;
1122 key.type = BTRFS_ROOT_ITEM_KEY;
1123 key.offset = (u64)-1;
1124 root = btrfs_read_fs_root_no_name(info, &key);
1125 if (IS_ERR(root)) {
1126 printk(KERN_ERR "could not find root %llu\n", tree_id);
1127 ret = -ENOENT;
1128 goto out;
1129 }
1130
1131 key.objectid = dirid;
1132 key.type = BTRFS_INODE_REF_KEY;
1133 key.offset = (u64)-1;
1134
1135	while (1) {
1136 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1137 if (ret < 0)
1138 goto out;
1139
1140 l = path->nodes[0];
1141 slot = path->slots[0];
1142 if (ret > 0 && slot > 0)
1143 slot--;
1144 btrfs_item_key_to_cpu(l, &key, slot);
1145
1146 if (ret > 0 && (key.objectid != dirid ||
1147 key.type != BTRFS_INODE_REF_KEY)) {
1148 ret = -ENOENT;
1149 goto out;
1150 }
1151
1152 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1153 len = btrfs_inode_ref_name_len(l, iref);
1154 ptr -= len + 1;
1155 total_len += len + 1;
1156 if (ptr < name)
1157 goto out;
1158
1159 *(ptr + len) = '/';
1160		read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1161
1162 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1163 break;
1164
1165 btrfs_release_path(root, path);
1166 key.objectid = key.offset;
1167 key.offset = (u64)-1;
1168 dirid = key.objectid;
1169
1170 }
1171 if (ptr < name)
1172 goto out;
1173 memcpy(name, ptr, total_len);
1174	name[total_len] = '\0';
1175 ret = 0;
1176out:
1177 btrfs_free_path(path);
1178 return ret;
1179}
1180
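The name is assembled right to left: each INODE_REF contributes one component, written in front of the previous ones at the tail of the buffer, and the finished string is finally moved to the front. A small user-space model of the same pattern (editorial sketch, component names invented):

	char buf[64], *ptr = &buf[64];
	const char *comp[] = { "leaf", "mid", "top" };	/* child-to-parent */
	int j, len, total = 0;

	for (j = 0; j < 3; j++) {
		len = strlen(comp[j]);
		ptr -= len + 1;
		memcpy(ptr, comp[j], len);
		ptr[len] = '/';
		total += len + 1;
	}
	memcpy(buf, ptr, total);
	buf[total] = '\0';	/* yields "top/mid/leaf/" */
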
1181static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1182 void __user *argp)
1183{
1184 struct btrfs_ioctl_ino_lookup_args *args;
1185 struct inode *inode;
1186 int ret;
1187
1188 if (!capable(CAP_SYS_ADMIN))
1189 return -EPERM;
1190
1191 args = kmalloc(sizeof(*args), GFP_KERNEL);
1192 if (!args)
1193 return -ENOMEM;
1194
1195 if (copy_from_user(args, argp, sizeof(*args))) {
1196 kfree(args);
1197 return -EFAULT;
1198 }
1199 inode = fdentry(file)->d_inode;
1200
1201 if (args->treeid == 0)
1202 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1203
1204 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1205 args->treeid, args->objectid,
1206 args->name);
1207
1208 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1209 ret = -EFAULT;
1210
1211 kfree(args);
1212 return ret;
1213}
1214
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1215static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1216 void __user *arg)
748{ 1217{
@@ -808,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
808 if (err) 1277 if (err)
809 goto out_up_write; 1278 goto out_up_write;
810 1279
811 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out_up_write;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
812 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
813 dest->root_key.objectid, 1288 dest->root_key.objectid,
814 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -822,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
822 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
823 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
824 1299
825 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
826 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
827 dest->root_key.objectid); 1302 root->fs_info->tree_root,
828 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
829 1306
830 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
831 BUG_ON(ret); 1308 BUG_ON(ret);
@@ -849,10 +1326,11 @@ out:
849 return err; 1326 return err;
850} 1327}
851 1328
852static int btrfs_ioctl_defrag(struct file *file) 1329static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1330{
854 struct inode *inode = fdentry(file)->d_inode; 1331 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1332 struct btrfs_root *root = BTRFS_I(inode)->root;
1333 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1334 int ret;
857 1335
858 ret = mnt_want_write(file->f_path.mnt); 1336 ret = mnt_want_write(file->f_path.mnt);
@@ -865,16 +1343,44 @@ static int btrfs_ioctl_defrag(struct file *file)
865 ret = -EPERM; 1343 ret = -EPERM;
866 goto out; 1344 goto out;
867 } 1345 }
868 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
869 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
870 break; 1350 break;
871 case S_IFREG: 1351 case S_IFREG:
872 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
873 ret = -EINVAL; 1353 ret = -EINVAL;
874 goto out; 1354 goto out;
875 } 1355 }
876 btrfs_defrag_file(file); 1356
1357 range = kzalloc(sizeof(*range), GFP_KERNEL);
1358 if (!range) {
1359 ret = -ENOMEM;
1360 goto out;
1361 }
1362
1363 if (argp) {
1364 if (copy_from_user(range, argp,
1365 sizeof(*range))) {
1366 ret = -EFAULT;
1367 kfree(range);
1368 goto out;
1369 }
1370 /* compression requires us to start the IO */
1371 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1372 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1373 range->extent_thresh = (u32)-1;
1374 }
1375 } else {
1376 /* the rest are all set to zero by kzalloc */
1377 range->len = (u64)-1;
1378 }
1379 ret = btrfs_defrag_file(file, range);
1380 kfree(range);
877 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
878 } 1384 }
879out: 1385out:
880 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -952,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
952 */ 1458 */
953 1459
954 /* the destination must be opened for writing */ 1460 /* the destination must be opened for writing */
955 if (!(file->f_mode & FMODE_WRITE)) 1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
956 return -EINVAL; 1462 return -EINVAL;
957 1463
958 ret = mnt_want_write(file->f_path.mnt); 1464 ret = mnt_want_write(file->f_path.mnt);
@@ -964,12 +1470,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
964 ret = -EBADF; 1470 ret = -EBADF;
965 goto out_drop_write; 1471 goto out_drop_write;
966 } 1472 }
1473
967 src = src_file->f_dentry->d_inode; 1474 src = src_file->f_dentry->d_inode;
968 1475
969 ret = -EINVAL; 1476 ret = -EINVAL;
970 if (src == inode) 1477 if (src == inode)
971 goto out_fput; 1478 goto out_fput;
972 1479
1480 /* the src must be open for reading */
1481 if (!(src_file->f_mode & FMODE_READ))
1482 goto out_fput;
1483
973 ret = -EISDIR; 1484 ret = -EISDIR;
974 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 1485 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
975 goto out_fput; 1486 goto out_fput;
@@ -1000,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1000 1511
1001 /* determine range to clone */ 1512 /* determine range to clone */
1002 ret = -EINVAL; 1513 ret = -EINVAL;
1003 if (off >= src->i_size || off + len > src->i_size) 1514 if (off + len > src->i_size || off + len < off)
1004 goto out_unlock; 1515 goto out_unlock;
1005 if (len == 0) 1516 if (len == 0)
1006 olen = len = src->i_size - off; 1517 olen = len = src->i_size - off;
@@ -1028,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1028 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1029 } 1540 }
1030 1541
1031 trans = btrfs_start_transaction(root, 1);
1032 BUG_ON(!trans);
1033
1034 /* punch hole in destination first */
1035 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1036
1037 /* clone data */ 1542 /* clone data */
1038 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1039 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1044,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1044 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1045 * tree. 1550 * tree.
1046 */ 1551 */
1047 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1048 if (ret < 0) 1553 if (ret < 0)
1049 goto out; 1554 goto out;
1050 1555
@@ -1073,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1073 u64 disko = 0, diskl = 0; 1578 u64 disko = 0, diskl = 0;
1074 u64 datao = 0, datal = 0; 1579 u64 datao = 0, datal = 0;
1075 u8 comp; 1580 u8 comp;
1581 u64 endoff;
1076 1582
1077 size = btrfs_item_size_nr(leaf, slot); 1583 size = btrfs_item_size_nr(leaf, slot);
1078 read_extent_buffer(leaf, buf, 1584 read_extent_buffer(leaf, buf,
@@ -1107,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1107 new_key.objectid = inode->i_ino; 1613 new_key.objectid = inode->i_ino;
1108 new_key.offset = key.offset + destoff - off; 1614 new_key.offset = key.offset + destoff - off;
1109 1615
1616 trans = btrfs_start_transaction(root, 1);
1617 if (IS_ERR(trans)) {
1618 ret = PTR_ERR(trans);
1619 goto out;
1620 }
1621
1110 if (type == BTRFS_FILE_EXTENT_REG || 1622 if (type == BTRFS_FILE_EXTENT_REG ||
1111 type == BTRFS_FILE_EXTENT_PREALLOC) { 1623 type == BTRFS_FILE_EXTENT_PREALLOC) {
1624 if (off > key.offset) {
1625 datao += off - key.offset;
1626 datal -= off - key.offset;
1627 }
1628
1629 if (key.offset + datal > off + len)
1630 datal = off + len - key.offset;
1631
1632 ret = btrfs_drop_extents(trans, inode,
1633 new_key.offset,
1634 new_key.offset + datal,
1635 &hint_byte, 1);
1636 BUG_ON(ret);
1637
1112 ret = btrfs_insert_empty_item(trans, root, path, 1638 ret = btrfs_insert_empty_item(trans, root, path,
1113 &new_key, size); 1639 &new_key, size);
1114 if (ret) 1640 BUG_ON(ret);
1115 goto out;
1116 1641
1117 leaf = path->nodes[0]; 1642 leaf = path->nodes[0];
1118 slot = path->slots[0]; 1643 slot = path->slots[0];
@@ -1123,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1123 extent = btrfs_item_ptr(leaf, slot, 1648 extent = btrfs_item_ptr(leaf, slot,
1124 struct btrfs_file_extent_item); 1649 struct btrfs_file_extent_item);
1125 1650
1126 if (off > key.offset) {
1127 datao += off - key.offset;
1128 datal -= off - key.offset;
1129 }
1130
1131 if (key.offset + datal > off + len)
1132 datal = off + len - key.offset;
1133
1134 /* disko == 0 means it's a hole */ 1651 /* disko == 0 means it's a hole */
1135 if (!disko) 1652 if (!disko)
1136 datao = 0; 1653 datao = 0;
@@ -1161,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1161 1678
1162 if (comp && (skip || trim)) { 1679 if (comp && (skip || trim)) {
1163 ret = -EINVAL; 1680 ret = -EINVAL;
1681 btrfs_end_transaction(trans, root);
1164 goto out; 1682 goto out;
1165 } 1683 }
1166 size -= skip + trim; 1684 size -= skip + trim;
1167 datal -= skip + trim; 1685 datal -= skip + trim;
1686
1687 ret = btrfs_drop_extents(trans, inode,
1688 new_key.offset,
1689 new_key.offset + datal,
1690 &hint_byte, 1);
1691 BUG_ON(ret);
1692
1168 ret = btrfs_insert_empty_item(trans, root, path, 1693 ret = btrfs_insert_empty_item(trans, root, path,
1169 &new_key, size); 1694 &new_key, size);
1170 if (ret) 1695 BUG_ON(ret);
1171 goto out;
1172 1696
1173 if (skip) { 1697 if (skip) {
1174 u32 start = 1698 u32 start =
@@ -1186,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1186 } 1710 }
1187 1711
1188 btrfs_mark_buffer_dirty(leaf); 1712 btrfs_mark_buffer_dirty(leaf);
1189 } 1713 btrfs_release_path(root, path);
1190 1714
1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1716
1717 /*
1718 * we round up to the block size at eof when
1719 * determining which extents to clone above,
1720 * but shouldn't round up the file size
1721 */
1722 endoff = new_key.offset + datal;
1723 if (endoff > off+olen)
1724 endoff = off+olen;
1725 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff);
1727
1728 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1729 ret = btrfs_update_inode(trans, root, inode);
1730 BUG_ON(ret);
1731 btrfs_end_transaction(trans, root);
1732 }
1191next: 1733next:
1192 btrfs_release_path(root, path); 1734 btrfs_release_path(root, path);
1193 key.offset++; 1735 key.offset++;
@@ -1195,17 +1737,7 @@ next:
1195 ret = 0; 1737 ret = 0;
1196out: 1738out:
1197 btrfs_release_path(root, path); 1739 btrfs_release_path(root, path);
1198 if (ret == 0) {
1199 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1200 if (destoff + olen > inode->i_size)
1201 btrfs_i_size_write(inode, destoff + olen);
1202 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1203 ret = btrfs_update_inode(trans, root, inode);
1204 }
1205 btrfs_end_transaction(trans, root);
1206 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1740 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1207 if (ret)
1208 vmtruncate(inode, 0);
1209out_unlock: 1741out_unlock:
1210 mutex_unlock(&src->i_mutex); 1742 mutex_unlock(&src->i_mutex);
1211 mutex_unlock(&inode->i_mutex); 1743 mutex_unlock(&inode->i_mutex);
@@ -1274,6 +1806,157 @@ out:
1274 return ret; 1806 return ret;
1275} 1807}
1276 1808
1809static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1810{
1811 struct inode *inode = fdentry(file)->d_inode;
1812 struct btrfs_root *root = BTRFS_I(inode)->root;
1813 struct btrfs_root *new_root;
1814 struct btrfs_dir_item *di;
1815 struct btrfs_trans_handle *trans;
1816 struct btrfs_path *path;
1817 struct btrfs_key location;
1818 struct btrfs_disk_key disk_key;
1819 struct btrfs_super_block *disk_super;
1820 u64 features;
1821 u64 objectid = 0;
1822 u64 dir_id;
1823
1824 if (!capable(CAP_SYS_ADMIN))
1825 return -EPERM;
1826
1827 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1828 return -EFAULT;
1829
1830 if (!objectid)
1831 objectid = root->root_key.objectid;
1832
1833 location.objectid = objectid;
1834 location.type = BTRFS_ROOT_ITEM_KEY;
1835 location.offset = (u64)-1;
1836
1837 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1838 if (IS_ERR(new_root))
1839 return PTR_ERR(new_root);
1840
1841 if (btrfs_root_refs(&new_root->root_item) == 0)
1842 return -ENOENT;
1843
1844 path = btrfs_alloc_path();
1845 if (!path)
1846 return -ENOMEM;
1847 path->leave_spinning = 1;
1848
1849 trans = btrfs_start_transaction(root, 1);
1850 if (!trans) {
1851 btrfs_free_path(path);
1852 return -ENOMEM;
1853 }
1854
1855 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1856 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1857 dir_id, "default", 7, 1);
1858 if (IS_ERR_OR_NULL(di)) {
1859 btrfs_free_path(path);
1860 btrfs_end_transaction(trans, root);
1861 printk(KERN_ERR "Umm, you don't have the default dir item, "
1862 "this isn't going to work\n");
1863 return -ENOENT;
1864 }
1865
1866 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1867 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1868 btrfs_mark_buffer_dirty(path->nodes[0]);
1869 btrfs_free_path(path);
1870
1871 disk_super = &root->fs_info->super_copy;
1872 features = btrfs_super_incompat_flags(disk_super);
1873 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1874 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1875 btrfs_set_super_incompat_flags(disk_super, features);
1876 }
1877 btrfs_end_transaction(trans, root);
1878
1879 return 0;
1880}
1881
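From user space the new ioctl takes a pointer to a single u64; 0 means "the subvolume backing the file descriptor". A hypothetical call (fd and the tree id are illustrative):

	__u64 objectid = 256;	/* tree id of the desired default subvolume */

	if (ioctl(fd, BTRFS_IOC_DEFAULT_SUBVOL, &objectid) < 0)
		perror("BTRFS_IOC_DEFAULT_SUBVOL");
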
1882long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1883{
1884 struct btrfs_ioctl_space_args space_args;
1885 struct btrfs_ioctl_space_info space;
1886 struct btrfs_ioctl_space_info *dest;
1887 struct btrfs_ioctl_space_info *dest_orig;
1888 struct btrfs_ioctl_space_info *user_dest;
1889 struct btrfs_space_info *info;
1890 int alloc_size;
1891 int ret = 0;
1892 int slot_count = 0;
1893
1894 if (copy_from_user(&space_args,
1895 (struct btrfs_ioctl_space_args __user *)arg,
1896 sizeof(space_args)))
1897 return -EFAULT;
1898
1899 /* first we count slots */
1900 rcu_read_lock();
1901 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1902 slot_count++;
1903 rcu_read_unlock();
1904
1905 /* space_slots == 0 means they are asking for a count */
1906 if (space_args.space_slots == 0) {
1907 space_args.total_spaces = slot_count;
1908 goto out;
1909 }
1910 alloc_size = sizeof(*dest) * slot_count;
1911 /* we generally have at most 6 or so space infos, one for each raid
1912 * level. So, a whole page should be more than enough for everyone
1913 */
1914 if (alloc_size > PAGE_CACHE_SIZE)
1915 return -ENOMEM;
1916
1917 space_args.total_spaces = 0;
1918 dest = kmalloc(alloc_size, GFP_NOFS);
1919 if (!dest)
1920 return -ENOMEM;
1921 dest_orig = dest;
1922
1923 /* now we have a buffer to copy into */
1924 rcu_read_lock();
1925 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1926 /* make sure we don't copy more than we allocated
1927 * in our buffer
1928 */
1929 if (slot_count == 0)
1930 break;
1931 slot_count--;
1932
1933 /* make sure userland has enough room in their buffer */
1934 if (space_args.total_spaces >= space_args.space_slots)
1935 break;
1936
1937 space.flags = info->flags;
1938 space.total_bytes = info->total_bytes;
1939 space.used_bytes = info->bytes_used;
1940 memcpy(dest, &space, sizeof(space));
1941 dest++;
1942 space_args.total_spaces++;
1943 }
1944 rcu_read_unlock();
1945
1946 user_dest = (struct btrfs_ioctl_space_info *)
1947 (arg + sizeof(struct btrfs_ioctl_space_args));
1948
1949 if (copy_to_user(user_dest, dest_orig, alloc_size))
1950 ret = -EFAULT;
1951
1952 kfree(dest_orig);
1953out:
1954 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1955 ret = -EFAULT;
1956
1957 return ret;
1958}
1959
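The space_slots == 0 convention gives user space a two-step pattern: ask for the count first, then allocate and fetch. A hypothetical caller (fd is illustrative, error handling omitted):

	struct btrfs_ioctl_space_args count = { .space_slots = 0 };
	struct btrfs_ioctl_space_args *sargs;

	ioctl(fd, BTRFS_IOC_SPACE_INFO, &count);	/* fills total_spaces */
	sargs = malloc(sizeof(*sargs) + count.total_spaces *
		       sizeof(struct btrfs_ioctl_space_info));
	sargs->space_slots = count.total_spaces;
	ioctl(fd, BTRFS_IOC_SPACE_INFO, sargs);
	/* sargs->spaces[0 .. total_spaces-1] now hold flags/total/used */
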
1277/* 1960/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1961 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1962 * to deadlocks. They should only be used by applications that
@@ -1320,8 +2003,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 2003 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 2004 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 2005 return btrfs_ioctl_snap_destroy(file, argp);
2006 case BTRFS_IOC_DEFAULT_SUBVOL:
2007 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 2008 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 2009 return btrfs_ioctl_defrag(file, NULL);
2010 case BTRFS_IOC_DEFRAG_RANGE:
2011 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 2012 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 2013 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 2014 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2025,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2025 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2026 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2027 return btrfs_ioctl_trans_end(file);
2028 case BTRFS_IOC_TREE_SEARCH:
2029 return btrfs_ioctl_tree_search(file, argp);
2030 case BTRFS_IOC_INO_LOOKUP:
2031 return btrfs_ioctl_ino_lookup(file, argp);
2032 case BTRFS_IOC_SPACE_INFO:
2033 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2034 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2035 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2036 return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
39
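A hypothetical user-space lookup of where a directory inode lives in the tree backing fd (treeid 0 selects that tree; the objectid is illustrative):

	struct btrfs_ioctl_ino_lookup_args args;

	memset(&args, 0, sizeof(args));
	args.treeid = 0;	/* 0: use the root of the inode behind fd */
	args.objectid = 257;	/* dirid to resolve */
	if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) == 0)
		printf("%s\n", args.name);	/* path ends with '/' */
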
40struct btrfs_ioctl_search_key {
41	/* which root are we searching; 0 is the tree of tree roots */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers; each header is followed
87 * by the actual item it describes. The type field is expanded to
88 * 32 bits for alignment
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
94
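A hypothetical user-space search pulling every ROOT_ITEM out of the tree of tree roots (key type 132, BTRFS_ROOT_ITEM_KEY in this kernel's ctree.h; fd is illustrative):

	struct btrfs_ioctl_search_args args;

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 1;				/* tree of tree roots */
	args.key.min_type = args.key.max_type = 132;	/* BTRFS_ROOT_ITEM_KEY */
	args.key.max_objectid = (__u64)-1;
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.nr_items = 4096;	/* upper bound; kernel trims it */
	ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
	/* args.key.nr_items now holds how many items were returned */
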
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
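For comparison with the clone changes above, the pre-existing range clone (BTRFS_IOC_CLONE_RANGE, defined further down in this header) is driven like this from user space (hypothetical srcfd/dstfd):

	struct btrfs_ioctl_clone_range_args cr = {
		.src_fd = srcfd,
		.src_offset = 0,
		.src_length = 1024 * 1024,	/* clone the first 1MiB */
		.dest_offset = 0,
	};

	ioctl(dstfd, BTRFS_IOC_CLONE_RANGE, &cr);
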
101/* flags for the defrag range ioctl */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
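A hypothetical whole-file compressing defrag using the new ioctl; note from the ioctl.c hunk above that the kernel itself forces START_IO and a maximal extent threshold when COMPRESS is set (fd is illustrative):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;			/* to EOF */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
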
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -125,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
125 return 1; 124 return 1;
126} 125}
127 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
128/* 136/*
129 * look for the first ordered struct that has this offset, otherwise 137 * look for the first ordered struct that has this offset, otherwise
130 * the first one less than this offset 138 * the first one less than this offset
@@ -162,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
162 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
163 * inserted. 171 * inserted.
164 */ 172 */
165int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
166 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
167{ 176{
168 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
169 struct rb_node *node; 178 struct rb_node *node;
@@ -174,7 +183,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 183 if (!entry)
175 return -ENOMEM; 184 return -ENOMEM;
176 185
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 186 entry->file_offset = file_offset;
179 entry->start = start; 187 entry->start = start;
180 entry->len = len; 188 entry->len = len;
@@ -184,26 +192,44 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
185 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
186 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
187 /* one ref for the tree */ 198 /* one ref for the tree */
188 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
189 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
190 INIT_LIST_HEAD(&entry->list); 201 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 202 INIT_LIST_HEAD(&entry->root_extent_list);
192 203
204 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 205 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 206 &entry->rb_node);
195 BUG_ON(node); 207 BUG_ON(node);
208 spin_unlock(&tree->lock);
196 209
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 210 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 211 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 212 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 213 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 214
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 215 BUG_ON(node);
204 return 0; 216 return 0;
205} 217}
206 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
207/* 233/*
208 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
209 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -216,9 +242,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 242 struct btrfs_ordered_inode_tree *tree;
217 243
218 tree = &BTRFS_I(inode)->ordered_tree; 244 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 245 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 246 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 247 spin_unlock(&tree->lock);
222 return 0; 248 return 0;
223} 249}
224 250
@@ -232,15 +258,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 258 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 259 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 260int btrfs_dec_test_ordered_pending(struct inode *inode,
261 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 262 u64 file_offset, u64 io_size)
236{ 263{
237 struct btrfs_ordered_inode_tree *tree; 264 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 265 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 266 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 267 int ret;
241 268
242 tree = &BTRFS_I(inode)->ordered_tree; 269 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 270 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 271 node = tree_search(tree, file_offset);
245 if (!node) { 272 if (!node) {
246 ret = 1; 273 ret = 1;
@@ -264,7 +291,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 291 else
265 ret = 1; 292 ret = 1;
266out: 293out:
267 mutex_unlock(&tree->mutex); 294 if (!ret && cached && entry) {
295 *cached = entry;
296 atomic_inc(&entry->refs);
297 }
298 spin_unlock(&tree->lock);
268 return ret == 0; 299 return ret == 0;
269} 300}
270 301
@@ -291,13 +322,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 322
292/* 323/*
293 * remove an ordered extent from the tree. No references are dropped 324 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 325 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 326 * while you call this function.
296 */ 327 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 328static int __btrfs_remove_ordered_extent(struct inode *inode,
298 struct btrfs_ordered_extent *entry) 329 struct btrfs_ordered_extent *entry)
299{ 330{
300 struct btrfs_ordered_inode_tree *tree; 331 struct btrfs_ordered_inode_tree *tree;
332 struct btrfs_root *root = BTRFS_I(inode)->root;
301 struct rb_node *node; 333 struct rb_node *node;
302 334
303 tree = &BTRFS_I(inode)->ordered_tree; 335 tree = &BTRFS_I(inode)->ordered_tree;
@@ -306,13 +338,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
306 tree->last = NULL; 338 tree->last = NULL;
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 340
309 spin_lock(&BTRFS_I(inode)->accounting_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
310 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1);
314
315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
316 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
317 343
318 /* 344 /*
@@ -324,7 +350,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
324 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 350 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
325 list_del_init(&BTRFS_I(inode)->ordered_operations); 351 list_del_init(&BTRFS_I(inode)->ordered_operations);
326 } 352 }
327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 353 spin_unlock(&root->fs_info->ordered_extent_lock);
328 354
329 return 0; 355 return 0;
330} 356}
@@ -340,9 +366,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 366 int ret;
341 367
342 tree = &BTRFS_I(inode)->ordered_tree; 368 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 369 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 370 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 371 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 372 wake_up(&entry->wait);
347 373
348 return ret; 374 return ret;
@@ -485,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
485 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
486 * for pdflush to find them 512 * for pdflush to find them
487 */ 513 */
488 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
489 if (wait) { 516 if (wait) {
490 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
491 &entry->flags)); 518 &entry->flags));
@@ -567,7 +594,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 594 struct btrfs_ordered_extent *entry = NULL;
568 595
569 tree = &BTRFS_I(inode)->ordered_tree; 596 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 597 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 598 node = tree_search(tree, file_offset);
572 if (!node) 599 if (!node)
573 goto out; 600 goto out;
@@ -578,7 +605,48 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 605 if (entry)
579 atomic_inc(&entry->refs); 606 atomic_inc(&entry->refs);
580out: 607out:
581 mutex_unlock(&tree->mutex); 608 spin_unlock(&tree->lock);
609 return entry;
610}
611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
582 return entry; 650 return entry;
583} 651}
584 652
@@ -594,7 +662,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 662 struct btrfs_ordered_extent *entry = NULL;
595 663
596 tree = &BTRFS_I(inode)->ordered_tree; 664 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 665 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 666 node = tree_search(tree, file_offset);
599 if (!node) 667 if (!node)
600 goto out; 668 goto out;
@@ -602,7 +670,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 670 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 671 atomic_inc(&entry->refs);
604out: 672out:
605 mutex_unlock(&tree->mutex); 673 spin_unlock(&tree->lock);
606 return entry; 674 return entry;
607} 675}
608 676
@@ -629,7 +697,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 697 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 698 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 699
632 mutex_lock(&tree->mutex); 700 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 701 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 702
635 /* truncate file */ 703 /* truncate file */
@@ -735,7 +803,7 @@ out:
735 */ 803 */
736 if (ordered) 804 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 805 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 806 spin_unlock(&tree->lock);
739 if (ordered) 807 if (ordered)
740 wake_up(&ordered->wait); 808 wake_up(&ordered->wait);
741 return ret; 809 return ret;
@@ -762,7 +830,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 830 if (!ordered)
763 return 1; 831 return 1;
764 832
765 mutex_lock(&tree->mutex); 833 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 834 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 835 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 836 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +845,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 845 }
778 } 846 }
779out: 847out:
780 mutex_unlock(&tree->mutex); 848 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 849 btrfs_put_ordered_extent(ordered);
782 return ret; 850 return ret;
783} 851}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -128,8 +130,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 130static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 131btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 132{
131 mutex_init(&t->mutex); 133 spin_lock_init(&t->lock);
132 t->tree.rb_node = NULL; 134 t->tree = RB_ROOT;
133 t->last = NULL; 135 t->last = NULL;
134} 136}
135 137
@@ -137,9 +139,12 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 139int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 140 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 141int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 142 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
143int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
144 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
145 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -150,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
150int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
151struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
152btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
153int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
154 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
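
Editor's note: the header changes collect the API surface of this series. btrfs_dec_test_ordered_pending() gains a 'cached' out-parameter so the I/O completion path receives a referenced ordered extent instead of looking it up a second time; btrfs_add_ordered_extent_dio() and the BTRFS_ORDERED_DIRECT flag support direct I/O; btrfs_lookup_ordered_range() searches by range rather than exact offset; and the old 'tyep' typo in the prototype is corrected to 'type' in passing. A hedged caller sketch for the new out-parameter; the function name and surrounding logic are illustrative, the prototypes are from this header:

static void io_completed_sketch(struct inode *inode, u64 start, u64 len)
{
	struct btrfs_ordered_extent *ordered = NULL;

	/* nonzero return: all I/O for the extent is done and *ordered
	 * carries a reference, sparing a second tree search */
	if (btrfs_dec_test_ordered_pending(inode, &ordered, start, len)) {
		btrfs_ordered_update_i_size(inode, 0, ordered);
		btrfs_put_ordered_extent(ordered);	/* drop the cached ref */
	}
}
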
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
52 52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) 53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{ 54{
55 tree->root.rb_node = NULL; 55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list); 56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock); 57 spin_lock_init(&tree->lock);
58} 58}
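
Editor's note: here and in ordered-data.h above, open-coded NULL assignment gives way to the RB_ROOT initializer. The two forms generate identical code today; the macro simply keeps the init site correct if struct rb_root ever grows, and it reads as the documented rbtree idiom:

#include <linux/rbtree.h>

static void rb_init_sketch(struct rb_root *root)
{
	root->rb_node = NULL;	/* old, open-coded form */
	*root = RB_ROOT;	/* new form; equivalent today */
}
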
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ed3e4a2ec2c8..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -43,8 +44,12 @@ struct tree_entry {
43struct backref_node { 44struct backref_node {
44 struct rb_node rb_node; 45 struct rb_node rb_node;
45 u64 bytenr; 46 u64 bytenr;
46 /* objectid tree block owner */ 47
48 u64 new_bytenr;
 49 /* objectid of tree block owner, may not be up to date */
47 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
48 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
49 struct list_head upper; 54 struct list_head upper;
50 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -55,9 +60,9 @@ struct backref_node {
55 struct extent_buffer *eb; 60 struct extent_buffer *eb;
56 /* level of tree block */ 61 /* level of tree block */
57 unsigned int level:8; 62 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */ 63 /* is the block in non-reference counted tree */
59 unsigned int old_root:1; 64 unsigned int cowonly:1;
60 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
61 unsigned int lowest:1; 66 unsigned int lowest:1;
62 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
63 unsigned int locked:1; 68 unsigned int locked:1;
@@ -65,6 +70,16 @@ struct backref_node {
65 unsigned int processed:1; 70 unsigned int processed:1;
66 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
67 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
68}; 83};
69 84
70/* 85/*
@@ -73,7 +88,6 @@ struct backref_node {
73struct backref_edge { 88struct backref_edge {
74 struct list_head list[2]; 89 struct list_head list[2];
75 struct backref_node *node[2]; 90 struct backref_node *node[2];
76 u64 blockptr;
77}; 91};
78 92
79#define LOWER 0 93#define LOWER 0
@@ -82,9 +96,25 @@ struct backref_edge {
82struct backref_cache { 96struct backref_cache {
83 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root; 98 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
86 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref node. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
88}; 118};
89 119
90/* 120/*
@@ -112,15 +142,6 @@ struct tree_block {
112 unsigned int key_ready:1; 142 unsigned int key_ready:1;
113}; 143};
114 144
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
125 146
126struct file_extent_cluster { 147struct file_extent_cluster {
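
Editor's note: the struct changes above rework the backref bookkeeping. Each node now records new_bytenr once its block is COWed, joins exactly one of the pending/changed/detached lists through the new 'list' head, and carries cowonly/detached flags; the unused per-edge blockptr goes away, and the backref cache grows leaves/changed/detached lists, a per-level path array for btrfs_reloc_cow_block, last_trans for commit detection, and node/edge counters. The inodevec batching is dropped in favor of delayed iputs (see the replace_file_extents hunks below). A reviewer sketch of the invariant the new fields encode; the helper is illustrative, the fields and lists are from the hunks above:

static void index_sketch(struct backref_cache *cache,
			 struct backref_node *node)
{
	if (node->cowonly)
		return;		/* blocks of non-refcounted trees stay unindexed */

	tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (node->pending)
		list_add_tail(&node->list, &cache->pending[node->level]);
	else if (node->detached)
		list_add_tail(&node->list, &cache->detached);
}
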
@@ -137,58 +158,130 @@ struct reloc_control {
137 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
138 /* inode for moving data */ 159 /* inode for moving data */
139 struct inode *data_inode; 160 struct inode *data_inode;
140 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
141 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
142 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
143 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
144 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
145 /* list of reloc trees */ 171 /* list of reloc trees */
146 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
147 u64 search_start; 178 u64 search_start;
148 u64 extents_found; 179 u64 extents_found;
149 u64 extents_skipped; 180
150 int stage; 181 int block_rsv_retries;
151 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
152 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
153 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
154}; 188};
155 189
156/* stages of data relocation */ 190/* stages of data relocation */
157#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
158#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
159 193
160/* 194static void remove_backref_node(struct backref_cache *cache,
161 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
162 */ 196static void __mark_block_processed(struct reloc_control *rc,
163struct async_merge { 197 struct backref_node *node);
164 struct btrfs_work work;
165 struct reloc_control *rc;
166 struct btrfs_root *root;
167 struct completion *done;
168 atomic_t *num_pending;
169};
170 198
171static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
172{ 200{
173 tree->rb_root.rb_node = NULL; 201 tree->rb_root = RB_ROOT;
174 spin_lock_init(&tree->lock); 202 spin_lock_init(&tree->lock);
175} 203}
176 204
177static void backref_cache_init(struct backref_cache *cache) 205static void backref_cache_init(struct backref_cache *cache)
178{ 206{
179 int i; 207 int i;
180 cache->rb_root.rb_node = NULL; 208 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
184} 276}
185 277
186static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
187{ 280{
188 memset(node, 0, sizeof(*node)); 281 if (edge) {
189 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
190 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
191 RB_CLEAR_NODE(&node->rb_node); 284 }
192} 285}
193 286
194static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
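
Editor's note: node and edge allocation is funneled through alloc_/free_ helpers that maintain nr_nodes and nr_edges, and backref_cache_cleanup() asserts both back to zero, so a leaked object now fails loudly instead of silently. A usage sketch; the helper name is illustrative, the allocation and linking calls are from the hunks above:

static int link_sketch(struct backref_cache *cache,
		       struct backref_node *lower,
		       struct backref_node *upper)
{
	struct backref_edge *edge = alloc_backref_edge(cache);	/* nr_edges++ */

	if (!edge)
		return -ENOMEM;
	edge->node[LOWER] = lower;
	edge->node[UPPER] = upper;
	list_add_tail(&edge->list[LOWER], &lower->upper);
	list_add_tail(&edge->list[UPPER], &upper->lower);
	return 0;	/* a matching free_backref_edge() does nr_edges-- */
}
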
@@ -249,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
249 edges[idx++] = edge; 342 edges[idx++] = edge;
250 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
251 } 344 }
345 BUG_ON(node->detached);
252 *index = idx; 346 *index = idx;
253 return node; 347 return node;
254} 348}
@@ -280,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
280 return NULL; 374 return NULL;
281} 375}
282 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
283static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
284{ 386{
285 if (node->eb) { 387 if (node->eb) {
286 if (node->locked) { 388 unlock_node_buffer(node);
287 btrfs_tree_unlock(node->eb);
288 node->locked = 0;
289 }
290 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
291 node->eb = NULL; 390 node->eb = NULL;
292 } 391 }
@@ -295,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
295static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
296 struct backref_node *node) 395 struct backref_node *node)
297{ 396{
298 BUG_ON(!node->lowest);
299 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
300 398
301 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
302 list_del(&node->lower); 401 list_del(&node->lower);
303 402 if (!RB_EMPTY_NODE(&node->rb_node))
304 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
305 kfree(node); 404 free_backref_node(tree, node);
306} 405}
307 406
308/* 407/*
@@ -317,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
317 if (!node) 416 if (!node)
318 return; 417 return;
319 418
320 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
321 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
322 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
323 list[LOWER]); 422 list[LOWER]);
324 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
325 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
326 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
327 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
328 /* 435 /*
329 * add the node to pending list if no other 436 * add the node to leaf node list if no other
330 * child block cached. 437 * child block cached.
331 */ 438 */
332 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
333 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
334 &cache->pending[upper->level]);
335 upper->lowest = 1; 441 upper->lowest = 1;
336 } 442 }
337 } 443 }
444
338 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
339} 446}
340 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. transaction commit changes the extent tree.
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
 526 * if there is a reloc tree and it was created in a previous
 527 * transaction, backref lookup can find the reloc tree,
 528 * so the backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
341/* 534/*
342 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
343 */ 536 */
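
Editor's note: update_backref_cache() makes the cache survive transaction commits: detached nodes are dropped (a commit may have changed their backrefs in the extent tree) and changed/pending nodes are re-keyed from bytenr to new_bytenr. should_ignore_root() then skips fs-tree roots whose reloc tree predates the running transaction, because backref walks will reach those blocks through the reloc tree instead. A hedged sketch of the expected call pattern; the main relocation loop is outside these hunks, so the helper below is an assumption about how the return value is consumed:

static struct btrfs_trans_handle *
refresh_sketch(struct reloc_control *rc)
{
	struct btrfs_trans_handle *trans;

	while (1) {
		trans = btrfs_start_transaction(rc->extent_root, 0);
		if (!update_backref_cache(trans, &rc->backref_cache))
			break;	/* cache still keyed correctly */
		/* nodes were re-keyed after a commit: restart lookups */
		btrfs_end_transaction(trans, rc->extent_root);
	}
	return trans;
}
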
@@ -452,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
452 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
453 * block are also cached. 646 * block are also cached.
454 */ 647 */
455static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
456 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
457 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
458 int level, u64 bytenr) 651 int level, u64 bytenr)
459{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
460 struct btrfs_path *path1; 654 struct btrfs_path *path1;
461 struct btrfs_path *path2; 655 struct btrfs_path *path2;
462 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -472,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
472 unsigned long end; 666 unsigned long end;
473 unsigned long ptr; 667 unsigned long ptr;
474 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
475 int ret; 671 int ret;
476 int err = 0; 672 int err = 0;
477 673
@@ -482,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
482 goto out; 678 goto out;
483 } 679 }
484 680
485 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
486 if (!node) { 682 if (!node) {
487 err = -ENOMEM; 683 err = -ENOMEM;
488 goto out; 684 goto out;
489 } 685 }
490 686
491 backref_node_init(node);
492 node->bytenr = bytenr; 687 node->bytenr = bytenr;
493 node->owner = 0;
494 node->level = level; 688 node->level = level;
495 node->lowest = 1; 689 node->lowest = 1;
496 cur = node; 690 cur = node;
@@ -586,17 +780,21 @@ again:
586#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
592 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
594 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
595 if (root) 788 root = find_tree_root(rc, eb, ref0);
596 cur->root = root; 789 if (root && !should_ignore_root(root))
597 else 790 cur->root = root;
598 cur->old_root = 1; 791 else
599 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
600 } 798 }
601#else 799#else
602 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -613,22 +811,20 @@ again:
613 break; 811 break;
614 } 812 }
615 813
616 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
617 if (!edge) { 815 if (!edge) {
618 err = -ENOMEM; 816 err = -ENOMEM;
619 goto out; 817 goto out;
620 } 818 }
621 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
622 if (!rb_node) { 820 if (!rb_node) {
623 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
624 if (!upper) { 822 if (!upper) {
625 kfree(edge); 823 free_backref_edge(cache, edge);
626 err = -ENOMEM; 824 err = -ENOMEM;
627 goto out; 825 goto out;
628 } 826 }
629 backref_node_init(upper);
630 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
631 upper->owner = 0;
632 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
633 /* 829 /*
634 * backrefs for the upper level block isn't 830 * backrefs for the upper level block isn't
@@ -638,11 +834,12 @@ again:
638 } else { 834 } else {
639 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
640 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
641 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
642 } 839 }
643 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
644 edge->node[UPPER] = upper;
645 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
646 843
647 goto next; 844 goto next;
648 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -656,11 +853,17 @@ again:
656 goto out; 853 goto out;
657 } 854 }
658 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
659 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
660 /* tree root */ 860 /* tree root */
661 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 cur->bytenr); 862 cur->bytenr);
663 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
664 break; 867 break;
665 } 868 }
666 869
@@ -691,11 +894,14 @@ again:
691 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
692 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 lower->bytenr); 896 lower->bytenr);
694 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
695 break; 901 break;
696 } 902 }
697 903
698 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
699 if (!edge) { 905 if (!edge) {
700 err = -ENOMEM; 906 err = -ENOMEM;
701 goto out; 907 goto out;
@@ -704,16 +910,17 @@ again:
704 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
705 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
706 if (!rb_node) { 912 if (!rb_node) {
707 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
708 if (!upper) { 914 if (!upper) {
709 kfree(edge); 915 free_backref_edge(cache, edge);
710 err = -ENOMEM; 916 err = -ENOMEM;
711 goto out; 917 goto out;
712 } 918 }
713 backref_node_init(upper);
714 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
715 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
716 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
717 924
718 /* 925 /*
719 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -743,10 +950,12 @@ again:
743 rb_node); 950 rb_node);
744 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
745 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
746 } 955 }
747 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
748 edge->node[UPPER] = upper;
749 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
750 959
751 if (rb_node) 960 if (rb_node)
752 break; 961 break;
@@ -784,8 +993,13 @@ next:
784 * into the cache. 993 * into the cache.
785 */ 994 */
786 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
787 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
788 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
789 1003
790 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
791 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -794,6 +1008,14 @@ next:
794 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
796 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
797 1019
798 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -806,25 +1028,69 @@ next:
806 } 1028 }
807 1029
808 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
809 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
810 &upper->rb_node); 1032 if (!cowonly) {
811 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
812 1037
813 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
814 1039
815 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
816 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
817 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookup.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
818out: 1079out:
819 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
820 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
821 if (err) { 1082 if (err) {
822 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
823 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
824 while (upper) { 1090 while (upper) {
825 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
826 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
827 kfree(upper); 1093 free_backref_node(cache, upper);
828 } 1094 }
829 1095
830 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -832,15 +1098,104 @@ out:
832 1098
833 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
834 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
835 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
836 kfree(edge); 1103 free_backref_edge(cache, edge);
837 } 1104 }
838 return ERR_PTR(err); 1105 return ERR_PTR(err);
839 } 1106 }
1107 BUG_ON(node && node->detached);
840 return node; 1108 return node;
841} 1109}
842 1110
843/* 1111/*
1112 * helper to add backref node for the newly created snapshot.
1113 * the backref node is created by cloning backref node that
1114 * corresponds to root of source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
1198/*
844 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
845 */ 1200 */
846static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
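
Editor's note: clone_backref_node() above keeps the cache coherent across snapshot creation: the node for the source root (or its reloc root) is duplicated for the new root, including its lower edges, so relocation does not have to rebuild backrefs for a tree it has already walked. A hedged caller sketch; this series wires it up from btrfs_reloc_post_snapshot_hook, which is outside these hunks, so the function below is an assumption about that call site:

static int post_snapshot_sketch(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_root *src,
				struct btrfs_root *snap)
{
	if (!rc)
		return 0;	/* no relocation in flight: nothing cached */
	return clone_backref_node(trans, rc, src, snap);
}
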
@@ -900,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
900 return 0; 1255 return 0;
901} 1256}
902 1257
903/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
904 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
905 * snapshot of the fs tree with special root objectid.
906 */
907int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 struct btrfs_root *root)
909{ 1260{
910 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
911 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -913,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
913 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
914 int ret; 1265 int ret;
915 1266
916 if (root->reloc_root) {
917 reloc_root = root->reloc_root;
918 reloc_root->last_trans = trans->transid;
919 return 0;
920 }
921
922 if (!root->fs_info->reloc_ctl ||
923 !root->fs_info->reloc_ctl->create_reloc_root ||
924 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 return 0;
926
927 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
929 1269
930 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
932 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
933 1273
934 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
935 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
936 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree, all tree blocks
1286 * modified after it was created have RELOC flag
1287 * set in their headers. so it's OK to not update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
937 1294
938 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
940 btrfs_set_root_refs(root_item, 1);
941 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
942 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
944 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
945 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
946 1306
947 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
948 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -956,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
956 &root_key); 1316 &root_key);
957 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
958 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. reloc tree is just a
1324 * snapshot of the fs tree with special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
959 1350
960 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
961 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
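
Editor's note: btrfs_init_reloc_root() now charges the reloc-tree copy to the relocation reserve by temporarily pointing the transaction handle at rc->block_rsv and restoring it afterwards, and create_reloc_root() serves both callers: for the fs root itself it zeroes the root refs and drop progress, while for the snapshot hook it copies the current (not committed) root node. A distilled restatement of the borrowing pattern, with names from the patch and an illustrative wrapper:

static struct btrfs_root *
create_with_rsv_sketch(struct btrfs_trans_handle *trans,
		       struct reloc_control *rc, struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;
	int clear_rsv = 0;

	if (!trans->block_rsv) {
		trans->block_rsv = rc->block_rsv;	/* borrow the reserve */
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = NULL;		/* restore */
	return reloc_root;
}
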
@@ -979,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
979 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
980 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
981 1372
982 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
983 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
984 del = 1; 1376 del = 1;
985 } 1377 }
@@ -1101,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1101 goto out; 1493 goto out;
1102 } 1494 }
1103 1495
1104 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1105 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 ret = 0; 1497 ret = 0;
1107out: 1498out:
1108 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1113,19 +1504,18 @@ out:
1113 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1114 * the new locations. 1505 * the new locations.
1115 */ 1506 */
1116static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1117 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1118 struct btrfs_root *root, 1509 struct reloc_control *rc,
1119 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1120 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1121{ 1512{
1122 struct btrfs_key key; 1513 struct btrfs_key key;
1123 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1124 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1125 struct inodevec *ivec = NULL;
1126 u64 parent; 1516 u64 parent;
1127 u64 bytenr; 1517 u64 bytenr;
1128 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1129 u64 num_bytes; 1519 u64 num_bytes;
1130 u64 end; 1520 u64 end;
1131 u32 nritems; 1521 u32 nritems;
@@ -1165,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1165 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1166 */ 1556 */
1167 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 BUG_ON(!ivec);
1171 ivec->nr = 0;
1172 list_add_tail(&ivec->list, inode_list);
1173 }
1174 if (first) { 1558 if (first) {
1175 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1176 if (inode)
1177 ivec->inode[ivec->nr++] = inode;
1178 first = 0; 1560 first = 0;
1179 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1180 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1181 if (inode)
1182 ivec->inode[ivec->nr++] = inode;
1183 } 1564 }
1184 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1185 end = key.offset + 1566 end = key.offset +
@@ -1203,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1203 1584
1204 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1205 bytenr, num_bytes); 1586 bytenr, num_bytes);
1206 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1207 continue; 1589 continue;
1590 }
1208 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1209 1592
1210 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1224,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1224 } 1607 }
1225 if (dirty) 1608 if (dirty)
1226 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1227 return 0; 1612 return 0;
1228} 1613}
1229 1614
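
Editor's note: the inodevec arrays are gone; replace_file_extents() now hands each displaced inode reference to btrfs_add_delayed_iput(), which defers the final iput past the current transaction, so the old post-transaction put_inodes() pass and its deadlock avoidance disappear (see the removal further down). A sketch of the replacement pattern; the helper name is illustrative, the calls are from this hunk:

static struct inode *advance_inode_sketch(struct btrfs_root *root,
					  struct inode *cur, u64 objectid)
{
	if (cur)
		btrfs_add_delayed_iput(cur);	/* final iput runs later */
	return find_next_inode(root, objectid);
}
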
@@ -1247,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1247 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1248 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1249 */ 1634 */
1250static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1251 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1252 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1253 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1254 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1255{ 1640{
1256 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1257 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1262,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1262 u64 new_ptr_gen; 1647 u64 new_ptr_gen;
1263 u64 last_snapshot; 1648 u64 last_snapshot;
1264 u32 blocksize; 1649 u32 blocksize;
1650 int cow = 0;
1265 int level; 1651 int level;
1266 int ret; 1652 int ret;
1267 int slot; 1653 int slot;
1268 1654
1269 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1656 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(lowest_level > 1 && leaf);
1272 1657
1273 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1658 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 1659again:
1275 slot = path->slots[lowest_level]; 1660 slot = path->slots[lowest_level];
1276 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1661 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 1662
@@ -1285,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1285 return 0; 1670 return 0;
1286 } 1671 }
1287 1672
1288 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1673 if (cow) {
1289 BUG_ON(ret); 1674 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1675 BUG_ON(ret);
1676 }
1290 btrfs_set_lock_blocking(eb); 1677 btrfs_set_lock_blocking(eb);
1291 1678
1292 if (next_key) { 1679 if (next_key) {
@@ -1330,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1330 1717
1331 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1718 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 memcmp_node_keys(parent, slot, path, level)) { 1719 memcmp_node_keys(parent, slot, path, level)) {
1333 if (level <= lowest_level && !leaf) { 1720 if (level <= lowest_level) {
1334 ret = 0; 1721 ret = 0;
1335 break; 1722 break;
1336 } 1723 }
@@ -1338,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1338 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1339 old_ptr_gen); 1726 old_ptr_gen);
1340 btrfs_tree_lock(eb); 1727 btrfs_tree_lock(eb);
1341 ret = btrfs_cow_block(trans, dest, eb, parent, 1728 if (cow) {
1342 slot, &eb); 1729 ret = btrfs_cow_block(trans, dest, eb, parent,
1343 BUG_ON(ret); 1730 slot, &eb);
1344 btrfs_set_lock_blocking(eb); 1731 BUG_ON(ret);
1345
1346 if (level <= lowest_level) {
1347 *leaf = eb;
1348 ret = 0;
1349 break;
1350 } 1732 }
1733 btrfs_set_lock_blocking(eb);
1351 1734
1352 btrfs_tree_unlock(parent); 1735 btrfs_tree_unlock(parent);
1353 free_extent_buffer(parent); 1736 free_extent_buffer(parent);
@@ -1356,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1356 continue; 1739 continue;
1357 } 1740 }
1358 1741
1742 if (!cow) {
1743 btrfs_tree_unlock(parent);
1744 free_extent_buffer(parent);
1745 cow = 1;
1746 goto again;
1747 }
1748
1359 btrfs_node_key_to_cpu(path->nodes[level], &key, 1749 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 path->slots[level]); 1750 path->slots[level]);
1361 btrfs_release_path(src, path); 1751 btrfs_release_path(src, path);
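
Editor's note: replace_path() becomes two-pass. The first descent runs with cow == 0 and COWs nothing; only when it finds a pointer that actually needs swapping does it unwind and jump to 'again' with cow == 1, so destination blocks above untouched subtrees are never COWed. A control-flow-only sketch, with everything except the flag-and-goto shape stripped away:

static int two_pass_sketch(int needs_swap)
{
	int cow = 0;
again:
	/* descend read-only; cheap when nothing must be replaced */
	if (needs_swap && !cow) {
		cow = 1;
		goto again;	/* redo the walk, COWing this time */
	}
	return 0;
}
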
@@ -1561,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1561 return 0; 1951 return 0;
1562} 1952}
1563 1953
1564static void put_inodes(struct list_head *list)
1565{
1566 struct inodevec *ivec;
1567 while (!list_empty(list)) {
1568 ivec = list_entry(list->next, struct inodevec, list);
1569 list_del(&ivec->list);
1570 while (ivec->nr > 0) {
1571 ivec->nr--;
1572 iput(ivec->inode[ivec->nr]);
1573 }
1574 kfree(ivec);
1575 }
1576}
1577
1578static int find_next_key(struct btrfs_path *path, int level, 1954static int find_next_key(struct btrfs_path *path, int level,
1579 struct btrfs_key *key) 1955 struct btrfs_key *key)
1580 1956
@@ -1607,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1607 struct btrfs_root *reloc_root; 1983 struct btrfs_root *reloc_root;
1608 struct btrfs_root_item *root_item; 1984 struct btrfs_root_item *root_item;
1609 struct btrfs_path *path; 1985 struct btrfs_path *path;
1610 struct extent_buffer *leaf = NULL; 1986 struct extent_buffer *leaf;
1611 unsigned long nr; 1987 unsigned long nr;
1612 int level; 1988 int level;
1613 int max_level; 1989 int max_level;
1614 int replaced = 0; 1990 int replaced = 0;
1615 int ret; 1991 int ret;
1616 int err = 0; 1992 int err = 0;
1993 u32 min_reserved;
1617 1994
1618 path = btrfs_alloc_path(); 1995 path = btrfs_alloc_path();
1619 if (!path) 1996 if (!path)
@@ -1647,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1647 btrfs_unlock_up_safe(path, 0); 2024 btrfs_unlock_up_safe(path, 0);
1648 } 2025 }
1649 2026
1650 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2027 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1651 trans = btrfs_start_transaction(root, 1); 2028 memset(&next_key, 0, sizeof(next_key));
1652 2029
1653 leaf = path->nodes[0]; 2030 while (1) {
1654 btrfs_item_key_to_cpu(leaf, &key, 0); 2031 trans = btrfs_start_transaction(root, 0);
1655 btrfs_release_path(reloc_root, path); 2032 trans->block_rsv = rc->block_rsv;
1656 2033
1657 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1658 if (ret < 0) { 2035 min_reserved, 0);
1659 err = ret; 2036 if (ret) {
1660 goto out; 2037 BUG_ON(ret != -EAGAIN);
2038 ret = btrfs_commit_transaction(trans, root);
2039 BUG_ON(ret);
2040 continue;
1661 } 2041 }
1662 2042
1663 leaf = path->nodes[0];
1664 btrfs_unlock_up_safe(path, 1);
1665 ret = replace_file_extents(trans, rc, root, leaf,
1666 &inode_list);
1667 if (ret < 0)
1668 err = ret;
1669 goto out;
1670 }
1671
1672 memset(&next_key, 0, sizeof(next_key));
1673
1674 while (1) {
1675 leaf = NULL;
1676 replaced = 0; 2043 replaced = 0;
1677 trans = btrfs_start_transaction(root, 1);
1678 max_level = level; 2044 max_level = level;
1679 2045
1680 ret = walk_down_reloc_tree(reloc_root, path, &level); 2046 ret = walk_down_reloc_tree(reloc_root, path, &level);
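
Editor's note: merge_reloc_root() now meters itself against the relocation reserve. min_reserved covers the worst case of COWing one full path in each of two trees: with 4K nodes and BTRFS_MAX_LEVEL == 8 that is 4096 * 7 * 2 = 57344 bytes per iteration. When btrfs_block_rsv_check() returns -EAGAIN the reserve cannot be refilled without unpinning space, which a commit provides. A distilled restatement of the loop above, wrapped in an illustrative helper:

static struct btrfs_trans_handle *
reserve_or_commit_sketch(struct reloc_control *rc,
			 struct btrfs_root *root, u32 min_reserved)
{
	struct btrfs_trans_handle *trans;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		trans->block_rsv = rc->block_rsv;
		if (!btrfs_block_rsv_check(trans, root, rc->block_rsv,
					   min_reserved, 0))
			return trans;	/* enough metadata for one batch */
		/* -EAGAIN: commit to unpin freed extents, then retry */
		btrfs_commit_transaction(trans, root);
	}
}
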
@@ -1688,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1688 if (!find_next_key(path, level, &key) && 2054 if (!find_next_key(path, level, &key) &&
1689 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2055 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690 ret = 0; 2056 ret = 0;
1691 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692 ret = replace_path(trans, root, reloc_root,
1693 path, &next_key, &leaf,
1694 level, max_level);
1695 } else { 2057 } else {
1696 ret = replace_path(trans, root, reloc_root, 2058 ret = replace_path(trans, root, reloc_root, path,
1697 path, &next_key, NULL, 2059 &next_key, level, max_level);
1698 level, max_level);
1699 } 2060 }
1700 if (ret < 0) { 2061 if (ret < 0) {
1701 err = ret; 2062 err = ret;
@@ -1707,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1707 btrfs_node_key_to_cpu(path->nodes[level], &key, 2068 btrfs_node_key_to_cpu(path->nodes[level], &key,
1708 path->slots[level]); 2069 path->slots[level]);
1709 replaced = 1; 2070 replaced = 1;
1710 } else if (leaf) {
1711 /*
1712 * no block got replaced, try replacing file extents
1713 */
1714 btrfs_item_key_to_cpu(leaf, &key, 0);
1715 ret = replace_file_extents(trans, rc, root, leaf,
1716 &inode_list);
1717 btrfs_tree_unlock(leaf);
1718 free_extent_buffer(leaf);
1719 BUG_ON(ret < 0);
1720 } 2071 }
1721 2072
1722 ret = walk_up_reloc_tree(reloc_root, path, &level); 2073 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1733,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1733 root_item->drop_level = level; 2084 root_item->drop_level = level;
1734 2085
1735 nr = trans->blocks_used; 2086 nr = trans->blocks_used;
1736 btrfs_end_transaction(trans, root); 2087 btrfs_end_transaction_throttle(trans, root);
1737 2088
1738 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root, nr);
1739 2090
1740 /*
1741 * put inodes outside transaction, otherwise we may deadlock.
1742 */
1743 put_inodes(&inode_list);
1744
1745 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2091 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746 invalidate_extent_cache(root, &key, &next_key); 2092 invalidate_extent_cache(root, &key, &next_key);
1747 } 2093 }
@@ -1764,87 +2110,125 @@ out:
1764 sizeof(root_item->drop_progress)); 2110 sizeof(root_item->drop_progress));
1765 root_item->drop_level = 0; 2111 root_item->drop_level = 0;
1766 btrfs_set_root_refs(root_item, 0); 2112 btrfs_set_root_refs(root_item, 0);
2113 btrfs_update_reloc_root(trans, root);
1767 } 2114 }
1768 2115
1769 nr = trans->blocks_used; 2116 nr = trans->blocks_used;
1770 btrfs_end_transaction(trans, root); 2117 btrfs_end_transaction_throttle(trans, root);
1771 2118
1772 btrfs_btree_balance_dirty(root, nr); 2119 btrfs_btree_balance_dirty(root, nr);
1773 2120
1774 put_inodes(&inode_list);
1775
1776 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2121 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777 invalidate_extent_cache(root, &key, &next_key); 2122 invalidate_extent_cache(root, &key, &next_key);
1778 2123
1779 return err; 2124 return err;
1780} 2125}
1781 2126
1782/* 2127static noinline_for_stack
1783 * callback for the work threads. 2128int prepare_to_merge(struct reloc_control *rc, int err)
1784 * this function merges reloc tree with corresponding fs tree,
1785 * and then drops the reloc tree.
1786 */
1787static void merge_func(struct btrfs_work *work)
1788{ 2129{
1789 struct btrfs_trans_handle *trans; 2130 struct btrfs_root *root = rc->extent_root;
1790 struct btrfs_root *root;
1791 struct btrfs_root *reloc_root; 2131 struct btrfs_root *reloc_root;
1792 struct async_merge *async; 2132 struct btrfs_trans_handle *trans;
2133 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0;
2135 int ret;
2136 int retries = 0;
2137
2138 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex);
2142again:
2143 if (!err) {
2144 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries);
2147 if (ret)
2148 err = ret;
2149 }
2150
2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152
2153 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again;
2160 }
2161 }
1793 2162
1794 async = container_of(work, struct async_merge, work); 2163 rc->merge_reloc_tree = 1;
1795 reloc_root = async->root; 2164
2165 while (!list_empty(&rc->reloc_roots)) {
2166 reloc_root = list_entry(rc->reloc_roots.next,
2167 struct btrfs_root, root_list);
2168 list_del_init(&reloc_root->root_list);
1796 2169
1797 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798 root = read_fs_root(reloc_root->fs_info, 2170 root = read_fs_root(reloc_root->fs_info,
1799 reloc_root->root_key.offset); 2171 reloc_root->root_key.offset);
1800 BUG_ON(IS_ERR(root)); 2172 BUG_ON(IS_ERR(root));
1801 BUG_ON(root->reloc_root != reloc_root); 2173 BUG_ON(root->reloc_root != reloc_root);
1802 2174
1803 merge_reloc_root(async->rc, root); 2175 /*
1804 2176 * set reference count to 1, so btrfs_recover_relocation
 1805 trans = btrfs_start_transaction(root, 1); 2177 * knows it should resume merging
2178 */
2179 if (!err)
2180 btrfs_set_root_refs(&reloc_root->root_item, 1);
1806 btrfs_update_reloc_root(trans, root); 2181 btrfs_update_reloc_root(trans, root);
1807 btrfs_end_transaction(trans, root);
1808 }
1809 2182
1810 btrfs_drop_snapshot(reloc_root, 0); 2183 list_add(&reloc_root->root_list, &reloc_roots);
2184 }
1811 2185
1812 if (atomic_dec_and_test(async->num_pending)) 2186 list_splice(&reloc_roots, &rc->reloc_roots);
1813 complete(async->done);
1814 2187
1815 kfree(async); 2188 if (!err)
2189 btrfs_commit_transaction(trans, rc->extent_root);
2190 else
2191 btrfs_end_transaction(trans, rc->extent_root);
2192 return err;
1816} 2193}
1817 2194
1818static int merge_reloc_roots(struct reloc_control *rc) 2195static noinline_for_stack
2196int merge_reloc_roots(struct reloc_control *rc)
1819{ 2197{
1820 struct async_merge *async;
1821 struct btrfs_root *root; 2198 struct btrfs_root *root;
1822 struct completion done; 2199 struct btrfs_root *reloc_root;
1823 atomic_t num_pending; 2200 LIST_HEAD(reloc_roots);
2201 int found = 0;
2202 int ret;
2203again:
2204 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex);
1824 2208
1825 init_completion(&done); 2209 while (!list_empty(&reloc_roots)) {
1826 atomic_set(&num_pending, 1); 2210 found = 1;
2211 reloc_root = list_entry(reloc_roots.next,
2212 struct btrfs_root, root_list);
1827 2213
1828 while (!list_empty(&rc->reloc_roots)) { 2214 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1829 root = list_entry(rc->reloc_roots.next, 2215 root = read_fs_root(reloc_root->fs_info,
1830 struct btrfs_root, root_list); 2216 reloc_root->root_key.offset);
1831 list_del_init(&root->root_list); 2217 BUG_ON(IS_ERR(root));
2218 BUG_ON(root->reloc_root != reloc_root);
1832 2219
1833 async = kmalloc(sizeof(*async), GFP_NOFS); 2220 ret = merge_reloc_root(rc, root);
1834 BUG_ON(!async); 2221 BUG_ON(ret);
1835 async->work.func = merge_func; 2222 } else {
1836 async->work.flags = 0; 2223 list_del_init(&reloc_root->root_list);
1837 async->rc = rc; 2224 }
1838 async->root = root; 2225 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1839 async->done = &done;
1840 async->num_pending = &num_pending;
1841 atomic_inc(&num_pending);
1842 btrfs_queue_worker(&rc->workers, &async->work);
1843 } 2226 }
1844 2227
1845 if (!atomic_dec_and_test(&num_pending)) 2228 if (found) {
1846 wait_for_completion(&done); 2229 found = 0;
1847 2230 goto again;
2231 }
1848 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2232 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849 return 0; 2233 return 0;
1850} 2234}
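
Editor's note: the async_merge worker machinery is removed; prepare_to_merge() sizes and pins the merge reservation and re-arms the root refs so btrfs_recover_relocation can resume after a crash, and merge_reloc_roots() then walks the reloc roots serially, looping via 'again' for roots re-added while merging. A hedged arithmetic sketch of the reservation sized in prepare_to_merge(): one worst-case COW path in each of two trees plus twice the relocated-node bytes; e.g. 4K nodes, BTRFS_MAX_LEVEL == 8 and 1 MiB of relocated nodes give 57344 + 2097152 = 2154496 bytes:

static u64 merging_rsv_sketch(u32 nodesize, u64 nodes_relocated)
{
	return (u64)nodesize * (BTRFS_MAX_LEVEL - 1) * 2 +
	       nodes_relocated * 2;
}
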
@@ -1875,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1875 return btrfs_record_root_in_trans(trans, root); 2259 return btrfs_record_root_in_trans(trans, root);
1876} 2260}
1877 2261
1878/* 2262static noinline_for_stack
1879 * select one tree from trees that references the block. 2263struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1880 * for blocks in refernce counted trees, we preper reloc tree. 2264 struct reloc_control *rc,
1881 * if no reloc tree found and reloc_only is true, NULL is returned. 2265 struct backref_node *node,
1882 */ 2266 struct backref_edge *edges[], int *nr)
1883static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884 struct backref_node *node,
1885 struct backref_edge *edges[],
1886 int *nr, int reloc_only)
1887{ 2267{
1888 struct backref_node *next; 2268 struct backref_node *next;
1889 struct btrfs_root *root; 2269 struct btrfs_root *root;
1890 int index; 2270 int index = 0;
1891 int loop = 0; 2271
1892again:
1893 index = 0;
1894 next = node; 2272 next = node;
1895 while (1) { 2273 while (1) {
1896 cond_resched(); 2274 cond_resched();
1897 next = walk_up_backref(next, edges, &index); 2275 next = walk_up_backref(next, edges, &index);
1898 root = next->root; 2276 root = next->root;
1899 if (!root) { 2277 BUG_ON(!root);
1900 BUG_ON(!node->old_root); 2278 BUG_ON(!root->ref_cows);
1901 goto skip;
1902 }
1903
1904		/* no other choice for non-reference counted tree */
1905 if (!root->ref_cows) {
1906 BUG_ON(reloc_only);
1907 break;
1908 }
1909 2279
1910 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2280 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911 record_reloc_root_in_trans(trans, root); 2281 record_reloc_root_in_trans(trans, root);
1912 break; 2282 break;
1913 } 2283 }
1914 2284
1915 if (loop) { 2285 btrfs_record_root_in_trans(trans, root);
1916 btrfs_record_root_in_trans(trans, root); 2286 root = root->reloc_root;
2287
2288 if (next->new_bytenr != root->node->start) {
2289 BUG_ON(next->new_bytenr);
2290 BUG_ON(!list_empty(&next->list));
2291 next->new_bytenr = root->node->start;
2292 next->root = root;
2293 list_add_tail(&next->list,
2294 &rc->backref_cache.changed);
2295 __mark_block_processed(rc, next);
1917 break; 2296 break;
1918 } 2297 }
1919 2298
1920 if (reloc_only || next != node) { 2299 WARN_ON(1);
1921 if (!root->reloc_root)
1922 btrfs_record_root_in_trans(trans, root);
1923 root = root->reloc_root;
1924 /*
1925			 * if the reloc tree was created in the current
1926			 * transaction, there is no node in the backref tree
1927			 * that corresponds to the root of the reloc tree.
1928 */
1929 if (btrfs_root_last_snapshot(&root->root_item) ==
1930 trans->transid - 1)
1931 break;
1932 }
1933skip:
1934 root = NULL; 2300 root = NULL;
1935 next = walk_down_backref(edges, &index); 2301 next = walk_down_backref(edges, &index);
1936 if (!next || next->level <= node->level) 2302 if (!next || next->level <= node->level)
1937 break; 2303 break;
1938 } 2304 }
2305 if (!root)
2306 return NULL;
1939 2307
1940 if (!root && !loop && !reloc_only) { 2308 *nr = index;
1941 loop = 1; 2309 next = node;
1942 goto again; 2310 /* setup backref node path for btrfs_reloc_cow_block */
2311 while (1) {
2312 rc->backref_cache.path[next->level] = next;
2313 if (--index < 0)
2314 break;
2315 next = edges[index]->node[UPPER];
1943 } 2316 }
1944
1945 if (root)
1946 *nr = index;
1947 else
1948 *nr = 0;
1949
1950 return root; 2317 return root;
1951} 2318}
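
Note how select_reloc_root() leaves rc->backref_cache.path[] filled with one backref node per tree level. The btrfs_reloc_cow_block() hook added near the end of this patch consumes it; roughly (buf being the block under COW):

	level = btrfs_header_level(buf);
	node = rc->backref_cache.path[level];
	BUG_ON(node->bytenr != buf->start &&
	       node->new_bytenr != buf->start);
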
1952 2319
2320/*
2321 * select a tree root for relocation. return NULL if the block
2322 * is reference counted. we should use do_relocation() in this
2323 * case. return a tree root pointer if the block isn't reference
2324 * counted. return -ENOENT if the block is the root of a reloc tree.
2325 */
1953static noinline_for_stack 2326static noinline_for_stack
1954struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2327struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955 struct backref_node *node) 2328 struct backref_node *node)
1956{ 2329{
2330 struct backref_node *next;
2331 struct btrfs_root *root;
2332 struct btrfs_root *fs_root = NULL;
1957 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2333 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958 int nr; 2334 int index = 0;
1959 return __select_one_root(trans, node, edges, &nr, 0); 2335
2336 next = node;
2337 while (1) {
2338 cond_resched();
2339 next = walk_up_backref(next, edges, &index);
2340 root = next->root;
2341 BUG_ON(!root);
2342
2343		/* no other choice for non-reference counted tree */
2344 if (!root->ref_cows)
2345 return root;
2346
2347 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2348 fs_root = root;
2349
2350 if (next != node)
2351 return NULL;
2352
2353 next = walk_down_backref(edges, &index);
2354 if (!next || next->level <= node->level)
2355 break;
2356 }
2357
2358 if (!fs_root)
2359 return ERR_PTR(-ENOENT);
2360 return fs_root;
1960} 2361}
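
The three-way return contract documented above maps onto the dispatch in relocate_tree_block() further down; a simplified sketch of that caller:

	root = select_one_root(trans, node);
	if (root == ERR_PTR(-ENOENT)) {
		/* block is the root of a reloc tree: just mark processed */
	} else if (!root) {
		/* owned by reference counted trees: COW via do_relocation() */
	} else if (root->ref_cows) {
		/* root node of a COW tree: hand the node to its reloc root */
	} else {
		/* non-reference counted tree: COW with a plain search_slot */
	}
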
1961 2362
1962static noinline_for_stack 2363static noinline_for_stack
1963struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2364u64 calcu_metadata_size(struct reloc_control *rc,
1964 struct backref_node *node, 2365 struct backref_node *node, int reserve)
1965 struct backref_edge *edges[], int *nr)
1966{ 2366{
1967 return __select_one_root(trans, node, edges, nr, 1); 2367 struct backref_node *next = node;
2368 struct backref_edge *edge;
2369 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2370 u64 num_bytes = 0;
2371 int index = 0;
2372
2373 BUG_ON(reserve && node->processed);
2374
2375 while (next) {
2376 cond_resched();
2377 while (1) {
2378 if (next->processed && (reserve || next != node))
2379 break;
2380
2381 num_bytes += btrfs_level_size(rc->extent_root,
2382 next->level);
2383
2384 if (list_empty(&next->upper))
2385 break;
2386
2387 edge = list_entry(next->upper.next,
2388 struct backref_edge, list[LOWER]);
2389 edges[index++] = edge;
2390 next = edge->node[UPPER];
2391 }
2392 next = walk_down_backref(edges, &index);
2393 }
2394 return num_bytes;
1968} 2395}
1969 2396
1970static void grab_path_buffers(struct btrfs_path *path, 2397static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1971 struct backref_node *node, 2398 struct reloc_control *rc,
1972 struct backref_edge *edges[], int nr) 2399 struct backref_node *node)
1973{ 2400{
1974 int i = 0; 2401 struct btrfs_root *root = rc->extent_root;
1975 while (1) { 2402 u64 num_bytes;
1976 drop_node_buffer(node); 2403 int ret;
1977 node->eb = path->nodes[node->level]; 2404
1978 BUG_ON(!node->eb); 2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1979 if (path->locks[node->level])
1980 node->locked = 1;
1981 path->nodes[node->level] = NULL;
1982 path->locks[node->level] = 0;
1983
1984 if (i >= nr)
1985 break;
1986 2406
1987 edges[i]->blockptr = node->eb->start; 2407 trans->block_rsv = rc->block_rsv;
1988 node = edges[i]->node[UPPER]; 2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1989 i++; 2409 &rc->block_rsv_retries);
2410 if (ret) {
2411 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1;
2413 return ret;
1990 } 2414 }
2415
2416 rc->block_rsv_retries = 0;
2417 return 0;
2418}
2419
2420static void release_metadata_space(struct reloc_control *rc,
2421 struct backref_node *node)
2422{
2423 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2424 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1991} 2425}
1992 2426
1993/* 2427/*
@@ -1998,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1998 * in that case this function just updates pointers. 2432 * in that case this function just updates pointers.
1999 */ 2433 */
2000static int do_relocation(struct btrfs_trans_handle *trans, 2434static int do_relocation(struct btrfs_trans_handle *trans,
2435 struct reloc_control *rc,
2001 struct backref_node *node, 2436 struct backref_node *node,
2002 struct btrfs_key *key, 2437 struct btrfs_key *key,
2003 struct btrfs_path *path, int lowest) 2438 struct btrfs_path *path, int lowest)
@@ -2018,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2018 BUG_ON(lowest && node->eb); 2453 BUG_ON(lowest && node->eb);
2019 2454
2020 path->lowest_level = node->level + 1; 2455 path->lowest_level = node->level + 1;
2456 rc->backref_cache.path[node->level] = node;
2021 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2457 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022 cond_resched(); 2458 cond_resched();
2023 if (node->eb && node->eb->start == edge->blockptr)
2024 continue;
2025 2459
2026 upper = edge->node[UPPER]; 2460 upper = edge->node[UPPER];
2027 root = select_reloc_root(trans, upper, edges, &nr); 2461 root = select_reloc_root(trans, rc, upper, edges, &nr);
2028 if (!root) 2462 BUG_ON(!root);
2029 continue; 2463
2030 2464 if (upper->eb && !upper->locked) {
2031 if (upper->eb && !upper->locked) 2465 if (!lowest) {
2466 ret = btrfs_bin_search(upper->eb, key,
2467 upper->level, &slot);
2468 BUG_ON(ret);
2469 bytenr = btrfs_node_blockptr(upper->eb, slot);
2470 if (node->eb->start == bytenr)
2471 goto next;
2472 }
2032 drop_node_buffer(upper); 2473 drop_node_buffer(upper);
2474 }
2033 2475
2034 if (!upper->eb) { 2476 if (!upper->eb) {
2035 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2477 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2039,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2039 } 2481 }
2040 BUG_ON(ret > 0); 2482 BUG_ON(ret > 0);
2041 2483
2042 slot = path->slots[upper->level]; 2484 if (!upper->eb) {
2485 upper->eb = path->nodes[upper->level];
2486 path->nodes[upper->level] = NULL;
2487 } else {
2488 BUG_ON(upper->eb != path->nodes[upper->level]);
2489 }
2043 2490
2044 btrfs_unlock_up_safe(path, upper->level + 1); 2491 upper->locked = 1;
2045 grab_path_buffers(path, upper, edges, nr); 2492 path->locks[upper->level] = 0;
2046 2493
2494 slot = path->slots[upper->level];
2047 btrfs_release_path(NULL, path); 2495 btrfs_release_path(NULL, path);
2048 } else { 2496 } else {
2049 ret = btrfs_bin_search(upper->eb, key, upper->level, 2497 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2052,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2052 } 2500 }
2053 2501
2054 bytenr = btrfs_node_blockptr(upper->eb, slot); 2502 bytenr = btrfs_node_blockptr(upper->eb, slot);
2055 if (!lowest) { 2503 if (lowest) {
2056 if (node->eb->start == bytenr) { 2504 BUG_ON(bytenr != node->bytenr);
2057 btrfs_tree_unlock(upper->eb);
2058 upper->locked = 0;
2059 continue;
2060 }
2061 } else { 2505 } else {
2062 BUG_ON(node->bytenr != bytenr); 2506 if (node->eb->start == bytenr)
2507 goto next;
2063 } 2508 }
2064 2509
2065 blocksize = btrfs_level_size(root, node->level); 2510 blocksize = btrfs_level_size(root, node->level);
@@ -2071,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2071 if (!node->eb) { 2516 if (!node->eb) {
2072 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2517 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073 slot, &eb); 2518 slot, &eb);
2519 btrfs_tree_unlock(eb);
2520 free_extent_buffer(eb);
2074 if (ret < 0) { 2521 if (ret < 0) {
2075 err = ret; 2522 err = ret;
2076 break; 2523 goto next;
2077 } 2524 }
2078 btrfs_set_lock_blocking(eb); 2525 BUG_ON(node->eb != eb);
2079 node->eb = eb;
2080 node->locked = 1;
2081 } else { 2526 } else {
2082 btrfs_set_node_blockptr(upper->eb, slot, 2527 btrfs_set_node_blockptr(upper->eb, slot,
2083 node->eb->start); 2528 node->eb->start);
@@ -2095,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2095 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2540 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096 BUG_ON(ret); 2541 BUG_ON(ret);
2097 } 2542 }
2098 if (!lowest) { 2543next:
2099 btrfs_tree_unlock(upper->eb); 2544 if (!upper->pending)
2100 upper->locked = 0; 2545 drop_node_buffer(upper);
2101 } 2546 else
2547 unlock_node_buffer(upper);
2548 if (err)
2549 break;
2102 } 2550 }
2551
2552 if (!err && node->pending) {
2553 drop_node_buffer(node);
2554 list_move_tail(&node->list, &rc->backref_cache.changed);
2555 node->pending = 0;
2556 }
2557
2103 path->lowest_level = 0; 2558 path->lowest_level = 0;
2559 BUG_ON(err == -ENOSPC);
2104 return err; 2560 return err;
2105} 2561}
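
do_relocation() runs in two modes selected by the `lowest` flag; the patch's two call sites reduce to:

	/* from relocate_tree_block(): COW the block itself into place */
	ret = do_relocation(trans, rc, node, key, path, 1);

	/* from link_to_upper(): re-point parents at a block that has
	 * already been relocated (node->eb is the new copy) */
	ret = do_relocation(trans, rc, node, &key, path, 0);
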
2106 2562
2107static int link_to_upper(struct btrfs_trans_handle *trans, 2563static int link_to_upper(struct btrfs_trans_handle *trans,
2564 struct reloc_control *rc,
2108 struct backref_node *node, 2565 struct backref_node *node,
2109 struct btrfs_path *path) 2566 struct btrfs_path *path)
2110{ 2567{
2111 struct btrfs_key key; 2568 struct btrfs_key key;
2112 if (!node->eb || list_empty(&node->upper))
2113 return 0;
2114 2569
2115 btrfs_node_key_to_cpu(node->eb, &key, 0); 2570 btrfs_node_key_to_cpu(node->eb, &key, 0);
2116 return do_relocation(trans, node, &key, path, 0); 2571 return do_relocation(trans, rc, node, &key, path, 0);
2117} 2572}
2118 2573
2119static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2574static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120 struct backref_cache *cache, 2575 struct reloc_control *rc,
2121 struct btrfs_path *path) 2576 struct btrfs_path *path, int err)
2122{ 2577{
2578 LIST_HEAD(list);
2579 struct backref_cache *cache = &rc->backref_cache;
2123 struct backref_node *node; 2580 struct backref_node *node;
2124 int level; 2581 int level;
2125 int ret; 2582 int ret;
2126 int err = 0;
2127 2583
2128 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2584 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129 while (!list_empty(&cache->pending[level])) { 2585 while (!list_empty(&cache->pending[level])) {
2130 node = list_entry(cache->pending[level].next, 2586 node = list_entry(cache->pending[level].next,
2131 struct backref_node, lower); 2587 struct backref_node, list);
2132 BUG_ON(node->level != level); 2588 list_move_tail(&node->list, &list);
2589 BUG_ON(!node->pending);
2133 2590
2134 ret = link_to_upper(trans, node, path); 2591 if (!err) {
2135 if (ret < 0) 2592 ret = link_to_upper(trans, rc, node, path);
2136 err = ret; 2593 if (ret < 0)
2137 /* 2594 err = ret;
2138 * this remove the node from the pending list and 2595 }
2139 * may add some other nodes to the level + 1
2140 * pending list
2141 */
2142 remove_backref_node(cache, node);
2143 } 2596 }
2597 list_splice_init(&list, &cache->pending[level]);
2144 } 2598 }
2145 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146 return err; 2599 return err;
2147} 2600}
2148 2601
2149static void mark_block_processed(struct reloc_control *rc, 2602static void mark_block_processed(struct reloc_control *rc,
2150 struct backref_node *node) 2603 u64 bytenr, u32 blocksize)
2604{
2605 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2606 EXTENT_DIRTY, GFP_NOFS);
2607}
2608
2609static void __mark_block_processed(struct reloc_control *rc,
2610 struct backref_node *node)
2151{ 2611{
2152 u32 blocksize; 2612 u32 blocksize;
2153 if (node->level == 0 || 2613 if (node->level == 0 ||
2154 in_block_group(node->bytenr, rc->block_group)) { 2614 in_block_group(node->bytenr, rc->block_group)) {
2155 blocksize = btrfs_level_size(rc->extent_root, node->level); 2615 blocksize = btrfs_level_size(rc->extent_root, node->level);
2156 set_extent_bits(&rc->processed_blocks, node->bytenr, 2616 mark_block_processed(rc, node->bytenr, blocksize);
2157 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158 GFP_NOFS);
2159 } 2617 }
2160 node->processed = 1; 2618 node->processed = 1;
2161} 2619}
@@ -2178,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2178 if (next->processed) 2636 if (next->processed)
2179 break; 2637 break;
2180 2638
2181 mark_block_processed(rc, next); 2639 __mark_block_processed(rc, next);
2182 2640
2183 if (list_empty(&next->upper)) 2641 if (list_empty(&next->upper))
2184 break; 2642 break;
@@ -2201,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2201 return 0; 2659 return 0;
2202} 2660}
2203 2661
2204/*
2205 * check if there are any file extent pointers in the leaf that point
2206 * to data requiring processing
2207 */
2208static int check_file_extents(struct reloc_control *rc,
2209 u64 bytenr, u32 blocksize, u64 ptr_gen)
2210{
2211 struct btrfs_key found_key;
2212 struct btrfs_file_extent_item *fi;
2213 struct extent_buffer *leaf;
2214 u32 nritems;
2215 int i;
2216 int ret = 0;
2217
2218 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219
2220 nritems = btrfs_header_nritems(leaf);
2221 for (i = 0; i < nritems; i++) {
2222 cond_resched();
2223 btrfs_item_key_to_cpu(leaf, &found_key, i);
2224 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225 continue;
2226 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227 if (btrfs_file_extent_type(leaf, fi) ==
2228 BTRFS_FILE_EXTENT_INLINE)
2229 continue;
2230 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231 if (bytenr == 0)
2232 continue;
2233 if (in_block_group(bytenr, rc->block_group)) {
2234 ret = 1;
2235 break;
2236 }
2237 }
2238 free_extent_buffer(leaf);
2239 return ret;
2240}
2241
2242/*
2243 * scan child blocks of a given block to find blocks that require processing
2244 */
2245static int add_child_blocks(struct btrfs_trans_handle *trans,
2246 struct reloc_control *rc,
2247 struct backref_node *node,
2248 struct rb_root *blocks)
2249{
2250 struct tree_block *block;
2251 struct rb_node *rb_node;
2252 u64 bytenr;
2253 u64 ptr_gen;
2254 u32 blocksize;
2255 u32 nritems;
2256 int i;
2257 int err = 0;
2258
2259 nritems = btrfs_header_nritems(node->eb);
2260 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272
2273 readahead_tree_block(rc->extent_root,
2274 bytenr, blocksize, ptr_gen);
2275 }
2276
2277 for (i = 0; i < nritems; i++) {
2278 cond_resched();
2279 bytenr = btrfs_node_blockptr(node->eb, i);
2280 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281 if (ptr_gen == trans->transid)
2282 continue;
2283 if (!in_block_group(bytenr, rc->block_group) &&
2284 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285 continue;
2286 if (tree_block_processed(bytenr, blocksize, rc))
2287 continue;
2288 if (!in_block_group(bytenr, rc->block_group) &&
2289 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290 continue;
2291
2292 block = kmalloc(sizeof(*block), GFP_NOFS);
2293 if (!block) {
2294 err = -ENOMEM;
2295 break;
2296 }
2297 block->bytenr = bytenr;
2298 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299 block->level = node->level - 1;
2300 block->key_ready = 1;
2301 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302 BUG_ON(rb_node);
2303 }
2304 if (err)
2305 free_block_list(blocks);
2306 return err;
2307}
2308
2309/*
2310 * find adjacent blocks that require processing
2311 */
2312static noinline_for_stack
2313int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314 struct reloc_control *rc,
2315 struct backref_cache *cache,
2316 struct rb_root *blocks, int level,
2317 struct backref_node **upper)
2318{
2319 struct backref_node *node;
2320 int ret = 0;
2321
2322 WARN_ON(!list_empty(&cache->pending[level]));
2323
2324 if (list_empty(&cache->pending[level + 1]))
2325 return 1;
2326
2327 node = list_entry(cache->pending[level + 1].next,
2328 struct backref_node, lower);
2329 if (node->eb)
2330 ret = add_child_blocks(trans, rc, node, blocks);
2331
2332 *upper = node;
2333 return ret;
2334}
2335
2336static int get_tree_block_key(struct reloc_control *rc, 2662static int get_tree_block_key(struct reloc_control *rc,
2337 struct tree_block *block) 2663 struct tree_block *block)
2338{ 2664{
@@ -2370,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2370 struct btrfs_path *path) 2696 struct btrfs_path *path)
2371{ 2697{
2372 struct btrfs_root *root; 2698 struct btrfs_root *root;
2373 int ret; 2699 int release = 0;
2700 int ret = 0;
2374 2701
2702 if (!node)
2703 return 0;
2704
2705 BUG_ON(node->processed);
2375 root = select_one_root(trans, node); 2706 root = select_one_root(trans, node);
2376 if (unlikely(!root)) { 2707 if (root == ERR_PTR(-ENOENT)) {
2377 rc->found_old_snapshot = 1;
2378 update_processed_blocks(rc, node); 2708 update_processed_blocks(rc, node);
2379 return 0; 2709 goto out;
2380 } 2710 }
2381 2711
2382 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2712 if (!root || root->ref_cows) {
2383 ret = do_relocation(trans, node, key, path, 1); 2713 ret = reserve_metadata_space(trans, rc, node);
2384 if (ret < 0) 2714 if (ret)
2385 goto out;
2386 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387 ret = replace_file_extents(trans, rc, root,
2388 node->eb, NULL);
2389 if (ret < 0)
2390 goto out;
2391 }
2392 drop_node_buffer(node);
2393 } else if (!root->ref_cows) {
2394 path->lowest_level = node->level;
2395 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396 btrfs_release_path(root, path);
2397 if (ret < 0)
2398 goto out; 2715 goto out;
2399 } else if (root != node->root) { 2716 release = 1;
2400 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401 } 2717 }
2402 2718
2403 update_processed_blocks(rc, node); 2719 if (root) {
2404 ret = 0; 2720 if (root->ref_cows) {
2721 BUG_ON(node->new_bytenr);
2722 BUG_ON(!list_empty(&node->list));
2723 btrfs_record_root_in_trans(trans, root);
2724 root = root->reloc_root;
2725 node->new_bytenr = root->node->start;
2726 node->root = root;
2727 list_add_tail(&node->list, &rc->backref_cache.changed);
2728 } else {
2729 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path);
2732 if (ret > 0)
2733 ret = 0;
2734 }
2735 if (!ret)
2736 update_processed_blocks(rc, node);
2737 } else {
2738 ret = do_relocation(trans, rc, node, key, path, 1);
2739 }
2405out: 2740out:
2406 drop_node_buffer(node); 2741 if (ret || node->level == 0 || node->cowonly) {
2742 if (release)
2743 release_metadata_space(rc, node);
2744 remove_backref_node(&rc->backref_cache, node);
2745 }
2407 return ret; 2746 return ret;
2408} 2747}
2409 2748
@@ -2414,12 +2753,10 @@ static noinline_for_stack
2414int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2753int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415 struct reloc_control *rc, struct rb_root *blocks) 2754 struct reloc_control *rc, struct rb_root *blocks)
2416{ 2755{
2417 struct backref_cache *cache;
2418 struct backref_node *node; 2756 struct backref_node *node;
2419 struct btrfs_path *path; 2757 struct btrfs_path *path;
2420 struct tree_block *block; 2758 struct tree_block *block;
2421 struct rb_node *rb_node; 2759 struct rb_node *rb_node;
2422 int level = -1;
2423 int ret; 2760 int ret;
2424 int err = 0; 2761 int err = 0;
2425 2762
@@ -2427,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2427 if (!path) 2764 if (!path)
2428 return -ENOMEM; 2765 return -ENOMEM;
2429 2766
2430 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431 if (!cache) {
2432 btrfs_free_path(path);
2433 return -ENOMEM;
2434 }
2435
2436 backref_cache_init(cache);
2437
2438 rb_node = rb_first(blocks); 2767 rb_node = rb_first(blocks);
2439 while (rb_node) { 2768 while (rb_node) {
2440 block = rb_entry(rb_node, struct tree_block, rb_node); 2769 block = rb_entry(rb_node, struct tree_block, rb_node);
2441 if (level == -1)
2442 level = block->level;
2443 else
2444 BUG_ON(level != block->level);
2445 if (!block->key_ready) 2770 if (!block->key_ready)
2446 reada_tree_block(rc, block); 2771 reada_tree_block(rc, block);
2447 rb_node = rb_next(rb_node); 2772 rb_node = rb_next(rb_node);
@@ -2459,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2459 while (rb_node) { 2784 while (rb_node) {
2460 block = rb_entry(rb_node, struct tree_block, rb_node); 2785 block = rb_entry(rb_node, struct tree_block, rb_node);
2461 2786
2462 node = build_backref_tree(rc, cache, &block->key, 2787 node = build_backref_tree(rc, &block->key,
2463 block->level, block->bytenr); 2788 block->level, block->bytenr);
2464 if (IS_ERR(node)) { 2789 if (IS_ERR(node)) {
2465 err = PTR_ERR(node); 2790 err = PTR_ERR(node);
@@ -2469,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2469 ret = relocate_tree_block(trans, rc, node, &block->key, 2794 ret = relocate_tree_block(trans, rc, node, &block->key,
2470 path); 2795 path);
2471 if (ret < 0) { 2796 if (ret < 0) {
2472 err = ret; 2797 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2798 err = ret;
2473 goto out; 2799 goto out;
2474 } 2800 }
2475 remove_backref_node(cache, node);
2476 rb_node = rb_next(rb_node); 2801 rb_node = rb_next(rb_node);
2477 } 2802 }
2478 2803out:
2479 if (level > 0)
2480 goto out;
2481
2482 free_block_list(blocks); 2804 free_block_list(blocks);
2805 err = finish_pending_nodes(trans, rc, path, err);
2483 2806
2484 /* 2807 btrfs_free_path(path);
2485 * now backrefs of some upper level tree blocks have been cached, 2808 return err;
2486 * try relocating blocks referenced by these upper level blocks. 2809}
2487 */
2488 while (1) {
2489 struct backref_node *upper = NULL;
2490 if (trans->transaction->in_commit ||
2491 trans->transaction->delayed_refs.flushing)
2492 break;
2493 2810
2494 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2811static noinline_for_stack
2495 &upper); 2812int prealloc_file_extent_cluster(struct inode *inode,
2496 if (ret < 0) 2813 struct file_extent_cluster *cluster)
2497 err = ret; 2814{
2498 if (ret != 0) 2815 u64 alloc_hint = 0;
2499 break; 2816 u64 start;
2817 u64 end;
2818 u64 offset = BTRFS_I(inode)->index_cnt;
2819 u64 num_bytes;
2820 int nr = 0;
2821 int ret = 0;
2500 2822
2501 rb_node = rb_first(blocks); 2823 BUG_ON(cluster->start != cluster->boundary[0]);
2502 while (rb_node) { 2824 mutex_lock(&inode->i_mutex);
2503 block = rb_entry(rb_node, struct tree_block, rb_node);
2504 if (trans->transaction->in_commit ||
2505 trans->transaction->delayed_refs.flushing)
2506 goto out;
2507 BUG_ON(!block->key_ready);
2508 node = build_backref_tree(rc, cache, &block->key,
2509 level, block->bytenr);
2510 if (IS_ERR(node)) {
2511 err = PTR_ERR(node);
2512 goto out;
2513 }
2514 2825
2515 ret = relocate_tree_block(trans, rc, node, 2826 ret = btrfs_check_data_free_space(inode, cluster->end +
2516 &block->key, path); 2827 1 - cluster->start);
2517 if (ret < 0) { 2828 if (ret)
2518 err = ret; 2829 goto out;
2519 goto out;
2520 }
2521 remove_backref_node(cache, node);
2522 rb_node = rb_next(rb_node);
2523 }
2524 free_block_list(blocks);
2525 2830
2526 if (upper) { 2831 while (nr < cluster->nr) {
2527 ret = link_to_upper(trans, upper, path); 2832 start = cluster->boundary[nr] - offset;
2528 if (ret < 0) { 2833 if (nr + 1 < cluster->nr)
2529 err = ret; 2834 end = cluster->boundary[nr + 1] - 1 - offset;
2530 break; 2835 else
2531 } 2836 end = cluster->end - offset;
2532 remove_backref_node(cache, upper); 2837
2533 } 2838 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2839 num_bytes = end + 1 - start;
2840 ret = btrfs_prealloc_file_range(inode, 0, start,
2841 num_bytes, num_bytes,
2842 end + 1, &alloc_hint);
2843 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2844 if (ret)
2845 break;
2846 nr++;
2534 } 2847 }
2848 btrfs_free_reserved_data_space(inode, cluster->end +
2849 1 - cluster->start);
2535out: 2850out:
2536 free_block_list(blocks); 2851 mutex_unlock(&inode->i_mutex);
2537 2852 return ret;
2538 ret = finish_pending_nodes(trans, cache, path);
2539 if (ret < 0)
2540 err = ret;
2541
2542 kfree(cache);
2543 btrfs_free_path(path);
2544 return err;
2545} 2853}
2546 2854
2547static noinline_for_stack 2855static noinline_for_stack
@@ -2587,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2587 u64 offset = BTRFS_I(inode)->index_cnt; 2895 u64 offset = BTRFS_I(inode)->index_cnt;
2588 unsigned long index; 2896 unsigned long index;
2589 unsigned long last_index; 2897 unsigned long last_index;
2590 unsigned int dirty_page = 0;
2591 struct page *page; 2898 struct page *page;
2592 struct file_ra_state *ra; 2899 struct file_ra_state *ra;
2593 int nr = 0; 2900 int nr = 0;
@@ -2600,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2600 if (!ra) 2907 if (!ra)
2601 return -ENOMEM; 2908 return -ENOMEM;
2602 2909
2603 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2910 ret = prealloc_file_extent_cluster(inode, cluster);
2604 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2911 if (ret)
2912 goto out;
2605 2913
2606 mutex_lock(&inode->i_mutex); 2914 file_ra_state_init(ra, inode->i_mapping);
2607 2915
2608 i_size_write(inode, cluster->end + 1 - offset);
2609 ret = setup_extent_mapping(inode, cluster->start - offset, 2916 ret = setup_extent_mapping(inode, cluster->start - offset,
2610 cluster->end - offset, cluster->start); 2917 cluster->end - offset, cluster->start);
2611 if (ret) 2918 if (ret)
2612 goto out_unlock; 2919 goto out;
2613
2614 file_ra_state_init(ra, inode->i_mapping);
2615 2920
2616 WARN_ON(cluster->start != cluster->boundary[0]); 2921 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2922 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2617 while (index <= last_index) { 2923 while (index <= last_index) {
2924 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2925 if (ret)
2926 goto out;
2927
2618 page = find_lock_page(inode->i_mapping, index); 2928 page = find_lock_page(inode->i_mapping, index);
2619 if (!page) { 2929 if (!page) {
2620 page_cache_sync_readahead(inode->i_mapping, 2930 page_cache_sync_readahead(inode->i_mapping,
@@ -2622,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2622 last_index + 1 - index); 2932 last_index + 1 - index);
2623 page = grab_cache_page(inode->i_mapping, index); 2933 page = grab_cache_page(inode->i_mapping, index);
2624 if (!page) { 2934 if (!page) {
2935 btrfs_delalloc_release_metadata(inode,
2936 PAGE_CACHE_SIZE);
2625 ret = -ENOMEM; 2937 ret = -ENOMEM;
2626 goto out_unlock; 2938 goto out;
2627 } 2939 }
2628 } 2940 }
2629 2941
@@ -2639,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2639 if (!PageUptodate(page)) { 2951 if (!PageUptodate(page)) {
2640 unlock_page(page); 2952 unlock_page(page);
2641 page_cache_release(page); 2953 page_cache_release(page);
2954 btrfs_delalloc_release_metadata(inode,
2955 PAGE_CACHE_SIZE);
2642 ret = -EIO; 2956 ret = -EIO;
2643 goto out_unlock; 2957 goto out;
2644 } 2958 }
2645 } 2959 }
2646 2960
@@ -2659,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2973 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2974 nr++;
2661 } 2975 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end);
2663 2976
2977 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 set_page_dirty(page); 2978 set_page_dirty(page);
2665 dirty_page++;
2666 2979
2667 unlock_extent(&BTRFS_I(inode)->io_tree, 2980 unlock_extent(&BTRFS_I(inode)->io_tree,
2668 page_start, page_end, GFP_NOFS); 2981 page_start, page_end, GFP_NOFS);
@@ -2670,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2670 page_cache_release(page); 2983 page_cache_release(page);
2671 2984
2672 index++; 2985 index++;
2673 if (nr < cluster->nr && 2986 balance_dirty_pages_ratelimited(inode->i_mapping);
2674 page_end + 1 + offset == cluster->boundary[nr]) { 2987 btrfs_throttle(BTRFS_I(inode)->root);
2675 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676 dirty_page);
2677 dirty_page = 0;
2678 }
2679 }
2680 if (dirty_page) {
2681 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682 dirty_page);
2683 } 2988 }
2684 WARN_ON(nr != cluster->nr); 2989 WARN_ON(nr != cluster->nr);
2685out_unlock: 2990out:
2686 mutex_unlock(&inode->i_mutex);
2687 kfree(ra); 2991 kfree(ra);
2688 return ret; 2992 return ret;
2689} 2993}
@@ -2869,9 +3173,6 @@ out:
2869static int block_use_full_backref(struct reloc_control *rc, 3173static int block_use_full_backref(struct reloc_control *rc,
2870 struct extent_buffer *eb) 3174 struct extent_buffer *eb)
2871{ 3175{
2872 struct btrfs_path *path;
2873 struct btrfs_extent_item *ei;
2874 struct btrfs_key key;
2875 u64 flags; 3176 u64 flags;
2876 int ret; 3177 int ret;
2877 3178
@@ -2879,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2879 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3180 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880 return 1; 3181 return 1;
2881 3182
2882 path = btrfs_alloc_path(); 3183 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2883 BUG_ON(!path); 3184 eb->start, eb->len, NULL, &flags);
2884
2885 key.objectid = eb->start;
2886 key.type = BTRFS_EXTENT_ITEM_KEY;
2887 key.offset = eb->len;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, rc->extent_root,
2892 &key, path, 0, 0);
2893 BUG_ON(ret); 3185 BUG_ON(ret);
2894 3186
2895 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896 struct btrfs_extent_item);
2897 flags = btrfs_extent_flags(path->nodes[0], ei);
2898 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3187 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900 ret = 1; 3188 ret = 1;
2901 else 3189 else
2902 ret = 0; 3190 ret = 0;
2903 btrfs_free_path(path);
2904 return ret; 3191 return ret;
2905} 3192}
2906 3193
@@ -3073,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
3073 struct btrfs_extent_inline_ref *iref; 3360 struct btrfs_extent_inline_ref *iref;
3074 unsigned long ptr; 3361 unsigned long ptr;
3075 unsigned long end; 3362 unsigned long end;
3076 u32 blocksize; 3363 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3077 int ret; 3364 int ret;
3078 int err = 0; 3365 int err = 0;
3079 3366
3080 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081 extent_key->offset);
3082 BUG_ON(ret < 0);
3083 if (ret > 0) {
3084 /* the relocated data is fragmented */
3085 rc->extents_skipped++;
3086 btrfs_release_path(rc->extent_root, path);
3087 return 0;
3088 }
3089
3090 blocksize = btrfs_level_size(rc->extent_root, 0);
3091
3092 eb = path->nodes[0]; 3367 eb = path->nodes[0];
3093 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3368 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3369 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3169,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
3169 */ 3444 */
3170static noinline_for_stack 3445static noinline_for_stack
3171int find_next_extent(struct btrfs_trans_handle *trans, 3446int find_next_extent(struct btrfs_trans_handle *trans,
3172 struct reloc_control *rc, struct btrfs_path *path) 3447 struct reloc_control *rc, struct btrfs_path *path,
3448 struct btrfs_key *extent_key)
3173{ 3449{
3174 struct btrfs_key key; 3450 struct btrfs_key key;
3175 struct extent_buffer *leaf; 3451 struct extent_buffer *leaf;
@@ -3224,6 +3500,7 @@ next:
3224 rc->search_start = end + 1; 3500 rc->search_start = end + 1;
3225 } else { 3501 } else {
3226 rc->search_start = key.objectid + key.offset; 3502 rc->search_start = key.objectid + key.offset;
3503 memcpy(extent_key, &key, sizeof(key));
3227 return 0; 3504 return 0;
3228 } 3505 }
3229 } 3506 }
@@ -3261,12 +3538,49 @@ static int check_extent_flags(u64 flags)
3261 return 0; 3538 return 0;
3262} 3539}
3263 3540
3541static noinline_for_stack
3542int prepare_to_relocate(struct reloc_control *rc)
3543{
3544 struct btrfs_trans_handle *trans;
3545 int ret;
3546
3547 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3548 if (!rc->block_rsv)
3549 return -ENOMEM;
3550
3551 /*
3552 * reserve some space for creating reloc trees.
3553	 * btrfs_init_reloc_root will use it when there
3554	 * is no reservation in the transaction handle.
3555 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256,
3558 &rc->block_rsv_retries);
3559 if (ret)
3560 return ret;
3561
3562 rc->block_rsv->refill_used = 1;
3563 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3564
3565 memset(&rc->cluster, 0, sizeof(rc->cluster));
3566 rc->search_start = rc->block_group->key.objectid;
3567 rc->extents_found = 0;
3568 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571
3572 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc);
3574
3575 trans = btrfs_join_transaction(rc->extent_root, 1);
3576 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0;
3578}
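
For scale, the reservation made above is nodesize * 256; assuming the 4KiB nodesize common at the time (an assumption, the code itself is size-agnostic):

	/* 4096 * 256 = 1 MiB, enough for btrfs_init_reloc_root() to
	 * COW reloc tree nodes from a handle with no reservation */
	u64 reloc_rsv = (u64)rc->extent_root->nodesize * 256;
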
3264 3579
3265static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3580static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266{ 3581{
3267 struct rb_root blocks = RB_ROOT; 3582 struct rb_root blocks = RB_ROOT;
3268 struct btrfs_key key; 3583 struct btrfs_key key;
3269 struct file_extent_cluster *cluster;
3270 struct btrfs_trans_handle *trans = NULL; 3584 struct btrfs_trans_handle *trans = NULL;
3271 struct btrfs_path *path; 3585 struct btrfs_path *path;
3272 struct btrfs_extent_item *ei; 3586 struct btrfs_extent_item *ei;
@@ -3276,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3276 int ret; 3590 int ret;
3277 int err = 0; 3591 int err = 0;
3278 3592
3279 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280 if (!cluster)
3281 return -ENOMEM;
3282
3283 path = btrfs_alloc_path(); 3593 path = btrfs_alloc_path();
3284 if (!path) { 3594 if (!path)
3285 kfree(cluster);
3286 return -ENOMEM; 3595 return -ENOMEM;
3287 }
3288
3289 rc->extents_found = 0;
3290 rc->extents_skipped = 0;
3291
3292 rc->search_start = rc->block_group->key.objectid;
3293 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3294 GFP_NOFS);
3295
3296 rc->create_reloc_root = 1;
3297 set_reloc_control(rc);
3298 3596
3299 trans = btrfs_start_transaction(rc->extent_root, 1); 3597 ret = prepare_to_relocate(rc);
3300 btrfs_commit_transaction(trans, rc->extent_root); 3598 if (ret) {
3599 err = ret;
3600 goto out_free;
3601 }
3301 3602
3302 while (1) { 3603 while (1) {
3303 trans = btrfs_start_transaction(rc->extent_root, 1); 3604 trans = btrfs_start_transaction(rc->extent_root, 0);
3304 3605
3305 ret = find_next_extent(trans, rc, path); 3606 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root);
3608 continue;
3609 }
3610
3611 ret = find_next_extent(trans, rc, path, &key);
3306 if (ret < 0) 3612 if (ret < 0)
3307 err = ret; 3613 err = ret;
3308 if (ret != 0) 3614 if (ret != 0)
@@ -3312,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3312 3618
3313 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3619 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3314 struct btrfs_extent_item); 3620 struct btrfs_extent_item);
3315 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3621 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3316 item_size = btrfs_item_size_nr(path->nodes[0],
3317 path->slots[0]);
3318 if (item_size >= sizeof(*ei)) { 3622 if (item_size >= sizeof(*ei)) {
3319 flags = btrfs_extent_flags(path->nodes[0], ei); 3623 flags = btrfs_extent_flags(path->nodes[0], ei);
3320 ret = check_extent_flags(flags); 3624 ret = check_extent_flags(flags);
@@ -3355,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3355 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3356 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3357 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3358 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3359 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3360 } else { 3664 } else {
3361 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3362 ret = 0; 3666 ret = 0;
3363 } 3667 }
3364 if (ret < 0) { 3668 if (ret < 0) {
3365 err = 0; 3669 err = ret;
3366 break; 3670 break;
3367 } 3671 }
3368 3672
3369 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3370 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3371 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3372 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3373 break; 3691 break;
3374 } 3692 }
3693 rc->commit_transaction = 1;
3375 } 3694 }
3376 3695
3377 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3378 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3379 trans = NULL; 3705 trans = NULL;
3380 btrfs_btree_balance_dirty(rc->extent_root, nr);
3381 3706
3382 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3383 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3384 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3385 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3386 &key, cluster); 3711 &key, &rc->cluster);
3387 if (ret < 0) { 3712 if (ret < 0) {
3388 err = ret; 3713 err = ret;
3389 break; 3714 break;
3390 } 3715 }
3391 } 3716 }
3392 } 3717 }
3393 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3394 3722
3395 if (trans) { 3723 if (trans) {
3396 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3397 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3398 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3399 } 3727 }
3400 3728
3401 if (!err) { 3729 if (!err) {
3402 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3403 if (ret < 0) 3732 if (ret < 0)
3404 err = ret; 3733 err = ret;
3405 } 3734 }
3406 3735
3407 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3408 3738
3409 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3410 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3411 3741
3412 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3413 trans = btrfs_start_transaction(rc->extent_root, 1);
3414 btrfs_commit_transaction(trans, rc->extent_root);
3415 }
3416 3743
3417 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3418 3745
3746 rc->merge_reloc_tree = 0;
3419 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3420 3749
3421 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3422 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3423 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3424 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3425 return err; 3756 return err;
3426} 3757}
3427 3758
@@ -3447,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3447 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3448 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3449 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3450 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3451 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3452 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3453out: 3785out:
@@ -3459,8 +3791,9 @@ out:
3459 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3460 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3461 */ 3793 */
3462static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3463 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3464{ 3797{
3465 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3466 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3474,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3474 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3475 return ERR_CAST(root); 3808 return ERR_CAST(root);
3476 3809
3477 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3478 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3479 3813
3480 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3481 if (err) 3815 if (err)
@@ -3487,7 +3821,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3821 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3822 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3823 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3824 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3825 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3826 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3827
@@ -3495,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3495out: 3829out:
3496 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3497 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3498
3499 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3500 if (err) { 3833 if (err) {
3501 if (inode) 3834 if (inode)
@@ -3505,6 +3838,21 @@ out:
3505 return inode; 3838 return inode;
3506} 3839}
3507 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3508/* 3856/*
3509 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3510 */ 3858 */
@@ -3513,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3513 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3514 struct reloc_control *rc; 3862 struct reloc_control *rc;
3515 int ret; 3863 int ret;
3864 int rw = 0;
3516 int err = 0; 3865 int err = 0;
3517 3866
3518 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3519 if (!rc) 3868 if (!rc)
3520 return -ENOMEM; 3869 return -ENOMEM;
3521 3870
3522 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3523 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3524 INIT_LIST_HEAD(&rc->reloc_roots);
3525 3872
3526 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3527 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3528 3875
3529 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3530 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3531 3878 if (ret) {
3532 rc->extent_root = extent_root; 3879 err = ret;
3533 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3534 3884
3535 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3536 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3547,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3547 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3548 3898
3549 while (1) { 3899 while (1) {
3550 rc->extents_found = 0;
3551 rc->extents_skipped = 0;
3552
3553 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3554 3901
3555 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3558,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3558 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3559 if (ret < 0) { 3906 if (ret < 0) {
3560 err = ret; 3907 err = ret;
3561 break; 3908 goto out;
3562 } 3909 }
3563 3910
3564 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3572,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3572 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3573 0, -1); 3920 0, -1);
3574 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3575 } else if (rc->stage == UPDATE_DATA_PTRS &&
3576 rc->extents_skipped >= rc->extents_found) {
3577 iput(rc->data_inode);
3578 rc->data_inode = create_reloc_inode(fs_info,
3579 rc->block_group);
3580 if (IS_ERR(rc->data_inode)) {
3581 err = PTR_ERR(rc->data_inode);
3582 rc->data_inode = NULL;
3583 break;
3584 }
3585 rc->stage = MOVE_DATA_EXTENTS;
3586 rc->found_file_extent = 0;
3587 } 3922 }
3588 } 3923 }
3589 3924
@@ -3596,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3596 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3597 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3598out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3599 iput(rc->data_inode); 3936 iput(rc->data_inode);
3600 btrfs_stop_workers(&rc->workers);
3601 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3602 kfree(rc); 3938 kfree(rc);
3603 return err; 3939 return err;
@@ -3608,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3608 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3609 int ret; 3945 int ret;
3610 3946
3611 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3612 3948
3613 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3614 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3701,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3702 goto out; 4038 goto out;
3703 4039
3704 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3705 if (!rc) { 4041 if (!rc) {
3706 err = -ENOMEM; 4042 err = -ENOMEM;
3707 goto out; 4043 goto out;
3708 } 4044 }
3709 4045
3710 mapping_tree_init(&rc->reloc_root_tree);
3711 INIT_LIST_HEAD(&rc->reloc_roots);
3712 btrfs_init_workers(&rc->workers, "relocate",
3713 root->fs_info->thread_pool_size, NULL);
3714 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3715 4047
3716 set_reloc_control(rc); 4048 set_reloc_control(rc);
3717 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3718 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3719 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3720 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3734,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3734 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3735 } 4071 }
3736 4072
3737 trans = btrfs_start_transaction(rc->extent_root, 1);
3738 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3739 4074
3740 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3741 4076
3742 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3743 4078
3744 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3745 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3746out: 4081out:
3747 if (rc) { 4082 kfree(rc);
3748 btrfs_stop_workers(&rc->workers);
3749 kfree(rc);
3750 }
3751 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3752 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3753 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3764,7 +4096,8 @@ out:
3764 BTRFS_DATA_RELOC_TREE_OBJECTID); 4096 BTRFS_DATA_RELOC_TREE_OBJECTID);
3765 if (IS_ERR(fs_root)) 4097 if (IS_ERR(fs_root))
3766 err = PTR_ERR(fs_root); 4098 err = PTR_ERR(fs_root);
3767 btrfs_orphan_cleanup(fs_root); 4099 else
4100 btrfs_orphan_cleanup(fs_root);
3768 } 4101 }
3769 return err; 4102 return err;
3770} 4103}
@@ -3812,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3812 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3813 return 0; 4146 return 0;
3814} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * required for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230	 * tree is dropped. if we create a snapshot, cowing the fs
4231	 * tree may use more space than it frees. so we need to
4232	 * reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
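
The accounting in the comment can be made concrete with invented numbers: merging may COW every relocated node twice, once in the reloc tree and once in the fs tree, and only the reloc-tree half is returned when that tree is dropped.

	/* worked example, figures illustrative only:
	 *   rc->nodes_relocated          = 8 MiB of moved tree nodes
	 *   worst-case merge cost        = 2 * 8 MiB = 16 MiB
	 *   freed when reloc tree drops  = 8 MiB
	 * so a snapshot taken mid-merge must carry the unrecovered
	 * fs-tree half on top of its own needs:
	 */
	*bytes_to_reserve += rc->nodes_relocated;	/* += 8 MiB here */
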
4236
4237/*
4238 * called after the snapshot is created. migrates the block reservation
4239 * and creates a reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e64575..1776dbd8dc98 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -63,22 +64,21 @@ static void btrfs_put_super(struct super_block *sb)
63} 64}
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
74static match_table_t tokens = { 74static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
80 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
81 {Opt_max_extent, "max_extent=%s"},
82 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
83 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -95,31 +95,6 @@ static match_table_t tokens = {
95 {Opt_err, NULL}, 95 {Opt_err, NULL},
96}; 96};
97 97
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
122
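The parser removed above duplicated memparse() from lib/cmdline.c, which already scales k/m/g suffixes (upper or lower case) onto the parsed value; the max_inline and alloc_start hunks below switch to it. A behaviour sketch with example inputs:

u64 val;

val = memparse("64k", NULL);   /* 64 << 10 */
val = memparse("8m", NULL);    /* 8 << 20, same result as the old parser */
val = memparse("1G", NULL);    /* 1 << 30 */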
123/* 98/*
124 * Regular mount options parser. Everything that is needed only when 99 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 100 * reading in a new superblock is parsed here.
@@ -128,7 +103,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
128{ 103{
129 struct btrfs_fs_info *info = root->fs_info; 104 struct btrfs_fs_info *info = root->fs_info;
130 substring_t args[MAX_OPT_ARGS]; 105 substring_t args[MAX_OPT_ARGS];
131 char *p, *num; 106 char *p, *num, *orig;
132 int intarg; 107 int intarg;
133 int ret = 0; 108 int ret = 0;
134 109
@@ -143,6 +118,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
143 if (!options) 118 if (!options)
144 return -ENOMEM; 119 return -ENOMEM;
145 120
121 orig = options;
146 122
147 while ((p = strsep(&options, ",")) != NULL) { 123 while ((p = strsep(&options, ",")) != NULL) {
148 int token; 124 int token;
@@ -156,6 +132,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
156 btrfs_set_opt(info->mount_opt, DEGRADED); 132 btrfs_set_opt(info->mount_opt, DEGRADED);
157 break; 133 break;
158 case Opt_subvol: 134 case Opt_subvol:
135 case Opt_subvolid:
159 case Opt_device: 136 case Opt_device:
160 /* 137 /*
161 * These are parsed by btrfs_parse_early_options 138 * These are parsed by btrfs_parse_early_options
@@ -210,22 +187,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
210 info->thread_pool_size); 187 info->thread_pool_size);
211 } 188 }
212 break; 189 break;
213 case Opt_max_extent:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->max_extent = btrfs_parse_size(num);
217 kfree(num);
218
219 info->max_extent = max_t(u64,
220 info->max_extent, root->sectorsize);
221 printk(KERN_INFO "btrfs: max_extent at %llu\n",
222 (unsigned long long)info->max_extent);
223 }
224 break;
225 case Opt_max_inline: 190 case Opt_max_inline:
226 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
227 if (num) { 192 if (num) {
228 info->max_inline = btrfs_parse_size(num); 193 info->max_inline = memparse(num, NULL);
229 kfree(num); 194 kfree(num);
230 195
231 if (info->max_inline) { 196 if (info->max_inline) {
@@ -240,7 +205,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
240 case Opt_alloc_start: 205 case Opt_alloc_start:
241 num = match_strdup(&args[0]); 206 num = match_strdup(&args[0]);
242 if (num) { 207 if (num) {
243 info->alloc_start = btrfs_parse_size(num); 208 info->alloc_start = memparse(num, NULL);
244 kfree(num); 209 kfree(num);
245 printk(KERN_INFO 210 printk(KERN_INFO
246 "btrfs: allocations start at %llu\n", 211 "btrfs: allocations start at %llu\n",
@@ -280,7 +245,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
280 } 245 }
281 } 246 }
282out: 247out:
283 kfree(options); 248 kfree(orig);
284 return ret; 249 return ret;
285} 250}
286 251
@@ -291,12 +256,13 @@ out:
291 * only when we need to allocate a new super block. 256 * only when we need to allocate a new super block.
292 */ 257 */
293static int btrfs_parse_early_options(const char *options, fmode_t flags, 258static int btrfs_parse_early_options(const char *options, fmode_t flags,
294 void *holder, char **subvol_name, 259 void *holder, char **subvol_name, u64 *subvol_objectid,
295 struct btrfs_fs_devices **fs_devices) 260 struct btrfs_fs_devices **fs_devices)
296{ 261{
297 substring_t args[MAX_OPT_ARGS]; 262 substring_t args[MAX_OPT_ARGS];
298 char *opts, *p; 263 char *opts, *p;
299 int error = 0; 264 int error = 0;
265 int intarg;
300 266
301 if (!options) 267 if (!options)
302 goto out; 268 goto out;
@@ -319,6 +285,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
319 case Opt_subvol: 285 case Opt_subvol:
320 *subvol_name = match_strdup(&args[0]); 286 *subvol_name = match_strdup(&args[0]);
321 break; 287 break;
288 case Opt_subvolid:
289 intarg = 0;
290 error = match_int(&args[0], &intarg);
291 if (!error) {
292 /* we want the original fs_tree */
293 if (!intarg)
294 *subvol_objectid =
295 BTRFS_FS_TREE_OBJECTID;
296 else
297 *subvol_objectid = intarg;
298 }
299 break;
322 case Opt_device: 300 case Opt_device:
323 error = btrfs_scan_one_device(match_strdup(&args[0]), 301 error = btrfs_scan_one_device(match_strdup(&args[0]),
324 flags, holder, fs_devices); 302 flags, holder, fs_devices);
@@ -346,6 +324,112 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
346 return error; 324 return error;
347} 325}
348 326
327static struct dentry *get_default_root(struct super_block *sb,
328 u64 subvol_objectid)
329{
330 struct btrfs_root *root = sb->s_fs_info;
331 struct btrfs_root *new_root;
332 struct btrfs_dir_item *di;
333 struct btrfs_path *path;
334 struct btrfs_key location;
335 struct inode *inode;
336 struct dentry *dentry;
337 u64 dir_id;
338 int new = 0;
339
340 /*
341 * We have a specific subvol we want to mount, just set up the location and
342 * go look up the root.
343 */
344 if (subvol_objectid) {
345 location.objectid = subvol_objectid;
346 location.type = BTRFS_ROOT_ITEM_KEY;
347 location.offset = (u64)-1;
348 goto find_root;
349 }
350
351 path = btrfs_alloc_path();
352 if (!path)
353 return ERR_PTR(-ENOMEM);
354 path->leave_spinning = 1;
355
356 /*
357 * Find the "default" dir item which points to the root item that we
358 * will mount by default if we haven't been given a specific subvolume
359 * to mount.
360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
365 if (!di) {
366 /*
367 * Ok the default dir item isn't there. This is weird since
368 * it's always been there, but don't freak out, just try to
369 * mount the root-most subvolume.
370 */
371 btrfs_free_path(path);
372 dir_id = BTRFS_FIRST_FREE_OBJECTID;
373 new_root = root->fs_info->fs_root;
374 goto setup_root;
375 }
376
377 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
378 btrfs_free_path(path);
379
380find_root:
381 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
382 if (IS_ERR(new_root))
383 return ERR_PTR(PTR_ERR(new_root));
384
385 if (btrfs_root_refs(&new_root->root_item) == 0)
386 return ERR_PTR(-ENOENT);
387
388 dir_id = btrfs_root_dirid(&new_root->root_item);
389setup_root:
390 location.objectid = dir_id;
391 location.type = BTRFS_INODE_ITEM_KEY;
392 location.offset = 0;
393
394 inode = btrfs_iget(sb, &location, new_root, &new);
395 if (IS_ERR(inode))
396 return ERR_CAST(inode);
397
398 /*
399 * If we're just mounting the root-most subvol, put the inode and return
400 * a reference to the dentry. We will have already gotten a reference
401 * to the inode in btrfs_fill_super so we're good to go.
402 */
403 if (!new && sb->s_root->d_inode == inode) {
404 iput(inode);
405 return dget(sb->s_root);
406 }
407
408 if (new) {
409 const struct qstr name = { .name = "/", .len = 1 };
410
411 /*
412 * New inode, we need to make the dentry a sibling of s_root so
413 * everything gets cleaned up properly on unmount.
414 */
415 dentry = d_alloc(sb->s_root, &name);
416 if (!dentry) {
417 iput(inode);
418 return ERR_PTR(-ENOMEM);
419 }
420 d_splice_alias(inode, dentry);
421 } else {
422 /*
423 * We found the inode in cache, just find a dentry for it and
424 * put the reference to the inode we just got.
425 */
426 dentry = d_find_alias(inode);
427 iput(inode);
428 }
429
430 return dentry;
431}
432
349static int btrfs_fill_super(struct super_block *sb, 433static int btrfs_fill_super(struct super_block *sb,
350 struct btrfs_fs_devices *fs_devices, 434 struct btrfs_fs_devices *fs_devices,
351 void *data, int silent) 435 void *data, int silent)
@@ -379,7 +463,7 @@ static int btrfs_fill_super(struct super_block *sb,
379 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 463 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
380 key.type = BTRFS_INODE_ITEM_KEY; 464 key.type = BTRFS_INODE_ITEM_KEY;
381 key.offset = 0; 465 key.offset = 0;
382 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 466 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
383 if (IS_ERR(inode)) { 467 if (IS_ERR(inode)) {
384 err = PTR_ERR(inode); 468 err = PTR_ERR(inode);
385 goto fail_close; 469 goto fail_close;
@@ -391,12 +475,6 @@ static int btrfs_fill_super(struct super_block *sb,
391 err = -ENOMEM; 475 err = -ENOMEM;
392 goto fail_close; 476 goto fail_close;
393 } 477 }
394#if 0
395 /* this does the super kobj at the same time */
396 err = btrfs_sysfs_add_super(tree_root->fs_info);
397 if (err)
398 goto fail_close;
399#endif
400 478
401 sb->s_root = root_dentry; 479 sb->s_root = root_dentry;
402 480
@@ -422,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
422 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
423 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
424 502
425 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
426 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
427 return ret; 505 return ret;
428} 506}
@@ -440,9 +518,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
440 seq_puts(seq, ",nodatacow"); 518 seq_puts(seq, ",nodatacow");
441 if (btrfs_test_opt(root, NOBARRIER)) 519 if (btrfs_test_opt(root, NOBARRIER))
442 seq_puts(seq, ",nobarrier"); 520 seq_puts(seq, ",nobarrier");
443 if (info->max_extent != (u64)-1)
444 seq_printf(seq, ",max_extent=%llu",
445 (unsigned long long)info->max_extent);
446 if (info->max_inline != 8192 * 1024) 521 if (info->max_inline != 8192 * 1024)
447 seq_printf(seq, ",max_inline=%llu", 522 seq_printf(seq, ",max_inline=%llu",
448 (unsigned long long)info->max_inline); 523 (unsigned long long)info->max_inline);
@@ -488,19 +563,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
488static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 563static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *data, struct vfsmount *mnt) 564 const char *dev_name, void *data, struct vfsmount *mnt)
490{ 565{
491 char *subvol_name = NULL;
492 struct block_device *bdev = NULL; 566 struct block_device *bdev = NULL;
493 struct super_block *s; 567 struct super_block *s;
494 struct dentry *root; 568 struct dentry *root;
495 struct btrfs_fs_devices *fs_devices = NULL; 569 struct btrfs_fs_devices *fs_devices = NULL;
496 fmode_t mode = FMODE_READ; 570 fmode_t mode = FMODE_READ;
571 char *subvol_name = NULL;
572 u64 subvol_objectid = 0;
497 int error = 0; 573 int error = 0;
574 int found = 0;
498 575
499 if (!(flags & MS_RDONLY)) 576 if (!(flags & MS_RDONLY))
500 mode |= FMODE_WRITE; 577 mode |= FMODE_WRITE;
501 578
502 error = btrfs_parse_early_options(data, mode, fs_type, 579 error = btrfs_parse_early_options(data, mode, fs_type,
503 &subvol_name, &fs_devices); 580 &subvol_name, &subvol_objectid,
581 &fs_devices);
504 if (error) 582 if (error)
505 return error; 583 return error;
506 584
@@ -529,6 +607,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
529 goto error_close_devices; 607 goto error_close_devices;
530 } 608 }
531 609
610 found = 1;
532 btrfs_close_devices(fs_devices); 611 btrfs_close_devices(fs_devices);
533 } else { 612 } else {
534 char b[BDEVNAME_SIZE]; 613 char b[BDEVNAME_SIZE];
@@ -546,25 +625,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
546 s->s_flags |= MS_ACTIVE; 625 s->s_flags |= MS_ACTIVE;
547 } 626 }
548 627
549 if (!strcmp(subvol_name, ".")) 628 root = get_default_root(s, subvol_objectid);
550 root = dget(s->s_root); 629 if (IS_ERR(root)) {
551 else { 630 error = PTR_ERR(root);
552 mutex_lock(&s->s_root->d_inode->i_mutex); 631 deactivate_locked_super(s);
553 root = lookup_one_len(subvol_name, s->s_root, 632 goto error;
633 }
634 /* if they gave us a subvolume name bind mount into that */
635 if (strcmp(subvol_name, ".")) {
636 struct dentry *new_root;
637 mutex_lock(&root->d_inode->i_mutex);
638 new_root = lookup_one_len(subvol_name, root,
554 strlen(subvol_name)); 639 strlen(subvol_name));
555 mutex_unlock(&s->s_root->d_inode->i_mutex); 640 mutex_unlock(&root->d_inode->i_mutex);
556 641
557 if (IS_ERR(root)) { 642 if (IS_ERR(new_root)) {
558 deactivate_locked_super(s); 643 deactivate_locked_super(s);
559 error = PTR_ERR(root); 644 error = PTR_ERR(new_root);
560 goto error_free_subvol_name; 645 dput(root);
646 goto error_close_devices;
561 } 647 }
562 if (!root->d_inode) { 648 if (!new_root->d_inode) {
563 dput(root); 649 dput(root);
650 dput(new_root);
564 deactivate_locked_super(s); 651 deactivate_locked_super(s);
565 error = -ENXIO; 652 error = -ENXIO;
566 goto error_free_subvol_name; 653 goto error_close_devices;
567 } 654 }
655 dput(root);
656 root = new_root;
568 } 657 }
569 658
570 mnt->mnt_sb = s; 659 mnt->mnt_sb = s;
@@ -579,6 +668,7 @@ error_close_devices:
579 btrfs_close_devices(fs_devices); 668 btrfs_close_devices(fs_devices);
580error_free_subvol_name: 669error_free_subvol_name:
581 kfree(subvol_name); 670 kfree(subvol_name);
671error:
582 return error; 672 return error;
583} 673}
584 674
@@ -606,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
606 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
607 return -EINVAL; 697 return -EINVAL;
608 698
609 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
610 ret = btrfs_recover_relocation(root);
611 WARN_ON(ret); 700 WARN_ON(ret);
612 701
613 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
614 WARN_ON(ret); 704 WARN_ON(ret);
615 705
616 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -623,13 +713,20 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
623{ 713{
624 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 714 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
625 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 715 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
716 struct list_head *head = &root->fs_info->space_info;
717 struct btrfs_space_info *found;
718 u64 total_used = 0;
626 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
627 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
628 721
722 rcu_read_lock();
723 list_for_each_entry_rcu(found, head, list)
724 total_used += found->disk_used;
725 rcu_read_unlock();
726
629 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
630 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
631 buf->f_bfree = buf->f_blocks - 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
632 (btrfs_super_bytes_used(disk_super) >> bits);
633 buf->f_bavail = buf->f_bfree; 730 buf->f_bavail = buf->f_bfree;
634 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
635 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
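The statfs rework above stops trusting the superblock's bytes_used and instead sums the per-space_info disk_used counters. A condensed sketch of the locking pattern, where fs_info stands in for root->fs_info: readers walk the list under rcu_read_lock() and never contend with writers, which only ever list_add_rcu() new space_info entries.

struct btrfs_space_info *found;
u64 total_used = 0;

rcu_read_lock();    /* lockless read-side traversal */
list_for_each_entry_rcu(found, &fs_info->space_info, list)
	total_used += found->disk_used;
rcu_read_unlock();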
@@ -700,7 +797,7 @@ static int btrfs_unfreeze(struct super_block *sb)
700 797
701static const struct super_operations btrfs_super_ops = { 798static const struct super_operations btrfs_super_ops = {
702 .drop_inode = btrfs_drop_inode, 799 .drop_inode = btrfs_drop_inode,
703 .delete_inode = btrfs_delete_inode, 800 .evict_inode = btrfs_evict_inode,
704 .put_super = btrfs_put_super, 801 .put_super = btrfs_put_super,
705 .sync_fs = btrfs_sync_fs, 802 .sync_fs = btrfs_sync_fs,
706 .show_options = btrfs_show_options, 803 .show_options = btrfs_show_options,
@@ -721,11 +818,14 @@ static const struct file_operations btrfs_ctl_fops = {
721}; 818};
722 819
723static struct miscdevice btrfs_misc = { 820static struct miscdevice btrfs_misc = {
724 .minor = MISC_DYNAMIC_MINOR, 821 .minor = BTRFS_MINOR,
725 .name = "btrfs-control", 822 .name = "btrfs-control",
726 .fops = &btrfs_ctl_fops 823 .fops = &btrfs_ctl_fops
727}; 824};
728 825
826MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
827MODULE_ALIAS("devname:btrfs-control");
828
729static int btrfs_interface_init(void) 829static int btrfs_interface_init(void)
730{ 830{
731 return misc_register(&btrfs_misc); 831 return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -69,7 +70,7 @@ static noinline int join_transaction(struct btrfs_root *root)
69 cur_trans->commit_done = 0; 70 cur_trans->commit_done = 0;
70 cur_trans->start_time = get_seconds(); 71 cur_trans->start_time = get_seconds();
71 72
72 cur_trans->delayed_refs.root.rb_node = NULL; 73 cur_trans->delayed_refs.root = RB_ROOT;
73 cur_trans->delayed_refs.num_entries = 0; 74 cur_trans->delayed_refs.num_entries = 0;
74 cur_trans->delayed_refs.num_heads_ready = 0; 75 cur_trans->delayed_refs.num_heads_ready = 0;
75 cur_trans->delayed_refs.num_heads = 0; 76 cur_trans->delayed_refs.num_heads = 0;
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -169,54 +165,89 @@ enum btrfs_trans_type {
169 TRANS_USERSPACE, 165 TRANS_USERSPACE,
170}; 166};
171 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
173 int num_blocks, int type) 178 u64 num_items, int type)
174{ 179{
175 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
177 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
178 188
179 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
180 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE))
183 wait_current_trans(root); 191 wait_current_trans(root);
192
184 ret = join_transaction(root); 193 ret = join_transaction(root);
185 BUG_ON(ret); 194 BUG_ON(ret);
186 195
187 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
188 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
189 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
190 h->blocks_used = 0; 202 h->blocks_used = 0;
191 h->block_group = 0; 203 h->block_group = 0;
192 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
195 207
196 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
197 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
198 226
199 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
200 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
201 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
202 return h; 233 return h;
203} 234}
204 235
205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
206 int num_blocks) 237 int num_items)
207{ 238{
208 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
209} 240}
210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
211 int num_blocks) 242 int num_blocks)
212{ 243{
213 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
214} 245}
215 246
216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
217 int num_blocks) 248 int num_blocks)
218{ 249{
219 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
220} 251}
221 252
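After this rework the second argument of btrfs_start_transaction() is no longer a block count but the number of tree items the caller intends to touch; start_transaction() turns that into a metadata byte reservation and can now fail with ERR_PTR. A hedged caller sketch for a two-item update:

struct btrfs_trans_handle *trans;

trans = btrfs_start_transaction(root, 2);   /* reserve room for 2 items */
if (IS_ERR(trans))
	return PTR_ERR(trans);
/* ... modify at most the two reserved items ... */
return btrfs_end_transaction(trans, root);  /* unused reservation is released */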
222/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -290,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
290 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
291} 322}
292 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
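btrfs_should_end_transaction() above is aimed at long-running operations: flush the handle's accumulated delayed refs, then ask whether the global block reservation is close to exhausted. A sketch of the intended call pattern; the loop condition and work helper are hypothetical:

while (have_more_work) {                    /* hypothetical condition */
	do_one_batch(trans, root);          /* hypothetical work unit */
	if (btrfs_should_end_transaction(trans, root)) {
		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1);
	}
}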
293static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
294 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
295{ 352{
296 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
297 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
298 int count = 0; 355 int count = 0;
299 356
@@ -317,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 count++; 374 count++;
318 } 375 }
319 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
320 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
321 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
322 WARN_ON(cur_trans != trans->transaction);
323 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
324 cur_trans->num_writers--; 393 cur_trans->num_writers--;
325 394
@@ -607,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
607 676
608 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
609 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
610 680
611 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
612 switch_commit_root(root); 682 switch_commit_root(root);
@@ -631,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
631int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
632{ 702{
633 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
634 int ret;
635 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
636 unsigned long nr; 706 unsigned long nr;
637 707
638 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
639 if (root->defrag_running)
640 return 0; 709 return 0;
641 trans = btrfs_start_transaction(root, 1); 710
642 while (1) { 711 while (1) {
643 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
644 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
645 nr = trans->blocks_used; 718 nr = trans->blocks_used;
646 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
647 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
648 cond_resched(); 721 cond_resched();
649 722
650 trans = btrfs_start_transaction(root, 1);
651 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
652 break; 724 break;
653 } 725 }
654 root->defrag_running = 0; 726 root->defrag_running = 0;
655 smp_mb(); 727 return ret;
656 btrfs_end_transaction(trans, root);
657 return 0;
658} 728}
659 729
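The rewrite of btrfs_defrag_root() above also replaces a racy smp_mb()-plus-flag check with xchg(), an atomic test-and-set: the first caller swaps a 1 into defrag_running and proceeds, any concurrent caller reads back the 1 and bails. The idiom in isolation:

if (xchg(&root->defrag_running, 1))   /* atomically set, return old value */
	return 0;                     /* someone else already owns the loop */
/* ... single-threaded defrag work ... */
root->defrag_running = 0;             /* plain store releases the flag */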
660#if 0 730#if 0
@@ -760,29 +830,72 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 830 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 831 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
833 struct btrfs_root *parent_root;
834 struct inode *parent_inode;
835 struct dentry *dentry;
763 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
764 struct extent_buffer *old; 837 struct extent_buffer *old;
765 int ret; 838 int ret;
839 int retries = 0;
840 u64 to_reserve = 0;
841 u64 index = 0;
766 u64 objectid; 842 u64 objectid;
767 843
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 845 if (!new_root_item) {
770 ret = -ENOMEM; 846 pending->error = -ENOMEM;
771 goto fail; 847 goto fail;
772 } 848 }
849
773 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
774 if (ret) 851 if (ret) {
852 pending->error = ret;
775 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
867
868 key.objectid = objectid;
869 key.offset = (u64)-1;
870 key.type = BTRFS_ROOT_ITEM_KEY;
871
872 trans->block_rsv = &pending->block_rsv;
873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
877 record_root_in_trans(trans, parent_root);
878
879 /*
880 * insert the directory item
881 */
882 ret = btrfs_set_inode_index(parent_inode, &index);
883 BUG_ON(ret);
884 ret = btrfs_insert_dir_item(trans, parent_root,
885 dentry->d_name.name, dentry->d_name.len,
886 parent_inode->i_ino, &key,
887 BTRFS_FT_DIR, index);
888 BUG_ON(ret);
889
890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
893 BUG_ON(ret);
776 894
777 record_root_in_trans(trans, root); 895 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 896 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 897 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780 898
781 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785
786 old = btrfs_lock_root_node(root); 899 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old); 900 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old); 901 btrfs_set_lock_blocking(old);
@@ -792,62 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
792 free_extent_buffer(old); 905 free_extent_buffer(old);
793 906
794 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
796 new_root_item); 909 key.offset = trans->transid;
910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
797 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
799 if (ret) 913 BUG_ON(ret);
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key));
804fail:
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 914
823 /* 915 /*
824 * insert the directory item 916 * insert root back/forward references
825 */ 917 */
826 namelen = strlen(pending->name); 918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
827 ret = btrfs_set_inode_index(parent_inode, &index);
828 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen,
830 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index);
832
833 if (ret)
834 goto fail;
835
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret);
839
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid,
842 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
844 namelen); 921 dentry->d_name.name, dentry->d_name.len);
845
846 BUG_ON(ret); 922 BUG_ON(ret);
847 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
848fail: 930fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 931 kfree(new_root_item);
850 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
851} 934}
852 935
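Note the changed error model in create_pending_snapshot() above: failures are parked in pending->error and the commit itself keeps going. The ioctl caller is assumed to collect the result after the commit, roughly as below (a hedged sketch, not the exact ioctl.c code):

ret = btrfs_commit_transaction(trans, root);  /* runs create_pending_snapshot() */
if (!ret)
	ret = pending->error;   /* per-snapshot failure surfaces only here */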
853/* 936/*
@@ -867,25 +950,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 950 return 0;
868} 951}
869 952
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 953static void update_super_roots(struct btrfs_root *root)
890{ 954{
891 struct btrfs_root_item *root_item; 955 struct btrfs_root_item *root_item;
@@ -914,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
914 return ret; 978 return ret;
915} 979}
916 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
917int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
918 struct btrfs_root *root) 992 struct btrfs_root *root)
919{ 993{
@@ -935,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
935 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
936 BUG_ON(ret); 1010 BUG_ON(ret);
937 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
938 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
939 /* 1015 /*
940 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -987,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
987 snap_pending = 1; 1063 snap_pending = 1;
988 1064
989 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
990 prepare_to_wait(&cur_trans->writer_wait, &wait,
991 TASK_UNINTERRUPTIBLE);
992
993 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
994 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
995 else if (should_grow) 1068 else if (should_grow)
@@ -997,13 +1070,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 1070
998 mutex_unlock(&root->fs_info->trans_mutex); 1071 mutex_unlock(&root->fs_info->trans_mutex);
999 1072
1000 if (flush_on_commit) { 1073 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 1074 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 1075 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 1076 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 1077 }
1008 1078
1009 /* 1079 /*
@@ -1015,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1015 */ 1085 */
1016 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
1017 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
1018 smp_mb(); 1091 smp_mb();
1019 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
1020 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1100,9 +1173,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1100 1173
1101 btrfs_finish_extent_commit(trans, root); 1174 btrfs_finish_extent_commit(trans, root);
1102 1175
1103 /* do the directory inserts of any pending snapshot creations */
1104 finish_pending_snapshots(trans, root->fs_info);
1105
1106 mutex_lock(&root->fs_info->trans_mutex); 1176 mutex_lock(&root->fs_info->trans_mutex);
1107 1177
1108 cur_trans->commit_done = 1; 1178 cur_trans->commit_done = 1;
@@ -1145,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1145 1215
1146 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1147 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1148 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1149 else 1219 else
1150 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1151 } 1221 }
1152 return 0; 1222 return 0;
1153} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -134,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root) 135 struct btrfs_root *root)
135{ 136{
136 int ret; 137 int ret;
138 int err = 0;
137 139
138 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 141 if (root->log_root) {
@@ -154,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
154 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
155 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
157 BUG_ON(ret); 159 if (ret)
160 err = ret;
158 } 161 }
159 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
160 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
161 BUG_ON(ret); 164 if (ret)
165 err = ret;
162 } 166 }
163 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
164 root->log_batch++; 168 root->log_batch++;
165 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
166 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
167 return 0; 171 return err;
168} 172}
169 173
170/* 174/*
@@ -375,7 +379,7 @@ insert:
375 BUG_ON(ret); 379 BUG_ON(ret);
376 } 380 }
377 } else if (ret) { 381 } else if (ret) {
378 BUG(); 382 return ret;
379 } 383 }
380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
381 path->slots[0]); 385 path->slots[0]);
@@ -445,7 +449,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 449 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 450 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 451 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 452 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 453 if (IS_ERR(inode)) {
450 inode = NULL; 454 inode = NULL;
451 } else if (is_bad_inode(inode)) { 455 } else if (is_bad_inode(inode)) {
@@ -1698,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1702
1699 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1700 1704
1701 wc->process_func(root, next, wc, ptr_gen);
1702
1703 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1704 path->slots[*level]++; 1708 path->slots[*level]++;
1705 if (wc->free) { 1709 if (wc->free) {
1706 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1733,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1733 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1734 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1735 1739
1736 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1737 parent = path->nodes[*level];
1738 else
1739 parent = path->nodes[*level + 1];
1740
1741 bytenr = path->nodes[*level]->start;
1742
1743 blocksize = btrfs_level_size(root, *level);
1744 root_owner = btrfs_header_owner(parent);
1745 root_gen = btrfs_header_generation(parent);
1746
1747 wc->process_func(root, path->nodes[*level], wc,
1748 btrfs_header_generation(path->nodes[*level]));
1749
1750 if (wc->free) {
1751 next = path->nodes[*level];
1752 btrfs_tree_lock(next);
1753 clean_tree_block(trans, root, next);
1754 btrfs_set_lock_blocking(next);
1755 btrfs_wait_tree_block_writeback(next);
1756 btrfs_tree_unlock(next);
1757
1758 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1759 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1760 BUG_ON(ret);
1761 }
1762 free_extent_buffer(path->nodes[*level]);
1763 path->nodes[*level] = NULL;
1764 *level += 1;
1765 1741
1766 cond_resched(); 1742 cond_resched();
1767 return 0; 1743 return 0;
@@ -1780,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1780 1756
1781 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1782 slot = path->slots[i]; 1758 slot = path->slots[i];
1783 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1784 struct extent_buffer *node; 1760 struct extent_buffer *node;
1785 node = path->nodes[i]; 1761 node = path->nodes[i];
1786 path->slots[i]++; 1762 path->slots[i]++;
@@ -2046,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2046 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2047 2023
2048 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2049 BUG_ON(ret);
2050 2025
2051 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2052 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2055,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2056 } 2031 }
2057 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2058 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2059 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2128,15 +2112,10 @@ out:
2128 return 0; 2112 return 0;
2129} 2113}
2130 2114
2131/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2132 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2133 * at commit time of the full transaction
2134 */
2135int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2136{ 2117{
2137 int ret; 2118 int ret;
2138 struct btrfs_root *log;
2139 struct key;
2140 u64 start; 2119 u64 start;
2141 u64 end; 2120 u64 end;
2142 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2144,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2144 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2145 }; 2124 };
2146 2125
2147 if (!root->log_root || root->fs_info->log_root_recovering)
2148 return 0;
2149
2150 log = root->log_root;
2151 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2152 BUG_ON(ret); 2127 BUG_ON(ret);
2153 2128
@@ -2161,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2162 } 2137 }
2163 2138
2164 if (log->log_transid > 0) {
2165 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2166 &log->root_key);
2167 BUG_ON(ret);
2168 }
2169 root->log_root = NULL;
2170 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2171 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2172 return 0; 2163 return 0;
2173} 2164}
2174 2165
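The refactor above funnels both tear-down paths through one helper: free_log_tree() walks the tree and releases its extents, while btrfs_free_log() and the new btrfs_free_log_root_tree() differ only in which pointer they guard and then clear. The guard-then-clear idiom, reduced to its generic form (stub names, not kernel code):

static void release_tree_stub(void *tree)
{
	/* ...walk and free, as free_log_tree() does... */
	(void)tree;
}

static int free_once(void **slot)
{
	if (*slot) {
		release_tree_stub(*slot);
		*slot = NULL;	/* a repeat call in the same commit is a no-op */
	}
	return 0;
}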
@@ -2202,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2202 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2203 struct btrfs_path *path; 2194 struct btrfs_path *path;
2204 int ret; 2195 int ret;
2196 int err = 0;
2205 int bytes_del = 0; 2197 int bytes_del = 0;
2206 2198
2207 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2217,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2217 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2218 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2219 name, name_len, -1); 2211 name, name_len, -1);
2220 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2221 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2222 bytes_del += name_len; 2218 bytes_del += name_len;
2223 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2225,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2225 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2226 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2227 index, name, name_len, -1); 2223 index, name, name_len, -1);
2228 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2229 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2230 bytes_del += name_len; 2230 bytes_del += name_len;
2231 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2243,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2243 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2244 2244
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2246 if (ret == 0) { 2250 if (ret == 0) {
2247 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2248 u64 i_size; 2252 u64 i_size;
@@ -2260,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2260 ret = 0; 2264 ret = 0;
2261 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2262 } 2266 }
2263 2267fail:
2264 btrfs_free_path(path); 2268 btrfs_free_path(path);
2265 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2266 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2267 2275
2268 return 0; 2276 return err;
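The old test "if (di && !IS_ERR(di))" above silently dropped lookup failures; the rewritten code separates a failed lookup from an absent entry and propagates the former through err. A userspace model of the three-way ERR_PTR convention relied on here (the real macros live in <linux/err.h>):

#include <errno.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

/* A lookup returns a valid pointer (found), NULL (absent), or
 * ERR_PTR(-errno) (failure); all three must be told apart. */
static long classify_lookup(void *di)
{
	if (IS_ERR(di))
		return PTR_ERR(di);	/* hard failure: propagate upward */
	if (di)
		return 1;		/* found: go delete the dir name */
	return 0;			/* absent: nothing logged, nothing to do */
}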
@@ -2290,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2290 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2291 dirid, &index); 2299 dirid, &index);
2292 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2293 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2294 2306
2295 return ret; 2307 return ret;
@@ -2317,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2317 else 2329 else
2318 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2319 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2320 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2321 2334
2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2323 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2342,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2342 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2343 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2344 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2345 int ret; 2359 int ret;
2346 int i; 2360 int i;
2347 int nritems; 2361 int nritems;
@@ -2404,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2404 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2405 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2406 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2407 } 2425 }
2408 } 2426 }
2409 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2431,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2431 goto done; 2449 goto done;
2432 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2433 &min_key); 2451 &min_key);
2434 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2435 } 2456 }
2436 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2437 2458
@@ -2453,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2453 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2454 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2455 &tmp); 2476 &tmp);
2456 2477 if (ret)
2457 BUG_ON(ret); 2478 err = ret;
2458 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2459 goto done; 2481 goto done;
2460 } 2482 }
2461 } 2483 }
2462done: 2484done:
2463 *last_offset_ret = last_offset;
2464 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2465 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2466 2487
2467 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2468 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2469 first_offset, last_offset); 2490 /*
2470 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2471 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2472} 2501}
2473 2502
2474/* 2503/*
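The reworked tail of log_dir_items() records the dir-log range key only when every item copy succeeded; presumably a range marker written after a failed copy would let log replay trust a directory range that was never fully logged. The ordering constraint, sketched with invented names:

struct dir_range { unsigned long long first, last; int valid; };

static int finish_dir_range(struct dir_range *r, int copy_err,
			    unsigned long long first,
			    unsigned long long last)
{
	if (copy_err)
		return copy_err;	/* leave no marker: replay must not trust the range */
	r->first = first;
	r->last = last;
	r->valid = 1;			/* replay may now rely on [first, last] */
	return 0;
}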
@@ -2500,7 +2529,8 @@ again:
2500 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2501 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2502 &max_key); 2531 &max_key);
2503 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2504 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2505 break; 2535 break;
2506 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2534,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2534 2564
2535 while (1) { 2565 while (1) {
2536 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2537 2567 BUG_ON(ret == 0);
2538 if (ret != 1) 2568 if (ret < 0)
2539 break; 2569 break;
2540 2570
2541 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2553,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2553 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2554 } 2584 }
2555 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2556 return 0; 2586 return ret;
2557} 2587}
2558 2588
2559static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
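drop_objectid_items() can assert BUG_ON(ret == 0) above because it searches with key.offset = (u64)-1, a key that is never actually inserted, so btrfs_search_slot() must return 1 (slot positioned past the insertion point) or a negative error. A userspace caricature of that return convention over a sorted array:

/* 0: exact match; 1: key absent, *slot is the insertion point; <0: error. */
static int search_slot_model(const int *sorted, int n, int key, int *slot)
{
	int i;

	if (n < 0)
		return -22;	/* stands in for a real I/O error (-EINVAL) */
	for (i = 0; i < n && sorted[i] < key; i++)
		;
	*slot = i;
	if (i < n && sorted[i] == key)
		return 0;
	return 1;
}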
@@ -2586,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2586 } 2616 }
2587 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2588 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2589 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2590 2623
2591 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2592 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2659,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2659 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2660 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2661 */ 2694 */
2695 ret = 0;
2662 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2664 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2665 list); 2699 list);
2666 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2667 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2668 list_del(&sums->list); 2702 list_del(&sums->list);
2669 kfree(sums); 2703 kfree(sums);
2670 } 2704 }
2671 return 0; 2705 return ret;
2672} 2706}
2673 2707
2674/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
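The checksum loop above keeps draining ordered_sums after the first failure: once ret is set, further btrfs_csum_file_blocks() calls are skipped, but every entry is still unlinked and freed so nothing leaks. A freestanding model of that drain-on-error pattern:

#include <stdlib.h>

struct sum_entry { struct sum_entry *next; };

static int drain_sums(struct sum_entry *head,
		      int (*insert)(struct sum_entry *))
{
	int ret = 0;

	while (head) {
		struct sum_entry *cur = head;

		head = cur->next;
		if (!ret)
			ret = insert(cur);	/* the first error wins */
		free(cur);			/* freed on success and failure alike */
	}
	return ret;
}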
@@ -2696,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2696 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2697 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2698 u32 size; 2732 u32 size;
2733 int err = 0;
2699 int ret; 2734 int ret;
2700 int nritems; 2735 int nritems;
2701 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2738,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2738 } else { 2773 } else {
2739 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2740 } 2775 }
2741 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2742 path->keep_locks = 1; 2780 path->keep_locks = 1;
2743 2781
2744 while (1) { 2782 while (1) {
@@ -2767,7 +2805,10 @@ again:
2767 2805
2768 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2769 ins_nr, inode_only); 2807 ins_nr, inode_only);
2770 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2771 ins_nr = 1; 2812 ins_nr = 1;
2772 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2773next_slot: 2814next_slot:
@@ -2783,7 +2824,10 @@ next_slot:
2783 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2784 ins_start_slot, 2825 ins_start_slot,
2785 ins_nr, inode_only); 2826 ins_nr, inode_only);
2786 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2787 ins_nr = 0; 2831 ins_nr = 0;
2788 } 2832 }
2789 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2801,7 +2845,10 @@ next_slot:
2801 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2802 ins_start_slot, 2846 ins_start_slot,
2803 ins_nr, inode_only); 2847 ins_nr, inode_only);
2804 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2805 ins_nr = 0; 2852 ins_nr = 0;
2806 } 2853 }
2807 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2809,14 +2856,18 @@ next_slot:
2809 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2810 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2811 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2812 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2813 } 2863 }
2814 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2815 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2816 2867
2817 btrfs_free_path(path); 2868 btrfs_free_path(path);
2818 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2819 return 0; 2870 return err;
2820} 2871}
2821 2872
2822/* 2873/*
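The new out_unlock label gives btrfs_log_inode() a single exit where the inode's log_mutex is dropped no matter which copy step failed, with err carrying the verdict out of the function. The same shape, with pthreads standing in for the kernel mutex:

#include <pthread.h>

static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;

static int log_inode_model(int (*truncate_step)(void), int (*copy_step)(void))
{
	int err = 0;

	pthread_mutex_lock(&log_mutex);
	err = truncate_step();
	if (err)
		goto out_unlock;
	err = copy_step();
out_unlock:
	pthread_mutex_unlock(&log_mutex);	/* single unlock site */
	return err;
}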
@@ -2941,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2941 goto end_no_trans; 2992 goto end_no_trans;
2942 } 2993 }
2943 2994
2944 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2945 2998
2946 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2947 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2948 3002
2949 /* 3003 /*
2950 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2954,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2954 */ 3008 */
2955 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2956 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2957 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2958 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2959 3015
2960 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2961 while (1) { 3017 while (1) {
@@ -2969,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2969 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2970 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2971 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2972 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2973 } 3030 }
2974 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2975 break; 3032 break;
2976 3033
2977 parent = parent->d_parent; 3034 parent = parent->d_parent;
2978 } 3035 }
2979no_parent:
2980 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2981 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2982end_no_trans: 3044end_no_trans:
2983 return ret; 3045 return ret;
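With the end_trans block above, btrfs_log_inode_parent() folds every internal failure (asserted to be -ENOSPC) into a return value of 1 after flagging the transaction for a full commit; the fsync path treats any nonzero return as "skip the log, commit the whole transaction". A model of that dispatch, with invented function-pointer stand-ins:

static int fsync_dispatch(int log_ret,
			  int (*sync_log)(void),
			  int (*commit_transaction)(void))
{
	if (log_ret == 0)
		return sync_log();		/* fast path: write only the log tree */
	return commit_transaction();		/* fall back to a full commit */
}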
@@ -3019,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3019 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3020 BUG_ON(!path); 3082 BUG_ON(!path);
3021 3083
3022 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3023 3085
3024 wc.trans = trans; 3086 wc.trans = trans;
3025 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..dd318ff280b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
@@ -256,13 +257,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 257 wake_up(&fs_info->async_submit_wait);
257 258
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 260
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (cur->bi_rw & REQ_SYNC)
264 num_sync_run++; 262 num_sync_run++;
265 263
264 submit_bio(cur->bi_rw, cur);
265 num_run++;
266 batch_run++;
266 if (need_resched()) { 267 if (need_resched()) {
267 if (num_sync_run) { 268 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 269 blk_run_backing_dev(bdi, NULL);
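Besides replacing bio_rw_flagged(cur, BIO_RW_SYNCIO) with the unified REQ_SYNC flag, the hunk above hoists the test in front of submit_bio(): once submitted, the bio belongs to the block layer and may complete (and be freed) immediately, so its flags have to be sampled first. Generic form of that ordering fix (stand-in types, not the block-layer API):

struct io_req { unsigned long flags; };
#define MODEL_REQ_SYNC	(1UL << 0)	/* stands in for the kernel's REQ_SYNC */

static unsigned long num_sync_run;

static void hand_off(struct io_req *req)
{
	/* ownership transfers here; the other side may free req at once */
	(void)req;
}

static void submit_model(struct io_req *req)
{
	unsigned long is_sync = req->flags & MODEL_REQ_SYNC;	/* sample first */

	num_sync_run += !!is_sync;
	hand_off(req);		/* req must not be dereferenced after this */
}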
@@ -325,16 +326,6 @@ loop_lock:
325 num_sync_run = 0; 326 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 327 blk_run_backing_dev(bdi, NULL);
327 } 328 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 329 /*
339 * IO has already been through a long path to get here. Checksumming, 330 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 331 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +337,16 @@ loop_lock:
346 * cared about found its way down here. 337 * cared about found its way down here.
347 */ 338 */
348 blk_run_backing_dev(bdi, NULL); 339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched();
342 if (again)
343 goto loop;
344
345 spin_lock(&device->io_lock);
346 if (device->pending_bios.head || device->pending_sync_bios.head)
347 goto loop_lock;
348 spin_unlock(&device->io_lock);
349
349done: 350done:
350 return 0; 351 return 0;
351} 352}
@@ -365,6 +366,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 366 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 367 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 368 u64 found_transid = btrfs_super_generation(disk_super);
369 char *name;
368 370
369 fs_devices = find_fsid(disk_super->fsid); 371 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 372 if (!fs_devices) {
@@ -411,6 +413,12 @@ static noinline int device_list_add(const char *path,
411 413
412 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS);
418 if (!name)
419 return -ENOMEM;
420 kfree(device->name);
421 device->name = name;
414 } 422 }
415 423
416 if (found_transid > fs_devices->latest_trans) { 424 if (found_transid > fs_devices->latest_trans) {
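The new strcmp() branch above covers a device that reappears under a different path: the replacement name is allocated before the old one is freed, so an allocation failure leaves the previous name intact. Userspace equivalent, with strdup() standing in for kstrdup(path, GFP_NOFS):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

static int update_device_name(char **namep, const char *path)
{
	char *name;

	if (!strcmp(*namep, path))
		return 0;		/* same path, nothing to do */
	name = strdup(path);
	if (!name)
		return -ENOMEM;		/* old name survives the failure */
	free(*namep);
	*namep = name;
	return 0;
}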
@@ -592,7 +600,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 600 goto error_close;
593 601
594 disk_super = (struct btrfs_super_block *)bh->b_data; 602 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 603 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 604 if (devid != device->devid)
597 goto error_brelse; 605 goto error_brelse;
598 606
@@ -694,7 +702,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 702 goto error_close;
695 } 703 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 704 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 705 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 706 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 707 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 708 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1089,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1089 if (!path) 1097 if (!path)
1090 return -ENOMEM; 1098 return -ENOMEM;
1091 1099
1092 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1093 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1094 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1095 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1187,7 +1195,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1195 goto error_close;
1188 } 1196 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1197 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1198 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1199 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1200 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1201 disk_super->fsid);
@@ -1478,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1478 goto error; 1486 goto error;
1479 } 1487 }
1480 1488
1481 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1482 lock_chunks(root); 1490 lock_chunks(root);
1483 1491
1484 device->barriers = 1; 1492 device->barriers = 1;
@@ -1743,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1743 1751
1744 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1745 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1746 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1747 1756
1748 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1749 BUG_ON(!trans); 1758 BUG_ON(!trans);
1750 1759
1751 lock_chunks(root); 1760 lock_chunks(root);
@@ -1917,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1917 break; 1926 break;
1918 BUG_ON(ret); 1927 BUG_ON(ret);
1919 1928
1920 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1921 BUG_ON(!trans); 1930 BUG_ON(!trans);
1922 1931
1923 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2086,11 +2095,7 @@ again:
2086 } 2095 }
2087 2096
2088 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2089 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2090 if (!trans) {
2091 ret = -ENOMEM;
2092 goto done;
2093 }
2094 lock_chunks(root); 2099 lock_chunks(root);
2095 2100
2096 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
@@ -2191,9 +2196,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2191 min_stripes = 2; 2196 min_stripes = 2;
2192 } 2197 }
2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2199 if (fs_devices->rw_devices < 2)
2195 if (num_stripes < 2)
2196 return -ENOSPC; 2200 return -ENOSPC;
2201 num_stripes = 2;
2197 min_stripes = 2; 2202 min_stripes = 2;
2198 } 2203 }
2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2237,8 +2242,16 @@ again:
2237 do_div(calc_size, stripe_len); 2242 do_div(calc_size, stripe_len);
2238 calc_size *= stripe_len; 2243 calc_size *= stripe_len;
2239 } 2244 }
2245
2240 /* we don't want tiny stripes */ 2246 /* we don't want tiny stripes */
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2247 if (!looped)
2248 calc_size = max_t(u64, min_stripe_size, calc_size);
2249
2250 /*
2251 * we're about to do_div by the stripe_len so lets make sure
2252 * we end up with something bigger than a stripe
2253 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4);
2242 2255
2243 do_div(calc_size, stripe_len); 2256 do_div(calc_size, stripe_len);
2244 calc_size *= stripe_len; 2257 calc_size *= stripe_len;
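The do_div()/multiply pairs above round calc_size down to a whole number of stripe_len units, and the new clamp to stripe_len * 4 guarantees the rounding cannot leave a zero-sized (or sub-stripe) chunk. The same arithmetic in plain C, assuming a nonzero stripe_len (do_div() is just the kernel's 64-bit division helper):

#include <stdint.h>

static uint64_t round_to_stripe_len(uint64_t calc_size, uint64_t stripe_len)
{
	if (calc_size < stripe_len * 4)
		calc_size = stripe_len * 4;	/* keep several stripes' worth */
	return calc_size - calc_size % stripe_len;	/* round down to a multiple */
}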
@@ -2638,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2638 int max_errors = 0; 2651 int max_errors = 0;
2639 struct btrfs_multi_bio *multi = NULL; 2652 struct btrfs_multi_bio *multi = NULL;
2640 2653
2641 if (multi_ret && !(rw & (1 << BIO_RW))) 2654 if (multi_ret && !(rw & REQ_WRITE))
2642 stripes_allocated = 1; 2655 stripes_allocated = 1;
2643again: 2656again:
2644 if (multi_ret) { 2657 if (multi_ret) {
@@ -2674,7 +2687,7 @@ again:
2674 mirror_num = 0; 2687 mirror_num = 0;
2675 2688
2676 /* if our multi bio struct is too small, back off and try again */ 2689 /* if our multi bio struct is too small, back off and try again */
2677 if (rw & (1 << BIO_RW)) { 2690 if (rw & REQ_WRITE) {
2678 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2691 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2679 BTRFS_BLOCK_GROUP_DUP)) { 2692 BTRFS_BLOCK_GROUP_DUP)) {
2680 stripes_required = map->num_stripes; 2693 stripes_required = map->num_stripes;
@@ -2684,7 +2697,7 @@ again:
2684 max_errors = 1; 2697 max_errors = 1;
2685 } 2698 }
2686 } 2699 }
2687 if (multi_ret && (rw & (1 << BIO_RW)) && 2700 if (multi_ret && (rw & REQ_WRITE) &&
2688 stripes_allocated < stripes_required) { 2701 stripes_allocated < stripes_required) {
2689 stripes_allocated = map->num_stripes; 2702 stripes_allocated = map->num_stripes;
2690 free_extent_map(em); 2703 free_extent_map(em);
@@ -2720,7 +2733,7 @@ again:
2720 num_stripes = 1; 2733 num_stripes = 1;
2721 stripe_index = 0; 2734 stripe_index = 0;
2722 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2723 if (unplug_page || (rw & (1 << BIO_RW))) 2736 if (unplug_page || (rw & REQ_WRITE))
2724 num_stripes = map->num_stripes; 2737 num_stripes = map->num_stripes;
2725 else if (mirror_num) 2738 else if (mirror_num)
2726 stripe_index = mirror_num - 1; 2739 stripe_index = mirror_num - 1;
@@ -2731,7 +2744,7 @@ again:
2731 } 2744 }
2732 2745
2733 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2734 if (rw & (1 << BIO_RW)) 2747 if (rw & REQ_WRITE)
2735 num_stripes = map->num_stripes; 2748 num_stripes = map->num_stripes;
2736 else if (mirror_num) 2749 else if (mirror_num)
2737 stripe_index = mirror_num - 1; 2750 stripe_index = mirror_num - 1;
@@ -2742,7 +2755,7 @@ again:
2742 stripe_index = do_div(stripe_nr, factor); 2755 stripe_index = do_div(stripe_nr, factor);
2743 stripe_index *= map->sub_stripes; 2756 stripe_index *= map->sub_stripes;
2744 2757
2745 if (unplug_page || (rw & (1 << BIO_RW))) 2758 if (unplug_page || (rw & REQ_WRITE))
2746 num_stripes = map->sub_stripes; 2759 num_stripes = map->sub_stripes;
2747 else if (mirror_num) 2760 else if (mirror_num)
2748 stripe_index += mirror_num - 1; 2761 stripe_index += mirror_num - 1;
@@ -2932,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2932 struct btrfs_pending_bios *pending_bios; 2945 struct btrfs_pending_bios *pending_bios;
2933 2946
2934 /* don't bother with additional async steps for reads, right now */ 2947 /* don't bother with additional async steps for reads, right now */
2935 if (!(rw & (1 << BIO_RW))) { 2948 if (!(rw & REQ_WRITE)) {
2936 bio_get(bio); 2949 bio_get(bio);
2937 submit_bio(rw, bio); 2950 submit_bio(rw, bio);
2938 bio_put(bio); 2951 bio_put(bio);
@@ -2951,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2951 bio->bi_rw |= rw; 2964 bio->bi_rw |= rw;
2952 2965
2953 spin_lock(&device->io_lock); 2966 spin_lock(&device->io_lock);
2954 if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) 2967 if (bio->bi_rw & REQ_SYNC)
2955 pending_bios = &device->pending_sync_bios; 2968 pending_bios = &device->pending_sync_bios;
2956 else 2969 else
2957 pending_bios = &device->pending_bios; 2970 pending_bios = &device->pending_bios;
@@ -3382,6 +3395,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3382 key.type = 0; 3395 key.type = 0;
3383again: 3396again:
3384 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3397 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3398 if (ret < 0)
3399 goto error;
3385 while (1) { 3400 while (1) {
3386 leaf = path->nodes[0]; 3401 leaf = path->nodes[0];
3387 slot = path->slots[0]; 3402 slot = path->slots[0];
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
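The xattr hunk above shows the transaction API this series converges on: btrfs_start_transaction(root, num_items) reserves metadata space for num_items tree items up front (2 here, matching the units the removed btrfs_reserve_metadata_space() call took) and reports failure via ERR_PTR() rather than NULL. Caller shape, with the prototype assumed to match this tree's transaction.h and the err.h macros redefined locally for the sketch:

#define MAX_ERRNO	4095
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

struct btrfs_root;
struct btrfs_trans_handle;

/* assumed prototype: second argument is the item-reservation count */
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_items);

static long setxattr_trans_model(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans = btrfs_start_transaction(root, 2);

	if (IS_ERR(trans))
		return PTR_ERR(trans);	/* typically -ENOSPC */
	/* ... do_setxattr(), then btrfs_end_transaction_throttle() ... */
	return 0;
}

Reserving at start time is what lets the separate reserve/unreserve pair in the old code disappear.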
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);