aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorRussell King <rmk+kernel@arm.linux.org.uk>2010-08-06 13:13:54 -0400
committerRussell King <rmk+kernel@arm.linux.org.uk>2010-08-06 13:13:54 -0400
commit11e4afb49b7fa1fc8e1ffd850c1806dd86a08204 (patch)
tree9e57efcb106ae912f7bec718feb3f8ec607559bb /fs/btrfs
parent162500b3a3ff39d941d29db49b41a16667ae44f0 (diff)
parent9b2a606d3898fcb2eedb6faded3bb37549590ac4 (diff)
Merge branches 'gemini' and 'misc' into devel
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/acl.c13
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/compression.c23
-rw-r--r--fs/btrfs/ctree.c243
-rw-r--r--fs/btrfs/ctree.h167
-rw-r--r--fs/btrfs/delayed-ref.c102
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c197
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2264
-rw-r--r--fs/btrfs/extent_io.c103
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file-item.c29
-rw-r--r--fs/btrfs/file.c182
-rw-r--r--fs/btrfs/free-space-cache.c1
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1780
-rw-r--r--fs/btrfs/ioctl.c235
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/ordered-data.c87
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/relocation.c1975
-rw-r--r--fs/btrfs/root-tree.c26
-rw-r--r--fs/btrfs/super.c65
-rw-r--r--fs/btrfs/transaction.c315
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c242
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c34
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h6
35 files changed, 5240 insertions, 2961 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -59,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
59 size = __btrfs_getxattr(inode, name, value, size); 60 size = __btrfs_getxattr(inode, name, value, size);
60 if (size > 0) { 61 if (size > 0) {
61 acl = posix_acl_from_xattr(value, size); 62 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl))
64 return acl;
62 set_cached_acl(inode, type, acl); 65 set_cached_acl(inode, type, acl);
63 } 66 }
64 kfree(value); 67 kfree(value);
@@ -159,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
159 int ret; 162 int ret;
160 struct posix_acl *acl = NULL; 163 struct posix_acl *acl = NULL;
161 164
165 if (!is_owner_or_cap(dentry->d_inode))
166 return -EPERM;
167
168 if (!IS_POSIXACL(dentry->d_inode))
169 return -EOPNOTSUPP;
170
162 if (value) { 171 if (value) {
163 acl = posix_acl_from_xattr(value, size); 172 acl = posix_acl_from_xattr(value, size);
164 if (acl == NULL) { 173 if (acl == NULL) {
@@ -281,14 +290,14 @@ int btrfs_acl_chmod(struct inode *inode)
281 return ret; 290 return ret;
282} 291}
283 292
284struct xattr_handler btrfs_xattr_acl_default_handler = { 293const struct xattr_handler btrfs_xattr_acl_default_handler = {
285 .prefix = POSIX_ACL_XATTR_DEFAULT, 294 .prefix = POSIX_ACL_XATTR_DEFAULT,
286 .flags = ACL_TYPE_DEFAULT, 295 .flags = ACL_TYPE_DEFAULT,
287 .get = btrfs_xattr_acl_get, 296 .get = btrfs_xattr_acl_get,
288 .set = btrfs_xattr_acl_set, 297 .set = btrfs_xattr_acl_set,
289}; 298};
290 299
291struct xattr_handler btrfs_xattr_acl_access_handler = { 300const struct xattr_handler btrfs_xattr_acl_access_handler = {
292 .prefix = POSIX_ACL_XATTR_ACCESS, 301 .prefix = POSIX_ACL_XATTR_ACCESS,
293 .flags = ACL_TYPE_ACCESS, 302 .flags = ACL_TYPE_ACCESS,
294 .get = btrfs_xattr_acl_get, 303 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
@@ -376,6 +377,7 @@ again:
376 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
377 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
378 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
379 goto again; 381 goto again;
380 } 382 }
381 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28b92a7218ab..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/pagevec.h> 34#include <linux/slab.h>
35#include "compat.h" 35#include "compat.h"
36#include "ctree.h" 36#include "ctree.h"
37#include "disk-io.h" 37#include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
445 unsigned long nr_pages = 0; 445 unsigned long nr_pages = 0;
446 struct extent_map *em; 446 struct extent_map *em;
447 struct address_space *mapping = inode->i_mapping; 447 struct address_space *mapping = inode->i_mapping;
448 struct pagevec pvec;
449 struct extent_map_tree *em_tree; 448 struct extent_map_tree *em_tree;
450 struct extent_io_tree *tree; 449 struct extent_io_tree *tree;
451 u64 end; 450 u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
461 460
462 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 461 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
463 462
464 pagevec_init(&pvec, 0);
465 while (last_offset < compressed_end) { 463 while (last_offset < compressed_end) {
466 page_index = last_offset >> PAGE_CACHE_SHIFT; 464 page_index = last_offset >> PAGE_CACHE_SHIFT;
467 465
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
478 goto next; 476 goto next;
479 } 477 }
480 478
481 page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS); 479 page = __page_cache_alloc(mapping_gfp_mask(mapping) &
480 ~__GFP_FS);
482 if (!page) 481 if (!page)
483 break; 482 break;
484 483
485 page->index = page_index; 484 if (add_to_page_cache_lru(page, mapping, page_index,
486 /* 485 GFP_NOFS)) {
487 * what we want to do here is call add_to_page_cache_lru,
488 * but that isn't exported, so we reproduce it here
489 */
490 if (add_to_page_cache(page, mapping,
491 page->index, GFP_NOFS)) {
492 page_cache_release(page); 486 page_cache_release(page);
493 goto next; 487 goto next;
494 } 488 }
495 489
496 /* open coding of lru_cache_add, also not exported */
497 page_cache_get(page);
498 if (!pagevec_add(&pvec, page))
499 __pagevec_lru_add_file(&pvec);
500
501 end = last_offset + PAGE_CACHE_SIZE - 1; 490 end = last_offset + PAGE_CACHE_SIZE - 1;
502 /* 491 /*
503 * at this point, we have a locked page in the page cache 492 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
551next: 540next:
552 last_offset += PAGE_CACHE_SIZE; 541 last_offset += PAGE_CACHE_SIZE;
553 } 542 }
554 if (pagevec_count(&pvec))
555 __pagevec_lru_add_file(&pvec);
556 return 0; 543 return 0;
557} 544}
558 545
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
@@ -279,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
279static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
280 struct btrfs_root *root, 281 struct btrfs_root *root,
281 struct extent_buffer *buf, 282 struct extent_buffer *buf,
282 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
283{ 285{
284 u64 refs; 286 u64 refs;
285 u64 owner; 287 u64 owner;
@@ -365,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 BUG_ON(ret); 367 BUG_ON(ret);
366 } 368 }
367 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
368 } 371 }
369 return 0; 372 return 0;
370} 373}
@@ -391,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
391 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
392 struct extent_buffer *cow; 395 struct extent_buffer *cow;
393 int level; 396 int level;
397 int last_ref = 0;
394 int unlock_orig = 0; 398 int unlock_orig = 0;
395 u64 parent_start; 399 u64 parent_start;
396 400
@@ -441,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
441 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
442 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
443 447
444 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
445 452
446 if (buf == root->node) { 453 if (buf == root->node) {
447 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -456,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
456 extent_buffer_get(cow); 463 extent_buffer_get(cow);
457 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
458 465
459 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
460 parent_start, root->root_key.objectid, level); 467 last_ref);
461 free_extent_buffer(buf); 468 free_extent_buffer(buf);
462 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
463 } else { 470 } else {
@@ -472,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
472 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
473 trans->transid); 480 trans->transid);
474 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
475 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
476 parent_start, root->root_key.objectid, level); 483 last_ref);
477 } 484 }
478 if (unlock_orig) 485 if (unlock_orig)
479 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -948,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
948 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
949} 956}
950 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
951/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
952 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
953 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1018,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1018 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1019 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1020 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1021 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1022 1049
1023 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1024 root->node = child; 1051 root->node = child;
@@ -1033,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1033 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1034 /* once for the path */ 1061 /* once for the path */
1035 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1036 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1037 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1038 /* once for the root ptr */ 1066 /* once for the root ptr */
1039 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1040 return ret; 1068 return 0;
1041 } 1069 }
1042 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1043 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1087,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1087 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1088 ret = wret; 1116 ret = wret;
1089 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1090 u64 bytenr = right->start;
1091 u32 blocksize = right->len;
1092
1093 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1094 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1095 free_extent_buffer(right);
1096 right = NULL;
1097 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1098 1); 1121 1);
1099 if (wret) 1122 if (wret)
1100 ret = wret; 1123 ret = wret;
1101 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1102 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1103 root->root_key.objectid, 1126 free_extent_buffer(right);
1104 level); 1127 right = NULL;
1105 if (wret)
1106 ret = wret;
1107 } else { 1128 } else {
1108 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1109 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1135,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1135 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1136 } 1157 }
1137 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1138 /* we've managed to empty the middle node, drop it */
1139 u64 bytenr = mid->start;
1140 u32 blocksize = mid->len;
1141
1142 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1143 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1144 free_extent_buffer(mid);
1145 mid = NULL;
1146 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1147 if (wret) 1162 if (wret)
1148 ret = wret; 1163 ret = wret;
1149 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1150 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1151 if (wret) 1166 free_extent_buffer(mid);
1152 ret = wret; 1167 mid = NULL;
1153 } else { 1168 } else {
1154 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1155 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1589,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1589 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1590 1605
1591 ret = -EAGAIN; 1606 ret = -EAGAIN;
1592 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1593 if (tmp) { 1608 if (tmp) {
1594 /* 1609 /*
1595 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1739,7 +1754,6 @@ again:
1739 p->nodes[level + 1], 1754 p->nodes[level + 1],
1740 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1741 if (err) { 1756 if (err) {
1742 free_extent_buffer(b);
1743 ret = err; 1757 ret = err;
1744 goto done; 1758 goto done;
1745 } 1759 }
@@ -2075,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2075 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2076 return PTR_ERR(c); 2090 return PTR_ERR(c);
2077 2091
2092 root_add_used(root, root->nodesize);
2093
2078 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2079 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2080 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2133,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2133 int nritems; 2149 int nritems;
2134 2150
2135 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2136 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2137 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2138 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2201,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2201 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2202 return PTR_ERR(split); 2219 return PTR_ERR(split);
2203 2220
2221 root_add_used(root, root->nodesize);
2222
2204 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2205 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2206 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2285,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2285 return ret; 2304 return ret;
2286} 2305}
2287 2306
2307/*
2308 * min slot controls the lowest index we're willing to push to the
2309 * right. We'll push up to and including min_slot, but no lower
2310 */
2288static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 2311static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2289 struct btrfs_root *root, 2312 struct btrfs_root *root,
2290 struct btrfs_path *path, 2313 struct btrfs_path *path,
2291 int data_size, int empty, 2314 int data_size, int empty,
2292 struct extent_buffer *right, 2315 struct extent_buffer *right,
2293 int free_space, u32 left_nritems) 2316 int free_space, u32 left_nritems,
2317 u32 min_slot)
2294{ 2318{
2295 struct extent_buffer *left = path->nodes[0]; 2319 struct extent_buffer *left = path->nodes[0];
2296 struct extent_buffer *upper = path->nodes[1]; 2320 struct extent_buffer *upper = path->nodes[1];
@@ -2308,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2308 if (empty) 2332 if (empty)
2309 nr = 0; 2333 nr = 0;
2310 else 2334 else
2311 nr = 1; 2335 nr = max_t(u32, 1, min_slot);
2312 2336
2313 if (path->slots[0] >= left_nritems) 2337 if (path->slots[0] >= left_nritems)
2314 push_space += data_size; 2338 push_space += data_size;
@@ -2414,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2414 2438
2415 if (left_nritems) 2439 if (left_nritems)
2416 btrfs_mark_buffer_dirty(left); 2440 btrfs_mark_buffer_dirty(left);
2441 else
2442 clean_tree_block(trans, root, left);
2443
2417 btrfs_mark_buffer_dirty(right); 2444 btrfs_mark_buffer_dirty(right);
2418 2445
2419 btrfs_item_key(right, &disk_key, 0); 2446 btrfs_item_key(right, &disk_key, 0);
@@ -2447,10 +2474,14 @@ out_unlock:
2447 * 2474 *
2448 * returns 1 if the push failed because the other node didn't have enough 2475 * returns 1 if the push failed because the other node didn't have enough
2449 * room, 0 if everything worked out and < 0 if there were major errors. 2476 * room, 0 if everything worked out and < 0 if there were major errors.
2477 *
2478 * this will push starting from min_slot to the end of the leaf. It won't
2479 * push any slot lower than min_slot
2450 */ 2480 */
2451static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2452 *root, struct btrfs_path *path, int data_size, 2482 *root, struct btrfs_path *path,
2453 int empty) 2483 int min_data_size, int data_size,
2484 int empty, u32 min_slot)
2454{ 2485{
2455 struct extent_buffer *left = path->nodes[0]; 2486 struct extent_buffer *left = path->nodes[0];
2456 struct extent_buffer *right; 2487 struct extent_buffer *right;
@@ -2492,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2492 if (left_nritems == 0) 2523 if (left_nritems == 0)
2493 goto out_unlock; 2524 goto out_unlock;
2494 2525
2495 return __push_leaf_right(trans, root, path, data_size, empty, 2526 return __push_leaf_right(trans, root, path, min_data_size, empty,
2496 right, free_space, left_nritems); 2527 right, free_space, left_nritems, min_slot);
2497out_unlock: 2528out_unlock:
2498 btrfs_tree_unlock(right); 2529 btrfs_tree_unlock(right);
2499 free_extent_buffer(right); 2530 free_extent_buffer(right);
@@ -2503,12 +2534,17 @@ out_unlock:
2503/* 2534/*
2504 * push some data in the path leaf to the left, trying to free up at 2535 * push some data in the path leaf to the left, trying to free up at
2505 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2536 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2537 *
2538 * max_slot can put a limit on how far into the leaf we'll push items. The
2539 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
2540 * items
2506 */ 2541 */
2507static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 2542static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2508 struct btrfs_root *root, 2543 struct btrfs_root *root,
2509 struct btrfs_path *path, int data_size, 2544 struct btrfs_path *path, int data_size,
2510 int empty, struct extent_buffer *left, 2545 int empty, struct extent_buffer *left,
2511 int free_space, int right_nritems) 2546 int free_space, u32 right_nritems,
2547 u32 max_slot)
2512{ 2548{
2513 struct btrfs_disk_key disk_key; 2549 struct btrfs_disk_key disk_key;
2514 struct extent_buffer *right = path->nodes[0]; 2550 struct extent_buffer *right = path->nodes[0];
@@ -2527,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2527 slot = path->slots[1]; 2563 slot = path->slots[1];
2528 2564
2529 if (empty) 2565 if (empty)
2530 nr = right_nritems; 2566 nr = min(right_nritems, max_slot);
2531 else 2567 else
2532 nr = right_nritems - 1; 2568 nr = min(right_nritems - 1, max_slot);
2533 2569
2534 for (i = 0; i < nr; i++) { 2570 for (i = 0; i < nr; i++) {
2535 item = btrfs_item_nr(right, i); 2571 item = btrfs_item_nr(right, i);
@@ -2659,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2659 btrfs_mark_buffer_dirty(left); 2695 btrfs_mark_buffer_dirty(left);
2660 if (right_nritems) 2696 if (right_nritems)
2661 btrfs_mark_buffer_dirty(right); 2697 btrfs_mark_buffer_dirty(right);
2698 else
2699 clean_tree_block(trans, root, right);
2662 2700
2663 btrfs_item_key(right, &disk_key, 0); 2701 btrfs_item_key(right, &disk_key, 0);
2664 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2702 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2668,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2668 /* then fixup the leaf pointer in the path */ 2706 /* then fixup the leaf pointer in the path */
2669 if (path->slots[0] < push_items) { 2707 if (path->slots[0] < push_items) {
2670 path->slots[0] += old_left_nritems; 2708 path->slots[0] += old_left_nritems;
2671 if (btrfs_header_nritems(path->nodes[0]) == 0)
2672 clean_tree_block(trans, root, path->nodes[0]);
2673 btrfs_tree_unlock(path->nodes[0]); 2709 btrfs_tree_unlock(path->nodes[0]);
2674 free_extent_buffer(path->nodes[0]); 2710 free_extent_buffer(path->nodes[0]);
2675 path->nodes[0] = left; 2711 path->nodes[0] = left;
@@ -2690,10 +2726,14 @@ out:
2690/* 2726/*
2691 * push some data in the path leaf to the left, trying to free up at 2727 * push some data in the path leaf to the left, trying to free up at
2692 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2728 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2729 *
2730 * max_slot can put a limit on how far into the leaf we'll push items. The
2731 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2732 * items
2693 */ 2733 */
2694static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2734static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2695 *root, struct btrfs_path *path, int data_size, 2735 *root, struct btrfs_path *path, int min_data_size,
2696 int empty) 2736 int data_size, int empty, u32 max_slot)
2697{ 2737{
2698 struct extent_buffer *right = path->nodes[0]; 2738 struct extent_buffer *right = path->nodes[0];
2699 struct extent_buffer *left; 2739 struct extent_buffer *left;
@@ -2739,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2739 goto out; 2779 goto out;
2740 } 2780 }
2741 2781
2742 return __push_leaf_left(trans, root, path, data_size, 2782 return __push_leaf_left(trans, root, path, min_data_size,
2743 empty, left, free_space, right_nritems); 2783 empty, left, free_space, right_nritems,
2784 max_slot);
2744out: 2785out:
2745 btrfs_tree_unlock(left); 2786 btrfs_tree_unlock(left);
2746 free_extent_buffer(left); 2787 free_extent_buffer(left);
@@ -2833,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2833} 2874}
2834 2875
2835/* 2876/*
2877 * double splits happen when we need to insert a big item in the middle
2878 * of a leaf. A double split can leave us with 3 mostly empty leaves:
2879 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2880 * A B C
2881 *
2882 * We avoid this by trying to push the items on either side of our target
2883 * into the adjacent leaves. If all goes well we can avoid the double split
2884 * completely.
2885 */
2886static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2887 struct btrfs_root *root,
2888 struct btrfs_path *path,
2889 int data_size)
2890{
2891 int ret;
2892 int progress = 0;
2893 int slot;
2894 u32 nritems;
2895
2896 slot = path->slots[0];
2897
2898 /*
2899 * try to push all the items after our slot into the
2900 * right leaf
2901 */
2902 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2903 if (ret < 0)
2904 return ret;
2905
2906 if (ret == 0)
2907 progress++;
2908
2909 nritems = btrfs_header_nritems(path->nodes[0]);
2910 /*
2911 * our goal is to get our slot at the start or end of a leaf. If
2912 * we've done so we're done
2913 */
2914 if (path->slots[0] == 0 || path->slots[0] == nritems)
2915 return 0;
2916
2917 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2918 return 0;
2919
2920 /* try to push all the items before our slot into the next leaf */
2921 slot = path->slots[0];
2922 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2923 if (ret < 0)
2924 return ret;
2925
2926 if (ret == 0)
2927 progress++;
2928
2929 if (progress)
2930 return 0;
2931 return 1;
2932}
2933
2934/*
2836 * split the path's leaf in two, making sure there is at least data_size 2935 * split the path's leaf in two, making sure there is at least data_size
2837 * available for the resulting leaf level of the path. 2936 * available for the resulting leaf level of the path.
2838 * 2937 *
@@ -2854,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2854 int wret; 2953 int wret;
2855 int split; 2954 int split;
2856 int num_doubles = 0; 2955 int num_doubles = 0;
2956 int tried_avoid_double = 0;
2857 2957
2858 l = path->nodes[0]; 2958 l = path->nodes[0];
2859 slot = path->slots[0]; 2959 slot = path->slots[0];
@@ -2862,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2862 return -EOVERFLOW; 2962 return -EOVERFLOW;
2863 2963
2864 /* first try to make some room by pushing left and right */ 2964 /* first try to make some room by pushing left and right */
2865 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2965 if (data_size) {
2866 wret = push_leaf_right(trans, root, path, data_size, 0); 2966 wret = push_leaf_right(trans, root, path, data_size,
2967 data_size, 0, 0);
2867 if (wret < 0) 2968 if (wret < 0)
2868 return wret; 2969 return wret;
2869 if (wret) { 2970 if (wret) {
2870 wret = push_leaf_left(trans, root, path, data_size, 0); 2971 wret = push_leaf_left(trans, root, path, data_size,
2972 data_size, 0, (u32)-1);
2871 if (wret < 0) 2973 if (wret < 0)
2872 return wret; 2974 return wret;
2873 } 2975 }
@@ -2901,6 +3003,8 @@ again:
2901 if (mid != nritems && 3003 if (mid != nritems &&
2902 leaf_space_used(l, mid, nritems - mid) + 3004 leaf_space_used(l, mid, nritems - mid) +
2903 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3005 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3006 if (data_size && !tried_avoid_double)
3007 goto push_for_double;
2904 split = 2; 3008 split = 2;
2905 } 3009 }
2906 } 3010 }
@@ -2917,6 +3021,8 @@ again:
2917 if (mid != nritems && 3021 if (mid != nritems &&
2918 leaf_space_used(l, mid, nritems - mid) + 3022 leaf_space_used(l, mid, nritems - mid) +
2919 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3023 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3024 if (data_size && !tried_avoid_double)
3025 goto push_for_double;
2920 split = 2 ; 3026 split = 2 ;
2921 } 3027 }
2922 } 3028 }
@@ -2931,10 +3037,10 @@ again:
2931 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3037 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2932 root->root_key.objectid, 3038 root->root_key.objectid,
2933 &disk_key, 0, l->start, 0); 3039 &disk_key, 0, l->start, 0);
2934 if (IS_ERR(right)) { 3040 if (IS_ERR(right))
2935 BUG_ON(1);
2936 return PTR_ERR(right); 3041 return PTR_ERR(right);
2937 } 3042
3043 root_add_used(root, root->leafsize);
2938 3044
2939 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 3045 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2940 btrfs_set_header_bytenr(right, right->start); 3046 btrfs_set_header_bytenr(right, right->start);
@@ -2997,6 +3103,13 @@ again:
2997 } 3103 }
2998 3104
2999 return ret; 3105 return ret;
3106
3107push_for_double:
3108 push_for_double_split(trans, root, path, data_size);
3109 tried_avoid_double = 1;
3110 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3111 return 0;
3112 goto again;
3000} 3113}
3001 3114
3002static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3115static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3040,6 +3153,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3040 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3153 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3041 goto err; 3154 goto err;
3042 3155
3156 /* the leaf has changed, it now has room. return now */
3157 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3158 goto err;
3159
3043 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3160 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044 fi = btrfs_item_ptr(leaf, path->slots[0], 3161 fi = btrfs_item_ptr(leaf, path->slots[0],
3045 struct btrfs_file_extent_item); 3162 struct btrfs_file_extent_item);
@@ -3049,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3049 3166
3050 btrfs_set_path_blocking(path); 3167 btrfs_set_path_blocking(path);
3051 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3168 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3052 BUG_ON(ret); 3169 if (ret)
3170 goto err;
3053 3171
3054 path->keep_locks = 0; 3172 path->keep_locks = 0;
3055 btrfs_unlock_up_safe(path, 1); 3173 btrfs_unlock_up_safe(path, 1);
@@ -3791,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3791 */ 3909 */
3792 btrfs_unlock_up_safe(path, 0); 3910 btrfs_unlock_up_safe(path, 0);
3793 3911
3794 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3912 root_sub_used(root, leaf->len);
3795 0, root->root_key.objectid, 0); 3913
3796 return ret; 3914 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3915 return 0;
3797} 3916}
3798/* 3917/*
3799 * delete the item at the leaf level in path. If that empties 3918 * delete the item at the leaf level in path. If that empties
@@ -3860,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3860 if (leaf == root->node) { 3979 if (leaf == root->node) {
3861 btrfs_set_header_level(leaf, 0); 3980 btrfs_set_header_level(leaf, 0);
3862 } else { 3981 } else {
3982 btrfs_set_path_blocking(path);
3983 clean_tree_block(trans, root, leaf);
3863 ret = btrfs_del_leaf(trans, root, path, leaf); 3984 ret = btrfs_del_leaf(trans, root, path, leaf);
3864 BUG_ON(ret); 3985 BUG_ON(ret);
3865 } 3986 }
@@ -3885,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3885 extent_buffer_get(leaf); 4006 extent_buffer_get(leaf);
3886 4007
3887 btrfs_set_path_blocking(path); 4008 btrfs_set_path_blocking(path);
3888 wret = push_leaf_left(trans, root, path, 1, 1); 4009 wret = push_leaf_left(trans, root, path, 1, 1,
4010 1, (u32)-1);
3889 if (wret < 0 && wret != -ENOSPC) 4011 if (wret < 0 && wret != -ENOSPC)
3890 ret = wret; 4012 ret = wret;
3891 4013
3892 if (path->nodes[0] == leaf && 4014 if (path->nodes[0] == leaf &&
3893 btrfs_header_nritems(leaf)) { 4015 btrfs_header_nritems(leaf)) {
3894 wret = push_leaf_right(trans, root, path, 1, 1); 4016 wret = push_leaf_right(trans, root, path, 1,
4017 1, 1, 0);
3895 if (wret < 0 && wret != -ENOSPC) 4018 if (wret < 0 && wret != -ENOSPC)
3896 ret = wret; 4019 ret = wret;
3897 } 4020 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0af2e3868573..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
29#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
30#include "extent_io.h" 31#include "extent_io.h"
31#include "extent_map.h" 32#include "extent_map.h"
@@ -33,6 +34,7 @@
33 34
34struct btrfs_trans_handle; 35struct btrfs_trans_handle;
35struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
36extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -662,6 +664,7 @@ struct btrfs_csum_item {
662#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
663#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
664#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
665 668
666struct btrfs_block_group_item { 669struct btrfs_block_group_item {
667 __le64 used; 670 __le64 used;
@@ -673,42 +676,46 @@ struct btrfs_space_info {
673 u64 flags; 676 u64 flags;
674 677
675 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
676 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
680 this does't take mirrors into account */
677 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
678 transaction finishes */ 682 transaction finishes */
679 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
680 current allocations */ 684 current allocations */
681 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
682 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
683 u64 bytes_root; /* the number of bytes needed to commit a
684 transaction */
685 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
686 delalloc/allocations */ 688 delalloc/allocations */
687 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
688 delayed allocation */
689 690
690 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
691 chunks for this space */ 692 chunks for this space */
692 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
693 this space */ 694 this space */
694 int force_delalloc; /* make people start doing filemap_flush until
695 we're under a threshold */
696 695
697 struct list_head list; 696 struct list_head list;
698 697
699 /* for controlling how we free up space for allocations */
700 wait_queue_head_t allocate_wait;
701 wait_queue_head_t flush_wait;
702 int allocating_chunk;
703 int flushing;
704
705 /* for block groups in our same type */ 698 /* for block groups in our same type */
706 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
707 spinlock_t lock; 700 spinlock_t lock;
708 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
709 atomic_t caching_threads; 702 atomic_t caching_threads;
710}; 703};
711 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
712/* 719/*
713 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
714 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -759,6 +766,7 @@ struct btrfs_block_group_cache {
759 spinlock_t lock; 766 spinlock_t lock;
760 u64 pinned; 767 u64 pinned;
761 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
762 u64 bytes_super; 770 u64 bytes_super;
763 u64 flags; 771 u64 flags;
764 u64 sectorsize; 772 u64 sectorsize;
@@ -824,6 +832,22 @@ struct btrfs_fs_info {
824 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
825 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
826 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
827 u64 generation; 851 u64 generation;
828 u64 last_trans_committed; 852 u64 last_trans_committed;
829 853
@@ -834,7 +858,6 @@ struct btrfs_fs_info {
834 u64 last_trans_log_full_commit; 858 u64 last_trans_log_full_commit;
835 u64 open_ioctl_trans; 859 u64 open_ioctl_trans;
836 unsigned long mount_opt; 860 unsigned long mount_opt;
837 u64 max_extent;
838 u64 max_inline; 861 u64 max_inline;
839 u64 alloc_start; 862 u64 alloc_start;
840 struct btrfs_transaction *running_transaction; 863 struct btrfs_transaction *running_transaction;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2349int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2350int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2351void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2352int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2353int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2354void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2355void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2356extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2357 2430
2358/* ioctl.c */ 2431/* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
2361void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2362 2435
2363/* file.c */ 2436/* file.c */
2364int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2437int btrfs_sync_file(struct file *file, int datasync);
2365int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2366 int skip_pinned); 2439 int skip_pinned);
2367int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root); 2482 struct btrfs_root *root);
2410int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2411int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2412#endif 2493#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
@@ -318,107 +319,6 @@ out:
318} 319}
319 320
320/* 321/*
321 * helper function to lookup reference count and flags of extent.
322 *
323 * the head node for delayed ref is used to store the sum of all the
324 * reference count modifications queued up in the rbtree. the head
325 * node may also store the extent flags to set. This way you can check
326 * to see what the reference count and extent flags would be if all of
327 * the delayed refs are not processed.
328 */
329int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
330 struct btrfs_root *root, u64 bytenr,
331 u64 num_bytes, u64 *refs, u64 *flags)
332{
333 struct btrfs_delayed_ref_node *ref;
334 struct btrfs_delayed_ref_head *head;
335 struct btrfs_delayed_ref_root *delayed_refs;
336 struct btrfs_path *path;
337 struct btrfs_extent_item *ei;
338 struct extent_buffer *leaf;
339 struct btrfs_key key;
340 u32 item_size;
341 u64 num_refs;
342 u64 extent_flags;
343 int ret;
344
345 path = btrfs_alloc_path();
346 if (!path)
347 return -ENOMEM;
348
349 key.objectid = bytenr;
350 key.type = BTRFS_EXTENT_ITEM_KEY;
351 key.offset = num_bytes;
352 delayed_refs = &trans->transaction->delayed_refs;
353again:
354 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
355 &key, path, 0, 0);
356 if (ret < 0)
357 goto out;
358
359 if (ret == 0) {
360 leaf = path->nodes[0];
361 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
362 if (item_size >= sizeof(*ei)) {
363 ei = btrfs_item_ptr(leaf, path->slots[0],
364 struct btrfs_extent_item);
365 num_refs = btrfs_extent_refs(leaf, ei);
366 extent_flags = btrfs_extent_flags(leaf, ei);
367 } else {
368#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
369 struct btrfs_extent_item_v0 *ei0;
370 BUG_ON(item_size != sizeof(*ei0));
371 ei0 = btrfs_item_ptr(leaf, path->slots[0],
372 struct btrfs_extent_item_v0);
373 num_refs = btrfs_extent_refs_v0(leaf, ei0);
374 /* FIXME: this isn't correct for data */
375 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
376#else
377 BUG();
378#endif
379 }
380 BUG_ON(num_refs == 0);
381 } else {
382 num_refs = 0;
383 extent_flags = 0;
384 ret = 0;
385 }
386
387 spin_lock(&delayed_refs->lock);
388 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
389 if (ref) {
390 head = btrfs_delayed_node_to_head(ref);
391 if (!mutex_trylock(&head->mutex)) {
392 atomic_inc(&ref->refs);
393 spin_unlock(&delayed_refs->lock);
394
395 btrfs_release_path(root->fs_info->extent_root, path);
396
397 mutex_lock(&head->mutex);
398 mutex_unlock(&head->mutex);
399 btrfs_put_delayed_ref(ref);
400 goto again;
401 }
402 if (head->extent_op && head->extent_op->update_flags)
403 extent_flags |= head->extent_op->flags_to_set;
404 else
405 BUG_ON(num_refs == 0);
406
407 num_refs += ref->ref_mod;
408 mutex_unlock(&head->mutex);
409 }
410 WARN_ON(num_refs == 0);
411 if (refs)
412 *refs = num_refs;
413 if (flags)
414 *flags = extent_flags;
415out:
416 spin_unlock(&delayed_refs->lock);
417 btrfs_free_path(path);
418 return ret;
419}
420
421/*
422 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
423 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
424 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11d0ad30e203..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root); 45static void free_fs_root(struct btrfs_root *root);
45 46
46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
47
48/* 47/*
49 * end_io_wq structs are used to do processing in task context when an IO is 48 * end_io_wq structs are used to do processing in task context when an IO is
50 * complete. This is used during reads to verify checksums, and it is used 49 * complete. This is used during reads to verify checksums, and it is used
@@ -75,6 +74,11 @@ struct async_submit_bio {
75 int rw; 74 int rw;
76 int mirror_num; 75 int mirror_num;
77 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
78 struct btrfs_work work; 82 struct btrfs_work work;
79}; 83};
80 84
@@ -535,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
535 async = container_of(work, struct async_submit_bio, work); 539 async = container_of(work, struct async_submit_bio, work);
536 fs_info = BTRFS_I(async->inode)->root->fs_info; 540 fs_info = BTRFS_I(async->inode)->root->fs_info;
537 async->submit_bio_start(async->inode, async->rw, async->bio, 541 async->submit_bio_start(async->inode, async->rw, async->bio,
538 async->mirror_num, async->bio_flags); 542 async->mirror_num, async->bio_flags,
543 async->bio_offset);
539} 544}
540 545
541static void run_one_async_done(struct btrfs_work *work) 546static void run_one_async_done(struct btrfs_work *work)
@@ -557,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
557 wake_up(&fs_info->async_submit_wait); 562 wake_up(&fs_info->async_submit_wait);
558 563
559 async->submit_bio_done(async->inode, async->rw, async->bio, 564 async->submit_bio_done(async->inode, async->rw, async->bio,
560 async->mirror_num, async->bio_flags); 565 async->mirror_num, async->bio_flags,
566 async->bio_offset);
561} 567}
562 568
563static void run_one_async_free(struct btrfs_work *work) 569static void run_one_async_free(struct btrfs_work *work)
@@ -571,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
571int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 577int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
572 int rw, struct bio *bio, int mirror_num, 578 int rw, struct bio *bio, int mirror_num,
573 unsigned long bio_flags, 579 unsigned long bio_flags,
580 u64 bio_offset,
574 extent_submit_bio_hook_t *submit_bio_start, 581 extent_submit_bio_hook_t *submit_bio_start,
575 extent_submit_bio_hook_t *submit_bio_done) 582 extent_submit_bio_hook_t *submit_bio_done)
576{ 583{
@@ -593,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
593 600
594 async->work.flags = 0; 601 async->work.flags = 0;
595 async->bio_flags = bio_flags; 602 async->bio_flags = bio_flags;
603 async->bio_offset = bio_offset;
596 604
597 atomic_inc(&fs_info->nr_async_submits); 605 atomic_inc(&fs_info->nr_async_submits);
598 606
@@ -628,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
628 636
629static int __btree_submit_bio_start(struct inode *inode, int rw, 637static int __btree_submit_bio_start(struct inode *inode, int rw,
630 struct bio *bio, int mirror_num, 638 struct bio *bio, int mirror_num,
631 unsigned long bio_flags) 639 unsigned long bio_flags,
640 u64 bio_offset)
632{ 641{
633 /* 642 /*
634 * when we're called for a write, we're already in the async 643 * when we're called for a write, we're already in the async
@@ -639,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
639} 648}
640 649
641static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 650static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
642 int mirror_num, unsigned long bio_flags) 651 int mirror_num, unsigned long bio_flags,
652 u64 bio_offset)
643{ 653{
644 /* 654 /*
645 * when we're called for a write, we're already in the async 655 * when we're called for a write, we're already in the async
@@ -649,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
649} 659}
650 660
651static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 661static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
652 int mirror_num, unsigned long bio_flags) 662 int mirror_num, unsigned long bio_flags,
663 u64 bio_offset)
653{ 664{
654 int ret; 665 int ret;
655 666
@@ -672,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
672 */ 683 */
673 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 684 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
674 inode, rw, bio, mirror_num, 0, 685 inode, rw, bio, mirror_num, 0,
686 bio_offset,
675 __btree_submit_bio_start, 687 __btree_submit_bio_start,
676 __btree_submit_bio_done); 688 __btree_submit_bio_done);
677} 689}
@@ -895,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
895 root->ref_cows = 0; 907 root->ref_cows = 0;
896 root->track_dirty = 0; 908 root->track_dirty = 0;
897 root->in_radix = 0; 909 root->in_radix = 0;
898 root->clean_orphans = 0; 910 root->orphan_item_inserted = 0;
911 root->orphan_cleanup_state = 0;
899 912
900 root->fs_info = fs_info; 913 root->fs_info = fs_info;
901 root->objectid = objectid; 914 root->objectid = objectid;
@@ -904,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
904 root->name = NULL; 917 root->name = NULL;
905 root->in_sysfs = 0; 918 root->in_sysfs = 0;
906 root->inode_tree = RB_ROOT; 919 root->inode_tree = RB_ROOT;
920 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL;
907 922
908 INIT_LIST_HEAD(&root->dirty_list); 923 INIT_LIST_HEAD(&root->dirty_list);
909 INIT_LIST_HEAD(&root->orphan_list); 924 INIT_LIST_HEAD(&root->orphan_list);
910 INIT_LIST_HEAD(&root->root_list); 925 INIT_LIST_HEAD(&root->root_list);
911 spin_lock_init(&root->node_lock); 926 spin_lock_init(&root->node_lock);
912 spin_lock_init(&root->list_lock); 927 spin_lock_init(&root->orphan_lock);
913 spin_lock_init(&root->inode_lock); 928 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock);
914 mutex_init(&root->objectid_mutex); 930 mutex_init(&root->objectid_mutex);
915 mutex_init(&root->log_mutex); 931 mutex_init(&root->log_mutex);
916 init_waitqueue_head(&root->log_writer_wait); 932 init_waitqueue_head(&root->log_writer_wait);
@@ -969,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
969 return 0; 985 return 0;
970} 986}
971 987
972int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
973 struct btrfs_fs_info *fs_info)
974{
975 struct extent_buffer *eb;
976 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
977 u64 start = 0;
978 u64 end = 0;
979 int ret;
980
981 if (!log_root_tree)
982 return 0;
983
984 while (1) {
985 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
986 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
987 if (ret)
988 break;
989
990 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
991 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
992 }
993 eb = fs_info->log_root_tree->node;
994
995 WARN_ON(btrfs_header_level(eb) != 0);
996 WARN_ON(btrfs_header_nritems(eb) != 0);
997
998 ret = btrfs_free_reserved_extent(fs_info->tree_root,
999 eb->start, eb->len);
1000 BUG_ON(ret);
1001
1002 free_extent_buffer(eb);
1003 kfree(fs_info->log_root_tree);
1004 fs_info->log_root_tree = NULL;
1005 return 0;
1006}
1007
1008static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 988static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1009 struct btrfs_fs_info *fs_info) 989 struct btrfs_fs_info *fs_info)
1010{ 990{
@@ -1192,19 +1172,23 @@ again:
1192 if (root) 1172 if (root)
1193 return root; 1173 return root;
1194 1174
1195 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1196 if (ret == 0)
1197 ret = -ENOENT;
1198 if (ret < 0)
1199 return ERR_PTR(ret);
1200
1201 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1175 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1202 if (IS_ERR(root)) 1176 if (IS_ERR(root))
1203 return root; 1177 return root;
1204 1178
1205 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1206 set_anon_super(&root->anon_super, NULL); 1179 set_anon_super(&root->anon_super, NULL);
1207 1180
1181 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT;
1183 goto fail;
1184 }
1185
1186 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1187 if (ret < 0)
1188 goto fail;
1189 if (ret == 0)
1190 root->orphan_item_inserted = 1;
1191
1208 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1192 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1209 if (ret) 1193 if (ret)
1210 goto fail; 1194 goto fail;
@@ -1213,10 +1197,9 @@ again:
1213 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1197 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1214 (unsigned long)root->root_key.objectid, 1198 (unsigned long)root->root_key.objectid,
1215 root); 1199 root);
1216 if (ret == 0) { 1200 if (ret == 0)
1217 root->in_radix = 1; 1201 root->in_radix = 1;
1218 root->clean_orphans = 1; 1202
1219 }
1220 spin_unlock(&fs_info->fs_roots_radix_lock); 1203 spin_unlock(&fs_info->fs_roots_radix_lock);
1221 radix_tree_preload_end(); 1204 radix_tree_preload_end();
1222 if (ret) { 1205 if (ret) {
@@ -1374,19 +1357,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1374{ 1357{
1375 int err; 1358 int err;
1376 1359
1377 bdi->name = "btrfs";
1378 bdi->capabilities = BDI_CAP_MAP_COPY; 1360 bdi->capabilities = BDI_CAP_MAP_COPY;
1379 err = bdi_init(bdi); 1361 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1380 if (err) 1362 if (err)
1381 return err; 1363 return err;
1382 1364
1383 err = bdi_register(bdi, NULL, "btrfs-%d",
1384 atomic_inc_return(&btrfs_bdi_num));
1385 if (err) {
1386 bdi_destroy(bdi);
1387 return err;
1388 }
1389
1390 bdi->ra_pages = default_backing_dev_info.ra_pages; 1365 bdi->ra_pages = default_backing_dev_info.ra_pages;
1391 bdi->unplug_io_fn = btrfs_unplug_io_fn; 1366 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1392 bdi->unplug_io_data = info; 1367 bdi->unplug_io_data = info;
@@ -1470,10 +1445,6 @@ static int cleaner_kthread(void *arg)
1470 struct btrfs_root *root = arg; 1445 struct btrfs_root *root = arg;
1471 1446
1472 do { 1447 do {
1473 smp_mb();
1474 if (root->fs_info->closing)
1475 break;
1476
1477 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1448 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1478 1449
1479 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1450 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1486,11 +1457,9 @@ static int cleaner_kthread(void *arg)
1486 if (freezing(current)) { 1457 if (freezing(current)) {
1487 refrigerator(); 1458 refrigerator();
1488 } else { 1459 } else {
1489 smp_mb();
1490 if (root->fs_info->closing)
1491 break;
1492 set_current_state(TASK_INTERRUPTIBLE); 1460 set_current_state(TASK_INTERRUPTIBLE);
1493 schedule(); 1461 if (!kthread_should_stop())
1462 schedule();
1494 __set_current_state(TASK_RUNNING); 1463 __set_current_state(TASK_RUNNING);
1495 } 1464 }
1496 } while (!kthread_should_stop()); 1465 } while (!kthread_should_stop());
@@ -1502,36 +1471,40 @@ static int transaction_kthread(void *arg)
1502 struct btrfs_root *root = arg; 1471 struct btrfs_root *root = arg;
1503 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1504 struct btrfs_transaction *cur; 1473 struct btrfs_transaction *cur;
1474 u64 transid;
1505 unsigned long now; 1475 unsigned long now;
1506 unsigned long delay; 1476 unsigned long delay;
1507 int ret; 1477 int ret;
1508 1478
1509 do { 1479 do {
1510 smp_mb();
1511 if (root->fs_info->closing)
1512 break;
1513
1514 delay = HZ * 30; 1480 delay = HZ * 30;
1515 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1516 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1482 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1517 1483
1518 mutex_lock(&root->fs_info->trans_mutex); 1484 spin_lock(&root->fs_info->new_trans_lock);
1519 cur = root->fs_info->running_transaction; 1485 cur = root->fs_info->running_transaction;
1520 if (!cur) { 1486 if (!cur) {
1521 mutex_unlock(&root->fs_info->trans_mutex); 1487 spin_unlock(&root->fs_info->new_trans_lock);
1522 goto sleep; 1488 goto sleep;
1523 } 1489 }
1524 1490
1525 now = get_seconds(); 1491 now = get_seconds();
1526 if (now < cur->start_time || now - cur->start_time < 30) { 1492 if (!cur->blocked &&
1527 mutex_unlock(&root->fs_info->trans_mutex); 1493 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock);
1528 delay = HZ * 5; 1495 delay = HZ * 5;
1529 goto sleep; 1496 goto sleep;
1530 } 1497 }
1531 mutex_unlock(&root->fs_info->trans_mutex); 1498 transid = cur->transid;
1532 trans = btrfs_start_transaction(root, 1); 1499 spin_unlock(&root->fs_info->new_trans_lock);
1533 ret = btrfs_commit_transaction(trans, root);
1534 1500
1501 trans = btrfs_join_transaction(root, 1);
1502 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret);
1505 } else {
1506 btrfs_end_transaction(trans, root);
1507 }
1535sleep: 1508sleep:
1536 wake_up_process(root->fs_info->cleaner_kthread); 1509 wake_up_process(root->fs_info->cleaner_kthread);
1537 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1510 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1539,10 +1512,10 @@ sleep:
1539 if (freezing(current)) { 1512 if (freezing(current)) {
1540 refrigerator(); 1513 refrigerator();
1541 } else { 1514 } else {
1542 if (root->fs_info->closing)
1543 break;
1544 set_current_state(TASK_INTERRUPTIBLE); 1515 set_current_state(TASK_INTERRUPTIBLE);
1545 schedule_timeout(delay); 1516 if (!kthread_should_stop() &&
1517 !btrfs_transaction_blocked(root->fs_info))
1518 schedule_timeout(delay);
1546 __set_current_state(TASK_RUNNING); 1519 __set_current_state(TASK_RUNNING);
1547 } 1520 }
1548 } while (!kthread_should_stop()); 1521 } while (!kthread_should_stop());
@@ -1629,12 +1602,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1629 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1602 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1630 INIT_LIST_HEAD(&fs_info->space_info); 1603 INIT_LIST_HEAD(&fs_info->space_info);
1631 btrfs_mapping_init(&fs_info->mapping_tree); 1604 btrfs_mapping_init(&fs_info->mapping_tree);
1605 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1606 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1607 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1608 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1609 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1610 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
1611 mutex_init(&fs_info->durable_block_rsv_mutex);
1632 atomic_set(&fs_info->nr_async_submits, 0); 1612 atomic_set(&fs_info->nr_async_submits, 0);
1633 atomic_set(&fs_info->async_delalloc_pages, 0); 1613 atomic_set(&fs_info->async_delalloc_pages, 0);
1634 atomic_set(&fs_info->async_submit_draining, 0); 1614 atomic_set(&fs_info->async_submit_draining, 0);
1635 atomic_set(&fs_info->nr_async_bios, 0); 1615 atomic_set(&fs_info->nr_async_bios, 0);
1636 fs_info->sb = sb; 1616 fs_info->sb = sb;
1637 fs_info->max_extent = (u64)-1;
1638 fs_info->max_inline = 8192 * 1024; 1617 fs_info->max_inline = 8192 * 1024;
1639 fs_info->metadata_ratio = 0; 1618 fs_info->metadata_ratio = 0;
1640 1619
@@ -1769,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1769 min_t(u64, fs_devices->num_devices, 1748 min_t(u64, fs_devices->num_devices,
1770 fs_info->thread_pool_size), 1749 fs_info->thread_pool_size),
1771 &fs_info->generic_worker); 1750 &fs_info->generic_worker);
1772 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1773 fs_info->thread_pool_size,
1774 &fs_info->generic_worker);
1775 1751
1776 /* a higher idle thresh on the submit workers makes it much more 1752 /* a higher idle thresh on the submit workers makes it much more
1777 * likely that bios will be send down in a sane order to the 1753 * likely that bios will be send down in a sane order to the
@@ -1819,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1819 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1820 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1821 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1797 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1822 btrfs_start_workers(&fs_info->enospc_workers, 1);
1823 1798
1824 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1825 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1922,17 +1897,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1922 1897
1923 csum_root->track_dirty = 1; 1898 csum_root->track_dirty = 1;
1924 1899
1925 btrfs_read_block_groups(extent_root);
1926
1927 fs_info->generation = generation; 1900 fs_info->generation = generation;
1928 fs_info->last_trans_committed = generation; 1901 fs_info->last_trans_committed = generation;
1929 fs_info->data_alloc_profile = (u64)-1; 1902 fs_info->data_alloc_profile = (u64)-1;
1930 fs_info->metadata_alloc_profile = (u64)-1; 1903 fs_info->metadata_alloc_profile = (u64)-1;
1931 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905
1906 ret = btrfs_read_block_groups(extent_root);
1907 if (ret) {
1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1909 goto fail_block_groups;
1910 }
1911
1932 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1912 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1933 "btrfs-cleaner"); 1913 "btrfs-cleaner");
1934 if (IS_ERR(fs_info->cleaner_kthread)) 1914 if (IS_ERR(fs_info->cleaner_kthread))
1935 goto fail_csum_root; 1915 goto fail_block_groups;
1936 1916
1937 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1917 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1938 tree_root, 1918 tree_root,
@@ -1961,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1961 btrfs_level_size(tree_root, 1941 btrfs_level_size(tree_root,
1962 btrfs_super_log_root_level(disk_super)); 1942 btrfs_super_log_root_level(disk_super));
1963 1943
1964 log_tree_root = kzalloc(sizeof(struct btrfs_root), 1944 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1965 GFP_NOFS); 1945 if (!log_tree_root) {
1946 err = -ENOMEM;
1947 goto fail_trans_kthread;
1948 }
1966 1949
1967 __setup_root(nodesize, leafsize, sectorsize, stripesize, 1950 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1968 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); 1951 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1983,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1983 BUG_ON(ret); 1966 BUG_ON(ret);
1984 1967
1985 if (!(sb->s_flags & MS_RDONLY)) { 1968 if (!(sb->s_flags & MS_RDONLY)) {
1969 ret = btrfs_cleanup_fs_roots(fs_info);
1970 BUG_ON(ret);
1971
1986 ret = btrfs_recover_relocation(tree_root); 1972 ret = btrfs_recover_relocation(tree_root);
1987 if (ret < 0) { 1973 if (ret < 0) {
1988 printk(KERN_WARNING 1974 printk(KERN_WARNING
@@ -1999,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1999 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 1985 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2000 if (!fs_info->fs_root) 1986 if (!fs_info->fs_root)
2001 goto fail_trans_kthread; 1987 goto fail_trans_kthread;
1988 if (IS_ERR(fs_info->fs_root)) {
1989 err = PTR_ERR(fs_info->fs_root);
1990 goto fail_trans_kthread;
1991 }
2002 1992
2003 if (!(sb->s_flags & MS_RDONLY)) { 1993 if (!(sb->s_flags & MS_RDONLY)) {
2004 down_read(&fs_info->cleanup_work_sem); 1994 down_read(&fs_info->cleanup_work_sem);
@@ -2020,7 +2010,8 @@ fail_cleaner:
2020 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2010 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2021 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2011 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2022 2012
2023fail_csum_root: 2013fail_block_groups:
2014 btrfs_free_block_groups(fs_info);
2024 free_extent_buffer(csum_root->node); 2015 free_extent_buffer(csum_root->node);
2025 free_extent_buffer(csum_root->commit_root); 2016 free_extent_buffer(csum_root->commit_root);
2026fail_dev_root: 2017fail_dev_root:
@@ -2045,7 +2036,6 @@ fail_sb_buffer:
2045 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2046 btrfs_stop_workers(&fs_info->endio_write_workers); 2037 btrfs_stop_workers(&fs_info->endio_write_workers);
2047 btrfs_stop_workers(&fs_info->submit_workers); 2038 btrfs_stop_workers(&fs_info->submit_workers);
2048 btrfs_stop_workers(&fs_info->enospc_workers);
2049fail_iput: 2039fail_iput:
2050 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2051 iput(fs_info->btree_inode); 2041 iput(fs_info->btree_inode);
@@ -2410,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
2410 down_write(&root->fs_info->cleanup_work_sem); 2400 down_write(&root->fs_info->cleanup_work_sem);
2411 up_write(&root->fs_info->cleanup_work_sem); 2401 up_write(&root->fs_info->cleanup_work_sem);
2412 2402
2413 trans = btrfs_start_transaction(root, 1); 2403 trans = btrfs_join_transaction(root, 1);
2414 ret = btrfs_commit_transaction(trans, root); 2404 ret = btrfs_commit_transaction(trans, root);
2415 BUG_ON(ret); 2405 BUG_ON(ret);
2416 /* run commit again to drop the original snapshot */ 2406 /* run commit again to drop the original snapshot */
2417 trans = btrfs_start_transaction(root, 1); 2407 trans = btrfs_join_transaction(root, 1);
2418 btrfs_commit_transaction(trans, root); 2408 btrfs_commit_transaction(trans, root);
2419 ret = btrfs_write_and_wait_transaction(NULL, root); 2409 ret = btrfs_write_and_wait_transaction(NULL, root);
2420 BUG_ON(ret); 2410 BUG_ON(ret);
@@ -2431,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
2431 fs_info->closing = 1; 2421 fs_info->closing = 1;
2432 smp_mb(); 2422 smp_mb();
2433 2423
2434 kthread_stop(root->fs_info->transaction_kthread);
2435 kthread_stop(root->fs_info->cleaner_kthread);
2436
2437 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2438 ret = btrfs_commit_super(root); 2425 ret = btrfs_commit_super(root);
2439 if (ret) 2426 if (ret)
2440 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2427 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2441 } 2428 }
2442 2429
2430 kthread_stop(root->fs_info->transaction_kthread);
2431 kthread_stop(root->fs_info->cleaner_kthread);
2432
2443 fs_info->closing = 2; 2433 fs_info->closing = 2;
2444 smp_mb(); 2434 smp_mb();
2445 2435
@@ -2478,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
2478 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2479 btrfs_stop_workers(&fs_info->endio_write_workers); 2469 btrfs_stop_workers(&fs_info->endio_write_workers);
2480 btrfs_stop_workers(&fs_info->submit_workers); 2470 btrfs_stop_workers(&fs_info->submit_workers);
2481 btrfs_stop_workers(&fs_info->enospc_workers);
2482 2471
2483 btrfs_close_devices(fs_info->fs_devices); 2472 btrfs_close_devices(fs_info->fs_devices);
2484 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2473 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 87 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
89 int rw, struct bio *bio, int mirror_num, 89 int rw, struct bio *bio, int mirror_num,
90 unsigned long bio_flags, 90 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 91 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 92 extent_submit_bio_hook_t *submit_bio_done);
93 93
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 96int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
98int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
99 struct btrfs_fs_info *fs_info);
100int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 98int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101 struct btrfs_fs_info *fs_info); 99 struct btrfs_fs_info *fs_info);
102int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 100int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1727b26fb194..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h>
25#include "compat.h" 26#include "compat.h"
26#include "hash.h" 27#include "hash.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -34,10 +35,9 @@
34 35
35static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
36 struct btrfs_root *root, 37 struct btrfs_root *root,
37 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc);
38 int mark_free); 39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
39static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve, int sinfo);
40 u64 num_bytes, int reserve);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 42 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -60,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
60static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 62 u64 flags, int force);
63static int pin_down_bytes(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 struct btrfs_path *path,
66 u64 bytenr, u64 num_bytes,
67 int is_data, int reserved,
68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 63static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 64 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -90,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
90 84
91void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
92{ 86{
93 if (atomic_dec_and_test(&cache->count)) 87 if (atomic_dec_and_test(&cache->count)) {
88 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0);
94 kfree(cache); 91 kfree(cache);
92 }
95} 93}
96 94
97/* 95/*
@@ -318,7 +316,7 @@ static int caching_kthread(void *data)
318 316
319 exclude_super_stripes(extent_root, block_group); 317 exclude_super_stripes(extent_root, block_group);
320 spin_lock(&block_group->space_info->lock); 318 spin_lock(&block_group->space_info->lock);
321 block_group->space_info->bytes_super += block_group->bytes_super; 319 block_group->space_info->bytes_readonly += block_group->bytes_super;
322 spin_unlock(&block_group->space_info->lock); 320 spin_unlock(&block_group->space_info->lock);
323 321
324 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -506,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
506 struct list_head *head = &info->space_info; 504 struct list_head *head = &info->space_info;
507 struct btrfs_space_info *found; 505 struct btrfs_space_info *found;
508 506
507 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
508 BTRFS_BLOCK_GROUP_METADATA;
509
509 rcu_read_lock(); 510 rcu_read_lock();
510 list_for_each_entry_rcu(found, head, list) { 511 list_for_each_entry_rcu(found, head, list) {
511 if (found->flags == flags) { 512 if (found->flags == flags) {
@@ -609,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
609} 610}
610 611
611/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
719/*
612 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
613 * 721 *
614 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
@@ -1588,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1588 u64 start, u64 len) 1696 u64 start, u64 len)
1589{ 1697{
1590 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1591 DISCARD_FL_BARRIER); 1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1592} 1700}
1593 1701
1594static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1870,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1870 return ret; 1978 return ret;
1871} 1979}
1872 1980
1873
1874/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1875static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1876 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1890,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1890 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1891 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1892 if (insert_reserved) { 1999 if (insert_reserved) {
1893 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1894 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1895
1896 ret = pin_down_bytes(trans, root, NULL,
1897 node->bytenr, node->num_bytes,
1898 head->is_data, 1, &must_clean);
1899 if (ret > 0)
1900 mark_free = 1;
1901
1902 if (must_clean) {
1903 clean_tree_block(NULL, root, must_clean);
1904 btrfs_tree_unlock(must_clean);
1905 free_extent_buffer(must_clean);
1906 }
1907 if (head->is_data) { 2002 if (head->is_data) {
1908 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1909 node->bytenr, 2004 node->bytenr,
1910 node->num_bytes); 2005 node->num_bytes);
1911 BUG_ON(ret); 2006 BUG_ON(ret);
1912 } 2007 }
1913 if (mark_free) {
1914 ret = btrfs_free_reserved_extent(root,
1915 node->bytenr,
1916 node->num_bytes);
1917 BUG_ON(ret);
1918 }
1919 } 2008 }
1920 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1921 return 0; 2010 return 0;
@@ -2346,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2346 ret = 0; 2435 ret = 0;
2347out: 2436out:
2348 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2349 return ret; 2440 return ret;
2350} 2441}
2351 2442
@@ -2659,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2659 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2660{ 2751{
2661 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2662 2761
2663 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2664 if (found) { 2763 if (found) {
2665 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2666 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2667 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2668 found->full = 0; 2768 found->full = 0;
2669 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2670 *space_info = found; 2770 *space_info = found;
@@ -2674,16 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2674 if (!found) 2774 if (!found)
2675 return -ENOMEM; 2775 return -ENOMEM;
2676 2776
2677 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2678 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2679 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2680 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2681 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2682 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2683 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2684 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2685 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2686 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2687 found->full = 0; 2791 found->full = 0;
2688 found->force_alloc = 0; 2792 found->force_alloc = 0;
2689 *space_info = found; 2793 *space_info = found;
@@ -2708,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2708 } 2812 }
2709} 2813}
2710 2814
2711static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2712{
2713 spin_lock(&cache->space_info->lock);
2714 spin_lock(&cache->lock);
2715 if (!cache->ro) {
2716 cache->space_info->bytes_readonly += cache->key.offset -
2717 btrfs_block_group_used(&cache->item);
2718 cache->ro = 1;
2719 }
2720 spin_unlock(&cache->lock);
2721 spin_unlock(&cache->space_info->lock);
2722}
2723
2724u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2725{ 2816{
2726 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2749,492 +2840,49 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2749 return flags; 2840 return flags;
2750} 2841}
2751 2842
2752static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2753{ 2844{
2754 struct btrfs_fs_info *info = root->fs_info; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2755 u64 alloc_profile; 2846 flags |= root->fs_info->avail_data_alloc_bits &
2756 2847 root->fs_info->data_alloc_profile;
2757 if (data) { 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2758 alloc_profile = info->avail_data_alloc_bits & 2849 flags |= root->fs_info->avail_system_alloc_bits &
2759 info->data_alloc_profile; 2850 root->fs_info->system_alloc_profile;
2760 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2761 } else if (root == root->fs_info->chunk_root) { 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2762 alloc_profile = info->avail_system_alloc_bits & 2853 root->fs_info->metadata_alloc_profile;
2763 info->system_alloc_profile; 2854 return btrfs_reduce_alloc_profile(root, flags);
2764 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2765 } else {
2766 alloc_profile = info->avail_metadata_alloc_bits &
2767 info->metadata_alloc_profile;
2768 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2769 }
2770
2771 return btrfs_reduce_alloc_profile(root, data);
2772} 2855}
2773 2856
2774void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2775{ 2858{
2776 u64 alloc_target; 2859 u64 flags;
2777
2778 alloc_target = btrfs_get_alloc_profile(root, 1);
2779 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2780 alloc_target);
2781}
2782
2783static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2784{
2785 u64 num_bytes;
2786 int level;
2787
2788 level = BTRFS_MAX_LEVEL - 2;
2789 /*
2790 * NOTE: these calculations are absolutely the worst possible case.
2791 * This assumes that _every_ item we insert will require a new leaf, and
2792 * that the tree has grown to its maximum level size.
2793 */
2794
2795 /*
2796 * for every item we insert we could insert both an extent item and a
2797 * extent ref item. Then for ever item we insert, we will need to cow
2798 * both the original leaf, plus the leaf to the left and right of it.
2799 *
2800 * Unless we are talking about the extent root, then we just want the
2801 * number of items * 2, since we just need the extent item plus its ref.
2802 */
2803 if (root == root->fs_info->extent_root)
2804 num_bytes = num_items * 2;
2805 else
2806 num_bytes = (num_items + (2 * num_items)) * 3;
2807
2808 /*
2809 * num_bytes is total number of leaves we could need times the leaf
2810 * size, and then for every leaf we could end up cow'ing 2 nodes per
2811 * level, down to the leaf level.
2812 */
2813 num_bytes = (num_bytes * root->leafsize) +
2814 (num_bytes * (level * 2)) * root->nodesize;
2815
2816 return num_bytes;
2817}
2818
2819/*
2820 * Unreserve metadata space for delalloc. If we have less reserved credits than
2821 * we have extents, this function does nothing.
2822 */
2823int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2824 struct inode *inode, int num_items)
2825{
2826 struct btrfs_fs_info *info = root->fs_info;
2827 struct btrfs_space_info *meta_sinfo;
2828 u64 num_bytes;
2829 u64 alloc_target;
2830 bool bug = false;
2831
2832 /* get the space info for where the metadata will live */
2833 alloc_target = btrfs_get_alloc_profile(root, 0);
2834 meta_sinfo = __find_space_info(info, alloc_target);
2835
2836 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2837 num_items);
2838
2839 spin_lock(&meta_sinfo->lock);
2840 spin_lock(&BTRFS_I(inode)->accounting_lock);
2841 if (BTRFS_I(inode)->reserved_extents <=
2842 BTRFS_I(inode)->outstanding_extents) {
2843 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2844 spin_unlock(&meta_sinfo->lock);
2845 return 0;
2846 }
2847 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2848
2849 BTRFS_I(inode)->reserved_extents--;
2850 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2851
2852 if (meta_sinfo->bytes_delalloc < num_bytes) {
2853 bug = true;
2854 meta_sinfo->bytes_delalloc = 0;
2855 } else {
2856 meta_sinfo->bytes_delalloc -= num_bytes;
2857 }
2858 spin_unlock(&meta_sinfo->lock);
2859
2860 BUG_ON(bug);
2861
2862 return 0;
2863}
2864
2865static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2866{
2867 u64 thresh;
2868
2869 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2870 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2871 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2872 meta_sinfo->bytes_may_use;
2873 2860
2874 thresh = meta_sinfo->total_bytes - thresh; 2861 if (data)
2875 thresh *= 80; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2876 do_div(thresh, 100); 2863 else if (root == root->fs_info->chunk_root)
2877 if (thresh <= meta_sinfo->bytes_delalloc) 2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2878 meta_sinfo->force_delalloc = 1;
2879 else 2865 else
2880 meta_sinfo->force_delalloc = 0; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2881}
2882
2883struct async_flush {
2884 struct btrfs_root *root;
2885 struct btrfs_space_info *info;
2886 struct btrfs_work work;
2887};
2888
2889static noinline void flush_delalloc_async(struct btrfs_work *work)
2890{
2891 struct async_flush *async;
2892 struct btrfs_root *root;
2893 struct btrfs_space_info *info;
2894
2895 async = container_of(work, struct async_flush, work);
2896 root = async->root;
2897 info = async->info;
2898
2899 btrfs_start_delalloc_inodes(root, 0);
2900 wake_up(&info->flush_wait);
2901 btrfs_wait_ordered_extents(root, 0, 0);
2902
2903 spin_lock(&info->lock);
2904 info->flushing = 0;
2905 spin_unlock(&info->lock);
2906 wake_up(&info->flush_wait);
2907
2908 kfree(async);
2909}
2910
2911static void wait_on_flush(struct btrfs_space_info *info)
2912{
2913 DEFINE_WAIT(wait);
2914 u64 used;
2915
2916 while (1) {
2917 prepare_to_wait(&info->flush_wait, &wait,
2918 TASK_UNINTERRUPTIBLE);
2919 spin_lock(&info->lock);
2920 if (!info->flushing) {
2921 spin_unlock(&info->lock);
2922 break;
2923 }
2924
2925 used = info->bytes_used + info->bytes_reserved +
2926 info->bytes_pinned + info->bytes_readonly +
2927 info->bytes_super + info->bytes_root +
2928 info->bytes_may_use + info->bytes_delalloc;
2929 if (used < info->total_bytes) {
2930 spin_unlock(&info->lock);
2931 break;
2932 }
2933 spin_unlock(&info->lock);
2934 schedule();
2935 }
2936 finish_wait(&info->flush_wait, &wait);
2937}
2938
2939static void flush_delalloc(struct btrfs_root *root,
2940 struct btrfs_space_info *info)
2941{
2942 struct async_flush *async;
2943 bool wait = false;
2944
2945 spin_lock(&info->lock);
2946
2947 if (!info->flushing) {
2948 info->flushing = 1;
2949 init_waitqueue_head(&info->flush_wait);
2950 } else {
2951 wait = true;
2952 }
2953
2954 spin_unlock(&info->lock);
2955
2956 if (wait) {
2957 wait_on_flush(info);
2958 return;
2959 }
2960
2961 async = kzalloc(sizeof(*async), GFP_NOFS);
2962 if (!async)
2963 goto flush;
2964
2965 async->root = root;
2966 async->info = info;
2967 async->work.func = flush_delalloc_async;
2968
2969 btrfs_queue_worker(&root->fs_info->enospc_workers,
2970 &async->work);
2971 wait_on_flush(info);
2972 return;
2973
2974flush:
2975 btrfs_start_delalloc_inodes(root, 0);
2976 btrfs_wait_ordered_extents(root, 0, 0);
2977
2978 spin_lock(&info->lock);
2979 info->flushing = 0;
2980 spin_unlock(&info->lock);
2981 wake_up(&info->flush_wait);
2982}
2983
2984static int maybe_allocate_chunk(struct btrfs_root *root,
2985 struct btrfs_space_info *info)
2986{
2987 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2988 struct btrfs_trans_handle *trans;
2989 bool wait = false;
2990 int ret = 0;
2991 u64 min_metadata;
2992 u64 free_space;
2993
2994 free_space = btrfs_super_total_bytes(disk_super);
2995 /*
2996 * we allow the metadata to grow to a max of either 10gb or 5% of the
2997 * space in the volume.
2998 */
2999 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3000 div64_u64(free_space * 5, 100));
3001 if (info->total_bytes >= min_metadata) {
3002 spin_unlock(&info->lock);
3003 return 0;
3004 }
3005
3006 if (info->full) {
3007 spin_unlock(&info->lock);
3008 return 0;
3009 }
3010
3011 if (!info->allocating_chunk) {
3012 info->force_alloc = 1;
3013 info->allocating_chunk = 1;
3014 init_waitqueue_head(&info->allocate_wait);
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113 2867
3114 BTRFS_I(inode)->reserved_extents++; 2868 return get_alloc_profile(root, flags);
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120
3121 return 0;
3122} 2869}
3123 2870
3124/* 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * oprations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worste case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. THe only time we don't do this is if we're reserving space
3171 * inside of a transaction, then we will just return -ENOSPC and it is the
3172 * callers responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
2885 u64 used;
3238 int ret = 0, committed = 0; 2886 int ret = 0, committed = 0;
3239 2887
3240 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
@@ -3247,10 +2895,11 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3247again: 2895again:
3248 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3249 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3250 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3251 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3252 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 2900 data_sinfo->bytes_may_use;
3253 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { 2901
2902 if (used + bytes > data_sinfo->total_bytes) {
3254 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3255 2904
3256 /* 2905 /*
@@ -3264,15 +2913,15 @@ again:
3264 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3265alloc: 2914alloc:
3266 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3267 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3268 if (!trans) 2917 if (IS_ERR(trans))
3269 return -ENOMEM; 2918 return PTR_ERR(trans);
3270 2919
3271 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3272 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3273 alloc_target, 0); 2922 alloc_target, 0);
3274 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3275 if (ret) 2924 if (ret < 0)
3276 return ret; 2925 return ret;
3277 2926
3278 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3287,25 +2936,26 @@ alloc:
3287 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3288 committed = 1; 2937 committed = 1;
3289 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3290 if (!trans) 2939 if (IS_ERR(trans))
3291 return -ENOMEM; 2940 return PTR_ERR(trans);
3292 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3293 if (ret) 2942 if (ret)
3294 return ret; 2943 return ret;
3295 goto again; 2944 goto again;
3296 } 2945 }
3297 2946
3298 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3299 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3300 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3301 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3302 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3303 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3304 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3305 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3306 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3307 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3308 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3309 return -ENOSPC; 2959 return -ENOSPC;
3310 } 2960 }
3311 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3316,12 +2966,13 @@ alloc:
3316} 2966}
3317 2967
3318/* 2968/*
3319 * if there was an error for whatever reason after calling 2969 * called when we are clearing an delalloc extent from the
3320 * btrfs_check_data_free_space, call this so we can cleanup the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3321 */ 2972 */
3322void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3323 struct inode *inode, u64 bytes)
3324{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3325 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3326 2977
3327 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3334,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3334 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3335} 2986}
3336 2987
3337/* called when we are adding a delalloc extent to the inode's io_tree */
3338void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3339 u64 bytes)
3340{
3341 struct btrfs_space_info *data_sinfo;
3342
3343 /* get the space info for where this inode will be storing its data */
3344 data_sinfo = BTRFS_I(inode)->space_info;
3345
3346 /* make sure we have enough space to handle the data first */
3347 spin_lock(&data_sinfo->lock);
3348 data_sinfo->bytes_delalloc += bytes;
3349
3350 /*
3351 * we are adding a delalloc extent without calling
3352 * btrfs_check_data_free_space first. This happens on a weird
3353 * writepage condition, but shouldn't hurt our accounting
3354 */
3355 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3356 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3357 BTRFS_I(inode)->reserved_bytes = 0;
3358 } else {
3359 data_sinfo->bytes_may_use -= bytes;
3360 BTRFS_I(inode)->reserved_bytes -= bytes;
3361 }
3362
3363 spin_unlock(&data_sinfo->lock);
3364}
3365
3366/* called when we are clearing an delalloc extent from the inode's io_tree */
3367void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3368 u64 bytes)
3369{
3370 struct btrfs_space_info *info;
3371
3372 info = BTRFS_I(inode)->space_info;
3373
3374 spin_lock(&info->lock);
3375 info->bytes_delalloc -= bytes;
3376 spin_unlock(&info->lock);
3377}
3378
3379static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3380{ 2989{
3381 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3389,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3389 rcu_read_unlock(); 2998 rcu_read_unlock();
3390} 2999}
3391 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3392static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3393 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3394 u64 flags, int force) 3019 u64 flags, int force)
3395{ 3020{
3396 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3397 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3398 u64 thresh;
3399 int ret = 0; 3023 int ret = 0;
3400 3024
3401 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3418,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3418 goto out; 3042 goto out;
3419 } 3043 }
3420 3044
3421 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3422 thresh = div_factor(thresh, 8);
3423 if (!force &&
3424 (space_info->bytes_used + space_info->bytes_pinned +
3425 space_info->bytes_reserved + alloc_bytes) < thresh) {
3426 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3427 goto out; 3047 goto out;
3428 } 3048 }
@@ -3444,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3444 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3445 if (ret) 3065 if (ret)
3446 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3447 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3448 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3449out: 3071out:
@@ -3451,13 +3073,713 @@ out:
3451 return ret; 3073 return ret;
3452} 3074}
3453 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
3157
/*
 * Decide whether a failed metadata reservation is worth retrying, and
 * do the work (chunk allocation, delalloc flushing, transaction
 * commit) that might make the retry succeed.
 *
 * Returns 1 when the caller should retry, -EAGAIN when the caller must
 * first end its own transaction, -ENOSPC when retrying cannot help,
 * or a negative error from shrink_delalloc().
 */
static int should_retry_reserve(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_block_rsv *block_rsv,
				u64 num_bytes, int *retries)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;
	int ret;

	/* give up after three attempts */
	if ((*retries) > 2)
		return -ENOSPC;

	/* a freshly allocated chunk would provide the space directly */
	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
	if (ret)
		return 1;

	if (trans && trans->transaction->in_commit)
		return -ENOSPC;

	/* flushing delalloc may hand back reserved metadata space */
	ret = shrink_delalloc(trans, root, num_bytes);
	if (ret)
		return ret;

	/*
	 * committing the transaction only helps when enough pinned
	 * bytes would be unpinned by it
	 */
	spin_lock(&space_info->lock);
	if (space_info->bytes_pinned < num_bytes)
		ret = 1;
	spin_unlock(&space_info->lock);
	if (ret)
		return -ENOSPC;

	(*retries)++;

	/* cannot commit from inside the caller's transaction */
	if (trans)
		return -EAGAIN;

	trans = btrfs_join_transaction(root, 1);
	BUG_ON(IS_ERR(trans));
	ret = btrfs_commit_transaction(trans, root);
	BUG_ON(ret);

	return 1;
}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
/*
 * Shrink @block_rsv's target size by @num_bytes ((u64)-1 means the
 * whole rsv) and release any reserved bytes that now exceed the new
 * size.  The surplus is handed to @dest when one is given, otherwise
 * it is returned to the space info's reserved counter.
 */
void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
			     struct btrfs_block_rsv *dest, u64 num_bytes)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;

	spin_lock(&block_rsv->lock);
	if (num_bytes == (u64)-1)
		num_bytes = block_rsv->size;
	block_rsv->size -= num_bytes;
	if (block_rsv->reserved >= block_rsv->size) {
		/* reserved bytes above the shrunken size are surplus */
		num_bytes = block_rsv->reserved - block_rsv->size;
		block_rsv->reserved = block_rsv->size;
		block_rsv->full = 1;
	} else {
		/* still under-reserved: nothing to give back */
		num_bytes = 0;
	}
	spin_unlock(&block_rsv->lock);

	if (num_bytes > 0) {
		if (dest) {
			block_rsv_add_bytes(dest, num_bytes, 0);
		} else {
			spin_lock(&space_info->lock);
			space_info->bytes_reserved -= num_bytes;
			spin_unlock(&space_info->lock);
		}
	}
}
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
/*
 * Initialize @rsv to an empty reservation: zeroed counters, a single
 * user, default priority 6, not linked on any durable list.
 */
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
{
	memset(rsv, 0, sizeof(*rsv));
	spin_lock_init(&rsv->lock);
	atomic_set(&rsv->usage, 1);
	rsv->priority = 6;
	INIT_LIST_HEAD(&rsv->list);
}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will re-add to the the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
/*
 * Check that @block_rsv holds at least max(@min_reserved,
 * size * @min_factor / 10) bytes, topping it up from the space info or
 * by committing the transaction when it does not.
 *
 * Returns 0 when enough space is (or has been made) available,
 * -EAGAIN when a commit is needed but the caller already holds a
 * transaction, -ENOSPC otherwise.
 */
int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_block_rsv *block_rsv,
			  u64 min_reserved, int min_factor)
{
	u64 num_bytes = 0;
	int commit_trans = 0;
	int ret = -ENOSPC;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	/* required level: larger of min_reserved and size * factor / 10 */
	if (min_factor > 0)
		num_bytes = div_factor(block_rsv->size, min_factor);
	if (min_reserved > num_bytes)
		num_bytes = min_reserved;

	if (block_rsv->reserved >= num_bytes) {
		ret = 0;
	} else {
		/* num_bytes becomes the shortfall from here on */
		num_bytes -= block_rsv->reserved;
		/*
		 * a durable rsv gets back the space freed during this and
		 * the previous transaction at commit time; if that covers
		 * the shortfall, committing is worth trying
		 */
		if (block_rsv->durable &&
		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
			commit_trans = 1;
	}
	spin_unlock(&block_rsv->lock);
	if (!ret)
		return 0;

	if (block_rsv->refill_used) {
		/* try taking the shortfall straight from the space info */
		ret = reserve_metadata_bytes(block_rsv, num_bytes);
		if (!ret) {
			block_rsv_add_bytes(block_rsv, num_bytes, 0);
			return 0;
		}
	}

	if (commit_trans) {
		if (trans)
			return -EAGAIN;

		trans = btrfs_join_transaction(root, 1);
		BUG_ON(IS_ERR(trans));
		ret = btrfs_commit_transaction(trans, root);
		return 0;
	}

	WARN_ON(1);
	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
		block_rsv->size, block_rsv->reserved,
		block_rsv->freed[0], block_rsv->freed[1]);

	return -ENOSPC;
}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccuracy, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
/*
 * Drop the global reservation at unmount time and sanity-check that
 * the other fs-wide reservations have already been emptied.
 */
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
	WARN_ON(fs_info->trans_block_rsv.size > 0);
	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
	WARN_ON(fs_info->chunk_block_rsv.size > 0);
	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
}
3592
/*
 * Worst-case metadata bytes needed for @num_items tree items: one
 * full path (a leaf plus a node per remaining level) per item, times
 * three (safety margin -- presumably for splits/COW at each level;
 * confirm against callers).
 */
static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
{
	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
		3 * num_items;
}
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting orphan item, one for updating inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit
3646 * transaction and use space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
/*
 * Reserve metadata space for @num_bytes of delalloc data on @inode,
 * covering both the worst-case new outstanding extent and the csum
 * items.  Returns 0 on success or a negative error when no space can
 * be found even after retrying.
 */
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
	u64 to_reserve;
	int nr_extents;
	int retries = 0;
	int ret;

	/* back off briefly while a transaction commit is running */
	if (btrfs_transaction_in_commit(root->fs_info))
		schedule_timeout(1);

	num_bytes = ALIGN(num_bytes, root->sectorsize);
again:
	spin_lock(&BTRFS_I(inode)->accounting_lock);
	/* +1: this write may add one more outstanding extent */
	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
		/* only the extents not already reserved need new space */
		nr_extents -= BTRFS_I(inode)->reserved_extents;
		to_reserve = calc_trans_metadata_size(root, nr_extents);
	} else {
		nr_extents = 0;
		to_reserve = 0;
	}

	to_reserve += calc_csum_metadata_size(inode, num_bytes);
	ret = reserve_metadata_bytes(block_rsv, to_reserve);
	if (ret) {
		/* drop the lock: the retry path may sleep and flush */
		spin_unlock(&BTRFS_I(inode)->accounting_lock);
		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
					   &retries);
		if (ret > 0)
			goto again;
		return ret;
	}

	BTRFS_I(inode)->reserved_extents += nr_extents;
	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
	spin_unlock(&BTRFS_I(inode)->accounting_lock);

	block_rsv_add_bytes(block_rsv, to_reserve, 1);

	/* keep the delalloc reservation from growing without bound */
	if (block_rsv->size > 512 * 1024 * 1024)
		shrink_delalloc(NULL, root, to_reserve);

	return 0;
}
3725
/*
 * Release the metadata reserved for @num_bytes of delalloc data on
 * @inode: drop one outstanding extent and free the csum reservation
 * plus any extent reservations no longer backed by outstanding
 * extents.
 */
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 to_free;
	int nr_extents;

	num_bytes = ALIGN(num_bytes, root->sectorsize);
	atomic_dec(&BTRFS_I(inode)->outstanding_extents);

	spin_lock(&BTRFS_I(inode)->accounting_lock);
	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
	/* reservations in excess of the outstanding extents can go */
	if (nr_extents < BTRFS_I(inode)->reserved_extents) {
		nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
		BTRFS_I(inode)->reserved_extents -= nr_extents;
	} else {
		nr_extents = 0;
	}
	spin_unlock(&BTRFS_I(inode)->accounting_lock);

	to_free = calc_csum_metadata_size(inode, num_bytes);
	if (nr_extents > 0)
		to_free += calc_trans_metadata_size(root, nr_extents);

	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
				to_free);
}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
3775
3454static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3455 struct btrfs_root *root, 3777 struct btrfs_root *root,
3456 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3457 int mark_free)
3458{ 3779{
3459 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3460 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3461 u64 total = num_bytes; 3783 u64 total = num_bytes;
3462 u64 old_val; 3784 u64 old_val;
3463 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3476,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3476 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3477 if (!cache) 3799 if (!cache)
3478 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3479 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3480 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3481 3809
@@ -3488,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3488 old_val += num_bytes; 3816 old_val += num_bytes;
3489 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3490 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3491 cache->space_info->bytes_used += num_bytes;
3492 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3493 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3494 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3495 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3496 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3497 } else { 3824 } else {
3498 old_val -= num_bytes; 3825 old_val -= num_bytes;
3499 cache->space_info->bytes_used -= num_bytes;
3500 if (cache->ro)
3501 cache->space_info->bytes_readonly += num_bytes;
3502 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3503 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3504 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3505 if (mark_free) {
3506 int ret;
3507
3508 ret = btrfs_discard_extent(root, bytenr,
3509 num_bytes);
3510 WARN_ON(ret);
3511 3833
3512 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3513 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3514 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3515 }
3516 } 3837 }
3517 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3518 total -= num_bytes; 3839 total -= num_bytes;
@@ -3536,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3536 return bytenr; 3857 return bytenr;
3537} 3858}
3538 3859
3539/* 3860static int pin_down_extent(struct btrfs_root *root,
3540 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3541 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3542int btrfs_pin_extent(struct btrfs_root *root,
3543 u64 bytenr, u64 num_bytes, int reserved)
3544{ 3863{
3545 struct btrfs_fs_info *fs_info = root->fs_info;
3546 struct btrfs_block_group_cache *cache;
3547
3548 cache = btrfs_lookup_block_group(fs_info, bytenr);
3549 BUG_ON(!cache);
3550
3551 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3552 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3553 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3559,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3559 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3560 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3561 3874
3562 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3563 3879
3564 set_extent_dirty(fs_info->pinned_extents, 3880/*
3565 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3566 return 0; 3894 return 0;
3567} 3895}
3568 3896
3569static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3570 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3571{ 3903{
3572 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3573 spin_lock(&cache->lock); 3905 if (sinfo) {
3574 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3575 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3576 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3577 } else { 3924 } else {
3578 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3579 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3580 } 3935 }
3581 spin_unlock(&cache->lock); 3936 return ret;
3582 spin_unlock(&cache->space_info->lock);
3583 return 0;
3584} 3937}
3585 3938
3586int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3611,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3611 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3612 3965
3613 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3614 return 0; 3969 return 0;
3615} 3970}
3616 3971
@@ -3637,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3637 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3638 } 3993 }
3639 3994
3995 start += len;
3996
3640 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3641 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3642 cache->pinned -= len; 3999 cache->pinned -= len;
3643 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3644 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3645 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3646
3647 start += len;
3648 } 4010 }
3649 4011
3650 if (cache) 4012 if (cache)
@@ -3657,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3657{ 4019{
3658 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3659 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3660 u64 start; 4024 u64 start;
3661 u64 end; 4025 u64 end;
4026 int idx;
3662 int ret; 4027 int ret;
3663 4028
3664 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3679,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3679 cond_resched(); 4044 cond_resched();
3680 } 4045 }
3681 4046
3682 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3684 4049 &fs_info->durable_block_rsv_list, list) {
3685static int pin_down_bytes(struct btrfs_trans_handle *trans,
3686 struct btrfs_root *root,
3687 struct btrfs_path *path,
3688 u64 bytenr, u64 num_bytes,
3689 int is_data, int reserved,
3690 struct extent_buffer **must_clean)
3691{
3692 int err = 0;
3693 struct extent_buffer *buf;
3694
3695 if (is_data)
3696 goto pinit;
3697
3698 /*
3699 * discard is sloooow, and so triggering discards on
3700 * individual btree blocks isn't a good plan. Just
3701 * pin everything in discard mode.
3702 */
3703 if (btrfs_test_opt(root, DISCARD))
3704 goto pinit;
3705 4050
3706 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 4051 idx = trans->transid & 0x1;
3707 if (!buf) 4052 if (block_rsv->freed[idx] > 0) {
3708 goto pinit; 4053 block_rsv_add_bytes(block_rsv,
4054 block_rsv->freed[idx], 0);
4055 block_rsv->freed[idx] = 0;
4056 }
4057 if (atomic_read(&block_rsv->usage) == 0) {
4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3709 4059
3710 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3711 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3712 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3713 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3714 */ 4064 }
3715 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3716 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3717 u64 header_owner = btrfs_header_owner(buf);
3718 u64 header_transid = btrfs_header_generation(buf);
3719 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3720 header_transid == trans->transid &&
3721 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3722 *must_clean = buf;
3723 return 1;
3724 } 4067 }
3725 btrfs_tree_unlock(buf);
3726 } 4068 }
3727 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3728pinit:
3729 if (path)
3730 btrfs_set_path_blocking(path);
3731 /* unlocks the pinned mutex */
3732 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3733 4070
3734 BUG_ON(err < 0);
3735 return 0; 4071 return 0;
3736} 4072}
3737 4073
@@ -3892,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3892 BUG_ON(ret); 4228 BUG_ON(ret);
3893 } 4229 }
3894 } else { 4230 } else {
3895 int mark_free = 0;
3896 struct extent_buffer *must_clean = NULL;
3897
3898 if (found_extent) { 4231 if (found_extent) {
3899 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3900 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3907,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3907 } 4240 }
3908 } 4241 }
3909 4242
3910 ret = pin_down_bytes(trans, root, path, bytenr,
3911 num_bytes, is_data, 0, &must_clean);
3912 if (ret > 0)
3913 mark_free = 1;
3914 BUG_ON(ret < 0);
3915 /*
3916 * it is going to be very rare for someone to be waiting
3917 * on the block we're freeing. del_items might need to
3918 * schedule, so rather than get fancy, just force it
3919 * to blocking here
3920 */
3921 if (must_clean)
3922 btrfs_set_lock_blocking(must_clean);
3923
3924 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3925 num_to_del); 4244 num_to_del);
3926 BUG_ON(ret); 4245 BUG_ON(ret);
3927 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3928 4247
3929 if (must_clean) {
3930 clean_tree_block(NULL, root, must_clean);
3931 btrfs_tree_unlock(must_clean);
3932 free_extent_buffer(must_clean);
3933 }
3934
3935 if (is_data) { 4248 if (is_data) {
3936 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3937 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3941,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3941 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3942 } 4255 }
3943 4256
3944 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3945 mark_free);
3946 BUG_ON(ret); 4258 BUG_ON(ret);
3947 } 4259 }
3948 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3950,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3950} 4262}
3951 4263
3952/* 4264/*
3953 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free an block, it is possible (and likely) that we free the last
3954 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3955 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3956 * removes it from the tree. 4268 * removes it from the tree.
@@ -3962,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3962 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3963 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3964 struct rb_node *node; 4276 struct rb_node *node;
3965 int ret; 4277 int ret = 0;
3966 4278
3967 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3968 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4014,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4014 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4015 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4016 4328
4017 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4018 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4019 head->must_insert_reserved); 4331 ret = 1;
4020 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4021 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4022 return 0; 4335 return ret;
4023out: 4336out:
4024 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4025 return 0; 4338 return 0;
4026} 4339}
4027 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
4028int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4029 struct btrfs_root *root, 4424 struct btrfs_root *root,
4030 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4046,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4046 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4047 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4048 BUG_ON(ret); 4443 BUG_ON(ret);
4049 ret = check_ref_cleanup(trans, root, bytenr);
4050 BUG_ON(ret);
4051 } else { 4444 } else {
4052 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4053 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4057,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4057 return ret; 4450 return ret;
4058} 4451}
4059 4452
4060int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4061 struct btrfs_root *root,
4062 u64 bytenr, u32 blocksize,
4063 u64 parent, u64 root_objectid, int level)
4064{
4065 u64 used;
4066 spin_lock(&root->node_lock);
4067 used = btrfs_root_used(&root->root_item) - blocksize;
4068 btrfs_set_root_used(&root->root_item, used);
4069 spin_unlock(&root->node_lock);
4070
4071 return btrfs_free_extent(trans, root, bytenr, blocksize,
4072 parent, root_objectid, level, 0);
4073}
4074
4075static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4076{ 4454{
4077 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4124,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4124 return 0; 4502 return 0;
4125} 4503}
4126 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
4520
4127enum btrfs_loop_type { 4521enum btrfs_loop_type {
4128 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4129 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4145,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4145 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4146 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4147 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4148 u64 exclude_start, u64 exclude_nr,
4149 int data) 4542 int data)
4150{ 4543{
4151 int ret = 0; 4544 int ret = 0;
@@ -4158,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4158 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4159 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4160 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4161 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4162 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4163 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4170,6 +4564,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4170 ins->offset = 0; 4564 ins->offset = 0;
4171 4565
4172 space_info = __find_space_info(root->fs_info, data); 4566 space_info = __find_space_info(root->fs_info, data);
4567 if (!space_info) {
4568 printk(KERN_ERR "No space info for %d\n", data);
4569 return -ENOSPC;
4570 }
4173 4571
4174 if (orig_root->ref_cows || empty_size) 4572 if (orig_root->ref_cows || empty_size)
4175 allowed_chunk_alloc = 1; 4573 allowed_chunk_alloc = 1;
@@ -4223,6 +4621,7 @@ ideal_cache:
4223 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4224 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4225 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4226 goto have_block_group; 4625 goto have_block_group;
4227 } 4626 }
4228 } else if (block_group) { 4627 } else if (block_group) {
@@ -4231,7 +4630,8 @@ ideal_cache:
4231 } 4630 }
4232search: 4631search:
4233 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4234 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4235 u64 offset; 4635 u64 offset;
4236 int cached; 4636 int cached;
4237 4637
@@ -4422,23 +4822,22 @@ checks:
4422 goto loop; 4822 goto loop;
4423 } 4823 }
4424 4824
4425 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4426 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4427 search_start < exclude_start + exclude_nr)) {
4428 search_start = exclude_start + exclude_nr;
4429 4827
4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4430 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4431 /*
4432 * if search_start is still in this block group
4433 * then we just re-search this block group
4434 */
4435 if (search_start >= block_group->key.objectid &&
4436 search_start < (block_group->key.objectid +
4437 block_group->key.offset))
4438 goto have_block_group;
4439 goto loop; 4837 goto loop;
4440 } 4838 }
4441 4839
4840 /* we are all good, lets return */
4442 ins->objectid = search_start; 4841 ins->objectid = search_start;
4443 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4444 4843
@@ -4446,18 +4845,18 @@ checks:
4446 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4447 search_start - offset); 4846 search_start - offset);
4448 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4449
4450 update_reserved_extents(block_group, num_bytes, 1);
4451
4452 /* we are all good, lets return */
4453 break; 4848 break;
4454loop: 4849loop:
4455 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4456 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4457 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4458 } 4854 }
4459 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4460 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4461 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4462 * for them to make caching progress. Also 4861 * for them to make caching progress. Also
4463 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4471,6 +4870,7 @@ loop:
4471 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4472 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4473 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4474 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4475 found_uncached_bg = false; 4875 found_uncached_bg = false;
4476 loop++; 4876 loop++;
@@ -4553,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4553 int dump_block_groups) 4953 int dump_block_groups)
4554{ 4954{
4555 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4556 4957
4557 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4558 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4559 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4560 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4561 info->bytes_super), 4962 info->bytes_readonly),
4562 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4563 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4564 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4565 "\n",
4566 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4567 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4568 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4569 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4570 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4571 (unsigned long long)info->bytes_root,
4572 (unsigned long long)info->bytes_super,
4573 (unsigned long long)info->bytes_reserved);
4574 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4575 4973
4576 if (!dump_block_groups) 4974 if (!dump_block_groups)
4577 return; 4975 return;
4578 4976
4579 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4580 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4581 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4582 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4583 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4589,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4589 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4590 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4591 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4592 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4593} 4994}
4594 4995
@@ -4614,9 +5015,8 @@ again:
4614 5015
4615 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4616 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4617 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4618 trans->alloc_exclude_start, 5019 ins, data);
4619 trans->alloc_exclude_nr, data);
4620 5020
4621 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4622 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4654,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4654 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4655 5055
4656 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4657 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4658 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4659 5059
4660 return ret; 5060 return ret;
@@ -4717,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4717 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4718 btrfs_free_path(path); 5118 btrfs_free_path(path);
4719 5119
4720 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4721 1, 0);
4722 if (ret) { 5121 if (ret) {
4723 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4724 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4778,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4778 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4779 btrfs_free_path(path); 5178 btrfs_free_path(path);
4780 5179
4781 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4782 1, 0);
4783 if (ret) { 5181 if (ret) {
4784 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4785 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4855,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4855 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4856 } 5254 }
4857 5255
4858 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4859 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4860 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4861 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4862 return ret; 5261 return ret;
4863} 5262}
4864 5263
4865/*
4866 * finds a free extent and does all the dirty work required for allocation
4867 * returns the key for the extent through ins, and a tree buffer for
4868 * the first block of the extent through buf.
4869 *
4870 * returns 0 if everything worked, non-zero otherwise.
4871 */
4872static int alloc_tree_block(struct btrfs_trans_handle *trans,
4873 struct btrfs_root *root,
4874 u64 num_bytes, u64 parent, u64 root_objectid,
4875 struct btrfs_disk_key *key, int level,
4876 u64 empty_size, u64 hint_byte, u64 search_end,
4877 struct btrfs_key *ins)
4878{
4879 int ret;
4880 u64 flags = 0;
4881
4882 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4883 empty_size, hint_byte, search_end,
4884 ins, 0);
4885 if (ret)
4886 return ret;
4887
4888 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4889 if (parent == 0)
4890 parent = ins->objectid;
4891 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4892 } else
4893 BUG_ON(parent > 0);
4894
4895 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4896 struct btrfs_delayed_extent_op *extent_op;
4897 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4898 BUG_ON(!extent_op);
4899 if (key)
4900 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4901 else
4902 memset(&extent_op->key, 0, sizeof(extent_op->key));
4903 extent_op->flags_to_set = flags;
4904 extent_op->update_key = 1;
4905 extent_op->update_flags = 1;
4906 extent_op->is_data = 0;
4907
4908 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4909 ins->offset, parent, root_objectid,
4910 level, BTRFS_ADD_DELAYED_EXTENT,
4911 extent_op);
4912 BUG_ON(ret);
4913 }
4914
4915 if (root_objectid == root->root_key.objectid) {
4916 u64 used;
4917 spin_lock(&root->node_lock);
4918 used = btrfs_root_used(&root->root_item) + num_bytes;
4919 btrfs_set_root_used(&root->root_item, used);
4920 spin_unlock(&root->node_lock);
4921 }
4922 return ret;
4923}
4924
4925struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4926 struct btrfs_root *root, 5265 struct btrfs_root *root,
4927 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4960,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4960 return buf; 5299 return buf;
4961} 5300}
4962 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
5323 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 block_rsv->size, block_rsv->reserved,
5325 block_rsv->freed[0], block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
4963/* 5336/*
4964 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4965 * returns the tree buffer or NULL. 5341 * returns the tree buffer or NULL.
4966 */ 5342 */
4967struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4971,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4971 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4972{ 5348{
4973 struct btrfs_key ins; 5349 struct btrfs_key ins;
4974 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4975 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
4976 5354
4977 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5355
4978 key, level, empty_size, hint, (u64)-1, &ins); 5356 block_rsv = use_block_rsv(trans, root, blocksize);
5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4979 if (ret) { 5362 if (ret) {
4980 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4981 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4982 } 5365 }
4983 5366
4984 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4985 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
4986 return buf; 5397 return buf;
4987} 5398}
4988 5399
@@ -5205,6 +5616,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5205 next = btrfs_find_tree_block(root, bytenr, blocksize); 5616 next = btrfs_find_tree_block(root, bytenr, blocksize);
5206 if (!next) { 5617 if (!next) {
5207 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5618 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5619 if (!next)
5620 return -ENOMEM;
5208 reada = 1; 5621 reada = 1;
5209 } 5622 }
5210 btrfs_tree_lock(next); 5623 btrfs_tree_lock(next);
@@ -5305,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5305 struct btrfs_path *path, 5718 struct btrfs_path *path,
5306 struct walk_control *wc) 5719 struct walk_control *wc)
5307{ 5720{
5308 int ret = 0; 5721 int ret;
5309 int level = wc->level; 5722 int level = wc->level;
5310 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5311 u64 parent = 0; 5724 u64 parent = 0;
@@ -5383,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5383 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5384 } 5797 }
5385 5798
5386 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5387 root->root_key.objectid, level, 0);
5388 BUG_ON(ret);
5389out: 5800out:
5390 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5391 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5392 return ret; 5803 return 0;
5393} 5804}
5394 5805
5395static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5417,7 +5828,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5417 if (ret > 0) { 5828 if (ret > 0) {
5418 path->slots[level]++; 5829 path->slots[level]++;
5419 continue; 5830 continue;
5420 } 5831 } else if (ret < 0)
5832 return ret;
5421 level = wc->level; 5833 level = wc->level;
5422 } 5834 }
5423 return 0; 5835 return 0;
@@ -5466,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5466 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5467 * blocks are properly updated. 5879 * blocks are properly updated.
5468 */ 5880 */
5469int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5470{ 5883{
5471 struct btrfs_path *path; 5884 struct btrfs_path *path;
5472 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5484,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5484 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5485 BUG_ON(!wc); 5898 BUG_ON(!wc);
5486 5899
5487 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5488 5903
5489 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5490 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5572,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5572 } 5987 }
5573 5988
5574 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5575 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5576 trans->transaction->delayed_refs.flushing) {
5577 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5578 &root->root_key, 5992 &root->root_key,
5579 root_item); 5993 root_item);
5580 BUG_ON(ret); 5994 BUG_ON(ret);
5581 5995
5582 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5583 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5584 } else { 5998 if (block_rsv)
5585 unsigned long update; 5999 trans->block_rsv = block_rsv;
5586 update = trans->delayed_ref_updates;
5587 trans->delayed_ref_updates = 0;
5588 if (update)
5589 btrfs_run_delayed_refs(trans, tree_root,
5590 update);
5591 } 6000 }
5592 } 6001 }
5593 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5615,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5615 kfree(root); 6024 kfree(root);
5616 } 6025 }
5617out: 6026out:
5618 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5619 kfree(wc); 6028 kfree(wc);
5620 btrfs_free_path(path); 6029 btrfs_free_path(path);
5621 return err; 6030 return err;
@@ -7211,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7211 return flags; 7620 return flags;
7212} 7621}
7213 7622
7214static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7215 struct btrfs_block_group_cache *shrink_block_group,
7216 int force)
7217{ 7624{
7218 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7219 u64 new_alloc_flags; 7626 u64 num_bytes;
7220 u64 calc; 7627 int ret = -ENOSPC;
7221 7628
7222 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7223 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7224 shrink_block_group->reserved > 0) {
7225 spin_unlock(&shrink_block_group->lock);
7226 7631
7227 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7228 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7229 7650
7230 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7231 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7232 if (new_alloc_flags != shrink_block_group->flags) {
7233 calc =
7234 btrfs_block_group_used(&shrink_block_group->item);
7235 } else {
7236 calc = shrink_block_group->key.offset;
7237 }
7238 spin_unlock(&shrink_block_group->lock);
7239 7653
7240 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7241 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7242 7658
7243 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7244 } else
7245 spin_unlock(&shrink_block_group->lock);
7246 return 0;
7247}
7248 7660
7661 trans = btrfs_join_transaction(root, 1);
7662 BUG_ON(IS_ERR(trans));
7249 7663
7250int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7664 alloc_flags = update_block_group_flags(root, cache->flags);
7251 struct btrfs_block_group_cache *group) 7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7667
7668 ret = set_block_group_ro(cache);
7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7252 7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7253{ 7683{
7254 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7255 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7256 return 0; 7697 return 0;
7257} 7698}
7258 7699
@@ -7369,7 +7810,6 @@ static int find_first_block_group(struct btrfs_root *root,
7369 } 7810 }
7370 path->slots[0]++; 7811 path->slots[0]++;
7371 } 7812 }
7372 ret = -ENOENT;
7373out: 7813out:
7374 return ret; 7814 return ret;
7375} 7815}
@@ -7420,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7420 */ 7860 */
7421 synchronize_rcu(); 7861 synchronize_rcu();
7422 7862
7863 release_global_block_rsv(info);
7864
7423 while(!list_empty(&info->space_info)) { 7865 while(!list_empty(&info->space_info)) {
7424 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7425 struct btrfs_space_info, 7867 struct btrfs_space_info,
7426 list); 7868 list);
7427 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7428 list_del(&space_info->list); 7874 list_del(&space_info->list);
7429 kfree(space_info); 7875 kfree(space_info);
7430 } 7876 }
7431 return 0; 7877 return 0;
7432} 7878}
7433 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
7434int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7435{ 7891{
7436 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7452,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7452 7908
7453 while (1) { 7909 while (1) {
7454 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7455 if (ret > 0) { 7911 if (ret > 0)
7456 ret = 0; 7912 break;
7457 goto error;
7458 }
7459 if (ret != 0) 7913 if (ret != 0)
7460 goto error; 7914 goto error;
7461 7915
@@ -7464,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7464 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7465 if (!cache) { 7919 if (!cache) {
7466 ret = -ENOMEM; 7920 ret = -ENOMEM;
7467 break; 7921 goto error;
7468 } 7922 }
7469 7923
7470 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7521,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7521 BUG_ON(ret); 7975 BUG_ON(ret);
7522 cache->space_info = space_info; 7976 cache->space_info = space_info;
7523 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7524 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7525 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7526 7980
7527 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7528 list_add_tail(&cache->list, &space_info->block_groups);
7529 up_write(&space_info->groups_sem);
7530 7982
7531 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7532 BUG_ON(ret); 7984 BUG_ON(ret);
7533 7985
7534 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7535 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7536 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7537 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
7998 * avoid allocating from un-mirrored block group if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7538 ret = 0; 8008 ret = 0;
7539error: 8009error:
7540 btrfs_free_path(path); 8010 btrfs_free_path(path);
@@ -7595,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7595 BUG_ON(ret); 8065 BUG_ON(ret);
7596 8066
7597 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7598 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7599 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7600 8070
7601 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7602 list_add_tail(&cache->list, &cache->space_info->block_groups);
7603 up_write(&cache->space_info->groups_sem);
7604 8072
7605 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7606 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c99121ac5d6b..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -136,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
136 return state; 135 return state;
137} 136}
138 137
139static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
140{ 139{
141 if (!state) 140 if (!state)
142 return; 141 return;
@@ -336,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
336} 335}
337 336
338static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
339 struct extent_state *state, 338 struct extent_state *state, int *bits)
340 unsigned long bits)
341{ 339{
342 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
343 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
344 state->start, state->end, 342 state, bits);
345 state->state, bits);
346 } 343 }
347 344
348 return 0; 345 return 0;
349} 346}
350 347
351static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
352 struct extent_state *state, 349 struct extent_state *state, int *bits)
353 unsigned long bits)
354{ 350{
355 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -368,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
368 */ 364 */
369static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
370 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
371 int bits) 367 int *bits)
372{ 368{
373 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
374 int ret; 371 int ret;
375 372
376 if (end < start) { 373 if (end < start) {
@@ -385,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
385 if (ret) 382 if (ret)
386 return ret; 383 return ret;
387 384
388 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
390 state->state |= bits; 387 state->state |= bits_to_set;
391 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
392 if (node) { 389 if (node) {
393 struct extent_state *found; 390 struct extent_state *found;
@@ -457,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
457 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
458 */ 455 */
459static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
460 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
461 int delete) 458 int *bits, int wake)
462{ 459{
463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
464 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
465 462
466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
467 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
468 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
469 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -472,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
472 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
473 if (wake) 470 if (wake)
474 wake_up(&state->wq); 471 wake_up(&state->wq);
475 if (delete || state->state == 0) { 472 if (state->state == 0) {
476 if (state->tree) { 473 if (state->tree) {
477 clear_state_cb(tree, state, state->state);
478 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
479 state->tree = NULL; 475 state->tree = NULL;
480 free_extent_state(state); 476 free_extent_state(state);
@@ -515,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
515 int set = 0; 511 int set = 0;
516 int clear = 0; 512 int clear = 0;
517 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
519 clear = 1; 519 clear = 1;
520again: 520again:
@@ -581,8 +581,7 @@ hit_next:
581 if (err) 581 if (err)
582 goto out; 582 goto out;
583 if (state->end <= end) { 583 if (state->end <= end) {
584 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
585 delete);
586 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
587 goto out; 586 goto out;
588 start = last_end + 1; 587 start = last_end + 1;
@@ -603,7 +602,7 @@ hit_next:
603 if (wake) 602 if (wake)
604 wake_up(&state->wq); 603 wake_up(&state->wq);
605 604
606 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
607 606
608 prealloc = NULL; 607 prealloc = NULL;
609 goto out; 608 goto out;
@@ -614,7 +613,7 @@ hit_next:
614 else 613 else
615 next_node = NULL; 614 next_node = NULL;
616 615
617 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
618 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
619 goto out; 618 goto out;
620 start = last_end + 1; 619 start = last_end + 1;
@@ -707,19 +706,19 @@ out:
707 706
708static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
709 struct extent_state *state, 708 struct extent_state *state,
710 int bits) 709 int *bits)
711{ 710{
712 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
713 713
714 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
715 if (ret) 715 if (ret)
716 return ret; 716 return ret;
717 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
719 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
720 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
721 } 720 }
722 state->state |= bits; 721 state->state |= bits_to_set;
723 722
724 return 0; 723 return 0;
725} 724}
@@ -746,10 +745,9 @@ static void cache_state(struct extent_state *state,
746 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
747 */ 746 */
748 747
749static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
750 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
751 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
752 gfp_t mask)
753{ 751{
754 struct extent_state *state; 752 struct extent_state *state;
755 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -758,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
758 u64 last_start; 756 u64 last_start;
759 u64 last_end; 757 u64 last_end;
760 758
759 bits |= EXTENT_FIRST_DELALLOC;
761again: 760again:
762 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
763 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -779,7 +778,7 @@ again:
779 */ 778 */
780 node = tree_search(tree, start); 779 node = tree_search(tree, start);
781 if (!node) { 780 if (!node) {
782 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
783 prealloc = NULL; 782 prealloc = NULL;
784 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
785 goto out; 784 goto out;
@@ -803,7 +802,7 @@ hit_next:
803 goto out; 802 goto out;
804 } 803 }
805 804
806 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
807 if (err) 806 if (err)
808 goto out; 807 goto out;
809 808
@@ -853,7 +852,7 @@ hit_next:
853 if (err) 852 if (err)
854 goto out; 853 goto out;
855 if (state->end <= end) { 854 if (state->end <= end) {
856 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
857 if (err) 856 if (err)
858 goto out; 857 goto out;
859 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -878,7 +877,7 @@ hit_next:
878 else 877 else
879 this_end = last_start - 1; 878 this_end = last_start - 1;
880 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
881 bits); 880 &bits);
882 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
883 if (err) { 882 if (err) {
884 prealloc = NULL; 883 prealloc = NULL;
@@ -904,7 +903,7 @@ hit_next:
904 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
905 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
906 905
907 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
908 if (err) { 907 if (err) {
909 prealloc = NULL; 908 prealloc = NULL;
910 goto out; 909 goto out;
@@ -967,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
967{ 966{
968 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
969 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
970 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
971 NULL, mask);
972} 970}
973 971
974int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1436,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1436 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1437 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1438 1436
1439 if (op & EXTENT_CLEAR_ACCOUNTING)
1440 clear_bits |= EXTENT_DO_ACCOUNTING;
1441
1442 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1443 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1917,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1917 1912
1918 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1919 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1920 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1921 else 1916 else
1922 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1923 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2021,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2021 sector_t sector; 2016 sector_t sector;
2022 struct extent_map *em; 2017 struct extent_map *em;
2023 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2024 int ret; 2020 int ret;
2025 int nr = 0; 2021 int nr = 0;
2026 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2032,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2032 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2033 2029
2034 end = page_end; 2030 end = page_end;
2035 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2036 2040
2037 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2038 char *userpage; 2042 char *userpage;
@@ -2590,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2590 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2594 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2591 }; 2595 };
2592 struct writeback_control wbc_writepages = { 2596 struct writeback_control wbc_writepages = {
2593 .bdi = wbc->bdi,
2594 .sync_mode = wbc->sync_mode, 2597 .sync_mode = wbc->sync_mode,
2595 .older_than_this = NULL, 2598 .older_than_this = NULL,
2596 .nr_to_write = 64, 2599 .nr_to_write = 64,
@@ -2624,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2624 .sync_io = mode == WB_SYNC_ALL, 2627 .sync_io = mode == WB_SYNC_ALL,
2625 }; 2628 };
2626 struct writeback_control wbc_writepages = { 2629 struct writeback_control wbc_writepages = {
2627 .bdi = inode->i_mapping->backing_dev_info,
2628 .sync_mode = mode, 2630 .sync_mode = mode,
2629 .older_than_this = NULL, 2631 .older_than_this = NULL,
2630 .nr_to_write = nr_pages * 2, 2632 .nr_to_write = nr_pages * 2,
@@ -2679,33 +2681,20 @@ int extent_readpages(struct extent_io_tree *tree,
2679{ 2681{
2680 struct bio *bio = NULL; 2682 struct bio *bio = NULL;
2681 unsigned page_idx; 2683 unsigned page_idx;
2682 struct pagevec pvec;
2683 unsigned long bio_flags = 0; 2684 unsigned long bio_flags = 0;
2684 2685
2685 pagevec_init(&pvec, 0);
2686 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2686 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2687 struct page *page = list_entry(pages->prev, struct page, lru); 2687 struct page *page = list_entry(pages->prev, struct page, lru);
2688 2688
2689 prefetchw(&page->flags); 2689 prefetchw(&page->flags);
2690 list_del(&page->lru); 2690 list_del(&page->lru);
2691 /* 2691 if (!add_to_page_cache_lru(page, mapping,
2692 * what we want to do here is call add_to_page_cache_lru,
2693 * but that isn't exported, so we reproduce it here
2694 */
2695 if (!add_to_page_cache(page, mapping,
2696 page->index, GFP_KERNEL)) { 2692 page->index, GFP_KERNEL)) {
2697
2698 /* open coding of lru_cache_add, also not exported */
2699 page_cache_get(page);
2700 if (!pagevec_add(&pvec, page))
2701 __pagevec_lru_add_file(&pvec);
2702 __extent_read_full_page(tree, page, get_extent, 2693 __extent_read_full_page(tree, page, get_extent,
2703 &bio, 0, &bio_flags); 2694 &bio, 0, &bio_flags);
2704 } 2695 }
2705 page_cache_release(page); 2696 page_cache_release(page);
2706 } 2697 }
2707 if (pagevec_count(&pvec))
2708 __pagevec_lru_add_file(&pvec);
2709 BUG_ON(!list_empty(pages)); 2698 BUG_ON(!list_empty(pages));
2710 if (bio) 2699 if (bio)
2711 submit_one_bio(READ, bio, 0, bio_flags); 2700 submit_one_bio(READ, bio, 0, bio_flags);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 28d87ba60ce8..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
@@ -148,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148} 149}
149 150
150 151
151int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
152 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
153{ 155{
154 u32 sum; 156 u32 sum;
155 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
156 int bio_index = 0; 158 int bio_index = 0;
157 u64 offset; 159 u64 offset = 0;
158 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
159 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
160 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -173,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
173 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
174 176
175 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
176 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
177 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
178 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
179 if (ret == 0) 184 if (ret == 0)
180 goto found; 185 goto found;
@@ -237,6 +242,7 @@ found:
237 else 242 else
238 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
239 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
240 bio_index++; 246 bio_index++;
241 bvec++; 247 bvec++;
242 } 248 }
@@ -244,6 +250,18 @@ found:
244 return 0; 250 return 0;
245} 251}
246 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
247int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
248 struct list_head *list) 266 struct list_head *list)
249{ 267{
@@ -656,6 +674,9 @@ again:
656 goto found; 674 goto found;
657 } 675 }
658 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
659 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
660 u32 item_size; 681 u32 item_size;
661 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee3323c7fc1c..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -45,32 +46,42 @@
45static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
46 int write_bytes, 47 int write_bytes,
47 struct page **prepared_pages, 48 struct page **prepared_pages,
48 const char __user *buf) 49 struct iov_iter *i)
49{ 50{
50 long page_fault = 0; 51 size_t copied;
51 int i; 52 int pg = 0;
52 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
53 54
54 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
55 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
56 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
57 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
58 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
59 62
60 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
61 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
62 page_fault = __copy_from_user(page_address(page) + offset, 65
63 buf, count);
64 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
65 flush_dcache_page(page); 67 flush_dcache_page(page);
66 kunmap(page); 68 iov_iter_advance(i, copied);
67 buf += count; 69 write_bytes -= copied;
68 write_bytes -= count;
69 70
70 if (page_fault) 71 if (unlikely(copied == 0)) {
71 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
72 } 83 }
73 return page_fault ? -EFAULT : 0; 84 return 0;
74} 85}
75 86
76/* 87/*
@@ -125,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
125 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
127 NULL); 138 NULL);
128 if (err) 139 BUG_ON(err);
129 return err;
130 140
131 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
132 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -141,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
141 * at this time. 151 * at this time.
142 */ 152 */
143 } 153 }
144 return err; 154 return 0;
145} 155}
146 156
147/* 157/*
@@ -822,45 +832,46 @@ again:
822 return 0; 832 return 0;
823} 833}
824 834
825static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
826 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
827{ 838{
828 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
829 loff_t start_pos; 846 loff_t start_pos;
830 ssize_t num_written = 0; 847 ssize_t num_written = 0;
831 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
832 int ret = 0; 851 int ret = 0;
833 struct inode *inode = fdentry(file)->d_inode;
834 struct btrfs_root *root = BTRFS_I(inode)->root;
835 struct page **pages = NULL;
836 int nrptrs; 852 int nrptrs;
837 struct page *pinned[2];
838 unsigned long first_index; 853 unsigned long first_index;
839 unsigned long last_index; 854 unsigned long last_index;
840 int will_write; 855 int will_write;
856 int buffered = 0;
841 857
842 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
843 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
844 860
845 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
846 PAGE_CACHE_SIZE / (sizeof(struct page *)));
847 pinned[0] = NULL; 861 pinned[0] = NULL;
848 pinned[1] = NULL; 862 pinned[1] = NULL;
849 863
850 pos = *ppos;
851 start_pos = pos; 864 start_pos = pos;
852 865
853 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
854 867
855 /* do the reserve before the mutex lock in case we have to do some
856 * flushing. We wouldn't deadlock, but this is more polite.
857 */
858 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
859 if (err)
860 goto out_nolock;
861
862 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
863 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
864 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
865 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
866 if (err) 877 if (err)
@@ -874,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
874 goto out; 885 goto out;
875 886
876 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
877 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
878 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
879 929
880 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
881 start_pos = pos; 931 start_pos = pos;
882 932
883 BTRFS_I(inode)->sequence++;
884 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
885 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
886 935
887 /* 936 /*
888 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -899,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
899 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
900 } 949 }
901 } 950 }
902 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
903 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
904 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
905 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -910,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
910 } 959 }
911 } 960 }
912 961
913 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
914 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
915 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
916 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
917 offset); 966 offset);
918 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
919 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -921,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
921 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
922 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
923 972
924 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
925 if (ret) 974 if (ret)
926 goto out; 975 goto out;
927 976
@@ -929,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
929 pos, first_index, last_index, 978 pos, first_index, last_index,
930 write_bytes); 979 write_bytes);
931 if (ret) { 980 if (ret) {
932 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
933 write_bytes);
934 goto out; 982 goto out;
935 } 983 }
936 984
937 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
938 write_bytes, pages, buf); 986 write_bytes, pages, &i);
939 if (ret) { 987 if (ret == 0) {
940 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
941 write_bytes); 989 num_pages, pos, write_bytes);
942 btrfs_drop_pages(pages, num_pages);
943 goto out;
944 } 990 }
945 991
946 ret = dirty_and_release_pages(NULL, root, file, pages,
947 num_pages, pos, write_bytes);
948 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
949 if (ret) { 993 if (ret) {
950 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
951 write_bytes);
952 goto out; 995 goto out;
953 } 996 }
954 997
@@ -964,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
964 btrfs_throttle(root); 1007 btrfs_throttle(root);
965 } 1008 }
966 1009
967 buf += write_bytes;
968 count -= write_bytes;
969 pos += write_bytes; 1010 pos += write_bytes;
970 num_written += write_bytes; 1011 num_written += write_bytes;
971 1012
@@ -975,9 +1016,7 @@ out:
975 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
976 if (ret) 1017 if (ret)
977 err = ret; 1018 err = ret;
978 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
979 1019
980out_nolock:
981 kfree(pages); 1020 kfree(pages);
982 if (pinned[0]) 1021 if (pinned[0])
983 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1007,7 +1046,7 @@ out_nolock:
1007 num_written = err; 1046 num_written = err;
1008 1047
1009 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1010 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1011 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1012 file->f_dentry); 1051 file->f_dentry);
1013 if (ret == 0) { 1052 if (ret == 0) {
@@ -1022,7 +1061,7 @@ out_nolock:
1022 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1023 } 1062 }
1024 } 1063 }
1025 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1026 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1027 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1028 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1062,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1062 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1063 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1064 */ 1103 */
1065int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1066{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1067 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1068 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1069 int ret = 0; 1109 int ret = 0;
@@ -1100,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1100 /* 1140 /*
1101 * ok we haven't committed the transaction yet, lets do a commit 1141 * ok we haven't committed the transaction yet, lets do a commit
1102 */ 1142 */
1103 if (file && file->private_data) 1143 if (file->private_data)
1104 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1105 1145
1106 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1107 if (!trans) { 1147 if (IS_ERR(trans)) {
1108 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1109 goto out; 1149 goto out;
1110 } 1150 }
1111 1151
@@ -1150,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
1150 1190
1151static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1191static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1152{ 1192{
1153 vma->vm_ops = &btrfs_file_vm_ops; 1193 struct address_space *mapping = filp->f_mapping;
1194
1195 if (!mapping->a_ops->readpage)
1196 return -ENOEXEC;
1197
1154 file_accessed(filp); 1198 file_accessed(filp);
1199 vma->vm_ops = &btrfs_file_vm_ops;
1200 vma->vm_flags |= VM_CAN_NONLINEAR;
1201
1155 return 0; 1202 return 0;
1156} 1203}
1157 1204
1158const struct file_operations btrfs_file_operations = { 1205const struct file_operations btrfs_file_operations = {
1159 .llseek = generic_file_llseek, 1206 .llseek = generic_file_llseek,
1160 .read = do_sync_read, 1207 .read = do_sync_read,
1208 .write = do_sync_write,
1161 .aio_read = generic_file_aio_read, 1209 .aio_read = generic_file_aio_read,
1162 .splice_read = generic_file_splice_read, 1210 .splice_read = generic_file_splice_read,
1163 .write = btrfs_file_write, 1211 .aio_write = btrfs_file_aio_write,
1164 .mmap = btrfs_file_mmap, 1212 .mmap = btrfs_file_mmap,
1165 .open = generic_file_open, 1213 .open = generic_file_open,
1166 .release = btrfs_release_file, 1214 .release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02bb099845fd..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -251,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
251 inline_len, compressed_size, 252 inline_len, compressed_size,
252 compressed_pages); 253 compressed_pages);
253 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
254 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
255 return 0; 257 return 0;
256} 258}
@@ -413,6 +415,7 @@ again:
413 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
414 BUG_ON(!trans); 416 BUG_ON(!trans);
415 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
416 419
417 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
418 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -438,7 +441,6 @@ again:
438 start, end, NULL, 441 start, end, NULL,
439 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
440 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
441 EXTENT_CLEAR_ACCOUNTING |
442 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
443 445
444 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -696,6 +698,38 @@ retry:
696 return 0; 698 return 0;
697} 699}
698 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
699/* 733/*
700 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
701 * the call backs end up in this code. The basic idea is to 735 * the call backs end up in this code. The basic idea is to
@@ -733,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
733 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
734 BUG_ON(!trans); 768 BUG_ON(!trans);
735 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
736 771
737 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
738 773
@@ -752,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
752 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
753 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
754 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
755 EXTENT_CLEAR_ACCOUNTING |
756 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
757 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
758 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -768,35 +802,13 @@ static noinline int cow_file_range(struct inode *inode,
768 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
769 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
770 804
771 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
772 read_lock(&BTRFS_I(inode)->extent_tree.lock);
773 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
774 start, num_bytes);
775 if (em) {
776 /*
777 * if block start isn't an actual block number then find the
778 * first block in this inode and use that as a hint. If that
779 * block is also bogus then just don't worry about it.
780 */
781 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
782 free_extent_map(em);
783 em = search_extent_mapping(em_tree, 0, 0);
784 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
785 alloc_hint = em->block_start;
786 if (em)
787 free_extent_map(em);
788 } else {
789 alloc_hint = em->block_start;
790 free_extent_map(em);
791 }
792 }
793 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
794 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
795 807
796 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
797 unsigned long op; 809 unsigned long op;
798 810
799 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 811 cur_alloc_size = disk_num_bytes;
800 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 812 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
801 root->sectorsize, 0, alloc_hint, 813 root->sectorsize, 0, alloc_hint,
802 (u64)-1, &ins, 1); 814 (u64)-1, &ins, 1);
@@ -1173,6 +1185,13 @@ out_check:
1173 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1174 BUG_ON(ret); 1186 BUG_ON(ret);
1175 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1176 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1177 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1178 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1225,36 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1225} 1244}
1226 1245
1227static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1228 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1229{ 1248{
1230 struct btrfs_root *root = BTRFS_I(inode)->root; 1249 /* not delalloc, ignore it */
1231 u64 size;
1232
1233 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1234 return 0; 1251 return 0;
1235 1252
1236 size = orig->end - orig->start + 1; 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1237 if (size > root->fs_info->max_extent) {
1238 u64 num_extents;
1239 u64 new_size;
1240
1241 new_size = orig->end - split + 1;
1242 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1243 root->fs_info->max_extent);
1244
1245 /*
1246 * if we break a large extent up then leave oustanding_extents
1247 * be, since we've already accounted for the large extent.
1248 */
1249 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1250 root->fs_info->max_extent) < num_extents)
1251 return 0;
1252 }
1253
1254 spin_lock(&BTRFS_I(inode)->accounting_lock);
1255 BTRFS_I(inode)->outstanding_extents++;
1256 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1257
1258 return 0; 1254 return 0;
1259} 1255}
1260 1256
@@ -1268,42 +1264,11 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1268 struct extent_state *new, 1264 struct extent_state *new,
1269 struct extent_state *other) 1265 struct extent_state *other)
1270{ 1266{
1271 struct btrfs_root *root = BTRFS_I(inode)->root;
1272 u64 new_size, old_size;
1273 u64 num_extents;
1274
1275 /* not delalloc, ignore it */ 1267 /* not delalloc, ignore it */
1276 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1277 return 0; 1269 return 0;
1278 1270
1279 old_size = other->end - other->start + 1; 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1280 if (new->start < other->start)
1281 new_size = other->end - new->start + 1;
1282 else
1283 new_size = new->end - other->start + 1;
1284
1285 /* we're not bigger than the max, unreserve the space and go */
1286 if (new_size <= root->fs_info->max_extent) {
1287 spin_lock(&BTRFS_I(inode)->accounting_lock);
1288 BTRFS_I(inode)->outstanding_extents--;
1289 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1290 return 0;
1291 }
1292
1293 /*
1294 * If we grew by another max_extent, just return, we want to keep that
1295 * reserved amount.
1296 */
1297 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1298 root->fs_info->max_extent);
1299 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1300 root->fs_info->max_extent) > num_extents)
1301 return 0;
1302
1303 spin_lock(&BTRFS_I(inode)->accounting_lock);
1304 BTRFS_I(inode)->outstanding_extents--;
1305 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1306
1307 return 0; 1272 return 0;
1308} 1273}
1309 1274
@@ -1312,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1312 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1313 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1314 */ 1279 */
1315static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1316 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1317{ 1282{
1318 1283
1319 /* 1284 /*
@@ -1321,16 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1321 * but in this case, we are only testeing for the DELALLOC 1286 * but in this case, we are only testeing for the DELALLOC
1322 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1323 */ 1288 */
1324 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1292
1293 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1295 else
1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1326 1297
1327 spin_lock(&BTRFS_I(inode)->accounting_lock);
1328 BTRFS_I(inode)->outstanding_extents++;
1329 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1330 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1331 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1332 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1333 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1334 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1335 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1336 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1344,44 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1344 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1345 */ 1312 */
1346static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1347 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1348{ 1315{
1349 /* 1316 /*
1350 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1351 * but in this case, we are only testeing for the DELALLOC 1318 * but in this case, we are only testeing for the DELALLOC
1352 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1353 */ 1320 */
1354 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1355 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1356 1324
1357 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1358 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1359 BTRFS_I(inode)->outstanding_extents--; 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1360 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1361 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1329
1362 } 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1363 1335
1364 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1365 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1366 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1367 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1368 "%llu %llu\n",
1369 (unsigned long long)
1370 state->end - state->start + 1,
1371 (unsigned long long)
1372 root->fs_info->delalloc_bytes);
1373 btrfs_delalloc_free_space(root, inode, (u64)-1);
1374 root->fs_info->delalloc_bytes = 0;
1375 BTRFS_I(inode)->delalloc_bytes = 0;
1376 } else {
1377 btrfs_delalloc_free_space(root, inode,
1378 state->end -
1379 state->start + 1);
1380 root->fs_info->delalloc_bytes -= state->end -
1381 state->start + 1;
1382 BTRFS_I(inode)->delalloc_bytes -= state->end -
1383 state->start + 1;
1384 }
1385 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1386 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1387 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1430,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1430 */ 1385 */
1431static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1432 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1433 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1434{ 1390{
1435 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1436 int ret = 0; 1392 int ret = 0;
@@ -1449,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1449 * are inserted into the btree 1405 * are inserted into the btree
1450 */ 1406 */
1451static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1452 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1453{ 1410{
1454 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1455 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1460,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1460 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1461 */ 1418 */
1462static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1463 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1464{ 1422{
1465 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1466 int ret = 0; 1424 int ret = 0;
@@ -1485,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1485 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1486 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1487 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1488 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1489 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1490 } 1449 }
1491 1450
@@ -1566,6 +1525,7 @@ again:
1566 goto again; 1525 goto again;
1567 } 1526 }
1568 1527
1528 BUG();
1569 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1570 ClearPageChecked(page); 1530 ClearPageChecked(page);
1571out: 1531out:
@@ -1696,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1696static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1697{ 1657{
1698 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1700 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1701 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1702 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1714,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1714 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1715 if (!ret) { 1675 if (!ret) {
1716 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1717 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1718 BUG_ON(ret); 1680 BUG_ON(ret);
1719 btrfs_end_transaction(trans, root);
1720 } 1681 }
1721 goto out; 1682 goto out;
1722 } 1683 }
@@ -1726,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1727 1688
1728 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1729 1692
1730 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1731 compressed = 1; 1694 compressed = 1;
@@ -1757,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1757 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1758 &ordered_extent->list); 1721 &ordered_extent->list);
1759 1722
1760 /* this also removes the ordered extent from the tree */
1761 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1762 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1763 BUG_ON(ret); 1725 BUG_ON(ret);
1764 btrfs_end_transaction(trans, root);
1765out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1766 /* once for us */ 1730 /* once for us */
1767 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1768 /* once for the tree */ 1732 /* once for the tree */
@@ -1884,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1884 1848
1885 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1886 failrec->last_mirror, 1850 failrec->last_mirror,
1887 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1888 return 0; 1852 return 0;
1889} 1853}
1890 1854
@@ -2039,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2039} 2003}
2040 2004
2041/* 2005/*
2006 * calculate extra metadata reservation when snapshotting a subvolume
2007 * contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
2094 * This is called in transaction commmit time. If there are no orphan
2095 * files in the subvolume, it removes orphan item and frees block_rsv
2096 * structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
2042 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
2043 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
2126 * NOTE: caller of this function should reserve 5 units of metadata for
2127 * this function.
2044 */ 2128 */
2045int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2046{ 2130{
2047 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2048 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2049 2136
2050 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2051 2141
2052 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2053 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2054 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2055 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2056 } 2148 }
2057 2149
2058 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2166 }
2059 2167
2060 spin_unlock(&root->list_lock); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2061 2173
2062 /* 2174 if (block_rsv)
2063 * insert an orphan item to track this unlinked/truncated file 2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2064 */
2065 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2066 2176
2067 return ret; 2177 /* grab metadata reservation from transaction handle */
2178 if (reserve) {
2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2180 BUG_ON(ret);
2181 }
2182
2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
2189 /* insert an orphan item to track subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2068} 2196}
2069 2197
2070/* 2198/*
@@ -2074,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2074int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2075{ 2203{
2076 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2077 int ret = 0; 2207 int ret = 0;
2078 2208
2079 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2080 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2081 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2082 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2083 return 0;
2084 } 2213 }
2085 2214
2086 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2087 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2088 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2089 return 0;
2090 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2091 2220
2092 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2093 2225
2094 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2095 2228
2096 return ret; 2229 return 0;
2097} 2230}
2098 2231
2099/* 2232/*
@@ -2110,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2110 struct inode *inode; 2243 struct inode *inode;
2111 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2112 2245
2113 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2114 return; 2247 return;
2115 2248
2116 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2163,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2163 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2164 found_key.offset = 0; 2297 found_key.offset = 0;
2165 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2166 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2167 break;
2168 2300
2169 /* 2301 /*
2170 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2171 * the proper thing when we hit it 2303 * the proper thing when we hit it
2172 */ 2304 */
2173 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2174 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2175 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2176 2308
2177 /* 2309 /*
2178 * if this is a bad inode, means we actually succeeded in 2310 * if this is a bad inode, means we actually succeeded in
@@ -2181,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2181 * do a destroy_inode 2313 * do a destroy_inode
2182 */ 2314 */
2183 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2184 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2185 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2186 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2187 iput(inode); 2319 iput(inode);
@@ -2199,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2199 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2200 iput(inode); 2332 iput(inode);
2201 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2202 2346
2203 if (nr_unlink) 2347 if (nr_unlink)
2204 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2205 if (nr_truncate) 2349 if (nr_truncate)
2206 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2207
2208 btrfs_free_path(path);
2209} 2351}
2210 2352
2211/* 2353/*
@@ -2524,44 +2666,217 @@ out:
2524 return ret; 2666 return ret;
2525} 2667}
2526 2668
2527static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2528{ 2701{
2529 struct btrfs_root *root;
2530 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2531 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2532 int ret; 2711 int ret;
2533 unsigned long nr = 0;
2534 2712
2535 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2536 2716
2537 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2538 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2539 * 1 for orphan 2719
2540 */ 2720 /* check if there is someone else holds reference */
2541 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2542 if (ret) 2722 return ERR_PTR(-ENOSPC);
2543 return ret;
2544 2723
2545 trans = btrfs_start_transaction(root, 1); 2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2546 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2547 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2548 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2549 } 2741 }
2550 2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2864
2551 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2552 2866
2553 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2867 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2554 2868
2555 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2556 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2557 2872
2558 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2559 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2560 2877
2561 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2562 2879 __unlink_end_trans(trans, root);
2563 btrfs_end_transaction_throttle(trans, root);
2564 btrfs_unreserve_metadata_space(root, 6);
2565 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2566 return ret; 2881 return ret;
2567} 2882}
@@ -2633,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2633{ 2948{
2634 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2635 int err = 0; 2950 int err = 0;
2636 int ret;
2637 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2638 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2639 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2642,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2642 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2643 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2644 2958
2645 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2646 if (ret) 2960 if (IS_ERR(trans))
2647 return ret;
2648
2649 trans = btrfs_start_transaction(root, 1);
2650 if (IS_ERR(trans)) {
2651 btrfs_unreserve_metadata_space(root, 5);
2652 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2653 }
2654 2962
2655 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2656 2964
@@ -2673,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2673 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2674out: 2982out:
2675 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2676 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2677 btrfs_unreserve_metadata_space(root, 5);
2678 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2679 2986
2680 if (ret && !err)
2681 err = ret;
2682 return err; 2987 return err;
2683} 2988}
2684 2989
@@ -3075,6 +3380,7 @@ out:
3075 if (pending_del_nr) { 3380 if (pending_del_nr) {
3076 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3077 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3078 } 3384 }
3079 btrfs_free_path(path); 3385 btrfs_free_path(path);
3080 return err; 3386 return err;
@@ -3102,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3102 3408
3103 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3104 goto out; 3410 goto out;
3105 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3106 if (ret)
3107 goto out;
3108
3109 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3110 if (ret) 3412 if (ret)
3111 goto out; 3413 goto out;
3112 3414
@@ -3114,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3114again: 3416again:
3115 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3116 if (!page) { 3418 if (!page) {
3117 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3118 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3119 goto out; 3420 goto out;
3120 } 3421 }
3121 3422
@@ -3178,8 +3479,7 @@ again:
3178 3479
3179out_unlock: 3480out_unlock:
3180 if (ret) 3481 if (ret)
3181 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3182 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3183 unlock_page(page); 3483 unlock_page(page);
3184 page_cache_release(page); 3484 page_cache_release(page);
3185out: 3485out:
@@ -3191,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3191 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3192 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3193 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3194 struct extent_map *em; 3494 struct extent_map *em = NULL;
3195 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3196 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3197 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3229,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3229 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3230 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3231 3531
3232 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3233 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3234 break; 3535 break;
3235 3536 }
3236 trans = btrfs_start_transaction(root, 1);
3237 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3238 3538
3239 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3251,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3251 last_byte - 1, 0); 3551 last_byte - 1, 0);
3252 3552
3253 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 2);
3255 } 3554 }
3256 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3257 cur_offset = last_byte; 3557 cur_offset = last_byte;
3258 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3259 break; 3559 break;
3260 } 3560 }
3261 3561
3562 free_extent_map(em);
3262 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3263 GFP_NOFS); 3564 GFP_NOFS);
3264 return err; 3565 return err;
@@ -3285,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3285 } 3586 }
3286 } 3587 }
3287 3588
3288 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3289 if (ret) 3590 if (IS_ERR(trans))
3290 return ret; 3591 return PTR_ERR(trans);
3291 3592
3292 trans = btrfs_start_transaction(root, 1);
3293 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3294 3594
3295 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3297,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3297 3597
3298 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3299 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3300 btrfs_unreserve_metadata_space(root, 1);
3301 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3302 3601
3303 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3310,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3310 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3311 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3312 3611
3313 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3314 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3315 3617
3316 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3317 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3391,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3391 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3392 3694
3393 while (1) { 3695 while (1) {
3394 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3395 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3396 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3397 3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3398 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3399 break; 3712 break;
3400 3713
@@ -3402,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3402 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3403 trans = NULL; 3716 trans = NULL;
3404 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3405 } 3719 }
3406 3720
3407 if (ret == 0) { 3721 if (ret == 0) {
@@ -3642,40 +3956,10 @@ again:
3642 return 0; 3956 return 0;
3643} 3957}
3644 3958
3645static noinline void init_btrfs_i(struct inode *inode)
3646{
3647 struct btrfs_inode *bi = BTRFS_I(inode);
3648
3649 bi->generation = 0;
3650 bi->sequence = 0;
3651 bi->last_trans = 0;
3652 bi->last_sub_trans = 0;
3653 bi->logged_trans = 0;
3654 bi->delalloc_bytes = 0;
3655 bi->reserved_bytes = 0;
3656 bi->disk_i_size = 0;
3657 bi->flags = 0;
3658 bi->index_cnt = (u64)-1;
3659 bi->last_unlink_trans = 0;
3660 bi->ordered_data_close = 0;
3661 bi->force_compress = 0;
3662 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3663 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3664 inode->i_mapping, GFP_NOFS);
3665 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3666 inode->i_mapping, GFP_NOFS);
3667 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3668 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3669 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3670 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3671 mutex_init(&BTRFS_I(inode)->log_mutex);
3672}
3673
3674static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959static int btrfs_init_locked_inode(struct inode *inode, void *p)
3675{ 3960{
3676 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3677 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3678 init_btrfs_i(inode);
3679 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3680 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3681 return 0; 3965 return 0;
@@ -3738,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3738 if (!inode) 4022 if (!inode)
3739 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3740 4024
3741 init_btrfs_i(inode);
3742
3743 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3744 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3745 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3996,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3996 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3997 int ret = 0; 4279 int ret = 0;
3998 4280
3999 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
4000 return 0; 4282 return 0;
4001 4283
4002 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4017,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
4017{ 4299{
4018 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
4019 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
4020 4306
4021 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
4022 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
4023 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, lets try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: fail to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: fail to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
4024 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
4025} 4335}
4026 4336
@@ -4138,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4138 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4139 * number 4449 * number
4140 */ 4450 */
4141 init_btrfs_i(inode);
4142 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4143 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4144 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4167,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4167 if (ret != 0) 4476 if (ret != 0)
4168 goto fail; 4477 goto fail;
4169 4478
4170 inode->i_uid = current_fsuid(); 4479 inode_init_owner(inode, dir, mode);
4171
4172 if (dir && (dir->i_mode & S_ISGID)) {
4173 inode->i_gid = dir->i_gid;
4174 if (S_ISDIR(mode))
4175 mode |= S_ISGID;
4176 } else
4177 inode->i_gid = current_fsgid();
4178
4179 inode->i_mode = mode;
4180 inode->i_ino = objectid; 4480 inode->i_ino = objectid;
4181 inode_set_bytes(inode, 0); 4481 inode_set_bytes(inode, 0);
4182 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4482 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4302,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4302 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4303 return -EINVAL; 4603 return -EINVAL;
4304 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4305 /* 4609 /*
4306 * 2 for inode item and ref 4610 * 2 for inode item and ref
4307 * 2 for dir items 4611 * 2 for dir items
4308 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4309 */ 4613 */
4310 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4311 if (err) 4615 if (IS_ERR(trans))
4312 return err; 4616 return PTR_ERR(trans);
4313 4617
4314 trans = btrfs_start_transaction(root, 1);
4315 if (!trans)
4316 goto fail;
4317 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4318 4619
4319 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4320 if (err) {
4321 err = -ENOSPC;
4322 goto out_unlock;
4323 }
4324
4325 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4326 dentry->d_name.len, 4621 dentry->d_name.len,
4327 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4350,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4350out_unlock: 4645out_unlock:
4351 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4352 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4353fail: 4648 btrfs_btree_balance_dirty(root, nr);
4354 btrfs_unreserve_metadata_space(root, 5);
4355 if (drop_inode) { 4649 if (drop_inode) {
4356 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4357 iput(inode); 4651 iput(inode);
4358 } 4652 }
4359 btrfs_btree_balance_dirty(root, nr);
4360 return err; 4653 return err;
4361} 4654}
4362 4655
@@ -4366,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4366 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4367 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4368 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4369 int err;
4370 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4371 unsigned long nr = 0; 4664 unsigned long nr = 0;
4372 u64 objectid; 4665 u64 objectid;
4373 u64 index = 0; 4666 u64 index = 0;
4374 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4375 /* 4671 /*
4376 * 2 for inode item and ref 4672 * 2 for inode item and ref
4377 * 2 for dir items 4673 * 2 for dir items
4378 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4379 */ 4675 */
4380 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4381 if (err) 4677 if (IS_ERR(trans))
4382 return err; 4678 return PTR_ERR(trans);
4383 4679
4384 trans = btrfs_start_transaction(root, 1);
4385 if (!trans)
4386 goto fail;
4387 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4388 4681
4389 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4390 if (err) {
4391 err = -ENOSPC;
4392 goto out_unlock;
4393 }
4394
4395 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4396 dentry->d_name.len, 4683 dentry->d_name.len,
4397 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4423,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4423out_unlock: 4710out_unlock:
4424 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4425 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4426fail:
4427 btrfs_unreserve_metadata_space(root, 5);
4428 if (drop_inode) { 4713 if (drop_inode) {
4429 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4430 iput(inode); 4715 iput(inode);
@@ -4451,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4451 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4452 return -EPERM; 4737 return -EPERM;
4453 4738
4454 /*
4455 * 1 item for inode ref
4456 * 2 items for dir items
4457 */
4458 err = btrfs_reserve_metadata_space(root, 3);
4459 if (err)
4460 return err;
4461
4462 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4463 4740
4464 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4465 if (err) 4742 if (err)
4466 goto fail; 4743 goto fail;
4467 4744
4468 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4469 4754
4470 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4471 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4484,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4484 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4485 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4486fail: 4771fail:
4487 btrfs_unreserve_metadata_space(root, 3);
4488 if (drop_inode) { 4772 if (drop_inode) {
4489 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4490 iput(inode); 4774 iput(inode);
@@ -4504,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4504 u64 index = 0; 4788 u64 index = 0;
4505 unsigned long nr = 1; 4789 unsigned long nr = 1;
4506 4790
4791 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4792 if (err)
4793 return err;
4794
4507 /* 4795 /*
4508 * 2 items for inode and ref 4796 * 2 items for inode and ref
4509 * 2 items for dir items 4797 * 2 items for dir items
4510 * 1 for xattr if selinux is on 4798 * 1 for xattr if selinux is on
4511 */ 4799 */
4512 err = btrfs_reserve_metadata_space(root, 5); 4800 trans = btrfs_start_transaction(root, 5);
4513 if (err) 4801 if (IS_ERR(trans))
4514 return err; 4802 return PTR_ERR(trans);
4515
4516 trans = btrfs_start_transaction(root, 1);
4517 if (!trans) {
4518 err = -ENOMEM;
4519 goto out_unlock;
4520 }
4521 btrfs_set_trans_block_group(trans, dir); 4803 btrfs_set_trans_block_group(trans, dir);
4522 4804
4523 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4524 if (err) {
4525 err = -ENOSPC;
4526 goto out_fail;
4527 }
4528
4529 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4530 dentry->d_name.len, 4806 dentry->d_name.len,
4531 dentry->d_parent->d_inode->i_ino, objectid, 4807 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4565,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4565out_fail: 4841out_fail:
4566 nr = trans->blocks_used; 4842 nr = trans->blocks_used;
4567 btrfs_end_transaction_throttle(trans, root); 4843 btrfs_end_transaction_throttle(trans, root);
4568
4569out_unlock:
4570 btrfs_unreserve_metadata_space(root, 5);
4571 if (drop_on_err) 4844 if (drop_on_err)
4572 iput(inode); 4845 iput(inode);
4573 btrfs_btree_balance_dirty(root, nr); 4846 btrfs_btree_balance_dirty(root, nr);
@@ -4825,6 +5098,7 @@ again:
4825 } 5098 }
4826 flush_dcache_page(page); 5099 flush_dcache_page(page);
4827 } else if (create && PageUptodate(page)) { 5100 } else if (create && PageUptodate(page)) {
5101 WARN_ON(1);
4828 if (!trans) { 5102 if (!trans) {
4829 kunmap(page); 5103 kunmap(page);
4830 free_extent_map(em); 5104 free_extent_map(em);
@@ -4921,11 +5195,651 @@ out:
4921 return em; 5195 return em;
4922} 5196}
4923 5197
5198static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5199 u64 start, u64 len)
5200{
5201 struct btrfs_root *root = BTRFS_I(inode)->root;
5202 struct btrfs_trans_handle *trans;
5203 struct extent_map *em;
5204 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5205 struct btrfs_key ins;
5206 u64 alloc_hint;
5207 int ret;
5208
5209 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5210
5211 trans = btrfs_join_transaction(root, 0);
5212 if (!trans)
5213 return ERR_PTR(-ENOMEM);
5214
5215 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5216
5217 alloc_hint = get_extent_allocation_hint(inode, start, len);
5218 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5219 alloc_hint, (u64)-1, &ins, 1);
5220 if (ret) {
5221 em = ERR_PTR(ret);
5222 goto out;
5223 }
5224
5225 em = alloc_extent_map(GFP_NOFS);
5226 if (!em) {
5227 em = ERR_PTR(-ENOMEM);
5228 goto out;
5229 }
5230
5231 em->start = start;
5232 em->orig_start = em->start;
5233 em->len = ins.offset;
5234
5235 em->block_start = ins.objectid;
5236 em->block_len = ins.offset;
5237 em->bdev = root->fs_info->fs_devices->latest_bdev;
5238 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5239
5240 while (1) {
5241 write_lock(&em_tree->lock);
5242 ret = add_extent_mapping(em_tree, em);
5243 write_unlock(&em_tree->lock);
5244 if (ret != -EEXIST)
5245 break;
5246 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5247 }
5248
5249 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5250 ins.offset, ins.offset, 0);
5251 if (ret) {
5252 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5253 em = ERR_PTR(ret);
5254 }
5255out:
5256 btrfs_end_transaction(trans, root);
5257 return em;
5258}
5259
5260/*
5261 * returns 1 when the nocow is safe, < 1 on error, 0 if the
5262 * block must be cow'd
5263 */
5264static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5265 struct inode *inode, u64 offset, u64 len)
5266{
5267 struct btrfs_path *path;
5268 int ret;
5269 struct extent_buffer *leaf;
5270 struct btrfs_root *root = BTRFS_I(inode)->root;
5271 struct btrfs_file_extent_item *fi;
5272 struct btrfs_key key;
5273 u64 disk_bytenr;
5274 u64 backref_offset;
5275 u64 extent_end;
5276 u64 num_bytes;
5277 int slot;
5278 int found_type;
5279
5280 path = btrfs_alloc_path();
5281 if (!path)
5282 return -ENOMEM;
5283
5284 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5285 offset, 0);
5286 if (ret < 0)
5287 goto out;
5288
5289 slot = path->slots[0];
5290 if (ret == 1) {
5291 if (slot == 0) {
5292 /* can't find the item, must cow */
5293 ret = 0;
5294 goto out;
5295 }
5296 slot--;
5297 }
5298 ret = 0;
5299 leaf = path->nodes[0];
5300 btrfs_item_key_to_cpu(leaf, &key, slot);
5301 if (key.objectid != inode->i_ino ||
5302 key.type != BTRFS_EXTENT_DATA_KEY) {
5303 /* not our file or wrong item type, must cow */
5304 goto out;
5305 }
5306
5307 if (key.offset > offset) {
5308 /* Wrong offset, must cow */
5309 goto out;
5310 }
5311
5312 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5313 found_type = btrfs_file_extent_type(leaf, fi);
5314 if (found_type != BTRFS_FILE_EXTENT_REG &&
5315 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5316 /* not a regular extent, must cow */
5317 goto out;
5318 }
5319 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5320 backref_offset = btrfs_file_extent_offset(leaf, fi);
5321
5322 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5323 if (extent_end < offset + len) {
5324 /* extent doesn't include our full range, must cow */
5325 goto out;
5326 }
5327
5328 if (btrfs_extent_readonly(root, disk_bytenr))
5329 goto out;
5330
5331 /*
5332 * look for other files referencing this extent, if we
5333 * find any we must cow
5334 */
5335 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5336 key.offset - backref_offset, disk_bytenr))
5337 goto out;
5338
5339 /*
5340 * adjust disk_bytenr and num_bytes to cover just the bytes
5341 * in this extent we are about to write. If there
5342 * are any csums in that range we have to cow in order
5343 * to keep the csums correct
5344 */
5345 disk_bytenr += backref_offset;
5346 disk_bytenr += offset - key.offset;
5347 num_bytes = min(offset + len, extent_end) - offset;
5348 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5349 goto out;
5350 /*
5351 * all of the above have passed, it is safe to overwrite this extent
5352 * without cow
5353 */
5354 ret = 1;
5355out:
5356 btrfs_free_path(path);
5357 return ret;
5358}
5359
5360static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5361 struct buffer_head *bh_result, int create)
5362{
5363 struct extent_map *em;
5364 struct btrfs_root *root = BTRFS_I(inode)->root;
5365 u64 start = iblock << inode->i_blkbits;
5366 u64 len = bh_result->b_size;
5367 struct btrfs_trans_handle *trans;
5368
5369 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5370 if (IS_ERR(em))
5371 return PTR_ERR(em);
5372
5373 /*
5374 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
5375 * io. INLINE is special, and we could probably kludge it in here, but
5376 * it's still buffered so for safety lets just fall back to the generic
5377 * buffered path.
5378 *
5379 * For COMPRESSED we _have_ to read the entire extent in so we can
5380 * decompress it, so there will be buffering required no matter what we
5381 * do, so go ahead and fallback to buffered.
5382 *
5383 * We return -ENOTBLK because thats what makes DIO go ahead and go back
5384 * to buffered IO. Don't blame me, this is the price we pay for using
5385 * the generic code.
5386 */
5387 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5388 em->block_start == EXTENT_MAP_INLINE) {
5389 free_extent_map(em);
5390 return -ENOTBLK;
5391 }
5392
5393 /* Just a good old fashioned hole, return */
5394 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5395 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5396 free_extent_map(em);
5397 /* DIO will do one hole at a time, so just unlock a sector */
5398 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5399 start + root->sectorsize - 1, GFP_NOFS);
5400 return 0;
5401 }
5402
5403 /*
5404 * We don't allocate a new extent in the following cases
5405 *
5406 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5407 * existing extent.
5408 * 2) The extent is marked as PREALLOC. We're good to go here and can
5409 * just use the extent.
5410 *
5411 */
5412 if (!create) {
5413 len = em->len - (start - em->start);
5414 goto map;
5415 }
5416
5417 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5418 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5419 em->block_start != EXTENT_MAP_HOLE)) {
5420 int type;
5421 int ret;
5422 u64 block_start;
5423
5424 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5425 type = BTRFS_ORDERED_PREALLOC;
5426 else
5427 type = BTRFS_ORDERED_NOCOW;
5428 len = min(len, em->len - (start - em->start));
5429 block_start = em->block_start + (start - em->start);
5430
5431 /*
5432 * we're not going to log anything, but we do need
5433 * to make sure the current transaction stays open
5434 * while we look for nocow cross refs
5435 */
5436 trans = btrfs_join_transaction(root, 0);
5437 if (!trans)
5438 goto must_cow;
5439
5440 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5441 ret = btrfs_add_ordered_extent_dio(inode, start,
5442 block_start, len, len, type);
5443 btrfs_end_transaction(trans, root);
5444 if (ret) {
5445 free_extent_map(em);
5446 return ret;
5447 }
5448 goto unlock;
5449 }
5450 btrfs_end_transaction(trans, root);
5451 }
5452must_cow:
5453 /*
5454 * this will cow the extent, reset the len in case we changed
5455 * it above
5456 */
5457 len = bh_result->b_size;
5458 free_extent_map(em);
5459 em = btrfs_new_extent_direct(inode, start, len);
5460 if (IS_ERR(em))
5461 return PTR_ERR(em);
5462 len = min(len, em->len - (start - em->start));
5463unlock:
5464 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5465 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5466 0, NULL, GFP_NOFS);
5467map:
5468 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5469 inode->i_blkbits;
5470 bh_result->b_size = len;
5471 bh_result->b_bdev = em->bdev;
5472 set_buffer_mapped(bh_result);
5473 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5474 set_buffer_new(bh_result);
5475
5476 free_extent_map(em);
5477
5478 return 0;
5479}
5480
5481struct btrfs_dio_private {
5482 struct inode *inode;
5483 u64 logical_offset;
5484 u64 disk_bytenr;
5485 u64 bytes;
5486 u32 *csums;
5487 void *private;
5488};
5489
5490static void btrfs_endio_direct_read(struct bio *bio, int err)
5491{
5492 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5493 struct bio_vec *bvec = bio->bi_io_vec;
5494 struct btrfs_dio_private *dip = bio->bi_private;
5495 struct inode *inode = dip->inode;
5496 struct btrfs_root *root = BTRFS_I(inode)->root;
5497 u64 start;
5498 u32 *private = dip->csums;
5499
5500 start = dip->logical_offset;
5501 do {
5502 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5503 struct page *page = bvec->bv_page;
5504 char *kaddr;
5505 u32 csum = ~(u32)0;
5506 unsigned long flags;
5507
5508 local_irq_save(flags);
5509 kaddr = kmap_atomic(page, KM_IRQ0);
5510 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5511 csum, bvec->bv_len);
5512 btrfs_csum_final(csum, (char *)&csum);
5513 kunmap_atomic(kaddr, KM_IRQ0);
5514 local_irq_restore(flags);
5515
5516 flush_dcache_page(bvec->bv_page);
5517 if (csum != *private) {
5518 printk(KERN_ERR "btrfs csum failed ino %lu off"
5519 " %llu csum %u private %u\n",
5520 inode->i_ino, (unsigned long long)start,
5521 csum, *private);
5522 err = -EIO;
5523 }
5524 }
5525
5526 start += bvec->bv_len;
5527 private++;
5528 bvec++;
5529 } while (bvec <= bvec_end);
5530
5531 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5532 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5533 bio->bi_private = dip->private;
5534
5535 kfree(dip->csums);
5536 kfree(dip);
5537 dio_end_io(bio, err);
5538}
5539
5540static void btrfs_endio_direct_write(struct bio *bio, int err)
5541{
5542 struct btrfs_dio_private *dip = bio->bi_private;
5543 struct inode *inode = dip->inode;
5544 struct btrfs_root *root = BTRFS_I(inode)->root;
5545 struct btrfs_trans_handle *trans;
5546 struct btrfs_ordered_extent *ordered = NULL;
5547 struct extent_state *cached_state = NULL;
5548 int ret;
5549
5550 if (err)
5551 goto out_done;
5552
5553 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5554 dip->logical_offset, dip->bytes);
5555 if (!ret)
5556 goto out_done;
5557
5558 BUG_ON(!ordered);
5559
5560 trans = btrfs_join_transaction(root, 1);
5561 if (!trans) {
5562 err = -ENOMEM;
5563 goto out;
5564 }
5565 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5566
5567 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5568 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5569 if (!ret)
5570 ret = btrfs_update_inode(trans, root, inode);
5571 err = ret;
5572 goto out;
5573 }
5574
5575 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5576 ordered->file_offset + ordered->len - 1, 0,
5577 &cached_state, GFP_NOFS);
5578
5579 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5580 ret = btrfs_mark_extent_written(trans, inode,
5581 ordered->file_offset,
5582 ordered->file_offset +
5583 ordered->len);
5584 if (ret) {
5585 err = ret;
5586 goto out_unlock;
5587 }
5588 } else {
5589 ret = insert_reserved_file_extent(trans, inode,
5590 ordered->file_offset,
5591 ordered->start,
5592 ordered->disk_len,
5593 ordered->len,
5594 ordered->len,
5595 0, 0, 0,
5596 BTRFS_FILE_EXTENT_REG);
5597 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5598 ordered->file_offset, ordered->len);
5599 if (ret) {
5600 err = ret;
5601 WARN_ON(1);
5602 goto out_unlock;
5603 }
5604 }
5605
5606 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5607 btrfs_ordered_update_i_size(inode, 0, ordered);
5608 btrfs_update_inode(trans, root, inode);
5609out_unlock:
5610 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5611 ordered->file_offset + ordered->len - 1,
5612 &cached_state, GFP_NOFS);
5613out:
5614 btrfs_delalloc_release_metadata(inode, ordered->len);
5615 btrfs_end_transaction(trans, root);
5616 btrfs_put_ordered_extent(ordered);
5617 btrfs_put_ordered_extent(ordered);
5618out_done:
5619 bio->bi_private = dip->private;
5620
5621 kfree(dip->csums);
5622 kfree(dip);
5623 dio_end_io(bio, err);
5624}
5625
5626static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5627 struct bio *bio, int mirror_num,
5628 unsigned long bio_flags, u64 offset)
5629{
5630 int ret;
5631 struct btrfs_root *root = BTRFS_I(inode)->root;
5632 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5633 BUG_ON(ret);
5634 return 0;
5635}
5636
5637static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5638 loff_t file_offset)
5639{
5640 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum;
5645 int write = rw & (1 << BIO_RW);
5646 int ret = 0;
5647
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5649
5650 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5651 if (!dip) {
5652 ret = -ENOMEM;
5653 goto free_ordered;
5654 }
5655 dip->csums = NULL;
5656
5657 if (!skip_sum) {
5658 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5659 if (!dip->csums) {
5660 ret = -ENOMEM;
5661 goto free_ordered;
5662 }
5663 }
5664
5665 dip->private = bio->bi_private;
5666 dip->inode = inode;
5667 dip->logical_offset = file_offset;
5668
5669 start = dip->logical_offset;
5670 dip->bytes = 0;
5671 do {
5672 dip->bytes += bvec->bv_len;
5673 bvec++;
5674 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5675
5676 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5677 bio->bi_private = dip;
5678
5679 if (write)
5680 bio->bi_end_io = btrfs_endio_direct_write;
5681 else
5682 bio->bi_end_io = btrfs_endio_direct_read;
5683
5684 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5685 if (ret)
5686 goto out_err;
5687
5688 if (write && !skip_sum) {
5689 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5690 inode, rw, bio, 0, 0,
5691 dip->logical_offset,
5692 __btrfs_submit_bio_start_direct_io,
5693 __btrfs_submit_bio_done);
5694 if (ret)
5695 goto out_err;
5696 return;
5697 } else if (!skip_sum)
5698 btrfs_lookup_bio_sums_dio(root, inode, bio,
5699 dip->logical_offset, dip->csums);
5700
5701 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5702 if (ret)
5703 goto out_err;
5704 return;
5705out_err:
5706 kfree(dip->csums);
5707 kfree(dip);
5708free_ordered:
5709 /*
5710 * If this is a write, we need to clean up the reserved space and kill
5711 * the ordered extent.
5712 */
5713 if (write) {
5714 struct btrfs_ordered_extent *ordered;
5715 ordered = btrfs_lookup_ordered_extent(inode,
5716 dip->logical_offset);
5717 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5718 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5719 btrfs_free_reserved_extent(root, ordered->start,
5720 ordered->disk_len);
5721 btrfs_put_ordered_extent(ordered);
5722 btrfs_put_ordered_extent(ordered);
5723 }
5724 bio_endio(bio, ret);
5725}
5726
5727static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5728 const struct iovec *iov, loff_t offset,
5729 unsigned long nr_segs)
5730{
5731 int seg;
5732 size_t size;
5733 unsigned long addr;
5734 unsigned blocksize_mask = root->sectorsize - 1;
5735 ssize_t retval = -EINVAL;
5736 loff_t end = offset;
5737
5738 if (offset & blocksize_mask)
5739 goto out;
5740
5741 /* Check the memory alignment. Blocks cannot straddle pages */
5742 for (seg = 0; seg < nr_segs; seg++) {
5743 addr = (unsigned long)iov[seg].iov_base;
5744 size = iov[seg].iov_len;
5745 end += size;
5746 if ((addr & blocksize_mask) || (size & blocksize_mask))
5747 goto out;
5748 }
5749 retval = 0;
5750out:
5751 return retval;
5752}
4924static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5753static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4925 const struct iovec *iov, loff_t offset, 5754 const struct iovec *iov, loff_t offset,
4926 unsigned long nr_segs) 5755 unsigned long nr_segs)
4927{ 5756{
4928 return -EINVAL; 5757 struct file *file = iocb->ki_filp;
5758 struct inode *inode = file->f_mapping->host;
5759 struct btrfs_ordered_extent *ordered;
5760 struct extent_state *cached_state = NULL;
5761 u64 lockstart, lockend;
5762 ssize_t ret;
5763 int writing = rw & WRITE;
5764 int write_bits = 0;
5765 size_t count = iov_length(iov, nr_segs);
5766
5767 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5768 offset, nr_segs)) {
5769 return 0;
5770 }
5771
5772 lockstart = offset;
5773 lockend = offset + count - 1;
5774
5775 if (writing) {
5776 ret = btrfs_delalloc_reserve_space(inode, count);
5777 if (ret)
5778 goto out;
5779 }
5780
5781 while (1) {
5782 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5783 0, &cached_state, GFP_NOFS);
5784 /*
5785 * We're concerned with the entire range that we're going to be
5786 * doing DIO to, so we need to make sure theres no ordered
5787 * extents in this range.
5788 */
5789 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5790 lockend - lockstart + 1);
5791 if (!ordered)
5792 break;
5793 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5794 &cached_state, GFP_NOFS);
5795 btrfs_start_ordered_extent(inode, ordered, 1);
5796 btrfs_put_ordered_extent(ordered);
5797 cond_resched();
5798 }
5799
5800 /*
5801 * we don't use btrfs_set_extent_delalloc because we don't want
5802 * the dirty or uptodate bits
5803 */
5804 if (writing) {
5805 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5806 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5807 EXTENT_DELALLOC, 0, NULL, &cached_state,
5808 GFP_NOFS);
5809 if (ret) {
5810 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5811 lockend, EXTENT_LOCKED | write_bits,
5812 1, 0, &cached_state, GFP_NOFS);
5813 goto out;
5814 }
5815 }
5816
5817 free_extent_state(cached_state);
5818 cached_state = NULL;
5819
5820 ret = __blockdev_direct_IO(rw, iocb, inode,
5821 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5822 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5823 btrfs_submit_direct, 0);
5824
5825 if (ret < 0 && ret != -EIOCBQUEUED) {
5826 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5827 offset + iov_length(iov, nr_segs) - 1,
5828 EXTENT_LOCKED | write_bits, 1, 0,
5829 &cached_state, GFP_NOFS);
5830 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5831 /*
5832 * We're falling back to buffered, unlock the section we didn't
5833 * do IO on.
5834 */
5835 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5836 offset + iov_length(iov, nr_segs) - 1,
5837 EXTENT_LOCKED | write_bits, 1, 0,
5838 &cached_state, GFP_NOFS);
5839 }
5840out:
5841 free_extent_state(cached_state);
5842 return ret;
4929} 5843}
4930 5844
4931static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5845static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5089,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5089 u64 page_start; 6003 u64 page_start;
5090 u64 page_end; 6004 u64 page_end;
5091 6005
5092 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6006 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5093 if (ret) { 6007 if (ret) {
5094 if (ret == -ENOMEM) 6008 if (ret == -ENOMEM)
5095 ret = VM_FAULT_OOM; 6009 ret = VM_FAULT_OOM;
@@ -5098,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5098 goto out; 6012 goto out;
5099 } 6013 }
5100 6014
5101 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5102 if (ret) {
5103 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5104 ret = VM_FAULT_SIGBUS;
5105 goto out;
5106 }
5107
5108 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6015 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5109again: 6016again:
5110 lock_page(page); 6017 lock_page(page);
@@ -5114,7 +6021,6 @@ again:
5114 6021
5115 if ((page->mapping != inode->i_mapping) || 6022 if ((page->mapping != inode->i_mapping) ||
5116 (page_start >= size)) { 6023 (page_start >= size)) {
5117 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5118 /* page got truncated out from underneath us */ 6024 /* page got truncated out from underneath us */
5119 goto out_unlock; 6025 goto out_unlock;
5120 } 6026 }
@@ -5155,7 +6061,6 @@ again:
5155 unlock_extent_cached(io_tree, page_start, page_end, 6061 unlock_extent_cached(io_tree, page_start, page_end,
5156 &cached_state, GFP_NOFS); 6062 &cached_state, GFP_NOFS);
5157 ret = VM_FAULT_SIGBUS; 6063 ret = VM_FAULT_SIGBUS;
5158 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5159 goto out_unlock; 6064 goto out_unlock;
5160 } 6065 }
5161 ret = 0; 6066 ret = 0;
@@ -5182,10 +6087,10 @@ again:
5182 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6087 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5183 6088
5184out_unlock: 6089out_unlock:
5185 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5186 if (!ret) 6090 if (!ret)
5187 return VM_FAULT_LOCKED; 6091 return VM_FAULT_LOCKED;
5188 unlock_page(page); 6092 unlock_page(page);
6093 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5189out: 6094out:
5190 return ret; 6095 return ret;
5191} 6096}
@@ -5210,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
5210 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6115 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5211 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6116 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5212 6117
5213 trans = btrfs_start_transaction(root, 1); 6118 trans = btrfs_start_transaction(root, 0);
6119 BUG_ON(IS_ERR(trans));
5214 btrfs_set_trans_block_group(trans, inode); 6120 btrfs_set_trans_block_group(trans, inode);
6121 trans->block_rsv = root->orphan_block_rsv;
5215 6122
5216 /* 6123 /*
5217 * setattr is responsible for setting the ordered_data_close flag, 6124 * setattr is responsible for setting the ordered_data_close flag,
@@ -5234,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
5234 btrfs_add_ordered_operation(trans, root, inode); 6141 btrfs_add_ordered_operation(trans, root, inode);
5235 6142
5236 while (1) { 6143 while (1) {
6144 if (!trans) {
6145 trans = btrfs_start_transaction(root, 0);
6146 BUG_ON(IS_ERR(trans));
6147 btrfs_set_trans_block_group(trans, inode);
6148 trans->block_rsv = root->orphan_block_rsv;
6149 }
6150
6151 ret = btrfs_block_rsv_check(trans, root,
6152 root->orphan_block_rsv, 0, 5);
6153 if (ret) {
6154 BUG_ON(ret != -EAGAIN);
6155 ret = btrfs_commit_transaction(trans, root);
6156 BUG_ON(ret);
6157 trans = NULL;
6158 continue;
6159 }
6160
5237 ret = btrfs_truncate_inode_items(trans, root, inode, 6161 ret = btrfs_truncate_inode_items(trans, root, inode,
5238 inode->i_size, 6162 inode->i_size,
5239 BTRFS_EXTENT_DATA_KEY); 6163 BTRFS_EXTENT_DATA_KEY);
@@ -5245,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
5245 6169
5246 nr = trans->blocks_used; 6170 nr = trans->blocks_used;
5247 btrfs_end_transaction(trans, root); 6171 btrfs_end_transaction(trans, root);
6172 trans = NULL;
5248 btrfs_btree_balance_dirty(root, nr); 6173 btrfs_btree_balance_dirty(root, nr);
5249
5250 trans = btrfs_start_transaction(root, 1);
5251 btrfs_set_trans_block_group(trans, inode);
5252 } 6174 }
5253 6175
5254 if (ret == 0 && inode->i_nlink > 0) { 6176 if (ret == 0 && inode->i_nlink > 0) {
@@ -5309,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5309struct inode *btrfs_alloc_inode(struct super_block *sb) 6231struct inode *btrfs_alloc_inode(struct super_block *sb)
5310{ 6232{
5311 struct btrfs_inode *ei; 6233 struct btrfs_inode *ei;
6234 struct inode *inode;
5312 6235
5313 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6236 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5314 if (!ei) 6237 if (!ei)
5315 return NULL; 6238 return NULL;
6239
6240 ei->root = NULL;
6241 ei->space_info = NULL;
6242 ei->generation = 0;
6243 ei->sequence = 0;
5316 ei->last_trans = 0; 6244 ei->last_trans = 0;
5317 ei->last_sub_trans = 0; 6245 ei->last_sub_trans = 0;
5318 ei->logged_trans = 0; 6246 ei->logged_trans = 0;
5319 ei->outstanding_extents = 0; 6247 ei->delalloc_bytes = 0;
5320 ei->reserved_extents = 0; 6248 ei->reserved_bytes = 0;
5321 ei->root = NULL; 6249 ei->disk_i_size = 0;
6250 ei->flags = 0;
6251 ei->index_cnt = (u64)-1;
6252 ei->last_unlink_trans = 0;
6253
5322 spin_lock_init(&ei->accounting_lock); 6254 spin_lock_init(&ei->accounting_lock);
6255 atomic_set(&ei->outstanding_extents, 0);
6256 ei->reserved_extents = 0;
6257
6258 ei->ordered_data_close = 0;
6259 ei->orphan_meta_reserved = 0;
6260 ei->dummy_inode = 0;
6261 ei->force_compress = 0;
6262
6263 inode = &ei->vfs_inode;
6264 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6265 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6266 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6267 mutex_init(&ei->log_mutex);
5323 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6268 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5324 INIT_LIST_HEAD(&ei->i_orphan); 6269 INIT_LIST_HEAD(&ei->i_orphan);
6270 INIT_LIST_HEAD(&ei->delalloc_inodes);
5325 INIT_LIST_HEAD(&ei->ordered_operations); 6271 INIT_LIST_HEAD(&ei->ordered_operations);
5326 return &ei->vfs_inode; 6272 RB_CLEAR_NODE(&ei->rb_node);
6273
6274 return inode;
5327} 6275}
5328 6276
5329void btrfs_destroy_inode(struct inode *inode) 6277void btrfs_destroy_inode(struct inode *inode)
@@ -5333,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
5333 6281
5334 WARN_ON(!list_empty(&inode->i_dentry)); 6282 WARN_ON(!list_empty(&inode->i_dentry));
5335 WARN_ON(inode->i_data.nrpages); 6283 WARN_ON(inode->i_data.nrpages);
6284 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6285 WARN_ON(BTRFS_I(inode)->reserved_extents);
5336 6286
5337 /* 6287 /*
5338 * This can happen where we create an inode, but somebody else also 6288 * This can happen where we create an inode, but somebody else also
@@ -5353,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
5353 spin_unlock(&root->fs_info->ordered_extent_lock); 6303 spin_unlock(&root->fs_info->ordered_extent_lock);
5354 } 6304 }
5355 6305
5356 spin_lock(&root->list_lock); 6306 spin_lock(&root->orphan_lock);
5357 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5358 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5359 inode->i_ino); 6309 inode->i_ino);
5360 list_del_init(&BTRFS_I(inode)->i_orphan); 6310 list_del_init(&BTRFS_I(inode)->i_orphan);
5361 } 6311 }
5362 spin_unlock(&root->list_lock); 6312 spin_unlock(&root->orphan_lock);
5363 6313
5364 while (1) { 6314 while (1) {
5365 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6315 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5384,7 +6334,6 @@ free:
5384void btrfs_drop_inode(struct inode *inode) 6334void btrfs_drop_inode(struct inode *inode)
5385{ 6335{
5386 struct btrfs_root *root = BTRFS_I(inode)->root; 6336 struct btrfs_root *root = BTRFS_I(inode)->root;
5387
5388 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 6337 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5389 generic_delete_inode(inode); 6338 generic_delete_inode(inode);
5390 else 6339 else
@@ -5481,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5481 if (S_ISDIR(old_inode->i_mode) && new_inode && 6430 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5482 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6431 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5483 return -ENOTEMPTY; 6432 return -ENOTEMPTY;
5484
5485 /*
5486 * We want to reserve the absolute worst case amount of items. So if
5487 * both inodes are subvols and we need to unlink them then that would
5488 * require 4 item modifications, but if they are both normal inodes it
5489 * would require 5 item modifications, so we'll assume their normal
5490 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5491 * should cover the worst case number of items we'll modify.
5492 */
5493 ret = btrfs_reserve_metadata_space(root, 11);
5494 if (ret)
5495 return ret;
5496
5497 /* 6433 /*
5498 * we're using rename to replace one file with another. 6434 * we're using rename to replace one file with another.
5499 * and the replacement file is large. Start IO on it now so 6435 * and the replacement file is large. Start IO on it now so
@@ -5506,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5506 /* close the racy window with snapshot create/destroy ioctl */ 6442 /* close the racy window with snapshot create/destroy ioctl */
5507 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6443 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5508 down_read(&root->fs_info->subvol_sem); 6444 down_read(&root->fs_info->subvol_sem);
6445 /*
6446 * We want to reserve the absolute worst case amount of items. So if
6447 * both inodes are subvols and we need to unlink them then that would
6448 * require 4 item modifications, but if they are both normal inodes it
6449 * would require 5 item modifications, so we'll assume their normal
6450 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6451 * should cover the worst case number of items we'll modify.
6452 */
6453 trans = btrfs_start_transaction(root, 20);
6454 if (IS_ERR(trans))
6455 return PTR_ERR(trans);
5509 6456
5510 trans = btrfs_start_transaction(root, 1);
5511 btrfs_set_trans_block_group(trans, new_dir); 6457 btrfs_set_trans_block_group(trans, new_dir);
5512 6458
5513 if (dest != root) 6459 if (dest != root)
@@ -5606,7 +6552,6 @@ out_fail:
5606 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6552 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5607 up_read(&root->fs_info->subvol_sem); 6553 up_read(&root->fs_info->subvol_sem);
5608 6554
5609 btrfs_unreserve_metadata_space(root, 11);
5610 return ret; 6555 return ret;
5611} 6556}
5612 6557
@@ -5658,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5658 return 0; 6603 return 0;
5659} 6604}
5660 6605
6606int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6607{
6608 struct btrfs_inode *binode;
6609 struct inode *inode = NULL;
6610
6611 spin_lock(&root->fs_info->delalloc_lock);
6612 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6613 binode = list_entry(root->fs_info->delalloc_inodes.next,
6614 struct btrfs_inode, delalloc_inodes);
6615 inode = igrab(&binode->vfs_inode);
6616 if (inode) {
6617 list_move_tail(&binode->delalloc_inodes,
6618 &root->fs_info->delalloc_inodes);
6619 break;
6620 }
6621
6622 list_del_init(&binode->delalloc_inodes);
6623 cond_resched_lock(&root->fs_info->delalloc_lock);
6624 }
6625 spin_unlock(&root->fs_info->delalloc_lock);
6626
6627 if (inode) {
6628 write_inode_now(inode, 0);
6629 if (delay_iput)
6630 btrfs_add_delayed_iput(inode);
6631 else
6632 iput(inode);
6633 return 1;
6634 }
6635 return 0;
6636}
6637
5661static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6638static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5662 const char *symname) 6639 const char *symname)
5663{ 6640{
@@ -5681,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5681 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6658 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5682 return -ENAMETOOLONG; 6659 return -ENAMETOOLONG;
5683 6660
6661 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6662 if (err)
6663 return err;
5684 /* 6664 /*
5685 * 2 items for inode item and ref 6665 * 2 items for inode item and ref
5686 * 2 items for dir items 6666 * 2 items for dir items
5687 * 1 item for xattr if selinux is on 6667 * 1 item for xattr if selinux is on
5688 */ 6668 */
5689 err = btrfs_reserve_metadata_space(root, 5); 6669 trans = btrfs_start_transaction(root, 5);
5690 if (err) 6670 if (IS_ERR(trans))
5691 return err; 6671 return PTR_ERR(trans);
5692 6672
5693 trans = btrfs_start_transaction(root, 1);
5694 if (!trans)
5695 goto out_fail;
5696 btrfs_set_trans_block_group(trans, dir); 6673 btrfs_set_trans_block_group(trans, dir);
5697 6674
5698 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5699 if (err) {
5700 err = -ENOSPC;
5701 goto out_unlock;
5702 }
5703
5704 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6675 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5705 dentry->d_name.len, 6676 dentry->d_name.len,
5706 dentry->d_parent->d_inode->i_ino, objectid, 6677 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5772,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5772out_unlock: 6743out_unlock:
5773 nr = trans->blocks_used; 6744 nr = trans->blocks_used;
5774 btrfs_end_transaction_throttle(trans, root); 6745 btrfs_end_transaction_throttle(trans, root);
5775out_fail:
5776 btrfs_unreserve_metadata_space(root, 5);
5777 if (drop_inode) { 6746 if (drop_inode) {
5778 inode_dec_link_count(inode); 6747 inode_dec_link_count(inode);
5779 iput(inode); 6748 iput(inode);
@@ -5782,36 +6751,28 @@ out_fail:
5782 return err; 6751 return err;
5783} 6752}
5784 6753
5785static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6754int btrfs_prealloc_file_range(struct inode *inode, int mode,
5786 u64 alloc_hint, int mode, loff_t actual_len) 6755 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint)
5787{ 6757{
5788 struct btrfs_trans_handle *trans; 6758 struct btrfs_trans_handle *trans;
5789 struct btrfs_root *root = BTRFS_I(inode)->root; 6759 struct btrfs_root *root = BTRFS_I(inode)->root;
5790 struct btrfs_key ins; 6760 struct btrfs_key ins;
5791 u64 alloc_size;
5792 u64 cur_offset = start; 6761 u64 cur_offset = start;
5793 u64 num_bytes = end - start;
5794 int ret = 0; 6762 int ret = 0;
5795 u64 i_size;
5796 6763
5797 while (num_bytes > 0) { 6764 while (num_bytes > 0) {
5798 alloc_size = min(num_bytes, root->fs_info->max_extent); 6765 trans = btrfs_start_transaction(root, 3);
5799 6766 if (IS_ERR(trans)) {
5800 trans = btrfs_start_transaction(root, 1); 6767 ret = PTR_ERR(trans);
5801 6768 break;
5802 ret = btrfs_reserve_extent(trans, root, alloc_size,
5803 root->sectorsize, 0, alloc_hint,
5804 (u64)-1, &ins, 1);
5805 if (ret) {
5806 WARN_ON(1);
5807 goto stop_trans;
5808 } 6769 }
5809 6770
5810 ret = btrfs_reserve_metadata_space(root, 3); 6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1);
5811 if (ret) { 6773 if (ret) {
5812 btrfs_free_reserved_extent(root, ins.objectid, 6774 btrfs_end_transaction(trans, root);
5813 ins.offset); 6775 break;
5814 goto stop_trans;
5815 } 6776 }
5816 6777
5817 ret = insert_reserved_file_extent(trans, inode, 6778 ret = insert_reserved_file_extent(trans, inode,
@@ -5825,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5825 6786
5826 num_bytes -= ins.offset; 6787 num_bytes -= ins.offset;
5827 cur_offset += ins.offset; 6788 cur_offset += ins.offset;
5828 alloc_hint = ins.objectid + ins.offset; 6789 *alloc_hint = ins.objectid + ins.offset;
5829 6790
5830 inode->i_ctime = CURRENT_TIME; 6791 inode->i_ctime = CURRENT_TIME;
5831 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6792 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5832 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5833 (actual_len > inode->i_size) && 6794 (actual_len > inode->i_size) &&
5834 (cur_offset > inode->i_size)) { 6795 (cur_offset > inode->i_size)) {
5835
5836 if (cur_offset > actual_len) 6796 if (cur_offset > actual_len)
5837 i_size = actual_len; 6797 i_size_write(inode, actual_len);
5838 else 6798 else
5839 i_size = cur_offset; 6799 i_size_write(inode, cur_offset);
5840 i_size_write(inode, i_size); 6800 i_size_write(inode, cur_offset);
5841 btrfs_ordered_update_i_size(inode, i_size, NULL); 6801 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5842 } 6802 }
5843 6803
5844 ret = btrfs_update_inode(trans, root, inode); 6804 ret = btrfs_update_inode(trans, root, inode);
5845 BUG_ON(ret); 6805 BUG_ON(ret);
5846 6806
5847 btrfs_end_transaction(trans, root); 6807 btrfs_end_transaction(trans, root);
5848 btrfs_unreserve_metadata_space(root, 3);
5849 } 6808 }
5850 return ret; 6809 return ret;
5851
5852stop_trans:
5853 btrfs_end_transaction(trans, root);
5854 return ret;
5855
5856} 6810}
5857 6811
5858static long btrfs_fallocate(struct inode *inode, int mode, 6812static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5885,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5885 goto out; 6839 goto out;
5886 } 6840 }
5887 6841
5888 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6842 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5889 alloc_end - alloc_start);
5890 if (ret) 6843 if (ret)
5891 goto out; 6844 goto out;
5892 6845
@@ -5931,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5931 if (em->block_start == EXTENT_MAP_HOLE || 6884 if (em->block_start == EXTENT_MAP_HOLE ||
5932 (cur_offset >= inode->i_size && 6885 (cur_offset >= inode->i_size &&
5933 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6886 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5934 ret = prealloc_file_range(inode, 6887 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
5935 cur_offset, last_byte, 6888 last_byte - cur_offset,
5936 alloc_hint, mode, offset+len); 6889 1 << inode->i_blkbits,
6890 offset + len,
6891 &alloc_hint);
5937 if (ret < 0) { 6892 if (ret < 0) {
5938 free_extent_map(em); 6893 free_extent_map(em);
5939 break; 6894 break;
5940 } 6895 }
5941 } 6896 }
5942 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5943 alloc_hint = em->block_start;
5944 free_extent_map(em); 6897 free_extent_map(em);
5945 6898
5946 cur_offset = last_byte; 6899 cur_offset = last_byte;
@@ -5952,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5952 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6905 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5953 &cached_state, GFP_NOFS); 6906 &cached_state, GFP_NOFS);
5954 6907
5955 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6908 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5956 alloc_end - alloc_start);
5957out: 6909out:
5958 mutex_unlock(&inode->i_mutex); 6910 mutex_unlock(&inode->i_mutex);
5959 return ret; 6911 return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2845c6ceecd2..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -48,7 +49,6 @@
48#include "print-tree.h" 49#include "print-tree.h"
49#include "volumes.h" 50#include "volumes.h"
50#include "locking.h" 51#include "locking.h"
51#include "ctree.h"
52 52
53/* Mask out flags that are inappropriate for the given type of inode. */ 53/* Mask out flags that are inappropriate for the given type of inode. */
54static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 54static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 240 u64 index = 0;
241 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
242 /* 246 /*
243 * 1 - inode item 247 * 1 - inode item
244 * 2 - refs 248 * 2 - refs
245 * 1 - root item 249 * 1 - root item
246 * 2 - dir items 250 * 2 - dir items
247 */ 251 */
248 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
249 if (ret) 253 if (IS_ERR(trans))
250 return ret; 254 return PTR_ERR(trans);
251
252 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(!trans);
254
255 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256 0, &objectid);
257 if (ret)
258 goto fail;
259 255
260 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
345 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
346 if (err && !ret) 342 if (err && !ret)
347 ret = err; 343 ret = err;
348
349 btrfs_unreserve_metadata_space(root, 6);
350 return ret; 344 return ret;
351} 345}
352 346
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
354 char *name, int namelen)
355{ 348{
356 struct inode *inode; 349 struct inode *inode;
357 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
361 if (!root->ref_cows) 354 if (!root->ref_cows)
362 return -EINVAL; 355 return -EINVAL;
363 356
364 /*
365 * 1 - inode item
366 * 2 - refs
367 * 1 - root item
368 * 2 - dir items
369 */
370 ret = btrfs_reserve_metadata_space(root, 6);
371 if (ret)
372 goto fail;
373
374 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375 if (!pending_snapshot) { 358 if (!pending_snapshot)
376 ret = -ENOMEM; 359 return -ENOMEM;
377 btrfs_unreserve_metadata_space(root, 6); 360
378 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 }
380 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381 if (!pending_snapshot->name) {
382 ret = -ENOMEM;
383 kfree(pending_snapshot);
384 btrfs_unreserve_metadata_space(root, 6);
385 goto fail;
386 }
387 memcpy(pending_snapshot->name, name, namelen);
388 pending_snapshot->name[namelen] = '\0';
389 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
390 trans = btrfs_start_transaction(root, 1);
391 BUG_ON(!trans);
392 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
393 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
394 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
395 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
396 BUG_ON(ret); 377 BUG_ON(ret);
397 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
398 384
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
405 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
406 ret = 0; 392 ret = 0;
407fail: 393fail:
394 kfree(pending_snapshot);
408 return ret; 395 return ret;
409} 396}
410 397
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 goto out_up_read; 443 goto out_up_read;
457 444
458 if (snap_src) { 445 if (snap_src) {
459 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
460 name, namelen);
461 } else { 447 } else {
462 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
463 name, namelen); 449 name, namelen);
@@ -511,7 +497,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
511 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 497 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); 498 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513 499
514 if (!em) 500 if (IS_ERR(em))
515 return 0; 501 return 0;
516 } 502 }
517 503
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1; 588 BTRFS_I(inode)->force_compress = 1;
603 589
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
605 if (ret) { 591 if (ret)
606 ret = -ENOSPC; 592 goto err_unlock;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
617again: 593again:
618 if (inode->i_size == 0 || 594 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
622 } 598 }
623 599
624 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
625 if (!page) 601 if (!page) {
602 ret = -ENOMEM;
626 goto err_reservations; 603 goto err_reservations;
604 }
627 605
628 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
629 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
631 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
632 unlock_page(page); 610 unlock_page(page);
633 page_cache_release(page); 611 page_cache_release(page);
612 ret = -EIO;
634 goto err_reservations; 613 goto err_reservations;
635 } 614 }
636 } 615 }
@@ -644,8 +623,7 @@ again:
644 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
645 624
646 if (PageDirty(page)) { 625 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode, 626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
648 PAGE_CACHE_SIZE);
649 goto loop_unlock; 627 goto loop_unlock;
650 } 628 }
651 629
@@ -683,7 +661,6 @@ loop_unlock:
683 page_cache_release(page); 661 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex); 662 mutex_unlock(&inode->i_mutex);
685 663
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++; 665 i++;
689 } 666 }
@@ -713,9 +690,9 @@ loop_unlock:
713 return 0; 690 return 0;
714 691
715err_reservations: 692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
716 mutex_unlock(&inode->i_mutex); 695 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret; 696 return ret;
720} 697}
721 698
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
811 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
812 789
813 if (new_size > old_size) { 790 if (new_size > old_size) {
814 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
815 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
816 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
817 } else { 794 } else {
@@ -1212,6 +1189,9 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1212 return -EPERM; 1189 return -EPERM;
1213 1190
1214 args = kmalloc(sizeof(*args), GFP_KERNEL); 1191 args = kmalloc(sizeof(*args), GFP_KERNEL);
1192 if (!args)
1193 return -ENOMEM;
1194
1215 if (copy_from_user(args, argp, sizeof(*args))) { 1195 if (copy_from_user(args, argp, sizeof(*args))) {
1216 kfree(args); 1196 kfree(args);
1217 return -EFAULT; 1197 return -EFAULT;
@@ -1297,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1297 if (err) 1277 if (err)
1298 goto out_up_write; 1278 goto out_up_write;
1299 1279
1300 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out_up_write;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
1301 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
1302 dest->root_key.objectid, 1288 dest->root_key.objectid,
1303 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -1311,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1311 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1312 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1313 1299
1314 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1315 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1316 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1317 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1318 1306
1319 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1320 BUG_ON(ret); 1308 BUG_ON(ret);
@@ -1355,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1355 ret = -EPERM; 1343 ret = -EPERM;
1356 goto out; 1344 goto out;
1357 } 1345 }
1358 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1359 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1360 break; 1350 break;
1361 case S_IFREG: 1351 case S_IFREG:
1362 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1375,6 +1365,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1375 sizeof(*range))) { 1365 sizeof(*range))) {
1376 ret = -EFAULT; 1366 ret = -EFAULT;
1377 kfree(range); 1367 kfree(range);
1368 goto out;
1378 } 1369 }
1379 /* compression requires us to start the IO */ 1370 /* compression requires us to start the IO */
1380 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1371 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
@@ -1385,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1385 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1386 range->len = (u64)-1; 1377 range->len = (u64)-1;
1387 } 1378 }
1388 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1389 kfree(range); 1380 kfree(range);
1390 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1391 } 1384 }
1392out: 1385out:
1393 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -1465,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1465 */ 1458 */
1466 1459
1467 /* the destination must be opened for writing */ 1460 /* the destination must be opened for writing */
1468 if (!(file->f_mode & FMODE_WRITE)) 1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1469 return -EINVAL; 1462 return -EINVAL;
1470 1463
1471 ret = mnt_want_write(file->f_path.mnt); 1464 ret = mnt_want_write(file->f_path.mnt);
@@ -1477,12 +1470,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1477 ret = -EBADF; 1470 ret = -EBADF;
1478 goto out_drop_write; 1471 goto out_drop_write;
1479 } 1472 }
1473
1480 src = src_file->f_dentry->d_inode; 1474 src = src_file->f_dentry->d_inode;
1481 1475
1482 ret = -EINVAL; 1476 ret = -EINVAL;
1483 if (src == inode) 1477 if (src == inode)
1484 goto out_fput; 1478 goto out_fput;
1485 1479
1480 /* the src must be open for reading */
1481 if (!(src_file->f_mode & FMODE_READ))
1482 goto out_fput;
1483
1486 ret = -EISDIR; 1484 ret = -EISDIR;
1487 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 1485 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
1488 goto out_fput; 1486 goto out_fput;
@@ -1513,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1513 1511
1514 /* determine range to clone */ 1512 /* determine range to clone */
1515 ret = -EINVAL; 1513 ret = -EINVAL;
1516 if (off >= src->i_size || off + len > src->i_size) 1514 if (off + len > src->i_size || off + len < off)
1517 goto out_unlock; 1515 goto out_unlock;
1518 if (len == 0) 1516 if (len == 0)
1519 olen = len = src->i_size - off; 1517 olen = len = src->i_size - off;
@@ -1541,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1541 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1542 } 1540 }
1543 1541
1544 trans = btrfs_start_transaction(root, 1);
1545 BUG_ON(!trans);
1546
1547 /* punch hole in destination first */
1548 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1549
1550 /* clone data */ 1542 /* clone data */
1551 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1552 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1557,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1557 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1558 * tree. 1550 * tree.
1559 */ 1551 */
1560 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1561 if (ret < 0) 1553 if (ret < 0)
1562 goto out; 1554 goto out;
1563 1555
@@ -1586,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1586 u64 disko = 0, diskl = 0; 1578 u64 disko = 0, diskl = 0;
1587 u64 datao = 0, datal = 0; 1579 u64 datao = 0, datal = 0;
1588 u8 comp; 1580 u8 comp;
1581 u64 endoff;
1589 1582
1590 size = btrfs_item_size_nr(leaf, slot); 1583 size = btrfs_item_size_nr(leaf, slot);
1591 read_extent_buffer(leaf, buf, 1584 read_extent_buffer(leaf, buf,
@@ -1620,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1620 new_key.objectid = inode->i_ino; 1613 new_key.objectid = inode->i_ino;
1621 new_key.offset = key.offset + destoff - off; 1614 new_key.offset = key.offset + destoff - off;
1622 1615
1616 trans = btrfs_start_transaction(root, 1);
1617 if (IS_ERR(trans)) {
1618 ret = PTR_ERR(trans);
1619 goto out;
1620 }
1621
1623 if (type == BTRFS_FILE_EXTENT_REG || 1622 if (type == BTRFS_FILE_EXTENT_REG ||
1624 type == BTRFS_FILE_EXTENT_PREALLOC) { 1623 type == BTRFS_FILE_EXTENT_PREALLOC) {
1624 if (off > key.offset) {
1625 datao += off - key.offset;
1626 datal -= off - key.offset;
1627 }
1628
1629 if (key.offset + datal > off + len)
1630 datal = off + len - key.offset;
1631
1632 ret = btrfs_drop_extents(trans, inode,
1633 new_key.offset,
1634 new_key.offset + datal,
1635 &hint_byte, 1);
1636 BUG_ON(ret);
1637
1625 ret = btrfs_insert_empty_item(trans, root, path, 1638 ret = btrfs_insert_empty_item(trans, root, path,
1626 &new_key, size); 1639 &new_key, size);
1627 if (ret) 1640 BUG_ON(ret);
1628 goto out;
1629 1641
1630 leaf = path->nodes[0]; 1642 leaf = path->nodes[0];
1631 slot = path->slots[0]; 1643 slot = path->slots[0];
@@ -1636,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1636 extent = btrfs_item_ptr(leaf, slot, 1648 extent = btrfs_item_ptr(leaf, slot,
1637 struct btrfs_file_extent_item); 1649 struct btrfs_file_extent_item);
1638 1650
1639 if (off > key.offset) {
1640 datao += off - key.offset;
1641 datal -= off - key.offset;
1642 }
1643
1644 if (key.offset + datal > off + len)
1645 datal = off + len - key.offset;
1646
1647 /* disko == 0 means it's a hole */ 1651 /* disko == 0 means it's a hole */
1648 if (!disko) 1652 if (!disko)
1649 datao = 0; 1653 datao = 0;
@@ -1674,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1674 1678
1675 if (comp && (skip || trim)) { 1679 if (comp && (skip || trim)) {
1676 ret = -EINVAL; 1680 ret = -EINVAL;
1681 btrfs_end_transaction(trans, root);
1677 goto out; 1682 goto out;
1678 } 1683 }
1679 size -= skip + trim; 1684 size -= skip + trim;
1680 datal -= skip + trim; 1685 datal -= skip + trim;
1686
1687 ret = btrfs_drop_extents(trans, inode,
1688 new_key.offset,
1689 new_key.offset + datal,
1690 &hint_byte, 1);
1691 BUG_ON(ret);
1692
1681 ret = btrfs_insert_empty_item(trans, root, path, 1693 ret = btrfs_insert_empty_item(trans, root, path,
1682 &new_key, size); 1694 &new_key, size);
1683 if (ret) 1695 BUG_ON(ret);
1684 goto out;
1685 1696
1686 if (skip) { 1697 if (skip) {
1687 u32 start = 1698 u32 start =
@@ -1699,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1699 } 1710 }
1700 1711
1701 btrfs_mark_buffer_dirty(leaf); 1712 btrfs_mark_buffer_dirty(leaf);
1702 } 1713 btrfs_release_path(root, path);
1714
1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1703 1716
1717 /*
1718 * we round up to the block size at eof when
1719 * determining which extents to clone above,
1720 * but shouldn't round up the file size
1721 */
1722 endoff = new_key.offset + datal;
1723 if (endoff > off+olen)
1724 endoff = off+olen;
1725 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff);
1727
1728 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1729 ret = btrfs_update_inode(trans, root, inode);
1730 BUG_ON(ret);
1731 btrfs_end_transaction(trans, root);
1732 }
1704next: 1733next:
1705 btrfs_release_path(root, path); 1734 btrfs_release_path(root, path);
1706 key.offset++; 1735 key.offset++;
@@ -1708,17 +1737,7 @@ next:
1708 ret = 0; 1737 ret = 0;
1709out: 1738out:
1710 btrfs_release_path(root, path); 1739 btrfs_release_path(root, path);
1711 if (ret == 0) {
1712 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1713 if (destoff + olen > inode->i_size)
1714 btrfs_i_size_write(inode, destoff + olen);
1715 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1716 ret = btrfs_update_inode(trans, root, inode);
1717 }
1718 btrfs_end_transaction(trans, root);
1719 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1740 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1720 if (ret)
1721 vmtruncate(inode, 0);
1722out_unlock: 1741out_unlock:
1723 mutex_unlock(&src->i_mutex); 1742 mutex_unlock(&src->i_mutex);
1724 mutex_unlock(&inode->i_mutex); 1743 mutex_unlock(&inode->i_mutex);
@@ -1836,7 +1855,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1836 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 1855 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1837 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 1856 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1838 dir_id, "default", 7, 1); 1857 dir_id, "default", 7, 1);
1839 if (!di) { 1858 if (IS_ERR_OR_NULL(di)) {
1840 btrfs_free_path(path); 1859 btrfs_free_path(path);
1841 btrfs_end_transaction(trans, root); 1860 btrfs_end_transaction(trans, root);
1842 printk(KERN_ERR "Umm, you don't have the default dir item, " 1861 printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a8ffecd0b491..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -125,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
125 return 1; 124 return 1;
126} 125}
127 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
128/* 136/*
129 * look find the first ordered struct that has this offset, otherwise 137 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset 138 * the first one less than this offset
@@ -162,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
162 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
163 * inserted. 171 * inserted.
164 */ 172 */
165int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
166 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
167{ 176{
168 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
169 struct rb_node *node; 178 struct rb_node *node;
@@ -183,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
183 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
184 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
185 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
186 /* one ref for the tree */ 198 /* one ref for the tree */
187 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
188 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -204,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
204 return 0; 216 return 0;
205} 217}
206 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
207/* 233/*
208 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
209 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -303,6 +329,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
303 struct btrfs_ordered_extent *entry) 329 struct btrfs_ordered_extent *entry)
304{ 330{
305 struct btrfs_ordered_inode_tree *tree; 331 struct btrfs_ordered_inode_tree *tree;
332 struct btrfs_root *root = BTRFS_I(inode)->root;
306 struct rb_node *node; 333 struct rb_node *node;
307 334
308 tree = &BTRFS_I(inode)->ordered_tree; 335 tree = &BTRFS_I(inode)->ordered_tree;
@@ -311,13 +338,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
315 BTRFS_I(inode)->outstanding_extents--;
316 spin_unlock(&BTRFS_I(inode)->accounting_lock);
317 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
318 inode, 1);
319
320 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
321 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
322 343
323 /* 344 /*
@@ -329,7 +350,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
329 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 350 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
330 list_del_init(&BTRFS_I(inode)->ordered_operations); 351 list_del_init(&BTRFS_I(inode)->ordered_operations);
331 } 352 }
332 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 353 spin_unlock(&root->fs_info->ordered_extent_lock);
333 354
334 return 0; 355 return 0;
335} 356}
@@ -490,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
490 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
491 * for pdflush to find them 512 * for pdflush to find them
492 */ 513 */
493 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
494 if (wait) { 516 if (wait) {
495 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
496 &entry->flags)); 518 &entry->flags));
@@ -587,6 +609,47 @@ out:
587 return entry; 609 return entry;
588} 610}
589 611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
590/* 653/*
591 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
592 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -43,8 +44,12 @@ struct tree_entry {
43struct backref_node { 44struct backref_node {
44 struct rb_node rb_node; 45 struct rb_node rb_node;
45 u64 bytenr; 46 u64 bytenr;
46 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner, can be not uptodate */
47 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
48 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
49 struct list_head upper; 54 struct list_head upper;
50 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -55,9 +60,9 @@ struct backref_node {
55 struct extent_buffer *eb; 60 struct extent_buffer *eb;
56 /* level of tree block */ 61 /* level of tree block */
57 unsigned int level:8; 62 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */ 63 /* is the block in non-reference counted tree */
59 unsigned int old_root:1; 64 unsigned int cowonly:1;
60 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
61 unsigned int lowest:1; 66 unsigned int lowest:1;
62 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
63 unsigned int locked:1; 68 unsigned int locked:1;
@@ -65,6 +70,16 @@ struct backref_node {
65 unsigned int processed:1; 70 unsigned int processed:1;
66 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
67 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
68}; 83};
69 84
70/* 85/*
@@ -73,7 +88,6 @@ struct backref_node {
73struct backref_edge { 88struct backref_edge {
74 struct list_head list[2]; 89 struct list_head list[2];
75 struct backref_node *node[2]; 90 struct backref_node *node[2];
76 u64 blockptr;
77}; 91};
78 92
79#define LOWER 0 93#define LOWER 0
@@ -82,9 +96,25 @@ struct backref_edge {
82struct backref_cache { 96struct backref_cache {
83 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root; 98 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
86 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref node. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
88}; 118};
89 119
90/* 120/*
@@ -112,15 +142,6 @@ struct tree_block {
112 unsigned int key_ready:1; 142 unsigned int key_ready:1;
113}; 143};
114 144
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
125 146
126struct file_extent_cluster { 147struct file_extent_cluster {
@@ -137,36 +158,43 @@ struct reloc_control {
137 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
138 /* inode for moving data */ 159 /* inode for moving data */
139 struct inode *data_inode; 160 struct inode *data_inode;
140 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
141 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
142 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
143 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
144 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
145 /* list of reloc trees */ 171 /* list of reloc trees */
146 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
147 u64 search_start; 178 u64 search_start;
148 u64 extents_found; 179 u64 extents_found;
149 u64 extents_skipped; 180
150 int stage; 181 int block_rsv_retries;
151 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
152 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
153 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
154}; 188};
155 189
156/* stages of data relocation */ 190/* stages of data relocation */
157#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
158#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
159 193
160/* 194static void remove_backref_node(struct backref_cache *cache,
161 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
162 */ 196static void __mark_block_processed(struct reloc_control *rc,
163struct async_merge { 197 struct backref_node *node);
164 struct btrfs_work work;
165 struct reloc_control *rc;
166 struct btrfs_root *root;
167 struct completion *done;
168 atomic_t *num_pending;
169};
170 198
171static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
172{ 200{
@@ -180,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
180 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
184} 276}
185 277
186static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
187{ 280{
188 memset(node, 0, sizeof(*node)); 281 if (edge) {
189 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
190 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
191 RB_CLEAR_NODE(&node->rb_node); 284 }
192} 285}
193 286
194static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -249,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
249 edges[idx++] = edge; 342 edges[idx++] = edge;
250 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
251 } 344 }
345 BUG_ON(node->detached);
252 *index = idx; 346 *index = idx;
253 return node; 347 return node;
254} 348}
@@ -280,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
280 return NULL; 374 return NULL;
281} 375}
282 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
283static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
284{ 386{
285 if (node->eb) { 387 if (node->eb) {
286 if (node->locked) { 388 unlock_node_buffer(node);
287 btrfs_tree_unlock(node->eb);
288 node->locked = 0;
289 }
290 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
291 node->eb = NULL; 390 node->eb = NULL;
292 } 391 }
@@ -295,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
295static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
296 struct backref_node *node) 395 struct backref_node *node)
297{ 396{
298 BUG_ON(!node->lowest);
299 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
300 398
301 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
302 list_del(&node->lower); 401 list_del(&node->lower);
303 402 if (!RB_EMPTY_NODE(&node->rb_node))
304 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
305 kfree(node); 404 free_backref_node(tree, node);
306} 405}
307 406
308/* 407/*
@@ -317,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
317 if (!node) 416 if (!node)
318 return; 417 return;
319 418
320 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
321 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
322 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
323 list[LOWER]); 422 list[LOWER]);
324 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
325 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
326 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
327 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
328 /* 435 /*
329 * add the node to pending list if no other 436 * add the node to leaf node list if no other
330 * child block cached. 437 * child block cached.
331 */ 438 */
332 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
333 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
334 &cache->pending[upper->level]);
335 upper->lowest = 1; 441 upper->lowest = 1;
336 } 442 }
337 } 443 }
444
338 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
339} 446}
340 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. transaction commit changes the extent tree.
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is reloc tree and it was created in previous
527 * transaction backref lookup can find the reloc tree,
528 * so backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
341/* 534/*
342 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
343 */ 536 */
@@ -452,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
452 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
453 * block are also cached. 646 * block are also cached.
454 */ 647 */
455static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
456 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
457 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
458 int level, u64 bytenr) 651 int level, u64 bytenr)
459{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
460 struct btrfs_path *path1; 654 struct btrfs_path *path1;
461 struct btrfs_path *path2; 655 struct btrfs_path *path2;
462 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -472,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
472 unsigned long end; 666 unsigned long end;
473 unsigned long ptr; 667 unsigned long ptr;
474 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
475 int ret; 671 int ret;
476 int err = 0; 672 int err = 0;
477 673
@@ -482,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
482 goto out; 678 goto out;
483 } 679 }
484 680
485 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
486 if (!node) { 682 if (!node) {
487 err = -ENOMEM; 683 err = -ENOMEM;
488 goto out; 684 goto out;
489 } 685 }
490 686
491 backref_node_init(node);
492 node->bytenr = bytenr; 687 node->bytenr = bytenr;
493 node->owner = 0;
494 node->level = level; 688 node->level = level;
495 node->lowest = 1; 689 node->lowest = 1;
496 cur = node; 690 cur = node;
@@ -586,17 +780,21 @@ again:
586#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
592 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
594 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
595 if (root) 788 root = find_tree_root(rc, eb, ref0);
596 cur->root = root; 789 if (root && !should_ignore_root(root))
597 else 790 cur->root = root;
598 cur->old_root = 1; 791 else
599 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
600 } 798 }
601#else 799#else
602 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -613,22 +811,20 @@ again:
613 break; 811 break;
614 } 812 }
615 813
616 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
617 if (!edge) { 815 if (!edge) {
618 err = -ENOMEM; 816 err = -ENOMEM;
619 goto out; 817 goto out;
620 } 818 }
621 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
622 if (!rb_node) { 820 if (!rb_node) {
623 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
624 if (!upper) { 822 if (!upper) {
625 kfree(edge); 823 free_backref_edge(cache, edge);
626 err = -ENOMEM; 824 err = -ENOMEM;
627 goto out; 825 goto out;
628 } 826 }
629 backref_node_init(upper);
630 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
631 upper->owner = 0;
632 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
633 /* 829 /*
634 * backrefs for the upper level block isn't 830 * backrefs for the upper level block isn't
@@ -638,11 +834,12 @@ again:
638 } else { 834 } else {
639 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
640 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
641 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
642 } 839 }
643 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
644 edge->node[UPPER] = upper;
645 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
646 843
647 goto next; 844 goto next;
648 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -656,11 +853,17 @@ again:
656 goto out; 853 goto out;
657 } 854 }
658 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
659 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
660 /* tree root */ 860 /* tree root */
661 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 cur->bytenr); 862 cur->bytenr);
663 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
664 break; 867 break;
665 } 868 }
666 869
@@ -691,11 +894,14 @@ again:
691 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
692 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 lower->bytenr); 896 lower->bytenr);
694 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
695 break; 901 break;
696 } 902 }
697 903
698 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
699 if (!edge) { 905 if (!edge) {
700 err = -ENOMEM; 906 err = -ENOMEM;
701 goto out; 907 goto out;
@@ -704,16 +910,17 @@ again:
704 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
705 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
706 if (!rb_node) { 912 if (!rb_node) {
707 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
708 if (!upper) { 914 if (!upper) {
709 kfree(edge); 915 free_backref_edge(cache, edge);
710 err = -ENOMEM; 916 err = -ENOMEM;
711 goto out; 917 goto out;
712 } 918 }
713 backref_node_init(upper);
714 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
715 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
716 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
717 924
718 /* 925 /*
719 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -743,10 +950,12 @@ again:
743 rb_node); 950 rb_node);
744 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
745 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
746 } 955 }
747 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
748 edge->node[UPPER] = upper;
749 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
750 959
751 if (rb_node) 960 if (rb_node)
752 break; 961 break;
@@ -784,8 +993,13 @@ next:
784 * into the cache. 993 * into the cache.
785 */ 994 */
786 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
787 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
788 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
789 1003
790 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
791 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -794,6 +1008,14 @@ next:
794 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
796 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
797 1019
798 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -806,25 +1028,69 @@ next:
806 } 1028 }
807 1029
808 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
809 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
810 &upper->rb_node); 1032 if (!cowonly) {
811 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
812 1037
813 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
814 1039
815 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
816 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
817 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookup.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
818out: 1079out:
819 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
820 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
821 if (err) { 1082 if (err) {
822 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
823 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
824 while (upper) { 1090 while (upper) {
825 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
826 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
827 kfree(upper); 1093 free_backref_node(cache, upper);
828 } 1094 }
829 1095
830 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -832,15 +1098,104 @@ out:
832 1098
833 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
834 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
835 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
836 kfree(edge); 1103 free_backref_edge(cache, edge);
837 } 1104 }
838 return ERR_PTR(err); 1105 return ERR_PTR(err);
839 } 1106 }
1107 BUG_ON(node && node->detached);
840 return node; 1108 return node;
841} 1109}
842 1110
843/* 1111/*
1112 * helper to add backref node for the newly created snapshot.
1113 * the backref node is created by cloning backref node that
1114 * corresponds to root of source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
1198/*
844 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
845 */ 1200 */
846static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
@@ -900,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
900 return 0; 1255 return 0;
901} 1256}
902 1257
903/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
904 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
905 * snapshot of the fs tree with special root objectid.
906 */
907int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 struct btrfs_root *root)
909{ 1260{
910 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
911 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -913,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
913 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
914 int ret; 1265 int ret;
915 1266
916 if (root->reloc_root) {
917 reloc_root = root->reloc_root;
918 reloc_root->last_trans = trans->transid;
919 return 0;
920 }
921
922 if (!root->fs_info->reloc_ctl ||
923 !root->fs_info->reloc_ctl->create_reloc_root ||
924 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 return 0;
926
927 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
929 1269
930 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
932 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
933 1273
934 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
935 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
936 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree, all tree blocks
1286 * modified after it was created have RELOC flag
1287 * set in their headers. so it's OK to not update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
937 1294
938 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
940 btrfs_set_root_refs(root_item, 1);
941 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
942 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
944 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
945 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
946 1306
947 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
948 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -956,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
956 &root_key); 1316 &root_key);
957 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
958 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. reloc tree is just a
1324 * snapshot of the fs tree with special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
959 1350
960 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
961 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
@@ -979,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
979 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
980 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
981 1372
982 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
983 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
984 del = 1; 1376 del = 1;
985 } 1377 }
@@ -1101,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1101 goto out; 1493 goto out;
1102 } 1494 }
1103 1495
1104 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1105 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 ret = 0; 1497 ret = 0;
1107out: 1498out:
1108 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1113,19 +1504,18 @@ out:
1113 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1114 * the new locations. 1505 * the new locations.
1115 */ 1506 */
1116static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1117 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1118 struct btrfs_root *root, 1509 struct reloc_control *rc,
1119 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1120 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1121{ 1512{
1122 struct btrfs_key key; 1513 struct btrfs_key key;
1123 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1124 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1125 struct inodevec *ivec = NULL;
1126 u64 parent; 1516 u64 parent;
1127 u64 bytenr; 1517 u64 bytenr;
1128 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1129 u64 num_bytes; 1519 u64 num_bytes;
1130 u64 end; 1520 u64 end;
1131 u32 nritems; 1521 u32 nritems;
@@ -1165,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1165 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1166 */ 1556 */
1167 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 BUG_ON(!ivec);
1171 ivec->nr = 0;
1172 list_add_tail(&ivec->list, inode_list);
1173 }
1174 if (first) { 1558 if (first) {
1175 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1176 if (inode)
1177 ivec->inode[ivec->nr++] = inode;
1178 first = 0; 1560 first = 0;
1179 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1180 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1181 if (inode)
1182 ivec->inode[ivec->nr++] = inode;
1183 } 1564 }
1184 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1185 end = key.offset + 1566 end = key.offset +
@@ -1203,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1203 1584
1204 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1205 bytenr, num_bytes); 1586 bytenr, num_bytes);
1206 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1207 continue; 1589 continue;
1590 }
1208 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1209 1592
1210 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1224,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1224 } 1607 }
1225 if (dirty) 1608 if (dirty)
1226 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1227 return 0; 1612 return 0;
1228} 1613}
1229 1614
@@ -1247,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1247 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1248 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1249 */ 1634 */
1250static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1251 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1252 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1253 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1254 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1255{ 1640{
1256 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1257 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1262,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1262 u64 new_ptr_gen; 1647 u64 new_ptr_gen;
1263 u64 last_snapshot; 1648 u64 last_snapshot;
1264 u32 blocksize; 1649 u32 blocksize;
1650 int cow = 0;
1265 int level; 1651 int level;
1266 int ret; 1652 int ret;
1267 int slot; 1653 int slot;
1268 1654
1269 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1656 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(lowest_level > 1 && leaf);
1272 1657
1273 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1658 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 1659again:
1275 slot = path->slots[lowest_level]; 1660 slot = path->slots[lowest_level];
1276 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1661 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 1662
@@ -1285,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1285 return 0; 1670 return 0;
1286 } 1671 }
1287 1672
1288 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1673 if (cow) {
1289 BUG_ON(ret); 1674 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1675 BUG_ON(ret);
1676 }
1290 btrfs_set_lock_blocking(eb); 1677 btrfs_set_lock_blocking(eb);
1291 1678
1292 if (next_key) { 1679 if (next_key) {
@@ -1330,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1330 1717
1331 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1718 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 memcmp_node_keys(parent, slot, path, level)) { 1719 memcmp_node_keys(parent, slot, path, level)) {
1333 if (level <= lowest_level && !leaf) { 1720 if (level <= lowest_level) {
1334 ret = 0; 1721 ret = 0;
1335 break; 1722 break;
1336 } 1723 }
@@ -1338,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1338 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1339 old_ptr_gen); 1726 old_ptr_gen);
1340 btrfs_tree_lock(eb); 1727 btrfs_tree_lock(eb);
1341 ret = btrfs_cow_block(trans, dest, eb, parent, 1728 if (cow) {
1342 slot, &eb); 1729 ret = btrfs_cow_block(trans, dest, eb, parent,
1343 BUG_ON(ret); 1730 slot, &eb);
1344 btrfs_set_lock_blocking(eb); 1731 BUG_ON(ret);
1345
1346 if (level <= lowest_level) {
1347 *leaf = eb;
1348 ret = 0;
1349 break;
1350 } 1732 }
1733 btrfs_set_lock_blocking(eb);
1351 1734
1352 btrfs_tree_unlock(parent); 1735 btrfs_tree_unlock(parent);
1353 free_extent_buffer(parent); 1736 free_extent_buffer(parent);
@@ -1356,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1356 continue; 1739 continue;
1357 } 1740 }
1358 1741
1742 if (!cow) {
1743 btrfs_tree_unlock(parent);
1744 free_extent_buffer(parent);
1745 cow = 1;
1746 goto again;
1747 }
1748
1359 btrfs_node_key_to_cpu(path->nodes[level], &key, 1749 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 path->slots[level]); 1750 path->slots[level]);
1361 btrfs_release_path(src, path); 1751 btrfs_release_path(src, path);
@@ -1561,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1561 return 0; 1951 return 0;
1562} 1952}
1563 1953
1564static void put_inodes(struct list_head *list)
1565{
1566 struct inodevec *ivec;
1567 while (!list_empty(list)) {
1568 ivec = list_entry(list->next, struct inodevec, list);
1569 list_del(&ivec->list);
1570 while (ivec->nr > 0) {
1571 ivec->nr--;
1572 iput(ivec->inode[ivec->nr]);
1573 }
1574 kfree(ivec);
1575 }
1576}
1577
1578static int find_next_key(struct btrfs_path *path, int level, 1954static int find_next_key(struct btrfs_path *path, int level,
1579 struct btrfs_key *key) 1955 struct btrfs_key *key)
1580 1956
@@ -1607,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1607 struct btrfs_root *reloc_root; 1983 struct btrfs_root *reloc_root;
1608 struct btrfs_root_item *root_item; 1984 struct btrfs_root_item *root_item;
1609 struct btrfs_path *path; 1985 struct btrfs_path *path;
1610 struct extent_buffer *leaf = NULL; 1986 struct extent_buffer *leaf;
1611 unsigned long nr; 1987 unsigned long nr;
1612 int level; 1988 int level;
1613 int max_level; 1989 int max_level;
1614 int replaced = 0; 1990 int replaced = 0;
1615 int ret; 1991 int ret;
1616 int err = 0; 1992 int err = 0;
1993 u32 min_reserved;
1617 1994
1618 path = btrfs_alloc_path(); 1995 path = btrfs_alloc_path();
1619 if (!path) 1996 if (!path)
@@ -1647,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1647 btrfs_unlock_up_safe(path, 0); 2024 btrfs_unlock_up_safe(path, 0);
1648 } 2025 }
1649 2026
1650 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2027 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1651 trans = btrfs_start_transaction(root, 1); 2028 memset(&next_key, 0, sizeof(next_key));
1652 2029
1653 leaf = path->nodes[0]; 2030 while (1) {
1654 btrfs_item_key_to_cpu(leaf, &key, 0); 2031 trans = btrfs_start_transaction(root, 0);
1655 btrfs_release_path(reloc_root, path); 2032 trans->block_rsv = rc->block_rsv;
1656 2033
1657 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1658 if (ret < 0) { 2035 min_reserved, 0);
1659 err = ret; 2036 if (ret) {
1660 goto out; 2037 BUG_ON(ret != -EAGAIN);
2038 ret = btrfs_commit_transaction(trans, root);
2039 BUG_ON(ret);
2040 continue;
1661 } 2041 }
1662 2042
1663 leaf = path->nodes[0];
1664 btrfs_unlock_up_safe(path, 1);
1665 ret = replace_file_extents(trans, rc, root, leaf,
1666 &inode_list);
1667 if (ret < 0)
1668 err = ret;
1669 goto out;
1670 }
1671
1672 memset(&next_key, 0, sizeof(next_key));
1673
1674 while (1) {
1675 leaf = NULL;
1676 replaced = 0; 2043 replaced = 0;
1677 trans = btrfs_start_transaction(root, 1);
1678 max_level = level; 2044 max_level = level;
1679 2045
1680 ret = walk_down_reloc_tree(reloc_root, path, &level); 2046 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1688,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1688 if (!find_next_key(path, level, &key) && 2054 if (!find_next_key(path, level, &key) &&
1689 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2055 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690 ret = 0; 2056 ret = 0;
1691 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692 ret = replace_path(trans, root, reloc_root,
1693 path, &next_key, &leaf,
1694 level, max_level);
1695 } else { 2057 } else {
1696 ret = replace_path(trans, root, reloc_root, 2058 ret = replace_path(trans, root, reloc_root, path,
1697 path, &next_key, NULL, 2059 &next_key, level, max_level);
1698 level, max_level);
1699 } 2060 }
1700 if (ret < 0) { 2061 if (ret < 0) {
1701 err = ret; 2062 err = ret;
@@ -1707,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1707 btrfs_node_key_to_cpu(path->nodes[level], &key, 2068 btrfs_node_key_to_cpu(path->nodes[level], &key,
1708 path->slots[level]); 2069 path->slots[level]);
1709 replaced = 1; 2070 replaced = 1;
1710 } else if (leaf) {
1711 /*
1712 * no block got replaced, try replacing file extents
1713 */
1714 btrfs_item_key_to_cpu(leaf, &key, 0);
1715 ret = replace_file_extents(trans, rc, root, leaf,
1716 &inode_list);
1717 btrfs_tree_unlock(leaf);
1718 free_extent_buffer(leaf);
1719 BUG_ON(ret < 0);
1720 } 2071 }
1721 2072
1722 ret = walk_up_reloc_tree(reloc_root, path, &level); 2073 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1733,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1733 root_item->drop_level = level; 2084 root_item->drop_level = level;
1734 2085
1735 nr = trans->blocks_used; 2086 nr = trans->blocks_used;
1736 btrfs_end_transaction(trans, root); 2087 btrfs_end_transaction_throttle(trans, root);
1737 2088
1738 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root, nr);
1739 2090
1740 /*
1741 * put inodes outside transaction, otherwise we may deadlock.
1742 */
1743 put_inodes(&inode_list);
1744
1745 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2091 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746 invalidate_extent_cache(root, &key, &next_key); 2092 invalidate_extent_cache(root, &key, &next_key);
1747 } 2093 }
@@ -1764,87 +2110,125 @@ out:
1764 sizeof(root_item->drop_progress)); 2110 sizeof(root_item->drop_progress));
1765 root_item->drop_level = 0; 2111 root_item->drop_level = 0;
1766 btrfs_set_root_refs(root_item, 0); 2112 btrfs_set_root_refs(root_item, 0);
2113 btrfs_update_reloc_root(trans, root);
1767 } 2114 }
1768 2115
1769 nr = trans->blocks_used; 2116 nr = trans->blocks_used;
1770 btrfs_end_transaction(trans, root); 2117 btrfs_end_transaction_throttle(trans, root);
1771 2118
1772 btrfs_btree_balance_dirty(root, nr); 2119 btrfs_btree_balance_dirty(root, nr);
1773 2120
1774 put_inodes(&inode_list);
1775
1776 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2121 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777 invalidate_extent_cache(root, &key, &next_key); 2122 invalidate_extent_cache(root, &key, &next_key);
1778 2123
1779 return err; 2124 return err;
1780} 2125}
1781 2126
1782/* 2127static noinline_for_stack
1783 * callback for the work threads. 2128int prepare_to_merge(struct reloc_control *rc, int err)
1784 * this function merges reloc tree with corresponding fs tree,
1785 * and then drops the reloc tree.
1786 */
1787static void merge_func(struct btrfs_work *work)
1788{ 2129{
1789 struct btrfs_trans_handle *trans; 2130 struct btrfs_root *root = rc->extent_root;
1790 struct btrfs_root *root;
1791 struct btrfs_root *reloc_root; 2131 struct btrfs_root *reloc_root;
1792 struct async_merge *async; 2132 struct btrfs_trans_handle *trans;
2133 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0;
2135 int ret;
2136 int retries = 0;
2137
2138 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex);
2142again:
2143 if (!err) {
2144 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries);
2147 if (ret)
2148 err = ret;
2149 }
2150
2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152
2153 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again;
2160 }
2161 }
1793 2162
1794 async = container_of(work, struct async_merge, work); 2163 rc->merge_reloc_tree = 1;
1795 reloc_root = async->root; 2164
2165 while (!list_empty(&rc->reloc_roots)) {
2166 reloc_root = list_entry(rc->reloc_roots.next,
2167 struct btrfs_root, root_list);
2168 list_del_init(&reloc_root->root_list);
1796 2169
1797 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798 root = read_fs_root(reloc_root->fs_info, 2170 root = read_fs_root(reloc_root->fs_info,
1799 reloc_root->root_key.offset); 2171 reloc_root->root_key.offset);
1800 BUG_ON(IS_ERR(root)); 2172 BUG_ON(IS_ERR(root));
1801 BUG_ON(root->reloc_root != reloc_root); 2173 BUG_ON(root->reloc_root != reloc_root);
1802 2174
1803 merge_reloc_root(async->rc, root); 2175 /*
1804 2176 * set reference count to 1, so btrfs_recover_relocation
1805 trans = btrfs_start_transaction(root, 1); 2177 * knows it should resumes merging
2178 */
2179 if (!err)
2180 btrfs_set_root_refs(&reloc_root->root_item, 1);
1806 btrfs_update_reloc_root(trans, root); 2181 btrfs_update_reloc_root(trans, root);
1807 btrfs_end_transaction(trans, root);
1808 }
1809 2182
1810 btrfs_drop_snapshot(reloc_root, 0); 2183 list_add(&reloc_root->root_list, &reloc_roots);
2184 }
1811 2185
1812 if (atomic_dec_and_test(async->num_pending)) 2186 list_splice(&reloc_roots, &rc->reloc_roots);
1813 complete(async->done);
1814 2187
1815 kfree(async); 2188 if (!err)
2189 btrfs_commit_transaction(trans, rc->extent_root);
2190 else
2191 btrfs_end_transaction(trans, rc->extent_root);
2192 return err;
1816} 2193}
1817 2194
1818static int merge_reloc_roots(struct reloc_control *rc) 2195static noinline_for_stack
2196int merge_reloc_roots(struct reloc_control *rc)
1819{ 2197{
1820 struct async_merge *async;
1821 struct btrfs_root *root; 2198 struct btrfs_root *root;
1822 struct completion done; 2199 struct btrfs_root *reloc_root;
1823 atomic_t num_pending; 2200 LIST_HEAD(reloc_roots);
2201 int found = 0;
2202 int ret;
2203again:
2204 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex);
1824 2208
1825 init_completion(&done); 2209 while (!list_empty(&reloc_roots)) {
1826 atomic_set(&num_pending, 1); 2210 found = 1;
2211 reloc_root = list_entry(reloc_roots.next,
2212 struct btrfs_root, root_list);
1827 2213
1828 while (!list_empty(&rc->reloc_roots)) { 2214 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1829 root = list_entry(rc->reloc_roots.next, 2215 root = read_fs_root(reloc_root->fs_info,
1830 struct btrfs_root, root_list); 2216 reloc_root->root_key.offset);
1831 list_del_init(&root->root_list); 2217 BUG_ON(IS_ERR(root));
2218 BUG_ON(root->reloc_root != reloc_root);
1832 2219
1833 async = kmalloc(sizeof(*async), GFP_NOFS); 2220 ret = merge_reloc_root(rc, root);
1834 BUG_ON(!async); 2221 BUG_ON(ret);
1835 async->work.func = merge_func; 2222 } else {
1836 async->work.flags = 0; 2223 list_del_init(&reloc_root->root_list);
1837 async->rc = rc; 2224 }
1838 async->root = root; 2225 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1839 async->done = &done;
1840 async->num_pending = &num_pending;
1841 atomic_inc(&num_pending);
1842 btrfs_queue_worker(&rc->workers, &async->work);
1843 } 2226 }
1844 2227
1845 if (!atomic_dec_and_test(&num_pending)) 2228 if (found) {
1846 wait_for_completion(&done); 2229 found = 0;
1847 2230 goto again;
2231 }
1848 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2232 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849 return 0; 2233 return 0;
1850} 2234}
@@ -1875,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1875 return btrfs_record_root_in_trans(trans, root); 2259 return btrfs_record_root_in_trans(trans, root);
1876} 2260}
1877 2261
1878/* 2262static noinline_for_stack
1879 * select one tree from trees that references the block. 2263struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1880 * for blocks in refernce counted trees, we preper reloc tree. 2264 struct reloc_control *rc,
1881 * if no reloc tree found and reloc_only is true, NULL is returned. 2265 struct backref_node *node,
1882 */ 2266 struct backref_edge *edges[], int *nr)
1883static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884 struct backref_node *node,
1885 struct backref_edge *edges[],
1886 int *nr, int reloc_only)
1887{ 2267{
1888 struct backref_node *next; 2268 struct backref_node *next;
1889 struct btrfs_root *root; 2269 struct btrfs_root *root;
1890 int index; 2270 int index = 0;
1891 int loop = 0; 2271
1892again:
1893 index = 0;
1894 next = node; 2272 next = node;
1895 while (1) { 2273 while (1) {
1896 cond_resched(); 2274 cond_resched();
1897 next = walk_up_backref(next, edges, &index); 2275 next = walk_up_backref(next, edges, &index);
1898 root = next->root; 2276 root = next->root;
1899 if (!root) { 2277 BUG_ON(!root);
1900 BUG_ON(!node->old_root); 2278 BUG_ON(!root->ref_cows);
1901 goto skip;
1902 }
1903
1904 /* no other choice for non-refernce counted tree */
1905 if (!root->ref_cows) {
1906 BUG_ON(reloc_only);
1907 break;
1908 }
1909 2279
1910 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2280 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911 record_reloc_root_in_trans(trans, root); 2281 record_reloc_root_in_trans(trans, root);
1912 break; 2282 break;
1913 } 2283 }
1914 2284
1915 if (loop) { 2285 btrfs_record_root_in_trans(trans, root);
1916 btrfs_record_root_in_trans(trans, root); 2286 root = root->reloc_root;
2287
2288 if (next->new_bytenr != root->node->start) {
2289 BUG_ON(next->new_bytenr);
2290 BUG_ON(!list_empty(&next->list));
2291 next->new_bytenr = root->node->start;
2292 next->root = root;
2293 list_add_tail(&next->list,
2294 &rc->backref_cache.changed);
2295 __mark_block_processed(rc, next);
1917 break; 2296 break;
1918 } 2297 }
1919 2298
1920 if (reloc_only || next != node) { 2299 WARN_ON(1);
1921 if (!root->reloc_root)
1922 btrfs_record_root_in_trans(trans, root);
1923 root = root->reloc_root;
1924 /*
1925 * if the reloc tree was created in current
1926 * transation, there is no node in backref tree
1927 * corresponds to the root of the reloc tree.
1928 */
1929 if (btrfs_root_last_snapshot(&root->root_item) ==
1930 trans->transid - 1)
1931 break;
1932 }
1933skip:
1934 root = NULL; 2300 root = NULL;
1935 next = walk_down_backref(edges, &index); 2301 next = walk_down_backref(edges, &index);
1936 if (!next || next->level <= node->level) 2302 if (!next || next->level <= node->level)
1937 break; 2303 break;
1938 } 2304 }
2305 if (!root)
2306 return NULL;
1939 2307
1940 if (!root && !loop && !reloc_only) { 2308 *nr = index;
1941 loop = 1; 2309 next = node;
1942 goto again; 2310 /* setup backref node path for btrfs_reloc_cow_block */
2311 while (1) {
2312 rc->backref_cache.path[next->level] = next;
2313 if (--index < 0)
2314 break;
2315 next = edges[index]->node[UPPER];
1943 } 2316 }
1944
1945 if (root)
1946 *nr = index;
1947 else
1948 *nr = 0;
1949
1950 return root; 2317 return root;
1951} 2318}
1952 2319
2320/*
2321 * select a tree root for relocation. return NULL if the block
2322 * is reference counted. we should use do_relocation() in this
2323 * case. return a tree root pointer if the block isn't reference
2324 * counted. return -ENOENT if the block is root of reloc tree.
2325 */
1953static noinline_for_stack 2326static noinline_for_stack
1954struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2327struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955 struct backref_node *node) 2328 struct backref_node *node)
1956{ 2329{
2330 struct backref_node *next;
2331 struct btrfs_root *root;
2332 struct btrfs_root *fs_root = NULL;
1957 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2333 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958 int nr; 2334 int index = 0;
1959 return __select_one_root(trans, node, edges, &nr, 0); 2335
2336 next = node;
2337 while (1) {
2338 cond_resched();
2339 next = walk_up_backref(next, edges, &index);
2340 root = next->root;
2341 BUG_ON(!root);
2342
2343 /* no other choice for non-refernce counted tree */
2344 if (!root->ref_cows)
2345 return root;
2346
2347 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2348 fs_root = root;
2349
2350 if (next != node)
2351 return NULL;
2352
2353 next = walk_down_backref(edges, &index);
2354 if (!next || next->level <= node->level)
2355 break;
2356 }
2357
2358 if (!fs_root)
2359 return ERR_PTR(-ENOENT);
2360 return fs_root;
1960} 2361}
1961 2362
1962static noinline_for_stack 2363static noinline_for_stack
1963struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2364u64 calcu_metadata_size(struct reloc_control *rc,
1964 struct backref_node *node, 2365 struct backref_node *node, int reserve)
1965 struct backref_edge *edges[], int *nr)
1966{ 2366{
1967 return __select_one_root(trans, node, edges, nr, 1); 2367 struct backref_node *next = node;
2368 struct backref_edge *edge;
2369 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2370 u64 num_bytes = 0;
2371 int index = 0;
2372
2373 BUG_ON(reserve && node->processed);
2374
2375 while (next) {
2376 cond_resched();
2377 while (1) {
2378 if (next->processed && (reserve || next != node))
2379 break;
2380
2381 num_bytes += btrfs_level_size(rc->extent_root,
2382 next->level);
2383
2384 if (list_empty(&next->upper))
2385 break;
2386
2387 edge = list_entry(next->upper.next,
2388 struct backref_edge, list[LOWER]);
2389 edges[index++] = edge;
2390 next = edge->node[UPPER];
2391 }
2392 next = walk_down_backref(edges, &index);
2393 }
2394 return num_bytes;
1968} 2395}
1969 2396
1970static void grab_path_buffers(struct btrfs_path *path, 2397static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1971 struct backref_node *node, 2398 struct reloc_control *rc,
1972 struct backref_edge *edges[], int nr) 2399 struct backref_node *node)
1973{ 2400{
1974 int i = 0; 2401 struct btrfs_root *root = rc->extent_root;
1975 while (1) { 2402 u64 num_bytes;
1976 drop_node_buffer(node); 2403 int ret;
1977 node->eb = path->nodes[node->level]; 2404
1978 BUG_ON(!node->eb); 2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1979 if (path->locks[node->level])
1980 node->locked = 1;
1981 path->nodes[node->level] = NULL;
1982 path->locks[node->level] = 0;
1983
1984 if (i >= nr)
1985 break;
1986 2406
1987 edges[i]->blockptr = node->eb->start; 2407 trans->block_rsv = rc->block_rsv;
1988 node = edges[i]->node[UPPER]; 2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1989 i++; 2409 &rc->block_rsv_retries);
2410 if (ret) {
2411 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1;
2413 return ret;
1990 } 2414 }
2415
2416 rc->block_rsv_retries = 0;
2417 return 0;
2418}
2419
2420static void release_metadata_space(struct reloc_control *rc,
2421 struct backref_node *node)
2422{
2423 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2424 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1991} 2425}
1992 2426
1993/* 2427/*
@@ -1998,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1998 * in that case this function just updates pointers. 2432 * in that case this function just updates pointers.
1999 */ 2433 */
2000static int do_relocation(struct btrfs_trans_handle *trans, 2434static int do_relocation(struct btrfs_trans_handle *trans,
2435 struct reloc_control *rc,
2001 struct backref_node *node, 2436 struct backref_node *node,
2002 struct btrfs_key *key, 2437 struct btrfs_key *key,
2003 struct btrfs_path *path, int lowest) 2438 struct btrfs_path *path, int lowest)
@@ -2018,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2018 BUG_ON(lowest && node->eb); 2453 BUG_ON(lowest && node->eb);
2019 2454
2020 path->lowest_level = node->level + 1; 2455 path->lowest_level = node->level + 1;
2456 rc->backref_cache.path[node->level] = node;
2021 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2457 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022 cond_resched(); 2458 cond_resched();
2023 if (node->eb && node->eb->start == edge->blockptr)
2024 continue;
2025 2459
2026 upper = edge->node[UPPER]; 2460 upper = edge->node[UPPER];
2027 root = select_reloc_root(trans, upper, edges, &nr); 2461 root = select_reloc_root(trans, rc, upper, edges, &nr);
2028 if (!root) 2462 BUG_ON(!root);
2029 continue; 2463
2030 2464 if (upper->eb && !upper->locked) {
2031 if (upper->eb && !upper->locked) 2465 if (!lowest) {
2466 ret = btrfs_bin_search(upper->eb, key,
2467 upper->level, &slot);
2468 BUG_ON(ret);
2469 bytenr = btrfs_node_blockptr(upper->eb, slot);
2470 if (node->eb->start == bytenr)
2471 goto next;
2472 }
2032 drop_node_buffer(upper); 2473 drop_node_buffer(upper);
2474 }
2033 2475
2034 if (!upper->eb) { 2476 if (!upper->eb) {
2035 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2477 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2039,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2039 } 2481 }
2040 BUG_ON(ret > 0); 2482 BUG_ON(ret > 0);
2041 2483
2042 slot = path->slots[upper->level]; 2484 if (!upper->eb) {
2485 upper->eb = path->nodes[upper->level];
2486 path->nodes[upper->level] = NULL;
2487 } else {
2488 BUG_ON(upper->eb != path->nodes[upper->level]);
2489 }
2043 2490
2044 btrfs_unlock_up_safe(path, upper->level + 1); 2491 upper->locked = 1;
2045 grab_path_buffers(path, upper, edges, nr); 2492 path->locks[upper->level] = 0;
2046 2493
2494 slot = path->slots[upper->level];
2047 btrfs_release_path(NULL, path); 2495 btrfs_release_path(NULL, path);
2048 } else { 2496 } else {
2049 ret = btrfs_bin_search(upper->eb, key, upper->level, 2497 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2052,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2052 } 2500 }
2053 2501
2054 bytenr = btrfs_node_blockptr(upper->eb, slot); 2502 bytenr = btrfs_node_blockptr(upper->eb, slot);
2055 if (!lowest) { 2503 if (lowest) {
2056 if (node->eb->start == bytenr) { 2504 BUG_ON(bytenr != node->bytenr);
2057 btrfs_tree_unlock(upper->eb);
2058 upper->locked = 0;
2059 continue;
2060 }
2061 } else { 2505 } else {
2062 BUG_ON(node->bytenr != bytenr); 2506 if (node->eb->start == bytenr)
2507 goto next;
2063 } 2508 }
2064 2509
2065 blocksize = btrfs_level_size(root, node->level); 2510 blocksize = btrfs_level_size(root, node->level);
@@ -2071,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2071 if (!node->eb) { 2516 if (!node->eb) {
2072 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2517 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073 slot, &eb); 2518 slot, &eb);
2519 btrfs_tree_unlock(eb);
2520 free_extent_buffer(eb);
2074 if (ret < 0) { 2521 if (ret < 0) {
2075 err = ret; 2522 err = ret;
2076 break; 2523 goto next;
2077 } 2524 }
2078 btrfs_set_lock_blocking(eb); 2525 BUG_ON(node->eb != eb);
2079 node->eb = eb;
2080 node->locked = 1;
2081 } else { 2526 } else {
2082 btrfs_set_node_blockptr(upper->eb, slot, 2527 btrfs_set_node_blockptr(upper->eb, slot,
2083 node->eb->start); 2528 node->eb->start);
@@ -2095,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2095 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2540 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096 BUG_ON(ret); 2541 BUG_ON(ret);
2097 } 2542 }
2098 if (!lowest) { 2543next:
2099 btrfs_tree_unlock(upper->eb); 2544 if (!upper->pending)
2100 upper->locked = 0; 2545 drop_node_buffer(upper);
2101 } 2546 else
2547 unlock_node_buffer(upper);
2548 if (err)
2549 break;
2102 } 2550 }
2551
2552 if (!err && node->pending) {
2553 drop_node_buffer(node);
2554 list_move_tail(&node->list, &rc->backref_cache.changed);
2555 node->pending = 0;
2556 }
2557
2103 path->lowest_level = 0; 2558 path->lowest_level = 0;
2559 BUG_ON(err == -ENOSPC);
2104 return err; 2560 return err;
2105} 2561}
2106 2562
2107static int link_to_upper(struct btrfs_trans_handle *trans, 2563static int link_to_upper(struct btrfs_trans_handle *trans,
2564 struct reloc_control *rc,
2108 struct backref_node *node, 2565 struct backref_node *node,
2109 struct btrfs_path *path) 2566 struct btrfs_path *path)
2110{ 2567{
2111 struct btrfs_key key; 2568 struct btrfs_key key;
2112 if (!node->eb || list_empty(&node->upper))
2113 return 0;
2114 2569
2115 btrfs_node_key_to_cpu(node->eb, &key, 0); 2570 btrfs_node_key_to_cpu(node->eb, &key, 0);
2116 return do_relocation(trans, node, &key, path, 0); 2571 return do_relocation(trans, rc, node, &key, path, 0);
2117} 2572}
2118 2573
2119static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2574static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120 struct backref_cache *cache, 2575 struct reloc_control *rc,
2121 struct btrfs_path *path) 2576 struct btrfs_path *path, int err)
2122{ 2577{
2578 LIST_HEAD(list);
2579 struct backref_cache *cache = &rc->backref_cache;
2123 struct backref_node *node; 2580 struct backref_node *node;
2124 int level; 2581 int level;
2125 int ret; 2582 int ret;
2126 int err = 0;
2127 2583
2128 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2584 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129 while (!list_empty(&cache->pending[level])) { 2585 while (!list_empty(&cache->pending[level])) {
2130 node = list_entry(cache->pending[level].next, 2586 node = list_entry(cache->pending[level].next,
2131 struct backref_node, lower); 2587 struct backref_node, list);
2132 BUG_ON(node->level != level); 2588 list_move_tail(&node->list, &list);
2589 BUG_ON(!node->pending);
2133 2590
2134 ret = link_to_upper(trans, node, path); 2591 if (!err) {
2135 if (ret < 0) 2592 ret = link_to_upper(trans, rc, node, path);
2136 err = ret; 2593 if (ret < 0)
2137 /* 2594 err = ret;
2138 * this remove the node from the pending list and 2595 }
2139 * may add some other nodes to the level + 1
2140 * pending list
2141 */
2142 remove_backref_node(cache, node);
2143 } 2596 }
2597 list_splice_init(&list, &cache->pending[level]);
2144 } 2598 }
2145 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146 return err; 2599 return err;
2147} 2600}
2148 2601
2149static void mark_block_processed(struct reloc_control *rc, 2602static void mark_block_processed(struct reloc_control *rc,
2150 struct backref_node *node) 2603 u64 bytenr, u32 blocksize)
2604{
2605 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2606 EXTENT_DIRTY, GFP_NOFS);
2607}
2608
2609static void __mark_block_processed(struct reloc_control *rc,
2610 struct backref_node *node)
2151{ 2611{
2152 u32 blocksize; 2612 u32 blocksize;
2153 if (node->level == 0 || 2613 if (node->level == 0 ||
2154 in_block_group(node->bytenr, rc->block_group)) { 2614 in_block_group(node->bytenr, rc->block_group)) {
2155 blocksize = btrfs_level_size(rc->extent_root, node->level); 2615 blocksize = btrfs_level_size(rc->extent_root, node->level);
2156 set_extent_bits(&rc->processed_blocks, node->bytenr, 2616 mark_block_processed(rc, node->bytenr, blocksize);
2157 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158 GFP_NOFS);
2159 } 2617 }
2160 node->processed = 1; 2618 node->processed = 1;
2161} 2619}
@@ -2178,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2178 if (next->processed) 2636 if (next->processed)
2179 break; 2637 break;
2180 2638
2181 mark_block_processed(rc, next); 2639 __mark_block_processed(rc, next);
2182 2640
2183 if (list_empty(&next->upper)) 2641 if (list_empty(&next->upper))
2184 break; 2642 break;
@@ -2201,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2201 return 0; 2659 return 0;
2202} 2660}
2203 2661
2204/*
2205 * check if there are any file extent pointers in the leaf point to
2206 * data require processing
2207 */
2208static int check_file_extents(struct reloc_control *rc,
2209 u64 bytenr, u32 blocksize, u64 ptr_gen)
2210{
2211 struct btrfs_key found_key;
2212 struct btrfs_file_extent_item *fi;
2213 struct extent_buffer *leaf;
2214 u32 nritems;
2215 int i;
2216 int ret = 0;
2217
2218 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219
2220 nritems = btrfs_header_nritems(leaf);
2221 for (i = 0; i < nritems; i++) {
2222 cond_resched();
2223 btrfs_item_key_to_cpu(leaf, &found_key, i);
2224 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225 continue;
2226 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227 if (btrfs_file_extent_type(leaf, fi) ==
2228 BTRFS_FILE_EXTENT_INLINE)
2229 continue;
2230 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231 if (bytenr == 0)
2232 continue;
2233 if (in_block_group(bytenr, rc->block_group)) {
2234 ret = 1;
2235 break;
2236 }
2237 }
2238 free_extent_buffer(leaf);
2239 return ret;
2240}
2241
2242/*
2243 * scan child blocks of a given block to find blocks require processing
2244 */
2245static int add_child_blocks(struct btrfs_trans_handle *trans,
2246 struct reloc_control *rc,
2247 struct backref_node *node,
2248 struct rb_root *blocks)
2249{
2250 struct tree_block *block;
2251 struct rb_node *rb_node;
2252 u64 bytenr;
2253 u64 ptr_gen;
2254 u32 blocksize;
2255 u32 nritems;
2256 int i;
2257 int err = 0;
2258
2259 nritems = btrfs_header_nritems(node->eb);
2260 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272
2273 readahead_tree_block(rc->extent_root,
2274 bytenr, blocksize, ptr_gen);
2275 }
2276
2277 for (i = 0; i < nritems; i++) {
2278 cond_resched();
2279 bytenr = btrfs_node_blockptr(node->eb, i);
2280 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281 if (ptr_gen == trans->transid)
2282 continue;
2283 if (!in_block_group(bytenr, rc->block_group) &&
2284 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285 continue;
2286 if (tree_block_processed(bytenr, blocksize, rc))
2287 continue;
2288 if (!in_block_group(bytenr, rc->block_group) &&
2289 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290 continue;
2291
2292 block = kmalloc(sizeof(*block), GFP_NOFS);
2293 if (!block) {
2294 err = -ENOMEM;
2295 break;
2296 }
2297 block->bytenr = bytenr;
2298 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299 block->level = node->level - 1;
2300 block->key_ready = 1;
2301 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302 BUG_ON(rb_node);
2303 }
2304 if (err)
2305 free_block_list(blocks);
2306 return err;
2307}
2308
2309/*
2310 * find adjacent blocks require processing
2311 */
2312static noinline_for_stack
2313int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314 struct reloc_control *rc,
2315 struct backref_cache *cache,
2316 struct rb_root *blocks, int level,
2317 struct backref_node **upper)
2318{
2319 struct backref_node *node;
2320 int ret = 0;
2321
2322 WARN_ON(!list_empty(&cache->pending[level]));
2323
2324 if (list_empty(&cache->pending[level + 1]))
2325 return 1;
2326
2327 node = list_entry(cache->pending[level + 1].next,
2328 struct backref_node, lower);
2329 if (node->eb)
2330 ret = add_child_blocks(trans, rc, node, blocks);
2331
2332 *upper = node;
2333 return ret;
2334}
2335
2336static int get_tree_block_key(struct reloc_control *rc, 2662static int get_tree_block_key(struct reloc_control *rc,
2337 struct tree_block *block) 2663 struct tree_block *block)
2338{ 2664{
@@ -2370,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2370 struct btrfs_path *path) 2696 struct btrfs_path *path)
2371{ 2697{
2372 struct btrfs_root *root; 2698 struct btrfs_root *root;
2373 int ret; 2699 int release = 0;
2700 int ret = 0;
2374 2701
2702 if (!node)
2703 return 0;
2704
2705 BUG_ON(node->processed);
2375 root = select_one_root(trans, node); 2706 root = select_one_root(trans, node);
2376 if (unlikely(!root)) { 2707 if (root == ERR_PTR(-ENOENT)) {
2377 rc->found_old_snapshot = 1;
2378 update_processed_blocks(rc, node); 2708 update_processed_blocks(rc, node);
2379 return 0; 2709 goto out;
2380 } 2710 }
2381 2711
2382 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2712 if (!root || root->ref_cows) {
2383 ret = do_relocation(trans, node, key, path, 1); 2713 ret = reserve_metadata_space(trans, rc, node);
2384 if (ret < 0) 2714 if (ret)
2385 goto out;
2386 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387 ret = replace_file_extents(trans, rc, root,
2388 node->eb, NULL);
2389 if (ret < 0)
2390 goto out;
2391 }
2392 drop_node_buffer(node);
2393 } else if (!root->ref_cows) {
2394 path->lowest_level = node->level;
2395 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396 btrfs_release_path(root, path);
2397 if (ret < 0)
2398 goto out; 2715 goto out;
2399 } else if (root != node->root) { 2716 release = 1;
2400 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401 } 2717 }
2402 2718
2403 update_processed_blocks(rc, node); 2719 if (root) {
2404 ret = 0; 2720 if (root->ref_cows) {
2721 BUG_ON(node->new_bytenr);
2722 BUG_ON(!list_empty(&node->list));
2723 btrfs_record_root_in_trans(trans, root);
2724 root = root->reloc_root;
2725 node->new_bytenr = root->node->start;
2726 node->root = root;
2727 list_add_tail(&node->list, &rc->backref_cache.changed);
2728 } else {
2729 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path);
2732 if (ret > 0)
2733 ret = 0;
2734 }
2735 if (!ret)
2736 update_processed_blocks(rc, node);
2737 } else {
2738 ret = do_relocation(trans, rc, node, key, path, 1);
2739 }
2405out: 2740out:
2406 drop_node_buffer(node); 2741 if (ret || node->level == 0 || node->cowonly) {
2742 if (release)
2743 release_metadata_space(rc, node);
2744 remove_backref_node(&rc->backref_cache, node);
2745 }
2407 return ret; 2746 return ret;
2408} 2747}
2409 2748
@@ -2414,12 +2753,10 @@ static noinline_for_stack
2414int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2753int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415 struct reloc_control *rc, struct rb_root *blocks) 2754 struct reloc_control *rc, struct rb_root *blocks)
2416{ 2755{
2417 struct backref_cache *cache;
2418 struct backref_node *node; 2756 struct backref_node *node;
2419 struct btrfs_path *path; 2757 struct btrfs_path *path;
2420 struct tree_block *block; 2758 struct tree_block *block;
2421 struct rb_node *rb_node; 2759 struct rb_node *rb_node;
2422 int level = -1;
2423 int ret; 2760 int ret;
2424 int err = 0; 2761 int err = 0;
2425 2762
@@ -2427,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2427 if (!path) 2764 if (!path)
2428 return -ENOMEM; 2765 return -ENOMEM;
2429 2766
2430 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431 if (!cache) {
2432 btrfs_free_path(path);
2433 return -ENOMEM;
2434 }
2435
2436 backref_cache_init(cache);
2437
2438 rb_node = rb_first(blocks); 2767 rb_node = rb_first(blocks);
2439 while (rb_node) { 2768 while (rb_node) {
2440 block = rb_entry(rb_node, struct tree_block, rb_node); 2769 block = rb_entry(rb_node, struct tree_block, rb_node);
2441 if (level == -1)
2442 level = block->level;
2443 else
2444 BUG_ON(level != block->level);
2445 if (!block->key_ready) 2770 if (!block->key_ready)
2446 reada_tree_block(rc, block); 2771 reada_tree_block(rc, block);
2447 rb_node = rb_next(rb_node); 2772 rb_node = rb_next(rb_node);
@@ -2459,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2459 while (rb_node) { 2784 while (rb_node) {
2460 block = rb_entry(rb_node, struct tree_block, rb_node); 2785 block = rb_entry(rb_node, struct tree_block, rb_node);
2461 2786
2462 node = build_backref_tree(rc, cache, &block->key, 2787 node = build_backref_tree(rc, &block->key,
2463 block->level, block->bytenr); 2788 block->level, block->bytenr);
2464 if (IS_ERR(node)) { 2789 if (IS_ERR(node)) {
2465 err = PTR_ERR(node); 2790 err = PTR_ERR(node);
@@ -2469,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2469 ret = relocate_tree_block(trans, rc, node, &block->key, 2794 ret = relocate_tree_block(trans, rc, node, &block->key,
2470 path); 2795 path);
2471 if (ret < 0) { 2796 if (ret < 0) {
2472 err = ret; 2797 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2798 err = ret;
2473 goto out; 2799 goto out;
2474 } 2800 }
2475 remove_backref_node(cache, node);
2476 rb_node = rb_next(rb_node); 2801 rb_node = rb_next(rb_node);
2477 } 2802 }
2478 2803out:
2479 if (level > 0)
2480 goto out;
2481
2482 free_block_list(blocks); 2804 free_block_list(blocks);
2805 err = finish_pending_nodes(trans, rc, path, err);
2483 2806
2484 /* 2807 btrfs_free_path(path);
2485 * now backrefs of some upper level tree blocks have been cached, 2808 return err;
2486 * try relocating blocks referenced by these upper level blocks. 2809}
2487 */
2488 while (1) {
2489 struct backref_node *upper = NULL;
2490 if (trans->transaction->in_commit ||
2491 trans->transaction->delayed_refs.flushing)
2492 break;
2493 2810
2494 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2811static noinline_for_stack
2495 &upper); 2812int prealloc_file_extent_cluster(struct inode *inode,
2496 if (ret < 0) 2813 struct file_extent_cluster *cluster)
2497 err = ret; 2814{
2498 if (ret != 0) 2815 u64 alloc_hint = 0;
2499 break; 2816 u64 start;
2817 u64 end;
2818 u64 offset = BTRFS_I(inode)->index_cnt;
2819 u64 num_bytes;
2820 int nr = 0;
2821 int ret = 0;
2500 2822
2501 rb_node = rb_first(blocks); 2823 BUG_ON(cluster->start != cluster->boundary[0]);
2502 while (rb_node) { 2824 mutex_lock(&inode->i_mutex);
2503 block = rb_entry(rb_node, struct tree_block, rb_node);
2504 if (trans->transaction->in_commit ||
2505 trans->transaction->delayed_refs.flushing)
2506 goto out;
2507 BUG_ON(!block->key_ready);
2508 node = build_backref_tree(rc, cache, &block->key,
2509 level, block->bytenr);
2510 if (IS_ERR(node)) {
2511 err = PTR_ERR(node);
2512 goto out;
2513 }
2514 2825
2515 ret = relocate_tree_block(trans, rc, node, 2826 ret = btrfs_check_data_free_space(inode, cluster->end +
2516 &block->key, path); 2827 1 - cluster->start);
2517 if (ret < 0) { 2828 if (ret)
2518 err = ret; 2829 goto out;
2519 goto out;
2520 }
2521 remove_backref_node(cache, node);
2522 rb_node = rb_next(rb_node);
2523 }
2524 free_block_list(blocks);
2525 2830
2526 if (upper) { 2831 while (nr < cluster->nr) {
2527 ret = link_to_upper(trans, upper, path); 2832 start = cluster->boundary[nr] - offset;
2528 if (ret < 0) { 2833 if (nr + 1 < cluster->nr)
2529 err = ret; 2834 end = cluster->boundary[nr + 1] - 1 - offset;
2530 break; 2835 else
2531 } 2836 end = cluster->end - offset;
2532 remove_backref_node(cache, upper); 2837
2533 } 2838 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2839 num_bytes = end + 1 - start;
2840 ret = btrfs_prealloc_file_range(inode, 0, start,
2841 num_bytes, num_bytes,
2842 end + 1, &alloc_hint);
2843 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2844 if (ret)
2845 break;
2846 nr++;
2534 } 2847 }
2848 btrfs_free_reserved_data_space(inode, cluster->end +
2849 1 - cluster->start);
2535out: 2850out:
2536 free_block_list(blocks); 2851 mutex_unlock(&inode->i_mutex);
2537 2852 return ret;
2538 ret = finish_pending_nodes(trans, cache, path);
2539 if (ret < 0)
2540 err = ret;
2541
2542 kfree(cache);
2543 btrfs_free_path(path);
2544 return err;
2545} 2853}
2546 2854
2547static noinline_for_stack 2855static noinline_for_stack
@@ -2587,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2587 u64 offset = BTRFS_I(inode)->index_cnt; 2895 u64 offset = BTRFS_I(inode)->index_cnt;
2588 unsigned long index; 2896 unsigned long index;
2589 unsigned long last_index; 2897 unsigned long last_index;
2590 unsigned int dirty_page = 0;
2591 struct page *page; 2898 struct page *page;
2592 struct file_ra_state *ra; 2899 struct file_ra_state *ra;
2593 int nr = 0; 2900 int nr = 0;
@@ -2600,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2600 if (!ra) 2907 if (!ra)
2601 return -ENOMEM; 2908 return -ENOMEM;
2602 2909
2603 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2910 ret = prealloc_file_extent_cluster(inode, cluster);
2604 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2911 if (ret)
2912 goto out;
2605 2913
2606 mutex_lock(&inode->i_mutex); 2914 file_ra_state_init(ra, inode->i_mapping);
2607 2915
2608 i_size_write(inode, cluster->end + 1 - offset);
2609 ret = setup_extent_mapping(inode, cluster->start - offset, 2916 ret = setup_extent_mapping(inode, cluster->start - offset,
2610 cluster->end - offset, cluster->start); 2917 cluster->end - offset, cluster->start);
2611 if (ret) 2918 if (ret)
2612 goto out_unlock; 2919 goto out;
2613
2614 file_ra_state_init(ra, inode->i_mapping);
2615 2920
2616 WARN_ON(cluster->start != cluster->boundary[0]); 2921 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2922 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2617 while (index <= last_index) { 2923 while (index <= last_index) {
2924 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2925 if (ret)
2926 goto out;
2927
2618 page = find_lock_page(inode->i_mapping, index); 2928 page = find_lock_page(inode->i_mapping, index);
2619 if (!page) { 2929 if (!page) {
2620 page_cache_sync_readahead(inode->i_mapping, 2930 page_cache_sync_readahead(inode->i_mapping,
@@ -2622,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2622 last_index + 1 - index); 2932 last_index + 1 - index);
2623 page = grab_cache_page(inode->i_mapping, index); 2933 page = grab_cache_page(inode->i_mapping, index);
2624 if (!page) { 2934 if (!page) {
2935 btrfs_delalloc_release_metadata(inode,
2936 PAGE_CACHE_SIZE);
2625 ret = -ENOMEM; 2937 ret = -ENOMEM;
2626 goto out_unlock; 2938 goto out;
2627 } 2939 }
2628 } 2940 }
2629 2941
@@ -2639,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2639 if (!PageUptodate(page)) { 2951 if (!PageUptodate(page)) {
2640 unlock_page(page); 2952 unlock_page(page);
2641 page_cache_release(page); 2953 page_cache_release(page);
2954 btrfs_delalloc_release_metadata(inode,
2955 PAGE_CACHE_SIZE);
2642 ret = -EIO; 2956 ret = -EIO;
2643 goto out_unlock; 2957 goto out;
2644 } 2958 }
2645 } 2959 }
2646 2960
@@ -2659,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2973 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2974 nr++;
2661 } 2975 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2976
2977 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 set_page_dirty(page); 2978 set_page_dirty(page);
2665 dirty_page++;
2666 2979
2667 unlock_extent(&BTRFS_I(inode)->io_tree, 2980 unlock_extent(&BTRFS_I(inode)->io_tree,
2668 page_start, page_end, GFP_NOFS); 2981 page_start, page_end, GFP_NOFS);
@@ -2670,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2670 page_cache_release(page); 2983 page_cache_release(page);
2671 2984
2672 index++; 2985 index++;
2673 if (nr < cluster->nr && 2986 balance_dirty_pages_ratelimited(inode->i_mapping);
2674 page_end + 1 + offset == cluster->boundary[nr]) { 2987 btrfs_throttle(BTRFS_I(inode)->root);
2675 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676 dirty_page);
2677 dirty_page = 0;
2678 }
2679 }
2680 if (dirty_page) {
2681 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682 dirty_page);
2683 } 2988 }
2684 WARN_ON(nr != cluster->nr); 2989 WARN_ON(nr != cluster->nr);
2685out_unlock: 2990out:
2686 mutex_unlock(&inode->i_mutex);
2687 kfree(ra); 2991 kfree(ra);
2688 return ret; 2992 return ret;
2689} 2993}
@@ -2869,9 +3173,6 @@ out:
2869static int block_use_full_backref(struct reloc_control *rc, 3173static int block_use_full_backref(struct reloc_control *rc,
2870 struct extent_buffer *eb) 3174 struct extent_buffer *eb)
2871{ 3175{
2872 struct btrfs_path *path;
2873 struct btrfs_extent_item *ei;
2874 struct btrfs_key key;
2875 u64 flags; 3176 u64 flags;
2876 int ret; 3177 int ret;
2877 3178
@@ -2879,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2879 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3180 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880 return 1; 3181 return 1;
2881 3182
2882 path = btrfs_alloc_path(); 3183 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2883 BUG_ON(!path); 3184 eb->start, eb->len, NULL, &flags);
2884
2885 key.objectid = eb->start;
2886 key.type = BTRFS_EXTENT_ITEM_KEY;
2887 key.offset = eb->len;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, rc->extent_root,
2892 &key, path, 0, 0);
2893 BUG_ON(ret); 3185 BUG_ON(ret);
2894 3186
2895 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896 struct btrfs_extent_item);
2897 flags = btrfs_extent_flags(path->nodes[0], ei);
2898 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3187 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900 ret = 1; 3188 ret = 1;
2901 else 3189 else
2902 ret = 0; 3190 ret = 0;
2903 btrfs_free_path(path);
2904 return ret; 3191 return ret;
2905} 3192}
2906 3193
@@ -3073,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
3073 struct btrfs_extent_inline_ref *iref; 3360 struct btrfs_extent_inline_ref *iref;
3074 unsigned long ptr; 3361 unsigned long ptr;
3075 unsigned long end; 3362 unsigned long end;
3076 u32 blocksize; 3363 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3077 int ret; 3364 int ret;
3078 int err = 0; 3365 int err = 0;
3079 3366
3080 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081 extent_key->offset);
3082 BUG_ON(ret < 0);
3083 if (ret > 0) {
3084 /* the relocated data is fragmented */
3085 rc->extents_skipped++;
3086 btrfs_release_path(rc->extent_root, path);
3087 return 0;
3088 }
3089
3090 blocksize = btrfs_level_size(rc->extent_root, 0);
3091
3092 eb = path->nodes[0]; 3367 eb = path->nodes[0];
3093 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3368 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3369 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3169,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
3169 */ 3444 */
3170static noinline_for_stack 3445static noinline_for_stack
3171int find_next_extent(struct btrfs_trans_handle *trans, 3446int find_next_extent(struct btrfs_trans_handle *trans,
3172 struct reloc_control *rc, struct btrfs_path *path) 3447 struct reloc_control *rc, struct btrfs_path *path,
3448 struct btrfs_key *extent_key)
3173{ 3449{
3174 struct btrfs_key key; 3450 struct btrfs_key key;
3175 struct extent_buffer *leaf; 3451 struct extent_buffer *leaf;
@@ -3224,6 +3500,7 @@ next:
3224 rc->search_start = end + 1; 3500 rc->search_start = end + 1;
3225 } else { 3501 } else {
3226 rc->search_start = key.objectid + key.offset; 3502 rc->search_start = key.objectid + key.offset;
3503 memcpy(extent_key, &key, sizeof(key));
3227 return 0; 3504 return 0;
3228 } 3505 }
3229 } 3506 }
@@ -3261,12 +3538,49 @@ static int check_extent_flags(u64 flags)
3261 return 0; 3538 return 0;
3262} 3539}
3263 3540
3541static noinline_for_stack
3542int prepare_to_relocate(struct reloc_control *rc)
3543{
3544 struct btrfs_trans_handle *trans;
3545 int ret;
3546
3547 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3548 if (!rc->block_rsv)
3549 return -ENOMEM;
3550
3551 /*
3552 * reserve some space for creating reloc trees.
3553 * btrfs_init_reloc_root will use them when there
3554 * is no reservation in transaction handle.
3555 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256,
3558 &rc->block_rsv_retries);
3559 if (ret)
3560 return ret;
3561
3562 rc->block_rsv->refill_used = 1;
3563 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3564
3565 memset(&rc->cluster, 0, sizeof(rc->cluster));
3566 rc->search_start = rc->block_group->key.objectid;
3567 rc->extents_found = 0;
3568 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571
3572 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc);
3574
3575 trans = btrfs_join_transaction(rc->extent_root, 1);
3576 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0;
3578}
3264 3579
3265static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3580static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266{ 3581{
3267 struct rb_root blocks = RB_ROOT; 3582 struct rb_root blocks = RB_ROOT;
3268 struct btrfs_key key; 3583 struct btrfs_key key;
3269 struct file_extent_cluster *cluster;
3270 struct btrfs_trans_handle *trans = NULL; 3584 struct btrfs_trans_handle *trans = NULL;
3271 struct btrfs_path *path; 3585 struct btrfs_path *path;
3272 struct btrfs_extent_item *ei; 3586 struct btrfs_extent_item *ei;
@@ -3276,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3276 int ret; 3590 int ret;
3277 int err = 0; 3591 int err = 0;
3278 3592
3279 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280 if (!cluster)
3281 return -ENOMEM;
3282
3283 path = btrfs_alloc_path(); 3593 path = btrfs_alloc_path();
3284 if (!path) { 3594 if (!path)
3285 kfree(cluster);
3286 return -ENOMEM; 3595 return -ENOMEM;
3287 }
3288
3289 rc->extents_found = 0;
3290 rc->extents_skipped = 0;
3291
3292 rc->search_start = rc->block_group->key.objectid;
3293 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3294 GFP_NOFS);
3295
3296 rc->create_reloc_root = 1;
3297 set_reloc_control(rc);
3298 3596
3299 trans = btrfs_start_transaction(rc->extent_root, 1); 3597 ret = prepare_to_relocate(rc);
3300 btrfs_commit_transaction(trans, rc->extent_root); 3598 if (ret) {
3599 err = ret;
3600 goto out_free;
3601 }
3301 3602
3302 while (1) { 3603 while (1) {
3303 trans = btrfs_start_transaction(rc->extent_root, 1); 3604 trans = btrfs_start_transaction(rc->extent_root, 0);
3605
3606 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root);
3608 continue;
3609 }
3304 3610
3305 ret = find_next_extent(trans, rc, path); 3611 ret = find_next_extent(trans, rc, path, &key);
3306 if (ret < 0) 3612 if (ret < 0)
3307 err = ret; 3613 err = ret;
3308 if (ret != 0) 3614 if (ret != 0)
@@ -3312,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3312 3618
3313 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3619 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3314 struct btrfs_extent_item); 3620 struct btrfs_extent_item);
3315 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3621 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3316 item_size = btrfs_item_size_nr(path->nodes[0],
3317 path->slots[0]);
3318 if (item_size >= sizeof(*ei)) { 3622 if (item_size >= sizeof(*ei)) {
3319 flags = btrfs_extent_flags(path->nodes[0], ei); 3623 flags = btrfs_extent_flags(path->nodes[0], ei);
3320 ret = check_extent_flags(flags); 3624 ret = check_extent_flags(flags);
@@ -3355,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3355 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3356 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3357 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3358 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3359 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3360 } else { 3664 } else {
3361 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3362 ret = 0; 3666 ret = 0;
3363 } 3667 }
3364 if (ret < 0) { 3668 if (ret < 0) {
3365 err = 0; 3669 err = ret;
3366 break; 3670 break;
3367 } 3671 }
3368 3672
3369 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3370 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3371 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3372 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3373 break; 3691 break;
3374 } 3692 }
3693 rc->commit_transaction = 1;
3375 } 3694 }
3376 3695
3377 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3378 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3379 trans = NULL; 3705 trans = NULL;
3380 btrfs_btree_balance_dirty(rc->extent_root, nr);
3381 3706
3382 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3383 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3384 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3385 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3386 &key, cluster); 3711 &key, &rc->cluster);
3387 if (ret < 0) { 3712 if (ret < 0) {
3388 err = ret; 3713 err = ret;
3389 break; 3714 break;
3390 } 3715 }
3391 } 3716 }
3392 } 3717 }
3393 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3394 3722
3395 if (trans) { 3723 if (trans) {
3396 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3397 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3398 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3399 } 3727 }
3400 3728
3401 if (!err) { 3729 if (!err) {
3402 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3403 if (ret < 0) 3732 if (ret < 0)
3404 err = ret; 3733 err = ret;
3405 } 3734 }
3406 3735
3407 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3408 3738
3409 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3410 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3411 3741
3412 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3413 trans = btrfs_start_transaction(rc->extent_root, 1);
3414 btrfs_commit_transaction(trans, rc->extent_root);
3415 }
3416 3743
3417 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3418 3745
3746 rc->merge_reloc_tree = 0;
3419 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3420 3749
3421 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3422 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3423 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3424 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3425 return err; 3756 return err;
3426} 3757}
3427 3758
@@ -3447,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3447 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3448 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3449 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3450 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3451 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3452 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3453out: 3785out:
@@ -3459,8 +3791,9 @@ out:
3459 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3460 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3461 */ 3793 */
3462static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3463 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3464{ 3797{
3465 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3466 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3474,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3474 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3475 return ERR_CAST(root); 3808 return ERR_CAST(root);
3476 3809
3477 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3478 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3479 3813
3480 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3481 if (err) 3815 if (err)
@@ -3495,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3495out: 3829out:
3496 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3497 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3498
3499 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3500 if (err) { 3833 if (err) {
3501 if (inode) 3834 if (inode)
@@ -3505,6 +3838,21 @@ out:
3505 return inode; 3838 return inode;
3506} 3839}
3507 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3508/* 3856/*
3509 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3510 */ 3858 */
@@ -3513,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3513 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3514 struct reloc_control *rc; 3862 struct reloc_control *rc;
3515 int ret; 3863 int ret;
3864 int rw = 0;
3516 int err = 0; 3865 int err = 0;
3517 3866
3518 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3519 if (!rc) 3868 if (!rc)
3520 return -ENOMEM; 3869 return -ENOMEM;
3521 3870
3522 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3523 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3524 INIT_LIST_HEAD(&rc->reloc_roots);
3525 3872
3526 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3527 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3528 3875
3529 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3530 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3531 3878 if (ret) {
3532 rc->extent_root = extent_root; 3879 err = ret;
3533 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3534 3884
3535 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3536 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3547,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3547 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3548 3898
3549 while (1) { 3899 while (1) {
3550 rc->extents_found = 0;
3551 rc->extents_skipped = 0;
3552
3553 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3554 3901
3555 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3558,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3558 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3559 if (ret < 0) { 3906 if (ret < 0) {
3560 err = ret; 3907 err = ret;
3561 break; 3908 goto out;
3562 } 3909 }
3563 3910
3564 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3572,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3572 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3573 0, -1); 3920 0, -1);
3574 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3575 } else if (rc->stage == UPDATE_DATA_PTRS &&
3576 rc->extents_skipped >= rc->extents_found) {
3577 iput(rc->data_inode);
3578 rc->data_inode = create_reloc_inode(fs_info,
3579 rc->block_group);
3580 if (IS_ERR(rc->data_inode)) {
3581 err = PTR_ERR(rc->data_inode);
3582 rc->data_inode = NULL;
3583 break;
3584 }
3585 rc->stage = MOVE_DATA_EXTENTS;
3586 rc->found_file_extent = 0;
3587 } 3922 }
3588 } 3923 }
3589 3924
@@ -3596,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3596 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3597 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3598out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3599 iput(rc->data_inode); 3936 iput(rc->data_inode);
3600 btrfs_stop_workers(&rc->workers);
3601 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3602 kfree(rc); 3938 kfree(rc);
3603 return err; 3939 return err;
@@ -3608,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3608 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3609 int ret; 3945 int ret;
3610 3946
3611 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3612 3948
3613 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3614 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3701,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3702 goto out; 4038 goto out;
3703 4039
3704 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3705 if (!rc) { 4041 if (!rc) {
3706 err = -ENOMEM; 4042 err = -ENOMEM;
3707 goto out; 4043 goto out;
3708 } 4044 }
3709 4045
3710 mapping_tree_init(&rc->reloc_root_tree);
3711 INIT_LIST_HEAD(&rc->reloc_roots);
3712 btrfs_init_workers(&rc->workers, "relocate",
3713 root->fs_info->thread_pool_size, NULL);
3714 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3715 4047
3716 set_reloc_control(rc); 4048 set_reloc_control(rc);
3717 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3718 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3719 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3720 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3734,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3734 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3735 } 4071 }
3736 4072
3737 trans = btrfs_start_transaction(rc->extent_root, 1);
3738 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3739 4074
3740 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3741 4076
3742 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3743 4078
3744 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3745 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3746out: 4081out:
3747 if (rc) { 4082 kfree(rc);
3748 btrfs_stop_workers(&rc->workers);
3749 kfree(rc);
3750 }
3751 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3752 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3753 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3813,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3813 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3814 return 0; 4146 return 0;
3815} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * requried for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230 * tree is dropped. if we create snapshot, cowing the fs
4231 * tree may use more space than it frees. so we need
4232 * reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
4236
4237/*
4238 * called after snapshot is created. migrate block reservation
4239 * and create reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ac612e6ca60..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -64,10 +65,9 @@ static void btrfs_put_super(struct super_block *sb)
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
@@ -79,7 +79,6 @@ static match_table_t tokens = {
79 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
80 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
81 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
82 {Opt_max_extent, "max_extent=%s"},
83 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -188,18 +187,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
188 info->thread_pool_size); 187 info->thread_pool_size);
189 } 188 }
190 break; 189 break;
191 case Opt_max_extent:
192 num = match_strdup(&args[0]);
193 if (num) {
194 info->max_extent = memparse(num, NULL);
195 kfree(num);
196
197 info->max_extent = max_t(u64,
198 info->max_extent, root->sectorsize);
199 printk(KERN_INFO "btrfs: max_extent at %llu\n",
200 (unsigned long long)info->max_extent);
201 }
202 break;
203 case Opt_max_inline: 190 case Opt_max_inline:
204 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
205 if (num) { 192 if (num) {
@@ -373,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
373 */ 360 */
374 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
375 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
376 if (!di) { 365 if (!di) {
377 /* 366 /*
378 * Ok the default dir item isn't there. This is weird since 367 * Ok the default dir item isn't there. This is weird since
@@ -403,8 +392,8 @@ setup_root:
403 location.offset = 0; 392 location.offset = 0;
404 393
405 inode = btrfs_iget(sb, &location, new_root, &new); 394 inode = btrfs_iget(sb, &location, new_root, &new);
406 if (!inode) 395 if (IS_ERR(inode))
407 return ERR_PTR(-ENOMEM); 396 return ERR_CAST(inode);
408 397
409 /* 398 /*
410 * If we're just mounting the root most subvol put the inode and return 399 * If we're just mounting the root most subvol put the inode and return
@@ -511,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
511 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
512 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
513 502
514 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
515 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
516 return ret; 505 return ret;
517} 506}
@@ -529,9 +518,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
529 seq_puts(seq, ",nodatacow"); 518 seq_puts(seq, ",nodatacow");
530 if (btrfs_test_opt(root, NOBARRIER)) 519 if (btrfs_test_opt(root, NOBARRIER))
531 seq_puts(seq, ",nobarrier"); 520 seq_puts(seq, ",nobarrier");
532 if (info->max_extent != (u64)-1)
533 seq_printf(seq, ",max_extent=%llu",
534 (unsigned long long)info->max_extent);
535 if (info->max_inline != 8192 * 1024) 521 if (info->max_inline != 8192 * 1024)
536 seq_printf(seq, ",max_inline=%llu", 522 seq_printf(seq, ",max_inline=%llu",
537 (unsigned long long)info->max_inline); 523 (unsigned long long)info->max_inline);
@@ -710,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
710 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
711 return -EINVAL; 697 return -EINVAL;
712 698
713 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
714 ret = btrfs_recover_relocation(root);
715 WARN_ON(ret); 700 WARN_ON(ret);
716 701
717 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
718 WARN_ON(ret); 704 WARN_ON(ret);
719 705
720 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -730,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
730 struct list_head *head = &root->fs_info->space_info; 716 struct list_head *head = &root->fs_info->space_info;
731 struct btrfs_space_info *found; 717 struct btrfs_space_info *found;
732 u64 total_used = 0; 718 u64 total_used = 0;
733 u64 data_used = 0;
734 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
735 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
736 721
737 rcu_read_lock(); 722 rcu_read_lock();
738 list_for_each_entry_rcu(found, head, list) { 723 list_for_each_entry_rcu(found, head, list)
739 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 724 total_used += found->disk_used;
740 BTRFS_BLOCK_GROUP_RAID10|
741 BTRFS_BLOCK_GROUP_RAID1)) {
742 total_used += found->bytes_used;
743 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
744 data_used += found->bytes_used;
745 else
746 data_used += found->total_bytes;
747 }
748
749 total_used += found->bytes_used;
750 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
751 data_used += found->bytes_used;
752 else
753 data_used += found->total_bytes;
754 }
755 rcu_read_unlock(); 725 rcu_read_unlock();
756 726
757 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
758 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
759 buf->f_bfree = buf->f_blocks - (total_used >> bits); 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
760 buf->f_bavail = buf->f_blocks - (data_used >> bits); 730 buf->f_bavail = buf->f_bfree;
761 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
762 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
763 733
@@ -848,11 +818,14 @@ static const struct file_operations btrfs_ctl_fops = {
848}; 818};
849 819
850static struct miscdevice btrfs_misc = { 820static struct miscdevice btrfs_misc = {
851 .minor = MISC_DYNAMIC_MINOR, 821 .minor = BTRFS_MINOR,
852 .name = "btrfs-control", 822 .name = "btrfs-control",
853 .fops = &btrfs_ctl_fops 823 .fops = &btrfs_ctl_fops
854}; 824};
855 825
826MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
827MODULE_ALIAS("devname:btrfs-control");
828
856static int btrfs_interface_init(void) 829static int btrfs_interface_init(void)
857{ 830{
858 return misc_register(&btrfs_misc); 831 return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d654c1c794d..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -169,54 +165,89 @@ enum btrfs_trans_type {
169 TRANS_USERSPACE, 165 TRANS_USERSPACE,
170}; 166};
171 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
173 int num_blocks, int type) 178 u64 num_items, int type)
174{ 179{
175 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
177 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
178 188
179 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
180 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE))
183 wait_current_trans(root); 191 wait_current_trans(root);
192
184 ret = join_transaction(root); 193 ret = join_transaction(root);
185 BUG_ON(ret); 194 BUG_ON(ret);
186 195
187 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
188 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
189 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
190 h->blocks_used = 0; 202 h->blocks_used = 0;
191 h->block_group = 0; 203 h->block_group = 0;
192 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
195 207
196 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
197 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
198 226
199 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
200 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
201 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
202 return h; 233 return h;
203} 234}
204 235
205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
206 int num_blocks) 237 int num_items)
207{ 238{
208 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
209} 240}
210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
211 int num_blocks) 242 int num_blocks)
212{ 243{
213 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
214} 245}
215 246
216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
217 int num_blocks) 248 int num_blocks)
218{ 249{
219 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
220} 251}
221 252
222/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -290,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
290 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
291} 322}
292 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
293static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
294 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
295{ 352{
296 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
297 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
298 int count = 0; 355 int count = 0;
299 356
@@ -317,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 count++; 374 count++;
318 } 375 }
319 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
320 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
321 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
322 WARN_ON(cur_trans != trans->transaction);
323 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
324 cur_trans->num_writers--; 393 cur_trans->num_writers--;
325 394
@@ -607,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
607 676
608 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
609 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
610 680
611 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
612 switch_commit_root(root); 682 switch_commit_root(root);
@@ -631,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
631int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
632{ 702{
633 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
634 int ret;
635 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
636 unsigned long nr; 706 unsigned long nr;
637 707
638 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
639 if (root->defrag_running)
640 return 0; 709 return 0;
641 trans = btrfs_start_transaction(root, 1); 710
642 while (1) { 711 while (1) {
643 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
644 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
645 nr = trans->blocks_used; 718 nr = trans->blocks_used;
646 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
647 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
648 cond_resched(); 721 cond_resched();
649 722
650 trans = btrfs_start_transaction(root, 1);
651 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
652 break; 724 break;
653 } 725 }
654 root->defrag_running = 0; 726 root->defrag_running = 0;
655 smp_mb(); 727 return ret;
656 btrfs_end_transaction(trans, root);
657 return 0;
658} 728}
659 729
660#if 0 730#if 0
@@ -760,29 +830,72 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 830 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 831 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
833 struct btrfs_root *parent_root;
834 struct inode *parent_inode;
835 struct dentry *dentry;
763 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
764 struct extent_buffer *old; 837 struct extent_buffer *old;
765 int ret; 838 int ret;
839 int retries = 0;
840 u64 to_reserve = 0;
841 u64 index = 0;
766 u64 objectid; 842 u64 objectid;
767 843
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 845 if (!new_root_item) {
770 ret = -ENOMEM; 846 pending->error = -ENOMEM;
771 goto fail; 847 goto fail;
772 } 848 }
849
773 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
774 if (ret) 851 if (ret) {
852 pending->error = ret;
775 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
867
868 key.objectid = objectid;
869 key.offset = (u64)-1;
870 key.type = BTRFS_ROOT_ITEM_KEY;
871
872 trans->block_rsv = &pending->block_rsv;
873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
877 record_root_in_trans(trans, parent_root);
878
879 /*
880 * insert the directory item
881 */
882 ret = btrfs_set_inode_index(parent_inode, &index);
883 BUG_ON(ret);
884 ret = btrfs_insert_dir_item(trans, parent_root,
885 dentry->d_name.name, dentry->d_name.len,
886 parent_inode->i_ino, &key,
887 BTRFS_FT_DIR, index);
888 BUG_ON(ret);
889
890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
893 BUG_ON(ret);
776 894
777 record_root_in_trans(trans, root); 895 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 896 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 897 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780 898
781 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785
786 old = btrfs_lock_root_node(root); 899 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old); 900 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old); 901 btrfs_set_lock_blocking(old);
@@ -792,62 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
792 free_extent_buffer(old); 905 free_extent_buffer(old);
793 906
794 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
796 new_root_item); 909 key.offset = trans->transid;
910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
797 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
799 if (ret) 913 BUG_ON(ret);
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key));
804fail:
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 914
823 /* 915 /*
824 * insert the directory item 916 * insert root back/forward references
825 */ 917 */
826 namelen = strlen(pending->name); 918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
827 ret = btrfs_set_inode_index(parent_inode, &index);
828 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen,
830 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index);
832
833 if (ret)
834 goto fail;
835
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret);
839
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid,
842 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
844 namelen); 921 dentry->d_name.name, dentry->d_name.len);
845
846 BUG_ON(ret); 922 BUG_ON(ret);
847 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
848fail: 930fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 931 kfree(new_root_item);
850 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
851} 934}
852 935
853/* 936/*
@@ -867,25 +950,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 950 return 0;
868} 951}
869 952
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 953static void update_super_roots(struct btrfs_root *root)
890{ 954{
891 struct btrfs_root_item *root_item; 955 struct btrfs_root_item *root_item;
@@ -914,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
914 return ret; 978 return ret;
915} 979}
916 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
917int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
918 struct btrfs_root *root) 992 struct btrfs_root *root)
919{ 993{
@@ -935,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
935 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
936 BUG_ON(ret); 1010 BUG_ON(ret);
937 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
938 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
939 /* 1015 /*
940 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -987,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
987 snap_pending = 1; 1063 snap_pending = 1;
988 1064
989 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
990 prepare_to_wait(&cur_trans->writer_wait, &wait,
991 TASK_UNINTERRUPTIBLE);
992
993 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
994 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
995 else if (should_grow) 1068 else if (should_grow)
@@ -1012,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1012 */ 1085 */
1013 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
1014 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
1015 smp_mb(); 1091 smp_mb();
1016 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
1017 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1097,9 +1173,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1097 1173
1098 btrfs_finish_extent_commit(trans, root); 1174 btrfs_finish_extent_commit(trans, root);
1099 1175
1100 /* do the directory inserts of any pending snapshot creations */
1101 finish_pending_snapshots(trans, root->fs_info);
1102
1103 mutex_lock(&root->fs_info->trans_mutex); 1176 mutex_lock(&root->fs_info->trans_mutex);
1104 1177
1105 cur_trans->commit_done = 1; 1178 cur_trans->commit_done = 1;
@@ -1142,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1142 1215
1143 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1144 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1145 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1146 else 1219 else
1147 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1148 } 1221 }
1149 return 0; 1222 return 0;
1150} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reseration for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -134,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root) 135 struct btrfs_root *root)
135{ 136{
136 int ret; 137 int ret;
138 int err = 0;
137 139
138 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 141 if (root->log_root) {
@@ -154,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
154 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
155 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
157 BUG_ON(ret); 159 if (ret)
160 err = ret;
158 } 161 }
159 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
160 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
161 BUG_ON(ret); 164 if (ret)
165 err = ret;
162 } 166 }
163 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
164 root->log_batch++; 168 root->log_batch++;
165 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
166 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
167 return 0; 171 return err;
168} 172}
169 173
170/* 174/*
@@ -375,7 +379,7 @@ insert:
375 BUG_ON(ret); 379 BUG_ON(ret);
376 } 380 }
377 } else if (ret) { 381 } else if (ret) {
378 BUG(); 382 return ret;
379 } 383 }
380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
381 path->slots[0]); 385 path->slots[0]);
@@ -1698,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1702
1699 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1700 1704
1701 wc->process_func(root, next, wc, ptr_gen);
1702
1703 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1704 path->slots[*level]++; 1708 path->slots[*level]++;
1705 if (wc->free) { 1709 if (wc->free) {
1706 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1733,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1733 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1734 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1735 1739
1736 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1737 parent = path->nodes[*level];
1738 else
1739 parent = path->nodes[*level + 1];
1740
1741 bytenr = path->nodes[*level]->start;
1742
1743 blocksize = btrfs_level_size(root, *level);
1744 root_owner = btrfs_header_owner(parent);
1745 root_gen = btrfs_header_generation(parent);
1746
1747 wc->process_func(root, path->nodes[*level], wc,
1748 btrfs_header_generation(path->nodes[*level]));
1749
1750 if (wc->free) {
1751 next = path->nodes[*level];
1752 btrfs_tree_lock(next);
1753 clean_tree_block(trans, root, next);
1754 btrfs_set_lock_blocking(next);
1755 btrfs_wait_tree_block_writeback(next);
1756 btrfs_tree_unlock(next);
1757
1758 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1759 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1760 BUG_ON(ret);
1761 }
1762 free_extent_buffer(path->nodes[*level]);
1763 path->nodes[*level] = NULL;
1764 *level += 1;
1765 1741
1766 cond_resched(); 1742 cond_resched();
1767 return 0; 1743 return 0;
@@ -1780,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1780 1756
1781 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1782 slot = path->slots[i]; 1758 slot = path->slots[i];
1783 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1784 struct extent_buffer *node; 1760 struct extent_buffer *node;
1785 node = path->nodes[i]; 1761 node = path->nodes[i];
1786 path->slots[i]++; 1762 path->slots[i]++;
@@ -2046,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2046 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2047 2023
2048 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2049 BUG_ON(ret);
2050 2025
2051 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2052 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2055,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2056 } 2031 }
2057 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2058 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2059 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2128,15 +2112,10 @@ out:
2128 return 0; 2112 return 0;
2129} 2113}
2130 2114
2131/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2132 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2133 * at commit time of the full transaction
2134 */
2135int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2136{ 2117{
2137 int ret; 2118 int ret;
2138 struct btrfs_root *log;
2139 struct key;
2140 u64 start; 2119 u64 start;
2141 u64 end; 2120 u64 end;
2142 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2144,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2144 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2145 }; 2124 };
2146 2125
2147 if (!root->log_root || root->fs_info->log_root_recovering)
2148 return 0;
2149
2150 log = root->log_root;
2151 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2152 BUG_ON(ret); 2127 BUG_ON(ret);
2153 2128
@@ -2161,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2162 } 2137 }
2163 2138
2164 if (log->log_transid > 0) {
2165 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2166 &log->root_key);
2167 BUG_ON(ret);
2168 }
2169 root->log_root = NULL;
2170 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2171 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2172 return 0; 2163 return 0;
2173} 2164}
2174 2165
@@ -2202,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2202 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2203 struct btrfs_path *path; 2194 struct btrfs_path *path;
2204 int ret; 2195 int ret;
2196 int err = 0;
2205 int bytes_del = 0; 2197 int bytes_del = 0;
2206 2198
2207 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2217,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2217 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2218 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2219 name, name_len, -1); 2211 name, name_len, -1);
2220 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2221 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2222 bytes_del += name_len; 2218 bytes_del += name_len;
2223 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2225,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2225 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2226 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2227 index, name, name_len, -1); 2223 index, name, name_len, -1);
2228 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2229 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2230 bytes_del += name_len; 2230 bytes_del += name_len;
2231 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2243,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2243 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2244 2244
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2246 if (ret == 0) { 2250 if (ret == 0) {
2247 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2248 u64 i_size; 2252 u64 i_size;
@@ -2260,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2260 ret = 0; 2264 ret = 0;
2261 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2262 } 2266 }
2263 2267fail:
2264 btrfs_free_path(path); 2268 btrfs_free_path(path);
2265 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2266 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2267 2275
2268 return 0; 2276 return 0;
@@ -2290,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2290 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2291 dirid, &index); 2299 dirid, &index);
2292 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2293 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2294 2306
2295 return ret; 2307 return ret;
@@ -2317,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2317 else 2329 else
2318 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2319 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2320 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2321 2334
2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2323 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2342,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2342 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2343 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2344 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2345 int ret; 2359 int ret;
2346 int i; 2360 int i;
2347 int nritems; 2361 int nritems;
@@ -2404,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2404 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2405 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2406 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2407 } 2425 }
2408 } 2426 }
2409 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2431,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2431 goto done; 2449 goto done;
2432 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2433 &min_key); 2451 &min_key);
2434 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2435 } 2456 }
2436 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2437 2458
@@ -2453,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2453 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2454 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2455 &tmp); 2476 &tmp);
2456 2477 if (ret)
2457 BUG_ON(ret); 2478 err = ret;
2458 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2459 goto done; 2481 goto done;
2460 } 2482 }
2461 } 2483 }
2462done: 2484done:
2463 *last_offset_ret = last_offset;
2464 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2465 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2466 2487
2467 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2468 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2469 first_offset, last_offset); 2490 /*
2470 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2471 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2472} 2501}
2473 2502
2474/* 2503/*
@@ -2500,7 +2529,8 @@ again:
2500 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2501 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2502 &max_key); 2531 &max_key);
2503 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2504 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2505 break; 2535 break;
2506 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2534,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2534 2564
2535 while (1) { 2565 while (1) {
2536 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2537 2567 BUG_ON(ret == 0);
2538 if (ret != 1) 2568 if (ret < 0)
2539 break; 2569 break;
2540 2570
2541 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2553,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2553 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2554 } 2584 }
2555 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2556 return 0; 2586 return ret;
2557} 2587}
2558 2588
2559static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2586,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2586 } 2616 }
2587 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2588 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2589 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2590 2623
2591 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2592 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2659,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2659 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2660 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2661 */ 2694 */
2695 ret = 0;
2662 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2664 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2665 list); 2699 list);
2666 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2667 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2668 list_del(&sums->list); 2702 list_del(&sums->list);
2669 kfree(sums); 2703 kfree(sums);
2670 } 2704 }
2671 return 0; 2705 return ret;
2672} 2706}
2673 2707
2674/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2696,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2696 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2697 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2698 u32 size; 2732 u32 size;
2733 int err = 0;
2699 int ret; 2734 int ret;
2700 int nritems; 2735 int nritems;
2701 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2738,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2738 } else { 2773 } else {
2739 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2740 } 2775 }
2741 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2742 path->keep_locks = 1; 2780 path->keep_locks = 1;
2743 2781
2744 while (1) { 2782 while (1) {
@@ -2767,7 +2805,10 @@ again:
2767 2805
2768 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2769 ins_nr, inode_only); 2807 ins_nr, inode_only);
2770 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2771 ins_nr = 1; 2812 ins_nr = 1;
2772 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2773next_slot: 2814next_slot:
@@ -2783,7 +2824,10 @@ next_slot:
2783 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2784 ins_start_slot, 2825 ins_start_slot,
2785 ins_nr, inode_only); 2826 ins_nr, inode_only);
2786 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2787 ins_nr = 0; 2831 ins_nr = 0;
2788 } 2832 }
2789 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2801,7 +2845,10 @@ next_slot:
2801 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2802 ins_start_slot, 2846 ins_start_slot,
2803 ins_nr, inode_only); 2847 ins_nr, inode_only);
2804 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2805 ins_nr = 0; 2852 ins_nr = 0;
2806 } 2853 }
2807 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2809,14 +2856,18 @@ next_slot:
2809 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2810 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2811 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2812 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2813 } 2863 }
2814 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2815 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2816 2867
2817 btrfs_free_path(path); 2868 btrfs_free_path(path);
2818 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2819 return 0; 2870 return err;
2820} 2871}
2821 2872
2822/* 2873/*
@@ -2941,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2941 goto end_no_trans; 2992 goto end_no_trans;
2942 } 2993 }
2943 2994
2944 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2945 2998
2946 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2947 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2948 3002
2949 /* 3003 /*
2950 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2954,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2954 */ 3008 */
2955 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2956 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2957 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2958 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2959 3015
2960 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2961 while (1) { 3017 while (1) {
@@ -2969,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2969 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2970 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2971 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2972 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2973 } 3030 }
2974 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2975 break; 3032 break;
2976 3033
2977 parent = parent->d_parent; 3034 parent = parent->d_parent;
2978 } 3035 }
2979no_parent:
2980 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2981 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2982end_no_trans: 3044end_no_trans:
2983 return ret; 3045 return ret;
@@ -3019,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3019 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3020 BUG_ON(!path); 3082 BUG_ON(!path);
3021 3083
3022 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3023 3085
3024 wc.trans = trans; 3086 wc.trans = trans;
3025 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9df8e3f1ccab..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
@@ -1096,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1096 if (!path) 1097 if (!path)
1097 return -ENOMEM; 1098 return -ENOMEM;
1098 1099
1099 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1100 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1101 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1102 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1485,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1485 goto error; 1486 goto error;
1486 } 1487 }
1487 1488
1488 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1489 lock_chunks(root); 1490 lock_chunks(root);
1490 1491
1491 device->barriers = 1; 1492 device->barriers = 1;
@@ -1750,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1750 1751
1751 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1752 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1753 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1754 1756
1755 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1756 BUG_ON(!trans); 1758 BUG_ON(!trans);
1757 1759
1758 lock_chunks(root); 1760 lock_chunks(root);
@@ -1924,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1924 break; 1926 break;
1925 BUG_ON(ret); 1927 BUG_ON(ret);
1926 1928
1927 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1928 BUG_ON(!trans); 1930 BUG_ON(!trans);
1929 1931
1930 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2093,11 +2095,7 @@ again:
2093 } 2095 }
2094 2096
2095 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2096 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2097 if (!trans) {
2098 ret = -ENOMEM;
2099 goto done;
2100 }
2101 lock_chunks(root); 2099 lock_chunks(root);
2102 2100
2103 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
@@ -2198,9 +2196,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2198 min_stripes = 2; 2196 min_stripes = 2;
2199 } 2197 }
2200 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2201 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2199 if (fs_devices->rw_devices < 2)
2202 if (num_stripes < 2)
2203 return -ENOSPC; 2200 return -ENOSPC;
2201 num_stripes = 2;
2204 min_stripes = 2; 2202 min_stripes = 2;
2205 } 2203 }
2206 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2244,8 +2242,16 @@ again:
2244 do_div(calc_size, stripe_len); 2242 do_div(calc_size, stripe_len);
2245 calc_size *= stripe_len; 2243 calc_size *= stripe_len;
2246 } 2244 }
2245
2247 /* we don't want tiny stripes */ 2246 /* we don't want tiny stripes */
2248 calc_size = max_t(u64, min_stripe_size, calc_size); 2247 if (!looped)
2248 calc_size = max_t(u64, min_stripe_size, calc_size);
2249
2250 /*
2251 * we're about to do_div by the stripe_len so lets make sure
2252 * we end up with something bigger than a stripe
2253 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4);
2249 2255
2250 do_div(calc_size, stripe_len); 2256 do_div(calc_size, stripe_len);
2251 calc_size *= stripe_len; 2257 calc_size *= stripe_len;
@@ -3389,6 +3395,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3389 key.type = 0; 3395 key.type = 0;
3390again: 3396again:
3391 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3397 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3398 if (ret < 0)
3399 goto error;
3392 while (1) { 3400 while (1) {
3393 leaf = path->nodes[0]; 3401 leaf = path->nodes[0];
3394 slot = path->slots[0]; 3402 slot = path->slots[0];
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);