path: root/fs/btrfs
author    Ingo Molnar <mingo@elte.hu>    2009-04-05 19:41:22 -0400
committer Ingo Molnar <mingo@elte.hu>    2009-04-05 19:41:22 -0400
commit    9efe21cb82b5dbe3b0b2ae4de4eccc64ecb94e95 (patch)
tree      7ff8833745d2f268f897f6fa4a27263b4a572245 /fs/btrfs
parent    de18836e447c2dc30120c0919b8db8ddc0401cc4 (diff)
parent    0221c81b1b8eb0cbb6b30a0ced52ead32d2b4e4c (diff)
Merge branch 'linus' into irq/threaded
Conflicts:
        include/linux/irq.h
        kernel/irq/handle.c
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile            |    2
-rw-r--r--  fs/btrfs/acl.c               |    2
-rw-r--r--  fs/btrfs/async-thread.c      |    7
-rw-r--r--  fs/btrfs/btrfs_inode.h       |   31
-rw-r--r--  fs/btrfs/ctree.c             |  900
-rw-r--r--  fs/btrfs/ctree.h             |  155
-rw-r--r--  fs/btrfs/delayed-ref.c       |  668
-rw-r--r--  fs/btrfs/delayed-ref.h       |  193
-rw-r--r--  fs/btrfs/dir-item.c          |    3
-rw-r--r--  fs/btrfs/disk-io.c           |   91
-rw-r--r--  fs/btrfs/disk-io.h           |    1
-rw-r--r--  fs/btrfs/extent-tree.c       | 2062
-rw-r--r--  fs/btrfs/extent_io.c         |   67
-rw-r--r--  fs/btrfs/extent_io.h         |    3
-rw-r--r--  fs/btrfs/extent_map.c        |    1
-rw-r--r--  fs/btrfs/file-item.c         |    7
-rw-r--r--  fs/btrfs/file.c              |   50
-rw-r--r--  fs/btrfs/free-space-cache.c  |  530
-rw-r--r--  fs/btrfs/free-space-cache.h  |   44
-rw-r--r--  fs/btrfs/inode-item.c        |    3
-rw-r--r--  fs/btrfs/inode.c             |  211
-rw-r--r--  fs/btrfs/ioctl.c             |    2
-rw-r--r--  fs/btrfs/locking.c           |   25
-rw-r--r--  fs/btrfs/ordered-data.c      |  118
-rw-r--r--  fs/btrfs/ordered-data.h      |    4
-rw-r--r--  fs/btrfs/super.c             |   54
-rw-r--r--  fs/btrfs/transaction.c       |  158
-rw-r--r--  fs/btrfs/transaction.h       |    8
-rw-r--r--  fs/btrfs/tree-defrag.c       |    2
-rw-r--r--  fs/btrfs/tree-log.c          |  456
-rw-r--r--  fs/btrfs/tree-log.h          |   17
-rw-r--r--  fs/btrfs/volumes.c           |   41
-rw-r--r--  fs/btrfs/volumes.h           |    2
33 files changed, 3751 insertions, 2167 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..9adf5e4f7e96 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o 11 compression.o delayed-ref.o
12else 12else
13 13
14# Normal Makefile 14# Normal Makefile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
256 } 256 }
257 257
258 if (!acl) 258 if (!acl)
259 inode->i_mode &= ~current->fs->umask; 259 inode->i_mode &= ~current_umask();
260 } 260 }
261 261
262 if (IS_POSIXACL(dir) && acl) { 262 if (IS_POSIXACL(dir) && acl) {
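[Editor's note] The only functional change in this hunk is reading the process umask through the current_umask() helper instead of dereferencing current->fs->umask directly; the mode arithmetic is unchanged. A minimal userspace sketch of that masking step (plain C, not kernel code; the mode values are invented for illustration):

#include <stdio.h>
#include <sys/stat.h>

/* Strip the umask bits from a freshly created inode's mode,
 * mirroring inode->i_mode &= ~umask in the hunk above. */
static mode_t apply_umask(mode_t mode, mode_t umask_bits)
{
        return mode & ~umask_bits;
}

int main(void)
{
        mode_t mode = 0666;        /* requested mode for a new file */
        mode_t umask_bits = 0022;  /* a typical process umask */

        /* prints 0644 */
        printf("resulting mode: %04o\n", (unsigned)apply_umask(mode, umask_bits));
        return 0;
}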
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 23#include "async-thread.h"
25 24
26#define WORK_QUEUED_BIT 0 25#define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
195 if (!list_empty(&worker->pending)) 194 if (!list_empty(&worker->pending))
196 continue; 195 continue;
197 196
197 if (kthread_should_stop())
198 break;
199
198 /* still no more work?, sleep for real */ 200 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock); 201 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE); 202 set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
208 worker->working = 0; 210 worker->working = 0;
209 spin_unlock_irq(&worker->lock); 211 spin_unlock_irq(&worker->lock);
210 212
211 schedule(); 213 if (!kthread_should_stop())
214 schedule();
212 } 215 }
213 __set_current_state(TASK_RUNNING); 216 __set_current_state(TASK_RUNNING);
214 } 217 }
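[Editor's note] The two new kthread_should_stop() tests close a shutdown race: the worker must notice a pending stop request both before parking on the empty pending list and before actually calling schedule(), or it could sleep through the stop. A rough userspace analogue of the "check the stop flag before blocking" pattern, using pthreads instead of kthreads (all names here are invented, this is not the btrfs code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool should_stop;   /* analogue of kthread_should_stop() */
static int pending_work;   /* analogue of the worker->pending list */

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        for (;;) {
                while (pending_work > 0) {
                        pending_work--;          /* "run" one queued item */
                        printf("did one unit of work\n");
                }
                /* check for a stop request before going to sleep,
                 * mirroring the added kthread_should_stop() tests */
                if (should_stop)
                        break;
                pthread_cond_wait(&wake, &lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        pending_work = 3;                /* queue some work */
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);

        sleep(1);

        pthread_mutex_lock(&lock);
        should_stop = true;              /* ask the worker to exit */
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}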
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -86,12 +92,6 @@ struct btrfs_inode {
86 */ 92 */
87 u64 logged_trans; 93 u64 logged_trans;
88 94
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 96 * real block usage of the file
97 */ 97 */
@@ -121,6 +121,25 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
122 u64 block_group; 122 u64 block_group;
123 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138 * yes, its silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
124 struct inode vfs_inode; 143 struct inode vfs_inode;
125}; 144};
126 145
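[Editor's note] The new ordered_operations list head and the ordered_data_close bitflag implement a single policy: when a file that had good data is truncated to zero, file release queues the inode on a per-transaction list so any freshly written data is flushed before the commit. A toy userspace model of that bookkeeping (hand-rolled list, invented names, nothing btrfs-specific):

#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
        unsigned long ino;
        bool ordered_data_close;        /* set by truncate-to-zero */
        struct toy_inode *next_ordered; /* link on the ordered-operations list */
};

static struct toy_inode *ordered_operations; /* flushed before commit */

/* Called on file release: queue the inode if truncate flagged it,
 * so new application data is written out before the commit. */
static void file_release(struct toy_inode *inode)
{
        if (!inode->ordered_data_close)
                return;
        inode->ordered_data_close = false;
        inode->next_ordered = ordered_operations;
        ordered_operations = inode;
}

static void commit_transaction(void)
{
        for (struct toy_inode *i = ordered_operations; i; i = i->next_ordered)
                printf("flushing inode %lu before commit\n", i->ino);
        ordered_operations = NULL;
}

int main(void)
{
        struct toy_inode a = { .ino = 257, .ordered_data_close = true };

        file_release(&a);       /* truncate marked it, so it gets queued */
        commit_transaction();   /* the commit flushes it first */
        return 0;
}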
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529aa..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
254 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
255 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
256 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
257 *
258 * prealloc_dest -- if you have already reserved a destination for the cow,
259 * this uses that block instead of allocating a new one.
260 * btrfs_alloc_reserved_extent is used to finish the allocation.
261 */ 257 */
262static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 struct btrfs_root *root, 259 struct btrfs_root *root,
264 struct extent_buffer *buf, 260 struct extent_buffer *buf,
265 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
266 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
267 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
268 u64 prealloc_dest)
269{ 264{
270 u64 parent_start; 265 u64 parent_start;
271 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
291 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
292 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
293 288
294 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
295 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
296 291 trans->transid, level,
297 ins.objectid = prealloc_dest; 292 search_start, empty_size);
298 ins.offset = buf->len;
299 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302 root->root_key.objectid,
303 trans->transid, level, &ins);
304 BUG_ON(ret);
305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306 buf->len, level);
307 } else {
308 cow = btrfs_alloc_free_block(trans, root, buf->len,
309 parent_start,
310 root->root_key.objectid,
311 trans->transid, level,
312 search_start, empty_size);
313 }
314 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
315 return PTR_ERR(cow); 294 return PTR_ERR(cow);
316 295
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
413noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
415 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
416 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
417{ 396{
418 u64 search_start; 397 u64 search_start;
419 int ret; 398 int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
436 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438 *cow_ret = buf; 417 *cow_ret = buf;
439 WARN_ON(prealloc_dest);
440 return 0; 418 return 0;
441 } 419 }
442 420
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
447 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
448 426
449 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
450 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
451 prealloc_dest);
452 return ret; 429 return ret;
453} 430}
454 431
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
617 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
618 &cur, search_start, 595 &cur, search_start,
619 min(16 * blocksize, 596 min(16 * blocksize,
620 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
621 if (err) { 598 if (err) {
622 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
623 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
937 BUG_ON(!child); 914 BUG_ON(!child);
938 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
941 BUG_ON(ret); 918 BUG_ON(ret);
942 919
943 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
945 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
946 923
947 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
948 mid->start, child->start, 926 mid->start, child->start,
949 root->root_key.objectid, 927 root->root_key.objectid,
950 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
971 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 return 0; 950 return 0;
973 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
974 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
975 err_on_enospc = 1; 957 err_on_enospc = 1;
976 958
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
979 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
981 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
982 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
983 if (wret) { 965 if (wret) {
984 ret = wret; 966 ret = wret;
985 goto enospc; 967 goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
990 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
992 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
994 if (wret) { 976 if (wret) {
995 ret = wret; 977 ret = wret;
996 goto enospc; 978 goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1171 wret = 1; 1153 wret = 1;
1172 } else { 1154 } else {
1173 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1174 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1175 if (ret) 1157 if (ret)
1176 wret = 1; 1158 wret = 1;
1177 else { 1159 else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1222 } else { 1204 } else {
1223 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1224 parent, pslot + 1, 1206 parent, pslot + 1,
1225 &right, 0); 1207 &right);
1226 if (ret) 1208 if (ret)
1227 wret = 1; 1209 wret = 1;
1228 else { 1210 else {
@@ -1262,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1262 * readahead one full node of leaves, finding things that are close 1244 * readahead one full node of leaves, finding things that are close
1263 * to the block in 'slot', and triggering ra on them. 1245 * to the block in 'slot', and triggering ra on them.
1264 */ 1246 */
1265static noinline void reada_for_search(struct btrfs_root *root, 1247static void reada_for_search(struct btrfs_root *root,
1266 struct btrfs_path *path, 1248 struct btrfs_path *path,
1267 int level, int slot, u64 objectid) 1249 int level, int slot, u64 objectid)
1268{ 1250{
1269 struct extent_buffer *node; 1251 struct extent_buffer *node;
1270 struct btrfs_disk_key disk_key; 1252 struct btrfs_disk_key disk_key;
@@ -1465,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1465} 1447}
1466 1448
1467/* 1449/*
1450 * helper function for btrfs_search_slot. The goal is to find a block
1451 * in cache without setting the path to blocking. If we find the block
1452 * we return zero and the path is unchanged.
1453 *
1454 * If we can't find the block, we set the path blocking and do some
1455 * reada. -EAGAIN is returned and the search must be repeated.
1456 */
1457static int
1458read_block_for_search(struct btrfs_trans_handle *trans,
1459 struct btrfs_root *root, struct btrfs_path *p,
1460 struct extent_buffer **eb_ret, int level, int slot,
1461 struct btrfs_key *key)
1462{
1463 u64 blocknr;
1464 u64 gen;
1465 u32 blocksize;
1466 struct extent_buffer *b = *eb_ret;
1467 struct extent_buffer *tmp;
1468
1469 blocknr = btrfs_node_blockptr(b, slot);
1470 gen = btrfs_node_ptr_generation(b, slot);
1471 blocksize = btrfs_level_size(root, level - 1);
1472
1473 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1474 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1475 *eb_ret = tmp;
1476 return 0;
1477 }
1478
1479 /*
1480 * reduce lock contention at high levels
1481 * of the btree by dropping locks before
1482 * we read.
1483 */
1484 btrfs_release_path(NULL, p);
1485 if (tmp)
1486 free_extent_buffer(tmp);
1487 if (p->reada)
1488 reada_for_search(root, p, level, slot, key->objectid);
1489
1490 tmp = read_tree_block(root, blocknr, blocksize, gen);
1491 if (tmp)
1492 free_extent_buffer(tmp);
1493 return -EAGAIN;
1494}
1495
1496/*
1497 * helper function for btrfs_search_slot. This does all of the checks
1498 * for node-level blocks and does any balancing required based on
1499 * the ins_len.
1500 *
1501 * If no extra work was required, zero is returned. If we had to
1502 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1503 * start over
1504 */
1505static int
1506setup_nodes_for_search(struct btrfs_trans_handle *trans,
1507 struct btrfs_root *root, struct btrfs_path *p,
1508 struct extent_buffer *b, int level, int ins_len)
1509{
1510 int ret;
1511 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1512 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1513 int sret;
1514
1515 sret = reada_for_balance(root, p, level);
1516 if (sret)
1517 goto again;
1518
1519 btrfs_set_path_blocking(p);
1520 sret = split_node(trans, root, p, level);
1521 btrfs_clear_path_blocking(p, NULL);
1522
1523 BUG_ON(sret > 0);
1524 if (sret) {
1525 ret = sret;
1526 goto done;
1527 }
1528 b = p->nodes[level];
1529 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1530 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1531 int sret;
1532
1533 sret = reada_for_balance(root, p, level);
1534 if (sret)
1535 goto again;
1536
1537 btrfs_set_path_blocking(p);
1538 sret = balance_level(trans, root, p, level);
1539 btrfs_clear_path_blocking(p, NULL);
1540
1541 if (sret) {
1542 ret = sret;
1543 goto done;
1544 }
1545 b = p->nodes[level];
1546 if (!b) {
1547 btrfs_release_path(NULL, p);
1548 goto again;
1549 }
1550 BUG_ON(btrfs_header_nritems(b) == 1);
1551 }
1552 return 0;
1553
1554again:
1555 ret = -EAGAIN;
1556done:
1557 return ret;
1558}
1559
1560/*
1468 * look for key in the tree. path is filled in with nodes along the way 1561 * look for key in the tree. path is filled in with nodes along the way
1469 * if key is found, we return zero and you can find the item in the leaf 1562 * if key is found, we return zero and you can find the item in the leaf
1470 * level of the path (level 0) 1563 * level of the path (level 0)
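[Editor's note] Both helpers split out of btrfs_search_slot above follow the same contract: return 0 if the path is still valid, or -EAGAIN after they had to drop locks (to read a block or rebalance), in which case the caller jumps back to its again: label and restarts the descent. A compact sketch of that retry convention, with invented names standing in for the real helpers:

#include <errno.h>
#include <stdio.h>

/* Pretend helper: succeeds only once the "block" is cached. On a miss
 * it would drop its locks, start the read, and ask the caller to
 * restart, loosely like read_block_for_search() above. */
static int read_block_or_restart(int *cached)
{
        if (*cached)
                return 0;
        *cached = 1;            /* the read completes before the retry */
        return -EAGAIN;         /* caller must restart the search */
}

static int search(void)
{
        int cached = 0;
        int ret;

again:
        ret = read_block_or_restart(&cached);
        if (ret == -EAGAIN) {
                printf("dropped locks, restarting descent\n");
                goto again;
        }
        printf("found the block without blocking\n");
        return ret;
}

int main(void)
{
        return search();
}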
@@ -1482,17 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1482 ins_len, int cow) 1575 ins_len, int cow)
1483{ 1576{
1484 struct extent_buffer *b; 1577 struct extent_buffer *b;
1485 struct extent_buffer *tmp;
1486 int slot; 1578 int slot;
1487 int ret; 1579 int ret;
1488 int level; 1580 int level;
1489 int should_reada = p->reada;
1490 int lowest_unlock = 1; 1581 int lowest_unlock = 1;
1491 int blocksize;
1492 u8 lowest_level = 0; 1582 u8 lowest_level = 0;
1493 u64 blocknr;
1494 u64 gen;
1495 struct btrfs_key prealloc_block;
1496 1583
1497 lowest_level = p->lowest_level; 1584 lowest_level = p->lowest_level;
1498 WARN_ON(lowest_level && ins_len > 0); 1585 WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1588,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1501 if (ins_len < 0) 1588 if (ins_len < 0)
1502 lowest_unlock = 2; 1589 lowest_unlock = 2;
1503 1590
1504 prealloc_block.objectid = 0;
1505
1506again: 1591again:
1507 if (p->skip_locking) 1592 if (p->skip_locking)
1508 b = btrfs_root_node(root); 1593 b = btrfs_root_node(root);
@@ -1523,50 +1608,21 @@ again:
1523 if (cow) { 1608 if (cow) {
1524 int wret; 1609 int wret;
1525 1610
1526 /* is a cow on this block not required */ 1611 /*
1612 * if we don't really need to cow this block
1613 * then we don't want to set the path blocking,
1614 * so we test it here
1615 */
1527 if (btrfs_header_generation(b) == trans->transid && 1616 if (btrfs_header_generation(b) == trans->transid &&
1528 btrfs_header_owner(b) == root->root_key.objectid && 1617 btrfs_header_owner(b) == root->root_key.objectid &&
1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1618 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530 goto cow_done; 1619 goto cow_done;
1531 } 1620 }
1532
1533 /* ok, we have to cow, is our old prealloc the right
1534 * size?
1535 */
1536 if (prealloc_block.objectid &&
1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1539 btrfs_free_reserved_extent(root,
1540 prealloc_block.objectid,
1541 prealloc_block.offset);
1542 prealloc_block.objectid = 0;
1543 goto again;
1544 }
1545
1546 /*
1547 * for higher level blocks, try not to allocate blocks
1548 * with the block and the parent locks held.
1549 */
1550 if (level > 0 && !prealloc_block.objectid) {
1551 u32 size = b->len;
1552 u64 hint = b->start;
1553
1554 btrfs_release_path(root, p);
1555 ret = btrfs_reserve_extent(trans, root,
1556 size, size, 0,
1557 hint, (u64)-1,
1558 &prealloc_block, 0);
1559 BUG_ON(ret);
1560 goto again;
1561 }
1562
1563 btrfs_set_path_blocking(p); 1621 btrfs_set_path_blocking(p);
1564 1622
1565 wret = btrfs_cow_block(trans, root, b, 1623 wret = btrfs_cow_block(trans, root, b,
1566 p->nodes[level + 1], 1624 p->nodes[level + 1],
1567 p->slots[level + 1], 1625 p->slots[level + 1], &b);
1568 &b, prealloc_block.objectid);
1569 prealloc_block.objectid = 0;
1570 if (wret) { 1626 if (wret) {
1571 free_extent_buffer(b); 1627 free_extent_buffer(b);
1572 ret = wret; 1628 ret = wret;
@@ -1611,51 +1667,15 @@ cow_done:
1611 if (ret && slot > 0) 1667 if (ret && slot > 0)
1612 slot -= 1; 1668 slot -= 1;
1613 p->slots[level] = slot; 1669 p->slots[level] = slot;
1614 if ((p->search_for_split || ins_len > 0) && 1670 ret = setup_nodes_for_search(trans, root, p, b, level,
1615 btrfs_header_nritems(b) >= 1671 ins_len);
1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1672 if (ret == -EAGAIN)
1617 int sret; 1673 goto again;
1618 1674 else if (ret)
1619 sret = reada_for_balance(root, p, level); 1675 goto done;
1620 if (sret) 1676 b = p->nodes[level];
1621 goto again; 1677 slot = p->slots[level];
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1627 BUG_ON(sret > 0);
1628 if (sret) {
1629 ret = sret;
1630 goto done;
1631 }
1632 b = p->nodes[level];
1633 slot = p->slots[level];
1634 } else if (ins_len < 0 &&
1635 btrfs_header_nritems(b) <
1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646 1678
1647 if (sret) {
1648 ret = sret;
1649 goto done;
1650 }
1651 b = p->nodes[level];
1652 if (!b) {
1653 btrfs_release_path(NULL, p);
1654 goto again;
1655 }
1656 slot = p->slots[level];
1657 BUG_ON(btrfs_header_nritems(b) == 1);
1658 }
1659 unlock_up(p, level, lowest_unlock); 1679 unlock_up(p, level, lowest_unlock);
1660 1680
1661 /* this is only true while dropping a snapshot */ 1681 /* this is only true while dropping a snapshot */
@@ -1664,44 +1684,11 @@ cow_done:
1664 goto done; 1684 goto done;
1665 } 1685 }
1666 1686
1667 blocknr = btrfs_node_blockptr(b, slot); 1687 ret = read_block_for_search(trans, root, p,
1668 gen = btrfs_node_ptr_generation(b, slot); 1688 &b, level, slot, key);
1669 blocksize = btrfs_level_size(root, level - 1); 1689 if (ret == -EAGAIN)
1690 goto again;
1670 1691
1671 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1672 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1673 b = tmp;
1674 } else {
1675 /*
1676 * reduce lock contention at high levels
1677 * of the btree by dropping locks before
1678 * we read.
1679 */
1680 if (level > 0) {
1681 btrfs_release_path(NULL, p);
1682 if (tmp)
1683 free_extent_buffer(tmp);
1684 if (should_reada)
1685 reada_for_search(root, p,
1686 level, slot,
1687 key->objectid);
1688
1689 tmp = read_tree_block(root, blocknr,
1690 blocksize, gen);
1691 if (tmp)
1692 free_extent_buffer(tmp);
1693 goto again;
1694 } else {
1695 btrfs_set_path_blocking(p);
1696 if (tmp)
1697 free_extent_buffer(tmp);
1698 if (should_reada)
1699 reada_for_search(root, p,
1700 level, slot,
1701 key->objectid);
1702 b = read_node_slot(root, b, slot);
1703 }
1704 }
1705 if (!p->skip_locking) { 1692 if (!p->skip_locking) {
1706 int lret; 1693 int lret;
1707 1694
@@ -1742,12 +1729,8 @@ done:
1742 * we don't really know what they plan on doing with the path 1729 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking 1730 * from here on, so for now just mark it as blocking
1744 */ 1731 */
1745 btrfs_set_path_blocking(p); 1732 if (!p->leave_spinning)
1746 if (prealloc_block.objectid) { 1733 btrfs_set_path_blocking(p);
1747 btrfs_free_reserved_extent(root,
1748 prealloc_block.objectid,
1749 prealloc_block.offset);
1750 }
1751 return ret; 1734 return ret;
1752} 1735}
1753 1736
@@ -1768,7 +1751,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 int ret; 1751 int ret;
1769 1752
1770 eb = btrfs_lock_root_node(root); 1753 eb = btrfs_lock_root_node(root);
1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1754 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1772 BUG_ON(ret); 1755 BUG_ON(ret);
1773 1756
1774 btrfs_set_lock_blocking(eb); 1757 btrfs_set_lock_blocking(eb);
@@ -1826,7 +1809,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1826 } 1809 }
1827 1810
1828 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1811 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829 &eb, 0); 1812 &eb);
1830 BUG_ON(ret); 1813 BUG_ON(ret);
1831 1814
1832 if (root->root_key.objectid == 1815 if (root->root_key.objectid ==
@@ -2139,7 +2122,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2139 spin_unlock(&root->node_lock); 2122 spin_unlock(&root->node_lock);
2140 2123
2141 ret = btrfs_update_extent_ref(trans, root, lower->start, 2124 ret = btrfs_update_extent_ref(trans, root, lower->start,
2142 lower->start, c->start, 2125 lower->len, lower->start, c->start,
2143 root->root_key.objectid, 2126 root->root_key.objectid,
2144 trans->transid, level - 1); 2127 trans->transid, level - 1);
2145 BUG_ON(ret); 2128 BUG_ON(ret);
@@ -2174,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2174 BUG_ON(!path->nodes[level]); 2157 BUG_ON(!path->nodes[level]);
2175 lower = path->nodes[level]; 2158 lower = path->nodes[level];
2176 nritems = btrfs_header_nritems(lower); 2159 nritems = btrfs_header_nritems(lower);
2177 if (slot > nritems) 2160 BUG_ON(slot > nritems);
2178 BUG();
2179 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2161 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2180 BUG(); 2162 BUG();
2181 if (slot != nritems) { 2163 if (slot != nritems) {
@@ -2221,7 +2203,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2221 ret = insert_new_root(trans, root, path, level + 1); 2203 ret = insert_new_root(trans, root, path, level + 1);
2222 if (ret) 2204 if (ret)
2223 return ret; 2205 return ret;
2224 } else { 2206 } else if (!trans->transaction->delayed_refs.flushing) {
2225 ret = push_nodes_for_insert(trans, root, path, level); 2207 ret = push_nodes_for_insert(trans, root, path, level);
2226 c = path->nodes[level]; 2208 c = path->nodes[level];
2227 if (!ret && btrfs_header_nritems(c) < 2209 if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2311,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2329 return ret; 2311 return ret;
2330} 2312}
2331 2313
2332/* 2314static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2333 * push some data in the path leaf to the right, trying to free up at 2315 struct btrfs_root *root,
2334 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2316 struct btrfs_path *path,
2335 * 2317 int data_size, int empty,
2336 * returns 1 if the push failed because the other node didn't have enough 2318 struct extent_buffer *right,
2337 * room, 0 if everything worked out and < 0 if there were major errors. 2319 int free_space, u32 left_nritems)
2338 */
2339static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 *root, struct btrfs_path *path, int data_size,
2341 int empty)
2342{ 2320{
2343 struct extent_buffer *left = path->nodes[0]; 2321 struct extent_buffer *left = path->nodes[0];
2344 struct extent_buffer *right; 2322 struct extent_buffer *upper = path->nodes[1];
2345 struct extent_buffer *upper;
2346 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2347 int slot; 2324 int slot;
2348 u32 i; 2325 u32 i;
2349 int free_space;
2350 int push_space = 0; 2326 int push_space = 0;
2351 int push_items = 0; 2327 int push_items = 0;
2352 struct btrfs_item *item; 2328 struct btrfs_item *item;
2353 u32 left_nritems;
2354 u32 nr; 2329 u32 nr;
2355 u32 right_nritems; 2330 u32 right_nritems;
2356 u32 data_end; 2331 u32 data_end;
2357 u32 this_item_size; 2332 u32 this_item_size;
2358 int ret; 2333 int ret;
2359 2334
2360 slot = path->slots[1];
2361 if (!path->nodes[1])
2362 return 1;
2363
2364 upper = path->nodes[1];
2365 if (slot >= btrfs_header_nritems(upper) - 1)
2366 return 1;
2367
2368 btrfs_assert_tree_locked(path->nodes[1]);
2369
2370 right = read_node_slot(root, upper, slot + 1);
2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2374 free_space = btrfs_leaf_free_space(root, right);
2375 if (free_space < data_size)
2376 goto out_unlock;
2377
2378 /* cow and double check */
2379 ret = btrfs_cow_block(trans, root, right, upper,
2380 slot + 1, &right, 0);
2381 if (ret)
2382 goto out_unlock;
2383
2384 free_space = btrfs_leaf_free_space(root, right);
2385 if (free_space < data_size)
2386 goto out_unlock;
2387
2388 left_nritems = btrfs_header_nritems(left);
2389 if (left_nritems == 0)
2390 goto out_unlock;
2391
2392 if (empty) 2335 if (empty)
2393 nr = 0; 2336 nr = 0;
2394 else 2337 else
@@ -2397,6 +2340,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2397 if (path->slots[0] >= left_nritems) 2340 if (path->slots[0] >= left_nritems)
2398 push_space += data_size; 2341 push_space += data_size;
2399 2342
2343 slot = path->slots[1];
2400 i = left_nritems - 1; 2344 i = left_nritems - 1;
2401 while (i >= nr) { 2345 while (i >= nr) {
2402 item = btrfs_item_nr(left, i); 2346 item = btrfs_item_nr(left, i);
@@ -2528,24 +2472,82 @@ out_unlock:
2528} 2472}
2529 2473
2530/* 2474/*
2475 * push some data in the path leaf to the right, trying to free up at
2476 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2477 *
2478 * returns 1 if the push failed because the other node didn't have enough
2479 * room, 0 if everything worked out and < 0 if there were major errors.
2480 */
2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2482 *root, struct btrfs_path *path, int data_size,
2483 int empty)
2484{
2485 struct extent_buffer *left = path->nodes[0];
2486 struct extent_buffer *right;
2487 struct extent_buffer *upper;
2488 int slot;
2489 int free_space;
2490 u32 left_nritems;
2491 int ret;
2492
2493 if (!path->nodes[1])
2494 return 1;
2495
2496 slot = path->slots[1];
2497 upper = path->nodes[1];
2498 if (slot >= btrfs_header_nritems(upper) - 1)
2499 return 1;
2500
2501 btrfs_assert_tree_locked(path->nodes[1]);
2502
2503 right = read_node_slot(root, upper, slot + 1);
2504 btrfs_tree_lock(right);
2505 btrfs_set_lock_blocking(right);
2506
2507 free_space = btrfs_leaf_free_space(root, right);
2508 if (free_space < data_size)
2509 goto out_unlock;
2510
2511 /* cow and double check */
2512 ret = btrfs_cow_block(trans, root, right, upper,
2513 slot + 1, &right);
2514 if (ret)
2515 goto out_unlock;
2516
2517 free_space = btrfs_leaf_free_space(root, right);
2518 if (free_space < data_size)
2519 goto out_unlock;
2520
2521 left_nritems = btrfs_header_nritems(left);
2522 if (left_nritems == 0)
2523 goto out_unlock;
2524
2525 return __push_leaf_right(trans, root, path, data_size, empty,
2526 right, free_space, left_nritems);
2527out_unlock:
2528 btrfs_tree_unlock(right);
2529 free_extent_buffer(right);
2530 return 1;
2531}
2532
2533/*
2531 * push some data in the path leaf to the left, trying to free up at 2534 * push some data in the path leaf to the left, trying to free up at
2532 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2535 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2533 */ 2536 */
2534static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2537static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2535 *root, struct btrfs_path *path, int data_size, 2538 struct btrfs_root *root,
2536 int empty) 2539 struct btrfs_path *path, int data_size,
2540 int empty, struct extent_buffer *left,
2541 int free_space, int right_nritems)
2537{ 2542{
2538 struct btrfs_disk_key disk_key; 2543 struct btrfs_disk_key disk_key;
2539 struct extent_buffer *right = path->nodes[0]; 2544 struct extent_buffer *right = path->nodes[0];
2540 struct extent_buffer *left;
2541 int slot; 2545 int slot;
2542 int i; 2546 int i;
2543 int free_space;
2544 int push_space = 0; 2547 int push_space = 0;
2545 int push_items = 0; 2548 int push_items = 0;
2546 struct btrfs_item *item; 2549 struct btrfs_item *item;
2547 u32 old_left_nritems; 2550 u32 old_left_nritems;
2548 u32 right_nritems;
2549 u32 nr; 2551 u32 nr;
2550 int ret = 0; 2552 int ret = 0;
2551 int wret; 2553 int wret;
@@ -2553,41 +2555,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2553 u32 old_left_item_size; 2555 u32 old_left_item_size;
2554 2556
2555 slot = path->slots[1]; 2557 slot = path->slots[1];
2556 if (slot == 0)
2557 return 1;
2558 if (!path->nodes[1])
2559 return 1;
2560
2561 right_nritems = btrfs_header_nritems(right);
2562 if (right_nritems == 0)
2563 return 1;
2564
2565 btrfs_assert_tree_locked(path->nodes[1]);
2566
2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2571 free_space = btrfs_leaf_free_space(root, left);
2572 if (free_space < data_size) {
2573 ret = 1;
2574 goto out;
2575 }
2576
2577 /* cow and double check */
2578 ret = btrfs_cow_block(trans, root, left,
2579 path->nodes[1], slot - 1, &left, 0);
2580 if (ret) {
2581 /* we hit -ENOSPC, but it isn't fatal here */
2582 ret = 1;
2583 goto out;
2584 }
2585
2586 free_space = btrfs_leaf_free_space(root, left);
2587 if (free_space < data_size) {
2588 ret = 1;
2589 goto out;
2590 }
2591 2558
2592 if (empty) 2559 if (empty)
2593 nr = right_nritems; 2560 nr = right_nritems;
@@ -2755,6 +2722,154 @@ out:
2755} 2722}
2756 2723
2757/* 2724/*
2725 * push some data in the path leaf to the left, trying to free up at
2726 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2727 */
2728static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2729 *root, struct btrfs_path *path, int data_size,
2730 int empty)
2731{
2732 struct extent_buffer *right = path->nodes[0];
2733 struct extent_buffer *left;
2734 int slot;
2735 int free_space;
2736 u32 right_nritems;
2737 int ret = 0;
2738
2739 slot = path->slots[1];
2740 if (slot == 0)
2741 return 1;
2742 if (!path->nodes[1])
2743 return 1;
2744
2745 right_nritems = btrfs_header_nritems(right);
2746 if (right_nritems == 0)
2747 return 1;
2748
2749 btrfs_assert_tree_locked(path->nodes[1]);
2750
2751 left = read_node_slot(root, path->nodes[1], slot - 1);
2752 btrfs_tree_lock(left);
2753 btrfs_set_lock_blocking(left);
2754
2755 free_space = btrfs_leaf_free_space(root, left);
2756 if (free_space < data_size) {
2757 ret = 1;
2758 goto out;
2759 }
2760
2761 /* cow and double check */
2762 ret = btrfs_cow_block(trans, root, left,
2763 path->nodes[1], slot - 1, &left);
2764 if (ret) {
2765 /* we hit -ENOSPC, but it isn't fatal here */
2766 ret = 1;
2767 goto out;
2768 }
2769
2770 free_space = btrfs_leaf_free_space(root, left);
2771 if (free_space < data_size) {
2772 ret = 1;
2773 goto out;
2774 }
2775
2776 return __push_leaf_left(trans, root, path, data_size,
2777 empty, left, free_space, right_nritems);
2778out:
2779 btrfs_tree_unlock(left);
2780 free_extent_buffer(left);
2781 return ret;
2782}
2783
2784/*
2785 * split the path's leaf in two, making sure there is at least data_size
2786 * available for the resulting leaf level of the path.
2787 *
2788 * returns 0 if all went well and < 0 on failure.
2789 */
2790static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2791 struct btrfs_root *root,
2792 struct btrfs_path *path,
2793 struct extent_buffer *l,
2794 struct extent_buffer *right,
2795 int slot, int mid, int nritems)
2796{
2797 int data_copy_size;
2798 int rt_data_off;
2799 int i;
2800 int ret = 0;
2801 int wret;
2802 struct btrfs_disk_key disk_key;
2803
2804 nritems = nritems - mid;
2805 btrfs_set_header_nritems(right, nritems);
2806 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2807
2808 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2809 btrfs_item_nr_offset(mid),
2810 nritems * sizeof(struct btrfs_item));
2811
2812 copy_extent_buffer(right, l,
2813 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2814 data_copy_size, btrfs_leaf_data(l) +
2815 leaf_data_end(root, l), data_copy_size);
2816
2817 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2818 btrfs_item_end_nr(l, mid);
2819
2820 for (i = 0; i < nritems; i++) {
2821 struct btrfs_item *item = btrfs_item_nr(right, i);
2822 u32 ioff;
2823
2824 if (!right->map_token) {
2825 map_extent_buffer(right, (unsigned long)item,
2826 sizeof(struct btrfs_item),
2827 &right->map_token, &right->kaddr,
2828 &right->map_start, &right->map_len,
2829 KM_USER1);
2830 }
2831
2832 ioff = btrfs_item_offset(right, item);
2833 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2834 }
2835
2836 if (right->map_token) {
2837 unmap_extent_buffer(right, right->map_token, KM_USER1);
2838 right->map_token = NULL;
2839 }
2840
2841 btrfs_set_header_nritems(l, mid);
2842 ret = 0;
2843 btrfs_item_key(right, &disk_key, 0);
2844 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2845 path->slots[1] + 1, 1);
2846 if (wret)
2847 ret = wret;
2848
2849 btrfs_mark_buffer_dirty(right);
2850 btrfs_mark_buffer_dirty(l);
2851 BUG_ON(path->slots[0] != slot);
2852
2853 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2854 BUG_ON(ret);
2855
2856 if (mid <= slot) {
2857 btrfs_tree_unlock(path->nodes[0]);
2858 free_extent_buffer(path->nodes[0]);
2859 path->nodes[0] = right;
2860 path->slots[0] -= mid;
2861 path->slots[1] += 1;
2862 } else {
2863 btrfs_tree_unlock(right);
2864 free_extent_buffer(right);
2865 }
2866
2867 BUG_ON(path->slots[0] < 0);
2868
2869 return ret;
2870}
2871
2872/*
2758 * split the path's leaf in two, making sure there is at least data_size 2873 * split the path's leaf in two, making sure there is at least data_size
2759 * available for the resulting leaf level of the path. 2874 * available for the resulting leaf level of the path.
2760 * 2875 *
@@ -2771,17 +2886,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2771 int mid; 2886 int mid;
2772 int slot; 2887 int slot;
2773 struct extent_buffer *right; 2888 struct extent_buffer *right;
2774 int data_copy_size;
2775 int rt_data_off;
2776 int i;
2777 int ret = 0; 2889 int ret = 0;
2778 int wret; 2890 int wret;
2779 int double_split; 2891 int double_split;
2780 int num_doubles = 0; 2892 int num_doubles = 0;
2781 struct btrfs_disk_key disk_key;
2782 2893
2783 /* first try to make some room by pushing left and right */ 2894 /* first try to make some room by pushing left and right */
2784 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2895 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2896 !trans->transaction->delayed_refs.flushing) {
2785 wret = push_leaf_right(trans, root, path, data_size, 0); 2897 wret = push_leaf_right(trans, root, path, data_size, 0);
2786 if (wret < 0) 2898 if (wret < 0)
2787 return wret; 2899 return wret;
@@ -2830,11 +2942,14 @@ again:
2830 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2942 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2943 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832 BTRFS_UUID_SIZE); 2944 BTRFS_UUID_SIZE);
2945
2833 if (mid <= slot) { 2946 if (mid <= slot) {
2834 if (nritems == 1 || 2947 if (nritems == 1 ||
2835 leaf_space_used(l, mid, nritems - mid) + data_size > 2948 leaf_space_used(l, mid, nritems - mid) + data_size >
2836 BTRFS_LEAF_DATA_SIZE(root)) { 2949 BTRFS_LEAF_DATA_SIZE(root)) {
2837 if (slot >= nritems) { 2950 if (slot >= nritems) {
2951 struct btrfs_disk_key disk_key;
2952
2838 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2953 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839 btrfs_set_header_nritems(right, 0); 2954 btrfs_set_header_nritems(right, 0);
2840 wret = insert_ptr(trans, root, path, 2955 wret = insert_ptr(trans, root, path,
@@ -2862,6 +2977,8 @@ again:
2862 if (leaf_space_used(l, 0, mid) + data_size > 2977 if (leaf_space_used(l, 0, mid) + data_size >
2863 BTRFS_LEAF_DATA_SIZE(root)) { 2978 BTRFS_LEAF_DATA_SIZE(root)) {
2864 if (!extend && data_size && slot == 0) { 2979 if (!extend && data_size && slot == 0) {
2980 struct btrfs_disk_key disk_key;
2981
2865 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2982 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866 btrfs_set_header_nritems(right, 0); 2983 btrfs_set_header_nritems(right, 0);
2867 wret = insert_ptr(trans, root, path, 2984 wret = insert_ptr(trans, root, path,
@@ -2894,76 +3011,16 @@ again:
2894 } 3011 }
2895 } 3012 }
2896 } 3013 }
2897 nritems = nritems - mid;
2898 btrfs_set_header_nritems(right, nritems);
2899 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900
2901 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2902 btrfs_item_nr_offset(mid),
2903 nritems * sizeof(struct btrfs_item));
2904
2905 copy_extent_buffer(right, l,
2906 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907 data_copy_size, btrfs_leaf_data(l) +
2908 leaf_data_end(root, l), data_copy_size);
2909
2910 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911 btrfs_item_end_nr(l, mid);
2912
2913 for (i = 0; i < nritems; i++) {
2914 struct btrfs_item *item = btrfs_item_nr(right, i);
2915 u32 ioff;
2916
2917 if (!right->map_token) {
2918 map_extent_buffer(right, (unsigned long)item,
2919 sizeof(struct btrfs_item),
2920 &right->map_token, &right->kaddr,
2921 &right->map_start, &right->map_len,
2922 KM_USER1);
2923 }
2924
2925 ioff = btrfs_item_offset(right, item);
2926 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927 }
2928 3014
2929 if (right->map_token) { 3015 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2930 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931 right->map_token = NULL;
2932 }
2933
2934 btrfs_set_header_nritems(l, mid);
2935 ret = 0;
2936 btrfs_item_key(right, &disk_key, 0);
2937 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938 path->slots[1] + 1, 1);
2939 if (wret)
2940 ret = wret;
2941
2942 btrfs_mark_buffer_dirty(right);
2943 btrfs_mark_buffer_dirty(l);
2944 BUG_ON(path->slots[0] != slot);
2945
2946 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2947 BUG_ON(ret); 3016 BUG_ON(ret);
2948 3017
2949 if (mid <= slot) {
2950 btrfs_tree_unlock(path->nodes[0]);
2951 free_extent_buffer(path->nodes[0]);
2952 path->nodes[0] = right;
2953 path->slots[0] -= mid;
2954 path->slots[1] += 1;
2955 } else {
2956 btrfs_tree_unlock(right);
2957 free_extent_buffer(right);
2958 }
2959
2960 BUG_ON(path->slots[0] < 0);
2961
2962 if (double_split) { 3018 if (double_split) {
2963 BUG_ON(num_doubles != 0); 3019 BUG_ON(num_doubles != 0);
2964 num_doubles++; 3020 num_doubles++;
2965 goto again; 3021 goto again;
2966 } 3022 }
3023
2967 return ret; 3024 return ret;
2968} 3025}
2969 3026
@@ -3021,26 +3078,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3021 return -EAGAIN; 3078 return -EAGAIN;
3022 } 3079 }
3023 3080
3081 btrfs_set_path_blocking(path);
3024 ret = split_leaf(trans, root, &orig_key, path, 3082 ret = split_leaf(trans, root, &orig_key, path,
3025 sizeof(struct btrfs_item), 1); 3083 sizeof(struct btrfs_item), 1);
3026 path->keep_locks = 0; 3084 path->keep_locks = 0;
3027 BUG_ON(ret); 3085 BUG_ON(ret);
3028 3086
3087 btrfs_unlock_up_safe(path, 1);
3088 leaf = path->nodes[0];
3089 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3090
3091split:
3029 /* 3092 /*
3030 * make sure any changes to the path from split_leaf leave it 3093 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state 3094 * in a blocking state
3032 */ 3095 */
3033 btrfs_set_path_blocking(path); 3096 btrfs_set_path_blocking(path);
3034 3097
3035 leaf = path->nodes[0];
3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038split:
3039 item = btrfs_item_nr(leaf, path->slots[0]); 3098 item = btrfs_item_nr(leaf, path->slots[0]);
3040 orig_offset = btrfs_item_offset(leaf, item); 3099 orig_offset = btrfs_item_offset(leaf, item);
3041 item_size = btrfs_item_size(leaf, item); 3100 item_size = btrfs_item_size(leaf, item);
3042 3101
3043
3044 buf = kmalloc(item_size, GFP_NOFS); 3102 buf = kmalloc(item_size, GFP_NOFS);
3045 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3103 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046 path->slots[0]), item_size); 3104 path->slots[0]), item_size);
@@ -3445,39 +3503,27 @@ out:
3445} 3503}
3446 3504
3447/* 3505/*
3448 * Given a key and some data, insert items into the tree. 3506 * this is a helper for btrfs_insert_empty_items, the main goal here is
3449 * This does all the path init required, making room in the tree if needed. 3507 * to save stack depth by doing the bulk of the work in a function
3508 * that doesn't call btrfs_search_slot
3450 */ 3509 */
3451int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3510static noinline_for_stack int
3452 struct btrfs_root *root, 3511setup_items_for_insert(struct btrfs_trans_handle *trans,
3453 struct btrfs_path *path, 3512 struct btrfs_root *root, struct btrfs_path *path,
3454 struct btrfs_key *cpu_key, u32 *data_size, 3513 struct btrfs_key *cpu_key, u32 *data_size,
3455 int nr) 3514 u32 total_data, u32 total_size, int nr)
3456{ 3515{
3457 struct extent_buffer *leaf;
3458 struct btrfs_item *item; 3516 struct btrfs_item *item;
3459 int ret = 0;
3460 int slot;
3461 int slot_orig;
3462 int i; 3517 int i;
3463 u32 nritems; 3518 u32 nritems;
3464 u32 total_size = 0;
3465 u32 total_data = 0;
3466 unsigned int data_end; 3519 unsigned int data_end;
3467 struct btrfs_disk_key disk_key; 3520 struct btrfs_disk_key disk_key;
3521 int ret;
3522 struct extent_buffer *leaf;
3523 int slot;
3468 3524
3469 for (i = 0; i < nr; i++)
3470 total_data += data_size[i];
3471
3472 total_size = total_data + (nr * sizeof(struct btrfs_item));
3473 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3474 if (ret == 0)
3475 return -EEXIST;
3476 if (ret < 0)
3477 goto out;
3478
3479 slot_orig = path->slots[0];
3480 leaf = path->nodes[0]; 3525 leaf = path->nodes[0];
3526 slot = path->slots[0];
3481 3527
3482 nritems = btrfs_header_nritems(leaf); 3528 nritems = btrfs_header_nritems(leaf);
3483 data_end = leaf_data_end(root, leaf); 3529 data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3535,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3489 BUG(); 3535 BUG();
3490 } 3536 }
3491 3537
3492 slot = path->slots[0];
3493 BUG_ON(slot < 0);
3494
3495 if (slot != nritems) { 3538 if (slot != nritems) {
3496 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3539 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3497 3540
@@ -3547,21 +3590,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3547 data_end -= data_size[i]; 3590 data_end -= data_size[i];
3548 btrfs_set_item_size(leaf, item, data_size[i]); 3591 btrfs_set_item_size(leaf, item, data_size[i]);
3549 } 3592 }
3593
3550 btrfs_set_header_nritems(leaf, nritems + nr); 3594 btrfs_set_header_nritems(leaf, nritems + nr);
3551 btrfs_mark_buffer_dirty(leaf);
3552 3595
3553 ret = 0; 3596 ret = 0;
3554 if (slot == 0) { 3597 if (slot == 0) {
3598 struct btrfs_disk_key disk_key;
3555 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3599 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3556 ret = fixup_low_keys(trans, root, path, &disk_key, 1); 3600 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3557 } 3601 }
3602 btrfs_unlock_up_safe(path, 1);
3603 btrfs_mark_buffer_dirty(leaf);
3558 3604
3559 if (btrfs_leaf_free_space(root, leaf) < 0) { 3605 if (btrfs_leaf_free_space(root, leaf) < 0) {
3560 btrfs_print_leaf(root, leaf); 3606 btrfs_print_leaf(root, leaf);
3561 BUG(); 3607 BUG();
3562 } 3608 }
3609 return ret;
3610}
3611
3612/*
3613 * Given a key and some data, insert items into the tree.
3614 * This does all the path init required, making room in the tree if needed.
3615 */
3616int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3617 struct btrfs_root *root,
3618 struct btrfs_path *path,
3619 struct btrfs_key *cpu_key, u32 *data_size,
3620 int nr)
3621{
3622 struct extent_buffer *leaf;
3623 int ret = 0;
3624 int slot;
3625 int i;
3626 u32 total_size = 0;
3627 u32 total_data = 0;
3628
3629 for (i = 0; i < nr; i++)
3630 total_data += data_size[i];
3631
3632 total_size = total_data + (nr * sizeof(struct btrfs_item));
3633 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3634 if (ret == 0)
3635 return -EEXIST;
3636 if (ret < 0)
3637 goto out;
3638
3639 leaf = path->nodes[0];
3640 slot = path->slots[0];
3641 BUG_ON(slot < 0);
3642
3643 ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3644 total_data, total_size, nr);
3645
3563out: 3646out:
3564 btrfs_unlock_up_safe(path, 1);
3565 return ret; 3647 return ret;
3566} 3648}
3567 3649
@@ -3749,7 +3831,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3749 } 3831 }
3750 3832
3751 /* delete the leaf if it is mostly empty */ 3833 /* delete the leaf if it is mostly empty */
3752 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { 3834 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3835 !trans->transaction->delayed_refs.flushing) {
3753 /* push_leaf_left fixes the path. 3836 /* push_leaf_left fixes the path.
3754 * make sure the path still points to our leaf 3837 * make sure the path still points to our leaf
3755 * for possible call to del_ptr below 3838 * for possible call to del_ptr below
@@ -3757,6 +3840,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3757 slot = path->slots[1]; 3840 slot = path->slots[1];
3758 extent_buffer_get(leaf); 3841 extent_buffer_get(leaf);
3759 3842
3843 btrfs_set_path_blocking(path);
3760 wret = push_leaf_left(trans, root, path, 1, 1); 3844 wret = push_leaf_left(trans, root, path, 1, 1);
3761 if (wret < 0 && wret != -ENOSPC) 3845 if (wret < 0 && wret != -ENOSPC)
3762 ret = wret; 3846 ret = wret;
@@ -4042,28 +4126,44 @@ next:
4042int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4126int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4043{ 4127{
4044 int slot; 4128 int slot;
4045 int level = 1; 4129 int level;
4046 struct extent_buffer *c; 4130 struct extent_buffer *c;
4047 struct extent_buffer *next = NULL; 4131 struct extent_buffer *next;
4048 struct btrfs_key key; 4132 struct btrfs_key key;
4049 u32 nritems; 4133 u32 nritems;
4050 int ret; 4134 int ret;
4135 int old_spinning = path->leave_spinning;
4136 int force_blocking = 0;
4051 4137
4052 nritems = btrfs_header_nritems(path->nodes[0]); 4138 nritems = btrfs_header_nritems(path->nodes[0]);
4053 if (nritems == 0) 4139 if (nritems == 0)
4054 return 1; 4140 return 1;
4055 4141
4056 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4142 /*
4143 * we take the blocks in an order that upsets lockdep. Using
4144 * blocking mode is the only way around it.
4145 */
4146#ifdef CONFIG_DEBUG_LOCK_ALLOC
4147 force_blocking = 1;
4148#endif
4057 4149
4150 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4151again:
4152 level = 1;
4153 next = NULL;
4058 btrfs_release_path(root, path); 4154 btrfs_release_path(root, path);
4155
4059 path->keep_locks = 1; 4156 path->keep_locks = 1;
4157
4158 if (!force_blocking)
4159 path->leave_spinning = 1;
4160
4060 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4161 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4061 path->keep_locks = 0; 4162 path->keep_locks = 0;
4062 4163
4063 if (ret < 0) 4164 if (ret < 0)
4064 return ret; 4165 return ret;
4065 4166
4066 btrfs_set_path_blocking(path);
4067 nritems = btrfs_header_nritems(path->nodes[0]); 4167 nritems = btrfs_header_nritems(path->nodes[0]);
4068 /* 4168 /*
4069 * by releasing the path above we dropped all our locks. A balance 4169 * by releasing the path above we dropped all our locks. A balance
@@ -4073,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4073 */ 4173 */
4074 if (nritems > 0 && path->slots[0] < nritems - 1) { 4174 if (nritems > 0 && path->slots[0] < nritems - 1) {
4075 path->slots[0]++; 4175 path->slots[0]++;
4176 ret = 0;
4076 goto done; 4177 goto done;
4077 } 4178 }
4078 4179
4079 while (level < BTRFS_MAX_LEVEL) { 4180 while (level < BTRFS_MAX_LEVEL) {
4080 if (!path->nodes[level]) 4181 if (!path->nodes[level]) {
4081 return 1; 4182 ret = 1;
4183 goto done;
4184 }
4082 4185
4083 slot = path->slots[level] + 1; 4186 slot = path->slots[level] + 1;
4084 c = path->nodes[level]; 4187 c = path->nodes[level];
4085 if (slot >= btrfs_header_nritems(c)) { 4188 if (slot >= btrfs_header_nritems(c)) {
4086 level++; 4189 level++;
4087 if (level == BTRFS_MAX_LEVEL) 4190 if (level == BTRFS_MAX_LEVEL) {
4088 return 1; 4191 ret = 1;
4192 goto done;
4193 }
4089 continue; 4194 continue;
4090 } 4195 }
4091 4196
@@ -4094,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4094 free_extent_buffer(next); 4199 free_extent_buffer(next);
4095 } 4200 }
4096 4201
4097 /* the path was set to blocking above */ 4202 next = c;
4098 if (level == 1 && (path->locks[1] || path->skip_locking) && 4203 ret = read_block_for_search(NULL, root, path, &next, level,
4099 path->reada) 4204 slot, &key);
4100 reada_for_search(root, path, level, slot, 0); 4205 if (ret == -EAGAIN)
4206 goto again;
4101 4207
4102 next = read_node_slot(root, c, slot);
4103 if (!path->skip_locking) { 4208 if (!path->skip_locking) {
4104 btrfs_assert_tree_locked(c); 4209 ret = btrfs_try_spin_lock(next);
4105 btrfs_tree_lock(next); 4210 if (!ret) {
4106 btrfs_set_lock_blocking(next); 4211 btrfs_set_path_blocking(path);
4212 btrfs_tree_lock(next);
4213 if (!force_blocking)
4214 btrfs_clear_path_blocking(path, next);
4215 }
4216 if (force_blocking)
4217 btrfs_set_lock_blocking(next);
4107 } 4218 }
4108 break; 4219 break;
4109 } 4220 }
@@ -4113,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4113 c = path->nodes[level]; 4224 c = path->nodes[level];
4114 if (path->locks[level]) 4225 if (path->locks[level])
4115 btrfs_tree_unlock(c); 4226 btrfs_tree_unlock(c);
4227
4116 free_extent_buffer(c); 4228 free_extent_buffer(c);
4117 path->nodes[level] = next; 4229 path->nodes[level] = next;
4118 path->slots[level] = 0; 4230 path->slots[level] = 0;
4119 if (!path->skip_locking) 4231 if (!path->skip_locking)
4120 path->locks[level] = 1; 4232 path->locks[level] = 1;
4233
4121 if (!level) 4234 if (!level)
4122 break; 4235 break;
4123 4236
4124 btrfs_set_path_blocking(path); 4237 ret = read_block_for_search(NULL, root, path, &next, level,
4125 if (level == 1 && path->locks[1] && path->reada) 4238 0, &key);
4126 reada_for_search(root, path, level, slot, 0); 4239 if (ret == -EAGAIN)
4127 next = read_node_slot(root, next, 0); 4240 goto again;
4241
4128 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4129 btrfs_assert_tree_locked(path->nodes[level]); 4243 btrfs_assert_tree_locked(path->nodes[level]);
4130 btrfs_tree_lock(next); 4244 ret = btrfs_try_spin_lock(next);
4131 btrfs_set_lock_blocking(next); 4245 if (!ret) {
4246 btrfs_set_path_blocking(path);
4247 btrfs_tree_lock(next);
4248 if (!force_blocking)
4249 btrfs_clear_path_blocking(path, next);
4250 }
4251 if (force_blocking)
4252 btrfs_set_lock_blocking(next);
4132 } 4253 }
4133 } 4254 }
4255 ret = 0;
4134done: 4256done:
4135 unlock_up(path, 0, 1); 4257 unlock_up(path, 0, 1);
4136 return 0; 4258 path->leave_spinning = old_spinning;
4259 if (!old_spinning)
4260 btrfs_set_path_blocking(path);
4261
4262 return ret;
4137} 4263}
4138 4264
4139/* 4265/*
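[Editor's note] btrfs_next_leaf now tries btrfs_try_spin_lock() first and only sets the path blocking and takes the full tree lock when the try fails (or when lockdep forces blocking mode). The shape of that "try the cheap lock, fall back to the expensive one" idiom, as a userspace pthread sketch rather than the btrfs locking API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t next_leaf_lock = PTHREAD_MUTEX_INITIALIZER;

/* Try to take the lock without sleeping; if someone else holds it,
 * fall back to the blocking acquire, analogous to setting the path
 * blocking and calling btrfs_tree_lock() in the hunks above. */
static void lock_next_leaf(void)
{
        if (pthread_mutex_trylock(&next_leaf_lock) == 0) {
                printf("got the lock on the fast path\n");
                return;
        }
        printf("contended, taking the slow blocking path\n");
        pthread_mutex_lock(&next_leaf_lock);
}

int main(void)
{
        lock_next_leaf();
        pthread_mutex_unlock(&next_leaf_lock);
        return 0;
}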
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d8..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -136,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
136#define BTRFS_FT_MAX 9 143#define BTRFS_FT_MAX 9
137 144
138/* 145/*
139 * the key defines the order in the tree, and so it also defines (optimal) 146 * The key defines the order in the tree, and so it also defines (optimal)
140 * block layout. objectid corresonds to the inode number. The flags 147 * block layout.
141 * tells us things about the object, and is a kind of stream selector. 148 *
142 * so for a given inode, keys with flags of 1 might refer to the inode 149 * objectid corresponds to the inode number.
143 * data, flags of 2 may point to file data in the btree and flags == 3 150 *
144 * may point to extents. 151 * type tells us things about the object, and is a kind of stream selector.
152 * so for a given inode, keys with type of 1 might refer to the inode data,
153 * type of 2 may point to file data in the btree and type == 3 may point to
154 * extents.
145 * 155 *
146 * offset is the starting byte offset for this key in the stream. 156 * offset is the starting byte offset for this key in the stream.
147 * 157 *
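The reworded comment above describes how the (objectid, type, offset) ordering groups all the items of one inode together on disk. A small userspace sketch of that comparison, with made-up names (toy_key, toy_key_cmp) and purely illustrative type values:

#include <stdio.h>
#include <stdint.h>

struct toy_key {
	uint64_t objectid;	/* inode number */
	uint8_t  type;		/* stream selector: inode item, file extent, ... */
	uint64_t offset;	/* byte offset within that stream */
};

static int toy_key_cmp(const struct toy_key *a, const struct toy_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

int main(void)
{
	/* every item of inode 257 sorts together, ordered by type then offset */
	struct toy_key inode_item  = { 257,   1, 0 };
	struct toy_key file_extent = { 257, 108, 0 };
	struct toy_key next_extent = { 257, 108, 4096 };

	printf("%d %d\n", toy_key_cmp(&inode_item, &file_extent),
	       toy_key_cmp(&file_extent, &next_extent));	/* prints: -1 -1 */
	return 0;
}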
@@ -193,7 +203,7 @@ struct btrfs_dev_item {
193 203
194 /* 204 /*
195 * starting byte of this partition on the device, 205 * starting byte of this partition on the device,
196 * to allowr for stripe alignment in the future 206 * to allow for stripe alignment in the future
197 */ 207 */
198 __le64 start_offset; 208 __le64 start_offset;
199 209
@@ -401,15 +411,16 @@ struct btrfs_path {
401 int locks[BTRFS_MAX_LEVEL]; 411 int locks[BTRFS_MAX_LEVEL];
402 int reada; 412 int reada;
403 /* keep some upper locks as we walk down */ 413 /* keep some upper locks as we walk down */
404 int keep_locks;
405 int skip_locking;
406 int lowest_level; 414 int lowest_level;
407 415
408 /* 416 /*
409 * set by btrfs_split_item, tells search_slot to keep all locks 417 * set by btrfs_split_item, tells search_slot to keep all locks
410 * and to force calls to keep space in the nodes 418 * and to force calls to keep space in the nodes
411 */ 419 */
412 int search_for_split; 420 unsigned int search_for_split:1;
421 unsigned int keep_locks:1;
422 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1;
413}; 424};
414 425
415/* 426/*
@@ -625,18 +636,35 @@ struct btrfs_space_info {
625 struct rw_semaphore groups_sem; 636 struct rw_semaphore groups_sem;
626}; 637};
627 638
628struct btrfs_free_space { 639/*
629 struct rb_node bytes_index; 640 * free clusters are used to claim free space in relatively large chunks,
630 struct rb_node offset_index; 641 * allowing us to do less seeky writes. They are used for all metadata
631 u64 offset; 642 * allocations and data allocations in ssd mode.
632 u64 bytes; 643 */
644struct btrfs_free_cluster {
645 spinlock_t lock;
646 spinlock_t refill_lock;
647 struct rb_root root;
648
649 /* largest extent in this cluster */
650 u64 max_size;
651
652 /* first extent starting offset */
653 u64 window_start;
654
655 struct btrfs_block_group_cache *block_group;
656 /*
657 * when a cluster is allocated from a block group, we put the
658 * cluster onto a list in the block group so that it can
659 * be freed before the block group is freed.
660 */
661 struct list_head block_group_list;
633}; 662};
634 663
635struct btrfs_block_group_cache { 664struct btrfs_block_group_cache {
636 struct btrfs_key key; 665 struct btrfs_key key;
637 struct btrfs_block_group_item item; 666 struct btrfs_block_group_item item;
638 spinlock_t lock; 667 spinlock_t lock;
639 struct mutex alloc_mutex;
640 struct mutex cache_mutex; 668 struct mutex cache_mutex;
641 u64 pinned; 669 u64 pinned;
642 u64 reserved; 670 u64 reserved;
@@ -648,6 +676,7 @@ struct btrfs_block_group_cache {
648 struct btrfs_space_info *space_info; 676 struct btrfs_space_info *space_info;
649 677
650 /* free space cache stuff */ 678 /* free space cache stuff */
679 spinlock_t tree_lock;
651 struct rb_root free_space_bytes; 680 struct rb_root free_space_bytes;
652 struct rb_root free_space_offset; 681 struct rb_root free_space_offset;
653 682
@@ -659,6 +688,11 @@ struct btrfs_block_group_cache {
659 688
660 /* usage count */ 689 /* usage count */
661 atomic_t count; 690 atomic_t count;
691
692 /* List of struct btrfs_free_clusters for this block group.
693 * Today it will only have one thing on it, but that may change
694 */
695 struct list_head cluster_list;
662}; 696};
663 697
664struct btrfs_leaf_ref_tree { 698struct btrfs_leaf_ref_tree {
@@ -688,15 +722,18 @@ struct btrfs_fs_info {
688 struct rb_root block_group_cache_tree; 722 struct rb_root block_group_cache_tree;
689 723
690 struct extent_io_tree pinned_extents; 724 struct extent_io_tree pinned_extents;
691 struct extent_io_tree pending_del;
692 struct extent_io_tree extent_ins;
693 725
694 /* logical->physical extent mapping */ 726 /* logical->physical extent mapping */
695 struct btrfs_mapping_tree mapping_tree; 727 struct btrfs_mapping_tree mapping_tree;
696 728
697 u64 generation; 729 u64 generation;
698 u64 last_trans_committed; 730 u64 last_trans_committed;
699 u64 last_trans_new_blockgroup; 731
732 /*
733 * this is updated to the current trans every time a full commit
734 * is required instead of the faster short fsync log commits
735 */
736 u64 last_trans_log_full_commit;
700 u64 open_ioctl_trans; 737 u64 open_ioctl_trans;
701 unsigned long mount_opt; 738 unsigned long mount_opt;
702 u64 max_extent; 739 u64 max_extent;
@@ -717,12 +754,20 @@ struct btrfs_fs_info {
717 struct mutex tree_log_mutex; 754 struct mutex tree_log_mutex;
718 struct mutex transaction_kthread_mutex; 755 struct mutex transaction_kthread_mutex;
719 struct mutex cleaner_mutex; 756 struct mutex cleaner_mutex;
720 struct mutex extent_ins_mutex;
721 struct mutex pinned_mutex;
722 struct mutex chunk_mutex; 757 struct mutex chunk_mutex;
723 struct mutex drop_mutex; 758 struct mutex drop_mutex;
724 struct mutex volume_mutex; 759 struct mutex volume_mutex;
725 struct mutex tree_reloc_mutex; 760 struct mutex tree_reloc_mutex;
761
762 /*
763 * this protects the ordered operations list only while we are
764 * processing all of the entries on it. This way we make
765 * sure the commit code doesn't find the list temporarily empty
766 * because another function happens to be doing non-waiting preflush
767 * before jumping into the main commit.
768 */
769 struct mutex ordered_operations_mutex;
770
726 struct list_head trans_list; 771 struct list_head trans_list;
727 struct list_head hashers; 772 struct list_head hashers;
728 struct list_head dead_roots; 773 struct list_head dead_roots;
@@ -737,10 +782,29 @@ struct btrfs_fs_info {
737 * ordered extents 782 * ordered extents
738 */ 783 */
739 spinlock_t ordered_extent_lock; 784 spinlock_t ordered_extent_lock;
785
786 /*
787 * all of the data=ordered extents pending writeback
788 * these can span multiple transactions and basically include
789 * every dirty data page that isn't from nodatacow
790 */
740 struct list_head ordered_extents; 791 struct list_head ordered_extents;
792
793 /*
794 * all of the inodes that have delalloc bytes. It is possible for
795 * this list to be empty even when there is still dirty data=ordered
796 * extents waiting to finish IO.
797 */
741 struct list_head delalloc_inodes; 798 struct list_head delalloc_inodes;
742 799
743 /* 800 /*
801 * special rename and truncate targets that must be on disk before
802 * we're allowed to commit. This is basically the ext3 style
803 * data=ordered list.
804 */
805 struct list_head ordered_operations;
806
807 /*
744 * there is a pool of worker threads for checksumming during writes 808 * there is a pool of worker threads for checksumming during writes
745 * and a pool for checksumming after reads. This is because readers 809 * and a pool for checksumming after reads. This is because readers
746 * can run with FS locks held, and the writers may be waiting for 810 * can run with FS locks held, and the writers may be waiting for
@@ -781,6 +845,11 @@ struct btrfs_fs_info {
781 atomic_t throttle_gen; 845 atomic_t throttle_gen;
782 846
783 u64 total_pinned; 847 u64 total_pinned;
848
849 /* protected by the delalloc lock, used to keep from writing
850 * metadata until there is a nice batch
851 */
852 u64 dirty_metadata_bytes;
784 struct list_head dirty_cowonly_roots; 853 struct list_head dirty_cowonly_roots;
785 854
786 struct btrfs_fs_devices *fs_devices; 855 struct btrfs_fs_devices *fs_devices;
@@ -795,8 +864,12 @@ struct btrfs_fs_info {
795 spinlock_t delalloc_lock; 864 spinlock_t delalloc_lock;
796 spinlock_t new_trans_lock; 865 spinlock_t new_trans_lock;
797 u64 delalloc_bytes; 866 u64 delalloc_bytes;
798 u64 last_alloc; 867
799 u64 last_data_alloc; 868 /* data_alloc_cluster is only used in ssd mode */
869 struct btrfs_free_cluster data_alloc_cluster;
870
871 /* all metadata allocations go through this cluster */
872 struct btrfs_free_cluster meta_alloc_cluster;
800 873
801 spinlock_t ref_cache_lock; 874 spinlock_t ref_cache_lock;
802 u64 total_ref_cache_size; 875 u64 total_ref_cache_size;
@@ -888,7 +961,6 @@ struct btrfs_root {
888}; 961};
889 962
890/* 963/*
891
892 * inode items have the data typically returned from stat and store other 964 * inode items have the data typically returned from stat and store other
893 * info about object characteristics. There is one for every file and dir in 965 * info about object characteristics. There is one for every file and dir in
894 * the FS 966 * the FS
@@ -919,7 +991,7 @@ struct btrfs_root {
919#define BTRFS_EXTENT_CSUM_KEY 128 991#define BTRFS_EXTENT_CSUM_KEY 128
920 992
921/* 993/*
922 * root items point to tree roots. There are typically in the root 994 * root items point to tree roots. They are typically in the root
923 * tree used by the super block to find all the other trees 995 * tree used by the super block to find all the other trees
924 */ 996 */
925#define BTRFS_ROOT_ITEM_KEY 132 997#define BTRFS_ROOT_ITEM_KEY 132
@@ -966,6 +1038,8 @@ struct btrfs_root {
966#define BTRFS_MOUNT_SSD (1 << 3) 1038#define BTRFS_MOUNT_SSD (1 << 3)
967#define BTRFS_MOUNT_DEGRADED (1 << 4) 1039#define BTRFS_MOUNT_DEGRADED (1 << 4)
968#define BTRFS_MOUNT_COMPRESS (1 << 5) 1040#define BTRFS_MOUNT_COMPRESS (1 << 5)
1041#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1042#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
969 1043
970#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1044#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
971#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1045#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1704,18 +1778,16 @@ static inline struct dentry *fdentry(struct file *file)
1704} 1778}
1705 1779
1706/* extent-tree.c */ 1780/* extent-tree.c */
1781void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1782int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1783 struct btrfs_root *root, unsigned long count);
1707int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1784int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1708int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1709 struct btrfs_root *root, u64 bytenr,
1710 u64 num_bytes, u32 *refs);
1711int btrfs_update_pinned_extents(struct btrfs_root *root, 1785int btrfs_update_pinned_extents(struct btrfs_root *root,
1712 u64 bytenr, u64 num, int pin); 1786 u64 bytenr, u64 num, int pin);
1713int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1787int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1714 struct btrfs_root *root, struct extent_buffer *leaf); 1788 struct btrfs_root *root, struct extent_buffer *leaf);
1715int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1789int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1716 struct btrfs_root *root, u64 objectid, u64 bytenr); 1790 struct btrfs_root *root, u64 objectid, u64 bytenr);
1717int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1718 struct btrfs_root *root);
1719int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1791int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1720struct btrfs_block_group_cache *btrfs_lookup_block_group( 1792struct btrfs_block_group_cache *btrfs_lookup_block_group(
1721 struct btrfs_fs_info *info, 1793 struct btrfs_fs_info *info,
@@ -1777,7 +1849,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1777 u64 root_objectid, u64 ref_generation, 1849 u64 root_objectid, u64 ref_generation,
1778 u64 owner_objectid); 1850 u64 owner_objectid);
1779int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1851int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1780 struct btrfs_root *root, u64 bytenr, 1852 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1781 u64 orig_parent, u64 parent, 1853 u64 orig_parent, u64 parent,
1782 u64 root_objectid, u64 ref_generation, 1854 u64 root_objectid, u64 ref_generation,
1783 u64 owner_objectid); 1855 u64 owner_objectid);
@@ -1838,7 +1910,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1838int btrfs_cow_block(struct btrfs_trans_handle *trans, 1910int btrfs_cow_block(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root, struct extent_buffer *buf, 1911 struct btrfs_root *root, struct extent_buffer *buf,
1840 struct extent_buffer *parent, int parent_slot, 1912 struct extent_buffer *parent, int parent_slot,
1841 struct extent_buffer **cow_ret, u64 prealloc_dest); 1913 struct extent_buffer **cow_ret);
1842int btrfs_copy_root(struct btrfs_trans_handle *trans, 1914int btrfs_copy_root(struct btrfs_trans_handle *trans,
1843 struct btrfs_root *root, 1915 struct btrfs_root *root,
1844 struct extent_buffer *buf, 1916 struct extent_buffer *buf,
@@ -2060,7 +2132,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2060unsigned long btrfs_force_ra(struct address_space *mapping, 2132unsigned long btrfs_force_ra(struct address_space *mapping,
2061 struct file_ra_state *ra, struct file *file, 2133 struct file_ra_state *ra, struct file *file,
2062 pgoff_t offset, pgoff_t last_index); 2134 pgoff_t offset, pgoff_t last_index);
2063int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2135int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2064int btrfs_readpage(struct file *file, struct page *page); 2136int btrfs_readpage(struct file *file, struct page *page);
2065void btrfs_delete_inode(struct inode *inode); 2137void btrfs_delete_inode(struct inode *inode);
2066void btrfs_put_inode(struct inode *inode); 2138void btrfs_put_inode(struct inode *inode);
@@ -2133,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
2133int btrfs_init_acl(struct inode *inode, struct inode *dir); 2205int btrfs_init_acl(struct inode *inode, struct inode *dir);
2134int btrfs_acl_chmod(struct inode *inode); 2206int btrfs_acl_chmod(struct inode *inode);
2135 2207
2136/* free-space-cache.c */
2137int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2138 u64 bytenr, u64 size);
2139int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2140 u64 offset, u64 bytes);
2141int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2142 u64 bytenr, u64 size);
2143int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2144 u64 offset, u64 bytes);
2145void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2146 *block_group);
2147struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2148 *block_group, u64 offset,
2149 u64 bytes);
2150void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2151 u64 bytes);
2152u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2153#endif 2208#endif
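One of the larger additions in this header is struct btrfs_free_cluster. As its comment says, a cluster claims free space in relatively large chunks so that many small metadata (or ssd-mode data) allocations become sequential rather than seeky writes. A rough userspace model of that idea, with invented names (toy_cluster, toy_refill, toy_alloc) and arbitrary sizes:

#include <stdio.h>
#include <stdint.h>

struct toy_cluster {
	uint64_t window_start;	/* next free byte inside the claimed window */
	uint64_t window_end;	/* end of the claimed window */
};

/* claim a large chunk once; stands in for refilling from the free space cache */
static void toy_refill(struct toy_cluster *c, uint64_t start, uint64_t len)
{
	c->window_start = start;
	c->window_end = start + len;
}

/* hand out allocations back to back so the resulting writes stay sequential */
static int toy_alloc(struct toy_cluster *c, uint64_t bytes, uint64_t *ret_start)
{
	if (c->window_start + bytes > c->window_end)
		return -1;	/* exhausted; a real caller would refill */
	*ret_start = c->window_start;
	c->window_start += bytes;
	return 0;
}

int main(void)
{
	struct toy_cluster c;
	uint64_t start;
	int i;

	toy_refill(&c, 1ULL << 20, 8ULL << 20);	/* an 8MB window at 1MB */
	for (i = 0; i < 3; i++)
		if (toy_alloc(&c, 16384, &start) == 0)
			printf("alloc %d at %llu\n", i,
			       (unsigned long long)start);
	return 0;
}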
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include "ctree.h"
22#include "delayed-ref.h"
23#include "transaction.h"
24
25/*
26 * delayed back reference update tracking. For subvolume trees
27 * we queue up extent allocations and backref maintenance for
28 * delayed processing. This avoids deep call chains where we
29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */
37
38/*
39 * entries in the rb tree are ordered by the byte number of the extent
40 * and by the byte number of the parent block.
41 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref,
43 u64 bytenr, u64 parent)
44{
45 if (bytenr < ref->bytenr)
46 return -1;
47 if (bytenr > ref->bytenr)
48 return 1;
49 if (parent < ref->parent)
50 return -1;
51 if (parent > ref->parent)
52 return 1;
53 return 0;
54}
55
56/*
57 * insert a new ref into the rbtree. This returns any existing refs
58 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
59 * inserted.
60 */
61static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
62 u64 bytenr, u64 parent,
63 struct rb_node *node)
64{
65 struct rb_node **p = &root->rb_node;
66 struct rb_node *parent_node = NULL;
67 struct btrfs_delayed_ref_node *entry;
68 int cmp;
69
70 while (*p) {
71 parent_node = *p;
72 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
73 rb_node);
74
75 cmp = comp_entry(entry, bytenr, parent);
76 if (cmp < 0)
77 p = &(*p)->rb_left;
78 else if (cmp > 0)
79 p = &(*p)->rb_right;
80 else
81 return entry;
82 }
83
84 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
85 rb_link_node(node, parent_node, p);
86 rb_insert_color(node, root);
87 return NULL;
88}
89
90/*
91 * find an entry based on (bytenr,parent). This returns the delayed
92 * ref if it was able to find one, or NULL if nothing was in that spot
93 */
94static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
95 u64 bytenr, u64 parent,
96 struct btrfs_delayed_ref_node **last)
97{
98 struct rb_node *n = root->rb_node;
99 struct btrfs_delayed_ref_node *entry;
100 int cmp;
101
102 while (n) {
103 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
104 WARN_ON(!entry->in_tree);
105 if (last)
106 *last = entry;
107
108 cmp = comp_entry(entry, bytenr, parent);
109 if (cmp < 0)
110 n = n->rb_left;
111 else if (cmp > 0)
112 n = n->rb_right;
113 else
114 return entry;
115 }
116 return NULL;
117}
118
119int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
120 struct btrfs_delayed_ref_head *head)
121{
122 struct btrfs_delayed_ref_root *delayed_refs;
123
124 delayed_refs = &trans->transaction->delayed_refs;
125 assert_spin_locked(&delayed_refs->lock);
126 if (mutex_trylock(&head->mutex))
127 return 0;
128
129 atomic_inc(&head->node.refs);
130 spin_unlock(&delayed_refs->lock);
131
132 mutex_lock(&head->mutex);
133 spin_lock(&delayed_refs->lock);
134 if (!head->node.in_tree) {
135 mutex_unlock(&head->mutex);
136 btrfs_put_delayed_ref(&head->node);
137 return -EAGAIN;
138 }
139 btrfs_put_delayed_ref(&head->node);
140 return 0;
141}
142
143int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
144 struct list_head *cluster, u64 start)
145{
146 int count = 0;
147 struct btrfs_delayed_ref_root *delayed_refs;
148 struct rb_node *node;
149 struct btrfs_delayed_ref_node *ref;
150 struct btrfs_delayed_ref_head *head;
151
152 delayed_refs = &trans->transaction->delayed_refs;
153 if (start == 0) {
154 node = rb_first(&delayed_refs->root);
155 } else {
156 ref = NULL;
157 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
158 if (ref) {
159 struct btrfs_delayed_ref_node *tmp;
160
161 node = rb_prev(&ref->rb_node);
162 while (node) {
163 tmp = rb_entry(node,
164 struct btrfs_delayed_ref_node,
165 rb_node);
166 if (tmp->bytenr < start)
167 break;
168 ref = tmp;
169 node = rb_prev(&ref->rb_node);
170 }
171 node = &ref->rb_node;
172 } else
173 node = rb_first(&delayed_refs->root);
174 }
175again:
176 while (node && count < 32) {
177 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
178 if (btrfs_delayed_ref_is_head(ref)) {
179 head = btrfs_delayed_node_to_head(ref);
180 if (list_empty(&head->cluster)) {
181 list_add_tail(&head->cluster, cluster);
182 delayed_refs->run_delayed_start =
183 head->node.bytenr;
184 count++;
185
186 WARN_ON(delayed_refs->num_heads_ready == 0);
187 delayed_refs->num_heads_ready--;
188 } else if (count) {
189 /* the goal of the clustering is to find extents
190 * that are likely to end up in the same extent
191 * leaf on disk. So, we don't want them spread
192 * all over the tree. Stop now if we've hit
193 * a head that was already in use
194 */
195 break;
196 }
197 }
198 node = rb_next(node);
199 }
200 if (count) {
201 return 0;
202 } else if (start) {
203 /*
204 * we've gone to the end of the rbtree without finding any
205 * clusters. start from the beginning and try again
206 */
207 start = 0;
208 node = rb_first(&delayed_refs->root);
209 goto again;
210 }
211 return 1;
212}
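The cluster-gathering loop above scans forward from run_delayed_start, stops after a small batch of heads, and retries from the start of the rbtree only if nothing was found past the cursor. A simplified model of that cursor-and-wrap scan, using an array instead of an rbtree and invented names:

#include <stdio.h>

#define NR_HEADS 10
#define BATCH    4

/* pick up to 'batch' ready heads starting at 'start' */
static int gather(int *ready, int n, int start, int batch)
{
	int i, count = 0;

	for (i = start; i < n && count < batch; i++) {
		if (!ready[i])
			continue;
		printf("picked head %d\n", i);
		ready[i] = 0;
		count++;
	}
	return count;
}

int main(void)
{
	int ready[NR_HEADS] = { 0, 1, 1, 0, 0, 0, 1, 0, 0, 0 };
	int start = 8;

	/* scan forward from the cursor; if that finds nothing, wrap to the start */
	if (gather(ready, NR_HEADS, start, BATCH) == 0)
		gather(ready, NR_HEADS, 0, BATCH);
	return 0;
}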
213
214/*
215 * This checks to see if there are any delayed refs in the
216 * btree for a given bytenr. It returns one if it finds any
217 * and zero otherwise.
218 *
219 * If it only finds a head node, it returns 0.
220 *
221 * The idea is to use this when deciding if you can safely delete an
222 * extent from the extent allocation tree. There may be a pending
223 * ref in the rbtree that adds or removes references, so as long as this
224 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
225 * allocation tree.
226 */
227int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
228{
229 struct btrfs_delayed_ref_node *ref;
230 struct btrfs_delayed_ref_root *delayed_refs;
231 struct rb_node *prev_node;
232 int ret = 0;
233
234 delayed_refs = &trans->transaction->delayed_refs;
235 spin_lock(&delayed_refs->lock);
236
237 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
238 if (ref) {
239 prev_node = rb_prev(&ref->rb_node);
240 if (!prev_node)
241 goto out;
242 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
243 rb_node);
244 if (ref->bytenr == bytenr)
245 ret = 1;
246 }
247out:
248 spin_unlock(&delayed_refs->lock);
249 return ret;
250}
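btrfs_delayed_ref_pending() leans on the fact that head nodes use parent == (u64)-1, the largest possible parent, so a head always sorts after every real ref with the same bytenr; stepping back one entry from the head tells you whether any real refs are queued. A standalone sketch of that ordering, where toy_ref and toy_cmp are illustrative and a sorted array stands in for the rbtree:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_ref {
	uint64_t bytenr;
	uint64_t parent;	/* UINT64_MAX marks a head node */
};

static int toy_cmp(const void *a, const void *b)
{
	const struct toy_ref *x = a, *y = b;

	if (x->bytenr != y->bytenr)
		return x->bytenr < y->bytenr ? -1 : 1;
	if (x->parent != y->parent)
		return x->parent < y->parent ? -1 : 1;
	return 0;
}

int main(void)
{
	struct toy_ref refs[] = {
		{  4096, UINT64_MAX },	/* head node for bytenr 4096 */
		{  4096, 8192 },	/* one real backref for bytenr 4096 */
		{ 12288, UINT64_MAX },	/* head with no real refs queued */
	};
	int i;

	qsort(refs, 3, sizeof(refs[0]), toy_cmp);

	/* the entry just before a head is a pending ref iff the bytenrs match */
	for (i = 0; i < 3; i++)
		printf("%llu %llu\n", (unsigned long long)refs[i].bytenr,
		       (unsigned long long)refs[i].parent);
	return 0;
}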
251
252/*
253 * helper function to lookup reference count
254 *
255 * the head node for a delayed ref is used to store the sum of all the

256 * reference count modifications queued up in the rbtree. This way you
257 * can check to see what the reference count would be if all of the
258 * delayed refs are processed.
259 */
260int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
261 struct btrfs_root *root, u64 bytenr,
262 u64 num_bytes, u32 *refs)
263{
264 struct btrfs_delayed_ref_node *ref;
265 struct btrfs_delayed_ref_head *head;
266 struct btrfs_delayed_ref_root *delayed_refs;
267 struct btrfs_path *path;
268 struct extent_buffer *leaf;
269 struct btrfs_extent_item *ei;
270 struct btrfs_key key;
271 u32 num_refs;
272 int ret;
273
274 path = btrfs_alloc_path();
275 if (!path)
276 return -ENOMEM;
277
278 key.objectid = bytenr;
279 key.type = BTRFS_EXTENT_ITEM_KEY;
280 key.offset = num_bytes;
281 delayed_refs = &trans->transaction->delayed_refs;
282again:
283 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
284 &key, path, 0, 0);
285 if (ret < 0)
286 goto out;
287
288 if (ret == 0) {
289 leaf = path->nodes[0];
290 ei = btrfs_item_ptr(leaf, path->slots[0],
291 struct btrfs_extent_item);
292 num_refs = btrfs_extent_refs(leaf, ei);
293 } else {
294 num_refs = 0;
295 ret = 0;
296 }
297
298 spin_lock(&delayed_refs->lock);
299 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
300 if (ref) {
301 head = btrfs_delayed_node_to_head(ref);
302 if (mutex_trylock(&head->mutex)) {
303 num_refs += ref->ref_mod;
304 mutex_unlock(&head->mutex);
305 *refs = num_refs;
306 goto out;
307 }
308
309 atomic_inc(&ref->refs);
310 spin_unlock(&delayed_refs->lock);
311
312 btrfs_release_path(root->fs_info->extent_root, path);
313
314 mutex_lock(&head->mutex);
315 mutex_unlock(&head->mutex);
316 btrfs_put_delayed_ref(ref);
317 goto again;
318 } else {
319 *refs = num_refs;
320 }
321out:
322 spin_unlock(&delayed_refs->lock);
323 btrfs_free_path(path);
324 return ret;
325}
326
327/*
328 * helper function to update an extent delayed ref in the
329 * rbtree. existing and update must both have the same
330 * bytenr and parent
331 *
332 * This may free existing if the update cancels out whatever
333 * operation it was doing.
334 */
335static noinline void
336update_existing_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_delayed_ref_root *delayed_refs,
338 struct btrfs_delayed_ref_node *existing,
339 struct btrfs_delayed_ref_node *update)
340{
341 struct btrfs_delayed_ref *existing_ref;
342 struct btrfs_delayed_ref *ref;
343
344 existing_ref = btrfs_delayed_node_to_ref(existing);
345 ref = btrfs_delayed_node_to_ref(update);
346
347 if (ref->pin)
348 existing_ref->pin = 1;
349
350 if (ref->action != existing_ref->action) {
351 /*
352 * this is effectively undoing either an add or a
353 * drop. We decrement the ref_mod, and if it goes
354 * down to zero we just delete the entry without
355 * ever changing the extent allocation tree.
356 */
357 existing->ref_mod--;
358 if (existing->ref_mod == 0) {
359 rb_erase(&existing->rb_node,
360 &delayed_refs->root);
361 existing->in_tree = 0;
362 btrfs_put_delayed_ref(existing);
363 delayed_refs->num_entries--;
364 if (trans->delayed_ref_updates)
365 trans->delayed_ref_updates--;
366 }
367 } else {
368 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
369 /* if we're adding refs, make sure all the
370 * details match up. The extent could
371 * have been totally freed and reallocated
372 * by a different owner before the delayed
373 * ref entries were removed.
374 */
375 existing_ref->owner_objectid = ref->owner_objectid;
376 existing_ref->generation = ref->generation;
377 existing_ref->root = ref->root;
378 existing->num_bytes = update->num_bytes;
379 }
380 /*
381 * the action on the existing ref matches
382 * the action on the ref we're trying to add.
383 * Bump the ref_mod by one so the backref that
384 * is eventually added/removed has the correct
385 * reference count
386 */
387 existing->ref_mod += update->ref_mod;
388 }
389}
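A compact model of the ref_mod merging performed above: queuing an add and a drop against the same (bytenr, parent) cancels the pending entry, while queuing the same action again just bumps ref_mod. The names below are invented and the rbtree bookkeeping is reduced to a flag:

#include <stdio.h>

#define TOY_ADD  1
#define TOY_DROP 2

struct toy_delayed {
	int action;
	int ref_mod;
	int in_tree;
};

/* merge a newly queued ref into an existing one for the same (bytenr, parent) */
static void toy_merge(struct toy_delayed *existing,
		      const struct toy_delayed *update)
{
	if (update->action != existing->action) {
		/* opposite operation: undo one queued modification */
		existing->ref_mod--;
		if (existing->ref_mod == 0)
			existing->in_tree = 0;	/* stands in for rb_erase + free */
	} else {
		/* same operation: just remember that we owe one more */
		existing->ref_mod += update->ref_mod;
	}
}

int main(void)
{
	struct toy_delayed queued = { TOY_ADD,  1, 1 };
	struct toy_delayed update = { TOY_DROP, 1, 1 };

	toy_merge(&queued, &update);	/* the add and the drop cancel out */
	printf("ref_mod=%d in_tree=%d\n", queued.ref_mod, queued.in_tree);
	return 0;
}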
390
391/*
392 * helper function to update the accounting in the head ref
393 * existing and update must have the same bytenr
394 */
395static noinline void
396update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
397 struct btrfs_delayed_ref_node *update)
398{
399 struct btrfs_delayed_ref_head *existing_ref;
400 struct btrfs_delayed_ref_head *ref;
401
402 existing_ref = btrfs_delayed_node_to_head(existing);
403 ref = btrfs_delayed_node_to_head(update);
404
405 if (ref->must_insert_reserved) {
406 /* if the extent was freed and then
407 * reallocated before the delayed ref
408 * entries were processed, we can end up
409 * with an existing head ref without
410 * the must_insert_reserved flag set.
411 * Set it again here
412 */
413 existing_ref->must_insert_reserved = ref->must_insert_reserved;
414
415 /*
416 * update the num_bytes so we make sure the accounting
417 * is done correctly
418 */
419 existing->num_bytes = update->num_bytes;
420
421 }
422
423 /*
424 * update the reference mod on the head to reflect this new operation
425 */
426 existing->ref_mod += update->ref_mod;
427}
428
429/*
430 * helper function to actually insert a delayed ref into the rbtree.
431 * this does all the dirty work in terms of maintaining the correct
432 * overall modification count in the head node and properly dealing
433 * with updating existing nodes as new modifications are queued.
434 */
435static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
436 struct btrfs_delayed_ref_node *ref,
437 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
438 u64 ref_generation, u64 owner_objectid, int action,
439 int pin)
440{
441 struct btrfs_delayed_ref_node *existing;
442 struct btrfs_delayed_ref *full_ref;
443 struct btrfs_delayed_ref_head *head_ref = NULL;
444 struct btrfs_delayed_ref_root *delayed_refs;
445 int count_mod = 1;
446 int must_insert_reserved = 0;
447
448 /*
449 * the head node stores the sum of all the mods, so dropping a ref
450 * should drop the sum in the head node by one.
451 */
452 if (parent == (u64)-1) {
453 if (action == BTRFS_DROP_DELAYED_REF)
454 count_mod = -1;
455 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
456 count_mod = 0;
457 }
458
459 /*
460 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
461 * the reserved accounting when the extent is finally added, or
462 * if a later modification deletes the delayed ref without ever
463 * inserting the extent into the extent allocation tree.
464 * ref->must_insert_reserved is the flag used to record
465 * that accounting mods are required.
466 *
467 * Once we record must_insert_reserved, switch the action to
468 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
469 */
470 if (action == BTRFS_ADD_DELAYED_EXTENT) {
471 must_insert_reserved = 1;
472 action = BTRFS_ADD_DELAYED_REF;
473 } else {
474 must_insert_reserved = 0;
475 }
476
477
478 delayed_refs = &trans->transaction->delayed_refs;
479
480 /* first set the basic ref node struct up */
481 atomic_set(&ref->refs, 1);
482 ref->bytenr = bytenr;
483 ref->parent = parent;
484 ref->ref_mod = count_mod;
485 ref->in_tree = 1;
486 ref->num_bytes = num_bytes;
487
488 if (btrfs_delayed_ref_is_head(ref)) {
489 head_ref = btrfs_delayed_node_to_head(ref);
490 head_ref->must_insert_reserved = must_insert_reserved;
491 INIT_LIST_HEAD(&head_ref->cluster);
492 mutex_init(&head_ref->mutex);
493 } else {
494 full_ref = btrfs_delayed_node_to_ref(ref);
495 full_ref->root = ref_root;
496 full_ref->generation = ref_generation;
497 full_ref->owner_objectid = owner_objectid;
498 full_ref->pin = pin;
499 full_ref->action = action;
500 }
501
502 existing = tree_insert(&delayed_refs->root, bytenr,
503 parent, &ref->rb_node);
504
505 if (existing) {
506 if (btrfs_delayed_ref_is_head(ref))
507 update_existing_head_ref(existing, ref);
508 else
509 update_existing_ref(trans, delayed_refs, existing, ref);
510
511 /*
512 * we've updated the existing ref, free the newly
513 * allocated ref
514 */
515 kfree(ref);
516 } else {
517 if (btrfs_delayed_ref_is_head(ref)) {
518 delayed_refs->num_heads++;
519 delayed_refs->num_heads_ready++;
520 }
521 delayed_refs->num_entries++;
522 trans->delayed_ref_updates++;
523 }
524 return 0;
525}
526
527/*
528 * add a delayed ref to the tree. This does all of the accounting required
529 * to make sure the delayed ref is eventually processed before this
530 * transaction commits.
531 */
532int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
533 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
534 u64 ref_generation, u64 owner_objectid, int action,
535 int pin)
536{
537 struct btrfs_delayed_ref *ref;
538 struct btrfs_delayed_ref_head *head_ref;
539 struct btrfs_delayed_ref_root *delayed_refs;
540 int ret;
541
542 ref = kmalloc(sizeof(*ref), GFP_NOFS);
543 if (!ref)
544 return -ENOMEM;
545
546 /*
547 * the parent = 0 case comes from cases where we don't actually
548 * know the parent yet. It will get updated later via a add/drop
549 * pair.
550 */
551 if (parent == 0)
552 parent = bytenr;
553
554 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
555 if (!head_ref) {
556 kfree(ref);
557 return -ENOMEM;
558 }
559 delayed_refs = &trans->transaction->delayed_refs;
560 spin_lock(&delayed_refs->lock);
561
562 /*
563 * insert both the head node and the new ref without dropping
564 * the spin lock
565 */
566 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
567 (u64)-1, 0, 0, 0, action, pin);
568 BUG_ON(ret);
569
570 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
571 parent, ref_root, ref_generation,
572 owner_objectid, action, pin);
573 BUG_ON(ret);
574 spin_unlock(&delayed_refs->lock);
575 return 0;
576}
577
578/*
579 * this does a simple search for the head node for a given extent.
580 * It must be called with the delayed ref spinlock held, and it returns
581 * the head node if one was found, or NULL if not.
582 */
583struct btrfs_delayed_ref_head *
584btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
585{
586 struct btrfs_delayed_ref_node *ref;
587 struct btrfs_delayed_ref_root *delayed_refs;
588
589 delayed_refs = &trans->transaction->delayed_refs;
590 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
591 if (ref)
592 return btrfs_delayed_node_to_head(ref);
593 return NULL;
594}
595
596/*
597 * add a delayed ref to the tree. This does all of the accounting required
598 * to make sure the delayed ref is eventually processed before this
599 * transaction commits.
600 *
601 * The main point of this call is to add and remove a backreference in a single
602 * shot, taking the lock only once, and only searching for the head node once.
603 *
604 * It is the same as doing a ref add and delete in two separate calls.
605 */
606int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
607 u64 bytenr, u64 num_bytes, u64 orig_parent,
608 u64 parent, u64 orig_ref_root, u64 ref_root,
609 u64 orig_ref_generation, u64 ref_generation,
610 u64 owner_objectid, int pin)
611{
612 struct btrfs_delayed_ref *ref;
613 struct btrfs_delayed_ref *old_ref;
614 struct btrfs_delayed_ref_head *head_ref;
615 struct btrfs_delayed_ref_root *delayed_refs;
616 int ret;
617
618 ref = kmalloc(sizeof(*ref), GFP_NOFS);
619 if (!ref)
620 return -ENOMEM;
621
622 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
623 if (!old_ref) {
624 kfree(ref);
625 return -ENOMEM;
626 }
627
628 /*
629 * the parent = 0 case comes from cases where we don't actually
630 * know the parent yet. It will get updated later via an add/drop
631 * pair.
632 */
633 if (parent == 0)
634 parent = bytenr;
635 if (orig_parent == 0)
636 orig_parent = bytenr;
637
638 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
639 if (!head_ref) {
640 kfree(ref);
641 kfree(old_ref);
642 return -ENOMEM;
643 }
644 delayed_refs = &trans->transaction->delayed_refs;
645 spin_lock(&delayed_refs->lock);
646
647 /*
648 * insert both the head node and the new ref without dropping
649 * the spin lock
650 */
651 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
652 (u64)-1, 0, 0, 0,
653 BTRFS_UPDATE_DELAYED_HEAD, 0);
654 BUG_ON(ret);
655
656 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
657 parent, ref_root, ref_generation,
658 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
659 BUG_ON(ret);
660
661 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
662 orig_parent, orig_ref_root,
663 orig_ref_generation, owner_objectid,
664 BTRFS_DROP_DELAYED_REF, pin);
665 BUG_ON(ret);
666 spin_unlock(&delayed_refs->lock);
667 return 0;
668}
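To make the head-node accounting used by btrfs_lookup_extent_ref() concrete, here is a tiny worked example with illustrative numbers only: the head's ref_mod is the signed sum of the queued modifications, so the effective reference count is whatever the extent tree currently says plus that sum.

#include <stdio.h>

int main(void)
{
	int on_disk_refs = 2;			/* what the extent tree currently says */
	int queued_mods[] = { +1, +1, -1 };	/* pending delayed ref modifications */
	int head_ref_mod = 0;
	int i;

	for (i = 0; i < 3; i++)
		head_ref_mod += queued_mods[i];

	/* effective count once the delayed refs are run: 2 + 1 = 3 */
	printf("effective refs = %d\n", on_disk_refs + head_ref_mod);
	return 0;
}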
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
43 * how many refs this entry is adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
75 * when a new extent is allocated, it is just reserved in memory
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
83 * we need to update the in-ram accounting to properly reflect
84 * the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
167 * a node might live in a head or a regular ref; this lets you
168 * test for the proper type to use.
169 */
170static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
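btrfs_put_delayed_ref() above is a standard refcounted-object put: warn if the count was already zero, and free only when the last reference drops, by which point the node must already be off the rbtree. A self-contained userspace sketch of that pattern, assuming C11 atomics and invented names:

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_ref {
	atomic_int refs;
	int in_tree;
};

static void toy_put(struct toy_ref *ref)
{
	assert(atomic_load(&ref->refs) > 0);		/* models the WARN_ON */
	if (atomic_fetch_sub(&ref->refs, 1) == 1) {
		assert(!ref->in_tree);			/* must be off the rbtree */
		free(ref);
	}
}

int main(void)
{
	struct toy_ref *ref = calloc(1, sizeof(*ref));

	if (!ref)
		return 1;
	atomic_init(&ref->refs, 2);
	toy_put(ref);			/* still referenced elsewhere */
	toy_put(ref);			/* last put frees the node */
	printf("done\n");
	return 0;
}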
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
145 key.objectid = dir; 145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len); 147 key.offset = btrfs_name_hash(name, name_len);
148
148 path = btrfs_alloc_path(); 149 path = btrfs_alloc_path();
150 path->leave_spinning = 1;
151
149 data_size = sizeof(*dir_item) + name_len; 152 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 153 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len); 154 name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3e18175248e0..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h" 39#include "ref-cache.h"
40#include "tree-log.h" 40#include "tree-log.h"
41#include "free-space-cache.h"
41 42
42static struct extent_io_ops btree_extent_io_ops; 43static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
@@ -668,14 +669,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
668static int btree_writepage(struct page *page, struct writeback_control *wbc) 669static int btree_writepage(struct page *page, struct writeback_control *wbc)
669{ 670{
670 struct extent_io_tree *tree; 671 struct extent_io_tree *tree;
672 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
673 struct extent_buffer *eb;
674 int was_dirty;
675
671 tree = &BTRFS_I(page->mapping->host)->io_tree; 676 tree = &BTRFS_I(page->mapping->host)->io_tree;
677 if (!(current->flags & PF_MEMALLOC)) {
678 return extent_write_full_page(tree, page,
679 btree_get_extent, wbc);
680 }
672 681
673 if (current->flags & PF_MEMALLOC) { 682 redirty_page_for_writepage(wbc, page);
674 redirty_page_for_writepage(wbc, page); 683 eb = btrfs_find_tree_block(root, page_offset(page),
675 unlock_page(page); 684 PAGE_CACHE_SIZE);
676 return 0; 685 WARN_ON(!eb);
686
687 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
688 if (!was_dirty) {
689 spin_lock(&root->fs_info->delalloc_lock);
690 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
691 spin_unlock(&root->fs_info->delalloc_lock);
677 } 692 }
678 return extent_write_full_page(tree, page, btree_get_extent, wbc); 693 free_extent_buffer(eb);
694
695 unlock_page(page);
696 return 0;
679} 697}
680 698
681static int btree_writepages(struct address_space *mapping, 699static int btree_writepages(struct address_space *mapping,
@@ -684,15 +702,15 @@ static int btree_writepages(struct address_space *mapping,
684 struct extent_io_tree *tree; 702 struct extent_io_tree *tree;
685 tree = &BTRFS_I(mapping->host)->io_tree; 703 tree = &BTRFS_I(mapping->host)->io_tree;
686 if (wbc->sync_mode == WB_SYNC_NONE) { 704 if (wbc->sync_mode == WB_SYNC_NONE) {
705 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
687 u64 num_dirty; 706 u64 num_dirty;
688 u64 start = 0;
689 unsigned long thresh = 32 * 1024 * 1024; 707 unsigned long thresh = 32 * 1024 * 1024;
690 708
691 if (wbc->for_kupdate) 709 if (wbc->for_kupdate)
692 return 0; 710 return 0;
693 711
694 num_dirty = count_range_bits(tree, &start, (u64)-1, 712 /* this is a bit racy, but that's ok */
695 thresh, EXTENT_DIRTY); 713 num_dirty = root->fs_info->dirty_metadata_bytes;
696 if (num_dirty < thresh) 714 if (num_dirty < thresh)
697 return 0; 715 return 0;
698 } 716 }
@@ -859,9 +877,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
859 root->fs_info->running_transaction->transid) { 877 root->fs_info->running_transaction->transid) {
860 btrfs_assert_tree_locked(buf); 878 btrfs_assert_tree_locked(buf);
861 879
862 /* ugh, clear_extent_buffer_dirty can be expensive */ 880 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
863 btrfs_set_lock_blocking(buf); 881 spin_lock(&root->fs_info->delalloc_lock);
882 if (root->fs_info->dirty_metadata_bytes >= buf->len)
883 root->fs_info->dirty_metadata_bytes -= buf->len;
884 else
885 WARN_ON(1);
886 spin_unlock(&root->fs_info->delalloc_lock);
887 }
864 888
889 /* ugh, clear_extent_buffer_dirty needs to lock the page */
890 btrfs_set_lock_blocking(buf);
865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 891 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
866 buf); 892 buf);
867 } 893 }
@@ -1387,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
1387 1413
1388 ret = extent_range_uptodate(io_tree, start + length, 1414 ret = extent_range_uptodate(io_tree, start + length,
1389 start + buf_len - 1); 1415 start + buf_len - 1);
1390 if (ret == 1)
1391 return ret;
1392 return ret; 1416 return ret;
1393} 1417}
1394 1418
@@ -1471,12 +1495,6 @@ static int transaction_kthread(void *arg)
1471 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1495 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1472 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1496 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1473 1497
1474 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1475 printk(KERN_INFO "btrfs: total reference cache "
1476 "size %llu\n",
1477 root->fs_info->total_ref_cache_size);
1478 }
1479
1480 mutex_lock(&root->fs_info->trans_mutex); 1498 mutex_lock(&root->fs_info->trans_mutex);
1481 cur = root->fs_info->running_transaction; 1499 cur = root->fs_info->running_transaction;
1482 if (!cur) { 1500 if (!cur) {
@@ -1493,6 +1511,7 @@ static int transaction_kthread(void *arg)
1493 mutex_unlock(&root->fs_info->trans_mutex); 1511 mutex_unlock(&root->fs_info->trans_mutex);
1494 trans = btrfs_start_transaction(root, 1); 1512 trans = btrfs_start_transaction(root, 1);
1495 ret = btrfs_commit_transaction(trans, root); 1513 ret = btrfs_commit_transaction(trans, root);
1514
1496sleep: 1515sleep:
1497 wake_up_process(root->fs_info->cleaner_kthread); 1516 wake_up_process(root->fs_info->cleaner_kthread);
1498 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1517 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1571,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1552 INIT_LIST_HEAD(&fs_info->dead_roots); 1571 INIT_LIST_HEAD(&fs_info->dead_roots);
1553 INIT_LIST_HEAD(&fs_info->hashers); 1572 INIT_LIST_HEAD(&fs_info->hashers);
1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1573 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1574 INIT_LIST_HEAD(&fs_info->ordered_operations);
1555 spin_lock_init(&fs_info->delalloc_lock); 1575 spin_lock_init(&fs_info->delalloc_lock);
1556 spin_lock_init(&fs_info->new_trans_lock); 1576 spin_lock_init(&fs_info->new_trans_lock);
1557 spin_lock_init(&fs_info->ref_cache_lock); 1577 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1631,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1611 1631
1612 extent_io_tree_init(&fs_info->pinned_extents, 1632 extent_io_tree_init(&fs_info->pinned_extents,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1633 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 extent_io_tree_init(&fs_info->pending_del,
1615 fs_info->btree_inode->i_mapping, GFP_NOFS);
1616 extent_io_tree_init(&fs_info->extent_ins,
1617 fs_info->btree_inode->i_mapping, GFP_NOFS);
1618 fs_info->do_barriers = 1; 1634 fs_info->do_barriers = 1;
1619 1635
1620 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1636 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,15 +1643,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 insert_inode_hash(fs_info->btree_inode); 1643 insert_inode_hash(fs_info->btree_inode);
1628 1644
1629 mutex_init(&fs_info->trans_mutex); 1645 mutex_init(&fs_info->trans_mutex);
1646 mutex_init(&fs_info->ordered_operations_mutex);
1630 mutex_init(&fs_info->tree_log_mutex); 1647 mutex_init(&fs_info->tree_log_mutex);
1631 mutex_init(&fs_info->drop_mutex); 1648 mutex_init(&fs_info->drop_mutex);
1632 mutex_init(&fs_info->extent_ins_mutex);
1633 mutex_init(&fs_info->pinned_mutex);
1634 mutex_init(&fs_info->chunk_mutex); 1649 mutex_init(&fs_info->chunk_mutex);
1635 mutex_init(&fs_info->transaction_kthread_mutex); 1650 mutex_init(&fs_info->transaction_kthread_mutex);
1636 mutex_init(&fs_info->cleaner_mutex); 1651 mutex_init(&fs_info->cleaner_mutex);
1637 mutex_init(&fs_info->volume_mutex); 1652 mutex_init(&fs_info->volume_mutex);
1638 mutex_init(&fs_info->tree_reloc_mutex); 1653 mutex_init(&fs_info->tree_reloc_mutex);
1654
1655 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1656 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1657
1639 init_waitqueue_head(&fs_info->transaction_throttle); 1658 init_waitqueue_head(&fs_info->transaction_throttle);
1640 init_waitqueue_head(&fs_info->transaction_wait); 1659 init_waitqueue_head(&fs_info->transaction_wait);
1641 init_waitqueue_head(&fs_info->async_submit_wait); 1660 init_waitqueue_head(&fs_info->async_submit_wait);
@@ -2358,8 +2377,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2358 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2377 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2359 u64 transid = btrfs_header_generation(buf); 2378 u64 transid = btrfs_header_generation(buf);
2360 struct inode *btree_inode = root->fs_info->btree_inode; 2379 struct inode *btree_inode = root->fs_info->btree_inode;
2361 2380 int was_dirty;
2362 btrfs_set_lock_blocking(buf);
2363 2381
2364 btrfs_assert_tree_locked(buf); 2382 btrfs_assert_tree_locked(buf);
2365 if (transid != root->fs_info->generation) { 2383 if (transid != root->fs_info->generation) {
@@ -2370,7 +2388,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2370 (unsigned long long)root->fs_info->generation); 2388 (unsigned long long)root->fs_info->generation);
2371 WARN_ON(1); 2389 WARN_ON(1);
2372 } 2390 }
2373 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 2391 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2392 buf);
2393 if (!was_dirty) {
2394 spin_lock(&root->fs_info->delalloc_lock);
2395 root->fs_info->dirty_metadata_bytes += buf->len;
2396 spin_unlock(&root->fs_info->delalloc_lock);
2397 }
2374} 2398}
2375 2399
2376void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2400void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2385,7 +2409,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2385 unsigned long thresh = 32 * 1024 * 1024; 2409 unsigned long thresh = 32 * 1024 * 1024;
2386 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 2410 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2387 2411
2388 if (current_is_pdflush() || current->flags & PF_MEMALLOC) 2412 if (current->flags & PF_MEMALLOC)
2389 return; 2413 return;
2390 2414
2391 num_dirty = count_range_bits(tree, &start, (u64)-1, 2415 num_dirty = count_range_bits(tree, &start, (u64)-1,
@@ -2410,6 +2434,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2410int btree_lock_page_hook(struct page *page) 2434int btree_lock_page_hook(struct page *page)
2411{ 2435{
2412 struct inode *inode = page->mapping->host; 2436 struct inode *inode = page->mapping->host;
2437 struct btrfs_root *root = BTRFS_I(inode)->root;
2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2438 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2414 struct extent_buffer *eb; 2439 struct extent_buffer *eb;
2415 unsigned long len; 2440 unsigned long len;
@@ -2425,6 +2450,16 @@ int btree_lock_page_hook(struct page *page)
2425 2450
2426 btrfs_tree_lock(eb); 2451 btrfs_tree_lock(eb);
2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2452 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2453
2454 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2455 spin_lock(&root->fs_info->delalloc_lock);
2456 if (root->fs_info->dirty_metadata_bytes >= eb->len)
2457 root->fs_info->dirty_metadata_bytes -= eb->len;
2458 else
2459 WARN_ON(1);
2460 spin_unlock(&root->fs_info->delalloc_lock);
2461 }
2462
2428 btrfs_tree_unlock(eb); 2463 btrfs_tree_unlock(eb);
2429 free_extent_buffer(eb); 2464 free_extent_buffer(eb);
2430out: 2465out:
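The disk-io.c changes above replace per-range counting of dirty btree pages with a single dirty_metadata_bytes counter: add the buffer length when a buffer first becomes dirty, subtract it when the buffer is cleaned, warn on underflow, and have btree_writepages() compare the counter against the 32MB threshold. A simplified userspace model of that bookkeeping, with illustrative names and a plain mutex standing in for delalloc_lock:

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t dirty_metadata_bytes;

/* called when a metadata buffer goes from clean to dirty */
static void account_dirty(uint64_t len)
{
	pthread_mutex_lock(&lock);
	dirty_metadata_bytes += len;
	pthread_mutex_unlock(&lock);
}

/* called when a dirty buffer is cleaned or written back */
static void account_clean(uint64_t len)
{
	pthread_mutex_lock(&lock);
	if (dirty_metadata_bytes >= len)
		dirty_metadata_bytes -= len;
	else
		fprintf(stderr, "underflow\n");	/* models the WARN_ON(1) */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	const uint64_t thresh = 32ULL * 1024 * 1024;	/* same 32MB cutoff */

	account_dirty(4096);
	account_dirty(4096);
	account_clean(4096);
	printf("dirty=%llu, writeback %s\n",
	       (unsigned long long)dirty_metadata_bytes,
	       dirty_metadata_bytes < thresh ? "skipped" : "forced");
	return 0;
}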
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); 76int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf); 77int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root, 78int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad2059..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
34 35
35#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -49,17 +50,23 @@ struct pending_extent_op {
49 int del; 50 int del;
50}; 51};
51 52
52static int finish_current_insert(struct btrfs_trans_handle *trans, 53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all); 54 struct btrfs_root *root, u64 parent,
54static int del_pending_extents(struct btrfs_trans_handle *trans, 55 u64 root_objectid, u64 ref_generation,
55 struct btrfs_root *extent_root, int all); 56 u64 owner, struct btrfs_key *ins,
56static int pin_down_bytes(struct btrfs_trans_handle *trans, 57 int ref_mod);
57 struct btrfs_root *root, 58static int update_reserved_extents(struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data); 59 u64 bytenr, u64 num, int reserve);
59static int update_block_group(struct btrfs_trans_handle *trans, 60static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 61 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc, 62 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 63 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation,
68 u64 owner_objectid, int pin,
69 int ref_to_drop);
63 70
64static int do_chunk_alloc(struct btrfs_trans_handle *trans, 71static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes, 72 struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -160,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
160 u64 extent_start, extent_end, size; 167 u64 extent_start, extent_end, size;
161 int ret; 168 int ret;
162 169
163 mutex_lock(&info->pinned_mutex);
164 while (start < end) { 170 while (start < end) {
165 ret = find_first_extent_bit(&info->pinned_extents, start, 171 ret = find_first_extent_bit(&info->pinned_extents, start,
166 &extent_start, &extent_end, 172 &extent_start, &extent_end,
@@ -186,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
186 ret = btrfs_add_free_space(block_group, start, size); 192 ret = btrfs_add_free_space(block_group, start, size);
187 BUG_ON(ret); 193 BUG_ON(ret);
188 } 194 }
189 mutex_unlock(&info->pinned_mutex);
190 195
191 return 0; 196 return 0;
192} 197}
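
add_new_free_space walks the pinned-extent ranges inside a block group and records the gaps between them as free space; only the locking around that walk changes in this hunk. A stand-alone sketch of the gap-walking idea follows, using a sorted array of half-open intervals in place of find_first_extent_bit; all names and values are illustrative.

#include <stdio.h>

struct range { unsigned long long start, end; };   /* half-open [start, end) */

/* Report every gap in [lo, hi) not covered by the sorted, disjoint
 * "pinned" ranges; each gap would become a free-space entry. */
static void add_gaps_as_free_space(unsigned long long lo, unsigned long long hi,
				   const struct range *pinned, int n)
{
	unsigned long long cur = lo;

	for (int i = 0; i < n && cur < hi; i++) {
		if (pinned[i].end <= cur)
			continue;               /* range entirely behind the cursor */
		if (pinned[i].start >= hi)
			break;                  /* range entirely past the group */
		if (pinned[i].start > cur)
			printf("free: %llu..%llu\n", cur, pinned[i].start);
		cur = pinned[i].end;
	}
	if (cur < hi)
		printf("free: %llu..%llu\n", cur, hi);
}

int main(void)
{
	const struct range pinned[] = { { 4096, 8192 }, { 16384, 20480 } };

	add_gaps_as_free_space(0, 32768, pinned, 2);
	return 0;
}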
@@ -285,8 +290,8 @@ next:
285 block_group->key.objectid + 290 block_group->key.objectid +
286 block_group->key.offset); 291 block_group->key.offset);
287 292
288 remove_sb_from_cache(root, block_group);
289 block_group->cached = 1; 293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
290 ret = 0; 295 ret = 0;
291err: 296err:
292 btrfs_free_path(path); 297 btrfs_free_path(path);
@@ -320,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
320 return cache; 325 return cache;
321} 326}
322 327
323static inline void put_block_group(struct btrfs_block_group_cache *cache) 328void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
324{ 329{
325 if (atomic_dec_and_test(&cache->count)) 330 if (atomic_dec_and_test(&cache->count))
326 kfree(cache); 331 kfree(cache);
@@ -393,12 +398,12 @@ again:
393 div_factor(cache->key.offset, factor)) { 398 div_factor(cache->key.offset, factor)) {
394 group_start = cache->key.objectid; 399 group_start = cache->key.objectid;
395 spin_unlock(&cache->lock); 400 spin_unlock(&cache->lock);
396 put_block_group(cache); 401 btrfs_put_block_group(cache);
397 goto found; 402 goto found;
398 } 403 }
399 } 404 }
400 spin_unlock(&cache->lock); 405 spin_unlock(&cache->lock);
401 put_block_group(cache); 406 btrfs_put_block_group(cache);
402 cond_resched(); 407 cond_resched();
403 } 408 }
404 if (!wrapped) { 409 if (!wrapped) {
@@ -554,262 +559,13 @@ out:
554 return ret; 559 return ret;
555} 560}
556 561
557/*
558 * updates all the backrefs that are pending on update_list for the
559 * extent_root
560 */
561static noinline int update_backrefs(struct btrfs_trans_handle *trans,
562 struct btrfs_root *extent_root,
563 struct btrfs_path *path,
564 struct list_head *update_list)
565{
566 struct btrfs_key key;
567 struct btrfs_extent_ref *ref;
568 struct btrfs_fs_info *info = extent_root->fs_info;
569 struct pending_extent_op *op;
570 struct extent_buffer *leaf;
571 int ret = 0;
572 struct list_head *cur = update_list->next;
573 u64 ref_objectid;
574 u64 ref_root = extent_root->root_key.objectid;
575
576 op = list_entry(cur, struct pending_extent_op, list);
577
578search:
579 key.objectid = op->bytenr;
580 key.type = BTRFS_EXTENT_REF_KEY;
581 key.offset = op->orig_parent;
582
583 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
584 BUG_ON(ret);
585
586 leaf = path->nodes[0];
587
588loop:
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590
591 ref_objectid = btrfs_ref_objectid(leaf, ref);
592
593 if (btrfs_ref_root(leaf, ref) != ref_root ||
594 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
595 (ref_objectid != op->level &&
596 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
597 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
598 "root %llu, owner %u\n",
599 (unsigned long long)op->bytenr,
600 (unsigned long long)op->orig_parent,
601 (unsigned long long)ref_root, op->level);
602 btrfs_print_leaf(extent_root, leaf);
603 BUG();
604 }
605
606 key.objectid = op->bytenr;
607 key.offset = op->parent;
608 key.type = BTRFS_EXTENT_REF_KEY;
609 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
610 BUG_ON(ret);
611 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
612 btrfs_set_ref_generation(leaf, ref, op->generation);
613
614 cur = cur->next;
615
616 list_del_init(&op->list);
617 unlock_extent(&info->extent_ins, op->bytenr,
618 op->bytenr + op->num_bytes - 1, GFP_NOFS);
619 kfree(op);
620
621 if (cur == update_list) {
622 btrfs_mark_buffer_dirty(path->nodes[0]);
623 btrfs_release_path(extent_root, path);
624 goto out;
625 }
626
627 op = list_entry(cur, struct pending_extent_op, list);
628
629 path->slots[0]++;
630 while (path->slots[0] < btrfs_header_nritems(leaf)) {
631 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
632 if (key.objectid == op->bytenr &&
633 key.type == BTRFS_EXTENT_REF_KEY)
634 goto loop;
635 path->slots[0]++;
636 }
637
638 btrfs_mark_buffer_dirty(path->nodes[0]);
639 btrfs_release_path(extent_root, path);
640 goto search;
641
642out:
643 return 0;
644}
645
646static noinline int insert_extents(struct btrfs_trans_handle *trans,
647 struct btrfs_root *extent_root,
648 struct btrfs_path *path,
649 struct list_head *insert_list, int nr)
650{
651 struct btrfs_key *keys;
652 u32 *data_size;
653 struct pending_extent_op *op;
654 struct extent_buffer *leaf;
655 struct list_head *cur = insert_list->next;
656 struct btrfs_fs_info *info = extent_root->fs_info;
657 u64 ref_root = extent_root->root_key.objectid;
658 int i = 0, last = 0, ret;
659 int total = nr * 2;
660
661 if (!nr)
662 return 0;
663
664 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
665 if (!keys)
666 return -ENOMEM;
667
668 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
669 if (!data_size) {
670 kfree(keys);
671 return -ENOMEM;
672 }
673
674 list_for_each_entry(op, insert_list, list) {
675 keys[i].objectid = op->bytenr;
676 keys[i].offset = op->num_bytes;
677 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
678 data_size[i] = sizeof(struct btrfs_extent_item);
679 i++;
680
681 keys[i].objectid = op->bytenr;
682 keys[i].offset = op->parent;
683 keys[i].type = BTRFS_EXTENT_REF_KEY;
684 data_size[i] = sizeof(struct btrfs_extent_ref);
685 i++;
686 }
687
688 op = list_entry(cur, struct pending_extent_op, list);
689 i = 0;
690 while (i < total) {
691 int c;
692 ret = btrfs_insert_some_items(trans, extent_root, path,
693 keys+i, data_size+i, total-i);
694 BUG_ON(ret < 0);
695
696 if (last && ret > 1)
697 BUG();
698
699 leaf = path->nodes[0];
700 for (c = 0; c < ret; c++) {
701 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
702
703 /*
704 * if the first item we inserted was a backref, then
705 * the EXTENT_ITEM will be the odd c's, else it will
706 * be the even c's
707 */
708 if ((ref_first && (c % 2)) ||
709 (!ref_first && !(c % 2))) {
710 struct btrfs_extent_item *itm;
711
712 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
713 struct btrfs_extent_item);
714 btrfs_set_extent_refs(path->nodes[0], itm, 1);
715 op->del++;
716 } else {
717 struct btrfs_extent_ref *ref;
718
719 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
720 struct btrfs_extent_ref);
721 btrfs_set_ref_root(leaf, ref, ref_root);
722 btrfs_set_ref_generation(leaf, ref,
723 op->generation);
724 btrfs_set_ref_objectid(leaf, ref, op->level);
725 btrfs_set_ref_num_refs(leaf, ref, 1);
726 op->del++;
727 }
728
729 /*
730 * using del to see when its ok to free up the
731 * pending_extent_op. In the case where we insert the
732 * last item on the list in order to help do batching
733 * we need to not free the extent op until we actually
734 * insert the extent_item
735 */
736 if (op->del == 2) {
737 unlock_extent(&info->extent_ins, op->bytenr,
738 op->bytenr + op->num_bytes - 1,
739 GFP_NOFS);
740 cur = cur->next;
741 list_del_init(&op->list);
742 kfree(op);
743 if (cur != insert_list)
744 op = list_entry(cur,
745 struct pending_extent_op,
746 list);
747 }
748 }
749 btrfs_mark_buffer_dirty(leaf);
750 btrfs_release_path(extent_root, path);
751
752 /*
753 * Ok backref's and items usually go right next to eachother,
754 * but if we could only insert 1 item that means that we
755 * inserted on the end of a leaf, and we have no idea what may
756 * be on the next leaf so we just play it safe. In order to
757 * try and help this case we insert the last thing on our
758 * insert list so hopefully it will end up being the last
759 * thing on the leaf and everything else will be before it,
760 * which will let us insert a whole bunch of items at the same
761 * time.
762 */
763 if (ret == 1 && !last && (i + ret < total)) {
764 /*
765 * last: where we will pick up the next time around
766 * i: our current key to insert, will be total - 1
767 * cur: the current op we are screwing with
768 * op: duh
769 */
770 last = i + ret;
771 i = total - 1;
772 cur = insert_list->prev;
773 op = list_entry(cur, struct pending_extent_op, list);
774 } else if (last) {
775 /*
776 * ok we successfully inserted the last item on the
777 * list, lets reset everything
778 *
779 * i: our current key to insert, so where we left off
780 * last time
781 * last: done with this
782 * cur: the op we are messing with
783 * op: duh
784 * total: since we inserted the last key, we need to
785 * decrement total so we dont overflow
786 */
787 i = last;
788 last = 0;
789 total--;
790 if (i < total) {
791 cur = insert_list->next;
792 op = list_entry(cur, struct pending_extent_op,
793 list);
794 }
795 } else {
796 i += ret;
797 }
798
799 cond_resched();
800 }
801 ret = 0;
802 kfree(keys);
803 kfree(data_size);
804 return ret;
805}
806
807static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
808 struct btrfs_root *root, 563 struct btrfs_root *root,
809 struct btrfs_path *path, 564 struct btrfs_path *path,
810 u64 bytenr, u64 parent, 565 u64 bytenr, u64 parent,
811 u64 ref_root, u64 ref_generation, 566 u64 ref_root, u64 ref_generation,
812 u64 owner_objectid) 567 u64 owner_objectid,
568 int refs_to_add)
813{ 569{
814 struct btrfs_key key; 570 struct btrfs_key key;
815 struct extent_buffer *leaf; 571 struct extent_buffer *leaf;
@@ -829,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
829 btrfs_set_ref_root(leaf, ref, ref_root); 585 btrfs_set_ref_root(leaf, ref, ref_root);
830 btrfs_set_ref_generation(leaf, ref, ref_generation); 586 btrfs_set_ref_generation(leaf, ref, ref_generation);
831 btrfs_set_ref_objectid(leaf, ref, owner_objectid); 587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
832 btrfs_set_ref_num_refs(leaf, ref, 1); 588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
833 } else if (ret == -EEXIST) { 589 } else if (ret == -EEXIST) {
834 u64 existing_owner; 590 u64 existing_owner;
591
835 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); 592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
836 leaf = path->nodes[0]; 593 leaf = path->nodes[0];
837 ref = btrfs_item_ptr(leaf, path->slots[0], 594 ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
845 602
846 num_refs = btrfs_ref_num_refs(leaf, ref); 603 num_refs = btrfs_ref_num_refs(leaf, ref);
847 BUG_ON(num_refs == 0); 604 BUG_ON(num_refs == 0);
848 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); 605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
849 606
850 existing_owner = btrfs_ref_objectid(leaf, ref); 607 existing_owner = btrfs_ref_objectid(leaf, ref);
851 if (existing_owner != owner_objectid && 608 if (existing_owner != owner_objectid &&
@@ -857,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
857 } else { 614 } else {
858 goto out; 615 goto out;
859 } 616 }
617 btrfs_unlock_up_safe(path, 1);
860 btrfs_mark_buffer_dirty(path->nodes[0]); 618 btrfs_mark_buffer_dirty(path->nodes[0]);
861out: 619out:
862 btrfs_release_path(root, path); 620 btrfs_release_path(root, path);
@@ -865,7 +623,8 @@ out:
865 623
866static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
867 struct btrfs_root *root, 625 struct btrfs_root *root,
868 struct btrfs_path *path) 626 struct btrfs_path *path,
627 int refs_to_drop)
869{ 628{
870 struct extent_buffer *leaf; 629 struct extent_buffer *leaf;
871 struct btrfs_extent_ref *ref; 630 struct btrfs_extent_ref *ref;
@@ -875,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
875 leaf = path->nodes[0]; 634 leaf = path->nodes[0];
876 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
877 num_refs = btrfs_ref_num_refs(leaf, ref); 636 num_refs = btrfs_ref_num_refs(leaf, ref);
878 BUG_ON(num_refs == 0); 637 BUG_ON(num_refs < refs_to_drop);
879 num_refs -= 1; 638 num_refs -= refs_to_drop;
880 if (num_refs == 0) { 639 if (num_refs == 0) {
881 ret = btrfs_del_item(trans, root, path); 640 ret = btrfs_del_item(trans, root, path);
882 } else { 641 } else {
@@ -927,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
927#endif 686#endif
928} 687}
929 688
930static noinline int free_extents(struct btrfs_trans_handle *trans,
931 struct btrfs_root *extent_root,
932 struct list_head *del_list)
933{
934 struct btrfs_fs_info *info = extent_root->fs_info;
935 struct btrfs_path *path;
936 struct btrfs_key key, found_key;
937 struct extent_buffer *leaf;
938 struct list_head *cur;
939 struct pending_extent_op *op;
940 struct btrfs_extent_item *ei;
941 int ret, num_to_del, extent_slot = 0, found_extent = 0;
942 u32 refs;
943 u64 bytes_freed = 0;
944
945 path = btrfs_alloc_path();
946 if (!path)
947 return -ENOMEM;
948 path->reada = 1;
949
950search:
951 /* search for the backref for the current ref we want to delete */
952 cur = del_list->next;
953 op = list_entry(cur, struct pending_extent_op, list);
954 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
955 op->orig_parent,
956 extent_root->root_key.objectid,
957 op->orig_generation, op->level, 1);
958 if (ret) {
959 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
960 "root %llu gen %llu owner %u\n",
961 (unsigned long long)op->bytenr,
962 (unsigned long long)extent_root->root_key.objectid,
963 (unsigned long long)op->orig_generation, op->level);
964 btrfs_print_leaf(extent_root, path->nodes[0]);
965 WARN_ON(1);
966 goto out;
967 }
968
969 extent_slot = path->slots[0];
970 num_to_del = 1;
971 found_extent = 0;
972
973 /*
974 * if we aren't the first item on the leaf we can move back one and see
975 * if our ref is right next to our extent item
976 */
977 if (likely(extent_slot)) {
978 extent_slot--;
979 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
980 extent_slot);
981 if (found_key.objectid == op->bytenr &&
982 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
983 found_key.offset == op->num_bytes) {
984 num_to_del++;
985 found_extent = 1;
986 }
987 }
988
989 /*
990 * if we didn't find the extent we need to delete the backref and then
991 * search for the extent item key so we can update its ref count
992 */
993 if (!found_extent) {
994 key.objectid = op->bytenr;
995 key.type = BTRFS_EXTENT_ITEM_KEY;
996 key.offset = op->num_bytes;
997
998 ret = remove_extent_backref(trans, extent_root, path);
999 BUG_ON(ret);
1000 btrfs_release_path(extent_root, path);
1001 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1002 BUG_ON(ret);
1003 extent_slot = path->slots[0];
1004 }
1005
1006 /* this is where we update the ref count for the extent */
1007 leaf = path->nodes[0];
1008 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
1009 refs = btrfs_extent_refs(leaf, ei);
1010 BUG_ON(refs == 0);
1011 refs--;
1012 btrfs_set_extent_refs(leaf, ei, refs);
1013
1014 btrfs_mark_buffer_dirty(leaf);
1015
1016 /*
1017 * This extent needs deleting. The reason cur_slot is extent_slot +
1018 * num_to_del is because extent_slot points to the slot where the extent
1019 * is, and if the backref was not right next to the extent we will be
1020 * deleting at least 1 item, and will want to start searching at the
1021 * slot directly next to extent_slot. However if we did find the
1022 * backref next to the extent item them we will be deleting at least 2
1023 * items and will want to start searching directly after the ref slot
1024 */
1025 if (!refs) {
1026 struct list_head *pos, *n, *end;
1027 int cur_slot = extent_slot+num_to_del;
1028 u64 super_used;
1029 u64 root_used;
1030
1031 path->slots[0] = extent_slot;
1032 bytes_freed = op->num_bytes;
1033
1034 mutex_lock(&info->pinned_mutex);
1035 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1036 op->num_bytes, op->level >=
1037 BTRFS_FIRST_FREE_OBJECTID);
1038 mutex_unlock(&info->pinned_mutex);
1039 BUG_ON(ret < 0);
1040 op->del = ret;
1041
1042 /*
1043 * we need to see if we can delete multiple things at once, so
1044 * start looping through the list of extents we are wanting to
1045 * delete and see if their extent/backref's are right next to
1046 * eachother and the extents only have 1 ref
1047 */
1048 for (pos = cur->next; pos != del_list; pos = pos->next) {
1049 struct pending_extent_op *tmp;
1050
1051 tmp = list_entry(pos, struct pending_extent_op, list);
1052
1053 /* we only want to delete extent+ref at this stage */
1054 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1055 break;
1056
1057 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1058 if (found_key.objectid != tmp->bytenr ||
1059 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1060 found_key.offset != tmp->num_bytes)
1061 break;
1062
1063 /* check to make sure this extent only has one ref */
1064 ei = btrfs_item_ptr(leaf, cur_slot,
1065 struct btrfs_extent_item);
1066 if (btrfs_extent_refs(leaf, ei) != 1)
1067 break;
1068
1069 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1070 if (found_key.objectid != tmp->bytenr ||
1071 found_key.type != BTRFS_EXTENT_REF_KEY ||
1072 found_key.offset != tmp->orig_parent)
1073 break;
1074
1075 /*
1076 * the ref is right next to the extent, we can set the
1077 * ref count to 0 since we will delete them both now
1078 */
1079 btrfs_set_extent_refs(leaf, ei, 0);
1080
1081 /* pin down the bytes for this extent */
1082 mutex_lock(&info->pinned_mutex);
1083 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1084 tmp->num_bytes, tmp->level >=
1085 BTRFS_FIRST_FREE_OBJECTID);
1086 mutex_unlock(&info->pinned_mutex);
1087 BUG_ON(ret < 0);
1088
1089 /*
1090 * use the del field to tell if we need to go ahead and
1091 * free up the extent when we delete the item or not.
1092 */
1093 tmp->del = ret;
1094 bytes_freed += tmp->num_bytes;
1095
1096 num_to_del += 2;
1097 cur_slot += 2;
1098 }
1099 end = pos;
1100
1101 /* update the free space counters */
1102 spin_lock(&info->delalloc_lock);
1103 super_used = btrfs_super_bytes_used(&info->super_copy);
1104 btrfs_set_super_bytes_used(&info->super_copy,
1105 super_used - bytes_freed);
1106
1107 root_used = btrfs_root_used(&extent_root->root_item);
1108 btrfs_set_root_used(&extent_root->root_item,
1109 root_used - bytes_freed);
1110 spin_unlock(&info->delalloc_lock);
1111
1112 /* delete the items */
1113 ret = btrfs_del_items(trans, extent_root, path,
1114 path->slots[0], num_to_del);
1115 BUG_ON(ret);
1116
1117 /*
1118 * loop through the extents we deleted and do the cleanup work
1119 * on them
1120 */
1121 for (pos = cur, n = pos->next; pos != end;
1122 pos = n, n = pos->next) {
1123 struct pending_extent_op *tmp;
1124 tmp = list_entry(pos, struct pending_extent_op, list);
1125
1126 /*
1127 * remember tmp->del tells us wether or not we pinned
1128 * down the extent
1129 */
1130 ret = update_block_group(trans, extent_root,
1131 tmp->bytenr, tmp->num_bytes, 0,
1132 tmp->del);
1133 BUG_ON(ret);
1134
1135 list_del_init(&tmp->list);
1136 unlock_extent(&info->extent_ins, tmp->bytenr,
1137 tmp->bytenr + tmp->num_bytes - 1,
1138 GFP_NOFS);
1139 kfree(tmp);
1140 }
1141 } else if (refs && found_extent) {
1142 /*
1143 * the ref and extent were right next to eachother, but the
1144 * extent still has a ref, so just free the backref and keep
1145 * going
1146 */
1147 ret = remove_extent_backref(trans, extent_root, path);
1148 BUG_ON(ret);
1149
1150 list_del_init(&op->list);
1151 unlock_extent(&info->extent_ins, op->bytenr,
1152 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1153 kfree(op);
1154 } else {
1155 /*
1156 * the extent has multiple refs and the backref we were looking
1157 * for was not right next to it, so just unlock and go next,
1158 * we're good to go
1159 */
1160 list_del_init(&op->list);
1161 unlock_extent(&info->extent_ins, op->bytenr,
1162 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1163 kfree(op);
1164 }
1165
1166 btrfs_release_path(extent_root, path);
1167 if (!list_empty(del_list))
1168 goto search;
1169
1170out:
1171 btrfs_free_path(path);
1172 return ret;
1173}
1174
1175static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root, u64 bytenr, 690 struct btrfs_root *root, u64 bytenr,
691 u64 num_bytes,
1177 u64 orig_parent, u64 parent, 692 u64 orig_parent, u64 parent,
1178 u64 orig_root, u64 ref_root, 693 u64 orig_root, u64 ref_root,
1179 u64 orig_generation, u64 ref_generation, 694 u64 orig_generation, u64 ref_generation,
1180 u64 owner_objectid) 695 u64 owner_objectid)
1181{ 696{
1182 int ret; 697 int ret;
1183 struct btrfs_root *extent_root = root->fs_info->extent_root; 698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
1184 struct btrfs_path *path;
1185 699
1186 if (root == root->fs_info->extent_root) { 700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
1187 struct pending_extent_op *extent_op; 701 orig_parent, parent, orig_root,
1188 u64 num_bytes; 702 ref_root, orig_generation,
1189 703 ref_generation, owner_objectid, pin);
1190 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1191 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1192 mutex_lock(&root->fs_info->extent_ins_mutex);
1193 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1194 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1195 u64 priv;
1196 ret = get_state_private(&root->fs_info->extent_ins,
1197 bytenr, &priv);
1198 BUG_ON(ret);
1199 extent_op = (struct pending_extent_op *)
1200 (unsigned long)priv;
1201 BUG_ON(extent_op->parent != orig_parent);
1202 BUG_ON(extent_op->generation != orig_generation);
1203
1204 extent_op->parent = parent;
1205 extent_op->generation = ref_generation;
1206 } else {
1207 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1208 BUG_ON(!extent_op);
1209
1210 extent_op->type = PENDING_BACKREF_UPDATE;
1211 extent_op->bytenr = bytenr;
1212 extent_op->num_bytes = num_bytes;
1213 extent_op->parent = parent;
1214 extent_op->orig_parent = orig_parent;
1215 extent_op->generation = ref_generation;
1216 extent_op->orig_generation = orig_generation;
1217 extent_op->level = (int)owner_objectid;
1218 INIT_LIST_HEAD(&extent_op->list);
1219 extent_op->del = 0;
1220
1221 set_extent_bits(&root->fs_info->extent_ins,
1222 bytenr, bytenr + num_bytes - 1,
1223 EXTENT_WRITEBACK, GFP_NOFS);
1224 set_state_private(&root->fs_info->extent_ins,
1225 bytenr, (unsigned long)extent_op);
1226 }
1227 mutex_unlock(&root->fs_info->extent_ins_mutex);
1228 return 0;
1229 }
1230
1231 path = btrfs_alloc_path();
1232 if (!path)
1233 return -ENOMEM;
1234 ret = lookup_extent_backref(trans, extent_root, path,
1235 bytenr, orig_parent, orig_root,
1236 orig_generation, owner_objectid, 1);
1237 if (ret)
1238 goto out;
1239 ret = remove_extent_backref(trans, extent_root, path);
1240 if (ret)
1241 goto out;
1242 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1243 parent, ref_root, ref_generation,
1244 owner_objectid);
1245 BUG_ON(ret); 704 BUG_ON(ret);
1246 finish_current_insert(trans, extent_root, 0);
1247 del_pending_extents(trans, extent_root, 0);
1248out:
1249 btrfs_free_path(path);
1250 return ret; 705 return ret;
1251} 706}
1252 707
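
The rewritten __btrfs_update_extent_ref no longer edits the extent tree synchronously; it records the change with btrfs_update_delayed_ref and lets the delayed-ref machinery apply it later. A toy user-space model of that deferral is sketched below, assuming a simple pending list keyed by byte number where repeated modifications to the same extent are merged into one entry; none of these names are the kernel API.

#include <stdio.h>

#define MAX_PENDING 16

struct pending_ref { unsigned long long bytenr; long ref_mod; };

static struct pending_ref pending[MAX_PENDING];
static int npending;

/* Record a reference-count change without touching the "tree" yet;
 * changes to the same extent merge into a single entry (ref_mod). */
static void queue_ref(unsigned long long bytenr, long delta)
{
	for (int i = 0; i < npending; i++) {
		if (pending[i].bytenr == bytenr) {
			pending[i].ref_mod += delta;
			return;
		}
	}
	pending[npending++] = (struct pending_ref){ bytenr, delta };
}

/* Later, at a convenient point (end of transaction in the real code),
 * apply the accumulated modifications in one pass. */
static void run_pending(void)
{
	for (int i = 0; i < npending; i++)
		printf("extent %llu: apply ref_mod %+ld\n",
		       pending[i].bytenr, pending[i].ref_mod);
	npending = 0;
}

int main(void)
{
	queue_ref(12288, +1);   /* e.g. a COW added a reference */
	queue_ref(12288, +1);   /* another add merges with the first */
	queue_ref(40960, -1);   /* a drop for a different extent */
	run_pending();          /* one tree pass instead of three */
	return 0;
}

Batching is the point of the rework: several updates to the same extent collapse into one extent-tree modification instead of one search-and-write per reference change.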
1253int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root, u64 bytenr, 709 struct btrfs_root *root, u64 bytenr,
1255 u64 orig_parent, u64 parent, 710 u64 num_bytes, u64 orig_parent, u64 parent,
1256 u64 ref_root, u64 ref_generation, 711 u64 ref_root, u64 ref_generation,
1257 u64 owner_objectid) 712 u64 owner_objectid)
1258{ 713{
@@ -1260,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1260 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1261 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1262 return 0; 717 return 0;
1263 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, 718
1264 parent, ref_root, ref_root, 719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
1265 ref_generation, ref_generation, 720 orig_parent, parent, ref_root,
1266 owner_objectid); 721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
1267 return ret; 723 return ret;
1268} 724}
1269
1270static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1271 struct btrfs_root *root, u64 bytenr, 726 struct btrfs_root *root, u64 bytenr,
727 u64 num_bytes,
1272 u64 orig_parent, u64 parent, 728 u64 orig_parent, u64 parent,
1273 u64 orig_root, u64 ref_root, 729 u64 orig_root, u64 ref_root,
1274 u64 orig_generation, u64 ref_generation, 730 u64 orig_generation, u64 ref_generation,
1275 u64 owner_objectid) 731 u64 owner_objectid)
1276{ 732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{
1277 struct btrfs_path *path; 748 struct btrfs_path *path;
1278 int ret; 749 int ret;
1279 struct btrfs_key key; 750 struct btrfs_key key;
@@ -1286,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1286 return -ENOMEM; 757 return -ENOMEM;
1287 758
1288 path->reada = 1; 759 path->reada = 1;
760 path->leave_spinning = 1;
1289 key.objectid = bytenr; 761 key.objectid = bytenr;
1290 key.type = BTRFS_EXTENT_ITEM_KEY; 762 key.type = BTRFS_EXTENT_ITEM_KEY;
1291 key.offset = (u64)-1; 763 key.offset = num_bytes;
1292 764
1293 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 765 /* first find the extent item and update its reference count */
1294 0, 1); 766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1295 if (ret < 0) 767 path, 0, 1);
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
1296 return ret; 770 return ret;
1297 BUG_ON(ret == 0 || path->slots[0] == 0); 771 }
1298 772
1299 path->slots[0]--; 773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
1300 l = path->nodes[0]; 778 l = path->nodes[0];
1301 779
1302 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 780 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1310 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1311 789
1312 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
1313 refs = btrfs_extent_refs(l, item); 792 refs = btrfs_extent_refs(l, item);
1314 btrfs_set_extent_refs(l, item, refs + 1); 793 btrfs_set_extent_refs(l, item, refs + refs_to_add);
794 btrfs_unlock_up_safe(path, 1);
795
1315 btrfs_mark_buffer_dirty(path->nodes[0]); 796 btrfs_mark_buffer_dirty(path->nodes[0]);
1316 797
1317 btrfs_release_path(root->fs_info->extent_root, path); 798 btrfs_release_path(root->fs_info->extent_root, path);
1318 799
1319 path->reada = 1; 800 path->reada = 1;
801 path->leave_spinning = 1;
802
803 /* now insert the actual backref */
1320 ret = insert_extent_backref(trans, root->fs_info->extent_root, 804 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1321 path, bytenr, parent, 805 path, bytenr, parent,
1322 ref_root, ref_generation, 806 ref_root, ref_generation,
1323 owner_objectid); 807 owner_objectid, refs_to_add);
1324 BUG_ON(ret); 808 BUG_ON(ret);
1325 finish_current_insert(trans, root->fs_info->extent_root, 0);
1326 del_pending_extents(trans, root->fs_info->extent_root, 0);
1327
1328 btrfs_free_path(path); 809 btrfs_free_path(path);
1329 return 0; 810 return 0;
1330} 811}
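
add_extent_ref now looks the extent item up by its exact key (objectid = bytenr, offset = num_bytes) rather than searching past it with offset (u64)-1 and stepping back one slot, and it bumps the reference count by refs_to_add in one shot. A small sketch of the two search styles over a sorted key array, purely illustrative and with no btrfs structures:

#include <stdio.h>

/* Sorted "leaf": each slot holds a key (objectid only, for brevity). */
static const unsigned long long keys[] = { 100, 200, 300, 400 };
static const int nkeys = 4;

/* Exact-match search: succeed only if the key is really present. */
static int find_exact(unsigned long long want)
{
	for (int i = 0; i < nkeys; i++)
		if (keys[i] == want)
			return i;
	return -1;
}

/* Old style: search for something past the key, then step back one slot
 * and assume the previous item is the one we wanted. */
static int find_prev_of_upper_bound(unsigned long long want)
{
	int i = 0;

	while (i < nkeys && keys[i] <= want)
		i++;
	return i - 1;           /* may be the wrong item if "want" is absent */
}

int main(void)
{
	printf("exact(300)       -> slot %d\n", find_exact(300));
	printf("exact(350)       -> slot %d (miss is detected)\n", find_exact(350));
	printf("upper-bound(350) -> slot %d (silently lands on 300)\n",
	       find_prev_of_upper_bound(350));
	return 0;
}

The exact lookup is what lets the new code detect a missing extent item (ret > 0 leads to WARN_ON and -EIO) instead of quietly operating on whatever item happens to precede the search key.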
@@ -1339,68 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1339 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 820 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1340 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1341 return 0; 822 return 0;
1342 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, 823
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
1343 0, ref_root, 0, ref_generation, 825 0, ref_root, 0, ref_generation,
1344 owner_objectid); 826 owner_objectid);
1345 return ret; 827 return ret;
1346} 828}
1347 829
1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 830static int drop_delayed_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_root *root) 831 struct btrfs_root *root,
832 struct btrfs_delayed_ref_node *node)
833{
834 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
836
837 BUG_ON(node->ref_mod == 0);
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
839 node->parent, ref->root, ref->generation,
840 ref->owner_objectid, ref->pin, node->ref_mod);
841
842 return ret;
843}
844
845/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node,
849 int insert_reserved)
1350{ 850{
1351 u64 start;
1352 u64 end;
1353 int ret; 851 int ret;
852 struct btrfs_delayed_ref *ref;
1354 853
1355 while(1) { 854 if (node->parent == (u64)-1) {
1356 finish_current_insert(trans, root->fs_info->extent_root, 1); 855 struct btrfs_delayed_ref_head *head;
1357 del_pending_extents(trans, root->fs_info->extent_root, 1); 856 /*
857 * we've hit the end of the chain and we were supposed
858 * to insert this extent into the tree. But, it got
859 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting
861 */
862 if (insert_reserved) {
863 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0);
865 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex);
868 return 0;
869 }
1358 870
1359 /* is there more work to do? */ 871 ref = btrfs_delayed_node_to_ref(node);
1360 ret = find_first_extent_bit(&root->fs_info->pending_del, 872 if (ref->action == BTRFS_ADD_DELAYED_REF) {
1361 0, &start, &end, EXTENT_WRITEBACK); 873 if (insert_reserved) {
1362 if (!ret) 874 struct btrfs_key ins;
1363 continue; 875
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins, 876 ins.objectid = node->bytenr;
1365 0, &start, &end, EXTENT_WRITEBACK); 877 ins.offset = node->num_bytes;
1366 if (!ret) 878 ins.type = BTRFS_EXTENT_ITEM_KEY;
1367 continue; 879
1368 break; 880 /* record the full extent allocation */
881 ret = __btrfs_alloc_reserved_extent(trans, root,
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
1369 } 898 }
1370 return 0; 899 return 0;
1371} 900}
1372 901
1373int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 902static noinline struct btrfs_delayed_ref_node *
1374 struct btrfs_root *root, u64 bytenr, 903select_delayed_ref(struct btrfs_delayed_ref_head *head)
1375 u64 num_bytes, u32 *refs)
1376{ 904{
1377 struct btrfs_path *path; 905 struct rb_node *node;
906 struct btrfs_delayed_ref_node *ref;
907 int action = BTRFS_ADD_DELAYED_REF;
908again:
909 /*
910 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
911 * this prevents ref count from going down to zero when
912 * there still are pending delayed ref.
913 */
914 node = rb_prev(&head->node.rb_node);
915 while (1) {
916 if (!node)
917 break;
918 ref = rb_entry(node, struct btrfs_delayed_ref_node,
919 rb_node);
920 if (ref->bytenr != head->node.bytenr)
921 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action)
923 return ref;
924 node = rb_prev(node);
925 }
926 if (action == BTRFS_ADD_DELAYED_REF) {
927 action = BTRFS_DROP_DELAYED_REF;
928 goto again;
929 }
930 return NULL;
931}
932
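
select_delayed_ref hands back pending additions before pending drops so the on-disk reference count never dips to zero while work for the same extent is still queued. A toy demonstration of why the ordering matters, assuming a single extent with a starting count of 1 and one queued add plus one queued drop; illustrative only, not the kernel code path.

#include <stdio.h>

/* Apply one +1 and one -1 to a refcount of 1 in the given order and
 * report whether the count ever touched zero along the way. */
static void apply(const char *name, const int *deltas, int n)
{
	int refs = 1, hit_zero = 0;

	for (int i = 0; i < n; i++) {
		refs += deltas[i];
		if (refs == 0)
			hit_zero = 1;
	}
	printf("%s: final=%d, transiently zero=%s\n",
	       name, refs, hit_zero ? "yes" : "no");
}

int main(void)
{
	const int adds_first[]  = { +1, -1 };   /* adds before drops */
	const int drops_first[] = { -1, +1 };   /* naive order: count hits 0 */

	apply("adds first ", adds_first, 2);
	apply("drops first", drops_first, 2);
	return 0;
}

A transient zero would make the extent look freeable while a queued add still references it, which is exactly the window the selection order closes.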
933static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
934 struct btrfs_root *root,
935 struct list_head *cluster)
936{
937 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL;
1378 int ret; 940 int ret;
1379 struct btrfs_key key; 941 int count = 0;
1380 struct extent_buffer *l; 942 int must_insert_reserved = 0;
1381 struct btrfs_extent_item *item;
1382 943
1383 WARN_ON(num_bytes < root->sectorsize); 944 delayed_refs = &trans->transaction->delayed_refs;
1384 path = btrfs_alloc_path(); 945 while (1) {
1385 path->reada = 1; 946 if (!locked_ref) {
1386 key.objectid = bytenr; 947 /* pick a new head ref from the cluster list */
1387 key.offset = num_bytes; 948 if (list_empty(cluster))
1388 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 949 break;
1389 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 950
1390 0, 0); 951 locked_ref = list_entry(cluster->next,
1391 if (ret < 0) 952 struct btrfs_delayed_ref_head, cluster);
1392 goto out; 953
1393 if (ret != 0) { 954 /* grab the lock that says we are going to process
1394 btrfs_print_leaf(root, path->nodes[0]); 955 * all the refs for this head */
1395 printk(KERN_INFO "btrfs failed to find block number %llu\n", 956 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1396 (unsigned long long)bytenr); 957
1397 BUG(); 958 /*
959 * we may have dropped the spin lock to get the head
960 * mutex lock, and that might have given someone else
961 * time to free the head. If that's true, it has been
962 * removed from our list and we can move on.
963 */
964 if (ret == -EAGAIN) {
965 locked_ref = NULL;
966 count++;
967 continue;
968 }
969 }
970
971 /*
972 * record the must insert reserved flag before we
973 * drop the spin lock.
974 */
975 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0;
977
978 /*
979 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates
981 */
982 ref = select_delayed_ref(locked_ref);
983 if (!ref) {
984 /* All delayed refs have been processed, Go ahead
985 * and send the head node to run_one_delayed_ref,
986 * so that any accounting fixes can happen
987 */
988 ref = &locked_ref->node;
989 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL;
991 }
992
993 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--;
996 spin_unlock(&delayed_refs->lock);
997
998 ret = run_one_delayed_ref(trans, root, ref,
999 must_insert_reserved);
1000 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002
1003 count++;
1004 cond_resched();
1005 spin_lock(&delayed_refs->lock);
1006 }
1007 return count;
1008}
1009
1010/*
1011 * this starts processing the delayed reference count updates and
1012 * extent insertions we have queued up so far. count can be
1013 * 0, which means to process everything in the tree at the start
1014 * of the run (but not newly added entries), or it can be some target
1015 * number you'd like to process.
1016 */
1017int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1018 struct btrfs_root *root, unsigned long count)
1019{
1020 struct rb_node *node;
1021 struct btrfs_delayed_ref_root *delayed_refs;
1022 struct btrfs_delayed_ref_node *ref;
1023 struct list_head cluster;
1024 int ret;
1025 int run_all = count == (unsigned long)-1;
1026 int run_most = 0;
1027
1028 if (root == root->fs_info->extent_root)
1029 root = root->fs_info->tree_root;
1030
1031 delayed_refs = &trans->transaction->delayed_refs;
1032 INIT_LIST_HEAD(&cluster);
1033again:
1034 spin_lock(&delayed_refs->lock);
1035 if (count == 0) {
1036 count = delayed_refs->num_entries * 2;
1037 run_most = 1;
1038 }
1039 while (1) {
1040 if (!(run_all || run_most) &&
1041 delayed_refs->num_heads_ready < 64)
1042 break;
1043
1044 /*
1045 * go find something we can process in the rbtree. We start at
1046 * the beginning of the tree, and then build a cluster
1047 * of refs to process starting at the first one we are able to
1048 * lock
1049 */
1050 ret = btrfs_find_ref_cluster(trans, &cluster,
1051 delayed_refs->run_delayed_start);
1052 if (ret)
1053 break;
1054
1055 ret = run_clustered_refs(trans, root, &cluster);
1056 BUG_ON(ret < 0);
1057
1058 count -= min_t(unsigned long, ret, count);
1059
1060 if (count == 0)
1061 break;
1062 }
1063
1064 if (run_all) {
1065 node = rb_first(&delayed_refs->root);
1066 if (!node)
1067 goto out;
1068 count = (unsigned long)-1;
1069
1070 while (node) {
1071 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1072 rb_node);
1073 if (btrfs_delayed_ref_is_head(ref)) {
1074 struct btrfs_delayed_ref_head *head;
1075
1076 head = btrfs_delayed_node_to_head(ref);
1077 atomic_inc(&ref->refs);
1078
1079 spin_unlock(&delayed_refs->lock);
1080 mutex_lock(&head->mutex);
1081 mutex_unlock(&head->mutex);
1082
1083 btrfs_put_delayed_ref(ref);
1084 cond_resched();
1085 goto again;
1086 }
1087 node = rb_next(node);
1088 }
1089 spin_unlock(&delayed_refs->lock);
1090 schedule_timeout(1);
1091 goto again;
1398 } 1092 }
1399 l = path->nodes[0];
1400 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1401 *refs = btrfs_extent_refs(l, item);
1402out: 1093out:
1403 btrfs_free_path(path); 1094 spin_unlock(&delayed_refs->lock);
1404 return 0; 1095 return 0;
1405} 1096}
1406 1097
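
btrfs_run_delayed_refs interprets its count argument three ways: 0 means "roughly everything that was queued when we started", (unsigned long)-1 means "drain completely, including refs added while we run", and anything else is a target number of entries. Here is a compact sketch of that control flow against a plain work counter standing in for the rbtree of delayed refs; the names and numbers are illustrative.

#include <stdio.h>

static unsigned long queued = 10;   /* pending work items ("delayed refs") */
static int requeued;

/* Process one item; partway through, processing queues follow-up work,
 * the way running a delayed ref can COW blocks and add new refs. */
static void run_one(void)
{
	queued--;
	if (!requeued && queued == 4) {
		queued += 2;            /* new work arrives once, mid-run */
		requeued = 1;
	}
}

static void run_delayed(unsigned long count)
{
	int run_all = (count == (unsigned long)-1);

	if (count == 0)
		count = queued;         /* snapshot of the initial backlog */

again:
	while (queued && count) {
		run_one();
		count--;
	}
	/* a full drain loops back until nothing at all is queued,
	 * even work that arrived while we were running */
	if (run_all && queued) {
		count = queued;
		goto again;
	}
	printf("left in queue: %lu\n", queued);
}

int main(void)
{
	run_delayed(3);                 /* bounded batch: leaves 7 */
	run_delayed(0);                 /* initial backlog only: leaves the 2 new arrivals */
	run_delayed((unsigned long)-1); /* full drain before commit: leaves 0 */
	return 0;
}

The distinction matters in the kernel: transaction commit passes -1 so nothing is left behind, while routine callers pass a bounded count and let the backlog amortize across operations.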
@@ -1624,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1624 int refi = 0; 1315 int refi = 0;
1625 int slot; 1316 int slot;
1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1627 u64, u64, u64, u64, u64, u64, u64, u64); 1318 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1628 1319
1629 ref_root = btrfs_header_owner(buf); 1320 ref_root = btrfs_header_owner(buf);
1630 ref_generation = btrfs_header_generation(buf); 1321 ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1696 1387
1697 if (level == 0) { 1388 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot); 1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1699 1396
1700 ret = process_func(trans, root, bytenr, 1397 ret = process_func(trans, root, bytenr,
1701 orig_buf->start, buf->start, 1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1702 orig_root, ref_root, 1399 orig_buf->start, buf->start,
1703 orig_generation, ref_generation, 1400 orig_root, ref_root,
1704 key.objectid); 1401 orig_generation, ref_generation,
1402 key.objectid);
1705 1403
1706 if (ret) { 1404 if (ret) {
1707 faili = slot; 1405 faili = slot;
@@ -1709,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1709 goto fail; 1407 goto fail;
1710 } 1408 }
1711 } else { 1409 } else {
1712 ret = process_func(trans, root, bytenr, 1410 ret = process_func(trans, root, bytenr, buf->len,
1713 orig_buf->start, buf->start, 1411 orig_buf->start, buf->start,
1714 orig_root, ref_root, 1412 orig_root, ref_root,
1715 orig_generation, ref_generation, 1413 orig_generation, ref_generation,
@@ -1786,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
1786 if (bytenr == 0) 1484 if (bytenr == 0)
1787 continue; 1485 continue;
1788 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1789 orig_buf->start, buf->start, 1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1790 orig_root, ref_root, 1488 orig_buf->start, buf->start,
1791 orig_generation, ref_generation, 1489 orig_root, ref_root, orig_generation,
1792 key.objectid); 1490 ref_generation, key.objectid);
1793 if (ret) 1491 if (ret)
1794 goto fail; 1492 goto fail;
1795 } else { 1493 } else {
1796 bytenr = btrfs_node_blockptr(buf, slot); 1494 bytenr = btrfs_node_blockptr(buf, slot);
1797 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1798 orig_buf->start, buf->start, 1496 buf->len, orig_buf->start,
1799 orig_root, ref_root, 1497 buf->start, orig_root, ref_root,
1800 orig_generation, ref_generation, 1498 orig_generation, ref_generation,
1801 level - 1); 1499 level - 1);
1802 if (ret) 1500 if (ret)
@@ -1815,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1815 struct btrfs_block_group_cache *cache) 1513 struct btrfs_block_group_cache *cache)
1816{ 1514{
1817 int ret; 1515 int ret;
1818 int pending_ret;
1819 struct btrfs_root *extent_root = root->fs_info->extent_root; 1516 struct btrfs_root *extent_root = root->fs_info->extent_root;
1820 unsigned long bi; 1517 unsigned long bi;
1821 struct extent_buffer *leaf; 1518 struct extent_buffer *leaf;
@@ -1831,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1831 btrfs_mark_buffer_dirty(leaf); 1528 btrfs_mark_buffer_dirty(leaf);
1832 btrfs_release_path(extent_root, path); 1529 btrfs_release_path(extent_root, path);
1833fail: 1530fail:
1834 finish_current_insert(trans, extent_root, 0);
1835 pending_ret = del_pending_extents(trans, extent_root, 0);
1836 if (ret) 1531 if (ret)
1837 return ret; 1532 return ret;
1838 if (pending_ret)
1839 return pending_ret;
1840 return 0; 1533 return 0;
1841 1534
1842} 1535}
@@ -1900,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1900 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1901 readonly = 1; 1594 readonly = 1;
1902 if (block_group) 1595 if (block_group)
1903 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1904 return readonly; 1597 return readonly;
1905} 1598}
1906 1599
@@ -2324,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2324 WARN_ON(ret); 2017 WARN_ON(ret);
2325 } 2018 }
2326 } 2019 }
2327 put_block_group(cache); 2020 btrfs_put_block_group(cache);
2328 total -= num_bytes; 2021 total -= num_bytes;
2329 bytenr += num_bytes; 2022 bytenr += num_bytes;
2330 } 2023 }
@@ -2341,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2341 return 0; 2034 return 0;
2342 2035
2343 bytenr = cache->key.objectid; 2036 bytenr = cache->key.objectid;
2344 put_block_group(cache); 2037 btrfs_put_block_group(cache);
2345 2038
2346 return bytenr; 2039 return bytenr;
2347} 2040}
@@ -2353,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2353 struct btrfs_block_group_cache *cache; 2046 struct btrfs_block_group_cache *cache;
2354 struct btrfs_fs_info *fs_info = root->fs_info; 2047 struct btrfs_fs_info *fs_info = root->fs_info;
2355 2048
2356 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2357 if (pin) { 2049 if (pin) {
2358 set_extent_dirty(&fs_info->pinned_extents, 2050 set_extent_dirty(&fs_info->pinned_extents,
2359 bytenr, bytenr + num - 1, GFP_NOFS); 2051 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2361,6 +2053,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2361 clear_extent_dirty(&fs_info->pinned_extents, 2053 clear_extent_dirty(&fs_info->pinned_extents,
2362 bytenr, bytenr + num - 1, GFP_NOFS); 2054 bytenr, bytenr + num - 1, GFP_NOFS);
2363 } 2055 }
2056
2364 while (num > 0) { 2057 while (num > 0) {
2365 cache = btrfs_lookup_block_group(fs_info, bytenr); 2058 cache = btrfs_lookup_block_group(fs_info, bytenr);
2366 BUG_ON(!cache); 2059 BUG_ON(!cache);
@@ -2385,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2385 if (cache->cached) 2078 if (cache->cached)
2386 btrfs_add_free_space(cache, bytenr, len); 2079 btrfs_add_free_space(cache, bytenr, len);
2387 } 2080 }
2388 put_block_group(cache); 2081 btrfs_put_block_group(cache);
2389 bytenr += len; 2082 bytenr += len;
2390 num -= len; 2083 num -= len;
2391 } 2084 }
@@ -2416,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2416 } 2109 }
2417 spin_unlock(&cache->lock); 2110 spin_unlock(&cache->lock);
2418 spin_unlock(&cache->space_info->lock); 2111 spin_unlock(&cache->space_info->lock);
2419 put_block_group(cache); 2112 btrfs_put_block_group(cache);
2420 bytenr += len; 2113 bytenr += len;
2421 num -= len; 2114 num -= len;
2422 } 2115 }
@@ -2431,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2431 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2124 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2432 int ret; 2125 int ret;
2433 2126
2434 mutex_lock(&root->fs_info->pinned_mutex);
2435 while (1) { 2127 while (1) {
2436 ret = find_first_extent_bit(pinned_extents, last, 2128 ret = find_first_extent_bit(pinned_extents, last,
2437 &start, &end, EXTENT_DIRTY); 2129 &start, &end, EXTENT_DIRTY);
@@ -2440,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2440 set_extent_dirty(copy, start, end, GFP_NOFS); 2132 set_extent_dirty(copy, start, end, GFP_NOFS);
2441 last = end + 1; 2133 last = end + 1;
2442 } 2134 }
2443 mutex_unlock(&root->fs_info->pinned_mutex);
2444 return 0; 2135 return 0;
2445} 2136}
2446 2137
@@ -2452,7 +2143,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2452 u64 end; 2143 u64 end;
2453 int ret; 2144 int ret;
2454 2145
2455 mutex_lock(&root->fs_info->pinned_mutex);
2456 while (1) { 2146 while (1) {
2457 ret = find_first_extent_bit(unpin, 0, &start, &end, 2147 ret = find_first_extent_bit(unpin, 0, &start, &end,
2458 EXTENT_DIRTY); 2148 EXTENT_DIRTY);
@@ -2461,209 +2151,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2461 2151
2462 ret = btrfs_discard_extent(root, start, end + 1 - start); 2152 ret = btrfs_discard_extent(root, start, end + 1 - start);
2463 2153
2154 /* unlocks the pinned mutex */
2464 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2155 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2465 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2156 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2466 2157
2467 if (need_resched()) { 2158 cond_resched();
2468 mutex_unlock(&root->fs_info->pinned_mutex);
2469 cond_resched();
2470 mutex_lock(&root->fs_info->pinned_mutex);
2471 }
2472 } 2159 }
2473 mutex_unlock(&root->fs_info->pinned_mutex);
2474 return ret; 2160 return ret;
2475} 2161}
2476 2162
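
btrfs_finish_extent_commit now simply walks the pinned ranges, discards and unpins each one, and calls cond_resched() on every pass, rather than dropping and retaking pinned_mutex around the reschedule. A user-space sketch of that "find next marked range, process it, yield" loop, modelled with a small array of ranges and sched_yield() standing in for cond_resched(); the names are illustrative.

#include <sched.h>
#include <stdio.h>

struct range { unsigned long long start, end; int dirty; };

static struct range pinned[] = {
	{     0,   4096, 1 },
	{  8192,  12288, 1 },
	{ 65536, 131072, 1 },
};
static const int npinned = 3;

/* Return the index of the first still-dirty range ending after "from",
 * or -1 when none is left (the find_first_extent_bit role). */
static int find_first_dirty(unsigned long long from)
{
	for (int i = 0; i < npinned; i++)
		if (pinned[i].dirty && pinned[i].end > from)
			return i;
	return -1;
}

int main(void)
{
	unsigned long long from = 0;
	int i;

	while ((i = find_first_dirty(from)) >= 0) {
		printf("discard + unpin %llu..%llu\n",
		       pinned[i].start, pinned[i].end);
		pinned[i].dirty = 0;    /* clear_extent_dirty */
		from = pinned[i].end;
		sched_yield();          /* cond_resched(): yield between ranges */
	}
	return 0;
}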
2477static int finish_current_insert(struct btrfs_trans_handle *trans,
2478 struct btrfs_root *extent_root, int all)
2479{
2480 u64 start;
2481 u64 end;
2482 u64 priv;
2483 u64 search = 0;
2484 struct btrfs_fs_info *info = extent_root->fs_info;
2485 struct btrfs_path *path;
2486 struct pending_extent_op *extent_op, *tmp;
2487 struct list_head insert_list, update_list;
2488 int ret;
2489 int num_inserts = 0, max_inserts, restart = 0;
2490
2491 path = btrfs_alloc_path();
2492 INIT_LIST_HEAD(&insert_list);
2493 INIT_LIST_HEAD(&update_list);
2494
2495 max_inserts = extent_root->leafsize /
2496 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2497 sizeof(struct btrfs_extent_ref) +
2498 sizeof(struct btrfs_extent_item));
2499again:
2500 mutex_lock(&info->extent_ins_mutex);
2501 while (1) {
2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2503 &end, EXTENT_WRITEBACK);
2504 if (ret) {
2505 if (restart && !num_inserts &&
2506 list_empty(&update_list)) {
2507 restart = 0;
2508 search = 0;
2509 continue;
2510 }
2511 break;
2512 }
2513
2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2515 if (!ret) {
2516 if (all)
2517 restart = 1;
2518 search = end + 1;
2519 if (need_resched()) {
2520 mutex_unlock(&info->extent_ins_mutex);
2521 cond_resched();
2522 mutex_lock(&info->extent_ins_mutex);
2523 }
2524 continue;
2525 }
2526
2527 ret = get_state_private(&info->extent_ins, start, &priv);
2528 BUG_ON(ret);
2529 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2530
2531 if (extent_op->type == PENDING_EXTENT_INSERT) {
2532 num_inserts++;
2533 list_add_tail(&extent_op->list, &insert_list);
2534 search = end + 1;
2535 if (num_inserts == max_inserts) {
2536 restart = 1;
2537 break;
2538 }
2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2540 list_add_tail(&extent_op->list, &update_list);
2541 search = end + 1;
2542 } else {
2543 BUG();
2544 }
2545 }
2546
2547 /*
2548 * process the update list, clear the writeback bit for it, and if
2549 * somebody marked this thing for deletion then just unlock it and be
2550 * done, the free_extents will handle it
2551 */
2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2554 extent_op->bytenr + extent_op->num_bytes - 1,
2555 EXTENT_WRITEBACK, GFP_NOFS);
2556 if (extent_op->del) {
2557 list_del_init(&extent_op->list);
2558 unlock_extent(&info->extent_ins, extent_op->bytenr,
2559 extent_op->bytenr + extent_op->num_bytes
2560 - 1, GFP_NOFS);
2561 kfree(extent_op);
2562 }
2563 }
2564 mutex_unlock(&info->extent_ins_mutex);
2565
2566 /*
2567 * still have things left on the update list, go ahead an update
2568 * everything
2569 */
2570 if (!list_empty(&update_list)) {
2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so lets start over */
2575 if (all)
2576 restart = 1;
2577 }
2578
2579 /*
2580 * if no inserts need to be done, but we skipped some extents and we
2581 * need to make sure everything is cleaned then reset everything and
2582 * go back to the beginning
2583 */
2584 if (!num_inserts && restart) {
2585 search = 0;
2586 restart = 0;
2587 INIT_LIST_HEAD(&update_list);
2588 INIT_LIST_HEAD(&insert_list);
2589 goto again;
2590 } else if (!num_inserts) {
2591 goto out;
2592 }
2593
2594 /*
2595 * process the insert extents list. Again if we are deleting this
2596 * extent, then just unlock it, pin down the bytes if need be, and be
2597 * done with it. Saves us from having to actually insert the extent
2598 * into the tree and then subsequently come along and delete it
2599 */
2600 mutex_lock(&info->extent_ins_mutex);
2601 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2602 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2603 extent_op->bytenr + extent_op->num_bytes - 1,
2604 EXTENT_WRITEBACK, GFP_NOFS);
2605 if (extent_op->del) {
2606 u64 used;
2607 list_del_init(&extent_op->list);
2608 unlock_extent(&info->extent_ins, extent_op->bytenr,
2609 extent_op->bytenr + extent_op->num_bytes
2610 - 1, GFP_NOFS);
2611
2612 mutex_lock(&extent_root->fs_info->pinned_mutex);
2613 ret = pin_down_bytes(trans, extent_root,
2614 extent_op->bytenr,
2615 extent_op->num_bytes, 0);
2616 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2617
2618 spin_lock(&info->delalloc_lock);
2619 used = btrfs_super_bytes_used(&info->super_copy);
2620 btrfs_set_super_bytes_used(&info->super_copy,
2621 used - extent_op->num_bytes);
2622 used = btrfs_root_used(&extent_root->root_item);
2623 btrfs_set_root_used(&extent_root->root_item,
2624 used - extent_op->num_bytes);
2625 spin_unlock(&info->delalloc_lock);
2626
2627 ret = update_block_group(trans, extent_root,
2628 extent_op->bytenr,
2629 extent_op->num_bytes,
2630 0, ret > 0);
2631 BUG_ON(ret);
2632 kfree(extent_op);
2633 num_inserts--;
2634 }
2635 }
2636 mutex_unlock(&info->extent_ins_mutex);
2637
2638 ret = insert_extents(trans, extent_root, path, &insert_list,
2639 num_inserts);
2640 BUG_ON(ret);
2641
2642 /*
2643 * if restart is set for whatever reason we need to go back and start
2644 * searching through the pending list again.
2645 *
2646 * We just inserted some extents, which could have resulted in new
2647 * blocks being allocated, which would result in new blocks needing
2648 * updates, so if all is set we _must_ restart to get the updated
2649 * blocks.
2650 */
2651 if (restart || all) {
2652 INIT_LIST_HEAD(&insert_list);
2653 INIT_LIST_HEAD(&update_list);
2654 search = 0;
2655 restart = 0;
2656 num_inserts = 0;
2657 goto again;
2658 }
2659out:
2660 btrfs_free_path(path);
2661 return 0;
2662}
2663
2664static int pin_down_bytes(struct btrfs_trans_handle *trans, 2163static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 struct btrfs_root *root, 2164 struct btrfs_root *root,
2666 u64 bytenr, u64 num_bytes, int is_data) 2165 struct btrfs_path *path,
2166 u64 bytenr, u64 num_bytes, int is_data,
2167 struct extent_buffer **must_clean)
2667{ 2168{
2668 int err = 0; 2169 int err = 0;
2669 struct extent_buffer *buf; 2170 struct extent_buffer *buf;
@@ -2686,17 +2187,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2686 u64 header_transid = btrfs_header_generation(buf); 2187 u64 header_transid = btrfs_header_generation(buf);
2687 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2188 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2688 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2189 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2190 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2689 header_transid == trans->transid && 2191 header_transid == trans->transid &&
2690 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2192 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2691 clean_tree_block(NULL, root, buf); 2193 *must_clean = buf;
2692 btrfs_tree_unlock(buf);
2693 free_extent_buffer(buf);
2694 return 1; 2194 return 1;
2695 } 2195 }
2696 btrfs_tree_unlock(buf); 2196 btrfs_tree_unlock(buf);
2697 } 2197 }
2698 free_extent_buffer(buf); 2198 free_extent_buffer(buf);
2699pinit: 2199pinit:
2200 btrfs_set_path_blocking(path);
2201 /* unlocks the pinned mutex */
2700 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2202 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2701 2203
2702 BUG_ON(err < 0); 2204 BUG_ON(err < 0);
@@ -2710,7 +2212,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2710 struct btrfs_root *root, 2212 struct btrfs_root *root,
2711 u64 bytenr, u64 num_bytes, u64 parent, 2213 u64 bytenr, u64 num_bytes, u64 parent,
2712 u64 root_objectid, u64 ref_generation, 2214 u64 root_objectid, u64 ref_generation,
2713 u64 owner_objectid, int pin, int mark_free) 2215 u64 owner_objectid, int pin, int mark_free,
2216 int refs_to_drop)
2714{ 2217{
2715 struct btrfs_path *path; 2218 struct btrfs_path *path;
2716 struct btrfs_key key; 2219 struct btrfs_key key;
@@ -2732,6 +2235,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2732 return -ENOMEM; 2235 return -ENOMEM;
2733 2236
2734 path->reada = 1; 2237 path->reada = 1;
2238 path->leave_spinning = 1;
2735 ret = lookup_extent_backref(trans, extent_root, path, 2239 ret = lookup_extent_backref(trans, extent_root, path,
2736 bytenr, parent, root_objectid, 2240 bytenr, parent, root_objectid,
2737 ref_generation, owner_objectid, 1); 2241 ref_generation, owner_objectid, 1);
@@ -2753,9 +2257,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2753 break; 2257 break;
2754 } 2258 }
2755 if (!found_extent) { 2259 if (!found_extent) {
2756 ret = remove_extent_backref(trans, extent_root, path); 2260 ret = remove_extent_backref(trans, extent_root, path,
2261 refs_to_drop);
2757 BUG_ON(ret); 2262 BUG_ON(ret);
2758 btrfs_release_path(extent_root, path); 2263 btrfs_release_path(extent_root, path);
2264 path->leave_spinning = 1;
2759 ret = btrfs_search_slot(trans, extent_root, 2265 ret = btrfs_search_slot(trans, extent_root,
2760 &key, path, -1, 1); 2266 &key, path, -1, 1);
2761 if (ret) { 2267 if (ret) {
@@ -2771,8 +2277,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2771 btrfs_print_leaf(extent_root, path->nodes[0]); 2277 btrfs_print_leaf(extent_root, path->nodes[0]);
2772 WARN_ON(1); 2278 WARN_ON(1);
2773 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2279 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2774 "root %llu gen %llu owner %llu\n", 2280 "parent %llu root %llu gen %llu owner %llu\n",
2775 (unsigned long long)bytenr, 2281 (unsigned long long)bytenr,
2282 (unsigned long long)parent,
2776 (unsigned long long)root_objectid, 2283 (unsigned long long)root_objectid,
2777 (unsigned long long)ref_generation, 2284 (unsigned long long)ref_generation,
2778 (unsigned long long)owner_objectid); 2285 (unsigned long long)owner_objectid);
@@ -2782,17 +2289,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2782 ei = btrfs_item_ptr(leaf, extent_slot, 2289 ei = btrfs_item_ptr(leaf, extent_slot,
2783 struct btrfs_extent_item); 2290 struct btrfs_extent_item);
2784 refs = btrfs_extent_refs(leaf, ei); 2291 refs = btrfs_extent_refs(leaf, ei);
2785 BUG_ON(refs == 0);
2786 refs -= 1;
2787 btrfs_set_extent_refs(leaf, ei, refs);
2788 2292
2293 /*
2294 * we're not allowed to delete the extent item if there
2295 * are other delayed ref updates pending
2296 */
2297
2298 BUG_ON(refs < refs_to_drop);
2299 refs -= refs_to_drop;
2300 btrfs_set_extent_refs(leaf, ei, refs);
2789 btrfs_mark_buffer_dirty(leaf); 2301 btrfs_mark_buffer_dirty(leaf);
2790 2302
2791 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { 2303 if (refs == 0 && found_extent &&
2304 path->slots[0] == extent_slot + 1) {
2792 struct btrfs_extent_ref *ref; 2305 struct btrfs_extent_ref *ref;
2793 ref = btrfs_item_ptr(leaf, path->slots[0], 2306 ref = btrfs_item_ptr(leaf, path->slots[0],
2794 struct btrfs_extent_ref); 2307 struct btrfs_extent_ref);
2795 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); 2308 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2796 /* if the back ref and the extent are next to each other 2309 /* if the back ref and the extent are next to each other
2797 * they get deleted below in one shot 2310 * they get deleted below in one shot
2798 */ 2311 */
@@ -2800,11 +2313,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2800 num_to_del = 2; 2313 num_to_del = 2;
2801 } else if (found_extent) { 2314 } else if (found_extent) {
2802 /* otherwise delete the extent back ref */ 2315 /* otherwise delete the extent back ref */
2803 ret = remove_extent_backref(trans, extent_root, path); 2316 ret = remove_extent_backref(trans, extent_root, path,
2317 refs_to_drop);
2804 BUG_ON(ret); 2318 BUG_ON(ret);
2805 /* if refs are 0, we need to setup the path for deletion */ 2319 /* if refs are 0, we need to setup the path for deletion */
2806 if (refs == 0) { 2320 if (refs == 0) {
2807 btrfs_release_path(extent_root, path); 2321 btrfs_release_path(extent_root, path);
2322 path->leave_spinning = 1;
2808 ret = btrfs_search_slot(trans, extent_root, &key, path, 2323 ret = btrfs_search_slot(trans, extent_root, &key, path,
2809 -1, 1); 2324 -1, 1);
2810 BUG_ON(ret); 2325 BUG_ON(ret);
@@ -2814,16 +2329,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2814 if (refs == 0) { 2329 if (refs == 0) {
2815 u64 super_used; 2330 u64 super_used;
2816 u64 root_used; 2331 u64 root_used;
2332 struct extent_buffer *must_clean = NULL;
2817 2333
2818 if (pin) { 2334 if (pin) {
2819 mutex_lock(&root->fs_info->pinned_mutex); 2335 ret = pin_down_bytes(trans, root, path,
2820 ret = pin_down_bytes(trans, root, bytenr, num_bytes, 2336 bytenr, num_bytes,
2821 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); 2337 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2822 mutex_unlock(&root->fs_info->pinned_mutex); 2338 &must_clean);
2823 if (ret > 0) 2339 if (ret > 0)
2824 mark_free = 1; 2340 mark_free = 1;
2825 BUG_ON(ret < 0); 2341 BUG_ON(ret < 0);
2826 } 2342 }
2343
2827 /* block accounting for super block */ 2344 /* block accounting for super block */
2828 spin_lock(&info->delalloc_lock); 2345 spin_lock(&info->delalloc_lock);
2829 super_used = btrfs_super_bytes_used(&info->super_copy); 2346 super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2352,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2835 btrfs_set_root_used(&root->root_item, 2352 btrfs_set_root_used(&root->root_item,
2836 root_used - num_bytes); 2353 root_used - num_bytes);
2837 spin_unlock(&info->delalloc_lock); 2354 spin_unlock(&info->delalloc_lock);
2355
2356 /*
2357 * it is going to be very rare for someone to be waiting
2358 * on the block we're freeing. del_items might need to
2359 * schedule, so rather than get fancy, just force it
2360 * to blocking here
2361 */
2362 if (must_clean)
2363 btrfs_set_lock_blocking(must_clean);
2364
2838 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2365 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2839 num_to_del); 2366 num_to_del);
2840 BUG_ON(ret); 2367 BUG_ON(ret);
2841 btrfs_release_path(extent_root, path); 2368 btrfs_release_path(extent_root, path);
2842 2369
2370 if (must_clean) {
2371 clean_tree_block(NULL, root, must_clean);
2372 btrfs_tree_unlock(must_clean);
2373 free_extent_buffer(must_clean);
2374 }
2375
2843 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2376 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2844 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2377 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2845 BUG_ON(ret); 2378 BUG_ON(ret);
2379 } else {
2380 invalidate_mapping_pages(info->btree_inode->i_mapping,
2381 bytenr >> PAGE_CACHE_SHIFT,
2382 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2846 } 2383 }
2847 2384
2848 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2385 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2387,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2850 BUG_ON(ret); 2387 BUG_ON(ret);
2851 } 2388 }
2852 btrfs_free_path(path); 2389 btrfs_free_path(path);
2853 finish_current_insert(trans, extent_root, 0);
2854 return ret; 2390 return ret;
2855} 2391}
2856 2392
2857/* 2393/*
2858 * find all the blocks marked as pending in the radix tree and remove 2394 * remove an extent from the root, returns 0 on success
2859 * them from the extent map
2860 */ 2395 */
2861static int del_pending_extents(struct btrfs_trans_handle *trans, 2396static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2862 struct btrfs_root *extent_root, int all) 2397 struct btrfs_root *root,
2398 u64 bytenr, u64 num_bytes, u64 parent,
2399 u64 root_objectid, u64 ref_generation,
2400 u64 owner_objectid, int pin,
2401 int refs_to_drop)
2863{ 2402{
2864 int ret; 2403 WARN_ON(num_bytes < root->sectorsize);
2865 int err = 0;
2866 u64 start;
2867 u64 end;
2868 u64 priv;
2869 u64 search = 0;
2870 int nr = 0, skipped = 0;
2871 struct extent_io_tree *pending_del;
2872 struct extent_io_tree *extent_ins;
2873 struct pending_extent_op *extent_op;
2874 struct btrfs_fs_info *info = extent_root->fs_info;
2875 struct list_head delete_list;
2876
2877 INIT_LIST_HEAD(&delete_list);
2878 extent_ins = &extent_root->fs_info->extent_ins;
2879 pending_del = &extent_root->fs_info->pending_del;
2880
2881again:
2882 mutex_lock(&info->extent_ins_mutex);
2883 while (1) {
2884 ret = find_first_extent_bit(pending_del, search, &start, &end,
2885 EXTENT_WRITEBACK);
2886 if (ret) {
2887 if (all && skipped && !nr) {
2888 search = 0;
2889 skipped = 0;
2890 continue;
2891 }
2892 mutex_unlock(&info->extent_ins_mutex);
2893 break;
2894 }
2895
2896 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2897 if (!ret) {
2898 search = end+1;
2899 skipped = 1;
2900
2901 if (need_resched()) {
2902 mutex_unlock(&info->extent_ins_mutex);
2903 cond_resched();
2904 mutex_lock(&info->extent_ins_mutex);
2905 }
2906
2907 continue;
2908 }
2909 BUG_ON(ret < 0);
2910
2911 ret = get_state_private(pending_del, start, &priv);
2912 BUG_ON(ret);
2913 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2914
2915 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2916 GFP_NOFS);
2917 if (!test_range_bit(extent_ins, start, end,
2918 EXTENT_WRITEBACK, 0)) {
2919 list_add_tail(&extent_op->list, &delete_list);
2920 nr++;
2921 } else {
2922 kfree(extent_op);
2923
2924 ret = get_state_private(&info->extent_ins, start,
2925 &priv);
2926 BUG_ON(ret);
2927 extent_op = (struct pending_extent_op *)
2928 (unsigned long)priv;
2929
2930 clear_extent_bits(&info->extent_ins, start, end,
2931 EXTENT_WRITEBACK, GFP_NOFS);
2932
2933 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2934 list_add_tail(&extent_op->list, &delete_list);
2935 search = end + 1;
2936 nr++;
2937 continue;
2938 }
2939
2940 mutex_lock(&extent_root->fs_info->pinned_mutex);
2941 ret = pin_down_bytes(trans, extent_root, start,
2942 end + 1 - start, 0);
2943 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2944
2945 ret = update_block_group(trans, extent_root, start,
2946 end + 1 - start, 0, ret > 0);
2947
2948 unlock_extent(extent_ins, start, end, GFP_NOFS);
2949 BUG_ON(ret);
2950 kfree(extent_op);
2951 }
2952 if (ret)
2953 err = ret;
2954
2955 search = end + 1;
2956
2957 if (need_resched()) {
2958 mutex_unlock(&info->extent_ins_mutex);
2959 cond_resched();
2960 mutex_lock(&info->extent_ins_mutex);
2961 }
2962 }
2963 2404
2964 if (nr) { 2405 /*
2965 ret = free_extents(trans, extent_root, &delete_list); 2406 * if metadata always pin
2966 BUG_ON(ret); 2407 * if data pin when any transaction has committed this
2967 } 2408 */
2409 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2410 ref_generation != trans->transid)
2411 pin = 1;
2968 2412
2969 if (all && skipped) { 2413 if (ref_generation != trans->transid)
2970 INIT_LIST_HEAD(&delete_list); 2414 pin = 1;
2971 search = 0;
2972 nr = 0;
2973 goto again;
2974 }
2975 2415
2976 if (!err) 2416 return __free_extent(trans, root, bytenr, num_bytes, parent,
2977 finish_current_insert(trans, extent_root, 0); 2417 root_objectid, ref_generation,
2978 return err; 2418 owner_objectid, pin, pin == 0, refs_to_drop);
2979} 2419}
2980 2420
2981/* 2421/*
2982 * remove an extent from the root, returns 0 on success 2422 * when we free an extent, it is possible (and likely) that we free the last
2423 * delayed ref for that extent as well. This searches the delayed ref tree for
2424 * a given extent, and if there are no other delayed refs to be processed, it
2425 * removes it from the tree.
2983 */ 2426 */
2984static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2427static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2985 struct btrfs_root *root, 2428 struct btrfs_root *root, u64 bytenr)
2986 u64 bytenr, u64 num_bytes, u64 parent,
2987 u64 root_objectid, u64 ref_generation,
2988 u64 owner_objectid, int pin)
2989{ 2429{
2990 struct btrfs_root *extent_root = root->fs_info->extent_root; 2430 struct btrfs_delayed_ref_head *head;
2991 int pending_ret; 2431 struct btrfs_delayed_ref_root *delayed_refs;
2432 struct btrfs_delayed_ref_node *ref;
2433 struct rb_node *node;
2992 int ret; 2434 int ret;
2993 2435
2994 WARN_ON(num_bytes < root->sectorsize); 2436 delayed_refs = &trans->transaction->delayed_refs;
2995 if (root == extent_root) { 2437 spin_lock(&delayed_refs->lock);
2996 struct pending_extent_op *extent_op = NULL; 2438 head = btrfs_find_delayed_ref_head(trans, bytenr);
2997 2439 if (!head)
2998 mutex_lock(&root->fs_info->extent_ins_mutex); 2440 goto out;
2999 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
3000 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
3001 u64 priv;
3002 ret = get_state_private(&root->fs_info->extent_ins,
3003 bytenr, &priv);
3004 BUG_ON(ret);
3005 extent_op = (struct pending_extent_op *)
3006 (unsigned long)priv;
3007 2441
3008 extent_op->del = 1; 2442 node = rb_prev(&head->node.rb_node);
3009 if (extent_op->type == PENDING_EXTENT_INSERT) { 2443 if (!node)
3010 mutex_unlock(&root->fs_info->extent_ins_mutex); 2444 goto out;
3011 return 0;
3012 }
3013 }
3014 2445
3015 if (extent_op) { 2446 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3016 ref_generation = extent_op->orig_generation;
3017 parent = extent_op->orig_parent;
3018 }
3019 2447
3020 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2448 /* there are still entries for this ref, we can't drop it */
3021 BUG_ON(!extent_op); 2449 if (ref->bytenr == bytenr)
3022 2450 goto out;
3023 extent_op->type = PENDING_EXTENT_DELETE;
3024 extent_op->bytenr = bytenr;
3025 extent_op->num_bytes = num_bytes;
3026 extent_op->parent = parent;
3027 extent_op->orig_parent = parent;
3028 extent_op->generation = ref_generation;
3029 extent_op->orig_generation = ref_generation;
3030 extent_op->level = (int)owner_objectid;
3031 INIT_LIST_HEAD(&extent_op->list);
3032 extent_op->del = 0;
3033
3034 set_extent_bits(&root->fs_info->pending_del,
3035 bytenr, bytenr + num_bytes - 1,
3036 EXTENT_WRITEBACK, GFP_NOFS);
3037 set_state_private(&root->fs_info->pending_del,
3038 bytenr, (unsigned long)extent_op);
3039 mutex_unlock(&root->fs_info->extent_ins_mutex);
3040 return 0;
3041 }
3042 /* if metadata always pin */
3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3045 mutex_lock(&root->fs_info->pinned_mutex);
3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3047 mutex_unlock(&root->fs_info->pinned_mutex);
3048 update_reserved_extents(root, bytenr, num_bytes, 0);
3049 return 0;
3050 }
3051 pin = 1;
3052 }
3053 2451
3054 /* if data pin when any transaction has committed this */ 2452 /*
3055 if (ref_generation != trans->transid) 2453 * waiting for the lock here would deadlock. If someone else has it
3056 pin = 1; 2454 * locked they are already in the process of dropping it anyway
2455 */
2456 if (!mutex_trylock(&head->mutex))
2457 goto out;
3057 2458
3058 ret = __free_extent(trans, root, bytenr, num_bytes, parent, 2459 /*
3059 root_objectid, ref_generation, 2460 * at this point we have a head with no other entries. Go
3060 owner_objectid, pin, pin == 0); 2461 * ahead and process it.
2462 */
2463 head->node.in_tree = 0;
2464 rb_erase(&head->node.rb_node, &delayed_refs->root);
2465
2466 delayed_refs->num_entries--;
2467
2468 /*
2469 * we don't take a ref on the node because we're removing it from the
2470 * tree, so we just steal the ref the tree was holding.
2471 */
2472 delayed_refs->num_heads--;
2473 if (list_empty(&head->cluster))
2474 delayed_refs->num_heads_ready--;
3061 2475
3062 finish_current_insert(trans, root->fs_info->extent_root, 0); 2476 list_del_init(&head->cluster);
3063 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); 2477 spin_unlock(&delayed_refs->lock);
3064 return ret ? ret : pending_ret; 2478
2479 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2480 &head->node, head->must_insert_reserved);
2481 BUG_ON(ret);
2482 btrfs_put_delayed_ref(&head->node);
2483 return 0;
2484out:
2485 spin_unlock(&delayed_refs->lock);
2486 return 0;
3065} 2487}
3066 2488
3067int btrfs_free_extent(struct btrfs_trans_handle *trans, 2489int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2494,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3072{ 2494{
3073 int ret; 2495 int ret;
3074 2496
3075 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, 2497 /*
3076 root_objectid, ref_generation, 2498 * tree log blocks never actually go into the extent allocation
3077 owner_objectid, pin); 2499 * tree, just update pinning info and exit early.
2500 *
2501 * data extents referenced by the tree log do need to have
2502 * their reference counts bumped.
2503 */
2504 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2505 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2506 /* unlocks the pinned mutex */
2507 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2508 update_reserved_extents(root, bytenr, num_bytes, 0);
2509 ret = 0;
2510 } else {
2511 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2512 root_objectid, ref_generation,
2513 owner_objectid,
2514 BTRFS_DROP_DELAYED_REF, 1);
2515 BUG_ON(ret);
2516 ret = check_ref_cleanup(trans, root, bytenr);
2517 BUG_ON(ret);
2518 }
3078 return ret; 2519 return ret;
3079} 2520}
3080 2521
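With the hunk above, btrfs_free_extent() stops freeing extents inline: tree-log metadata is pinned immediately, and everything else is queued as a BTRFS_DROP_DELAYED_REF followed by an attempt to clean up a now-lonely ref head. A minimal user-space sketch of that decision flow; the constants and the queue_drop_ref()/try_cleanup_head() helpers are placeholders for illustration, not kernel APIs:

#include <stdio.h>

#define TREE_LOG_OBJECTID    18ULL   /* placeholder; the real BTRFS_TREE_LOG_OBJECTID differs */
#define FIRST_FREE_OBJECTID 256ULL   /* BTRFS_FIRST_FREE_OBJECTID */

/* hypothetical stand-ins for btrfs_update_pinned_extents() and the delayed-ref calls */
static void pin_extent(unsigned long long bytenr, unsigned long long len)
{ printf("pin %llu+%llu\n", bytenr, len); }
static void queue_drop_ref(unsigned long long bytenr, unsigned long long len)
{ printf("queue DROP delayed ref for %llu+%llu\n", bytenr, len); }
static void try_cleanup_head(unsigned long long bytenr)
{ printf("run lone delayed-ref head for %llu if possible\n", bytenr); }

/* models the new btrfs_free_extent(): pin tree-log metadata right away,
 * otherwise record a drop ref and let the delayed-ref code do the work */
static int free_extent(unsigned long long root_objectid,
                       unsigned long long owner_objectid,
                       unsigned long long bytenr, unsigned long long len)
{
        if (root_objectid == TREE_LOG_OBJECTID &&
            owner_objectid < FIRST_FREE_OBJECTID) {
                pin_extent(bytenr, len);   /* log blocks never hit the extent tree */
                return 0;
        }
        queue_drop_ref(bytenr, len);
        try_cleanup_head(bytenr);          /* drop the head if it is now the only entry */
        return 0;
}

int main(void)
{
        free_extent(TREE_LOG_OBJECTID, 1, 4096, 4096);   /* pinned path */
        free_extent(5, 257, 8192, 4096);                 /* delayed-ref path */
        return 0;
}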
@@ -3103,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3103{ 2544{
3104 int ret = 0; 2545 int ret = 0;
3105 struct btrfs_root *root = orig_root->fs_info->extent_root; 2546 struct btrfs_root *root = orig_root->fs_info->extent_root;
3106 u64 total_needed = num_bytes; 2547 struct btrfs_free_cluster *last_ptr = NULL;
3107 u64 *last_ptr = NULL;
3108 u64 last_wanted = 0;
3109 struct btrfs_block_group_cache *block_group = NULL; 2548 struct btrfs_block_group_cache *block_group = NULL;
3110 int chunk_alloc_done = 0;
3111 int empty_cluster = 2 * 1024 * 1024; 2549 int empty_cluster = 2 * 1024 * 1024;
3112 int allowed_chunk_alloc = 0; 2550 int allowed_chunk_alloc = 0;
3113 struct list_head *head = NULL, *cur = NULL;
3114 int loop = 0;
3115 int extra_loop = 0;
3116 struct btrfs_space_info *space_info; 2551 struct btrfs_space_info *space_info;
2552 int last_ptr_loop = 0;
2553 int loop = 0;
3117 2554
3118 WARN_ON(num_bytes < root->sectorsize); 2555 WARN_ON(num_bytes < root->sectorsize);
3119 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2556 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3120 ins->objectid = 0; 2557 ins->objectid = 0;
3121 ins->offset = 0; 2558 ins->offset = 0;
3122 2559
2560 space_info = __find_space_info(root->fs_info, data);
2561
3123 if (orig_root->ref_cows || empty_size) 2562 if (orig_root->ref_cows || empty_size)
3124 allowed_chunk_alloc = 1; 2563 allowed_chunk_alloc = 1;
3125 2564
3126 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2565 if (data & BTRFS_BLOCK_GROUP_METADATA) {
3127 last_ptr = &root->fs_info->last_alloc; 2566 last_ptr = &root->fs_info->meta_alloc_cluster;
3128 if (!btrfs_test_opt(root, SSD)) 2567 if (!btrfs_test_opt(root, SSD))
3129 empty_cluster = 64 * 1024; 2568 empty_cluster = 64 * 1024;
3130 } 2569 }
3131 2570
3132 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2571 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
3133 last_ptr = &root->fs_info->last_data_alloc; 2572 last_ptr = &root->fs_info->data_alloc_cluster;
2573 }
3134 2574
3135 if (last_ptr) { 2575 if (last_ptr) {
3136 if (*last_ptr) { 2576 spin_lock(&last_ptr->lock);
3137 hint_byte = *last_ptr; 2577 if (last_ptr->block_group)
3138 last_wanted = *last_ptr; 2578 hint_byte = last_ptr->window_start;
3139 } else 2579 spin_unlock(&last_ptr->lock);
3140 empty_size += empty_cluster;
3141 } else {
3142 empty_cluster = 0;
3143 } 2580 }
2581
3144 search_start = max(search_start, first_logical_byte(root, 0)); 2582 search_start = max(search_start, first_logical_byte(root, 0));
3145 search_start = max(search_start, hint_byte); 2583 search_start = max(search_start, hint_byte);
3146 2584
3147 if (last_wanted && search_start != last_wanted) { 2585 if (!last_ptr) {
3148 last_wanted = 0; 2586 empty_cluster = 0;
3149 empty_size += empty_cluster; 2587 loop = 1;
3150 } 2588 }
3151 2589
3152 total_needed += empty_size; 2590 if (search_start == hint_byte) {
3153 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2591 block_group = btrfs_lookup_block_group(root->fs_info,
3154 if (!block_group) 2592 search_start);
3155 block_group = btrfs_lookup_first_block_group(root->fs_info, 2593 if (block_group && block_group_bits(block_group, data)) {
3156 search_start); 2594 down_read(&space_info->groups_sem);
3157 space_info = __find_space_info(root->fs_info, data); 2595 goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
3158 2600
2601search:
3159 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
3160 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
3161 struct btrfs_free_space *free_space; 2604 u64 offset;
3162 /*
3163 * the only way this happens if our hint points to a block
3164 * group thats not of the proper type, while looping this
3165 * should never happen
3166 */
3167 if (empty_size)
3168 extra_loop = 1;
3169 2605
3170 if (!block_group) 2606 atomic_inc(&block_group->count);
3171 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
3172 2608
2609have_block_group:
3173 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
3174 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
3175 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
3176 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
3177 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
3178 break; 2616 break;
2617 }
3179 } 2618 }
3180 2619
3181 mutex_lock(&block_group->alloc_mutex);
3182 if (unlikely(!block_group_bits(block_group, data)))
3183 goto new_group;
3184
3185 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
3186 goto new_group; 2621 goto loop;
3187 2622
3188 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
3189 total_needed); 2624 /*
3190 if (free_space) { 2625 * the refill lock keeps out other
3191 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
3192 u64 end = block_group->key.objectid + 2627 */
3193 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
3194 2636
3195 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group is does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
3196 2657
3197 /* move on to the next group */ 2658 /*
3198 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
3199 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3200 2663
3201 /* move on to the next group */ 2664 last_ptr_loop = 0;
3202 if (search_start + num_bytes > end)
3203 goto new_group;
3204 2665
3205 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
3206 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
3207 empty_size += empty_cluster; 2668 block_group, last_ptr,
3208 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
3209 /* 2672 /*
3210 * if search_start is still in this block group 2673 * now pull our allocation out of this
3211 * then we just re-search this block group 2674 * cluster
3212 */ 2675 */
3213 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
3214 search_start < end) { 2677 last_ptr, num_bytes,
3215 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
3216 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
3217 } 2683 }
3218
3219 /* else we go to the next block group */
3220 goto new_group;
3221 } 2684 }
3222 2685 /*
3223 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
3224 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
3225 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
3226 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
3227 /* 2690 */
3228 * if search_start is still in this block group 2691 if (loop < 2) {
3229 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
3230 */ 2693 last_ptr);
3231 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
3232 search_start < end) { 2695 goto loop;
3233 mutex_unlock(&block_group->alloc_mutex);
3234 last_wanted = 0;
3235 continue;
3236 }
3237
3238 /* else we go to the next block group */
3239 goto new_group;
3240 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
3241 2699
3242 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
3243 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
3244 2706
3245 btrfs_remove_free_space_lock(block_group, search_start, 2707 /* move on to the next group */
3246 num_bytes); 2708 if (search_start + num_bytes >= search_end) {
3247 /* we are all good, lets return */ 2709 btrfs_add_free_space(block_group, offset, num_bytes);
3248 mutex_unlock(&block_group->alloc_mutex); 2710 goto loop;
3249 break;
3250 } 2711 }
3251new_group:
3252 mutex_unlock(&block_group->alloc_mutex);
3253 put_block_group(block_group);
3254 block_group = NULL;
3255new_group_no_lock:
3256 /* don't try to compare new allocations against the
3257 * last allocation any more
3258 */
3259 last_wanted = 0;
3260 2712
3261 /* 2713 /* move on to the next group */
3262 * Here's how this works. 2714 if (search_start + num_bytes >
3263 * loop == 0: we were searching a block group via a hint 2715 block_group->key.objectid + block_group->key.offset) {
3264 * and didn't find anything, so we start at 2716 btrfs_add_free_space(block_group, offset, num_bytes);
3265 * the head of the block groups and keep searching 2717 goto loop;
3266 * loop == 1: we're searching through all of the block groups 2718 }
3267 * if we hit the head again we have searched 2719
3268 * all of the block groups for this space and we 2720 if (exclude_nr > 0 &&
3269 * need to try and allocate, if we can't, error out. 2720 if (exclude_nr > 0 &&
3270 * loop == 2: we allocated more space and are looping through 2722 search_start < exclude_start + exclude_nr)) {
3271 * all of the block groups again. 2723 search_start = exclude_start + exclude_nr;
3272 */ 2724
3273 if (loop == 0) { 2725 btrfs_add_free_space(block_group, offset, num_bytes);
3274 head = &space_info->block_groups; 2726 /*
3275 cur = head->next; 2727 * if search_start is still in this block group
3276 loop++; 2728 * then we just re-search this block group
3277 } else if (loop == 1 && cur == head) {
3278 int keep_going;
3279
3280 /* at this point we give up on the empty_size
3281 * allocations and just try to allocate the min
3282 * space.
3283 *
3284 * The extra_loop field was set if an empty_size
3285 * allocation was attempted above, and if this
3286 * is set we need to try the loop again without
3287 * the additional empty_size.
3288 */ 2729 */
3289 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
3290 empty_size = 0; 2731 search_start < (block_group->key.objectid +
3291 keep_going = extra_loop; 2732 block_group->key.offset))
3292 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
3293 2736
3294 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
3295 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
3296 ret = do_chunk_alloc(trans, root, num_bytes + 2739
3297 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
3298 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
3299 if (ret < 0) 2742 search_start - offset);
3300 goto loop_check; 2743 BUG_ON(offset > search_start);
3301 head = &space_info->block_groups; 2744
3302 /* 2745 /* we are all good, lets return */
3303 * we've allocated a new chunk, keep 2746 break;
3304 * trying 2747loop:
3305 */ 2748 btrfs_put_block_group(block_group);
3306 keep_going = 1; 2749 }
3307 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
3308 } else if (!allowed_chunk_alloc) { 2751
3309 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
3310 } 2753 * loop == 1, try again after forcing a chunk allocation
3311loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
3312 if (keep_going) { 2755 */
3313 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
3314 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
3315 } else { 2758 if (loop >= 2) {
3316 break; 2759 empty_size = 0;
3317 } 2760 empty_cluster = 0;
3318 } else if (cur == head) {
3319 break;
3320 } 2761 }
3321 2762
3322 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
3323 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
3324 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
3325 2770
3326 search_start = block_group->key.objectid; 2771 if (loop < 3) {
3327 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
3328 } 2778 }
3329 2779
3330 /* we found what we needed */ 2780 /* we found what we needed */
@@ -3332,21 +2782,10 @@ loop_check:
3332 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3333 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
3334 2784
3335 if (last_ptr) 2785 btrfs_put_block_group(block_group);
3336 *last_ptr = ins->objectid + ins->offset;
3337 ret = 0; 2786 ret = 0;
3338 } else if (!ret) {
3339 printk(KERN_ERR "btrfs searching for %llu bytes, "
3340 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3341 (unsigned long long)total_needed,
3342 (unsigned long long)num_bytes,
3343 loop, allowed_chunk_alloc);
3344 ret = -ENOSPC;
3345 } 2787 }
3346 if (block_group)
3347 put_block_group(block_group);
3348 2788
3349 up_read(&space_info->groups_sem);
3350 return ret; 2789 return ret;
3351} 2790}
3352 2791
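The rewritten find_free_extent() replaces the old head/cur list juggling with a simple escalation counter: loop 0 tries a clustered allocation in every block group, loop 1 retries after forcing a chunk allocation, loop 2 retries with empty_size and empty_cluster cleared, and anything past that is -ENOSPC. A user-space sketch of just that retry ladder, where try_alloc() and force_chunk_alloc() are hypothetical stand-ins for the real allocator:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-ins for the block-group search and chunk allocation */
static bool try_alloc(unsigned long long num_bytes,
                      unsigned long long empty_size,
                      unsigned long long empty_cluster)
{
        (void)num_bytes; (void)empty_size; (void)empty_cluster;
        return false;   /* pretend every block group is full */
}
static void force_chunk_alloc(void) { printf("allocating a new chunk\n"); }

static int find_free_extent(unsigned long long num_bytes,
                            unsigned long long empty_size,
                            unsigned long long empty_cluster,
                            bool allowed_chunk_alloc)
{
        int loop = 0;

        while (1) {
                if (try_alloc(num_bytes, empty_size, empty_cluster))
                        return 0;               /* found space */
                if (loop >= 3 || !(empty_size || empty_cluster || allowed_chunk_alloc))
                        return -ENOSPC;
                if (loop >= 2) {                /* last resort: drop the padding */
                        empty_size = 0;
                        empty_cluster = 0;
                }
                if (allowed_chunk_alloc) {      /* grow the space once, then retry */
                        force_chunk_alloc();
                        allowed_chunk_alloc = false;
                }
                /* (the real code sets space_info->force_alloc when chunk
                 *  allocation is not allowed) */
                loop++;                          /* search all block groups again */
        }
}

int main(void)
{
        printf("result: %d\n",
               find_free_extent(4096, 2 * 1024 * 1024, 64 * 1024, true));
        return 0;
}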
@@ -3451,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3451 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
3452 2891
3453 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
3454 put_block_group(cache); 2893 btrfs_put_block_group(cache);
3455 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
3456 2895
3457 return ret; 2896 return ret;
@@ -3475,10 +2914,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3475static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2914static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3476 struct btrfs_root *root, u64 parent, 2915 struct btrfs_root *root, u64 parent,
3477 u64 root_objectid, u64 ref_generation, 2916 u64 root_objectid, u64 ref_generation,
3478 u64 owner, struct btrfs_key *ins) 2917 u64 owner, struct btrfs_key *ins,
2918 int ref_mod)
3479{ 2919{
3480 int ret; 2920 int ret;
3481 int pending_ret;
3482 u64 super_used; 2921 u64 super_used;
3483 u64 root_used; 2922 u64 root_used;
3484 u64 num_bytes = ins->offset; 2923 u64 num_bytes = ins->offset;
@@ -3503,33 +2942,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3503 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 2942 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3504 spin_unlock(&info->delalloc_lock); 2943 spin_unlock(&info->delalloc_lock);
3505 2944
3506 if (root == extent_root) {
3507 struct pending_extent_op *extent_op;
3508
3509 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3510 BUG_ON(!extent_op);
3511
3512 extent_op->type = PENDING_EXTENT_INSERT;
3513 extent_op->bytenr = ins->objectid;
3514 extent_op->num_bytes = ins->offset;
3515 extent_op->parent = parent;
3516 extent_op->orig_parent = 0;
3517 extent_op->generation = ref_generation;
3518 extent_op->orig_generation = 0;
3519 extent_op->level = (int)owner;
3520 INIT_LIST_HEAD(&extent_op->list);
3521 extent_op->del = 0;
3522
3523 mutex_lock(&root->fs_info->extent_ins_mutex);
3524 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3525 ins->objectid + ins->offset - 1,
3526 EXTENT_WRITEBACK, GFP_NOFS);
3527 set_state_private(&root->fs_info->extent_ins,
3528 ins->objectid, (unsigned long)extent_op);
3529 mutex_unlock(&root->fs_info->extent_ins_mutex);
3530 goto update_block;
3531 }
3532
3533 memcpy(&keys[0], ins, sizeof(*ins)); 2945 memcpy(&keys[0], ins, sizeof(*ins));
3534 keys[1].objectid = ins->objectid; 2946 keys[1].objectid = ins->objectid;
3535 keys[1].type = BTRFS_EXTENT_REF_KEY; 2947 keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2952,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3540 path = btrfs_alloc_path(); 2952 path = btrfs_alloc_path();
3541 BUG_ON(!path); 2953 BUG_ON(!path);
3542 2954
2955 path->leave_spinning = 1;
3543 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 2956 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3544 sizes, 2); 2957 sizes, 2);
3545 BUG_ON(ret); 2958 BUG_ON(ret);
3546 2959
3547 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2960 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3548 struct btrfs_extent_item); 2961 struct btrfs_extent_item);
3549 btrfs_set_extent_refs(path->nodes[0], extent_item, 1); 2962 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
3550 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 2963 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3551 struct btrfs_extent_ref); 2964 struct btrfs_extent_ref);
3552 2965
3553 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 2966 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3554 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 2967 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3555 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 2968 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3556 btrfs_set_ref_num_refs(path->nodes[0], ref, 1); 2969 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
3557 2970
3558 btrfs_mark_buffer_dirty(path->nodes[0]); 2971 btrfs_mark_buffer_dirty(path->nodes[0]);
3559 2972
3560 trans->alloc_exclude_start = 0; 2973 trans->alloc_exclude_start = 0;
3561 trans->alloc_exclude_nr = 0; 2974 trans->alloc_exclude_nr = 0;
3562 btrfs_free_path(path); 2975 btrfs_free_path(path);
3563 finish_current_insert(trans, extent_root, 0);
3564 pending_ret = del_pending_extents(trans, extent_root, 0);
3565 2976
3566 if (ret) 2977 if (ret)
3567 goto out; 2978 goto out;
3568 if (pending_ret) {
3569 ret = pending_ret;
3570 goto out;
3571 }
3572 2979
3573update_block:
3574 ret = update_block_group(trans, root, ins->objectid, 2980 ret = update_block_group(trans, root, ins->objectid,
3575 ins->offset, 1, 0); 2981 ins->offset, 1, 0);
3576 if (ret) { 2982 if (ret) {
@@ -3592,9 +2998,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3592 2998
3593 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 2999 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3594 return 0; 3000 return 0;
3595 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3001
3596 ref_generation, owner, ins); 3002 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3597 update_reserved_extents(root, ins->objectid, ins->offset, 0); 3003 ins->offset, parent, root_objectid,
3004 ref_generation, owner,
3005 BTRFS_ADD_DELAYED_EXTENT, 0);
3006 BUG_ON(ret);
3598 return ret; 3007 return ret;
3599} 3008}
3600 3009
@@ -3619,9 +3028,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3619 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
3620 ins->offset); 3029 ins->offset);
3621 BUG_ON(ret); 3030 BUG_ON(ret);
3622 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
3623 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3624 ref_generation, owner, ins); 3033 ref_generation, owner, ins, 1);
3625 return ret; 3034 return ret;
3626} 3035}
3627 3036
@@ -3640,20 +3049,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3640 u64 search_end, struct btrfs_key *ins, u64 data) 3049 u64 search_end, struct btrfs_key *ins, u64 data)
3641{ 3050{
3642 int ret; 3051 int ret;
3643
3644 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3052 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3645 min_alloc_size, empty_size, hint_byte, 3053 min_alloc_size, empty_size, hint_byte,
3646 search_end, ins, data); 3054 search_end, ins, data);
3647 BUG_ON(ret); 3055 BUG_ON(ret);
3648 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3056 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3649 ret = __btrfs_alloc_reserved_extent(trans, root, parent, 3057 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3650 root_objectid, ref_generation, 3058 ins->offset, parent, root_objectid,
3651 owner_objectid, ins); 3059 ref_generation, owner_objectid,
3060 BTRFS_ADD_DELAYED_EXTENT, 0);
3652 BUG_ON(ret); 3061 BUG_ON(ret);
3653
3654 } else {
3655 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3656 } 3062 }
3063 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3657 return ret; 3064 return ret;
3658} 3065}
3659 3066
@@ -3789,7 +3196,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3789 3196
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3197 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791 3198
3792 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3199 ret = btrfs_free_extent(trans, root, disk_bytenr,
3793 btrfs_file_extent_disk_num_bytes(leaf, fi), 3200 btrfs_file_extent_disk_num_bytes(leaf, fi),
3794 leaf->start, leaf_owner, leaf_generation, 3201 leaf->start, leaf_owner, leaf_generation,
3795 key.objectid, 0); 3202 key.objectid, 0);
@@ -3829,7 +3236,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3829 */ 3236 */
3830 for (i = 0; i < ref->nritems; i++) { 3237 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot; 3238 info = ref->extents + sorted[i].slot;
3832 ret = __btrfs_free_extent(trans, root, info->bytenr, 3239 ret = btrfs_free_extent(trans, root, info->bytenr,
3833 info->num_bytes, ref->bytenr, 3240 info->num_bytes, ref->bytenr,
3834 ref->owner, ref->generation, 3241 ref->owner, ref->generation,
3835 info->objectid, 0); 3242 info->objectid, 0);
@@ -3846,12 +3253,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3846 return 0; 3253 return 0;
3847} 3254}
3848 3255
3849static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, 3256static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3257 struct btrfs_root *root, u64 start,
3850 u64 len, u32 *refs) 3258 u64 len, u32 *refs)
3851{ 3259{
3852 int ret; 3260 int ret;
3853 3261
3854 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); 3262 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3855 BUG_ON(ret); 3263 BUG_ON(ret);
3856 3264
3857#if 0 /* some debugging code in case we see problems here */ 3265#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3367,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3959 * we just decrement it below and don't update any 3367 * we just decrement it below and don't update any
3960 * of the refs the leaf points to. 3368 * of the refs the leaf points to.
3961 */ 3369 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3370 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3371 blocksize, &refs);
3963 BUG_ON(ret); 3372 BUG_ON(ret);
3964 if (refs != 1) 3373 if (refs != 1)
3965 continue; 3374 continue;
@@ -4010,7 +3419,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4010 */ 3419 */
4011 for (i = 0; i < refi; i++) { 3420 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr; 3421 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr, 3422 ret = btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start, 3423 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1); 3424 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret); 3425 BUG_ON(ret);
@@ -4053,7 +3462,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4053 3462
4054 WARN_ON(*level < 0); 3463 WARN_ON(*level < 0);
4055 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3464 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4056 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, 3465 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4057 path->nodes[*level]->len, &refs); 3466 path->nodes[*level]->len, &refs);
4058 BUG_ON(ret); 3467 BUG_ON(ret);
4059 if (refs > 1) 3468 if (refs > 1)
@@ -4104,7 +3513,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3513 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4105 blocksize = btrfs_level_size(root, *level - 1); 3514 blocksize = btrfs_level_size(root, *level - 1);
4106 3515
4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3516 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3517 blocksize, &refs);
4108 BUG_ON(ret); 3518 BUG_ON(ret);
4109 3519
4110 /* 3520 /*
@@ -4119,7 +3529,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4119 root_gen = btrfs_header_generation(parent); 3529 root_gen = btrfs_header_generation(parent);
4120 path->slots[*level]++; 3530 path->slots[*level]++;
4121 3531
4122 ret = __btrfs_free_extent(trans, root, bytenr, 3532 ret = btrfs_free_extent(trans, root, bytenr,
4123 blocksize, parent->start, 3533 blocksize, parent->start,
4124 root_owner, root_gen, 3534 root_owner, root_gen,
4125 *level - 1, 1); 3535 *level - 1, 1);
@@ -4165,7 +3575,7 @@ out:
4165 * cleanup and free the reference on the last node 3575 * cleanup and free the reference on the last node
4166 * we processed 3576 * we processed
4167 */ 3577 */
4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3578 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4169 parent->start, root_owner, root_gen, 3579 parent->start, root_owner, root_gen,
4170 *level, 1); 3580 *level, 1);
4171 free_extent_buffer(path->nodes[*level]); 3581 free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3764,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4354 struct btrfs_path *path; 3764 struct btrfs_path *path;
4355 int i; 3765 int i;
4356 int orig_level; 3766 int orig_level;
3767 int update_count;
4357 struct btrfs_root_item *root_item = &root->root_item; 3768 struct btrfs_root_item *root_item = &root->root_item;
4358 3769
4359 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3770 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3806,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4395 } 3806 }
4396 } 3807 }
4397 while (1) { 3808 while (1) {
3809 unsigned long update;
4398 wret = walk_down_tree(trans, root, path, &level); 3810 wret = walk_down_tree(trans, root, path, &level);
4399 if (wret > 0) 3811 if (wret > 0)
4400 break; 3812 break;
@@ -4407,12 +3819,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4407 break; 3819 break;
4408 if (wret < 0) 3820 if (wret < 0)
4409 ret = wret; 3821 ret = wret;
4410 if (trans->transaction->in_commit) { 3822 if (trans->transaction->in_commit ||
3823 trans->transaction->delayed_refs.flushing) {
4411 ret = -EAGAIN; 3824 ret = -EAGAIN;
4412 break; 3825 break;
4413 } 3826 }
4414 atomic_inc(&root->fs_info->throttle_gen); 3827 atomic_inc(&root->fs_info->throttle_gen);
4415 wake_up(&root->fs_info->transaction_throttle); 3828 wake_up(&root->fs_info->transaction_throttle);
3829 for (update_count = 0; update_count < 16; update_count++) {
3830 update = trans->delayed_ref_updates;
3831 trans->delayed_ref_updates = 0;
3832 if (update)
3833 btrfs_run_delayed_refs(trans, root, update);
3834 else
3835 break;
3836 }
4416 } 3837 }
4417 for (i = 0; i <= orig_level; i++) { 3838 for (i = 0; i <= orig_level; i++) {
4418 if (path->nodes[i]) { 3839 if (path->nodes[i]) {
@@ -5457,6 +4878,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5457 root->root_key.objectid, 4878 root->root_key.objectid,
5458 trans->transid, key.objectid); 4879 trans->transid, key.objectid);
5459 BUG_ON(ret); 4880 BUG_ON(ret);
4881
5460 ret = btrfs_free_extent(trans, root, 4882 ret = btrfs_free_extent(trans, root,
5461 bytenr, num_bytes, leaf->start, 4883 bytenr, num_bytes, leaf->start,
5462 btrfs_header_owner(leaf), 4884 btrfs_header_owner(leaf),
@@ -5768,9 +5190,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5768 ref_path, NULL, NULL); 5190 ref_path, NULL, NULL);
5769 BUG_ON(ret); 5191 BUG_ON(ret);
5770 5192
5771 if (root == root->fs_info->extent_root)
5772 btrfs_extent_post_op(trans, root);
5773
5774 return 0; 5193 return 0;
5775} 5194}
5776 5195
@@ -6038,6 +5457,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6038 if (!path) 5457 if (!path)
6039 return -ENOMEM; 5458 return -ENOMEM;
6040 5459
5460 path->leave_spinning = 1;
6041 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5461 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6042 if (ret) 5462 if (ret)
6043 goto out; 5463 goto out;
@@ -6208,6 +5628,9 @@ again:
6208 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5628 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6209 mutex_unlock(&root->fs_info->cleaner_mutex); 5629 mutex_unlock(&root->fs_info->cleaner_mutex);
6210 5630
5631 trans = btrfs_start_transaction(info->tree_root, 1);
5632 btrfs_commit_transaction(trans, info->tree_root);
5633
6211 while (1) { 5634 while (1) {
6212 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5635 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6213 if (ret < 0) 5636 if (ret < 0)
@@ -6294,7 +5717,7 @@ next:
6294 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
6295 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6296 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
6297 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
6298 ret = 0; 5721 ret = 0;
6299out: 5722out:
6300 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -6421,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6421 5844
6422 atomic_set(&cache->count, 1); 5845 atomic_set(&cache->count, 1);
6423 spin_lock_init(&cache->lock); 5846 spin_lock_init(&cache->lock);
6424 mutex_init(&cache->alloc_mutex); 5847 spin_lock_init(&cache->tree_lock);
6425 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
6426 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
6427 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
6428 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
6429 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -6466,7 +5890,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6466 5890
6467 extent_root = root->fs_info->extent_root; 5891 extent_root = root->fs_info->extent_root;
6468 5892
6469 root->fs_info->last_trans_new_blockgroup = trans->transid; 5893 root->fs_info->last_trans_log_full_commit = trans->transid;
6470 5894
6471 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5895 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6472 if (!cache) 5896 if (!cache)
@@ -6477,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6477 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5901 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6478 atomic_set(&cache->count, 1); 5902 atomic_set(&cache->count, 1);
6479 spin_lock_init(&cache->lock); 5903 spin_lock_init(&cache->lock);
6480 mutex_init(&cache->alloc_mutex); 5904 spin_lock_init(&cache->tree_lock);
6481 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
6482 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
6483 5908
6484 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
6485 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6500,9 +5925,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6500 sizeof(cache->item)); 5925 sizeof(cache->item));
6501 BUG_ON(ret); 5926 BUG_ON(ret);
6502 5927
6503 finish_current_insert(trans, extent_root, 0);
6504 ret = del_pending_extents(trans, extent_root, 0);
6505 BUG_ON(ret);
6506 set_avail_alloc_bits(extent_root->fs_info, type); 5928 set_avail_alloc_bits(extent_root->fs_info, type);
6507 5929
6508 return 0; 5930 return 0;
@@ -6542,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6542 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
6543 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
6544 5966
6545 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
6546 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
6547 5969
6548 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6549 if (ret > 0) 5971 if (ret > 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2884 disko = 0;
2885 flags = 0; 2885 flags = 0;
2886 2886
2887 switch (em->block_start) { 2887 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1; 2888 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST; 2889 flags |= FIEMAP_EXTENT_LAST;
2891 break; 2890 } else if (em->block_start == EXTENT_MAP_HOLE) {
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN; 2891 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break; 2892 } else if (em->block_start == EXTENT_MAP_INLINE) {
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED); 2894 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break; 2895 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC | 2896 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN); 2897 FIEMAP_EXTENT_UNKNOWN);
2902 break; 2898 } else {
2903 default:
2904 disko = em->block_start; 2899 disko = em->block_start;
2905 break;
2906 } 2900 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2902 flags |= FIEMAP_EXTENT_ENCODED;
@@ -3124,20 +3118,15 @@ void free_extent_buffer(struct extent_buffer *eb)
3124int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3118int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3125 struct extent_buffer *eb) 3119 struct extent_buffer *eb)
3126{ 3120{
3127 int set;
3128 unsigned long i; 3121 unsigned long i;
3129 unsigned long num_pages; 3122 unsigned long num_pages;
3130 struct page *page; 3123 struct page *page;
3131 3124
3132 u64 start = eb->start;
3133 u64 end = start + eb->len - 1;
3134
3135 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3136 num_pages = num_extent_pages(eb->start, eb->len); 3125 num_pages = num_extent_pages(eb->start, eb->len);
3137 3126
3138 for (i = 0; i < num_pages; i++) { 3127 for (i = 0; i < num_pages; i++) {
3139 page = extent_buffer_page(eb, i); 3128 page = extent_buffer_page(eb, i);
3140 if (!set && !PageDirty(page)) 3129 if (!PageDirty(page))
3141 continue; 3130 continue;
3142 3131
3143 lock_page(page); 3132 lock_page(page);
@@ -3146,22 +3135,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3146 else 3135 else
3147 set_page_private(page, EXTENT_PAGE_PRIVATE); 3136 set_page_private(page, EXTENT_PAGE_PRIVATE);
3148 3137
3149 /*
3150 * if we're on the last page or the first page and the
3151 * block isn't aligned on a page boundary, do extra checks
3152 * to make sure we don't clean page that is partially dirty
3153 */
3154 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3155 ((i == num_pages - 1) &&
3156 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3157 start = (u64)page->index << PAGE_CACHE_SHIFT;
3158 end = start + PAGE_CACHE_SIZE - 1;
3159 if (test_range_bit(tree, start, end,
3160 EXTENT_DIRTY, 0)) {
3161 unlock_page(page);
3162 continue;
3163 }
3164 }
3165 clear_page_dirty_for_io(page); 3138 clear_page_dirty_for_io(page);
3166 spin_lock_irq(&page->mapping->tree_lock); 3139 spin_lock_irq(&page->mapping->tree_lock);
3167 if (!PageDirty(page)) { 3140 if (!PageDirty(page)) {
@@ -3187,29 +3160,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3187{ 3160{
3188 unsigned long i; 3161 unsigned long i;
3189 unsigned long num_pages; 3162 unsigned long num_pages;
3163 int was_dirty = 0;
3190 3164
3165 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3191 num_pages = num_extent_pages(eb->start, eb->len); 3166 num_pages = num_extent_pages(eb->start, eb->len);
3192 for (i = 0; i < num_pages; i++) { 3167 for (i = 0; i < num_pages; i++)
3193 struct page *page = extent_buffer_page(eb, i);
3194 /* writepage may need to do something special for the
3195 * first page, we have to make sure page->private is
3196 * properly set. releasepage may drop page->private
3197 * on us if the page isn't already dirty.
3198 */
3199 lock_page(page);
3200 if (i == 0) {
3201 set_page_extent_head(page, eb->len);
3202 } else if (PagePrivate(page) &&
3203 page->private != EXTENT_PAGE_PRIVATE) {
3204 set_page_extent_mapped(page);
3205 }
3206 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3168 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3207 set_extent_dirty(tree, page_offset(page), 3169 return was_dirty;
3208 page_offset(page) + PAGE_CACHE_SIZE - 1,
3209 GFP_NOFS);
3210 unlock_page(page);
3211 }
3212 return 0;
3213} 3170}
3214 3171
3215int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3172int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3746,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3789 ret = 0; 3746 ret = 0;
3790 goto out; 3747 goto out;
3791 } 3748 }
3749 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3750 ret = 0;
3751 goto out;
3752 }
3792 /* at this point we can safely release the extent buffer */ 3753 /* at this point we can safely release the extent buffer */
3793 num_pages = num_extent_pages(eb->start, eb->len); 3754 num_pages = num_extent_pages(eb->start, eb->len);
3794 for (i = 0; i < num_pages; i++) 3755 for (i = 0; i < num_pages; i++)
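Taken together, the extent_io.c hunks above move dirty tracking for extent buffers off the io tree's EXTENT_DIRTY range bits and onto a single atomic flag in eb->bflags. A minimal sketch of the resulting pattern, using only the bit helpers visible in this patch (the three call sites are condensed for illustration; the final flag-clearing step is an assumption about the writeback side, not shown in these hunks):

	/* set_extent_buffer_dirty(): record the old state atomically */
	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	/* try_release_extent_buffer(): a buffer with the flag set
	 * must not be torn down */
	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
		return 0;

	/* assumed: whichever path writes the buffer back clears the
	 * flag again before the pages are cleaned */
	clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);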
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
25/* these are bit numbers for test/set bit */ 25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0 26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1 27#define EXTENT_BUFFER_BLOCKING 1
28#define EXTENT_BUFFER_DIRTY 2
28 29
29/* 30/*
30 * page->private values. Every page that is controlled by the extent 31 * page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
254 struct extent_buffer *eb); 255 struct extent_buffer *eb);
255int set_extent_buffer_dirty(struct extent_io_tree *tree, 256int set_extent_buffer_dirty(struct extent_io_tree *tree,
256 struct extent_buffer *eb); 257 struct extent_buffer *eb);
258int test_extent_buffer_dirty(struct extent_io_tree *tree,
259 struct extent_buffer *eb);
257int set_extent_buffer_uptodate(struct extent_io_tree *tree, 260int set_extent_buffer_uptodate(struct extent_io_tree *tree,
258 struct extent_buffer *eb); 261 struct extent_buffer *eb);
259int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 262int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
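The header hunk only introduces the EXTENT_BUFFER_DIRTY bit and the test_extent_buffer_dirty() prototype; the body is not part of the hunks quoted here. A minimal implementation consistent with this interface would be the sketch below (the real body in the patch may do more, for example fall back to checking the pages):

int test_extent_buffer_dirty(struct extent_io_tree *tree,
			     struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
}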
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
52 file_key.offset = pos; 52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54 54
55 path->leave_spinning = 1;
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 56 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item)); 57 sizeof(*item));
57 if (ret < 0) 58 if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
523 key.offset = end_byte - 1; 524 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY; 525 key.type = BTRFS_EXTENT_CSUM_KEY;
525 526
527 path->leave_spinning = 1;
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 528 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) { 529 if (ret > 0) {
528 if (path->slots[0] == 0) 530 if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
757 } else { 759 } else {
758 ins_size = csum_size; 760 ins_size = csum_size;
759 } 761 }
762 path->leave_spinning = 1;
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 763 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size); 764 ins_size);
765 path->leave_spinning = 0;
762 if (ret < 0) 766 if (ret < 0)
763 goto fail_unlock; 767 goto fail_unlock;
764 if (ret != 0) { 768 if (ret != 0) {
@@ -776,7 +780,6 @@ found:
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 780 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0])); 781 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL; 782 eb_token = NULL;
779 cond_resched();
780next_sector: 783next_sector:
781 784
782 if (!eb_token || 785 if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
817 eb_token = NULL; 820 eb_token = NULL;
818 } 821 }
819 btrfs_mark_buffer_dirty(path->nodes[0]); 822 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) { 823 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path); 824 btrfs_release_path(root, path);
825 cond_resched();
823 goto again; 826 goto again;
824 } 827 }
825out: 828out:
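The path->leave_spinning additions in file-item.c follow the same convention used in the file.c, inode-item.c and inode.c hunks later in this patch: the caller asks btrfs_search_slot()/btrfs_insert_empty_item() to hand the leaf back still spin-locked, so short item setup can run without converting to the blocking lock. A hypothetical caller (trans, root, key and item_size are assumed context, not taken from this patch) would look like:

	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* keep the returned leaf lock in its cheaper spinning state */
	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, root, path, &key, item_size);
	if (ret < 0)
		goto out;

	/* short, non-sleeping item setup runs here */
	btrfs_mark_buffer_dirty(path->nodes[0]);
out:
	btrfs_free_path(path);
	return ret;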
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
607 607
608 btrfs_release_path(root, path); 608 btrfs_release_path(root, path);
609 path->leave_spinning = 1;
609 ret = btrfs_insert_empty_item(trans, root, path, &ins, 610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
610 sizeof(*extent)); 611 sizeof(*extent));
611 BUG_ON(ret); 612 BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
639 ram_bytes); 640 ram_bytes);
640 btrfs_set_file_extent_type(leaf, extent, found_type); 641 btrfs_set_file_extent_type(leaf, extent, found_type);
641 642
643 btrfs_unlock_up_safe(path, 1);
642 btrfs_mark_buffer_dirty(path->nodes[0]); 644 btrfs_mark_buffer_dirty(path->nodes[0]);
645 btrfs_set_lock_blocking(path->nodes[0]);
643 646
644 if (disk_bytenr != 0) { 647 if (disk_bytenr != 0) {
645 ret = btrfs_update_extent_ref(trans, root, 648 ret = btrfs_update_extent_ref(trans, root,
646 disk_bytenr, orig_parent, 649 disk_bytenr,
650 le64_to_cpu(old.disk_num_bytes),
651 orig_parent,
647 leaf->start, 652 leaf->start,
648 root->root_key.objectid, 653 root->root_key.objectid,
649 trans->transid, ins.objectid); 654 trans->transid, ins.objectid);
650 655
651 BUG_ON(ret); 656 BUG_ON(ret);
652 } 657 }
658 path->leave_spinning = 0;
653 btrfs_release_path(root, path); 659 btrfs_release_path(root, path);
654 if (disk_bytenr != 0) 660 if (disk_bytenr != 0)
655 inode_add_bytes(inode, extent_end - end); 661 inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
912 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 918 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
913 919
914 if (orig_parent != leaf->start) { 920 if (orig_parent != leaf->start) {
915 ret = btrfs_update_extent_ref(trans, root, bytenr, 921 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
916 orig_parent, leaf->start, 922 orig_parent, leaf->start,
917 root->root_key.objectid, 923 root->root_key.objectid,
918 trans->transid, inode->i_ino); 924 trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
1155 page_cache_release(pinned[1]); 1161 page_cache_release(pinned[1]);
1156 *ppos = pos; 1162 *ppos = pos;
1157 1163
1164 /*
1165 * we want to make sure fsync finds this change
1166 * but we haven't joined a transaction running right now.
1167 *
1168 * Later on, someone is sure to update the inode and get the
1169 * real transid recorded.
1170 *
1171 * We set last_trans now to the fs_info generation + 1,
1172 * this will either be one more than the running transaction
1173 * or the generation used for the next transaction if there isn't
1174 * one running right now.
1175 */
1176 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1177
1158 if (num_written > 0 && will_write) { 1178 if (num_written > 0 && will_write) {
1159 struct btrfs_trans_handle *trans; 1179 struct btrfs_trans_handle *trans;
1160 1180
@@ -1167,8 +1187,11 @@ out_nolock:
1167 ret = btrfs_log_dentry_safe(trans, root, 1187 ret = btrfs_log_dentry_safe(trans, root,
1168 file->f_dentry); 1188 file->f_dentry);
1169 if (ret == 0) { 1189 if (ret == 0) {
1170 btrfs_sync_log(trans, root); 1190 ret = btrfs_sync_log(trans, root);
1171 btrfs_end_transaction(trans, root); 1191 if (ret == 0)
1192 btrfs_end_transaction(trans, root);
1193 else
1194 btrfs_commit_transaction(trans, root);
1172 } else { 1195 } else {
1173 btrfs_commit_transaction(trans, root); 1196 btrfs_commit_transaction(trans, root);
1174 } 1197 }
@@ -1185,6 +1208,18 @@ out_nolock:
1185 1208
1186int btrfs_release_file(struct inode *inode, struct file *filp) 1209int btrfs_release_file(struct inode *inode, struct file *filp)
1187{ 1210{
1211 /*
 1212	 * ordered_data_close is set by setattr when we are about to truncate
1213 * a file from a non-zero size to a zero size. This tries to
1214 * flush down new bytes that may have been written if the
1215 * application were using truncate to replace a file in place.
1216 */
1217 if (BTRFS_I(inode)->ordered_data_close) {
1218 BTRFS_I(inode)->ordered_data_close = 0;
1219 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1220 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1221 filemap_flush(inode->i_mapping);
1222 }
1188 if (filp->private_data) 1223 if (filp->private_data)
1189 btrfs_ioctl_trans_end(filp); 1224 btrfs_ioctl_trans_end(filp);
1190 return 0; 1225 return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1260 if (ret > 0) { 1295 if (ret > 0) {
1261 ret = btrfs_commit_transaction(trans, root); 1296 ret = btrfs_commit_transaction(trans, root);
1262 } else { 1297 } else {
1263 btrfs_sync_log(trans, root); 1298 ret = btrfs_sync_log(trans, root);
1264 ret = btrfs_end_transaction(trans, root); 1299 if (ret == 0)
1300 ret = btrfs_end_transaction(trans, root);
1301 else
1302 ret = btrfs_commit_transaction(trans, root);
1265 } 1303 }
1266 mutex_lock(&dentry->d_inode->i_mutex); 1304 mutex_lock(&dentry->d_inode->i_mutex);
1267out: 1305out:
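Both file.c call sites above stop ignoring btrfs_sync_log()'s return value: a successful log sync still ends with btrfs_end_transaction(), while a failed sync now falls back to a full commit so the fsync guarantee is kept. Condensed, the pattern at both sites is:

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret == 0) {
		ret = btrfs_sync_log(trans, root);
		if (ret == 0)
			ret = btrfs_end_transaction(trans, root);
		else
			ret = btrfs_commit_transaction(trans, root);
	} else {
		/* logging was not possible, commit the whole transaction */
		ret = btrfs_commit_transaction(trans, root);
	}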
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
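The rewritten btrfs_add_free_space() above folds the old __btrfs_add_free_space() merge logic into a single path under the new per-block-group tree_lock: a freed range is coalesced with the entry that starts exactly where it ends (exact search, fuzzy == 0) and with the entry that ends exactly where it starts (fuzzy search at offset - 1). Stripped of locking and the error printk, the merge step is:

	info->offset = offset;
	info->bytes = bytes;

	right_info = tree_search_offset(&block_group->free_space_offset,
					offset + bytes, 0, 0);
	if (right_info) {
		unlink_free_space(block_group, right_info);
		info->bytes += right_info->bytes;
		kfree(right_info);
	}

	left_info = tree_search_offset(&block_group->free_space_offset,
				       offset - 1, 0, 1);
	if (left_info && left_info->offset + left_info->bytes == offset) {
		unlink_free_space(block_group, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kfree(left_info);
	}

	ret = link_free_space(block_group, info);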
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while(node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
 498		/* someone else has already freed it, don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while(1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least bytes free and up to empty_size + bytes free.
571 * We might not find them all in one contiguous area.
572 *
 573 * returns zero and sets up the cluster if things worked out, otherwise
 574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
592 /* for metadata, allow allocates with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
 596		 * flushing out the delayed refs; it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while(1) {
 628		/* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while(1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
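btrfs_find_space_cluster() above is essentially a sliding-window scan over the offset-ordered free space entries: keep absorbing the next entry until the window holds bytes + empty_size, and restart the window whenever the next entry would leave it too sparse. A condensed view of the window accounting (the retry counter and the min_bytes escalation are omitted):

	window_start = entry->offset;
	window_free = entry->bytes;
	last = entry;

	while (window_free < bytes + empty_size) {
		node = rb_next(&last->offset_index);
		if (!node)
			return -ENOSPC;
		next = rb_entry(node, struct btrfs_free_space, offset_index);

		if (next->offset - window_start > (bytes + empty_size) * 2) {
			/* window got too sparse, restart it at 'next' */
			window_start = next->offset;
			window_free = next->bytes;
		} else {
			window_free += next->bytes;
		}
		last = next;
	}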
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
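The new header exposes the cluster interface as a unit. A hypothetical caller in the allocator would roughly follow the sequence below; error handling and the refill_lock discipline are simplified, and the variable names (block_group, search_start, min_start and so on) are placeholders rather than code from this patch:

	struct btrfs_free_cluster cluster;
	u64 start;

	btrfs_init_free_cluster(&cluster);

	/* gather at least 'bytes' (ideally bytes + empty_size) of free
	 * space from this block group into the cluster */
	if (btrfs_find_space_cluster(trans, block_group, &cluster,
				     search_start, bytes, empty_size) == 0) {
		/* carve the allocation out of it; 0 means it did not fit */
		start = btrfs_alloc_from_cluster(block_group, &cluster,
						 bytes, min_start);
		if (start == 0)
			btrfs_return_cluster_to_free_space(block_group,
							   &cluster);
	}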
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
73 if (!path) 73 if (!path)
74 return -ENOMEM; 74 return -ENOMEM;
75 75
76 path->leave_spinning = 1;
77
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 78 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) { 79 if (ret > 0) {
78 ret = -ENOENT; 80 ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
127 if (!path) 129 if (!path)
128 return -ENOMEM; 130 return -ENOMEM;
129 131
132 path->leave_spinning = 1;
130 ret = btrfs_insert_empty_item(trans, root, path, &key, 133 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len); 134 ins_len);
132 if (ret == -EEXIST) { 135 if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22a..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
134 if (!path) 134 if (!path)
135 return -ENOMEM; 135 return -ENOMEM;
136 136
137 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode); 138 btrfs_set_trans_block_group(trans, inode);
138 139
139 key.objectid = inode->i_ino; 140 key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
167 cur_size = min_t(unsigned long, compressed_size, 168 cur_size = min_t(unsigned long, compressed_size,
168 PAGE_CACHE_SIZE); 169 PAGE_CACHE_SIZE);
169 170
170 kaddr = kmap(cpage); 171 kaddr = kmap_atomic(cpage, KM_USER0);
171 write_extent_buffer(leaf, kaddr, ptr, cur_size); 172 write_extent_buffer(leaf, kaddr, ptr, cur_size);
172 kunmap(cpage); 173 kunmap_atomic(kaddr, KM_USER0);
173 174
174 i++; 175 i++;
175 ptr += cur_size; 176 ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
204 * does the checks required to make sure the data is small enough 205 * does the checks required to make sure the data is small enough
205 * to fit as an inline extent. 206 * to fit as an inline extent.
206 */ 207 */
207static int cow_file_range_inline(struct btrfs_trans_handle *trans, 208static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
208 struct btrfs_root *root, 209 struct btrfs_root *root,
209 struct inode *inode, u64 start, u64 end, 210 struct inode *inode, u64 start, u64 end,
210 size_t compressed_size, 211 size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
854 u64 cur_end; 855 u64 cur_end;
855 int limit = 10 * 1024 * 1042; 856 int limit = 10 * 1024 * 1042;
856 857
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 858 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 859 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while (start < end) { 860 while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
935 * If no cow copies or snapshots exist, we write directly to the existing 931 * If no cow copies or snapshots exist, we write directly to the existing
936 * blocks on disk 932 * blocks on disk
937 */ 933 */
938static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 934static noinline int run_delalloc_nocow(struct inode *inode,
935 struct page *locked_page,
939 u64 start, u64 end, int *page_started, int force, 936 u64 start, u64 end, int *page_started, int force,
940 unsigned long *nr_written) 937 unsigned long *nr_written)
941{ 938{
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1133 unsigned long *nr_written) 1130 unsigned long *nr_written)
1134{ 1131{
1135 int ret; 1132 int ret;
1133 struct btrfs_root *root = BTRFS_I(inode)->root;
1136 1134
1137 if (btrfs_test_flag(inode, NODATACOW)) 1135 if (btrfs_test_flag(inode, NODATACOW))
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1136 ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1140 else if (btrfs_test_flag(inode, PREALLOC)) 1138 else if (btrfs_test_flag(inode, PREALLOC))
1141 ret = run_delalloc_nocow(inode, locked_page, start, end, 1139 ret = run_delalloc_nocow(inode, locked_page, start, end,
1142 page_started, 0, nr_written); 1140 page_started, 0, nr_written);
1141 else if (!btrfs_test_opt(root, COMPRESS))
1142 ret = cow_file_range(inode, locked_page, start, end,
1143 page_started, nr_written, 1);
1143 else 1144 else
1144 ret = cow_file_range_async(inode, locked_page, start, end, 1145 ret = cow_file_range_async(inode, locked_page, start, end,
1145 page_started, nr_written); 1146 page_started, nr_written);
1146
1147 return ret; 1147 return ret;
1148} 1148}
1149 1149
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1453 path = btrfs_alloc_path(); 1453 path = btrfs_alloc_path();
1454 BUG_ON(!path); 1454 BUG_ON(!path);
1455 1455
1456 path->leave_spinning = 1;
1456 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1457 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, file_pos, &hint);
1458 BUG_ON(ret); 1459 BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1475 btrfs_set_file_extent_compression(leaf, fi, compression); 1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1476 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1477 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479
1480 btrfs_unlock_up_safe(path, 1);
1481 btrfs_set_lock_blocking(leaf);
1482
1478 btrfs_mark_buffer_dirty(leaf); 1483 btrfs_mark_buffer_dirty(leaf);
1479 1484
1480 inode_add_bytes(inode, num_bytes); 1485 inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1487 root->root_key.objectid, 1492 root->root_key.objectid,
1488 trans->transid, inode->i_ino, &ins); 1493 trans->transid, inode->i_ino, &ins);
1489 BUG_ON(ret); 1494 BUG_ON(ret);
1490
1491 btrfs_free_path(path); 1495 btrfs_free_path(path);
1496
1492 return 0; 1497 return 0;
1493} 1498}
1494 1499
1500/*
1501 * helper function for btrfs_finish_ordered_io, this
1502 * just reads in some of the csum leaves to prime them into ram
1503 * before we start the transaction. It limits the amount of btree
1504 * reads required while inside the transaction.
1505 */
1506static noinline void reada_csum(struct btrfs_root *root,
1507 struct btrfs_path *path,
1508 struct btrfs_ordered_extent *ordered_extent)
1509{
1510 struct btrfs_ordered_sum *sum;
1511 u64 bytenr;
1512
1513 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1514 list);
1515 bytenr = sum->sums[0].bytenr;
1516
1517 /*
1518 * we don't care about the results, the point of this search is
1519 * just to get the btree leaves into ram
1520 */
1521 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1522}
1523
1495/* as ordered data IO finishes, this gets called so we can finish 1524/* as ordered data IO finishes, this gets called so we can finish
1496 * an ordered extent if the range of bytes in the file it covers are 1525 * an ordered extent if the range of bytes in the file it covers are
1497 * fully written. 1526 * fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1500{ 1529{
1501 struct btrfs_root *root = BTRFS_I(inode)->root; 1530 struct btrfs_root *root = BTRFS_I(inode)->root;
1502 struct btrfs_trans_handle *trans; 1531 struct btrfs_trans_handle *trans;
1503 struct btrfs_ordered_extent *ordered_extent; 1532 struct btrfs_ordered_extent *ordered_extent = NULL;
1504 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1533 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1534 struct btrfs_path *path;
1505 int compressed = 0; 1535 int compressed = 0;
1506 int ret; 1536 int ret;
1507 1537
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1509 if (!ret) 1539 if (!ret)
1510 return 0; 1540 return 0;
1511 1541
1542 /*
1543 * before we join the transaction, try to do some of our IO.
1544 * This will limit the amount of IO that we have to do with
1545 * the transaction running. We're unlikely to need to do any
 1546	 * IO if the file extents are new, the disk_i_size check
1547 * covers the most common case.
1548 */
1549 if (start < BTRFS_I(inode)->disk_i_size) {
1550 path = btrfs_alloc_path();
1551 if (path) {
1552 ret = btrfs_lookup_file_extent(NULL, root, path,
1553 inode->i_ino,
1554 start, 0);
1555 ordered_extent = btrfs_lookup_ordered_extent(inode,
1556 start);
1557 if (!list_empty(&ordered_extent->list)) {
1558 btrfs_release_path(root, path);
1559 reada_csum(root, path, ordered_extent);
1560 }
1561 btrfs_free_path(path);
1562 }
1563 }
1564
1512 trans = btrfs_join_transaction(root, 1); 1565 trans = btrfs_join_transaction(root, 1);
1513 1566
1514 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1567 if (!ordered_extent)
1568 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1515 BUG_ON(!ordered_extent); 1569 BUG_ON(!ordered_extent);
1516 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1570 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1517 goto nocow; 1571 goto nocow;
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2101 2155
2102 path = btrfs_alloc_path(); 2156 path = btrfs_alloc_path();
2103 BUG_ON(!path); 2157 BUG_ON(!path);
2158 path->leave_spinning = 1;
2104 ret = btrfs_lookup_inode(trans, root, path, 2159 ret = btrfs_lookup_inode(trans, root, path,
2105 &BTRFS_I(inode)->location, 1); 2160 &BTRFS_I(inode)->location, 1);
2106 if (ret) { 2161 if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2147 goto err; 2202 goto err;
2148 } 2203 }
2149 2204
2205 path->leave_spinning = 1;
2150 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2206 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2151 name, name_len, -1); 2207 name, name_len, -1);
2152 if (IS_ERR(di)) { 2208 if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2190 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2191 inode, dir->i_ino); 2247 inode, dir->i_ino);
2192 BUG_ON(ret != 0 && ret != -ENOENT); 2248 BUG_ON(ret != 0 && ret != -ENOENT);
2193 if (ret != -ENOENT)
2194 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2195 2249
2196 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2250 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2197 dir, index); 2251 dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2224 trans = btrfs_start_transaction(root, 1); 2278 trans = btrfs_start_transaction(root, 1);
2225 2279
2226 btrfs_set_trans_block_group(trans, dir); 2280 btrfs_set_trans_block_group(trans, dir);
2281
2282 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283
2227 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2284 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2228 dentry->d_name.name, dentry->d_name.len); 2285 dentry->d_name.name, dentry->d_name.len);
2229 2286
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2498 key.type = (u8)-1; 2555 key.type = (u8)-1;
2499 2556
2500search_again: 2557search_again:
2558 path->leave_spinning = 1;
2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2559 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2502 if (ret < 0) 2560 if (ret < 0)
2503 goto error; 2561 goto error;
@@ -2644,6 +2702,7 @@ delete:
2644 break; 2702 break;
2645 } 2703 }
2646 if (found_extent) { 2704 if (found_extent) {
2705 btrfs_set_path_blocking(path);
2647 ret = btrfs_free_extent(trans, root, extent_start, 2706 ret = btrfs_free_extent(trans, root, extent_start,
2648 extent_num_bytes, 2707 extent_num_bytes,
2649 leaf->start, root_owner, 2708 leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2848 if (err) 2907 if (err)
2849 return err; 2908 return err;
2850 2909
2851 if (S_ISREG(inode->i_mode) && 2910 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2852 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2911 if (attr->ia_size > inode->i_size) {
2853 err = btrfs_cont_expand(inode, attr->ia_size); 2912 err = btrfs_cont_expand(inode, attr->ia_size);
2854 if (err) 2913 if (err)
2855 return err; 2914 return err;
2915 } else if (inode->i_size > 0 &&
2916 attr->ia_size == 0) {
2917
2918 /* we're truncating a file that used to have good
2919 * data down to zero. Make sure it gets into
2920 * the ordered flush list so that any new writes
2921 * get down to disk quickly.
2922 */
2923 BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2856 } 2925 }
2857 2926
2858 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->disk_i_size = 0; 3053 bi->disk_i_size = 0;
2985 bi->flags = 0; 3054 bi->flags = 0;
2986 bi->index_cnt = (u64)-1; 3055 bi->index_cnt = (u64)-1;
2987 bi->log_dirty_trans = 0; 3056 bi->last_unlink_trans = 0;
2988 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3057 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2989 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3058 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2990 inode->i_mapping, GFP_NOFS); 3059 inode->i_mapping, GFP_NOFS);
2991 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2992 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
2993 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
2994 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2995 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
2996 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3411,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3411 3481
3412 if (dir) { 3482 if (dir) {
3413 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3414 if (ret) 3484 if (ret) {
3485 iput(inode);
3415 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3416 } 3488 }
3417 /* 3489 /*
3418 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3449,6 +3521,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3449 sizes[0] = sizeof(struct btrfs_inode_item); 3521 sizes[0] = sizeof(struct btrfs_inode_item);
3450 sizes[1] = name_len + sizeof(*ref); 3522 sizes[1] = name_len + sizeof(*ref);
3451 3523
3524 path->leave_spinning = 1;
3452 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3525 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3453 if (ret != 0) 3526 if (ret != 0)
3454 goto fail; 3527 goto fail;
@@ -3494,6 +3567,7 @@ fail:
3494 if (dir) 3567 if (dir)
3495 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3496 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3497 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3498} 3572}
3499 3573
@@ -3727,6 +3801,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3727 drop_inode = 1; 3801 drop_inode = 1;
3728 3802
3729 nr = trans->blocks_used; 3803 nr = trans->blocks_used;
3804
3805 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3730 btrfs_end_transaction_throttle(trans, root); 3806 btrfs_end_transaction_throttle(trans, root);
3731fail: 3807fail:
3732 if (drop_inode) { 3808 if (drop_inode) {
@@ -4292,8 +4368,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4292 * beyond EOF, then the page is guaranteed safe against truncation until we 4368 * beyond EOF, then the page is guaranteed safe against truncation until we
4293 * unlock the page. 4369 * unlock the page.
4294 */ 4370 */
4295int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4371int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4296{ 4372{
4373 struct page *page = vmf->page;
4297 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4374 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4298 struct btrfs_root *root = BTRFS_I(inode)->root; 4375 struct btrfs_root *root = BTRFS_I(inode)->root;
4299 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4376 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4383,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4306 u64 page_end; 4383 u64 page_end;
4307 4384
4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4385 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4309 if (ret) 4386 if (ret) {
4387 if (ret == -ENOMEM)
4388 ret = VM_FAULT_OOM;
4389 else /* -ENOSPC, -EIO, etc */
4390 ret = VM_FAULT_SIGBUS;
4310 goto out; 4391 goto out;
4392 }
4311 4393
4312 ret = -EINVAL; 4394 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4313again: 4395again:
4314 lock_page(page); 4396 lock_page(page);
4315 size = i_size_read(inode); 4397 size = i_size_read(inode);
@@ -4357,6 +4439,8 @@ again:
4357 } 4439 }
4358 ClearPageChecked(page); 4440 ClearPageChecked(page);
4359 set_page_dirty(page); 4441 set_page_dirty(page);
4442
4443 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4360 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4444 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4361 4445
4362out_unlock: 4446out_unlock:
@@ -4382,6 +4466,27 @@ static void btrfs_truncate(struct inode *inode)
4382 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4466 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4383 4467
4384 trans = btrfs_start_transaction(root, 1); 4468 trans = btrfs_start_transaction(root, 1);
4469
4470 /*
4471 * setattr is responsible for setting the ordered_data_close flag,
4472 * but that is only tested during the last file release. That
4473 * could happen well after the next commit, leaving a great big
4474 * window where new writes may get lost if someone chooses to write
4475 * to this file after truncating to zero
4476 *
4477 * The inode doesn't have any dirty data here, and so if we commit
4478 * this is a noop. If someone immediately starts writing to the inode
4479 * it is very likely we'll catch some of their writes in this
4480 * transaction, and the commit will find this file on the ordered
4481 * data list with good things to send down.
4482 *
4483 * This is a best effort solution, there is still a window where
4484 * using truncate to replace the contents of the file will
4485 * end up with a zero length file after a crash.
4486 */
4487 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4488 btrfs_add_ordered_operation(trans, root, inode);
4489
4385 btrfs_set_trans_block_group(trans, inode); 4490 btrfs_set_trans_block_group(trans, inode);
4386 btrfs_i_size_write(inode, inode->i_size); 4491 btrfs_i_size_write(inode, inode->i_size);
4387 4492
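Three pieces of this patch cooperate on the truncate-to-zero case: btrfs_setattr() marks the inode, the final btrfs_release_file() kicks off a flush, and btrfs_truncate() puts the inode on the ordered-operations list so the next commit writes it out. Pulled out of their hunks, the cooperating fragments are:

	/* btrfs_setattr(): good data is about to be truncated away */
	if (inode->i_size > 0 && attr->ia_size == 0)
		BTRFS_I(inode)->ordered_data_close = 1;

	/* btrfs_release_file(): push any replacement data toward disk */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}

	/* btrfs_truncate(): make sure the commit still sees the inode */
	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
		btrfs_add_ordered_operation(trans, root, inode);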
@@ -4458,12 +4563,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4458 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4563 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4459 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4564 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4460 INIT_LIST_HEAD(&ei->i_orphan); 4565 INIT_LIST_HEAD(&ei->i_orphan);
4566 INIT_LIST_HEAD(&ei->ordered_operations);
4461 return &ei->vfs_inode; 4567 return &ei->vfs_inode;
4462} 4568}
4463 4569
4464void btrfs_destroy_inode(struct inode *inode) 4570void btrfs_destroy_inode(struct inode *inode)
4465{ 4571{
4466 struct btrfs_ordered_extent *ordered; 4572 struct btrfs_ordered_extent *ordered;
4573 struct btrfs_root *root = BTRFS_I(inode)->root;
4574
4467 WARN_ON(!list_empty(&inode->i_dentry)); 4575 WARN_ON(!list_empty(&inode->i_dentry));
4468 WARN_ON(inode->i_data.nrpages); 4576 WARN_ON(inode->i_data.nrpages);
4469 4577
@@ -4474,13 +4582,24 @@ void btrfs_destroy_inode(struct inode *inode)
4474 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4582 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4475 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4583 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4476 4584
4477 spin_lock(&BTRFS_I(inode)->root->list_lock); 4585 /*
4586 * Make sure we're properly removed from the ordered operation
4587 * lists.
4588 */
4589 smp_mb();
4590 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4591 spin_lock(&root->fs_info->ordered_extent_lock);
4592 list_del_init(&BTRFS_I(inode)->ordered_operations);
4593 spin_unlock(&root->fs_info->ordered_extent_lock);
4594 }
4595
4596 spin_lock(&root->list_lock);
4478 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4597 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4479 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4598 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4480 " list\n", inode->i_ino); 4599 " list\n", inode->i_ino);
4481 dump_stack(); 4600 dump_stack();
4482 } 4601 }
4483 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4602 spin_unlock(&root->list_lock);
4484 4603
4485 while (1) { 4604 while (1) {
4486 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4605 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4605,8 +4724,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4605 if (ret) 4724 if (ret)
4606 goto out_unlock; 4725 goto out_unlock;
4607 4726
4727 /*
4728 * we're using rename to replace one file with another,
4729 * and the replacement file is large. Start IO on it now so
4730 * we don't add too much work to the end of the transaction
4731 */
4732 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4733 new_inode->i_size &&
4734 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4735 filemap_flush(old_inode->i_mapping);
4736
4608 trans = btrfs_start_transaction(root, 1); 4737 trans = btrfs_start_transaction(root, 1);
4609 4738
4739 /*
4740 * make sure the inode gets flushed if it is replacing
4741 * something.
4742 */
4743 if (new_inode && new_inode->i_size &&
4744 old_inode && S_ISREG(old_inode->i_mode)) {
4745 btrfs_add_ordered_operation(trans, root, old_inode);
4746 }
4747
4748 /*
4749 * this is an ugly little race, but the rename is required to make
4750 * sure that if we crash, the inode is either at the old name
4751 * or the new one. pinning the log transaction lets us make sure
4752 * we don't allow a log commit to come in after we unlink the
4753 * name but before we add the new name back in.
4754 */
4755 btrfs_pin_log_trans(root);
4756
4610 btrfs_set_trans_block_group(trans, new_dir); 4757 btrfs_set_trans_block_group(trans, new_dir);
4611 4758
4612 btrfs_inc_nlink(old_dentry->d_inode); 4759 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4761,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4614 new_dir->i_ctime = new_dir->i_mtime = ctime; 4761 new_dir->i_ctime = new_dir->i_mtime = ctime;
4615 old_inode->i_ctime = ctime; 4762 old_inode->i_ctime = ctime;
4616 4763
4764 if (old_dentry->d_parent != new_dentry->d_parent)
4765 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4766
4617 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4767 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4618 old_dentry->d_name.name, 4768 old_dentry->d_name.name,
4619 old_dentry->d_name.len); 4769 old_dentry->d_name.len);
@@ -4645,7 +4795,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4645 if (ret) 4795 if (ret)
4646 goto out_fail; 4796 goto out_fail;
4647 4797
4798 btrfs_log_new_name(trans, old_inode, old_dir,
4799 new_dentry->d_parent);
4648out_fail: 4800out_fail:
4801
4802 /* this btrfs_end_log_trans just allows the current
4803 * log-sub transaction to complete
4804 */
4805 btrfs_end_log_trans(root);
4649 btrfs_end_transaction_throttle(trans, root); 4806 btrfs_end_transaction_throttle(trans, root);
4650out_unlock: 4807out_unlock:
4651 return ret; 4808 return ret;
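
The rename hunks above carry most of their reasoning in comments: kick off writeback early when a large file is being replaced, register the source inode as an ordered operation so the commit waits for its data, and pin the log so a log commit cannot land between the unlink of the old name and the insert of the new one. The fragment below is a minimal userspace sketch of just the first of those ideas, the "flush early if the replacement source is large" check; the struct and the threshold value are invented for the example and are not the kernel's types.

/*
 * Illustrative userspace sketch (not kernel code): start writeback early
 * when a large regular file is about to replace an existing one, so the
 * final transaction commit does not absorb all of that IO at once. The
 * macro name echoes BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT by analogy only;
 * the value here is made up.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define FLUSH_LIMIT (128ULL * 1024 * 1024)   /* hypothetical 128MB cutoff */

struct file_model {
    const char *name;
    uint64_t size;
    bool regular;
};

/* decide whether to kick off background writeback before the rename */
static bool should_flush_before_rename(const struct file_model *src,
                                       const struct file_model *dst)
{
    /* only bother when replacing an existing, non-empty file with a
     * large regular file, mirroring the checks in the patch */
    return dst && dst->size > 0 &&
           src && src->regular && src->size > FLUSH_LIMIT;
}

int main(void)
{
    struct file_model src = { "new-image.iso", 700ULL << 20, true };
    struct file_model dst = { "image.iso", 650ULL << 20, true };

    if (should_flush_before_rename(&src, &dst))
        printf("start IO on %s before the rename/commit\n", src.name);
    else
        printf("no early flush needed\n");
    return 0;
}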
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{ 72{
73 int i; 73 int i;
74
74 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
75 cpu_relax();
76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
77 return 1; 77 return 1;
78 if (need_resched()) 78 if (need_resched())
79 break; 79 break;
80 cpu_relax();
80 } 81 }
81 return 0; 82 return 0;
82} 83}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
95{ 96{
96 int i; 97 int i;
97 98
98 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
100 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101 spin_unlock(&eb->lock); 102 return 1;
102 103 spin_unlock(&eb->lock);
104 }
103 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
105 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
106 break; 109 break;
107 110
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
148 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
150 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
151 while(1) { 157 while(1) {
152 spin_nested(eb); 158 spin_nested(eb);
153 159
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
165 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
166 * loop around 172 * loop around
167 */ 173 */
174 cpu_relax();
168 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
169 continue; 176 continue;
170 177sleep:
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
173 180
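
The locking.c changes reorder the optimistic spin so callers first spin on the blocking bit and only then take the spinlock, falling back to a sleeping wait when the bit stays set. A rough userspace model of that spin-then-sleep shape, using C11 atomics and a pthread mutex/condvar rather than the kernel primitives, might look like this:

/*
 * Userspace sketch of the spin-then-sleep idea: spin briefly on a
 * "blocking" flag in the hope it clears, and only fall back to a real
 * sleep when it does not. This is a model, not the btrfs implementation.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>
#include <stdio.h>

static atomic_bool blocking = false;          /* analogue of EXTENT_BUFFER_BLOCKING */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;

/* spin for a bounded number of iterations waiting for the flag to clear */
static bool spin_on_block(void)
{
    for (int i = 0; i < 512; i++) {
        if (!atomic_load(&blocking))
            return true;          /* flag cleared, caller can try the fast path */
        sched_yield();            /* rough stand-in for cpu_relax()/need_resched() */
    }
    return false;
}

/* take the lock: optimistic spin first, then sleep for real */
static void take_lock(void)
{
    if (spin_on_block()) {
        pthread_mutex_lock(&lock);
        if (!atomic_load(&blocking))
            return;               /* fast path: got the lock while non-blocking */
        pthread_mutex_unlock(&lock);
    }
    /* slow path: wait until whoever set the flag clears it and wakes us */
    pthread_mutex_lock(&lock);
    while (atomic_load(&blocking))
        pthread_cond_wait(&wake, &lock);
}

int main(void)
{
    take_lock();                  /* flag starts clear, so this returns quickly */
    printf("lock acquired\n");
    pthread_mutex_unlock(&lock);
    return 0;
}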
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered extents
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
387 * we have two modes here, one is to just start the IO via filemap_flush
388 * and the other is to wait for all the io. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
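
btrfs_run_ordered_operations() above uses a classic splice-and-drain loop: steal the whole list under the lock, work through the entries with the lock dropped, and in wait mode go around again if new entries showed up in the meantime. A self-contained userspace sketch of that loop (invented types, printf standing in for filemap_flush) is:

/*
 * Userspace model of the splice-and-drain pattern; not the kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct op {
    struct op *next;
    int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct op *pending;        /* global list of queued operations */

static void queue_op(int id)
{
    struct op *op = malloc(sizeof(*op));
    op->id = id;
    pthread_mutex_lock(&list_lock);
    op->next = pending;
    pending = op;
    pthread_mutex_unlock(&list_lock);
}

/* drain the list; if wait is true, keep going until nothing is left */
static void run_ops(bool wait)
{
    pthread_mutex_lock(&list_lock);
    do {
        struct op *local = pending;   /* splice: steal the whole list */
        pending = NULL;
        pthread_mutex_unlock(&list_lock);

        while (local) {
            struct op *op = local;
            local = local->next;
            printf("flushing op %d\n", op->id);   /* stand-in for filemap_flush */
            free(op);
        }

        pthread_mutex_lock(&list_lock);
        /* in wait mode, re-check: new ops may have been queued meanwhile */
    } while (wait && pending);
    pthread_mutex_unlock(&list_lock);
}

int main(void)
{
    queue_op(1);
    queue_op(2);
    run_ops(true);
    return 0;
}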
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
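
For the btrfs_add_ordered_operation() declaration just added, the interesting part is the decision it makes before touching the list: do nothing if the inode hasn't changed since the last committed transaction, flush synchronously if the running transaction is already blocked for commit, otherwise queue the inode. Sketched as plain C with invented stand-in types:

/*
 * Decision logic only; the struct, enum, and values are made up for the
 * example and do not match the kernel types.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct inode_model {
    uint64_t generation;      /* transid that created the inode */
    uint64_t last_trans;      /* transid of the last modification */
};

enum action { SKIP, FLUSH_NOW, QUEUE };

static enum action ordered_op_action(const struct inode_model *in,
                                     uint64_t last_committed,
                                     bool trans_blocked)
{
    uint64_t last_mod = in->generation > in->last_trans ?
                        in->generation : in->last_trans;

    if (last_mod < last_committed)
        return SKIP;          /* nothing new since the last commit */
    if (trans_blocked)
        return FLUSH_NOW;     /* commit already running, do the IO ourselves */
    return QUEUE;             /* let the commit flush it later */
}

int main(void)
{
    struct inode_model in = { .generation = 10, .last_trans = 12 };
    const char *names[] = { "skip", "flush now", "queue" };

    printf("%s\n", names[ordered_op_action(&in, 11, false)]);  /* queue */
    printf("%s\n", names[ordered_op_action(&in, 20, false)]);  /* skip  */
    return 0;
}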
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 396 return ret;
386} 397}
387 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
388static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
389{ 435{
390 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
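
The super.c hunks add notreelog and flushoncommit as mount options and replace generic_show_options with a btrfs-specific btrfs_show_options. One thing worth noting in the hunk itself: the printed names (",no-treelog", ",flush-on-commit") do not match the tokens the parser accepts ("notreelog", "flushoncommit"), so the output would not round-trip through btrfs_parse_options as written. A tiny userspace sketch of the usual pattern, with the printed names kept identical to the parsed ones (flag names invented for the example):

/*
 * Plain-C sketch of the show_options idea: emit ",name" for each set flag
 * so the string can be fed straight back to the option parser.
 */
#include <stdio.h>
#include <string.h>

#define OPT_NOTREELOG      (1u << 0)
#define OPT_FLUSHONCOMMIT  (1u << 1)
#define OPT_COMPRESS       (1u << 2)

/* append ",name" for every flag that is set, the way show_options does */
static void format_options(unsigned flags, char *buf, size_t len)
{
    buf[0] = '\0';
    if (flags & OPT_NOTREELOG)
        strncat(buf, ",notreelog", len - strlen(buf) - 1);
    if (flags & OPT_FLUSHONCOMMIT)
        strncat(buf, ",flushoncommit", len - strlen(buf) - 1);
    if (flags & OPT_COMPRESS)
        strncat(buf, ",compress", len - strlen(buf) - 1);
}

int main(void)
{
    char buf[128];
    format_options(OPT_NOTREELOG | OPT_FLUSHONCOMMIT, buf, sizeof(buf));
    printf("mount options:%s\n", buf);   /* -> ,notreelog,flushoncommit */
    return 0;
}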
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 63 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 64 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 65 cur_trans->start_time = get_seconds();
66
67 cur_trans->delayed_refs.root.rb_node = NULL;
68 cur_trans->delayed_refs.num_entries = 0;
69 cur_trans->delayed_refs.num_heads_ready = 0;
70 cur_trans->delayed_refs.num_heads = 0;
71 cur_trans->delayed_refs.flushing = 0;
72 cur_trans->delayed_refs.run_delayed_start = 0;
73 spin_lock_init(&cur_trans->delayed_refs.lock);
74
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 75 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 76 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 77 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 189 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 190 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 191 h->alloc_exclude_start = 0;
192 h->delayed_ref_updates = 0;
193
185 root->fs_info->running_transaction->use_count++; 194 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 195 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 196 return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 280 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 281 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 282 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 283 throttle_on_drops(root);
276} 284}
277 285
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 288{
281 struct btrfs_transaction *cur_trans; 289 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 290 struct btrfs_fs_info *info = root->fs_info;
291 int count = 0;
292
293 while (count < 4) {
294 unsigned long cur = trans->delayed_ref_updates;
295 trans->delayed_ref_updates = 0;
296 if (cur &&
297 trans->transaction->delayed_refs.num_heads_ready > 64) {
298 trans->delayed_ref_updates = 0;
299
300 /*
301 * do a full flush if the transaction is trying
302 * to close
303 */
304 if (trans->transaction->delayed_refs.flushing)
305 cur = 0;
306 btrfs_run_delayed_refs(trans, root, cur);
307 } else {
308 break;
309 }
310 count++;
311 }
283 312
284 mutex_lock(&info->trans_mutex); 313 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 314 cur_trans = info->running_transaction;
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 453 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 454 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 455
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 456 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 457
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
430 460
431 while (1) { 461 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 462 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 468 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 469 btrfs_set_root_generation(&root->root_item, trans->transid);
440 470
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 471 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 472 &root->root_key,
445 &root->root_item); 473 &root->root_item);
446 BUG_ON(ret); 474 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 475 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 476
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
478 BUG_ON(ret);
449 } 479 }
450 return 0; 480 return 0;
451} 481}
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 489 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 490 struct list_head *next;
461 struct extent_buffer *eb; 491 struct extent_buffer *eb;
492 int ret;
462 493
463 btrfs_extent_post_op(trans, fs_info->tree_root); 494 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
495 BUG_ON(ret);
464 496
465 eb = btrfs_lock_root_node(fs_info->tree_root); 497 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 498 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 499 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 500 free_extent_buffer(eb);
469 501
470 btrfs_extent_post_op(trans, fs_info->tree_root); 502 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
503 BUG_ON(ret);
471 504
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 505 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 506 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 508 root = list_entry(next, struct btrfs_root, dirty_list);
476 509
477 update_cowonly_root(trans, root); 510 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
478 } 514 }
479 return 0; 515 return 0;
480} 516}
@@ -635,6 +671,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 671}
636 672
637/* 673/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current
676 * queue of delayed refs out.
677 *
678 * This is used by the drop snapshot code only
679 */
680static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
681{
682 DEFINE_WAIT(wait);
683
684 mutex_lock(&info->trans_mutex);
685 while (info->running_transaction &&
686 info->running_transaction->delayed_refs.flushing) {
687 prepare_to_wait(&info->transaction_wait, &wait,
688 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex);
690 schedule();
691 mutex_lock(&info->trans_mutex);
692 finish_wait(&info->transaction_wait, &wait);
693 }
694 mutex_unlock(&info->trans_mutex);
695 return 0;
696}
697
698/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 699 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 700 * all of them
640 */ 701 */
@@ -661,7 +722,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 722 atomic_inc(&root->fs_info->throttles);
662 723
663 while (1) { 724 while (1) {
725 /*
726 * we don't want to jump in and create a bunch of
727 * delayed refs if the transaction is starting to close
728 */
729 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 730 trans = btrfs_start_transaction(tree_root, 1);
731
732 /*
733 * we've joined a transaction, make sure it isn't
734 * closing right now
735 */
736 if (trans->transaction->delayed_refs.flushing) {
737 btrfs_end_transaction(trans, tree_root);
738 continue;
739 }
740
665 mutex_lock(&root->fs_info->drop_mutex); 741 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 742 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 743 if (ret != -EAGAIN)
@@ -766,7 +842,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
766 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 842 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
767 843
768 old = btrfs_lock_root_node(root); 844 old = btrfs_lock_root_node(root);
769 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 845 btrfs_cow_block(trans, root, old, NULL, 0, &old);
770 846
771 btrfs_copy_root(trans, root, old, &tmp, objectid); 847 btrfs_copy_root(trans, root, old, &tmp, objectid);
772 btrfs_tree_unlock(old); 848 btrfs_tree_unlock(old);
@@ -894,12 +970,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
894 struct extent_io_tree *pinned_copy; 970 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait); 971 DEFINE_WAIT(wait);
896 int ret; 972 int ret;
973 int should_grow = 0;
974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
976
977 btrfs_run_ordered_operations(root, 0);
978
979 /* make a pass through all the delayed refs we have so far;
980 * any running procs may add more while we are here
981 */
982 ret = btrfs_run_delayed_refs(trans, root, 0);
983 BUG_ON(ret);
984
985 cur_trans = trans->transaction;
986 /*
987 * set the flushing flag so procs in this transaction have to
988 * start sending their work down.
989 */
990 cur_trans->delayed_refs.flushing = 1;
991
992 ret = btrfs_run_delayed_refs(trans, root, 0);
993 BUG_ON(ret);
897 994
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex); 995 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) { 996 INIT_LIST_HEAD(&dirty_fs_roots);
901 cur_trans = trans->transaction; 997 if (cur_trans->in_commit) {
902 trans->transaction->use_count++; 998 cur_trans->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex); 999 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root); 1000 btrfs_end_transaction(trans, root);
905 1001
@@ -922,7 +1018,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
922 1018
923 trans->transaction->in_commit = 1; 1019 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1; 1020 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1021 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev, 1022 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list); 1023 struct btrfs_transaction, list);
@@ -937,6 +1032,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
937 } 1032 }
938 } 1033 }
939 1034
1035 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1036 should_grow = 1;
1037
940 do { 1038 do {
941 int snap_pending = 0; 1039 int snap_pending = 0;
942 joined = cur_trans->num_joined; 1040 joined = cur_trans->num_joined;
@@ -949,26 +1047,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
949 1047
950 if (cur_trans->num_writers > 1) 1048 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT; 1049 timeout = MAX_SCHEDULE_TIMEOUT;
952 else 1050 else if (should_grow)
953 timeout = 1; 1051 timeout = 1;
954 1052
955 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
956 1054
957 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
958 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret); 1059 BUG_ON(ret);
960 } 1060 }
961 1061
962 schedule_timeout(timeout); 1062 /*
1063 * rename doesn't use btrfs_join_transaction, so, once we
1064 * set the transaction to blocked above, we aren't going
1065 * to get any new ordered operations. We can safely run
1066 * it here and know for sure that nothing new will be added
1067 * to the list
1068 */
1069 btrfs_run_ordered_operations(root, 1);
1070
1071 smp_mb();
1072 if (cur_trans->num_writers > 1 || should_grow)
1073 schedule_timeout(timeout);
963 1074
964 mutex_lock(&root->fs_info->trans_mutex); 1075 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait); 1076 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 || 1077 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined)); 1078 (should_grow && cur_trans->num_joined != joined));
968 1079
969 ret = create_pending_snapshots(trans, root->fs_info); 1080 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret); 1081 BUG_ON(ret);
971 1082
1083 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1084 BUG_ON(ret);
1085
972 WARN_ON(cur_trans != trans->transaction); 1086 WARN_ON(cur_trans != trans->transaction);
973 1087
974 /* btrfs_commit_tree_roots is responsible for getting the 1088 /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1146,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1032 btrfs_copy_pinned(root, pinned_copy); 1146 btrfs_copy_pinned(root, pinned_copy);
1033 1147
1034 trans->transaction->blocked = 0; 1148 trans->transaction->blocked = 0;
1149
1035 wake_up(&root->fs_info->transaction_throttle); 1150 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait); 1151 wake_up(&root->fs_info->transaction_wait);
1037 1152
@@ -1058,6 +1173,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 mutex_lock(&root->fs_info->trans_mutex); 1173 mutex_lock(&root->fs_info->trans_mutex);
1059 1174
1060 cur_trans->commit_done = 1; 1175 cur_trans->commit_done = 1;
1176
1061 root->fs_info->last_trans_committed = cur_trans->transid; 1177 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait); 1178 wake_up(&cur_trans->commit_wait);
1063 1179
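
The transaction.c changes route extent updates through the new delayed-ref machinery: join_transaction() initializes the per-transaction delayed_refs root, and __btrfs_end_transaction() now drains up to four batches of queued updates when a handle is released, switching to a full flush once delayed_refs.flushing is set for commit. The following is a simplified userspace model of that bounded drain; the counts and thresholds mirror the hunk only loosely and the types are invented:

/*
 * Model of the bounded drain at transaction-handle release; not kernel code.
 */
#include <stdio.h>
#include <stdbool.h>

struct txn {
    unsigned long queued;    /* delayed ref updates waiting to run */
    bool flushing;           /* commit has started, drain everything */
};

/* run at most 'count' queued updates (0 means all), return how many ran */
static unsigned long run_delayed_refs(struct txn *t, unsigned long count)
{
    unsigned long run = (count == 0 || count > t->queued) ? t->queued : count;
    t->queued -= run;
    return run;
}

static void end_transaction(struct txn *t, unsigned long my_updates)
{
    for (int pass = 0; pass < 4; pass++) {
        unsigned long cur = my_updates;
        my_updates = 0;
        if (!cur || t->queued <= 64)
            break;                       /* backlog small enough, stop */
        if (t->flushing)
            cur = 0;                     /* commit in progress: full flush */
        printf("pass %d ran %lu updates\n", pass,
               run_delayed_refs(t, cur));
    }
}

int main(void)
{
    struct txn t = { .queued = 500, .flushing = false };
    end_transaction(&t, 200);
    printf("%lu updates still queued\n", t.queued);
    return 0;
}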
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
27 * total writers in this transaction, it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. Without the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1 was fully removed from the FS, but fsync was never
74 * called on f1, only its parent dir. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
199 struct extent_buffer *eb, 262 struct extent_buffer *eb,
200 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
201{ 264{
202 if (wc->pin) { 265 if (wc->pin)
203 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 }
208 268
209 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
210 if (wc->write) 270 if (wc->write)
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 663
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 664 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 665 BUG_ON(ret);
666
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 668 BUG_ON(ret);
608 kfree(name); 669 kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
804 victim_name_len)) { 865 victim_name_len)) {
805 btrfs_inc_nlink(inode); 866 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 867 btrfs_release_path(root, path);
868
807 ret = btrfs_unlink_inode(trans, root, dir, 869 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 870 inode, victim_name,
809 victim_name_len); 871 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 984 key.offset--;
923 btrfs_release_path(root, path); 985 btrfs_release_path(root, path);
924 } 986 }
925 btrfs_free_path(path); 987 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 988 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 989 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 990 btrfs_update_inode(trans, root, inode);
929 } 991 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 992 BTRFS_I(inode)->index_cnt = (u64)-1;
931 993
994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
995 ret = replay_dir_deletes(trans, root, NULL, path,
996 inode->i_ino, 1);
997 BUG_ON(ret);
998 }
999 btrfs_free_path(path);
1000
932 return 0; 1001 return 0;
933} 1002}
934 1003
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1040
972 iput(inode); 1041 iput(inode);
973 1042
974 if (key.offset == 0) 1043 /*
975 break; 1044 * fixup on a directory may create new entries,
976 key.offset--; 1045 * make sure we always look for the highset possible
1046 * offset
1047 */
1048 key.offset = (u64)-1;
977 } 1049 }
978 btrfs_release_path(root, path); 1050 btrfs_release_path(root, path);
979 return 0; 1051 return 0;
@@ -1150,8 +1222,7 @@ insert:
1150 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1151 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1152 1224
1153 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1154 BUG();
1155 goto out; 1226 goto out;
1156} 1227}
1157 1228
@@ -1313,11 +1384,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1385 name_len);
1315 log_di = NULL; 1386 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1389 dir_key->objectid,
1319 name, name_len, 0); 1390 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1392 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1393 log_path,
1323 dir_key->objectid, 1394 dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1449 struct btrfs_root *root,
1379 struct btrfs_root *log, 1450 struct btrfs_root *log,
1380 struct btrfs_path *path, 1451 struct btrfs_path *path,
1381 u64 dirid) 1452 u64 dirid, int del_all)
1382{ 1453{
1383 u64 range_start; 1454 u64 range_start;
1384 u64 range_end; 1455 u64 range_end;
@@ -1408,10 +1479,14 @@ again:
1408 range_start = 0; 1479 range_start = 0;
1409 range_end = 0; 1480 range_end = 0;
1410 while (1) { 1481 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1482 if (del_all)
1412 &range_start, &range_end); 1483 range_end = (u64)-1;
1413 if (ret != 0) 1484 else {
1414 break; 1485 ret = find_dir_range(log, path, dirid, key_type,
1486 &range_start, &range_end);
1487 if (ret != 0)
1488 break;
1489 }
1415 1490
1416 dir_key.offset = range_start; 1491 dir_key.offset = range_start;
1417 while (1) { 1492 while (1) {
@@ -1437,7 +1512,8 @@ again:
1437 break; 1512 break;
1438 1513
1439 ret = check_item_in_log(trans, root, log, path, 1514 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1515 log_path, dir,
1516 &found_key);
1441 BUG_ON(ret); 1517 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1518 if (found_key.offset == (u64)-1)
1443 break; 1519 break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1591 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1592 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1593 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1594 BUG_ON(ret);
1519 } 1595 }
1520 ret = overwrite_item(wc->trans, root, path, 1596 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1609 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1610 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1611 BUG_ON(ret);
1612
1613 /* if the nlink count is zero here, the iput
1614 * will free the inode. We bump it to make
1615 * sure it doesn't get freed until the link
1616 * count fixup is done
1617 */
1618 if (inode->i_nlink == 0) {
1619 btrfs_inc_nlink(inode);
1620 btrfs_update_inode(wc->trans,
1621 root, inode);
1622 }
1536 iput(inode); 1623 iput(inode);
1537 } 1624 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1625 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1927 return ret;
1841} 1928}
1842 1929
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1930static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 struct btrfs_root *root, unsigned long transid)
1844{ 1932{
1845 DEFINE_WAIT(wait); 1933 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1934 int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1942 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1943 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1944 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1945
1946 if (root->fs_info->last_trans_log_full_commit !=
1947 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1948 atomic_read(&root->log_commit[index]))
1859 schedule(); 1949 schedule();
1950
1860 finish_wait(&root->log_commit_wait[index], &wait); 1951 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1952 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1953 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1955 return 0;
1865} 1956}
1866 1957
1867static int wait_for_writer(struct btrfs_root *root) 1958static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *root)
1868{ 1960{
1869 DEFINE_WAIT(wait); 1961 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1962 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1963 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1964 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1965 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1966 if (root->fs_info->last_trans_log_full_commit !=
1967 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1968 schedule();
1876 mutex_lock(&root->log_mutex); 1969 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1970 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1975/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1976 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1977 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1978 * you know that any inodes previously logged are safely on disk only
1979 * if it returns 0.
1980 *
1981 * Any other return value means you need to call btrfs_commit_transaction.
1982 * Some of the edge cases for fsyncing directories that have had unlinks
1983 * or renames done in the past mean that sometimes the only safe
1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1985 * that has happened.
1886 */ 1986 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1987int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1988 struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1996 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 1997 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 1998 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 1999 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2000 mutex_unlock(&root->log_mutex);
1901 return 0; 2001 return 0;
1902 } 2002 }
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2004
1905 /* wait for previous tree log sync to complete */ 2005 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2007 wait_log_commit(trans, root, root->log_transid - 1);
1908 2008
1909 while (1) { 2009 while (1) {
1910 unsigned long batch = root->log_batch; 2010 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2011 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2012 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2013 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2014
2015 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2016 if (batch == root->log_batch)
1916 break; 2017 break;
1917 } 2018 }
1918 2019
2020 /* bail out if we need to do a full commit */
2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 ret = -EAGAIN;
2023 mutex_unlock(&root->log_mutex);
2024 goto out;
2025 }
2026
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2028 BUG_ON(ret);
1921 2029
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2059
1952 index2 = log_root_tree->log_transid % 2; 2060 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2061 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2062 wait_log_commit(trans, log_root_tree,
2063 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2065 goto out;
1957 } 2066 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2068
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2070 wait_log_commit(trans, log_root_tree,
2071 log_root_tree->log_transid - 1);
2072 }
2073
2074 wait_for_writer(trans, log_root_tree);
1962 2075
1963 wait_for_writer(log_root_tree); 2076 /*
2077 * now that we've moved on to the tree of log tree roots,
2078 * check the full commit flag again
2079 */
2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 mutex_unlock(&log_root_tree->log_mutex);
2082 ret = -EAGAIN;
2083 goto out_wake_log_root;
2084 }
1964 2085
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2106 * in and cause problems either.
1986 */ 2107 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2108 write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 ret = 0;
1988 2110
2111out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2112 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2113 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
1998 return 0; 2121 return 0;
1999} 2122}
2000 2123
2001/* * free all the extents used by the tree log. This should be called 2124/*
2125 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2126 * at commit time of the full transaction
2003 */ 2127 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2128int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2256
2133 btrfs_free_path(path); 2257 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2259 btrfs_end_log_trans(root);
2136 2260
2137 return 0; 2261 return 0;
2138} 2262}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2284 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2286 btrfs_end_log_trans(root);
2163 2287
2164 return ret; 2288 return ret;
2165} 2289}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2683 *
2560 * This handles both files and directories. 2684 * This handles both files and directories.
2561 */ 2685 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2686static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2687 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2688 int inode_only)
2565{ 2689{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2709 min_key.offset = 0;
2586 2710
2587 max_key.objectid = inode->i_ino; 2711 max_key.objectid = inode->i_ino;
2712
2713 /* today the code can only do partial logging of directories */
2714 if (!S_ISDIR(inode->i_mode))
2715 inode_only = LOG_INODE_ALL;
2716
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2718 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2719 else
2591 max_key.type = (u8)-1; 2720 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2721 max_key.offset = (u64)-1;
2593 2722
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2723 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2724
2612 /* 2725 /*
@@ -2693,7 +2806,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2807 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2808 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2810 BUG_ON(ret);
2699 } 2811 }
@@ -2702,19 +2814,69 @@ next_slot:
2702 2814
2703 btrfs_free_path(path); 2815 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2816 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2817 return 0;
2707} 2818}
2708 2819
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2820/*
2710 struct btrfs_root *root, struct inode *inode, 2821 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2822 * of the directories in it require a full commit before they can
2823 * be logged. Returns zero if nothing special needs to be done or 1 if
2824 * a full commit is required.
2825 */
2826static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 struct inode *inode,
2828 struct dentry *parent,
2829 struct super_block *sb,
2830 u64 last_committed)
2712{ 2831{
2713 int ret; 2832 int ret = 0;
2833 struct btrfs_root *root;
2714 2834
2715 start_log_trans(trans, root); 2835 /*
2715 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2836 * for a regular file, if its inode is already on disk, we don't
2717 end_log_trans(root); 2837 * have to worry about the parents at all. This is because
2838 * we can use the last_unlink_trans field to record renames
2839 * and other fun in this file.
2840 */
2841 if (S_ISREG(inode->i_mode) &&
2842 BTRFS_I(inode)->generation <= last_committed &&
2843 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 goto out;
2845
2846 if (!S_ISDIR(inode->i_mode)) {
2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 goto out;
2849 inode = parent->d_inode;
2850 }
2851
2852 while (1) {
2853 BTRFS_I(inode)->logged_trans = trans->transid;
2854 smp_mb();
2855
2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 root = BTRFS_I(inode)->root;
2858
2859 /*
2860 * make sure any commits to the log are forced
2861 * to be full commits
2862 */
2863 root->fs_info->last_trans_log_full_commit =
2864 trans->transid;
2865 ret = 1;
2866 break;
2867 }
2868
2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 break;
2871
2872 if (parent == sb->s_root)
2873 break;
2874
2875 parent = parent->d_parent;
2876 inode = parent->d_inode;
2877
2878 }
2879out:
2718 return ret; 2880 return ret;
2719} 2881}
2720 2882
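As a reference point, here is a minimal user-space sketch of the decision check_parent_dirs_for_sync() above makes: walk from the fsync'd inode toward the root and demand a full transaction commit as soon as an ancestor directory lost a name in a transaction newer than the last commit. The struct, field layout and main() are illustrative only, not btrfs types; the real function also stamps logged_trans on each ancestor and short-circuits regular files whose inode is already on disk.

/*
 * Stand-alone model of the parent walk in check_parent_dirs_for_sync();
 * all names here are illustrative, not kernel code.
 */
#include <stdio.h>

struct fake_inode {
	int is_dir;
	unsigned long long last_unlink_trans;	/* like BTRFS_I(inode)->last_unlink_trans */
	struct fake_inode *parent;		/* like dentry->d_parent->d_inode */
};

/* returns 1 if an fsync of this inode must fall back to a full commit */
static int needs_full_commit(struct fake_inode *inode,
			     unsigned long long last_committed)
{
	/* regular files start the walk at their parent directory */
	if (!inode->is_dir)
		inode = inode->parent;

	while (inode) {
		/* an ancestor lost a name since the last committed transaction */
		if (inode->last_unlink_trans > last_committed)
			return 1;
		inode = inode->parent;
	}
	return 0;
}

int main(void)
{
	struct fake_inode root = { 1, 90, NULL };
	struct fake_inode dir  = { 1, 105, &root };	/* unlink happened in transid 105 */
	struct fake_inode file = { 0, 0, &dir };

	printf("full commit needed: %d\n", needs_full_commit(&file, 100));	/* prints 1 */
	return 0;
}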
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2886 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2887 * the last committed transaction
2726 */ 2888 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2889int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2890 struct btrfs_root *root, struct inode *inode,
2891 struct dentry *parent, int exists_only)
2729{ 2892{
2730 int inode_only = LOG_INODE_ALL; 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2894 struct super_block *sb;
2732 int ret; 2895 int ret = 0;
2896 u64 last_committed = root->fs_info->last_trans_committed;
2897
2898 sb = inode->i_sb;
2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2905 if (root->fs_info->last_trans_log_full_commit >
2906 root->fs_info->last_trans_committed) {
2907 ret = 1;
2908 goto end_no_trans;
2909 }
2910
2911 ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 sb, last_committed);
2913 if (ret)
2914 goto end_no_trans;
2733 2915
2734 start_log_trans(trans, root); 2916 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2917
2742 dentry = dentry->d_parent; 2918 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2919 BUG_ON(ret);
2920
2921 /*
2922 * for a regular file, if its inode is already on disk, we don't
2923 * have to worry about the parents at all. This is because
2924 * we can use the last_unlink_trans field to record renames
2925 * and other fun in this file.
2926 */
2927 if (S_ISREG(inode->i_mode) &&
2928 BTRFS_I(inode)->generation <= last_committed &&
2929 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 goto no_parent;
2931
2932 inode_only = LOG_INODE_EXISTS;
2933 while (1) {
2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2935 break;
2745 2936
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2937 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2938 if (BTRFS_I(inode)->generation >
2939 root->fs_info->last_trans_committed) {
2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 BUG_ON(ret);
2942 }
2943 if (parent == sb->s_root)
2748 break; 2944 break;
2945
2946 parent = parent->d_parent;
2749 } 2947 }
2750 end_log_trans(root); 2948no_parent:
2751 return 0; 2949 ret = 0;
2950 btrfs_end_log_trans(root);
2951end_no_trans:
2952 return ret;
2752} 2953}
2753 2954
2754/* 2955/*
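The early returns at the top of btrfs_log_inode_parent() above decide whether the tree log can be used at all: the notreelog mount option and a last_trans_log_full_commit newer than the last committed transaction both force a full commit, and the parent-directory check can do the same. A tiny user-space restatement of those gates follows; the struct and main() are illustrative, not btrfs types.

/* Restatement of the early-out gates in btrfs_log_inode_parent();
 * everything below is illustrative. */
#include <stdio.h>

struct toy_fs_info {
	int notreelog;				/* models the notreelog mount option */
	unsigned long long last_trans_log_full_commit;
	unsigned long long last_trans_committed;
};

/* returns 1 when fsync must fall back to a full transaction commit */
static int must_full_commit(const struct toy_fs_info *fs, int parents_dirty)
{
	if (fs->notreelog)
		return 1;	/* tree log disabled by mount option */
	if (fs->last_trans_log_full_commit > fs->last_trans_committed)
		return 1;	/* something already demanded a full commit */
	if (parents_dirty)
		return 1;	/* check_parent_dirs_for_sync() said no */
	return 0;
}

int main(void)
{
	struct toy_fs_info fs = { 0, 101, 100 };	/* full commit already requested */

	printf("full commit: %d\n", must_full_commit(&fs, 0));	/* prints 1 */
	return 0;
}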
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2961int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2962 struct btrfs_root *root, struct dentry *dentry)
2762{ 2963{
2763 u64 gen; 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2965 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2966}
2770 2967
2771/* 2968/*
@@ -2884,3 +3081,94 @@ again:
2884 kfree(log_root_tree); 3081 kfree(log_root_tree);
2885 return 0; 3082 return 0;
2886} 3083}
3084
3085/*
3086 * there are some corner cases where we want to force a full
3087 * commit instead of allowing a directory to be logged.
3088 *
 3089 * They revolve around files that were unlinked from the directory, and
3090 * this function updates the parent directory so that a full commit is
3091 * properly done if it is fsync'd later after the unlinks are done.
3092 */
3093void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 struct inode *dir, struct inode *inode,
3095 int for_rename)
3096{
3097 /*
3098 * when we're logging a file, if it hasn't been renamed
3099 * or unlinked, and its inode is fully committed on disk,
3100 * we don't have to worry about walking up the directory chain
3101 * to log its parents.
3102 *
3103 * So, we use the last_unlink_trans field to put this transid
3104 * into the file. When the file is logged we check it and
3105 * don't log the parents if the file is fully on disk.
3106 */
3107 if (S_ISREG(inode->i_mode))
3108 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109
3110 /*
3111 * if this directory was already logged any new
3112 * names for this file/dir will get recorded
3113 */
3114 smp_mb();
3115 if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 return;
3117
3118 /*
3119 * if the inode we're about to unlink was logged,
3120 * the log will be properly updated for any new names
3121 */
3122 if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 return;
3124
3125 /*
3126 * when renaming files across directories, if the directory
 3127 * we're unlinking from gets fsync'd later on, there's
3128 * no way to find the destination directory later and fsync it
3129 * properly. So, we have to be conservative and force commits
3130 * so the new name gets discovered.
3131 */
3132 if (for_rename)
3133 goto record;
3134
3135 /* we can safely do the unlink without any special recording */
3136 return;
3137
3138record:
3139 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140}
3141
3142/*
3143 * Call this after adding a new name for a file and it will properly
3144 * update the log to reflect the new name.
3145 *
3146 * It will return zero if all goes well, and it will return 1 if a
3147 * full transaction commit is required.
3148 */
3149int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 struct inode *inode, struct inode *old_dir,
3151 struct dentry *parent)
3152{
3153 struct btrfs_root * root = BTRFS_I(inode)->root;
3154
3155 /*
3156 * this will force the logging code to walk the dentry chain
3157 * up for the file
3158 */
3159 if (S_ISREG(inode->i_mode))
3160 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161
3162 /*
 3163 * if this inode hasn't been logged and the directory we're renaming it
3164 * from hasn't been logged, we don't need to log it
3165 */
3166 if (BTRFS_I(inode)->logged_trans <=
3167 root->fs_info->last_trans_committed &&
3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 root->fs_info->last_trans_committed))
3170 return 0;
3171
3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173}
3174
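Taken together, btrfs_record_unlink_dir() and btrfs_log_new_name() are bookkeeping around per-inode transids: last_unlink_trans is stamped when a name is removed or moved, logged_trans records the last transaction the inode was written into the log, and both are compared against last_trans_committed at fsync time. A compressed user-space model of that bookkeeping follows; the struct, helper names and main() are illustrative and omit the S_ISREG handling of the real code.

/* Toy model of the transid bookkeeping in btrfs_record_unlink_dir() and
 * btrfs_log_new_name(); everything below is illustrative, not btrfs API. */
#include <stdio.h>

struct toy_inode {
	unsigned long long logged_trans;	/* last transid this inode was logged in */
	unsigned long long last_unlink_trans;	/* last transid one of its names went away */
};

/* unlink/rename time: remember that names changed in this transaction */
static void record_unlink(struct toy_inode *dir, struct toy_inode *inode,
			  unsigned long long transid, int for_rename)
{
	inode->last_unlink_trans = transid;

	/* if either side was already logged this transaction, the log
	 * will pick up the new names on its own */
	if (dir->logged_trans == transid || inode->logged_trans == transid)
		return;

	/* renames are the conservative case: force the directory to
	 * require a full commit if it gets fsync'd later */
	if (for_rename)
		dir->last_unlink_trans = transid;
}

/* after adding the new name: does the log need to be updated at all? */
static int new_name_needs_logging(struct toy_inode *inode,
				  struct toy_inode *old_dir,
				  unsigned long long last_committed)
{
	return inode->logged_trans > last_committed ||
	       (old_dir && old_dir->logged_trans > last_committed);
}

int main(void)
{
	struct toy_inode dir  = { 101, 0 };	/* directory logged in transid 101 */
	struct toy_inode file = { 0, 0 };

	record_unlink(&dir, &file, 101, 1);	/* rename happening in transid 101 */
	printf("re-log the new name: %d\n",
	       new_name_needs_logging(&file, &dir, 100));	/* prints 1 */
	return 0;
}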
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
 221 * This code does two great things: it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
268 * This will help anyone who is waiting on the IO, they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
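The congestion branch added above only keeps submitting from this thread while the current io_context is still inside the request batch the elevator granted it; otherwise it falls back to the existing congestion handling. A rough user-space restatement of that predicate follows; the field names mirror the diff, while the HZ value, struct and main() are illustrative.

/* Simplified restatement of the "still inside our request batch?" test
 * added to run_scheduled_bios(); types and main() are illustrative. */
#include <stdio.h>

#define HZ 1000UL	/* pretend jiffies tick at 1000 Hz for the example */

struct toy_ioc {
	int nr_batch_requests;		/* requests left in the elevator batch */
	unsigned long last_waited;	/* jiffies when this context last slept */
};

static int time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;	/* same wrap-safe comparison as the kernel macro */
}

/*
 * Keep submitting directly while we are still inside the batch the elevator
 * granted us: batch slots remain, we slept recently, and last_waited has not
 * moved since we sampled it on the previous loop iteration.
 */
static int keep_submitting(const struct toy_ioc *ioc, unsigned long jiffies,
			   unsigned long sampled_last_waited)
{
	return ioc && ioc->nr_batch_requests > 0 &&
	       time_before(jiffies, ioc->last_waited + HZ / 50UL) &&
	       (sampled_last_waited == 0 ||
		ioc->last_waited == sampled_last_waited);
}

int main(void)
{
	struct toy_ioc ioc = { 4, 1000 };

	/* 10 ticks after the last sleep, with 4 batch slots left */
	printf("keep submitting: %d\n", keep_submitting(&ioc, 1010, 0));	/* prints 1 */
	return 0;
}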
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;