Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile            |    2
-rw-r--r--  fs/btrfs/acl.c               |    2
-rw-r--r--  fs/btrfs/async-thread.c      |    7
-rw-r--r--  fs/btrfs/btrfs_inode.h       |   39
-rw-r--r--  fs/btrfs/ctree.c             |  956
-rw-r--r--  fs/btrfs/ctree.h             |  215
-rw-r--r--  fs/btrfs/delayed-ref.c       |  668
-rw-r--r--  fs/btrfs/delayed-ref.h       |  193
-rw-r--r--  fs/btrfs/dir-item.c          |    3
-rw-r--r--  fs/btrfs/disk-io.c           |  141
-rw-r--r--  fs/btrfs/disk-io.h           |   11
-rw-r--r--  fs/btrfs/extent-tree.c       | 2370
-rw-r--r--  fs/btrfs/extent_io.c         |   69
-rw-r--r--  fs/btrfs/extent_io.h         |    3
-rw-r--r--  fs/btrfs/extent_map.c        |    1
-rw-r--r--  fs/btrfs/file-item.c         |    7
-rw-r--r--  fs/btrfs/file.c              |   74
-rw-r--r--  fs/btrfs/free-space-cache.c  |  530
-rw-r--r--  fs/btrfs/free-space-cache.h  |   44
-rw-r--r--  fs/btrfs/inode-item.c        |    3
-rw-r--r--  fs/btrfs/inode-map.c         |    1
-rw-r--r--  fs/btrfs/inode.c             |  277
-rw-r--r--  fs/btrfs/ioctl.c             |    8
-rw-r--r--  fs/btrfs/locking.c           |   42
-rw-r--r--  fs/btrfs/locking.h           |    2
-rw-r--r--  fs/btrfs/ordered-data.c      |  118
-rw-r--r--  fs/btrfs/ordered-data.h      |    4
-rw-r--r--  fs/btrfs/super.c             |   59
-rw-r--r--  fs/btrfs/transaction.c       |  160
-rw-r--r--  fs/btrfs/transaction.h       |    8
-rw-r--r--  fs/btrfs/tree-defrag.c       |    2
-rw-r--r--  fs/btrfs/tree-log.c          |  458
-rw-r--r--  fs/btrfs/tree-log.h          |   17
-rw-r--r--  fs/btrfs/volumes.c           |   55
-rw-r--r--  fs/btrfs/volumes.h           |    2
35 files changed, 4248 insertions, 2303 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..9adf5e4f7e96 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o 11 compression.o delayed-ref.o
12else 12else
13 13
14# Normal Makefile 14# Normal Makefile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
256 } 256 }
257 257
258 if (!acl) 258 if (!acl)
259 inode->i_mode &= ~current->fs->umask; 259 inode->i_mode &= ~current_umask();
260 } 260 }
261 261
262 if (IS_POSIXACL(dir) && acl) { 262 if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 23#include "async-thread.h"
25 24
26#define WORK_QUEUED_BIT 0 25#define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
195 if (!list_empty(&worker->pending)) 194 if (!list_empty(&worker->pending))
196 continue; 195 continue;
197 196
197 if (kthread_should_stop())
198 break;
199
198 /* still no more work?, sleep for real */ 200 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock); 201 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE); 202 set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
208 worker->working = 0; 210 worker->working = 0;
209 spin_unlock_irq(&worker->lock); 211 spin_unlock_irq(&worker->lock);
210 212
211 schedule(); 213 if (!kthread_should_stop())
214 schedule();
212 } 215 }
213 __set_current_state(TASK_RUNNING); 216 __set_current_state(TASK_RUNNING);
214 } 217 }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75ac..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,15 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
75 /* the space_info for where this inode's data allocations are done */
76 struct btrfs_space_info *space_info;
77
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big 78 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this. 79 * enough field for this.
71 */ 80 */
@@ -83,17 +92,16 @@ struct btrfs_inode {
83 */ 92 */
84 u64 logged_trans; 93 u64 logged_trans;
85 94
86 /*
87 * trans that last made a change that should be fully fsync'd. This
88 * gets reset to zero each time the inode is logged
89 */
90 u64 log_dirty_trans;
91
92 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
93 * real block usage of the file 96 * real block usage of the file
94 */ 97 */
95 u64 delalloc_bytes; 98 u64 delalloc_bytes;
96 99
100 /* total number of bytes that may be used for this inode for
101 * delalloc
102 */
103 u64 reserved_bytes;
104
97 /* 105 /*
98 * the size of the file stored in the metadata on disk. data=ordered 106 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk 107 * means the in-memory i_size might be larger than the size on disk
@@ -113,6 +121,25 @@ struct btrfs_inode {
113 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
114 u64 block_group; 122 u64 block_group;
115 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138 * yes, its silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
116 struct inode vfs_inode; 143 struct inode vfs_inode;
117}; 144};
118 145
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc4b9a9..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot); 39 struct btrfs_path *path, int level, int slot);
40 40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void) 41struct btrfs_path *btrfs_alloc_path(void)
47{ 42{
48 struct btrfs_path *path; 43 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); 44 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) { 45 if (path)
51 btrfs_init_path(path);
52 path->reada = 1; 46 path->reada = 1;
53 }
54 return path; 47 return path;
55} 48}
56 49
@@ -69,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
69 62
70/* 63/*
71 * reset all the locked nodes in the patch to spinning locks. 64 * reset all the locked nodes in the patch to spinning locks.
65 *
66 * held is used to keep lockdep happy, when lockdep is enabled
67 * we set held to a blocking lock before we go around and
68 * retake all the spinlocks in the path. You can safely use NULL
69 * for held
72 */ 70 */
73noinline void btrfs_clear_path_blocking(struct btrfs_path *p) 71noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72 struct extent_buffer *held)
74{ 73{
75 int i; 74 int i;
76 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77 /* lockdep really cares that we take all of these spinlocks
78 * in the right order. If any of the locks in the path are not
79 * currently blocking, it is going to complain. So, make really
80 * really sure by forcing the path to blocking before we clear
81 * the path blocking.
82 */
83 if (held)
84 btrfs_set_lock_blocking(held);
85 btrfs_set_path_blocking(p);
86#endif
87
88 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
77 if (p->nodes[i] && p->locks[i]) 89 if (p->nodes[i] && p->locks[i])
78 btrfs_clear_lock_blocking(p->nodes[i]); 90 btrfs_clear_lock_blocking(p->nodes[i]);
79 } 91 }
92
93#ifdef CONFIG_DEBUG_LOCK_ALLOC
94 if (held)
95 btrfs_clear_lock_blocking(held);
96#endif
80} 97}
81 98
82/* this also releases the path */ 99/* this also releases the path */
@@ -237,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
237 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
238 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
239 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
240 *
241 * prealloc_dest -- if you have already reserved a destination for the cow,
242 * this uses that block instead of allocating a new one.
243 * btrfs_alloc_reserved_extent is used to finish the allocation.
244 */ 257 */
245static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
246 struct btrfs_root *root, 259 struct btrfs_root *root,
247 struct extent_buffer *buf, 260 struct extent_buffer *buf,
248 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
249 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
250 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
251 u64 prealloc_dest)
252{ 264{
253 u64 parent_start; 265 u64 parent_start;
254 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -260,7 +272,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
260 if (*cow_ret == buf) 272 if (*cow_ret == buf)
261 unlock_orig = 1; 273 unlock_orig = 1;
262 274
263 WARN_ON(!btrfs_tree_locked(buf)); 275 btrfs_assert_tree_locked(buf);
264 276
265 if (parent) 277 if (parent)
266 parent_start = parent->start; 278 parent_start = parent->start;
@@ -274,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
274 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
275 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
276 288
277 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
278 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
279 291 trans->transid, level,
280 ins.objectid = prealloc_dest; 292 search_start, empty_size);
281 ins.offset = buf->len;
282 ins.type = BTRFS_EXTENT_ITEM_KEY;
283
284 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
285 root->root_key.objectid,
286 trans->transid, level, &ins);
287 BUG_ON(ret);
288 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
289 buf->len);
290 } else {
291 cow = btrfs_alloc_free_block(trans, root, buf->len,
292 parent_start,
293 root->root_key.objectid,
294 trans->transid, level,
295 search_start, empty_size);
296 }
297 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
298 return PTR_ERR(cow); 294 return PTR_ERR(cow);
299 295
@@ -396,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
396noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
397 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
398 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
399 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
400{ 396{
401 u64 search_start; 397 u64 search_start;
402 int ret; 398 int ret;
@@ -419,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
419 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
420 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
421 *cow_ret = buf; 417 *cow_ret = buf;
422 WARN_ON(prealloc_dest);
423 return 0; 418 return 0;
424 } 419 }
425 420
@@ -430,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
430 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
431 426
432 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
433 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
434 prealloc_dest);
435 return ret; 429 return ret;
436} 430}
437 431
@@ -600,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
600 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
601 &cur, search_start, 595 &cur, search_start,
602 min(16 * blocksize, 596 min(16 * blocksize,
603 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
604 if (err) { 598 if (err) {
605 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
606 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -917,10 +911,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
917 911
918 /* promote the child to a root */ 912 /* promote the child to a root */
919 child = read_node_slot(root, mid, 0); 913 child = read_node_slot(root, mid, 0);
914 BUG_ON(!child);
920 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
921 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
922 BUG_ON(!child); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
923 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
924 BUG_ON(ret); 918 BUG_ON(ret);
925 919
926 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -928,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
928 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
929 923
930 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
931 mid->start, child->start, 926 mid->start, child->start,
932 root->root_key.objectid, 927 root->root_key.objectid,
933 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -954,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
954 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
955 return 0; 950 return 0;
956 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
957 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
958 err_on_enospc = 1; 957 err_on_enospc = 1;
959 958
@@ -962,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
962 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
963 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
964 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
965 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
966 if (wret) { 965 if (wret) {
967 ret = wret; 966 ret = wret;
968 goto enospc; 967 goto enospc;
@@ -973,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
973 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
974 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
975 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
976 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
977 if (wret) { 976 if (wret) {
978 ret = wret; 977 ret = wret;
979 goto enospc; 978 goto enospc;
@@ -1154,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1154 wret = 1; 1153 wret = 1;
1155 } else { 1154 } else {
1156 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1157 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1158 if (ret) 1157 if (ret)
1159 wret = 1; 1158 wret = 1;
1160 else { 1159 else {
@@ -1205,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1205 } else { 1204 } else {
1206 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1207 parent, pslot + 1, 1206 parent, pslot + 1,
1208 &right, 0); 1207 &right);
1209 if (ret) 1208 if (ret)
1210 wret = 1; 1209 wret = 1;
1211 else { 1210 else {
@@ -1245,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1245 * readahead one full node of leaves, finding things that are close 1244 * readahead one full node of leaves, finding things that are close
1246 * to the block in 'slot', and triggering ra on them. 1245 * to the block in 'slot', and triggering ra on them.
1247 */ 1246 */
1248static noinline void reada_for_search(struct btrfs_root *root, 1247static void reada_for_search(struct btrfs_root *root,
1249 struct btrfs_path *path, 1248 struct btrfs_path *path,
1250 int level, int slot, u64 objectid) 1249 int level, int slot, u64 objectid)
1251{ 1250{
1252 struct extent_buffer *node; 1251 struct extent_buffer *node;
1253 struct btrfs_disk_key disk_key; 1252 struct btrfs_disk_key disk_key;
@@ -1448,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1448} 1447}
1449 1448
1450/* 1449/*
1450 * helper function for btrfs_search_slot. The goal is to find a block
1451 * in cache without setting the path to blocking. If we find the block
1452 * we return zero and the path is unchanged.
1453 *
1454 * If we can't find the block, we set the path blocking and do some
1455 * reada. -EAGAIN is returned and the search must be repeated.
1456 */
1457static int
1458read_block_for_search(struct btrfs_trans_handle *trans,
1459 struct btrfs_root *root, struct btrfs_path *p,
1460 struct extent_buffer **eb_ret, int level, int slot,
1461 struct btrfs_key *key)
1462{
1463 u64 blocknr;
1464 u64 gen;
1465 u32 blocksize;
1466 struct extent_buffer *b = *eb_ret;
1467 struct extent_buffer *tmp;
1468
1469 blocknr = btrfs_node_blockptr(b, slot);
1470 gen = btrfs_node_ptr_generation(b, slot);
1471 blocksize = btrfs_level_size(root, level - 1);
1472
1473 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1474 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1475 *eb_ret = tmp;
1476 return 0;
1477 }
1478
1479 /*
1480 * reduce lock contention at high levels
1481 * of the btree by dropping locks before
1482 * we read.
1483 */
1484 btrfs_release_path(NULL, p);
1485 if (tmp)
1486 free_extent_buffer(tmp);
1487 if (p->reada)
1488 reada_for_search(root, p, level, slot, key->objectid);
1489
1490 tmp = read_tree_block(root, blocknr, blocksize, gen);
1491 if (tmp)
1492 free_extent_buffer(tmp);
1493 return -EAGAIN;
1494}
1495
1496/*
1497 * helper function for btrfs_search_slot. This does all of the checks
1498 * for node-level blocks and does any balancing required based on
1499 * the ins_len.
1500 *
1501 * If no extra work was required, zero is returned. If we had to
1502 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1503 * start over
1504 */
1505static int
1506setup_nodes_for_search(struct btrfs_trans_handle *trans,
1507 struct btrfs_root *root, struct btrfs_path *p,
1508 struct extent_buffer *b, int level, int ins_len)
1509{
1510 int ret;
1511 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1512 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1513 int sret;
1514
1515 sret = reada_for_balance(root, p, level);
1516 if (sret)
1517 goto again;
1518
1519 btrfs_set_path_blocking(p);
1520 sret = split_node(trans, root, p, level);
1521 btrfs_clear_path_blocking(p, NULL);
1522
1523 BUG_ON(sret > 0);
1524 if (sret) {
1525 ret = sret;
1526 goto done;
1527 }
1528 b = p->nodes[level];
1529 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1530 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1531 int sret;
1532
1533 sret = reada_for_balance(root, p, level);
1534 if (sret)
1535 goto again;
1536
1537 btrfs_set_path_blocking(p);
1538 sret = balance_level(trans, root, p, level);
1539 btrfs_clear_path_blocking(p, NULL);
1540
1541 if (sret) {
1542 ret = sret;
1543 goto done;
1544 }
1545 b = p->nodes[level];
1546 if (!b) {
1547 btrfs_release_path(NULL, p);
1548 goto again;
1549 }
1550 BUG_ON(btrfs_header_nritems(b) == 1);
1551 }
1552 return 0;
1553
1554again:
1555 ret = -EAGAIN;
1556done:
1557 return ret;
1558}
1559
1560/*
1451 * look for key in the tree. path is filled in with nodes along the way 1561 * look for key in the tree. path is filled in with nodes along the way
1452 * if key is found, we return zero and you can find the item in the leaf 1562 * if key is found, we return zero and you can find the item in the leaf
1453 * level of the path (level 0) 1563 * level of the path (level 0)
@@ -1465,17 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1465 ins_len, int cow) 1575 ins_len, int cow)
1466{ 1576{
1467 struct extent_buffer *b; 1577 struct extent_buffer *b;
1468 struct extent_buffer *tmp;
1469 int slot; 1578 int slot;
1470 int ret; 1579 int ret;
1471 int level; 1580 int level;
1472 int should_reada = p->reada;
1473 int lowest_unlock = 1; 1581 int lowest_unlock = 1;
1474 int blocksize;
1475 u8 lowest_level = 0; 1582 u8 lowest_level = 0;
1476 u64 blocknr;
1477 u64 gen;
1478 struct btrfs_key prealloc_block;
1479 1583
1480 lowest_level = p->lowest_level; 1584 lowest_level = p->lowest_level;
1481 WARN_ON(lowest_level && ins_len > 0); 1585 WARN_ON(lowest_level && ins_len > 0);
@@ -1484,8 +1588,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1484 if (ins_len < 0) 1588 if (ins_len < 0)
1485 lowest_unlock = 2; 1589 lowest_unlock = 2;
1486 1590
1487 prealloc_block.objectid = 0;
1488
1489again: 1591again:
1490 if (p->skip_locking) 1592 if (p->skip_locking)
1491 b = btrfs_root_node(root); 1593 b = btrfs_root_node(root);
@@ -1506,50 +1608,21 @@ again:
1506 if (cow) { 1608 if (cow) {
1507 int wret; 1609 int wret;
1508 1610
1509 /* is a cow on this block not required */ 1611 /*
1612 * if we don't really need to cow this block
1613 * then we don't want to set the path blocking,
1614 * so we test it here
1615 */
1510 if (btrfs_header_generation(b) == trans->transid && 1616 if (btrfs_header_generation(b) == trans->transid &&
1511 btrfs_header_owner(b) == root->root_key.objectid && 1617 btrfs_header_owner(b) == root->root_key.objectid &&
1512 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1618 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1513 goto cow_done; 1619 goto cow_done;
1514 } 1620 }
1515
1516 /* ok, we have to cow, is our old prealloc the right
1517 * size?
1518 */
1519 if (prealloc_block.objectid &&
1520 prealloc_block.offset != b->len) {
1521 btrfs_release_path(root, p);
1522 btrfs_free_reserved_extent(root,
1523 prealloc_block.objectid,
1524 prealloc_block.offset);
1525 prealloc_block.objectid = 0;
1526 goto again;
1527 }
1528
1529 /*
1530 * for higher level blocks, try not to allocate blocks
1531 * with the block and the parent locks held.
1532 */
1533 if (level > 0 && !prealloc_block.objectid) {
1534 u32 size = b->len;
1535 u64 hint = b->start;
1536
1537 btrfs_release_path(root, p);
1538 ret = btrfs_reserve_extent(trans, root,
1539 size, size, 0,
1540 hint, (u64)-1,
1541 &prealloc_block, 0);
1542 BUG_ON(ret);
1543 goto again;
1544 }
1545
1546 btrfs_set_path_blocking(p); 1621 btrfs_set_path_blocking(p);
1547 1622
1548 wret = btrfs_cow_block(trans, root, b, 1623 wret = btrfs_cow_block(trans, root, b,
1549 p->nodes[level + 1], 1624 p->nodes[level + 1],
1550 p->slots[level + 1], 1625 p->slots[level + 1], &b);
1551 &b, prealloc_block.objectid);
1552 prealloc_block.objectid = 0;
1553 if (wret) { 1626 if (wret) {
1554 free_extent_buffer(b); 1627 free_extent_buffer(b);
1555 ret = wret; 1628 ret = wret;
@@ -1566,7 +1639,7 @@ cow_done:
1566 if (!p->skip_locking) 1639 if (!p->skip_locking)
1567 p->locks[level] = 1; 1640 p->locks[level] = 1;
1568 1641
1569 btrfs_clear_path_blocking(p); 1642 btrfs_clear_path_blocking(p, NULL);
1570 1643
1571 /* 1644 /*
1572 * we have a lock on b and as long as we aren't changing 1645 * we have a lock on b and as long as we aren't changing
@@ -1594,51 +1667,15 @@ cow_done:
1594 if (ret && slot > 0) 1667 if (ret && slot > 0)
1595 slot -= 1; 1668 slot -= 1;
1596 p->slots[level] = slot; 1669 p->slots[level] = slot;
1597 if ((p->search_for_split || ins_len > 0) && 1670 ret = setup_nodes_for_search(trans, root, p, b, level,
1598 btrfs_header_nritems(b) >= 1671 ins_len);
1599 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1672 if (ret == -EAGAIN)
1600 int sret; 1673 goto again;
1601 1674 else if (ret)
1602 sret = reada_for_balance(root, p, level); 1675 goto done;
1603 if (sret) 1676 b = p->nodes[level];
1604 goto again; 1677 slot = p->slots[level];
1605
1606 btrfs_set_path_blocking(p);
1607 sret = split_node(trans, root, p, level);
1608 btrfs_clear_path_blocking(p);
1609
1610 BUG_ON(sret > 0);
1611 if (sret) {
1612 ret = sret;
1613 goto done;
1614 }
1615 b = p->nodes[level];
1616 slot = p->slots[level];
1617 } else if (ins_len < 0 &&
1618 btrfs_header_nritems(b) <
1619 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1620 int sret;
1621
1622 sret = reada_for_balance(root, p, level);
1623 if (sret)
1624 goto again;
1625
1626 btrfs_set_path_blocking(p);
1627 sret = balance_level(trans, root, p, level);
1628 btrfs_clear_path_blocking(p);
1629 1678
1630 if (sret) {
1631 ret = sret;
1632 goto done;
1633 }
1634 b = p->nodes[level];
1635 if (!b) {
1636 btrfs_release_path(NULL, p);
1637 goto again;
1638 }
1639 slot = p->slots[level];
1640 BUG_ON(btrfs_header_nritems(b) == 1);
1641 }
1642 unlock_up(p, level, lowest_unlock); 1679 unlock_up(p, level, lowest_unlock);
1643 1680
1644 /* this is only true while dropping a snapshot */ 1681 /* this is only true while dropping a snapshot */
@@ -1647,54 +1684,21 @@ cow_done:
1647 goto done; 1684 goto done;
1648 } 1685 }
1649 1686
1650 blocknr = btrfs_node_blockptr(b, slot); 1687 ret = read_block_for_search(trans, root, p,
1651 gen = btrfs_node_ptr_generation(b, slot); 1688 &b, level, slot, key);
1652 blocksize = btrfs_level_size(root, level - 1); 1689 if (ret == -EAGAIN)
1690 goto again;
1653 1691
1654 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1655 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1656 b = tmp;
1657 } else {
1658 /*
1659 * reduce lock contention at high levels
1660 * of the btree by dropping locks before
1661 * we read.
1662 */
1663 if (level > 0) {
1664 btrfs_release_path(NULL, p);
1665 if (tmp)
1666 free_extent_buffer(tmp);
1667 if (should_reada)
1668 reada_for_search(root, p,
1669 level, slot,
1670 key->objectid);
1671
1672 tmp = read_tree_block(root, blocknr,
1673 blocksize, gen);
1674 if (tmp)
1675 free_extent_buffer(tmp);
1676 goto again;
1677 } else {
1678 btrfs_set_path_blocking(p);
1679 if (tmp)
1680 free_extent_buffer(tmp);
1681 if (should_reada)
1682 reada_for_search(root, p,
1683 level, slot,
1684 key->objectid);
1685 b = read_node_slot(root, b, slot);
1686 }
1687 }
1688 if (!p->skip_locking) { 1692 if (!p->skip_locking) {
1689 int lret; 1693 int lret;
1690 1694
1691 btrfs_clear_path_blocking(p); 1695 btrfs_clear_path_blocking(p, NULL);
1692 lret = btrfs_try_spin_lock(b); 1696 lret = btrfs_try_spin_lock(b);
1693 1697
1694 if (!lret) { 1698 if (!lret) {
1695 btrfs_set_path_blocking(p); 1699 btrfs_set_path_blocking(p);
1696 btrfs_tree_lock(b); 1700 btrfs_tree_lock(b);
1697 btrfs_clear_path_blocking(p); 1701 btrfs_clear_path_blocking(p, b);
1698 } 1702 }
1699 } 1703 }
1700 } else { 1704 } else {
@@ -1706,7 +1710,7 @@ cow_done:
1706 btrfs_set_path_blocking(p); 1710 btrfs_set_path_blocking(p);
1707 sret = split_leaf(trans, root, key, 1711 sret = split_leaf(trans, root, key,
1708 p, ins_len, ret == 0); 1712 p, ins_len, ret == 0);
1709 btrfs_clear_path_blocking(p); 1713 btrfs_clear_path_blocking(p, NULL);
1710 1714
1711 BUG_ON(sret > 0); 1715 BUG_ON(sret > 0);
1712 if (sret) { 1716 if (sret) {
@@ -1725,12 +1729,8 @@ done:
1725 * we don't really know what they plan on doing with the path 1729 * we don't really know what they plan on doing with the path
1726 * from here on, so for now just mark it as blocking 1730 * from here on, so for now just mark it as blocking
1727 */ 1731 */
1728 btrfs_set_path_blocking(p); 1732 if (!p->leave_spinning)
1729 if (prealloc_block.objectid) { 1733 btrfs_set_path_blocking(p);
1730 btrfs_free_reserved_extent(root,
1731 prealloc_block.objectid,
1732 prealloc_block.offset);
1733 }
1734 return ret; 1734 return ret;
1735} 1735}
1736 1736
@@ -1751,7 +1751,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1751 int ret; 1751 int ret;
1752 1752
1753 eb = btrfs_lock_root_node(root); 1753 eb = btrfs_lock_root_node(root);
1754 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1754 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1755 BUG_ON(ret); 1755 BUG_ON(ret);
1756 1756
1757 btrfs_set_lock_blocking(eb); 1757 btrfs_set_lock_blocking(eb);
@@ -1809,7 +1809,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1809 } 1809 }
1810 1810
1811 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1811 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1812 &eb, 0); 1812 &eb);
1813 BUG_ON(ret); 1813 BUG_ON(ret);
1814 1814
1815 if (root->root_key.objectid == 1815 if (root->root_key.objectid ==
@@ -2122,7 +2122,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2122 spin_unlock(&root->node_lock); 2122 spin_unlock(&root->node_lock);
2123 2123
2124 ret = btrfs_update_extent_ref(trans, root, lower->start, 2124 ret = btrfs_update_extent_ref(trans, root, lower->start,
2125 lower->start, c->start, 2125 lower->len, lower->start, c->start,
2126 root->root_key.objectid, 2126 root->root_key.objectid,
2127 trans->transid, level - 1); 2127 trans->transid, level - 1);
2128 BUG_ON(ret); 2128 BUG_ON(ret);
@@ -2157,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2157 BUG_ON(!path->nodes[level]); 2157 BUG_ON(!path->nodes[level]);
2158 lower = path->nodes[level]; 2158 lower = path->nodes[level];
2159 nritems = btrfs_header_nritems(lower); 2159 nritems = btrfs_header_nritems(lower);
2160 if (slot > nritems) 2160 BUG_ON(slot > nritems);
2161 BUG();
2162 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2161 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2163 BUG(); 2162 BUG();
2164 if (slot != nritems) { 2163 if (slot != nritems) {
@@ -2204,7 +2203,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2204 ret = insert_new_root(trans, root, path, level + 1); 2203 ret = insert_new_root(trans, root, path, level + 1);
2205 if (ret) 2204 if (ret)
2206 return ret; 2205 return ret;
2207 } else { 2206 } else if (!trans->transaction->delayed_refs.flushing) {
2208 ret = push_nodes_for_insert(trans, root, path, level); 2207 ret = push_nodes_for_insert(trans, root, path, level);
2209 c = path->nodes[level]; 2208 c = path->nodes[level];
2210 if (!ret && btrfs_header_nritems(c) < 2209 if (!ret && btrfs_header_nritems(c) <
@@ -2312,66 +2311,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2312 return ret; 2311 return ret;
2313} 2312}
2314 2313
2315/* 2314static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2316 * push some data in the path leaf to the right, trying to free up at 2315 struct btrfs_root *root,
2317 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2316 struct btrfs_path *path,
2318 * 2317 int data_size, int empty,
2319 * returns 1 if the push failed because the other node didn't have enough 2318 struct extent_buffer *right,
2320 * room, 0 if everything worked out and < 0 if there were major errors. 2319 int free_space, u32 left_nritems)
2321 */
2322static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2323 *root, struct btrfs_path *path, int data_size,
2324 int empty)
2325{ 2320{
2326 struct extent_buffer *left = path->nodes[0]; 2321 struct extent_buffer *left = path->nodes[0];
2327 struct extent_buffer *right; 2322 struct extent_buffer *upper = path->nodes[1];
2328 struct extent_buffer *upper;
2329 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2330 int slot; 2324 int slot;
2331 u32 i; 2325 u32 i;
2332 int free_space;
2333 int push_space = 0; 2326 int push_space = 0;
2334 int push_items = 0; 2327 int push_items = 0;
2335 struct btrfs_item *item; 2328 struct btrfs_item *item;
2336 u32 left_nritems;
2337 u32 nr; 2329 u32 nr;
2338 u32 right_nritems; 2330 u32 right_nritems;
2339 u32 data_end; 2331 u32 data_end;
2340 u32 this_item_size; 2332 u32 this_item_size;
2341 int ret; 2333 int ret;
2342 2334
2343 slot = path->slots[1];
2344 if (!path->nodes[1])
2345 return 1;
2346
2347 upper = path->nodes[1];
2348 if (slot >= btrfs_header_nritems(upper) - 1)
2349 return 1;
2350
2351 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2352
2353 right = read_node_slot(root, upper, slot + 1);
2354 btrfs_tree_lock(right);
2355 btrfs_set_lock_blocking(right);
2356
2357 free_space = btrfs_leaf_free_space(root, right);
2358 if (free_space < data_size)
2359 goto out_unlock;
2360
2361 /* cow and double check */
2362 ret = btrfs_cow_block(trans, root, right, upper,
2363 slot + 1, &right, 0);
2364 if (ret)
2365 goto out_unlock;
2366
2367 free_space = btrfs_leaf_free_space(root, right);
2368 if (free_space < data_size)
2369 goto out_unlock;
2370
2371 left_nritems = btrfs_header_nritems(left);
2372 if (left_nritems == 0)
2373 goto out_unlock;
2374
2375 if (empty) 2335 if (empty)
2376 nr = 0; 2336 nr = 0;
2377 else 2337 else
@@ -2380,6 +2340,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2380 if (path->slots[0] >= left_nritems) 2340 if (path->slots[0] >= left_nritems)
2381 push_space += data_size; 2341 push_space += data_size;
2382 2342
2343 slot = path->slots[1];
2383 i = left_nritems - 1; 2344 i = left_nritems - 1;
2384 while (i >= nr) { 2345 while (i >= nr) {
2385 item = btrfs_item_nr(left, i); 2346 item = btrfs_item_nr(left, i);
@@ -2511,24 +2472,82 @@ out_unlock:
2511} 2472}
2512 2473
2513/* 2474/*
2475 * push some data in the path leaf to the right, trying to free up at
2476 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2477 *
2478 * returns 1 if the push failed because the other node didn't have enough
2479 * room, 0 if everything worked out and < 0 if there were major errors.
2480 */
2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2482 *root, struct btrfs_path *path, int data_size,
2483 int empty)
2484{
2485 struct extent_buffer *left = path->nodes[0];
2486 struct extent_buffer *right;
2487 struct extent_buffer *upper;
2488 int slot;
2489 int free_space;
2490 u32 left_nritems;
2491 int ret;
2492
2493 if (!path->nodes[1])
2494 return 1;
2495
2496 slot = path->slots[1];
2497 upper = path->nodes[1];
2498 if (slot >= btrfs_header_nritems(upper) - 1)
2499 return 1;
2500
2501 btrfs_assert_tree_locked(path->nodes[1]);
2502
2503 right = read_node_slot(root, upper, slot + 1);
2504 btrfs_tree_lock(right);
2505 btrfs_set_lock_blocking(right);
2506
2507 free_space = btrfs_leaf_free_space(root, right);
2508 if (free_space < data_size)
2509 goto out_unlock;
2510
2511 /* cow and double check */
2512 ret = btrfs_cow_block(trans, root, right, upper,
2513 slot + 1, &right);
2514 if (ret)
2515 goto out_unlock;
2516
2517 free_space = btrfs_leaf_free_space(root, right);
2518 if (free_space < data_size)
2519 goto out_unlock;
2520
2521 left_nritems = btrfs_header_nritems(left);
2522 if (left_nritems == 0)
2523 goto out_unlock;
2524
2525 return __push_leaf_right(trans, root, path, data_size, empty,
2526 right, free_space, left_nritems);
2527out_unlock:
2528 btrfs_tree_unlock(right);
2529 free_extent_buffer(right);
2530 return 1;
2531}
2532
2533/*
2514 * push some data in the path leaf to the left, trying to free up at 2534 * push some data in the path leaf to the left, trying to free up at
2515 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2535 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2516 */ 2536 */
2517static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2537static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2518 *root, struct btrfs_path *path, int data_size, 2538 struct btrfs_root *root,
2519 int empty) 2539 struct btrfs_path *path, int data_size,
2540 int empty, struct extent_buffer *left,
2541 int free_space, int right_nritems)
2520{ 2542{
2521 struct btrfs_disk_key disk_key; 2543 struct btrfs_disk_key disk_key;
2522 struct extent_buffer *right = path->nodes[0]; 2544 struct extent_buffer *right = path->nodes[0];
2523 struct extent_buffer *left;
2524 int slot; 2545 int slot;
2525 int i; 2546 int i;
2526 int free_space;
2527 int push_space = 0; 2547 int push_space = 0;
2528 int push_items = 0; 2548 int push_items = 0;
2529 struct btrfs_item *item; 2549 struct btrfs_item *item;
2530 u32 old_left_nritems; 2550 u32 old_left_nritems;
2531 u32 right_nritems;
2532 u32 nr; 2551 u32 nr;
2533 int ret = 0; 2552 int ret = 0;
2534 int wret; 2553 int wret;
@@ -2536,41 +2555,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2536 u32 old_left_item_size; 2555 u32 old_left_item_size;
2537 2556
2538 slot = path->slots[1]; 2557 slot = path->slots[1];
2539 if (slot == 0)
2540 return 1;
2541 if (!path->nodes[1])
2542 return 1;
2543
2544 right_nritems = btrfs_header_nritems(right);
2545 if (right_nritems == 0)
2546 return 1;
2547
2548 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2549
2550 left = read_node_slot(root, path->nodes[1], slot - 1);
2551 btrfs_tree_lock(left);
2552 btrfs_set_lock_blocking(left);
2553
2554 free_space = btrfs_leaf_free_space(root, left);
2555 if (free_space < data_size) {
2556 ret = 1;
2557 goto out;
2558 }
2559
2560 /* cow and double check */
2561 ret = btrfs_cow_block(trans, root, left,
2562 path->nodes[1], slot - 1, &left, 0);
2563 if (ret) {
2564 /* we hit -ENOSPC, but it isn't fatal here */
2565 ret = 1;
2566 goto out;
2567 }
2568
2569 free_space = btrfs_leaf_free_space(root, left);
2570 if (free_space < data_size) {
2571 ret = 1;
2572 goto out;
2573 }
2574 2558
2575 if (empty) 2559 if (empty)
2576 nr = right_nritems; 2560 nr = right_nritems;
@@ -2738,6 +2722,154 @@ out:
2738} 2722}
2739 2723
2740/* 2724/*
2725 * push some data in the path leaf to the left, trying to free up at
2726 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2727 */
2728static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2729 *root, struct btrfs_path *path, int data_size,
2730 int empty)
2731{
2732 struct extent_buffer *right = path->nodes[0];
2733 struct extent_buffer *left;
2734 int slot;
2735 int free_space;
2736 u32 right_nritems;
2737 int ret = 0;
2738
2739 slot = path->slots[1];
2740 if (slot == 0)
2741 return 1;
2742 if (!path->nodes[1])
2743 return 1;
2744
2745 right_nritems = btrfs_header_nritems(right);
2746 if (right_nritems == 0)
2747 return 1;
2748
2749 btrfs_assert_tree_locked(path->nodes[1]);
2750
2751 left = read_node_slot(root, path->nodes[1], slot - 1);
2752 btrfs_tree_lock(left);
2753 btrfs_set_lock_blocking(left);
2754
2755 free_space = btrfs_leaf_free_space(root, left);
2756 if (free_space < data_size) {
2757 ret = 1;
2758 goto out;
2759 }
2760
2761 /* cow and double check */
2762 ret = btrfs_cow_block(trans, root, left,
2763 path->nodes[1], slot - 1, &left);
2764 if (ret) {
2765 /* we hit -ENOSPC, but it isn't fatal here */
2766 ret = 1;
2767 goto out;
2768 }
2769
2770 free_space = btrfs_leaf_free_space(root, left);
2771 if (free_space < data_size) {
2772 ret = 1;
2773 goto out;
2774 }
2775
2776 return __push_leaf_left(trans, root, path, data_size,
2777 empty, left, free_space, right_nritems);
2778out:
2779 btrfs_tree_unlock(left);
2780 free_extent_buffer(left);
2781 return ret;
2782}
2783
2784/*
2785 * split the path's leaf in two, making sure there is at least data_size
2786 * available for the resulting leaf level of the path.
2787 *
2788 * returns 0 if all went well and < 0 on failure.
2789 */
2790static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2791 struct btrfs_root *root,
2792 struct btrfs_path *path,
2793 struct extent_buffer *l,
2794 struct extent_buffer *right,
2795 int slot, int mid, int nritems)
2796{
2797 int data_copy_size;
2798 int rt_data_off;
2799 int i;
2800 int ret = 0;
2801 int wret;
2802 struct btrfs_disk_key disk_key;
2803
2804 nritems = nritems - mid;
2805 btrfs_set_header_nritems(right, nritems);
2806 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2807
2808 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2809 btrfs_item_nr_offset(mid),
2810 nritems * sizeof(struct btrfs_item));
2811
2812 copy_extent_buffer(right, l,
2813 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2814 data_copy_size, btrfs_leaf_data(l) +
2815 leaf_data_end(root, l), data_copy_size);
2816
2817 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2818 btrfs_item_end_nr(l, mid);
2819
2820 for (i = 0; i < nritems; i++) {
2821 struct btrfs_item *item = btrfs_item_nr(right, i);
2822 u32 ioff;
2823
2824 if (!right->map_token) {
2825 map_extent_buffer(right, (unsigned long)item,
2826 sizeof(struct btrfs_item),
2827 &right->map_token, &right->kaddr,
2828 &right->map_start, &right->map_len,
2829 KM_USER1);
2830 }
2831
2832 ioff = btrfs_item_offset(right, item);
2833 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2834 }
2835
2836 if (right->map_token) {
2837 unmap_extent_buffer(right, right->map_token, KM_USER1);
2838 right->map_token = NULL;
2839 }
2840
2841 btrfs_set_header_nritems(l, mid);
2842 ret = 0;
2843 btrfs_item_key(right, &disk_key, 0);
2844 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2845 path->slots[1] + 1, 1);
2846 if (wret)
2847 ret = wret;
2848
2849 btrfs_mark_buffer_dirty(right);
2850 btrfs_mark_buffer_dirty(l);
2851 BUG_ON(path->slots[0] != slot);
2852
2853 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2854 BUG_ON(ret);
2855
2856 if (mid <= slot) {
2857 btrfs_tree_unlock(path->nodes[0]);
2858 free_extent_buffer(path->nodes[0]);
2859 path->nodes[0] = right;
2860 path->slots[0] -= mid;
2861 path->slots[1] += 1;
2862 } else {
2863 btrfs_tree_unlock(right);
2864 free_extent_buffer(right);
2865 }
2866
2867 BUG_ON(path->slots[0] < 0);
2868
2869 return ret;
2870}
2871
2872/*
2741 * split the path's leaf in two, making sure there is at least data_size 2873 * split the path's leaf in two, making sure there is at least data_size
2742 * available for the resulting leaf level of the path. 2874 * available for the resulting leaf level of the path.
2743 * 2875 *
@@ -2754,17 +2886,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2754 int mid; 2886 int mid;
2755 int slot; 2887 int slot;
2756 struct extent_buffer *right; 2888 struct extent_buffer *right;
2757 int data_copy_size;
2758 int rt_data_off;
2759 int i;
2760 int ret = 0; 2889 int ret = 0;
2761 int wret; 2890 int wret;
2762 int double_split; 2891 int double_split;
2763 int num_doubles = 0; 2892 int num_doubles = 0;
2764 struct btrfs_disk_key disk_key;
2765 2893
2766 /* first try to make some room by pushing left and right */ 2894 /* first try to make some room by pushing left and right */
2767 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2895 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2896 !trans->transaction->delayed_refs.flushing) {
2768 wret = push_leaf_right(trans, root, path, data_size, 0); 2897 wret = push_leaf_right(trans, root, path, data_size, 0);
2769 if (wret < 0) 2898 if (wret < 0)
2770 return wret; 2899 return wret;
@@ -2813,11 +2942,14 @@ again:
2813 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2942 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2814 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2943 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2815 BTRFS_UUID_SIZE); 2944 BTRFS_UUID_SIZE);
2945
2816 if (mid <= slot) { 2946 if (mid <= slot) {
2817 if (nritems == 1 || 2947 if (nritems == 1 ||
2818 leaf_space_used(l, mid, nritems - mid) + data_size > 2948 leaf_space_used(l, mid, nritems - mid) + data_size >
2819 BTRFS_LEAF_DATA_SIZE(root)) { 2949 BTRFS_LEAF_DATA_SIZE(root)) {
2820 if (slot >= nritems) { 2950 if (slot >= nritems) {
2951 struct btrfs_disk_key disk_key;
2952
2821 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2953 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2822 btrfs_set_header_nritems(right, 0); 2954 btrfs_set_header_nritems(right, 0);
2823 wret = insert_ptr(trans, root, path, 2955 wret = insert_ptr(trans, root, path,
@@ -2845,6 +2977,8 @@ again:
2845 if (leaf_space_used(l, 0, mid) + data_size > 2977 if (leaf_space_used(l, 0, mid) + data_size >
2846 BTRFS_LEAF_DATA_SIZE(root)) { 2978 BTRFS_LEAF_DATA_SIZE(root)) {
2847 if (!extend && data_size && slot == 0) { 2979 if (!extend && data_size && slot == 0) {
2980 struct btrfs_disk_key disk_key;
2981
2848 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2982 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2849 btrfs_set_header_nritems(right, 0); 2983 btrfs_set_header_nritems(right, 0);
2850 wret = insert_ptr(trans, root, path, 2984 wret = insert_ptr(trans, root, path,
@@ -2877,76 +3011,16 @@ again:
2877 } 3011 }
2878 } 3012 }
2879 } 3013 }
2880 nritems = nritems - mid;
2881 btrfs_set_header_nritems(right, nritems);
2882 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2883
2884 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2885 btrfs_item_nr_offset(mid),
2886 nritems * sizeof(struct btrfs_item));
2887
2888 copy_extent_buffer(right, l,
2889 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2890 data_copy_size, btrfs_leaf_data(l) +
2891 leaf_data_end(root, l), data_copy_size);
2892
2893 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2894 btrfs_item_end_nr(l, mid);
2895
2896 for (i = 0; i < nritems; i++) {
2897 struct btrfs_item *item = btrfs_item_nr(right, i);
2898 u32 ioff;
2899
2900 if (!right->map_token) {
2901 map_extent_buffer(right, (unsigned long)item,
2902 sizeof(struct btrfs_item),
2903 &right->map_token, &right->kaddr,
2904 &right->map_start, &right->map_len,
2905 KM_USER1);
2906 }
2907
2908 ioff = btrfs_item_offset(right, item);
2909 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2910 }
2911
2912 if (right->map_token) {
2913 unmap_extent_buffer(right, right->map_token, KM_USER1);
2914 right->map_token = NULL;
2915 }
2916 3014
2917 btrfs_set_header_nritems(l, mid); 3015 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2918 ret = 0;
2919 btrfs_item_key(right, &disk_key, 0);
2920 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2921 path->slots[1] + 1, 1);
2922 if (wret)
2923 ret = wret;
2924
2925 btrfs_mark_buffer_dirty(right);
2926 btrfs_mark_buffer_dirty(l);
2927 BUG_ON(path->slots[0] != slot);
2928
2929 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2930 BUG_ON(ret); 3016 BUG_ON(ret);
2931 3017
2932 if (mid <= slot) {
2933 btrfs_tree_unlock(path->nodes[0]);
2934 free_extent_buffer(path->nodes[0]);
2935 path->nodes[0] = right;
2936 path->slots[0] -= mid;
2937 path->slots[1] += 1;
2938 } else {
2939 btrfs_tree_unlock(right);
2940 free_extent_buffer(right);
2941 }
2942
2943 BUG_ON(path->slots[0] < 0);
2944
2945 if (double_split) { 3018 if (double_split) {
2946 BUG_ON(num_doubles != 0); 3019 BUG_ON(num_doubles != 0);
2947 num_doubles++; 3020 num_doubles++;
2948 goto again; 3021 goto again;
2949 } 3022 }
3023
2950 return ret; 3024 return ret;
2951} 3025}
2952 3026
@@ -3004,26 +3078,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3004 return -EAGAIN; 3078 return -EAGAIN;
3005 } 3079 }
3006 3080
3081 btrfs_set_path_blocking(path);
3007 ret = split_leaf(trans, root, &orig_key, path, 3082 ret = split_leaf(trans, root, &orig_key, path,
3008 sizeof(struct btrfs_item), 1); 3083 sizeof(struct btrfs_item), 1);
3009 path->keep_locks = 0; 3084 path->keep_locks = 0;
3010 BUG_ON(ret); 3085 BUG_ON(ret);
3011 3086
3087 btrfs_unlock_up_safe(path, 1);
3088 leaf = path->nodes[0];
3089 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3090
3091split:
3012 /* 3092 /*
3013 * make sure any changes to the path from split_leaf leave it 3093 * make sure any changes to the path from split_leaf leave it
3014 * in a blocking state 3094 * in a blocking state
3015 */ 3095 */
3016 btrfs_set_path_blocking(path); 3096 btrfs_set_path_blocking(path);
3017 3097
3018 leaf = path->nodes[0];
3019 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3020
3021split:
3022 item = btrfs_item_nr(leaf, path->slots[0]); 3098 item = btrfs_item_nr(leaf, path->slots[0]);
3023 orig_offset = btrfs_item_offset(leaf, item); 3099 orig_offset = btrfs_item_offset(leaf, item);
3024 item_size = btrfs_item_size(leaf, item); 3100 item_size = btrfs_item_size(leaf, item);
3025 3101
3026
3027 buf = kmalloc(item_size, GFP_NOFS); 3102 buf = kmalloc(item_size, GFP_NOFS);
3028 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3103 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3029 path->slots[0]), item_size); 3104 path->slots[0]), item_size);
@@ -3428,39 +3503,27 @@ out:
3428} 3503}
3429 3504
3430/* 3505/*
3431 * Given a key and some data, insert items into the tree. 3506 * this is a helper for btrfs_insert_empty_items, the main goal here is
3432 * This does all the path init required, making room in the tree if needed. 3507 * to save stack depth by doing the bulk of the work in a function
3508 * that doesn't call btrfs_search_slot
3433 */ 3509 */
3434int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3510static noinline_for_stack int
3435 struct btrfs_root *root, 3511setup_items_for_insert(struct btrfs_trans_handle *trans,
3436 struct btrfs_path *path, 3512 struct btrfs_root *root, struct btrfs_path *path,
3437 struct btrfs_key *cpu_key, u32 *data_size, 3513 struct btrfs_key *cpu_key, u32 *data_size,
3438 int nr) 3514 u32 total_data, u32 total_size, int nr)
3439{ 3515{
3440 struct extent_buffer *leaf;
3441 struct btrfs_item *item; 3516 struct btrfs_item *item;
3442 int ret = 0;
3443 int slot;
3444 int slot_orig;
3445 int i; 3517 int i;
3446 u32 nritems; 3518 u32 nritems;
3447 u32 total_size = 0;
3448 u32 total_data = 0;
3449 unsigned int data_end; 3519 unsigned int data_end;
3450 struct btrfs_disk_key disk_key; 3520 struct btrfs_disk_key disk_key;
3521 int ret;
3522 struct extent_buffer *leaf;
3523 int slot;
3451 3524
3452 for (i = 0; i < nr; i++)
3453 total_data += data_size[i];
3454
3455 total_size = total_data + (nr * sizeof(struct btrfs_item));
3456 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3457 if (ret == 0)
3458 return -EEXIST;
3459 if (ret < 0)
3460 goto out;
3461
3462 slot_orig = path->slots[0];
3463 leaf = path->nodes[0]; 3525 leaf = path->nodes[0];
3526 slot = path->slots[0];
3464 3527
3465 nritems = btrfs_header_nritems(leaf); 3528 nritems = btrfs_header_nritems(leaf);
3466 data_end = leaf_data_end(root, leaf); 3529 data_end = leaf_data_end(root, leaf);
@@ -3472,9 +3535,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3472 BUG(); 3535 BUG();
3473 } 3536 }
3474 3537
3475 slot = path->slots[0];
3476 BUG_ON(slot < 0);
3477
3478 if (slot != nritems) { 3538 if (slot != nritems) {
3479 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3539 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3480 3540
@@ -3530,21 +3590,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3530 data_end -= data_size[i]; 3590 data_end -= data_size[i];
3531 btrfs_set_item_size(leaf, item, data_size[i]); 3591 btrfs_set_item_size(leaf, item, data_size[i]);
3532 } 3592 }
3593
3533 btrfs_set_header_nritems(leaf, nritems + nr); 3594 btrfs_set_header_nritems(leaf, nritems + nr);
3534 btrfs_mark_buffer_dirty(leaf);
3535 3595
3536 ret = 0; 3596 ret = 0;
3537 if (slot == 0) { 3597 if (slot == 0) {
3598 struct btrfs_disk_key disk_key;
3538 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3599 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3539 ret = fixup_low_keys(trans, root, path, &disk_key, 1); 3600 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3540 } 3601 }
3602 btrfs_unlock_up_safe(path, 1);
3603 btrfs_mark_buffer_dirty(leaf);
3541 3604
3542 if (btrfs_leaf_free_space(root, leaf) < 0) { 3605 if (btrfs_leaf_free_space(root, leaf) < 0) {
3543 btrfs_print_leaf(root, leaf); 3606 btrfs_print_leaf(root, leaf);
3544 BUG(); 3607 BUG();
3545 } 3608 }
3609 return ret;
3610}
3611
3612/*
3613 * Given a key and some data, insert items into the tree.
3614 * This does all the path init required, making room in the tree if needed.
3615 */
3616int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3617 struct btrfs_root *root,
3618 struct btrfs_path *path,
3619 struct btrfs_key *cpu_key, u32 *data_size,
3620 int nr)
3621{
3622 struct extent_buffer *leaf;
3623 int ret = 0;
3624 int slot;
3625 int i;
3626 u32 total_size = 0;
3627 u32 total_data = 0;
3628
3629 for (i = 0; i < nr; i++)
3630 total_data += data_size[i];
3631
3632 total_size = total_data + (nr * sizeof(struct btrfs_item));
3633 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3634 if (ret == 0)
3635 return -EEXIST;
3636 if (ret < 0)
3637 goto out;
3638
3639 leaf = path->nodes[0];
3640 slot = path->slots[0];
3641 BUG_ON(slot < 0);
3642
3643 ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3644 total_data, total_size, nr);
3645
3546out: 3646out:
3547 btrfs_unlock_up_safe(path, 1);
3548 return ret; 3647 return ret;
3549} 3648}
3550 3649
@@ -3732,7 +3831,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3732 } 3831 }
3733 3832
3734 /* delete the leaf if it is mostly empty */ 3833 /* delete the leaf if it is mostly empty */
3735 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { 3834 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3835 !trans->transaction->delayed_refs.flushing) {
3736 /* push_leaf_left fixes the path. 3836 /* push_leaf_left fixes the path.
3737 * make sure the path still points to our leaf 3837 * make sure the path still points to our leaf
3738 * for possible call to del_ptr below 3838 * for possible call to del_ptr below
@@ -3740,6 +3840,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3740 slot = path->slots[1]; 3840 slot = path->slots[1];
3741 extent_buffer_get(leaf); 3841 extent_buffer_get(leaf);
3742 3842
3843 btrfs_set_path_blocking(path);
3743 wret = push_leaf_left(trans, root, path, 1, 1); 3844 wret = push_leaf_left(trans, root, path, 1, 1);
3744 if (wret < 0 && wret != -ENOSPC) 3845 if (wret < 0 && wret != -ENOSPC)
3745 ret = wret; 3846 ret = wret;
@@ -3926,7 +4027,6 @@ find_next_key:
3926 btrfs_release_path(root, path); 4027 btrfs_release_path(root, path);
3927 goto again; 4028 goto again;
3928 } else { 4029 } else {
3929 btrfs_clear_path_blocking(path);
3930 goto out; 4030 goto out;
3931 } 4031 }
3932 } 4032 }
@@ -3946,7 +4046,7 @@ find_next_key:
3946 path->locks[level - 1] = 1; 4046 path->locks[level - 1] = 1;
3947 path->nodes[level - 1] = cur; 4047 path->nodes[level - 1] = cur;
3948 unlock_up(path, level, 1); 4048 unlock_up(path, level, 1);
3949 btrfs_clear_path_blocking(path); 4049 btrfs_clear_path_blocking(path, NULL);
3950 } 4050 }
3951out: 4051out:
3952 if (ret == 0) 4052 if (ret == 0)
@@ -4026,28 +4126,44 @@ next:
4026int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4126int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4027{ 4127{
4028 int slot; 4128 int slot;
4029 int level = 1; 4129 int level;
4030 struct extent_buffer *c; 4130 struct extent_buffer *c;
4031 struct extent_buffer *next = NULL; 4131 struct extent_buffer *next;
4032 struct btrfs_key key; 4132 struct btrfs_key key;
4033 u32 nritems; 4133 u32 nritems;
4034 int ret; 4134 int ret;
4135 int old_spinning = path->leave_spinning;
4136 int force_blocking = 0;
4035 4137
4036 nritems = btrfs_header_nritems(path->nodes[0]); 4138 nritems = btrfs_header_nritems(path->nodes[0]);
4037 if (nritems == 0) 4139 if (nritems == 0)
4038 return 1; 4140 return 1;
4039 4141
4040 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4142 /*
4143 * we take the blocks in an order that upsets lockdep. Using
4144 * blocking mode is the only way around it.
4145 */
4146#ifdef CONFIG_DEBUG_LOCK_ALLOC
4147 force_blocking = 1;
4148#endif
4041 4149
4150 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4151again:
4152 level = 1;
4153 next = NULL;
4042 btrfs_release_path(root, path); 4154 btrfs_release_path(root, path);
4155
4043 path->keep_locks = 1; 4156 path->keep_locks = 1;
4157
4158 if (!force_blocking)
4159 path->leave_spinning = 1;
4160
4044 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4161 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4045 path->keep_locks = 0; 4162 path->keep_locks = 0;
4046 4163
4047 if (ret < 0) 4164 if (ret < 0)
4048 return ret; 4165 return ret;
4049 4166
4050 btrfs_set_path_blocking(path);
4051 nritems = btrfs_header_nritems(path->nodes[0]); 4167 nritems = btrfs_header_nritems(path->nodes[0]);
4052 /* 4168 /*
4053 * by releasing the path above we dropped all our locks. A balance 4169 * by releasing the path above we dropped all our locks. A balance
@@ -4057,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4057 */ 4173 */
4058 if (nritems > 0 && path->slots[0] < nritems - 1) { 4174 if (nritems > 0 && path->slots[0] < nritems - 1) {
4059 path->slots[0]++; 4175 path->slots[0]++;
4176 ret = 0;
4060 goto done; 4177 goto done;
4061 } 4178 }
4062 4179
4063 while (level < BTRFS_MAX_LEVEL) { 4180 while (level < BTRFS_MAX_LEVEL) {
4064 if (!path->nodes[level]) 4181 if (!path->nodes[level]) {
4065 return 1; 4182 ret = 1;
4183 goto done;
4184 }
4066 4185
4067 slot = path->slots[level] + 1; 4186 slot = path->slots[level] + 1;
4068 c = path->nodes[level]; 4187 c = path->nodes[level];
4069 if (slot >= btrfs_header_nritems(c)) { 4188 if (slot >= btrfs_header_nritems(c)) {
4070 level++; 4189 level++;
4071 if (level == BTRFS_MAX_LEVEL) 4190 if (level == BTRFS_MAX_LEVEL) {
4072 return 1; 4191 ret = 1;
4192 goto done;
4193 }
4073 continue; 4194 continue;
4074 } 4195 }
4075 4196
@@ -4078,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4078 free_extent_buffer(next); 4199 free_extent_buffer(next);
4079 } 4200 }
4080 4201
4081 /* the path was set to blocking above */ 4202 next = c;
4082 if (level == 1 && (path->locks[1] || path->skip_locking) && 4203 ret = read_block_for_search(NULL, root, path, &next, level,
4083 path->reada) 4204 slot, &key);
4084 reada_for_search(root, path, level, slot, 0); 4205 if (ret == -EAGAIN)
4206 goto again;
4085 4207
4086 next = read_node_slot(root, c, slot);
4087 if (!path->skip_locking) { 4208 if (!path->skip_locking) {
4088 WARN_ON(!btrfs_tree_locked(c)); 4209 ret = btrfs_try_spin_lock(next);
4089 btrfs_tree_lock(next); 4210 if (!ret) {
4090 btrfs_set_lock_blocking(next); 4211 btrfs_set_path_blocking(path);
4212 btrfs_tree_lock(next);
4213 if (!force_blocking)
4214 btrfs_clear_path_blocking(path, next);
4215 }
4216 if (force_blocking)
4217 btrfs_set_lock_blocking(next);
4091 } 4218 }
4092 break; 4219 break;
4093 } 4220 }
@@ -4097,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4097 c = path->nodes[level]; 4224 c = path->nodes[level];
4098 if (path->locks[level]) 4225 if (path->locks[level])
4099 btrfs_tree_unlock(c); 4226 btrfs_tree_unlock(c);
4227
4100 free_extent_buffer(c); 4228 free_extent_buffer(c);
4101 path->nodes[level] = next; 4229 path->nodes[level] = next;
4102 path->slots[level] = 0; 4230 path->slots[level] = 0;
4103 if (!path->skip_locking) 4231 if (!path->skip_locking)
4104 path->locks[level] = 1; 4232 path->locks[level] = 1;
4233
4105 if (!level) 4234 if (!level)
4106 break; 4235 break;
4107 4236
4108 btrfs_set_path_blocking(path); 4237 ret = read_block_for_search(NULL, root, path, &next, level,
4109 if (level == 1 && path->locks[1] && path->reada) 4238 0, &key);
4110 reada_for_search(root, path, level, slot, 0); 4239 if (ret == -EAGAIN)
4111 next = read_node_slot(root, next, 0); 4240 goto again;
4241
4112 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4113 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4243 btrfs_assert_tree_locked(path->nodes[level]);
4114 btrfs_tree_lock(next); 4244 ret = btrfs_try_spin_lock(next);
4115 btrfs_set_lock_blocking(next); 4245 if (!ret) {
4246 btrfs_set_path_blocking(path);
4247 btrfs_tree_lock(next);
4248 if (!force_blocking)
4249 btrfs_clear_path_blocking(path, next);
4250 }
4251 if (force_blocking)
4252 btrfs_set_lock_blocking(next);
4116 } 4253 }
4117 } 4254 }
4255 ret = 0;
4118done: 4256done:
4119 unlock_up(path, 0, 1); 4257 unlock_up(path, 0, 1);
4120 return 0; 4258 path->leave_spinning = old_spinning;
4259 if (!old_spinning)
4260 btrfs_set_path_blocking(path);
4261
4262 return ret;
4121} 4263}
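With the reworked exit paths above, btrfs_next_leaf() consistently returns 0 when the path was advanced to the following leaf, 1 when the tree has no more leaves, and a negative errno on failure. A typical caller iterates items along the lines of the sketch below; this is an illustrative pattern assembled only from helpers visible in this diff (btrfs_header_nritems, btrfs_next_leaf), not a quote of any specific caller:

	/* sketch: walk every item from the current path position onward */
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;	/* read or search failure */
			if (ret > 0)
				break;		/* ran off the end of the tree */
			continue;		/* path now points at slot 0 of the next leaf */
		}
		/* ... handle the item at path->slots[0] ... */
		path->slots[0]++;
	}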
4122 4264
4123/* 4265/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db112c8bd..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,14 @@ struct btrfs_ordered_sum;
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1) 44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45 45
46#ifdef CONFIG_LOCKDEP 46#define BTRFS_MAX_LEVEL 8
47# define BTRFS_MAX_LEVEL 7 47
48#else 48/*
49# define BTRFS_MAX_LEVEL 8 49 * files bigger than this get some pre-flushing when they are added
50#endif 50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
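As a rough sketch of what the limit above implies, a large file gets its writeback kicked off when it is queued so the eventual commit does not have to flush all of it at once. The per-inode list field and the flush call here are assumptions made for illustration and are not taken from this hunk; only fs_info->ordered_operations and ordered_extent_lock appear in this patch:

	/* illustrative only: pre-flush big files before queueing them */
	if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		filemap_flush(inode->i_mapping);	/* start async writeback now */

	spin_lock(&root->fs_info->ordered_extent_lock);
	/* BTRFS_I(inode)->ordered_operations is assumed here for illustration */
	list_add_tail(&BTRFS_I(inode)->ordered_operations,
		      &root->fs_info->ordered_operations);
	spin_unlock(&root->fs_info->ordered_extent_lock);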
51 54
52/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -140,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
140#define BTRFS_FT_MAX 9 143#define BTRFS_FT_MAX 9
141 144
142/* 145/*
143 * the key defines the order in the tree, and so it also defines (optimal) 146 * The key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresonds to the inode number. The flags 147 * block layout.
145 * tells us things about the object, and is a kind of stream selector. 148 *
146 * so for a given inode, keys with flags of 1 might refer to the inode 149 * objectid corresponds to the inode number.
147 * data, flags of 2 may point to file data in the btree and flags == 3 150 *
148 * may point to extents. 151 * type tells us things about the object, and is a kind of stream selector.
152 * so for a given inode, keys with type of 1 might refer to the inode data,
153 * type of 2 may point to file data in the btree and type == 3 may point to
154 * extents.
149 * 155 *
150 * offset is the starting byte offset for this key in the stream. 156 * offset is the starting byte offset for this key in the stream.
151 * 157 *
@@ -197,7 +203,7 @@ struct btrfs_dev_item {
197 203
198 /* 204 /*
199 * starting byte of this partition on the device, 205 * starting byte of this partition on the device,
200 * to allowr for stripe alignment in the future 206 * to allow for stripe alignment in the future
201 */ 207 */
202 __le64 start_offset; 208 __le64 start_offset;
203 209
@@ -405,15 +411,16 @@ struct btrfs_path {
405 int locks[BTRFS_MAX_LEVEL]; 411 int locks[BTRFS_MAX_LEVEL];
406 int reada; 412 int reada;
407 /* keep some upper locks as we walk down */ 413 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level; 414 int lowest_level;
411 415
412 /* 416 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks 417 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes 418 * and to force calls to keep space in the nodes
415 */ 419 */
416 int search_for_split; 420 unsigned int search_for_split:1;
421 unsigned int keep_locks:1;
422 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1;
417}; 424};
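Packing the flags into one-bit fields does not change how callers use them; the dir-item.c hunk later in this patch sets the new leave_spinning bit exactly the way the old int fields were set. A hedged sketch of the usual call pattern:

	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->leave_spinning = 1;	/* keep spinning locks across the insert */

	ret = btrfs_search_slot(trans, root, &key, path, ins_len, 1);
	/* ... work on path->nodes[0] / path->slots[0] ... */

	btrfs_free_path(path);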
418 425
419/* 426/*
@@ -600,13 +607,27 @@ struct btrfs_block_group_item {
600 607
601struct btrfs_space_info { 608struct btrfs_space_info {
602 u64 flags; 609 u64 flags;
603 u64 total_bytes; 610
604 u64 bytes_used; 611 u64 total_bytes; /* total bytes in the space */
605 u64 bytes_pinned; 612 u64 bytes_used; /* total bytes used on disk */
606 u64 bytes_reserved; 613 u64 bytes_pinned; /* total bytes pinned, will be freed when the
607 u64 bytes_readonly; 614 transaction finishes */
608 int full; 615 u64 bytes_reserved; /* total bytes the allocator has reserved for
609 int force_alloc; 616 current allocations */
617 u64 bytes_readonly; /* total bytes that are read only */
618
619 /* delalloc accounting */
620 u64 bytes_delalloc; /* number of bytes reserved for allocation,
621 this space is not necessarily reserved yet
622 by the allocator */
623 u64 bytes_may_use; /* number of bytes that may be used for
624 delalloc */
625
626 int full; /* indicates that we cannot allocate any more
627 chunks for this space */
628 int force_alloc; /* set if we need to force a chunk alloc for
629 this space */
630
610 struct list_head list; 631 struct list_head list;
611 632
612 /* for block groups in our same type */ 633 /* for block groups in our same type */
@@ -615,18 +636,35 @@ struct btrfs_space_info {
615 struct rw_semaphore groups_sem; 636 struct rw_semaphore groups_sem;
616}; 637};
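The commented counters above are plain running sums. As an assumption about how they compose (not a formula quoted from this patch), the bytes still available to new data writes are roughly the total minus everything already used, pinned, reserved, read only, or promised to delalloc:

	/* illustrative arithmetic over the counters documented above */
	u64 unallocated = sinfo->total_bytes - sinfo->bytes_used -
			  sinfo->bytes_pinned - sinfo->bytes_reserved -
			  sinfo->bytes_readonly - sinfo->bytes_delalloc -
			  sinfo->bytes_may_use;

	if (unallocated < bytes_needed)
		ret = -ENOSPC;	/* or force a new chunk allocation first */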
617 638
618struct btrfs_free_space { 639/*
619 struct rb_node bytes_index; 640 * free clusters are used to claim free space in relatively large chunks,
620 struct rb_node offset_index; 641 * allowing us to do less seeky writes. They are used for all metadata
621 u64 offset; 642 * allocations and data allocations in ssd mode.
622 u64 bytes; 643 */
644struct btrfs_free_cluster {
645 spinlock_t lock;
646 spinlock_t refill_lock;
647 struct rb_root root;
648
649 /* largest extent in this cluster */
650 u64 max_size;
651
652 /* first extent starting offset */
653 u64 window_start;
654
655 struct btrfs_block_group_cache *block_group;
656 /*
657 * when a cluster is allocated from a block group, we put the
658 * cluster onto a list in the block group so that it can
659 * be freed before the block group is freed.
660 */
661 struct list_head block_group_list;
623}; 662};
624 663
625struct btrfs_block_group_cache { 664struct btrfs_block_group_cache {
626 struct btrfs_key key; 665 struct btrfs_key key;
627 struct btrfs_block_group_item item; 666 struct btrfs_block_group_item item;
628 spinlock_t lock; 667 spinlock_t lock;
629 struct mutex alloc_mutex;
630 struct mutex cache_mutex; 668 struct mutex cache_mutex;
631 u64 pinned; 669 u64 pinned;
632 u64 reserved; 670 u64 reserved;
@@ -638,6 +676,7 @@ struct btrfs_block_group_cache {
638 struct btrfs_space_info *space_info; 676 struct btrfs_space_info *space_info;
639 677
640 /* free space cache stuff */ 678 /* free space cache stuff */
679 spinlock_t tree_lock;
641 struct rb_root free_space_bytes; 680 struct rb_root free_space_bytes;
642 struct rb_root free_space_offset; 681 struct rb_root free_space_offset;
643 682
@@ -649,6 +688,11 @@ struct btrfs_block_group_cache {
649 688
650 /* usage count */ 689 /* usage count */
651 atomic_t count; 690 atomic_t count;
691
 692 /* List of struct btrfs_free_cluster for this block group.

693 * Today it will only have one thing on it, but that may change
694 */
695 struct list_head cluster_list;
652}; 696};
653 697
654struct btrfs_leaf_ref_tree { 698struct btrfs_leaf_ref_tree {
@@ -678,15 +722,18 @@ struct btrfs_fs_info {
678 struct rb_root block_group_cache_tree; 722 struct rb_root block_group_cache_tree;
679 723
680 struct extent_io_tree pinned_extents; 724 struct extent_io_tree pinned_extents;
681 struct extent_io_tree pending_del;
682 struct extent_io_tree extent_ins;
683 725
684 /* logical->physical extent mapping */ 726 /* logical->physical extent mapping */
685 struct btrfs_mapping_tree mapping_tree; 727 struct btrfs_mapping_tree mapping_tree;
686 728
687 u64 generation; 729 u64 generation;
688 u64 last_trans_committed; 730 u64 last_trans_committed;
689 u64 last_trans_new_blockgroup; 731
732 /*
733 * this is updated to the current trans every time a full commit
734 * is required instead of the faster short fsync log commits
735 */
736 u64 last_trans_log_full_commit;
690 u64 open_ioctl_trans; 737 u64 open_ioctl_trans;
691 unsigned long mount_opt; 738 unsigned long mount_opt;
692 u64 max_extent; 739 u64 max_extent;
@@ -707,12 +754,20 @@ struct btrfs_fs_info {
707 struct mutex tree_log_mutex; 754 struct mutex tree_log_mutex;
708 struct mutex transaction_kthread_mutex; 755 struct mutex transaction_kthread_mutex;
709 struct mutex cleaner_mutex; 756 struct mutex cleaner_mutex;
710 struct mutex extent_ins_mutex;
711 struct mutex pinned_mutex;
712 struct mutex chunk_mutex; 757 struct mutex chunk_mutex;
713 struct mutex drop_mutex; 758 struct mutex drop_mutex;
714 struct mutex volume_mutex; 759 struct mutex volume_mutex;
715 struct mutex tree_reloc_mutex; 760 struct mutex tree_reloc_mutex;
761
762 /*
763 * this protects the ordered operations list only while we are
764 * processing all of the entries on it. This way we make
765 * sure the commit code doesn't find the list temporarily empty
766 * because another function happens to be doing non-waiting preflush
767 * before jumping into the main commit.
768 */
769 struct mutex ordered_operations_mutex;
770
716 struct list_head trans_list; 771 struct list_head trans_list;
717 struct list_head hashers; 772 struct list_head hashers;
718 struct list_head dead_roots; 773 struct list_head dead_roots;
@@ -727,10 +782,29 @@ struct btrfs_fs_info {
727 * ordered extents 782 * ordered extents
728 */ 783 */
729 spinlock_t ordered_extent_lock; 784 spinlock_t ordered_extent_lock;
785
786 /*
787 * all of the data=ordered extents pending writeback
788 * these can span multiple transactions and basically include
789 * every dirty data page that isn't from nodatacow
790 */
730 struct list_head ordered_extents; 791 struct list_head ordered_extents;
792
793 /*
794 * all of the inodes that have delalloc bytes. It is possible for
795 * this list to be empty even when there is still dirty data=ordered
796 * extents waiting to finish IO.
797 */
731 struct list_head delalloc_inodes; 798 struct list_head delalloc_inodes;
732 799
733 /* 800 /*
801 * special rename and truncate targets that must be on disk before
802 * we're allowed to commit. This is basically the ext3 style
803 * data=ordered list.
804 */
805 struct list_head ordered_operations;
806
807 /*
734 * there is a pool of worker threads for checksumming during writes 808 * there is a pool of worker threads for checksumming during writes
735 * and a pool for checksumming after reads. This is because readers 809 * and a pool for checksumming after reads. This is because readers
736 * can run with FS locks held, and the writers may be waiting for 810 * can run with FS locks held, and the writers may be waiting for
@@ -771,15 +845,31 @@ struct btrfs_fs_info {
771 atomic_t throttle_gen; 845 atomic_t throttle_gen;
772 846
773 u64 total_pinned; 847 u64 total_pinned;
848
849 /* protected by the delalloc lock, used to keep from writing
850 * metadata until there is a nice batch
851 */
852 u64 dirty_metadata_bytes;
774 struct list_head dirty_cowonly_roots; 853 struct list_head dirty_cowonly_roots;
775 854
776 struct btrfs_fs_devices *fs_devices; 855 struct btrfs_fs_devices *fs_devices;
856
857 /*
858 * the space_info list is almost entirely read only. It only changes
859 * when we add a new raid type to the FS, and that happens
860 * very rarely. RCU is used to protect it.
861 */
777 struct list_head space_info; 862 struct list_head space_info;
863
778 spinlock_t delalloc_lock; 864 spinlock_t delalloc_lock;
779 spinlock_t new_trans_lock; 865 spinlock_t new_trans_lock;
780 u64 delalloc_bytes; 866 u64 delalloc_bytes;
781 u64 last_alloc; 867
782 u64 last_data_alloc; 868 /* data_alloc_cluster is only used in ssd mode */
869 struct btrfs_free_cluster data_alloc_cluster;
870
871 /* all metadata allocations go through this cluster */
872 struct btrfs_free_cluster meta_alloc_cluster;
783 873
784 spinlock_t ref_cache_lock; 874 spinlock_t ref_cache_lock;
785 u64 total_ref_cache_size; 875 u64 total_ref_cache_size;
@@ -871,7 +961,6 @@ struct btrfs_root {
871}; 961};
872 962
873/* 963/*
874
875 * inode items have the data typically returned from stat and store other 964 * inode items have the data typically returned from stat and store other
876 * info about object characteristics. There is one for every file and dir in 965 * info about object characteristics. There is one for every file and dir in
877 * the FS 966 * the FS
@@ -902,7 +991,7 @@ struct btrfs_root {
902#define BTRFS_EXTENT_CSUM_KEY 128 991#define BTRFS_EXTENT_CSUM_KEY 128
903 992
904/* 993/*
905 * root items point to tree roots. There are typically in the root 994 * root items point to tree roots. They are typically in the root
906 * tree used by the super block to find all the other trees 995 * tree used by the super block to find all the other trees
907 */ 996 */
908#define BTRFS_ROOT_ITEM_KEY 132 997#define BTRFS_ROOT_ITEM_KEY 132
@@ -949,6 +1038,8 @@ struct btrfs_root {
949#define BTRFS_MOUNT_SSD (1 << 3) 1038#define BTRFS_MOUNT_SSD (1 << 3)
950#define BTRFS_MOUNT_DEGRADED (1 << 4) 1039#define BTRFS_MOUNT_DEGRADED (1 << 4)
951#define BTRFS_MOUNT_COMPRESS (1 << 5) 1040#define BTRFS_MOUNT_COMPRESS (1 << 5)
1041#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1042#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
952 1043
953#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1044#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
954#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1045#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
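The two new flags plug into the existing helper macros directly below them; a small illustration of how the bitmask behaves:

	unsigned long mount_opt = 0;

	btrfs_set_opt(mount_opt, NOTREELOG);	/* expands to |= BTRFS_MOUNT_NOTREELOG */
	btrfs_set_opt(mount_opt, FLUSHONCOMMIT);
	btrfs_clear_opt(mount_opt, NOTREELOG);	/* expands to &= ~BTRFS_MOUNT_NOTREELOG */
	/* mount_opt now has only BTRFS_MOUNT_FLUSHONCOMMIT set */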
@@ -1687,18 +1778,16 @@ static inline struct dentry *fdentry(struct file *file)
1687} 1778}
1688 1779
1689/* extent-tree.c */ 1780/* extent-tree.c */
1781void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1782int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1783 struct btrfs_root *root, unsigned long count);
1690int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1784int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1691int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1692 struct btrfs_root *root, u64 bytenr,
1693 u64 num_bytes, u32 *refs);
1694int btrfs_update_pinned_extents(struct btrfs_root *root, 1785int btrfs_update_pinned_extents(struct btrfs_root *root,
1695 u64 bytenr, u64 num, int pin); 1786 u64 bytenr, u64 num, int pin);
1696int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1787int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root, struct extent_buffer *leaf); 1788 struct btrfs_root *root, struct extent_buffer *leaf);
1698int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1789int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1699 struct btrfs_root *root, u64 objectid, u64 bytenr); 1790 struct btrfs_root *root, u64 objectid, u64 bytenr);
1700int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1701 struct btrfs_root *root);
1702int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1791int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1703struct btrfs_block_group_cache *btrfs_lookup_block_group( 1792struct btrfs_block_group_cache *btrfs_lookup_block_group(
1704 struct btrfs_fs_info *info, 1793 struct btrfs_fs_info *info,
@@ -1715,7 +1804,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 u64 empty_size); 1804 u64 empty_size);
1716struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1805struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1717 struct btrfs_root *root, 1806 struct btrfs_root *root,
1718 u64 bytenr, u32 blocksize); 1807 u64 bytenr, u32 blocksize,
1808 int level);
1719int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1809int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1720 struct btrfs_root *root, 1810 struct btrfs_root *root,
1721 u64 num_bytes, u64 parent, u64 min_bytes, 1811 u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1759,7 +1849,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1759 u64 root_objectid, u64 ref_generation, 1849 u64 root_objectid, u64 ref_generation,
1760 u64 owner_objectid); 1850 u64 owner_objectid);
1761int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1851int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1762 struct btrfs_root *root, u64 bytenr, 1852 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1763 u64 orig_parent, u64 parent, 1853 u64 orig_parent, u64 parent,
1764 u64 root_objectid, u64 ref_generation, 1854 u64 root_objectid, u64 ref_generation,
1765 u64 owner_objectid); 1855 u64 owner_objectid);
@@ -1785,6 +1875,18 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1785int btrfs_cleanup_reloc_trees(struct btrfs_root *root); 1875int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1786int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 1876int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1787u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1877u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 1878void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
1879void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
1880
1881int btrfs_check_metadata_free_space(struct btrfs_root *root);
1882int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1883 u64 bytes);
1884void btrfs_free_reserved_data_space(struct btrfs_root *root,
1885 struct inode *inode, u64 bytes);
1886void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1887 u64 bytes);
1888void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1889 u64 bytes);
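The new prototypes above pair reservations with matching releases. A hedged sketch of the intended call pattern around a buffered write; the ordering and error handling here are inferred from the names, not quoted from the file.c changes:

	/* reserve data space before dirtying pages, give it back on failure */
	ret = btrfs_check_data_free_space(root, inode, write_bytes);
	if (ret)
		return ret;	/* typically -ENOSPC */

	btrfs_delalloc_reserve_space(root, inode, write_bytes);

	ret = copy_user_data();	/* hypothetical placeholder for the actual write step */
	if (ret) {
		btrfs_delalloc_free_space(root, inode, write_bytes);
		btrfs_free_reserved_data_space(root, inode, write_bytes);
	}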
1788/* ctree.c */ 1890/* ctree.c */
1789int btrfs_previous_item(struct btrfs_root *root, 1891int btrfs_previous_item(struct btrfs_root *root,
1790 struct btrfs_path *path, u64 min_objectid, 1892 struct btrfs_path *path, u64 min_objectid,
@@ -1808,7 +1910,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1808int btrfs_cow_block(struct btrfs_trans_handle *trans, 1910int btrfs_cow_block(struct btrfs_trans_handle *trans,
1809 struct btrfs_root *root, struct extent_buffer *buf, 1911 struct btrfs_root *root, struct extent_buffer *buf,
1810 struct extent_buffer *parent, int parent_slot, 1912 struct extent_buffer *parent, int parent_slot,
1811 struct extent_buffer **cow_ret, u64 prealloc_dest); 1913 struct extent_buffer **cow_ret);
1812int btrfs_copy_root(struct btrfs_trans_handle *trans, 1914int btrfs_copy_root(struct btrfs_trans_handle *trans,
1813 struct btrfs_root *root, 1915 struct btrfs_root *root,
1814 struct extent_buffer *buf, 1916 struct extent_buffer *buf,
@@ -1834,9 +1936,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1834void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 1936void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1835struct btrfs_path *btrfs_alloc_path(void); 1937struct btrfs_path *btrfs_alloc_path(void);
1836void btrfs_free_path(struct btrfs_path *p); 1938void btrfs_free_path(struct btrfs_path *p);
1837void btrfs_init_path(struct btrfs_path *p);
1838void btrfs_set_path_blocking(struct btrfs_path *p); 1939void btrfs_set_path_blocking(struct btrfs_path *p);
1839void btrfs_clear_path_blocking(struct btrfs_path *p);
1840void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 1940void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1841 1941
1842int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1942int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2032,9 +2132,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032unsigned long btrfs_force_ra(struct address_space *mapping, 2132unsigned long btrfs_force_ra(struct address_space *mapping,
2033 struct file_ra_state *ra, struct file *file, 2133 struct file_ra_state *ra, struct file *file,
2034 pgoff_t offset, pgoff_t last_index); 2134 pgoff_t offset, pgoff_t last_index);
2035int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, 2135int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2036 int for_del);
2037int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2038int btrfs_readpage(struct file *file, struct page *page); 2136int btrfs_readpage(struct file *file, struct page *page);
2039void btrfs_delete_inode(struct inode *inode); 2137void btrfs_delete_inode(struct inode *inode);
2040void btrfs_put_inode(struct inode *inode); 2138void btrfs_put_inode(struct inode *inode);
@@ -2107,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
2107int btrfs_init_acl(struct inode *inode, struct inode *dir); 2205int btrfs_init_acl(struct inode *inode, struct inode *dir);
2108int btrfs_acl_chmod(struct inode *inode); 2206int btrfs_acl_chmod(struct inode *inode);
2109 2207
2110/* free-space-cache.c */
2111int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2112 u64 bytenr, u64 size);
2113int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2114 u64 offset, u64 bytes);
2115int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2116 u64 bytenr, u64 size);
2117int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2118 u64 offset, u64 bytes);
2119void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2120 *block_group);
2121struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2122 *block_group, u64 offset,
2123 u64 bytes);
2124void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2125 u64 bytes);
2126u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2127#endif 2208#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include "ctree.h"
22#include "delayed-ref.h"
23#include "transaction.h"
24
25/*
26 * delayed back reference update tracking. For subvolume trees
27 * we queue up extent allocations and backref maintenance for
28 * delayed processing. This avoids deep call chains where we
29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */
37
38/*
39 * entries in the rb tree are ordered by the byte number of the extent
40 * and by the byte number of the parent block.
41 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref,
43 u64 bytenr, u64 parent)
44{
45 if (bytenr < ref->bytenr)
46 return -1;
47 if (bytenr > ref->bytenr)
48 return 1;
49 if (parent < ref->parent)
50 return -1;
51 if (parent > ref->parent)
52 return 1;
53 return 0;
54}
55
56/*
57 * insert a new ref into the rbtree. This returns any existing refs
58 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
59 * inserted.
60 */
61static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
62 u64 bytenr, u64 parent,
63 struct rb_node *node)
64{
65 struct rb_node **p = &root->rb_node;
66 struct rb_node *parent_node = NULL;
67 struct btrfs_delayed_ref_node *entry;
68 int cmp;
69
70 while (*p) {
71 parent_node = *p;
72 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
73 rb_node);
74
75 cmp = comp_entry(entry, bytenr, parent);
76 if (cmp < 0)
77 p = &(*p)->rb_left;
78 else if (cmp > 0)
79 p = &(*p)->rb_right;
80 else
81 return entry;
82 }
83
84 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
85 rb_link_node(node, parent_node, p);
86 rb_insert_color(node, root);
87 return NULL;
88}
89
90/*
91 * find an entry based on (bytenr,parent). This returns the delayed
92 * ref if it was able to find one, or NULL if nothing was in that spot
93 */
94static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
95 u64 bytenr, u64 parent,
96 struct btrfs_delayed_ref_node **last)
97{
98 struct rb_node *n = root->rb_node;
99 struct btrfs_delayed_ref_node *entry;
100 int cmp;
101
102 while (n) {
103 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
104 WARN_ON(!entry->in_tree);
105 if (last)
106 *last = entry;
107
108 cmp = comp_entry(entry, bytenr, parent);
109 if (cmp < 0)
110 n = n->rb_left;
111 else if (cmp > 0)
112 n = n->rb_right;
113 else
114 return entry;
115 }
116 return NULL;
117}
118
119int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
120 struct btrfs_delayed_ref_head *head)
121{
122 struct btrfs_delayed_ref_root *delayed_refs;
123
124 delayed_refs = &trans->transaction->delayed_refs;
125 assert_spin_locked(&delayed_refs->lock);
126 if (mutex_trylock(&head->mutex))
127 return 0;
128
129 atomic_inc(&head->node.refs);
130 spin_unlock(&delayed_refs->lock);
131
132 mutex_lock(&head->mutex);
133 spin_lock(&delayed_refs->lock);
134 if (!head->node.in_tree) {
135 mutex_unlock(&head->mutex);
136 btrfs_put_delayed_ref(&head->node);
137 return -EAGAIN;
138 }
139 btrfs_put_delayed_ref(&head->node);
140 return 0;
141}
142
143int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
144 struct list_head *cluster, u64 start)
145{
146 int count = 0;
147 struct btrfs_delayed_ref_root *delayed_refs;
148 struct rb_node *node;
149 struct btrfs_delayed_ref_node *ref;
150 struct btrfs_delayed_ref_head *head;
151
152 delayed_refs = &trans->transaction->delayed_refs;
153 if (start == 0) {
154 node = rb_first(&delayed_refs->root);
155 } else {
156 ref = NULL;
157 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
158 if (ref) {
159 struct btrfs_delayed_ref_node *tmp;
160
161 node = rb_prev(&ref->rb_node);
162 while (node) {
163 tmp = rb_entry(node,
164 struct btrfs_delayed_ref_node,
165 rb_node);
166 if (tmp->bytenr < start)
167 break;
168 ref = tmp;
169 node = rb_prev(&ref->rb_node);
170 }
171 node = &ref->rb_node;
172 } else
173 node = rb_first(&delayed_refs->root);
174 }
175again:
176 while (node && count < 32) {
177 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
178 if (btrfs_delayed_ref_is_head(ref)) {
179 head = btrfs_delayed_node_to_head(ref);
180 if (list_empty(&head->cluster)) {
181 list_add_tail(&head->cluster, cluster);
182 delayed_refs->run_delayed_start =
183 head->node.bytenr;
184 count++;
185
186 WARN_ON(delayed_refs->num_heads_ready == 0);
187 delayed_refs->num_heads_ready--;
188 } else if (count) {
189 /* the goal of the clustering is to find extents
190 * that are likely to end up in the same extent
191 * leaf on disk. So, we don't want them spread
192 * all over the tree. Stop now if we've hit
193 * a head that was already in use
194 */
195 break;
196 }
197 }
198 node = rb_next(node);
199 }
200 if (count) {
201 return 0;
202 } else if (start) {
203 /*
204 * we've gone to the end of the rbtree without finding any
205 * clusters. start from the beginning and try again
206 */
207 start = 0;
208 node = rb_first(&delayed_refs->root);
209 goto again;
210 }
211 return 1;
212}
213
214/*
215 * This checks to see if there are any delayed refs in the
216 * btree for a given bytenr. It returns one if it finds any
217 * and zero otherwise.
218 *
219 * If it only finds a head node, it returns 0.
220 *
221 * The idea is to use this when deciding if you can safely delete an
222 * extent from the extent allocation tree. There may be a pending
223 * ref in the rbtree that adds or removes references, so as long as this
224 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
225 * allocation tree.
226 */
227int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
228{
229 struct btrfs_delayed_ref_node *ref;
230 struct btrfs_delayed_ref_root *delayed_refs;
231 struct rb_node *prev_node;
232 int ret = 0;
233
234 delayed_refs = &trans->transaction->delayed_refs;
235 spin_lock(&delayed_refs->lock);
236
237 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
238 if (ref) {
239 prev_node = rb_prev(&ref->rb_node);
240 if (!prev_node)
241 goto out;
242 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
243 rb_node);
244 if (ref->bytenr == bytenr)
245 ret = 1;
246 }
247out:
248 spin_unlock(&delayed_refs->lock);
249 return ret;
250}
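A short usage sketch matching the comment above: before dropping an extent item, a caller can ask whether delayed modifications for that bytenr are still queued. This is an illustrative fragment, not a quote from the extent-tree.c changes:

	/* leave the EXTENT_ITEM alone while delayed mods are still queued */
	if (btrfs_delayed_ref_pending(trans, bytenr))
		return 0;	/* the queued refs will be applied before commit */

	/* safe to remove the item from the extent allocation tree here */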
251
252/*
253 * helper function to lookup reference count
254 *
255 * the head node for delayed ref is used to store the sum of all the
256 * reference count modifications queued up in the rbtree. This way you
257 * can check to see what the reference count would be if all of the
258 * delayed refs are processed.
259 */
260int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
261 struct btrfs_root *root, u64 bytenr,
262 u64 num_bytes, u32 *refs)
263{
264 struct btrfs_delayed_ref_node *ref;
265 struct btrfs_delayed_ref_head *head;
266 struct btrfs_delayed_ref_root *delayed_refs;
267 struct btrfs_path *path;
268 struct extent_buffer *leaf;
269 struct btrfs_extent_item *ei;
270 struct btrfs_key key;
271 u32 num_refs;
272 int ret;
273
274 path = btrfs_alloc_path();
275 if (!path)
276 return -ENOMEM;
277
278 key.objectid = bytenr;
279 key.type = BTRFS_EXTENT_ITEM_KEY;
280 key.offset = num_bytes;
281 delayed_refs = &trans->transaction->delayed_refs;
282again:
283 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
284 &key, path, 0, 0);
285 if (ret < 0)
286 goto out;
287
288 if (ret == 0) {
289 leaf = path->nodes[0];
290 ei = btrfs_item_ptr(leaf, path->slots[0],
291 struct btrfs_extent_item);
292 num_refs = btrfs_extent_refs(leaf, ei);
293 } else {
294 num_refs = 0;
295 ret = 0;
296 }
297
298 spin_lock(&delayed_refs->lock);
299 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
300 if (ref) {
301 head = btrfs_delayed_node_to_head(ref);
302 if (mutex_trylock(&head->mutex)) {
303 num_refs += ref->ref_mod;
304 mutex_unlock(&head->mutex);
305 *refs = num_refs;
306 goto out;
307 }
308
309 atomic_inc(&ref->refs);
310 spin_unlock(&delayed_refs->lock);
311
312 btrfs_release_path(root->fs_info->extent_root, path);
313
314 mutex_lock(&head->mutex);
315 mutex_unlock(&head->mutex);
316 btrfs_put_delayed_ref(ref);
317 goto again;
318 } else {
319 *refs = num_refs;
320 }
321out:
322 spin_unlock(&delayed_refs->lock);
323 btrfs_free_path(path);
324 return ret;
325}
326
327/*
328 * helper function to update an extent delayed ref in the
329 * rbtree. existing and update must both have the same
330 * bytenr and parent
331 *
332 * This may free existing if the update cancels out whatever
333 * operation it was doing.
334 */
335static noinline void
336update_existing_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_delayed_ref_root *delayed_refs,
338 struct btrfs_delayed_ref_node *existing,
339 struct btrfs_delayed_ref_node *update)
340{
341 struct btrfs_delayed_ref *existing_ref;
342 struct btrfs_delayed_ref *ref;
343
344 existing_ref = btrfs_delayed_node_to_ref(existing);
345 ref = btrfs_delayed_node_to_ref(update);
346
347 if (ref->pin)
348 existing_ref->pin = 1;
349
350 if (ref->action != existing_ref->action) {
351 /*
352 * this is effectively undoing either an add or a
353 * drop. We decrement the ref_mod, and if it goes
354 * down to zero we just delete the entry without
 355 * ever changing the extent allocation tree.
356 */
357 existing->ref_mod--;
358 if (existing->ref_mod == 0) {
359 rb_erase(&existing->rb_node,
360 &delayed_refs->root);
361 existing->in_tree = 0;
362 btrfs_put_delayed_ref(existing);
363 delayed_refs->num_entries--;
364 if (trans->delayed_ref_updates)
365 trans->delayed_ref_updates--;
366 }
367 } else {
368 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
369 /* if we're adding refs, make sure all the
370 * details match up. The extent could
371 * have been totally freed and reallocated
372 * by a different owner before the delayed
373 * ref entries were removed.
374 */
375 existing_ref->owner_objectid = ref->owner_objectid;
376 existing_ref->generation = ref->generation;
377 existing_ref->root = ref->root;
378 existing->num_bytes = update->num_bytes;
379 }
380 /*
381 * the action on the existing ref matches
382 * the action on the ref we're trying to add.
383 * Bump the ref_mod by one so the backref that
384 * is eventually added/removed has the correct
385 * reference count
386 */
387 existing->ref_mod += update->ref_mod;
388 }
389}
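The cancellation rule above is simple counter arithmetic: an add followed by a drop on the same (bytenr, parent) nets to zero and the entry disappears, while repeated adds accumulate. A standalone illustration (plain user-space C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		int ref_mod = 0;

		ref_mod += 1;	/* BTRFS_ADD_DELAYED_REF queued */
		ref_mod += 1;	/* same action again: mods accumulate */
		ref_mod -= 1;	/* BTRFS_DROP_DELAYED_REF cancels one add */

		if (ref_mod == 0)
			printf("entry would be erased from the rbtree\n");
		else
			printf("net modification to apply at run time: %d\n", ref_mod);
		return 0;
	}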
390
391/*
392 * helper function to update the accounting in the head ref
393 * existing and update must have the same bytenr
394 */
395static noinline void
396update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
397 struct btrfs_delayed_ref_node *update)
398{
399 struct btrfs_delayed_ref_head *existing_ref;
400 struct btrfs_delayed_ref_head *ref;
401
402 existing_ref = btrfs_delayed_node_to_head(existing);
403 ref = btrfs_delayed_node_to_head(update);
404
405 if (ref->must_insert_reserved) {
406 /* if the extent was freed and then
407 * reallocated before the delayed ref
408 * entries were processed, we can end up
409 * with an existing head ref without
410 * the must_insert_reserved flag set.
411 * Set it again here
412 */
413 existing_ref->must_insert_reserved = ref->must_insert_reserved;
414
415 /*
416 * update the num_bytes so we make sure the accounting
417 * is done correctly
418 */
419 existing->num_bytes = update->num_bytes;
420
421 }
422
423 /*
424 * update the reference mod on the head to reflect this new operation
425 */
426 existing->ref_mod += update->ref_mod;
427}
428
429/*
430 * helper function to actually insert a delayed ref into the rbtree.
431 * this does all the dirty work in terms of maintaining the correct
432 * overall modification count in the head node and properly dealing
433 * with updating existing nodes as new modifications are queued.
434 */
435static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
436 struct btrfs_delayed_ref_node *ref,
437 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
438 u64 ref_generation, u64 owner_objectid, int action,
439 int pin)
440{
441 struct btrfs_delayed_ref_node *existing;
442 struct btrfs_delayed_ref *full_ref;
443 struct btrfs_delayed_ref_head *head_ref = NULL;
444 struct btrfs_delayed_ref_root *delayed_refs;
445 int count_mod = 1;
446 int must_insert_reserved = 0;
447
448 /*
449 * the head node stores the sum of all the mods, so dropping a ref
450 * should drop the sum in the head node by one.
451 */
452 if (parent == (u64)-1) {
453 if (action == BTRFS_DROP_DELAYED_REF)
454 count_mod = -1;
455 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
456 count_mod = 0;
457 }
458
459 /*
460 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
461 * the reserved accounting when the extent is finally added, or
462 * if a later modification deletes the delayed ref without ever
463 * inserting the extent into the extent allocation tree.
464 * ref->must_insert_reserved is the flag used to record
465 * that accounting mods are required.
466 *
467 * Once we record must_insert_reserved, switch the action to
468 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
469 */
470 if (action == BTRFS_ADD_DELAYED_EXTENT) {
471 must_insert_reserved = 1;
472 action = BTRFS_ADD_DELAYED_REF;
473 } else {
474 must_insert_reserved = 0;
475 }
476
477
478 delayed_refs = &trans->transaction->delayed_refs;
479
480 /* first set the basic ref node struct up */
481 atomic_set(&ref->refs, 1);
482 ref->bytenr = bytenr;
483 ref->parent = parent;
484 ref->ref_mod = count_mod;
485 ref->in_tree = 1;
486 ref->num_bytes = num_bytes;
487
488 if (btrfs_delayed_ref_is_head(ref)) {
489 head_ref = btrfs_delayed_node_to_head(ref);
490 head_ref->must_insert_reserved = must_insert_reserved;
491 INIT_LIST_HEAD(&head_ref->cluster);
492 mutex_init(&head_ref->mutex);
493 } else {
494 full_ref = btrfs_delayed_node_to_ref(ref);
495 full_ref->root = ref_root;
496 full_ref->generation = ref_generation;
497 full_ref->owner_objectid = owner_objectid;
498 full_ref->pin = pin;
499 full_ref->action = action;
500 }
501
502 existing = tree_insert(&delayed_refs->root, bytenr,
503 parent, &ref->rb_node);
504
505 if (existing) {
506 if (btrfs_delayed_ref_is_head(ref))
507 update_existing_head_ref(existing, ref);
508 else
509 update_existing_ref(trans, delayed_refs, existing, ref);
510
511 /*
512 * we've updated the existing ref, free the newly
513 * allocated ref
514 */
515 kfree(ref);
516 } else {
517 if (btrfs_delayed_ref_is_head(ref)) {
518 delayed_refs->num_heads++;
519 delayed_refs->num_heads_ready++;
520 }
521 delayed_refs->num_entries++;
522 trans->delayed_ref_updates++;
523 }
524 return 0;
525}
526
527/*
528 * add a delayed ref to the tree. This does all of the accounting required
529 * to make sure the delayed ref is eventually processed before this
530 * transaction commits.
531 */
532int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
533 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
534 u64 ref_generation, u64 owner_objectid, int action,
535 int pin)
536{
537 struct btrfs_delayed_ref *ref;
538 struct btrfs_delayed_ref_head *head_ref;
539 struct btrfs_delayed_ref_root *delayed_refs;
540 int ret;
541
542 ref = kmalloc(sizeof(*ref), GFP_NOFS);
543 if (!ref)
544 return -ENOMEM;
545
546 /*
547 * the parent = 0 case comes from cases where we don't actually
 548 * know the parent yet. It will get updated later via an add/drop
549 * pair.
550 */
551 if (parent == 0)
552 parent = bytenr;
553
554 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
555 if (!head_ref) {
556 kfree(ref);
557 return -ENOMEM;
558 }
559 delayed_refs = &trans->transaction->delayed_refs;
560 spin_lock(&delayed_refs->lock);
561
562 /*
563 * insert both the head node and the new ref without dropping
564 * the spin lock
565 */
566 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
567 (u64)-1, 0, 0, 0, action, pin);
568 BUG_ON(ret);
569
570 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
571 parent, ref_root, ref_generation,
572 owner_objectid, action, pin);
573 BUG_ON(ret);
574 spin_unlock(&delayed_refs->lock);
575 return 0;
576}
577
578/*
579 * this does a simple search for the head node for a given extent.
580 * It must be called with the delayed ref spinlock held, and it returns
 581 * the head node if one was found, or NULL if not.
582 */
583struct btrfs_delayed_ref_head *
584btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
585{
586 struct btrfs_delayed_ref_node *ref;
587 struct btrfs_delayed_ref_root *delayed_refs;
588
589 delayed_refs = &trans->transaction->delayed_refs;
590 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
591 if (ref)
592 return btrfs_delayed_node_to_head(ref);
593 return NULL;
594}
595
596/*
597 * add a delayed ref to the tree. This does all of the accounting required
598 * to make sure the delayed ref is eventually processed before this
599 * transaction commits.
600 *
601 * The main point of this call is to add and remove a backreference in a single
602 * shot, taking the lock only once, and only searching for the head node once.
603 *
604 * It is the same as doing a ref add and delete in two separate calls.
605 */
606int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
607 u64 bytenr, u64 num_bytes, u64 orig_parent,
608 u64 parent, u64 orig_ref_root, u64 ref_root,
609 u64 orig_ref_generation, u64 ref_generation,
610 u64 owner_objectid, int pin)
611{
612 struct btrfs_delayed_ref *ref;
613 struct btrfs_delayed_ref *old_ref;
614 struct btrfs_delayed_ref_head *head_ref;
615 struct btrfs_delayed_ref_root *delayed_refs;
616 int ret;
617
618 ref = kmalloc(sizeof(*ref), GFP_NOFS);
619 if (!ref)
620 return -ENOMEM;
621
622 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
623 if (!old_ref) {
624 kfree(ref);
625 return -ENOMEM;
626 }
627
628 /*
629 * the parent = 0 case comes from cases where we don't actually
 630 * know the parent yet. It will get updated later via an add/drop
631 * pair.
632 */
633 if (parent == 0)
634 parent = bytenr;
635 if (orig_parent == 0)
636 orig_parent = bytenr;
637
638 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
639 if (!head_ref) {
640 kfree(ref);
641 kfree(old_ref);
642 return -ENOMEM;
643 }
644 delayed_refs = &trans->transaction->delayed_refs;
645 spin_lock(&delayed_refs->lock);
646
647 /*
648 * insert both the head node and the new ref without dropping
649 * the spin lock
650 */
651 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
652 (u64)-1, 0, 0, 0,
653 BTRFS_UPDATE_DELAYED_HEAD, 0);
654 BUG_ON(ret);
655
656 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
657 parent, ref_root, ref_generation,
658 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
659 BUG_ON(ret);
660
661 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
662 orig_parent, orig_ref_root,
663 orig_ref_generation, owner_objectid,
664 BTRFS_DROP_DELAYED_REF, pin);
665 BUG_ON(ret);
666 spin_unlock(&delayed_refs->lock);
667 return 0;
668}
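As the comment notes, the combined call is just an atomic version of an add and a drop. A sketch of the two-call equivalent using btrfs_add_delayed_ref() as declared in this file; taking the delayed_refs spinlock only once is exactly what the combined helper buys you over this:

	/* equivalent, but takes the delayed_refs lock twice */
	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
				    ref_root, ref_generation,
				    owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
	if (ret)
		return ret;

	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, orig_parent,
				    orig_ref_root, orig_ref_generation,
				    owner_objectid, BTRFS_DROP_DELAYED_REF, pin);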
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
43 * how many refs is this entry adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
 75 * when a new extent is allocated, it is just reserved in memory.
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
 83 * we need to update the in-memory accounting to properly reflect
 84 * that the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
 167 * a node might live in a head or a regular ref; this lets you
168 * test for the proper type to use.
169 */
 170static inline int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
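One more usage note on the locking helpers declared above: btrfs_delayed_ref_lock() may drop the delayed_refs spinlock while it sleeps on the head mutex and returns -EAGAIN when the head was removed in the meantime, so callers are expected to retry the lookup. A hedged sketch of that loop, not a quote of the extent-tree.c changes:

	spin_lock(&delayed_refs->lock);
	while (1) {
		head = btrfs_find_delayed_ref_head(trans, bytenr);
		if (!head)
			break;			/* nothing queued for this extent */

		ret = btrfs_delayed_ref_lock(trans, head);
		if (ret == -EAGAIN)
			continue;		/* head vanished while we slept; look again */
		/* head->mutex is held here; process the head, then unlock it */
		break;
	}
	spin_unlock(&delayed_refs->lock);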
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
145 key.objectid = dir; 145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len); 147 key.offset = btrfs_name_hash(name, name_len);
148
148 path = btrfs_alloc_path(); 149 path = btrfs_alloc_path();
150 path->leave_spinning = 1;
151
149 data_size = sizeof(*dir_item) + name_len; 152 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 153 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len); 154 name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd71193..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h" 39#include "ref-cache.h"
40#include "tree-log.h" 40#include "tree-log.h"
41#include "free-space-cache.h"
41 42
42static struct extent_io_ops btree_extent_io_ops; 43static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
@@ -75,6 +76,40 @@ struct async_submit_bio {
75 struct btrfs_work work; 76 struct btrfs_work work;
76}; 77};
77 78
79/* These are used to set the lockdep class on the extent buffer locks.
80 * The class is set by the readpage_end_io_hook after the buffer has
81 * passed csum validation but before the pages are unlocked.
82 *
83 * The lockdep class is also set by btrfs_init_new_buffer on freshly
84 * allocated blocks.
85 *
86 * The class is based on the level in the tree block, which allows lockdep
87 * to know that lower nodes nest inside the locks of higher nodes.
88 *
89 * We also add a check to make sure the highest level of the tree is
90 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
91 * code needs update as well.
92 */
93#ifdef CONFIG_DEBUG_LOCK_ALLOC
94# if BTRFS_MAX_LEVEL != 8
95# error
96# endif
97static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
98static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
99 /* leaf */
100 "btrfs-extent-00",
101 "btrfs-extent-01",
102 "btrfs-extent-02",
103 "btrfs-extent-03",
104 "btrfs-extent-04",
105 "btrfs-extent-05",
106 "btrfs-extent-06",
107 "btrfs-extent-07",
108 /* highest possible level */
109 "btrfs-extent-08",
110};
111#endif
112
78/* 113/*
79 * extents on the btree inode are pretty simple, there's one extent 114 * extents on the btree inode are pretty simple, there's one extent
80 * that covers the entire device 115 * that covers the entire device
@@ -347,6 +382,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
347 return ret; 382 return ret;
348} 383}
349 384
385#ifdef CONFIG_DEBUG_LOCK_ALLOC
386void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
387{
388 lockdep_set_class_and_name(&eb->lock,
389 &btrfs_eb_class[level],
390 btrfs_eb_name[level]);
391}
392#endif
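The point of the per-level classes above is that the tree is always locked parent before child, so giving each level its own lockdep class lets lockdep see that nesting instead of flagging recursive locking on a single class. Roughly, using the helper defined just above (an illustrative fragment, not a quote of any caller):

	/* each buffer is keyed by its tree level before it is ever locked */
	btrfs_set_buffer_lockdep_class(parent_eb, level);	/* e.g. "btrfs-extent-02" */
	btrfs_set_buffer_lockdep_class(child_eb, level - 1);	/* "btrfs-extent-01" */

	btrfs_tree_lock(parent_eb);	/* outer class ... */
	btrfs_tree_lock(child_eb);	/* ... nests inside it without a lockdep splat */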
393
350static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 394static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
351 struct extent_state *state) 395 struct extent_state *state)
352{ 396{
@@ -392,6 +436,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
392 } 436 }
393 found_level = btrfs_header_level(eb); 437 found_level = btrfs_header_level(eb);
394 438
439 btrfs_set_buffer_lockdep_class(eb, found_level);
440
395 ret = csum_tree_block(root, eb, 1); 441 ret = csum_tree_block(root, eb, 1);
396 if (ret) 442 if (ret)
397 ret = -EIO; 443 ret = -EIO;
@@ -623,14 +669,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
623static int btree_writepage(struct page *page, struct writeback_control *wbc) 669static int btree_writepage(struct page *page, struct writeback_control *wbc)
624{ 670{
625 struct extent_io_tree *tree; 671 struct extent_io_tree *tree;
672 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
673 struct extent_buffer *eb;
674 int was_dirty;
675
626 tree = &BTRFS_I(page->mapping->host)->io_tree; 676 tree = &BTRFS_I(page->mapping->host)->io_tree;
677 if (!(current->flags & PF_MEMALLOC)) {
678 return extent_write_full_page(tree, page,
679 btree_get_extent, wbc);
680 }
627 681
628 if (current->flags & PF_MEMALLOC) { 682 redirty_page_for_writepage(wbc, page);
629 redirty_page_for_writepage(wbc, page); 683 eb = btrfs_find_tree_block(root, page_offset(page),
630 unlock_page(page); 684 PAGE_CACHE_SIZE);
631 return 0; 685 WARN_ON(!eb);
686
687 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
688 if (!was_dirty) {
689 spin_lock(&root->fs_info->delalloc_lock);
690 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
691 spin_unlock(&root->fs_info->delalloc_lock);
632 } 692 }
633 return extent_write_full_page(tree, page, btree_get_extent, wbc); 693 free_extent_buffer(eb);
694
695 unlock_page(page);
696 return 0;
634} 697}
635 698
636static int btree_writepages(struct address_space *mapping, 699static int btree_writepages(struct address_space *mapping,
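
btree_writepage is restructured around a new counter: instead of asking the extent io tree how many dirty bytes it tracks, the fs keeps fs_info->dirty_metadata_bytes up to date and gates it with the EXTENT_BUFFER_DIRTY bit, so the counter is only bumped on a clear-to-set transition and redirtying an already-dirty buffer is not double counted. A sketch of that idiom with invented names (my_info, my_buffer and MY_DIRTY_BIT stand in for the real fields):

#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define MY_DIRTY_BIT 0

struct my_info {
        spinlock_t lock;
        u64 dirty_bytes;
};

struct my_buffer {
        unsigned long bflags;
        u64 len;
};

/* account the buffer only when the dirty bit goes from clear to set */
static void my_buffer_mark_dirty(struct my_info *info, struct my_buffer *buf)
{
        if (!test_and_set_bit(MY_DIRTY_BIT, &buf->bflags)) {
                spin_lock(&info->lock);
                info->dirty_bytes += buf->len;
                spin_unlock(&info->lock);
        }
}

The payoff shows up in the next hunk: btree_writepages can read the counter without taking any lock ("this is a bit racy, but that's ok"), because a slightly stale value only shifts when background writeback kicks in, while every modification of the counter itself still happens under the spinlock.
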
@@ -639,15 +702,15 @@ static int btree_writepages(struct address_space *mapping,
639 struct extent_io_tree *tree; 702 struct extent_io_tree *tree;
640 tree = &BTRFS_I(mapping->host)->io_tree; 703 tree = &BTRFS_I(mapping->host)->io_tree;
641 if (wbc->sync_mode == WB_SYNC_NONE) { 704 if (wbc->sync_mode == WB_SYNC_NONE) {
705 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
642 u64 num_dirty; 706 u64 num_dirty;
643 u64 start = 0;
644 unsigned long thresh = 32 * 1024 * 1024; 707 unsigned long thresh = 32 * 1024 * 1024;
645 708
646 if (wbc->for_kupdate) 709 if (wbc->for_kupdate)
647 return 0; 710 return 0;
648 711
649 num_dirty = count_range_bits(tree, &start, (u64)-1, 712 /* this is a bit racy, but that's ok */
650 thresh, EXTENT_DIRTY); 713 num_dirty = root->fs_info->dirty_metadata_bytes;
651 if (num_dirty < thresh) 714 if (num_dirty < thresh)
652 return 0; 715 return 0;
653 } 716 }
@@ -812,11 +875,19 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
812 struct inode *btree_inode = root->fs_info->btree_inode; 875 struct inode *btree_inode = root->fs_info->btree_inode;
813 if (btrfs_header_generation(buf) == 876 if (btrfs_header_generation(buf) ==
814 root->fs_info->running_transaction->transid) { 877 root->fs_info->running_transaction->transid) {
815 WARN_ON(!btrfs_tree_locked(buf)); 878 btrfs_assert_tree_locked(buf);
879
880 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
881 spin_lock(&root->fs_info->delalloc_lock);
882 if (root->fs_info->dirty_metadata_bytes >= buf->len)
883 root->fs_info->dirty_metadata_bytes -= buf->len;
884 else
885 WARN_ON(1);
886 spin_unlock(&root->fs_info->delalloc_lock);
887 }
816 888
817 /* ugh, clear_extent_buffer_dirty can be expensive */ 889 /* ugh, clear_extent_buffer_dirty needs to lock the page */
818 btrfs_set_lock_blocking(buf); 890 btrfs_set_lock_blocking(buf);
819
820 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 891 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
821 buf); 892 buf);
822 } 893 }
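
clean_tree_block carries the teardown half of the same accounting: the counter is decremented only when EXTENT_BUFFER_DIRTY actually flips from set to clear, and a WARN_ON fires if the subtraction would underflow, which would mean the mark and clear sides have drifted apart. Extending the my_info/my_buffer sketch shown after the btree_writepage hunk:

/* undo the accounting only on the set-to-clear transition */
static void my_buffer_clear_dirty(struct my_info *info, struct my_buffer *buf)
{
        if (test_and_clear_bit(MY_DIRTY_BIT, &buf->bflags)) {
                spin_lock(&info->lock);
                if (info->dirty_bytes >= buf->len)
                        info->dirty_bytes -= buf->len;
                else
                        WARN_ON(1);     /* mark/clear sides out of sync */
                spin_unlock(&info->lock);
        }
}

The same pairing appears twice more below, in btrfs_mark_buffer_dirty and in btree_lock_page_hook, which is why the bit test, not the io tree state, is what gates the arithmetic.
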
@@ -1342,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
1342 1413
1343 ret = extent_range_uptodate(io_tree, start + length, 1414 ret = extent_range_uptodate(io_tree, start + length,
1344 start + buf_len - 1); 1415 start + buf_len - 1);
1345 if (ret == 1)
1346 return ret;
1347 return ret; 1416 return ret;
1348} 1417}
1349 1418
@@ -1426,12 +1495,6 @@ static int transaction_kthread(void *arg)
1426 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1495 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1427 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1496 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1428 1497
1429 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1430 printk(KERN_INFO "btrfs: total reference cache "
1431 "size %llu\n",
1432 root->fs_info->total_ref_cache_size);
1433 }
1434
1435 mutex_lock(&root->fs_info->trans_mutex); 1498 mutex_lock(&root->fs_info->trans_mutex);
1436 cur = root->fs_info->running_transaction; 1499 cur = root->fs_info->running_transaction;
1437 if (!cur) { 1500 if (!cur) {
@@ -1448,6 +1511,7 @@ static int transaction_kthread(void *arg)
1448 mutex_unlock(&root->fs_info->trans_mutex); 1511 mutex_unlock(&root->fs_info->trans_mutex);
1449 trans = btrfs_start_transaction(root, 1); 1512 trans = btrfs_start_transaction(root, 1);
1450 ret = btrfs_commit_transaction(trans, root); 1513 ret = btrfs_commit_transaction(trans, root);
1514
1451sleep: 1515sleep:
1452 wake_up_process(root->fs_info->cleaner_kthread); 1516 wake_up_process(root->fs_info->cleaner_kthread);
1453 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1517 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1507,6 +1571,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1507 INIT_LIST_HEAD(&fs_info->dead_roots); 1571 INIT_LIST_HEAD(&fs_info->dead_roots);
1508 INIT_LIST_HEAD(&fs_info->hashers); 1572 INIT_LIST_HEAD(&fs_info->hashers);
1509 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1573 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1574 INIT_LIST_HEAD(&fs_info->ordered_operations);
1510 spin_lock_init(&fs_info->delalloc_lock); 1575 spin_lock_init(&fs_info->delalloc_lock);
1511 spin_lock_init(&fs_info->new_trans_lock); 1576 spin_lock_init(&fs_info->new_trans_lock);
1512 spin_lock_init(&fs_info->ref_cache_lock); 1577 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1566,10 +1631,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1566 1631
1567 extent_io_tree_init(&fs_info->pinned_extents, 1632 extent_io_tree_init(&fs_info->pinned_extents,
1568 fs_info->btree_inode->i_mapping, GFP_NOFS); 1633 fs_info->btree_inode->i_mapping, GFP_NOFS);
1569 extent_io_tree_init(&fs_info->pending_del,
1570 fs_info->btree_inode->i_mapping, GFP_NOFS);
1571 extent_io_tree_init(&fs_info->extent_ins,
1572 fs_info->btree_inode->i_mapping, GFP_NOFS);
1573 fs_info->do_barriers = 1; 1634 fs_info->do_barriers = 1;
1574 1635
1575 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1636 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1582,15 +1643,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1582 insert_inode_hash(fs_info->btree_inode); 1643 insert_inode_hash(fs_info->btree_inode);
1583 1644
1584 mutex_init(&fs_info->trans_mutex); 1645 mutex_init(&fs_info->trans_mutex);
1646 mutex_init(&fs_info->ordered_operations_mutex);
1585 mutex_init(&fs_info->tree_log_mutex); 1647 mutex_init(&fs_info->tree_log_mutex);
1586 mutex_init(&fs_info->drop_mutex); 1648 mutex_init(&fs_info->drop_mutex);
1587 mutex_init(&fs_info->extent_ins_mutex);
1588 mutex_init(&fs_info->pinned_mutex);
1589 mutex_init(&fs_info->chunk_mutex); 1649 mutex_init(&fs_info->chunk_mutex);
1590 mutex_init(&fs_info->transaction_kthread_mutex); 1650 mutex_init(&fs_info->transaction_kthread_mutex);
1591 mutex_init(&fs_info->cleaner_mutex); 1651 mutex_init(&fs_info->cleaner_mutex);
1592 mutex_init(&fs_info->volume_mutex); 1652 mutex_init(&fs_info->volume_mutex);
1593 mutex_init(&fs_info->tree_reloc_mutex); 1653 mutex_init(&fs_info->tree_reloc_mutex);
1654
1655 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1656 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1657
1594 init_waitqueue_head(&fs_info->transaction_throttle); 1658 init_waitqueue_head(&fs_info->transaction_throttle);
1595 init_waitqueue_head(&fs_info->transaction_wait); 1659 init_waitqueue_head(&fs_info->transaction_wait);
1596 init_waitqueue_head(&fs_info->async_submit_wait); 1660 init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1777,7 +1841,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1777 ret = find_and_setup_root(tree_root, fs_info, 1841 ret = find_and_setup_root(tree_root, fs_info,
1778 BTRFS_DEV_TREE_OBJECTID, dev_root); 1842 BTRFS_DEV_TREE_OBJECTID, dev_root);
1779 dev_root->track_dirty = 1; 1843 dev_root->track_dirty = 1;
1780
1781 if (ret) 1844 if (ret)
1782 goto fail_extent_root; 1845 goto fail_extent_root;
1783 1846
@@ -2314,10 +2377,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2314 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2377 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2315 u64 transid = btrfs_header_generation(buf); 2378 u64 transid = btrfs_header_generation(buf);
2316 struct inode *btree_inode = root->fs_info->btree_inode; 2379 struct inode *btree_inode = root->fs_info->btree_inode;
2380 int was_dirty;
2317 2381
2318 btrfs_set_lock_blocking(buf); 2382 btrfs_assert_tree_locked(buf);
2319
2320 WARN_ON(!btrfs_tree_locked(buf));
2321 if (transid != root->fs_info->generation) { 2383 if (transid != root->fs_info->generation) {
2322 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2384 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2323 "found %llu running %llu\n", 2385 "found %llu running %llu\n",
@@ -2326,7 +2388,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2326 (unsigned long long)root->fs_info->generation); 2388 (unsigned long long)root->fs_info->generation);
2327 WARN_ON(1); 2389 WARN_ON(1);
2328 } 2390 }
2329 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 2391 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2392 buf);
2393 if (!was_dirty) {
2394 spin_lock(&root->fs_info->delalloc_lock);
2395 root->fs_info->dirty_metadata_bytes += buf->len;
2396 spin_unlock(&root->fs_info->delalloc_lock);
2397 }
2330} 2398}
2331 2399
2332void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2400void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2341,7 +2409,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2341 unsigned long thresh = 32 * 1024 * 1024; 2409 unsigned long thresh = 32 * 1024 * 1024;
2342 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 2410 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2343 2411
2344 if (current_is_pdflush() || current->flags & PF_MEMALLOC) 2412 if (current->flags & PF_MEMALLOC)
2345 return; 2413 return;
2346 2414
2347 num_dirty = count_range_bits(tree, &start, (u64)-1, 2415 num_dirty = count_range_bits(tree, &start, (u64)-1,
@@ -2366,6 +2434,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2366int btree_lock_page_hook(struct page *page) 2434int btree_lock_page_hook(struct page *page)
2367{ 2435{
2368 struct inode *inode = page->mapping->host; 2436 struct inode *inode = page->mapping->host;
2437 struct btrfs_root *root = BTRFS_I(inode)->root;
2369 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2438 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2370 struct extent_buffer *eb; 2439 struct extent_buffer *eb;
2371 unsigned long len; 2440 unsigned long len;
@@ -2381,6 +2450,16 @@ int btree_lock_page_hook(struct page *page)
2381 2450
2382 btrfs_tree_lock(eb); 2451 btrfs_tree_lock(eb);
2383 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2452 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2453
2454 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2455 spin_lock(&root->fs_info->delalloc_lock);
2456 if (root->fs_info->dirty_metadata_bytes >= eb->len)
2457 root->fs_info->dirty_metadata_bytes -= eb->len;
2458 else
2459 WARN_ON(1);
2460 spin_unlock(&root->fs_info->delalloc_lock);
2461 }
2462
2384 btrfs_tree_unlock(eb); 2463 btrfs_tree_unlock(eb);
2385 free_extent_buffer(eb); 2464 free_extent_buffer(eb);
2386out: 2465out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56eb2986..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); 76int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf); 77int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root, 78int wait_on_tree_block_writeback(struct btrfs_root *root,
@@ -101,4 +102,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 102int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root); 103 struct btrfs_root *root);
103int btree_lock_page_hook(struct page *page); 104int btree_lock_page_hook(struct page *page);
105
106
107#ifdef CONFIG_DEBUG_LOCK_ALLOC
108void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
109#else
110static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
111 int level)
112{
113}
114#endif
104#endif 115#endif
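
The header change uses the standard kernel idiom for a debug-only hook: declare the real function when the config option is set, and provide an empty static inline otherwise, so callers compile unchanged and the call vanishes entirely in non-debug builds. The same shape with invented names (CONFIG_MY_DEBUG, my_debug_hook and struct my_object are placeholders, not btrfs symbols):

struct my_object;

#ifdef CONFIG_MY_DEBUG
void my_debug_hook(struct my_object *obj, int level);
#else
static inline void my_debug_hook(struct my_object *obj, int level)
{
}
#endif

This keeps every call site free of #ifdefs; only the one definition in the .c file sits behind the config symbol.
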
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523c2d2d..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -20,6 +20,7 @@
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h>
23#include "compat.h" 24#include "compat.h"
24#include "hash.h" 25#include "hash.h"
25#include "crc32c.h" 26#include "crc32c.h"
@@ -30,6 +31,7 @@
30#include "volumes.h" 31#include "volumes.h"
31#include "locking.h" 32#include "locking.h"
32#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
33 35
34#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
35#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -48,17 +50,27 @@ struct pending_extent_op {
48 int del; 50 int del;
49}; 51};
50 52
51static int finish_current_insert(struct btrfs_trans_handle *trans, 53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
52 struct btrfs_root *extent_root, int all); 54 struct btrfs_root *root, u64 parent,
53static int del_pending_extents(struct btrfs_trans_handle *trans, 55 u64 root_objectid, u64 ref_generation,
54 struct btrfs_root *extent_root, int all); 56 u64 owner, struct btrfs_key *ins,
55static int pin_down_bytes(struct btrfs_trans_handle *trans, 57 int ref_mod);
56 struct btrfs_root *root, 58static int update_reserved_extents(struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int is_data); 59 u64 bytenr, u64 num, int reserve);
58static int update_block_group(struct btrfs_trans_handle *trans, 60static int update_block_group(struct btrfs_trans_handle *trans,
59 struct btrfs_root *root, 61 struct btrfs_root *root,
60 u64 bytenr, u64 num_bytes, int alloc, 62 u64 bytenr, u64 num_bytes, int alloc,
61 int mark_free); 63 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation,
68 u64 owner_objectid, int pin,
69 int ref_to_drop);
70
71static int do_chunk_alloc(struct btrfs_trans_handle *trans,
72 struct btrfs_root *extent_root, u64 alloc_bytes,
73 u64 flags, int force);
62 74
63static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 75static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
64{ 76{
@@ -155,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
155 u64 extent_start, extent_end, size; 167 u64 extent_start, extent_end, size;
156 int ret; 168 int ret;
157 169
158 mutex_lock(&info->pinned_mutex);
159 while (start < end) { 170 while (start < end) {
160 ret = find_first_extent_bit(&info->pinned_extents, start, 171 ret = find_first_extent_bit(&info->pinned_extents, start,
161 &extent_start, &extent_end, 172 &extent_start, &extent_end,
@@ -181,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
181 ret = btrfs_add_free_space(block_group, start, size); 192 ret = btrfs_add_free_space(block_group, start, size);
182 BUG_ON(ret); 193 BUG_ON(ret);
183 } 194 }
184 mutex_unlock(&info->pinned_mutex);
185 195
186 return 0; 196 return 0;
187} 197}
@@ -280,8 +290,8 @@ next:
280 block_group->key.objectid + 290 block_group->key.objectid +
281 block_group->key.offset); 291 block_group->key.offset);
282 292
283 remove_sb_from_cache(root, block_group);
284 block_group->cached = 1; 293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
285 ret = 0; 295 ret = 0;
286err: 296err:
287 btrfs_free_path(path); 297 btrfs_free_path(path);
@@ -315,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
315 return cache; 325 return cache;
316} 326}
317 327
318static inline void put_block_group(struct btrfs_block_group_cache *cache) 328void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
319{ 329{
320 if (atomic_dec_and_test(&cache->count)) 330 if (atomic_dec_and_test(&cache->count))
321 kfree(cache); 331 kfree(cache);
@@ -326,13 +336,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326{ 336{
327 struct list_head *head = &info->space_info; 337 struct list_head *head = &info->space_info;
328 struct btrfs_space_info *found; 338 struct btrfs_space_info *found;
329 list_for_each_entry(found, head, list) { 339
330 if (found->flags == flags) 340 rcu_read_lock();
341 list_for_each_entry_rcu(found, head, list) {
342 if (found->flags == flags) {
343 rcu_read_unlock();
331 return found; 344 return found;
345 }
332 } 346 }
347 rcu_read_unlock();
333 return NULL; 348 return NULL;
334} 349}
335 350
351/*
352 * after adding space to the filesystem, we need to clear the full flags
353 * on all the space infos.
354 */
355void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
356{
357 struct list_head *head = &info->space_info;
358 struct btrfs_space_info *found;
359
360 rcu_read_lock();
361 list_for_each_entry_rcu(found, head, list)
362 found->full = 0;
363 rcu_read_unlock();
364}
365
336static u64 div_factor(u64 num, int factor) 366static u64 div_factor(u64 num, int factor)
337{ 367{
338 if (factor == 10) 368 if (factor == 10)
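
__find_space_info and the new btrfs_clear_space_info_full walk the space_info list under rcu_read_lock() with list_for_each_entry_rcu() instead of depending on the callers' locking. The matching writer-side change, publishing a new entry with list_add_rcu only after it is fully initialized, is in the update_space_info hunk near the end of this file's diff. A reduced reader-side sketch with invented types; note that, as in the real code, the pointer is still used after rcu_read_unlock() only because these objects are assumed never to be freed:

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct my_space_info {
        u64 flags;
        u64 total_bytes;
        struct list_head list;
};

/* lockless lookup; safe against a concurrent list_add_rcu() */
static struct my_space_info *my_find_space_info(struct list_head *head,
                                                u64 flags)
{
        struct my_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
                        rcu_read_unlock();
                        /* valid only because entries are never freed */
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}
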
@@ -368,12 +398,12 @@ again:
368 div_factor(cache->key.offset, factor)) { 398 div_factor(cache->key.offset, factor)) {
369 group_start = cache->key.objectid; 399 group_start = cache->key.objectid;
370 spin_unlock(&cache->lock); 400 spin_unlock(&cache->lock);
371 put_block_group(cache); 401 btrfs_put_block_group(cache);
372 goto found; 402 goto found;
373 } 403 }
374 } 404 }
375 spin_unlock(&cache->lock); 405 spin_unlock(&cache->lock);
376 put_block_group(cache); 406 btrfs_put_block_group(cache);
377 cond_resched(); 407 cond_resched();
378 } 408 }
379 if (!wrapped) { 409 if (!wrapped) {
@@ -529,262 +559,13 @@ out:
529 return ret; 559 return ret;
530} 560}
531 561
532/*
533 * updates all the backrefs that are pending on update_list for the
534 * extent_root
535 */
536static noinline int update_backrefs(struct btrfs_trans_handle *trans,
537 struct btrfs_root *extent_root,
538 struct btrfs_path *path,
539 struct list_head *update_list)
540{
541 struct btrfs_key key;
542 struct btrfs_extent_ref *ref;
543 struct btrfs_fs_info *info = extent_root->fs_info;
544 struct pending_extent_op *op;
545 struct extent_buffer *leaf;
546 int ret = 0;
547 struct list_head *cur = update_list->next;
548 u64 ref_objectid;
549 u64 ref_root = extent_root->root_key.objectid;
550
551 op = list_entry(cur, struct pending_extent_op, list);
552
553search:
554 key.objectid = op->bytenr;
555 key.type = BTRFS_EXTENT_REF_KEY;
556 key.offset = op->orig_parent;
557
558 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
559 BUG_ON(ret);
560
561 leaf = path->nodes[0];
562
563loop:
564 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
565
566 ref_objectid = btrfs_ref_objectid(leaf, ref);
567
568 if (btrfs_ref_root(leaf, ref) != ref_root ||
569 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
570 (ref_objectid != op->level &&
571 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
572 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
573 "root %llu, owner %u\n",
574 (unsigned long long)op->bytenr,
575 (unsigned long long)op->orig_parent,
576 (unsigned long long)ref_root, op->level);
577 btrfs_print_leaf(extent_root, leaf);
578 BUG();
579 }
580
581 key.objectid = op->bytenr;
582 key.offset = op->parent;
583 key.type = BTRFS_EXTENT_REF_KEY;
584 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
585 BUG_ON(ret);
586 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
587 btrfs_set_ref_generation(leaf, ref, op->generation);
588
589 cur = cur->next;
590
591 list_del_init(&op->list);
592 unlock_extent(&info->extent_ins, op->bytenr,
593 op->bytenr + op->num_bytes - 1, GFP_NOFS);
594 kfree(op);
595
596 if (cur == update_list) {
597 btrfs_mark_buffer_dirty(path->nodes[0]);
598 btrfs_release_path(extent_root, path);
599 goto out;
600 }
601
602 op = list_entry(cur, struct pending_extent_op, list);
603
604 path->slots[0]++;
605 while (path->slots[0] < btrfs_header_nritems(leaf)) {
606 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
607 if (key.objectid == op->bytenr &&
608 key.type == BTRFS_EXTENT_REF_KEY)
609 goto loop;
610 path->slots[0]++;
611 }
612
613 btrfs_mark_buffer_dirty(path->nodes[0]);
614 btrfs_release_path(extent_root, path);
615 goto search;
616
617out:
618 return 0;
619}
620
621static noinline int insert_extents(struct btrfs_trans_handle *trans,
622 struct btrfs_root *extent_root,
623 struct btrfs_path *path,
624 struct list_head *insert_list, int nr)
625{
626 struct btrfs_key *keys;
627 u32 *data_size;
628 struct pending_extent_op *op;
629 struct extent_buffer *leaf;
630 struct list_head *cur = insert_list->next;
631 struct btrfs_fs_info *info = extent_root->fs_info;
632 u64 ref_root = extent_root->root_key.objectid;
633 int i = 0, last = 0, ret;
634 int total = nr * 2;
635
636 if (!nr)
637 return 0;
638
639 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
640 if (!keys)
641 return -ENOMEM;
642
643 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
644 if (!data_size) {
645 kfree(keys);
646 return -ENOMEM;
647 }
648
649 list_for_each_entry(op, insert_list, list) {
650 keys[i].objectid = op->bytenr;
651 keys[i].offset = op->num_bytes;
652 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
653 data_size[i] = sizeof(struct btrfs_extent_item);
654 i++;
655
656 keys[i].objectid = op->bytenr;
657 keys[i].offset = op->parent;
658 keys[i].type = BTRFS_EXTENT_REF_KEY;
659 data_size[i] = sizeof(struct btrfs_extent_ref);
660 i++;
661 }
662
663 op = list_entry(cur, struct pending_extent_op, list);
664 i = 0;
665 while (i < total) {
666 int c;
667 ret = btrfs_insert_some_items(trans, extent_root, path,
668 keys+i, data_size+i, total-i);
669 BUG_ON(ret < 0);
670
671 if (last && ret > 1)
672 BUG();
673
674 leaf = path->nodes[0];
675 for (c = 0; c < ret; c++) {
676 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
677
678 /*
679 * if the first item we inserted was a backref, then
680 * the EXTENT_ITEM will be the odd c's, else it will
681 * be the even c's
682 */
683 if ((ref_first && (c % 2)) ||
684 (!ref_first && !(c % 2))) {
685 struct btrfs_extent_item *itm;
686
687 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
688 struct btrfs_extent_item);
689 btrfs_set_extent_refs(path->nodes[0], itm, 1);
690 op->del++;
691 } else {
692 struct btrfs_extent_ref *ref;
693
694 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
695 struct btrfs_extent_ref);
696 btrfs_set_ref_root(leaf, ref, ref_root);
697 btrfs_set_ref_generation(leaf, ref,
698 op->generation);
699 btrfs_set_ref_objectid(leaf, ref, op->level);
700 btrfs_set_ref_num_refs(leaf, ref, 1);
701 op->del++;
702 }
703
704 /*
705 * using del to see when its ok to free up the
706 * pending_extent_op. In the case where we insert the
707 * last item on the list in order to help do batching
708 * we need to not free the extent op until we actually
709 * insert the extent_item
710 */
711 if (op->del == 2) {
712 unlock_extent(&info->extent_ins, op->bytenr,
713 op->bytenr + op->num_bytes - 1,
714 GFP_NOFS);
715 cur = cur->next;
716 list_del_init(&op->list);
717 kfree(op);
718 if (cur != insert_list)
719 op = list_entry(cur,
720 struct pending_extent_op,
721 list);
722 }
723 }
724 btrfs_mark_buffer_dirty(leaf);
725 btrfs_release_path(extent_root, path);
726
727 /*
728 * Ok backref's and items usually go right next to eachother,
729 * but if we could only insert 1 item that means that we
730 * inserted on the end of a leaf, and we have no idea what may
731 * be on the next leaf so we just play it safe. In order to
732 * try and help this case we insert the last thing on our
733 * insert list so hopefully it will end up being the last
734 * thing on the leaf and everything else will be before it,
735 * which will let us insert a whole bunch of items at the same
736 * time.
737 */
738 if (ret == 1 && !last && (i + ret < total)) {
739 /*
740 * last: where we will pick up the next time around
741 * i: our current key to insert, will be total - 1
742 * cur: the current op we are screwing with
743 * op: duh
744 */
745 last = i + ret;
746 i = total - 1;
747 cur = insert_list->prev;
748 op = list_entry(cur, struct pending_extent_op, list);
749 } else if (last) {
750 /*
751 * ok we successfully inserted the last item on the
752 * list, lets reset everything
753 *
754 * i: our current key to insert, so where we left off
755 * last time
756 * last: done with this
757 * cur: the op we are messing with
758 * op: duh
759 * total: since we inserted the last key, we need to
760 * decrement total so we dont overflow
761 */
762 i = last;
763 last = 0;
764 total--;
765 if (i < total) {
766 cur = insert_list->next;
767 op = list_entry(cur, struct pending_extent_op,
768 list);
769 }
770 } else {
771 i += ret;
772 }
773
774 cond_resched();
775 }
776 ret = 0;
777 kfree(keys);
778 kfree(data_size);
779 return ret;
780}
781
782static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
783 struct btrfs_root *root, 563 struct btrfs_root *root,
784 struct btrfs_path *path, 564 struct btrfs_path *path,
785 u64 bytenr, u64 parent, 565 u64 bytenr, u64 parent,
786 u64 ref_root, u64 ref_generation, 566 u64 ref_root, u64 ref_generation,
787 u64 owner_objectid) 567 u64 owner_objectid,
568 int refs_to_add)
788{ 569{
789 struct btrfs_key key; 570 struct btrfs_key key;
790 struct extent_buffer *leaf; 571 struct extent_buffer *leaf;
@@ -804,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
804 btrfs_set_ref_root(leaf, ref, ref_root); 585 btrfs_set_ref_root(leaf, ref, ref_root);
805 btrfs_set_ref_generation(leaf, ref, ref_generation); 586 btrfs_set_ref_generation(leaf, ref, ref_generation);
806 btrfs_set_ref_objectid(leaf, ref, owner_objectid); 587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
807 btrfs_set_ref_num_refs(leaf, ref, 1); 588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
808 } else if (ret == -EEXIST) { 589 } else if (ret == -EEXIST) {
809 u64 existing_owner; 590 u64 existing_owner;
591
810 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); 592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
811 leaf = path->nodes[0]; 593 leaf = path->nodes[0];
812 ref = btrfs_item_ptr(leaf, path->slots[0], 594 ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -820,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
820 602
821 num_refs = btrfs_ref_num_refs(leaf, ref); 603 num_refs = btrfs_ref_num_refs(leaf, ref);
822 BUG_ON(num_refs == 0); 604 BUG_ON(num_refs == 0);
823 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); 605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
824 606
825 existing_owner = btrfs_ref_objectid(leaf, ref); 607 existing_owner = btrfs_ref_objectid(leaf, ref);
826 if (existing_owner != owner_objectid && 608 if (existing_owner != owner_objectid &&
@@ -832,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
832 } else { 614 } else {
833 goto out; 615 goto out;
834 } 616 }
617 btrfs_unlock_up_safe(path, 1);
835 btrfs_mark_buffer_dirty(path->nodes[0]); 618 btrfs_mark_buffer_dirty(path->nodes[0]);
836out: 619out:
837 btrfs_release_path(root, path); 620 btrfs_release_path(root, path);
@@ -840,7 +623,8 @@ out:
840 623
841static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
842 struct btrfs_root *root, 625 struct btrfs_root *root,
843 struct btrfs_path *path) 626 struct btrfs_path *path,
627 int refs_to_drop)
844{ 628{
845 struct extent_buffer *leaf; 629 struct extent_buffer *leaf;
846 struct btrfs_extent_ref *ref; 630 struct btrfs_extent_ref *ref;
@@ -850,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
850 leaf = path->nodes[0]; 634 leaf = path->nodes[0];
851 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
852 num_refs = btrfs_ref_num_refs(leaf, ref); 636 num_refs = btrfs_ref_num_refs(leaf, ref);
853 BUG_ON(num_refs == 0); 637 BUG_ON(num_refs < refs_to_drop);
854 num_refs -= 1; 638 num_refs -= refs_to_drop;
855 if (num_refs == 0) { 639 if (num_refs == 0) {
856 ret = btrfs_del_item(trans, root, path); 640 ret = btrfs_del_item(trans, root, path);
857 } else { 641 } else {
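
remove_extent_backref, like insert_extent_backref above it, now takes an explicit delta (refs_to_drop / refs_to_add), so several identical updates queued for one extent can be folded into a single item modification. Stripped of the btrfs item plumbing, the arithmetic the two helpers perform is simply the following (struct my_ref_item is invented):

#include <linux/bug.h>
#include <linux/types.h>

struct my_ref_item {
        u32 num_refs;
};

static void my_add_refs(struct my_ref_item *item, u32 refs_to_add)
{
        item->num_refs += refs_to_add;
}

/* returns 1 when the item should be deleted from the tree */
static int my_drop_refs(struct my_ref_item *item, u32 refs_to_drop)
{
        BUG_ON(item->num_refs < refs_to_drop);
        item->num_refs -= refs_to_drop;
        return item->num_refs == 0;
}

The batching matters because the delayed-ref code further down hands these helpers a merged count (node->ref_mod) instead of replaying each +1/-1 against the extent tree.
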
@@ -902,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
902#endif 686#endif
903} 687}
904 688
905static noinline int free_extents(struct btrfs_trans_handle *trans,
906 struct btrfs_root *extent_root,
907 struct list_head *del_list)
908{
909 struct btrfs_fs_info *info = extent_root->fs_info;
910 struct btrfs_path *path;
911 struct btrfs_key key, found_key;
912 struct extent_buffer *leaf;
913 struct list_head *cur;
914 struct pending_extent_op *op;
915 struct btrfs_extent_item *ei;
916 int ret, num_to_del, extent_slot = 0, found_extent = 0;
917 u32 refs;
918 u64 bytes_freed = 0;
919
920 path = btrfs_alloc_path();
921 if (!path)
922 return -ENOMEM;
923 path->reada = 1;
924
925search:
926 /* search for the backref for the current ref we want to delete */
927 cur = del_list->next;
928 op = list_entry(cur, struct pending_extent_op, list);
929 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
930 op->orig_parent,
931 extent_root->root_key.objectid,
932 op->orig_generation, op->level, 1);
933 if (ret) {
934 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
935 "root %llu gen %llu owner %u\n",
936 (unsigned long long)op->bytenr,
937 (unsigned long long)extent_root->root_key.objectid,
938 (unsigned long long)op->orig_generation, op->level);
939 btrfs_print_leaf(extent_root, path->nodes[0]);
940 WARN_ON(1);
941 goto out;
942 }
943
944 extent_slot = path->slots[0];
945 num_to_del = 1;
946 found_extent = 0;
947
948 /*
949 * if we aren't the first item on the leaf we can move back one and see
950 * if our ref is right next to our extent item
951 */
952 if (likely(extent_slot)) {
953 extent_slot--;
954 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
955 extent_slot);
956 if (found_key.objectid == op->bytenr &&
957 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
958 found_key.offset == op->num_bytes) {
959 num_to_del++;
960 found_extent = 1;
961 }
962 }
963
964 /*
965 * if we didn't find the extent we need to delete the backref and then
966 * search for the extent item key so we can update its ref count
967 */
968 if (!found_extent) {
969 key.objectid = op->bytenr;
970 key.type = BTRFS_EXTENT_ITEM_KEY;
971 key.offset = op->num_bytes;
972
973 ret = remove_extent_backref(trans, extent_root, path);
974 BUG_ON(ret);
975 btrfs_release_path(extent_root, path);
976 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
977 BUG_ON(ret);
978 extent_slot = path->slots[0];
979 }
980
981 /* this is where we update the ref count for the extent */
982 leaf = path->nodes[0];
983 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
984 refs = btrfs_extent_refs(leaf, ei);
985 BUG_ON(refs == 0);
986 refs--;
987 btrfs_set_extent_refs(leaf, ei, refs);
988
989 btrfs_mark_buffer_dirty(leaf);
990
991 /*
992 * This extent needs deleting. The reason cur_slot is extent_slot +
993 * num_to_del is because extent_slot points to the slot where the extent
994 * is, and if the backref was not right next to the extent we will be
995 * deleting at least 1 item, and will want to start searching at the
996 * slot directly next to extent_slot. However if we did find the
997 * backref next to the extent item them we will be deleting at least 2
998 * items and will want to start searching directly after the ref slot
999 */
1000 if (!refs) {
1001 struct list_head *pos, *n, *end;
1002 int cur_slot = extent_slot+num_to_del;
1003 u64 super_used;
1004 u64 root_used;
1005
1006 path->slots[0] = extent_slot;
1007 bytes_freed = op->num_bytes;
1008
1009 mutex_lock(&info->pinned_mutex);
1010 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1011 op->num_bytes, op->level >=
1012 BTRFS_FIRST_FREE_OBJECTID);
1013 mutex_unlock(&info->pinned_mutex);
1014 BUG_ON(ret < 0);
1015 op->del = ret;
1016
1017 /*
1018 * we need to see if we can delete multiple things at once, so
1019 * start looping through the list of extents we are wanting to
1020 * delete and see if their extent/backref's are right next to
1021 * eachother and the extents only have 1 ref
1022 */
1023 for (pos = cur->next; pos != del_list; pos = pos->next) {
1024 struct pending_extent_op *tmp;
1025
1026 tmp = list_entry(pos, struct pending_extent_op, list);
1027
1028 /* we only want to delete extent+ref at this stage */
1029 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1030 break;
1031
1032 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1033 if (found_key.objectid != tmp->bytenr ||
1034 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1035 found_key.offset != tmp->num_bytes)
1036 break;
1037
1038 /* check to make sure this extent only has one ref */
1039 ei = btrfs_item_ptr(leaf, cur_slot,
1040 struct btrfs_extent_item);
1041 if (btrfs_extent_refs(leaf, ei) != 1)
1042 break;
1043
1044 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1045 if (found_key.objectid != tmp->bytenr ||
1046 found_key.type != BTRFS_EXTENT_REF_KEY ||
1047 found_key.offset != tmp->orig_parent)
1048 break;
1049
1050 /*
1051 * the ref is right next to the extent, we can set the
1052 * ref count to 0 since we will delete them both now
1053 */
1054 btrfs_set_extent_refs(leaf, ei, 0);
1055
1056 /* pin down the bytes for this extent */
1057 mutex_lock(&info->pinned_mutex);
1058 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1059 tmp->num_bytes, tmp->level >=
1060 BTRFS_FIRST_FREE_OBJECTID);
1061 mutex_unlock(&info->pinned_mutex);
1062 BUG_ON(ret < 0);
1063
1064 /*
1065 * use the del field to tell if we need to go ahead and
1066 * free up the extent when we delete the item or not.
1067 */
1068 tmp->del = ret;
1069 bytes_freed += tmp->num_bytes;
1070
1071 num_to_del += 2;
1072 cur_slot += 2;
1073 }
1074 end = pos;
1075
1076 /* update the free space counters */
1077 spin_lock(&info->delalloc_lock);
1078 super_used = btrfs_super_bytes_used(&info->super_copy);
1079 btrfs_set_super_bytes_used(&info->super_copy,
1080 super_used - bytes_freed);
1081
1082 root_used = btrfs_root_used(&extent_root->root_item);
1083 btrfs_set_root_used(&extent_root->root_item,
1084 root_used - bytes_freed);
1085 spin_unlock(&info->delalloc_lock);
1086
1087 /* delete the items */
1088 ret = btrfs_del_items(trans, extent_root, path,
1089 path->slots[0], num_to_del);
1090 BUG_ON(ret);
1091
1092 /*
1093 * loop through the extents we deleted and do the cleanup work
1094 * on them
1095 */
1096 for (pos = cur, n = pos->next; pos != end;
1097 pos = n, n = pos->next) {
1098 struct pending_extent_op *tmp;
1099 tmp = list_entry(pos, struct pending_extent_op, list);
1100
1101 /*
1102 * remember tmp->del tells us wether or not we pinned
1103 * down the extent
1104 */
1105 ret = update_block_group(trans, extent_root,
1106 tmp->bytenr, tmp->num_bytes, 0,
1107 tmp->del);
1108 BUG_ON(ret);
1109
1110 list_del_init(&tmp->list);
1111 unlock_extent(&info->extent_ins, tmp->bytenr,
1112 tmp->bytenr + tmp->num_bytes - 1,
1113 GFP_NOFS);
1114 kfree(tmp);
1115 }
1116 } else if (refs && found_extent) {
1117 /*
1118 * the ref and extent were right next to eachother, but the
1119 * extent still has a ref, so just free the backref and keep
1120 * going
1121 */
1122 ret = remove_extent_backref(trans, extent_root, path);
1123 BUG_ON(ret);
1124
1125 list_del_init(&op->list);
1126 unlock_extent(&info->extent_ins, op->bytenr,
1127 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1128 kfree(op);
1129 } else {
1130 /*
1131 * the extent has multiple refs and the backref we were looking
1132 * for was not right next to it, so just unlock and go next,
1133 * we're good to go
1134 */
1135 list_del_init(&op->list);
1136 unlock_extent(&info->extent_ins, op->bytenr,
1137 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1138 kfree(op);
1139 }
1140
1141 btrfs_release_path(extent_root, path);
1142 if (!list_empty(del_list))
1143 goto search;
1144
1145out:
1146 btrfs_free_path(path);
1147 return ret;
1148}
1149
1150static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1151 struct btrfs_root *root, u64 bytenr, 690 struct btrfs_root *root, u64 bytenr,
691 u64 num_bytes,
1152 u64 orig_parent, u64 parent, 692 u64 orig_parent, u64 parent,
1153 u64 orig_root, u64 ref_root, 693 u64 orig_root, u64 ref_root,
1154 u64 orig_generation, u64 ref_generation, 694 u64 orig_generation, u64 ref_generation,
1155 u64 owner_objectid) 695 u64 owner_objectid)
1156{ 696{
1157 int ret; 697 int ret;
1158 struct btrfs_root *extent_root = root->fs_info->extent_root; 698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
1159 struct btrfs_path *path;
1160
1161 if (root == root->fs_info->extent_root) {
1162 struct pending_extent_op *extent_op;
1163 u64 num_bytes;
1164
1165 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1166 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1167 mutex_lock(&root->fs_info->extent_ins_mutex);
1168 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1169 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1170 u64 priv;
1171 ret = get_state_private(&root->fs_info->extent_ins,
1172 bytenr, &priv);
1173 BUG_ON(ret);
1174 extent_op = (struct pending_extent_op *)
1175 (unsigned long)priv;
1176 BUG_ON(extent_op->parent != orig_parent);
1177 BUG_ON(extent_op->generation != orig_generation);
1178 699
1179 extent_op->parent = parent; 700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
1180 extent_op->generation = ref_generation; 701 orig_parent, parent, orig_root,
1181 } else { 702 ref_root, orig_generation,
1182 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 703 ref_generation, owner_objectid, pin);
1183 BUG_ON(!extent_op);
1184
1185 extent_op->type = PENDING_BACKREF_UPDATE;
1186 extent_op->bytenr = bytenr;
1187 extent_op->num_bytes = num_bytes;
1188 extent_op->parent = parent;
1189 extent_op->orig_parent = orig_parent;
1190 extent_op->generation = ref_generation;
1191 extent_op->orig_generation = orig_generation;
1192 extent_op->level = (int)owner_objectid;
1193 INIT_LIST_HEAD(&extent_op->list);
1194 extent_op->del = 0;
1195
1196 set_extent_bits(&root->fs_info->extent_ins,
1197 bytenr, bytenr + num_bytes - 1,
1198 EXTENT_WRITEBACK, GFP_NOFS);
1199 set_state_private(&root->fs_info->extent_ins,
1200 bytenr, (unsigned long)extent_op);
1201 }
1202 mutex_unlock(&root->fs_info->extent_ins_mutex);
1203 return 0;
1204 }
1205
1206 path = btrfs_alloc_path();
1207 if (!path)
1208 return -ENOMEM;
1209 ret = lookup_extent_backref(trans, extent_root, path,
1210 bytenr, orig_parent, orig_root,
1211 orig_generation, owner_objectid, 1);
1212 if (ret)
1213 goto out;
1214 ret = remove_extent_backref(trans, extent_root, path);
1215 if (ret)
1216 goto out;
1217 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1218 parent, ref_root, ref_generation,
1219 owner_objectid);
1220 BUG_ON(ret); 704 BUG_ON(ret);
1221 finish_current_insert(trans, extent_root, 0);
1222 del_pending_extents(trans, extent_root, 0);
1223out:
1224 btrfs_free_path(path);
1225 return ret; 705 return ret;
1226} 706}
1227 707
1228int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1229 struct btrfs_root *root, u64 bytenr, 709 struct btrfs_root *root, u64 bytenr,
1230 u64 orig_parent, u64 parent, 710 u64 num_bytes, u64 orig_parent, u64 parent,
1231 u64 ref_root, u64 ref_generation, 711 u64 ref_root, u64 ref_generation,
1232 u64 owner_objectid) 712 u64 owner_objectid)
1233{ 713{
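
__btrfs_update_extent_ref and __btrfs_inc_extent_ref no longer touch the extent tree at all; they record the change with btrfs_update_delayed_ref / btrfs_add_delayed_ref and leave the on-disk update to be applied later, merged with any other pending updates to the same extent. A reduced "record now, apply later" model follows; the list, the structure and the names are invented for illustration, while the real implementation, as the hunks below show, keeps a per-transaction rbtree keyed by bytenr with one head node per extent:

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_delayed_ref {
        struct list_head list;
        u64 bytenr;
        u64 num_bytes;
        int ref_mod;            /* net reference delta to apply later */
};

struct my_delayed_refs {
        spinlock_t lock;
        struct list_head head;
};

/* queue a +N/-N instead of editing the on-disk tree right away;
 * repeated updates to the same extent just adjust ref_mod
 */
static int my_add_delayed_ref(struct my_delayed_refs *refs,
                              u64 bytenr, u64 num_bytes, int mod)
{
        struct my_delayed_ref *ref, *new_ref;

        new_ref = kzalloc(sizeof(*new_ref), GFP_NOFS);
        if (!new_ref)
                return -ENOMEM;
        new_ref->bytenr = bytenr;
        new_ref->num_bytes = num_bytes;
        new_ref->ref_mod = mod;

        spin_lock(&refs->lock);
        list_for_each_entry(ref, &refs->head, list) {
                if (ref->bytenr == bytenr) {
                        ref->ref_mod += mod;
                        spin_unlock(&refs->lock);
                        kfree(new_ref);
                        return 0;
                }
        }
        list_add_tail(&new_ref->list, &refs->head);
        spin_unlock(&refs->lock);
        return 0;
}

The merge-by-bytenr step is the point of the exercise: a run of cow operations on one tree block collapses into a single net ref_mod, and the extent tree is modified once when the delayed refs are run (see btrfs_run_delayed_refs further down).
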
@@ -1235,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1235 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1236 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1237 return 0; 717 return 0;
1238 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, 718
1239 parent, ref_root, ref_root, 719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
1240 ref_generation, ref_generation, 720 orig_parent, parent, ref_root,
1241 owner_objectid); 721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
1242 return ret; 723 return ret;
1243} 724}
1244
1245static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1246 struct btrfs_root *root, u64 bytenr, 726 struct btrfs_root *root, u64 bytenr,
727 u64 num_bytes,
1247 u64 orig_parent, u64 parent, 728 u64 orig_parent, u64 parent,
1248 u64 orig_root, u64 ref_root, 729 u64 orig_root, u64 ref_root,
1249 u64 orig_generation, u64 ref_generation, 730 u64 orig_generation, u64 ref_generation,
1250 u64 owner_objectid) 731 u64 owner_objectid)
1251{ 732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{
1252 struct btrfs_path *path; 748 struct btrfs_path *path;
1253 int ret; 749 int ret;
1254 struct btrfs_key key; 750 struct btrfs_key key;
@@ -1261,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1261 return -ENOMEM; 757 return -ENOMEM;
1262 758
1263 path->reada = 1; 759 path->reada = 1;
760 path->leave_spinning = 1;
1264 key.objectid = bytenr; 761 key.objectid = bytenr;
1265 key.type = BTRFS_EXTENT_ITEM_KEY; 762 key.type = BTRFS_EXTENT_ITEM_KEY;
1266 key.offset = (u64)-1; 763 key.offset = num_bytes;
1267 764
1268 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 765 /* first find the extent item and update its reference count */
1269 0, 1); 766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1270 if (ret < 0) 767 path, 0, 1);
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
1271 return ret; 770 return ret;
1272 BUG_ON(ret == 0 || path->slots[0] == 0); 771 }
1273 772
1274 path->slots[0]--; 773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
1275 l = path->nodes[0]; 778 l = path->nodes[0];
1276 779
1277 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 780 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1285,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1285 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1286 789
1287 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
1288 refs = btrfs_extent_refs(l, item); 792 refs = btrfs_extent_refs(l, item);
1289 btrfs_set_extent_refs(l, item, refs + 1); 793 btrfs_set_extent_refs(l, item, refs + refs_to_add);
794 btrfs_unlock_up_safe(path, 1);
795
1290 btrfs_mark_buffer_dirty(path->nodes[0]); 796 btrfs_mark_buffer_dirty(path->nodes[0]);
1291 797
1292 btrfs_release_path(root->fs_info->extent_root, path); 798 btrfs_release_path(root->fs_info->extent_root, path);
1293 799
1294 path->reada = 1; 800 path->reada = 1;
801 path->leave_spinning = 1;
802
803 /* now insert the actual backref */
1295 ret = insert_extent_backref(trans, root->fs_info->extent_root, 804 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1296 path, bytenr, parent, 805 path, bytenr, parent,
1297 ref_root, ref_generation, 806 ref_root, ref_generation,
1298 owner_objectid); 807 owner_objectid, refs_to_add);
1299 BUG_ON(ret); 808 BUG_ON(ret);
1300 finish_current_insert(trans, root->fs_info->extent_root, 0);
1301 del_pending_extents(trans, root->fs_info->extent_root, 0);
1302
1303 btrfs_free_path(path); 809 btrfs_free_path(path);
1304 return 0; 810 return 0;
1305} 811}
@@ -1314,51 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1314 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 820 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1315 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1316 return 0; 822 return 0;
1317 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, 823
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
1318 0, ref_root, 0, ref_generation, 825 0, ref_root, 0, ref_generation,
1319 owner_objectid); 826 owner_objectid);
1320 return ret; 827 return ret;
1321} 828}
1322 829
1323int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 830static int drop_delayed_ref(struct btrfs_trans_handle *trans,
1324 struct btrfs_root *root) 831 struct btrfs_root *root,
832 struct btrfs_delayed_ref_node *node)
833{
834 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
836
837 BUG_ON(node->ref_mod == 0);
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
839 node->parent, ref->root, ref->generation,
840 ref->owner_objectid, ref->pin, node->ref_mod);
841
842 return ret;
843}
844
845/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node,
849 int insert_reserved)
1325{ 850{
1326 finish_current_insert(trans, root->fs_info->extent_root, 1); 851 int ret;
1327 del_pending_extents(trans, root->fs_info->extent_root, 1); 852 struct btrfs_delayed_ref *ref;
853
854 if (node->parent == (u64)-1) {
855 struct btrfs_delayed_ref_head *head;
856 /*
857 * we've hit the end of the chain and we were supposed
858 * to insert this extent into the tree. But, it got
859 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting
861 */
862 if (insert_reserved) {
863 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0);
865 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex);
868 return 0;
869 }
870
871 ref = btrfs_delayed_node_to_ref(node);
872 if (ref->action == BTRFS_ADD_DELAYED_REF) {
873 if (insert_reserved) {
874 struct btrfs_key ins;
875
876 ins.objectid = node->bytenr;
877 ins.offset = node->num_bytes;
878 ins.type = BTRFS_EXTENT_ITEM_KEY;
879
880 /* record the full extent allocation */
881 ret = __btrfs_alloc_reserved_extent(trans, root,
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
898 }
1328 return 0; 899 return 0;
1329} 900}
1330 901
1331int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 902static noinline struct btrfs_delayed_ref_node *
1332 struct btrfs_root *root, u64 bytenr, 903select_delayed_ref(struct btrfs_delayed_ref_head *head)
1333 u64 num_bytes, u32 *refs)
1334{ 904{
1335 struct btrfs_path *path; 905 struct rb_node *node;
906 struct btrfs_delayed_ref_node *ref;
907 int action = BTRFS_ADD_DELAYED_REF;
908again:
909 /*
910 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
911 * this prevents ref count from going down to zero when
912 * there still are pending delayed ref.
913 */
914 node = rb_prev(&head->node.rb_node);
915 while (1) {
916 if (!node)
917 break;
918 ref = rb_entry(node, struct btrfs_delayed_ref_node,
919 rb_node);
920 if (ref->bytenr != head->node.bytenr)
921 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action)
923 return ref;
924 node = rb_prev(node);
925 }
926 if (action == BTRFS_ADD_DELAYED_REF) {
927 action = BTRFS_DROP_DELAYED_REF;
928 goto again;
929 }
930 return NULL;
931}
932
933static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
934 struct btrfs_root *root,
935 struct list_head *cluster)
936{
937 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL;
1336 int ret; 940 int ret;
1337 struct btrfs_key key; 941 int count = 0;
1338 struct extent_buffer *l; 942 int must_insert_reserved = 0;
1339 struct btrfs_extent_item *item;
1340 943
1341 WARN_ON(num_bytes < root->sectorsize); 944 delayed_refs = &trans->transaction->delayed_refs;
1342 path = btrfs_alloc_path(); 945 while (1) {
1343 path->reada = 1; 946 if (!locked_ref) {
1344 key.objectid = bytenr; 947 /* pick a new head ref from the cluster list */
1345 key.offset = num_bytes; 948 if (list_empty(cluster))
1346 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 949 break;
1347 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 950
1348 0, 0); 951 locked_ref = list_entry(cluster->next,
1349 if (ret < 0) 952 struct btrfs_delayed_ref_head, cluster);
1350 goto out; 953
1351 if (ret != 0) { 954 /* grab the lock that says we are going to process
1352 btrfs_print_leaf(root, path->nodes[0]); 955 * all the refs for this head */
1353 printk(KERN_INFO "btrfs failed to find block number %llu\n", 956 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1354 (unsigned long long)bytenr); 957
1355 BUG(); 958 /*
959 * we may have dropped the spin lock to get the head
960 * mutex lock, and that might have given someone else
961 * time to free the head. If that's true, it has been
962 * removed from our list and we can move on.
963 */
964 if (ret == -EAGAIN) {
965 locked_ref = NULL;
966 count++;
967 continue;
968 }
969 }
970
971 /*
972 * record the must insert reserved flag before we
973 * drop the spin lock.
974 */
975 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0;
977
978 /*
979 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates
981 */
982 ref = select_delayed_ref(locked_ref);
983 if (!ref) {
984 /* All delayed refs have been processed, Go ahead
985 * and send the head node to run_one_delayed_ref,
986 * so that any accounting fixes can happen
987 */
988 ref = &locked_ref->node;
989 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL;
991 }
992
993 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--;
996 spin_unlock(&delayed_refs->lock);
997
998 ret = run_one_delayed_ref(trans, root, ref,
999 must_insert_reserved);
1000 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002
1003 count++;
1004 cond_resched();
1005 spin_lock(&delayed_refs->lock);
1006 }
1007 return count;
1008}
1009
1010/*
1011 * this starts processing the delayed reference count updates and
1012 * extent insertions we have queued up so far. count can be
1013 * 0, which means to process everything in the tree at the start
1014 * of the run (but not newly added entries), or it can be some target
1015 * number you'd like to process.
1016 */
1017int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1018 struct btrfs_root *root, unsigned long count)
1019{
1020 struct rb_node *node;
1021 struct btrfs_delayed_ref_root *delayed_refs;
1022 struct btrfs_delayed_ref_node *ref;
1023 struct list_head cluster;
1024 int ret;
1025 int run_all = count == (unsigned long)-1;
1026 int run_most = 0;
1027
1028 if (root == root->fs_info->extent_root)
1029 root = root->fs_info->tree_root;
1030
1031 delayed_refs = &trans->transaction->delayed_refs;
1032 INIT_LIST_HEAD(&cluster);
1033again:
1034 spin_lock(&delayed_refs->lock);
1035 if (count == 0) {
1036 count = delayed_refs->num_entries * 2;
1037 run_most = 1;
1038 }
1039 while (1) {
1040 if (!(run_all || run_most) &&
1041 delayed_refs->num_heads_ready < 64)
1042 break;
1043
1044 /*
1045 * go find something we can process in the rbtree. We start at
1046 * the beginning of the tree, and then build a cluster
1047 * of refs to process starting at the first one we are able to
1048 * lock
1049 */
1050 ret = btrfs_find_ref_cluster(trans, &cluster,
1051 delayed_refs->run_delayed_start);
1052 if (ret)
1053 break;
1054
1055 ret = run_clustered_refs(trans, root, &cluster);
1056 BUG_ON(ret < 0);
1057
1058 count -= min_t(unsigned long, ret, count);
1059
1060 if (count == 0)
1061 break;
1062 }
1063
1064 if (run_all) {
1065 node = rb_first(&delayed_refs->root);
1066 if (!node)
1067 goto out;
1068 count = (unsigned long)-1;
1069
1070 while (node) {
1071 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1072 rb_node);
1073 if (btrfs_delayed_ref_is_head(ref)) {
1074 struct btrfs_delayed_ref_head *head;
1075
1076 head = btrfs_delayed_node_to_head(ref);
1077 atomic_inc(&ref->refs);
1078
1079 spin_unlock(&delayed_refs->lock);
1080 mutex_lock(&head->mutex);
1081 mutex_unlock(&head->mutex);
1082
1083 btrfs_put_delayed_ref(ref);
1084 cond_resched();
1085 goto again;
1086 }
1087 node = rb_next(node);
1088 }
1089 spin_unlock(&delayed_refs->lock);
1090 schedule_timeout(1);
1091 goto again;
1356 } 1092 }
1357 l = path->nodes[0];
1358 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1359 *refs = btrfs_extent_refs(l, item);
1360out: 1093out:
1361 btrfs_free_path(path); 1094 spin_unlock(&delayed_refs->lock);
1362 return 0; 1095 return 0;
1363} 1096}
1364 1097
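
select_delayed_ref encodes an ordering rule: for a given extent, run every queued BTRFS_ADD_DELAYED_REF entry before any drop, so the on-disk reference count can never transiently reach zero (and free the extent) while additions are still pending. run_clustered_refs then erases each chosen node from the rbtree and releases the delayed_refs spinlock while the update is applied, retaking it before picking the next one. The selection rule on its own, modelled over a flat array instead of the rbtree (struct my_dref and the action values are invented):

#include <linux/types.h>

#define MY_ADD_REF      1
#define MY_DROP_REF     2

struct my_dref {
        u64 bytenr;
        int action;             /* MY_ADD_REF or MY_DROP_REF */
};

/* pick the next ref to run for @bytenr: every add before any drop */
static struct my_dref *my_select_ref(struct my_dref *refs, int nr, u64 bytenr)
{
        int action = MY_ADD_REF;
        int pass, i;

        for (pass = 0; pass < 2; pass++) {
                for (i = 0; i < nr; i++) {
                        if (refs[i].bytenr != bytenr)
                                continue;
                        if (refs[i].action == action)
                                return &refs[i];
                }
                action = MY_DROP_REF;
        }
        return NULL;
}

In the real code the candidates for one extent sit next to each other in the per-transaction rbtree, so the scan is a short walk backwards from the head node rather than a full pass, and the head's mutex (taken via btrfs_delayed_ref_lock) is what keeps two tasks from running refs for the same extent at once.
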
@@ -1582,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1582 int refi = 0; 1315 int refi = 0;
1583 int slot; 1316 int slot;
1584 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1585 u64, u64, u64, u64, u64, u64, u64, u64); 1318 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1586 1319
1587 ref_root = btrfs_header_owner(buf); 1320 ref_root = btrfs_header_owner(buf);
1588 ref_generation = btrfs_header_generation(buf); 1321 ref_generation = btrfs_header_generation(buf);
@@ -1654,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1654 1387
1655 if (level == 0) { 1388 if (level == 0) {
1656 btrfs_item_key_to_cpu(buf, &key, slot); 1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1657 1396
1658 ret = process_func(trans, root, bytenr, 1397 ret = process_func(trans, root, bytenr,
1659 orig_buf->start, buf->start, 1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1660 orig_root, ref_root, 1399 orig_buf->start, buf->start,
1661 orig_generation, ref_generation, 1400 orig_root, ref_root,
1662 key.objectid); 1401 orig_generation, ref_generation,
1402 key.objectid);
1663 1403
1664 if (ret) { 1404 if (ret) {
1665 faili = slot; 1405 faili = slot;
@@ -1667,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1667 goto fail; 1407 goto fail;
1668 } 1408 }
1669 } else { 1409 } else {
1670 ret = process_func(trans, root, bytenr, 1410 ret = process_func(trans, root, bytenr, buf->len,
1671 orig_buf->start, buf->start, 1411 orig_buf->start, buf->start,
1672 orig_root, ref_root, 1412 orig_root, ref_root,
1673 orig_generation, ref_generation, 1413 orig_generation, ref_generation,
@@ -1744,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
1744 if (bytenr == 0) 1484 if (bytenr == 0)
1745 continue; 1485 continue;
1746 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1747 orig_buf->start, buf->start, 1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1748 orig_root, ref_root, 1488 orig_buf->start, buf->start,
1749 orig_generation, ref_generation, 1489 orig_root, ref_root, orig_generation,
1750 key.objectid); 1490 ref_generation, key.objectid);
1751 if (ret) 1491 if (ret)
1752 goto fail; 1492 goto fail;
1753 } else { 1493 } else {
1754 bytenr = btrfs_node_blockptr(buf, slot); 1494 bytenr = btrfs_node_blockptr(buf, slot);
1755 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1756 orig_buf->start, buf->start, 1496 buf->len, orig_buf->start,
1757 orig_root, ref_root, 1497 buf->start, orig_root, ref_root,
1758 orig_generation, ref_generation, 1498 orig_generation, ref_generation,
1759 level - 1); 1499 level - 1);
1760 if (ret) 1500 if (ret)
@@ -1773,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1773 struct btrfs_block_group_cache *cache) 1513 struct btrfs_block_group_cache *cache)
1774{ 1514{
1775 int ret; 1515 int ret;
1776 int pending_ret;
1777 struct btrfs_root *extent_root = root->fs_info->extent_root; 1516 struct btrfs_root *extent_root = root->fs_info->extent_root;
1778 unsigned long bi; 1517 unsigned long bi;
1779 struct extent_buffer *leaf; 1518 struct extent_buffer *leaf;
@@ -1789,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1789 btrfs_mark_buffer_dirty(leaf); 1528 btrfs_mark_buffer_dirty(leaf);
1790 btrfs_release_path(extent_root, path); 1529 btrfs_release_path(extent_root, path);
1791fail: 1530fail:
1792 finish_current_insert(trans, extent_root, 0);
1793 pending_ret = del_pending_extents(trans, extent_root, 0);
1794 if (ret) 1531 if (ret)
1795 return ret; 1532 return ret;
1796 if (pending_ret)
1797 return pending_ret;
1798 return 0; 1533 return 0;
1799 1534
1800} 1535}
@@ -1858,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1858 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1859 readonly = 1; 1594 readonly = 1;
1860 if (block_group) 1595 if (block_group)
1861 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1862 return readonly; 1597 return readonly;
1863} 1598}
1864 1599
@@ -1882,7 +1617,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1882 if (!found) 1617 if (!found)
1883 return -ENOMEM; 1618 return -ENOMEM;
1884 1619
1885 list_add(&found->list, &info->space_info);
1886 INIT_LIST_HEAD(&found->block_groups); 1620 INIT_LIST_HEAD(&found->block_groups);
1887 init_rwsem(&found->groups_sem); 1621 init_rwsem(&found->groups_sem);
1888 spin_lock_init(&found->lock); 1622 spin_lock_init(&found->lock);
@@ -1892,9 +1626,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1892 found->bytes_pinned = 0; 1626 found->bytes_pinned = 0;
1893 found->bytes_reserved = 0; 1627 found->bytes_reserved = 0;
1894 found->bytes_readonly = 0; 1628 found->bytes_readonly = 0;
1629 found->bytes_delalloc = 0;
1895 found->full = 0; 1630 found->full = 0;
1896 found->force_alloc = 0; 1631 found->force_alloc = 0;
1897 *space_info = found; 1632 *space_info = found;
1633 list_add_rcu(&found->list, &info->space_info);
1898 return 0; 1634 return 0;
1899} 1635}
1900 1636
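Moving the list insertion to the end of update_space_info() and switching it to list_add_rcu() matters because the space_info list can then be walked without a lock, so every field must be initialized before the entry becomes visible. A hedged sketch of such a lockless reader; the helper name is illustrative, and found->flags is assumed from the surrounding struct definition rather than shown in this hunk:

static struct btrfs_space_info *example_find_space_info_rcu(
				struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, &info->space_info, list) {
		if (found->flags == flags) {
			/* safe: entries are only published fully initialized */
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}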
@@ -1955,6 +1691,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1955 return flags; 1691 return flags;
1956} 1692}
1957 1693
1694static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1695{
1696 struct btrfs_fs_info *info = root->fs_info;
1697 u64 alloc_profile;
1698
1699 if (data) {
1700 alloc_profile = info->avail_data_alloc_bits &
1701 info->data_alloc_profile;
1702 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1703 } else if (root == root->fs_info->chunk_root) {
1704 alloc_profile = info->avail_system_alloc_bits &
1705 info->system_alloc_profile;
1706 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1707 } else {
1708 alloc_profile = info->avail_metadata_alloc_bits &
1709 info->metadata_alloc_profile;
1710 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1711 }
1712
1713 return btrfs_reduce_alloc_profile(root, data);
1714}
1715
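btrfs_get_alloc_profile() maps an allocation class onto one block-group type bit plus whichever RAID bits are both available and allowed. The two lines below mirror the call sites this patch adds; the RAID1 outcome in the comment is only an illustrative example (say, a two-device filesystem with mirrored metadata), not something taken from this hunk.

	u64 data_target = btrfs_get_alloc_profile(root, 1);	/* BTRFS_BLOCK_GROUP_DATA | profile bits */
	u64 meta_target = btrfs_get_alloc_profile(root, 0);	/* e.g. METADATA | RAID1; SYSTEM on the chunk root */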
1716void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
1717{
1718 u64 alloc_target;
1719
1720 alloc_target = btrfs_get_alloc_profile(root, 1);
1721 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
1722 alloc_target);
1723}
1724
1725/*
1726 * for now this just makes sure we have at least 5% of our metadata space free
1727 * for use.
1728 */
1729int btrfs_check_metadata_free_space(struct btrfs_root *root)
1730{
1731 struct btrfs_fs_info *info = root->fs_info;
1732 struct btrfs_space_info *meta_sinfo;
1733 u64 alloc_target, thresh;
1734 int committed = 0, ret;
1735
1736 /* get the space info for where the metadata will live */
1737 alloc_target = btrfs_get_alloc_profile(root, 0);
1738 meta_sinfo = __find_space_info(info, alloc_target);
1739
1740again:
1741 spin_lock(&meta_sinfo->lock);
1742 if (!meta_sinfo->full)
1743 thresh = meta_sinfo->total_bytes * 80;
1744 else
1745 thresh = meta_sinfo->total_bytes * 95;
1746
1747 do_div(thresh, 100);
1748
1749 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
1750 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
1751 struct btrfs_trans_handle *trans;
1752 if (!meta_sinfo->full) {
1753 meta_sinfo->force_alloc = 1;
1754 spin_unlock(&meta_sinfo->lock);
1755
1756 trans = btrfs_start_transaction(root, 1);
1757 if (!trans)
1758 return -ENOMEM;
1759
1760 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1761 2 * 1024 * 1024, alloc_target, 0);
1762 btrfs_end_transaction(trans, root);
1763 goto again;
1764 }
1765 spin_unlock(&meta_sinfo->lock);
1766
1767 if (!committed) {
1768 committed = 1;
1769 trans = btrfs_join_transaction(root, 1);
1770 if (!trans)
1771 return -ENOMEM;
1772 ret = btrfs_commit_transaction(trans, root);
1773 if (ret)
1774 return ret;
1775 goto again;
1776 }
1777 return -ENOSPC;
1778 }
1779 spin_unlock(&meta_sinfo->lock);
1780
1781 return 0;
1782}
1783
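A worked example of the threshold above, with illustrative numbers: 8 GiB of metadata space that is not yet full.

	thresh = 8589934592ULL * 80;	/* 8 GiB * 80 */
	do_div(thresh, 100);		/* 6871947673 bytes, roughly 6.4 GiB */

Only once bytes_used + bytes_reserved + bytes_pinned + bytes_readonly exceed that ~6.4 GiB does the function react, and even then it first forces a chunk allocation (while the space is not full) and then a transaction commit before returning -ENOSPC.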
1784/*
1785 * This will check the space that the inode allocates from to make sure we have
1786 * enough space for bytes.
1787 */
1788int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1789 u64 bytes)
1790{
1791 struct btrfs_space_info *data_sinfo;
1792 int ret = 0, committed = 0;
1793
1794 /* make sure bytes are sectorsize aligned */
1795 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1796
1797 data_sinfo = BTRFS_I(inode)->space_info;
1798again:
1799 /* make sure we have enough space to handle the data first */
1800 spin_lock(&data_sinfo->lock);
1801 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
1802 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
1803 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
1804 data_sinfo->bytes_may_use < bytes) {
1805 struct btrfs_trans_handle *trans;
1806
1807 /*
1808 * if we don't have enough free bytes in this space then we need
1809 * to alloc a new chunk.
1810 */
1811 if (!data_sinfo->full) {
1812 u64 alloc_target;
1813
1814 data_sinfo->force_alloc = 1;
1815 spin_unlock(&data_sinfo->lock);
1816
1817 alloc_target = btrfs_get_alloc_profile(root, 1);
1818 trans = btrfs_start_transaction(root, 1);
1819 if (!trans)
1820 return -ENOMEM;
1821
1822 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1823 bytes + 2 * 1024 * 1024,
1824 alloc_target, 0);
1825 btrfs_end_transaction(trans, root);
1826 if (ret)
1827 return ret;
1828 goto again;
1829 }
1830 spin_unlock(&data_sinfo->lock);
1831
1832 /* commit the current transaction and try again */
1833 if (!committed) {
1834 committed = 1;
1835 trans = btrfs_join_transaction(root, 1);
1836 if (!trans)
1837 return -ENOMEM;
1838 ret = btrfs_commit_transaction(trans, root);
1839 if (ret)
1840 return ret;
1841 goto again;
1842 }
1843
1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1845 ", %llu bytes_used, %llu bytes_reserved, "
1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use, "
1847 "%llu total\n", bytes, data_sinfo->bytes_delalloc,
1848 data_sinfo->bytes_used, data_sinfo->bytes_reserved,
1849 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
1850 data_sinfo->bytes_may_use, data_sinfo->total_bytes);
1851 return -ENOSPC;
1852 }
1853 data_sinfo->bytes_may_use += bytes;
1854 BTRFS_I(inode)->reserved_bytes += bytes;
1855 spin_unlock(&data_sinfo->lock);
1856
1857 return btrfs_check_metadata_free_space(root);
1858}
1859
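The round-up at the top of btrfs_check_data_free_space() is the usual power-of-two alignment; a worked example assuming a 4096-byte sectorsize and a 5000-byte request (both illustrative):

	bytes = (5000 + 4096 - 1) & ~((u64)4096 - 1);	/* = 8192 */

The aligned request then has to fit into total_bytes minus everything already spoken for (used, delalloc, reserved, pinned, readonly and may_use); if it does not, the function first tries to allocate a new data chunk and then to commit the running transaction before giving up with -ENOSPC.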
1860/*
1861 * if there was an error for whatever reason after calling
1862 * btrfs_check_data_free_space, call this so we can cleanup the counters.
1863 */
1864void btrfs_free_reserved_data_space(struct btrfs_root *root,
1865 struct inode *inode, u64 bytes)
1866{
1867 struct btrfs_space_info *data_sinfo;
1868
1869 /* make sure bytes are sectorsize aligned */
1870 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1871
1872 data_sinfo = BTRFS_I(inode)->space_info;
1873 spin_lock(&data_sinfo->lock);
1874 data_sinfo->bytes_may_use -= bytes;
1875 BTRFS_I(inode)->reserved_bytes -= bytes;
1876 spin_unlock(&data_sinfo->lock);
1877}
1878
1879/* called when we are adding a delalloc extent to the inode's io_tree */
1880void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1881 u64 bytes)
1882{
1883 struct btrfs_space_info *data_sinfo;
1884
1885 /* get the space info for where this inode will be storing its data */
1886 data_sinfo = BTRFS_I(inode)->space_info;
1887
1888 /* make sure we have enough space to handle the data first */
1889 spin_lock(&data_sinfo->lock);
1890 data_sinfo->bytes_delalloc += bytes;
1891
1892 /*
1893 * we are adding a delalloc extent without calling
1894 * btrfs_check_data_free_space first. This happens on a weird
1895 * writepage condition, but shouldn't hurt our accounting
1896 */
1897 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
1898 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
1899 BTRFS_I(inode)->reserved_bytes = 0;
1900 } else {
1901 data_sinfo->bytes_may_use -= bytes;
1902 BTRFS_I(inode)->reserved_bytes -= bytes;
1903 }
1904
1905 spin_unlock(&data_sinfo->lock);
1906}
1907
1908/* called when we are clearing a delalloc extent from the inode's io_tree */
1909void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1910 u64 bytes)
1911{
1912 struct btrfs_space_info *info;
1913
1914 info = BTRFS_I(inode)->space_info;
1915
1916 spin_lock(&info->lock);
1917 info->bytes_delalloc -= bytes;
1918 spin_unlock(&info->lock);
1919}
1920
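A hedged sketch of how the four hooks above are expected to pair up in a write path; the wrapper name is illustrative and not part of this patch.

static int example_reserve_data(struct btrfs_root *root,
				struct inode *inode, u64 bytes)
{
	int ret;

	/* charges data_sinfo->bytes_may_use and the inode's reserved_bytes */
	ret = btrfs_check_data_free_space(root, inode, bytes);
	if (ret)
		return ret;

	/*
	 * When the pages are later tagged delalloc, the io_tree set-bit
	 * hook calls btrfs_delalloc_reserve_space(), converting the
	 * may_use charge into bytes_delalloc; the clear-bit hook later
	 * drops it with btrfs_delalloc_free_space().  If we bail out
	 * before the delalloc bit is ever set, the reservation is given
	 * back instead:
	 *
	 *	btrfs_free_reserved_data_space(root, inode, bytes);
	 */
	return 0;
}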
1958static int do_chunk_alloc(struct btrfs_trans_handle *trans, 1921static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *extent_root, u64 alloc_bytes, 1922 struct btrfs_root *extent_root, u64 alloc_bytes,
1960 u64 flags, int force) 1923 u64 flags, int force)
@@ -2054,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2054 WARN_ON(ret); 2017 WARN_ON(ret);
2055 } 2018 }
2056 } 2019 }
2057 put_block_group(cache); 2020 btrfs_put_block_group(cache);
2058 total -= num_bytes; 2021 total -= num_bytes;
2059 bytenr += num_bytes; 2022 bytenr += num_bytes;
2060 } 2023 }
@@ -2071,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2071 return 0; 2034 return 0;
2072 2035
2073 bytenr = cache->key.objectid; 2036 bytenr = cache->key.objectid;
2074 put_block_group(cache); 2037 btrfs_put_block_group(cache);
2075 2038
2076 return bytenr; 2039 return bytenr;
2077} 2040}
@@ -2083,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2083 struct btrfs_block_group_cache *cache; 2046 struct btrfs_block_group_cache *cache;
2084 struct btrfs_fs_info *fs_info = root->fs_info; 2047 struct btrfs_fs_info *fs_info = root->fs_info;
2085 2048
2086 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2087 if (pin) { 2049 if (pin) {
2088 set_extent_dirty(&fs_info->pinned_extents, 2050 set_extent_dirty(&fs_info->pinned_extents,
2089 bytenr, bytenr + num - 1, GFP_NOFS); 2051 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2091,6 +2053,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2091 clear_extent_dirty(&fs_info->pinned_extents, 2053 clear_extent_dirty(&fs_info->pinned_extents,
2092 bytenr, bytenr + num - 1, GFP_NOFS); 2054 bytenr, bytenr + num - 1, GFP_NOFS);
2093 } 2055 }
2056
2094 while (num > 0) { 2057 while (num > 0) {
2095 cache = btrfs_lookup_block_group(fs_info, bytenr); 2058 cache = btrfs_lookup_block_group(fs_info, bytenr);
2096 BUG_ON(!cache); 2059 BUG_ON(!cache);
@@ -2115,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2115 if (cache->cached) 2078 if (cache->cached)
2116 btrfs_add_free_space(cache, bytenr, len); 2079 btrfs_add_free_space(cache, bytenr, len);
2117 } 2080 }
2118 put_block_group(cache); 2081 btrfs_put_block_group(cache);
2119 bytenr += len; 2082 bytenr += len;
2120 num -= len; 2083 num -= len;
2121 } 2084 }
@@ -2146,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2146 } 2109 }
2147 spin_unlock(&cache->lock); 2110 spin_unlock(&cache->lock);
2148 spin_unlock(&cache->space_info->lock); 2111 spin_unlock(&cache->space_info->lock);
2149 put_block_group(cache); 2112 btrfs_put_block_group(cache);
2150 bytenr += len; 2113 bytenr += len;
2151 num -= len; 2114 num -= len;
2152 } 2115 }
@@ -2161,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2161 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2124 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2162 int ret; 2125 int ret;
2163 2126
2164 mutex_lock(&root->fs_info->pinned_mutex);
2165 while (1) { 2127 while (1) {
2166 ret = find_first_extent_bit(pinned_extents, last, 2128 ret = find_first_extent_bit(pinned_extents, last,
2167 &start, &end, EXTENT_DIRTY); 2129 &start, &end, EXTENT_DIRTY);
@@ -2170,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2170 set_extent_dirty(copy, start, end, GFP_NOFS); 2132 set_extent_dirty(copy, start, end, GFP_NOFS);
2171 last = end + 1; 2133 last = end + 1;
2172 } 2134 }
2173 mutex_unlock(&root->fs_info->pinned_mutex);
2174 return 0; 2135 return 0;
2175} 2136}
2176 2137
@@ -2182,7 +2143,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2182 u64 end; 2143 u64 end;
2183 int ret; 2144 int ret;
2184 2145
2185 mutex_lock(&root->fs_info->pinned_mutex);
2186 while (1) { 2146 while (1) {
2187 ret = find_first_extent_bit(unpin, 0, &start, &end, 2147 ret = find_first_extent_bit(unpin, 0, &start, &end,
2188 EXTENT_DIRTY); 2148 EXTENT_DIRTY);
@@ -2191,215 +2151,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2191 2151
2192 ret = btrfs_discard_extent(root, start, end + 1 - start); 2152 ret = btrfs_discard_extent(root, start, end + 1 - start);
2193 2153
2154 /* unlocks the pinned mutex */
2194 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2155 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2195 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2156 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2196 2157
2197 if (need_resched()) { 2158 cond_resched();
2198 mutex_unlock(&root->fs_info->pinned_mutex);
2199 cond_resched();
2200 mutex_lock(&root->fs_info->pinned_mutex);
2201 }
2202 } 2159 }
2203 mutex_unlock(&root->fs_info->pinned_mutex);
2204 return ret; 2160 return ret;
2205} 2161}
2206 2162
2207static int finish_current_insert(struct btrfs_trans_handle *trans,
2208 struct btrfs_root *extent_root, int all)
2209{
2210 u64 start;
2211 u64 end;
2212 u64 priv;
2213 u64 search = 0;
2214 u64 skipped = 0;
2215 struct btrfs_fs_info *info = extent_root->fs_info;
2216 struct btrfs_path *path;
2217 struct pending_extent_op *extent_op, *tmp;
2218 struct list_head insert_list, update_list;
2219 int ret;
2220 int num_inserts = 0, max_inserts;
2221
2222 path = btrfs_alloc_path();
2223 INIT_LIST_HEAD(&insert_list);
2224 INIT_LIST_HEAD(&update_list);
2225
2226 max_inserts = extent_root->leafsize /
2227 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2228 sizeof(struct btrfs_extent_ref) +
2229 sizeof(struct btrfs_extent_item));
2230again:
2231 mutex_lock(&info->extent_ins_mutex);
2232 while (1) {
2233 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2234 &end, EXTENT_WRITEBACK);
2235 if (ret) {
2236 if (skipped && all && !num_inserts &&
2237 list_empty(&update_list)) {
2238 skipped = 0;
2239 search = 0;
2240 continue;
2241 }
2242 mutex_unlock(&info->extent_ins_mutex);
2243 break;
2244 }
2245
2246 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2247 if (!ret) {
2248 skipped = 1;
2249 search = end + 1;
2250 if (need_resched()) {
2251 mutex_unlock(&info->extent_ins_mutex);
2252 cond_resched();
2253 mutex_lock(&info->extent_ins_mutex);
2254 }
2255 continue;
2256 }
2257
2258 ret = get_state_private(&info->extent_ins, start, &priv);
2259 BUG_ON(ret);
2260 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2261
2262 if (extent_op->type == PENDING_EXTENT_INSERT) {
2263 num_inserts++;
2264 list_add_tail(&extent_op->list, &insert_list);
2265 search = end + 1;
2266 if (num_inserts == max_inserts) {
2267 mutex_unlock(&info->extent_ins_mutex);
2268 break;
2269 }
2270 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2271 list_add_tail(&extent_op->list, &update_list);
2272 search = end + 1;
2273 } else {
2274 BUG();
2275 }
2276 }
2277
2278 /*
2279 * process the update list, clear the writeback bit for it, and if
2280 * somebody marked this thing for deletion then just unlock it and be
2281 * done, the free_extents will handle it
2282 */
2283 mutex_lock(&info->extent_ins_mutex);
2284 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2285 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2286 extent_op->bytenr + extent_op->num_bytes - 1,
2287 EXTENT_WRITEBACK, GFP_NOFS);
2288 if (extent_op->del) {
2289 list_del_init(&extent_op->list);
2290 unlock_extent(&info->extent_ins, extent_op->bytenr,
2291 extent_op->bytenr + extent_op->num_bytes
2292 - 1, GFP_NOFS);
2293 kfree(extent_op);
2294 }
2295 }
2296 mutex_unlock(&info->extent_ins_mutex);
2297
2298 /*
2299 * still have things left on the update list, go ahead and update
2300 * everything
2301 */
2302 if (!list_empty(&update_list)) {
2303 ret = update_backrefs(trans, extent_root, path, &update_list);
2304 BUG_ON(ret);
2305 }
2306
2307 /*
2308 * if no inserts need to be done, but we skipped some extents and we
2309 * need to make sure everything is cleaned then reset everything and
2310 * go back to the beginning
2311 */
2312 if (!num_inserts && all && skipped) {
2313 search = 0;
2314 skipped = 0;
2315 INIT_LIST_HEAD(&update_list);
2316 INIT_LIST_HEAD(&insert_list);
2317 goto again;
2318 } else if (!num_inserts) {
2319 goto out;
2320 }
2321
2322 /*
2323 * process the insert extents list. Again if we are deleting this
2324 * extent, then just unlock it, pin down the bytes if need be, and be
2325 * done with it. Saves us from having to actually insert the extent
2326 * into the tree and then subsequently come along and delete it
2327 */
2328 mutex_lock(&info->extent_ins_mutex);
2329 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2330 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2331 extent_op->bytenr + extent_op->num_bytes - 1,
2332 EXTENT_WRITEBACK, GFP_NOFS);
2333 if (extent_op->del) {
2334 u64 used;
2335 list_del_init(&extent_op->list);
2336 unlock_extent(&info->extent_ins, extent_op->bytenr,
2337 extent_op->bytenr + extent_op->num_bytes
2338 - 1, GFP_NOFS);
2339
2340 mutex_lock(&extent_root->fs_info->pinned_mutex);
2341 ret = pin_down_bytes(trans, extent_root,
2342 extent_op->bytenr,
2343 extent_op->num_bytes, 0);
2344 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2345
2346 spin_lock(&info->delalloc_lock);
2347 used = btrfs_super_bytes_used(&info->super_copy);
2348 btrfs_set_super_bytes_used(&info->super_copy,
2349 used - extent_op->num_bytes);
2350 used = btrfs_root_used(&extent_root->root_item);
2351 btrfs_set_root_used(&extent_root->root_item,
2352 used - extent_op->num_bytes);
2353 spin_unlock(&info->delalloc_lock);
2354
2355 ret = update_block_group(trans, extent_root,
2356 extent_op->bytenr,
2357 extent_op->num_bytes,
2358 0, ret > 0);
2359 BUG_ON(ret);
2360 kfree(extent_op);
2361 num_inserts--;
2362 }
2363 }
2364 mutex_unlock(&info->extent_ins_mutex);
2365
2366 ret = insert_extents(trans, extent_root, path, &insert_list,
2367 num_inserts);
2368 BUG_ON(ret);
2369
2370 /*
2371 * if we broke out of the loop in order to insert stuff because we hit
2372 * the maximum number of inserts at a time we can handle, then loop
2373 * back and pick up where we left off
2374 */
2375 if (num_inserts == max_inserts) {
2376 INIT_LIST_HEAD(&insert_list);
2377 INIT_LIST_HEAD(&update_list);
2378 num_inserts = 0;
2379 goto again;
2380 }
2381
2382 /*
2383 * again, if we need to make absolutely sure there are no more pending
2384 * extent operations left and we know that we skipped some, go back to
2385 * the beginning and do it all again
2386 */
2387 if (all && skipped) {
2388 INIT_LIST_HEAD(&insert_list);
2389 INIT_LIST_HEAD(&update_list);
2390 search = 0;
2391 skipped = 0;
2392 num_inserts = 0;
2393 goto again;
2394 }
2395out:
2396 btrfs_free_path(path);
2397 return 0;
2398}
2399
2400static int pin_down_bytes(struct btrfs_trans_handle *trans, 2163static int pin_down_bytes(struct btrfs_trans_handle *trans,
2401 struct btrfs_root *root, 2164 struct btrfs_root *root,
2402 u64 bytenr, u64 num_bytes, int is_data) 2165 struct btrfs_path *path,
2166 u64 bytenr, u64 num_bytes, int is_data,
2167 struct extent_buffer **must_clean)
2403{ 2168{
2404 int err = 0; 2169 int err = 0;
2405 struct extent_buffer *buf; 2170 struct extent_buffer *buf;
@@ -2422,17 +2187,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2422 u64 header_transid = btrfs_header_generation(buf); 2187 u64 header_transid = btrfs_header_generation(buf);
2423 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2188 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2424 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2189 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2190 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2425 header_transid == trans->transid && 2191 header_transid == trans->transid &&
2426 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2192 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2427 clean_tree_block(NULL, root, buf); 2193 *must_clean = buf;
2428 btrfs_tree_unlock(buf);
2429 free_extent_buffer(buf);
2430 return 1; 2194 return 1;
2431 } 2195 }
2432 btrfs_tree_unlock(buf); 2196 btrfs_tree_unlock(buf);
2433 } 2197 }
2434 free_extent_buffer(buf); 2198 free_extent_buffer(buf);
2435pinit: 2199pinit:
2200 btrfs_set_path_blocking(path);
2201 /* unlocks the pinned mutex */
2436 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2202 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2437 2203
2438 BUG_ON(err < 0); 2204 BUG_ON(err < 0);
@@ -2446,7 +2212,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2446 struct btrfs_root *root, 2212 struct btrfs_root *root,
2447 u64 bytenr, u64 num_bytes, u64 parent, 2213 u64 bytenr, u64 num_bytes, u64 parent,
2448 u64 root_objectid, u64 ref_generation, 2214 u64 root_objectid, u64 ref_generation,
2449 u64 owner_objectid, int pin, int mark_free) 2215 u64 owner_objectid, int pin, int mark_free,
2216 int refs_to_drop)
2450{ 2217{
2451 struct btrfs_path *path; 2218 struct btrfs_path *path;
2452 struct btrfs_key key; 2219 struct btrfs_key key;
@@ -2468,6 +2235,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2468 return -ENOMEM; 2235 return -ENOMEM;
2469 2236
2470 path->reada = 1; 2237 path->reada = 1;
2238 path->leave_spinning = 1;
2471 ret = lookup_extent_backref(trans, extent_root, path, 2239 ret = lookup_extent_backref(trans, extent_root, path,
2472 bytenr, parent, root_objectid, 2240 bytenr, parent, root_objectid,
2473 ref_generation, owner_objectid, 1); 2241 ref_generation, owner_objectid, 1);
@@ -2489,9 +2257,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2489 break; 2257 break;
2490 } 2258 }
2491 if (!found_extent) { 2259 if (!found_extent) {
2492 ret = remove_extent_backref(trans, extent_root, path); 2260 ret = remove_extent_backref(trans, extent_root, path,
2261 refs_to_drop);
2493 BUG_ON(ret); 2262 BUG_ON(ret);
2494 btrfs_release_path(extent_root, path); 2263 btrfs_release_path(extent_root, path);
2264 path->leave_spinning = 1;
2495 ret = btrfs_search_slot(trans, extent_root, 2265 ret = btrfs_search_slot(trans, extent_root,
2496 &key, path, -1, 1); 2266 &key, path, -1, 1);
2497 if (ret) { 2267 if (ret) {
@@ -2507,8 +2277,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2507 btrfs_print_leaf(extent_root, path->nodes[0]); 2277 btrfs_print_leaf(extent_root, path->nodes[0]);
2508 WARN_ON(1); 2278 WARN_ON(1);
2509 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2279 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2510 "root %llu gen %llu owner %llu\n", 2280 "parent %llu root %llu gen %llu owner %llu\n",
2511 (unsigned long long)bytenr, 2281 (unsigned long long)bytenr,
2282 (unsigned long long)parent,
2512 (unsigned long long)root_objectid, 2283 (unsigned long long)root_objectid,
2513 (unsigned long long)ref_generation, 2284 (unsigned long long)ref_generation,
2514 (unsigned long long)owner_objectid); 2285 (unsigned long long)owner_objectid);
@@ -2518,17 +2289,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2518 ei = btrfs_item_ptr(leaf, extent_slot, 2289 ei = btrfs_item_ptr(leaf, extent_slot,
2519 struct btrfs_extent_item); 2290 struct btrfs_extent_item);
2520 refs = btrfs_extent_refs(leaf, ei); 2291 refs = btrfs_extent_refs(leaf, ei);
2521 BUG_ON(refs == 0);
2522 refs -= 1;
2523 btrfs_set_extent_refs(leaf, ei, refs);
2524 2292
2293 /*
2294 * we're not allowed to delete the extent item if there
2295 * are other delayed ref updates pending
2296 */
2297
2298 BUG_ON(refs < refs_to_drop);
2299 refs -= refs_to_drop;
2300 btrfs_set_extent_refs(leaf, ei, refs);
2525 btrfs_mark_buffer_dirty(leaf); 2301 btrfs_mark_buffer_dirty(leaf);
2526 2302
2527 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { 2303 if (refs == 0 && found_extent &&
2304 path->slots[0] == extent_slot + 1) {
2528 struct btrfs_extent_ref *ref; 2305 struct btrfs_extent_ref *ref;
2529 ref = btrfs_item_ptr(leaf, path->slots[0], 2306 ref = btrfs_item_ptr(leaf, path->slots[0],
2530 struct btrfs_extent_ref); 2307 struct btrfs_extent_ref);
2531 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); 2308 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2532 /* if the back ref and the extent are next to each other 2309 /* if the back ref and the extent are next to each other
2533 * they get deleted below in one shot 2310 * they get deleted below in one shot
2534 */ 2311 */
@@ -2536,11 +2313,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2536 num_to_del = 2; 2313 num_to_del = 2;
2537 } else if (found_extent) { 2314 } else if (found_extent) {
2538 /* otherwise delete the extent back ref */ 2315 /* otherwise delete the extent back ref */
2539 ret = remove_extent_backref(trans, extent_root, path); 2316 ret = remove_extent_backref(trans, extent_root, path,
2317 refs_to_drop);
2540 BUG_ON(ret); 2318 BUG_ON(ret);
2541 /* if refs are 0, we need to setup the path for deletion */ 2319 /* if refs are 0, we need to setup the path for deletion */
2542 if (refs == 0) { 2320 if (refs == 0) {
2543 btrfs_release_path(extent_root, path); 2321 btrfs_release_path(extent_root, path);
2322 path->leave_spinning = 1;
2544 ret = btrfs_search_slot(trans, extent_root, &key, path, 2323 ret = btrfs_search_slot(trans, extent_root, &key, path,
2545 -1, 1); 2324 -1, 1);
2546 BUG_ON(ret); 2325 BUG_ON(ret);
@@ -2550,16 +2329,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2550 if (refs == 0) { 2329 if (refs == 0) {
2551 u64 super_used; 2330 u64 super_used;
2552 u64 root_used; 2331 u64 root_used;
2332 struct extent_buffer *must_clean = NULL;
2553 2333
2554 if (pin) { 2334 if (pin) {
2555 mutex_lock(&root->fs_info->pinned_mutex); 2335 ret = pin_down_bytes(trans, root, path,
2556 ret = pin_down_bytes(trans, root, bytenr, num_bytes, 2336 bytenr, num_bytes,
2557 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); 2337 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2558 mutex_unlock(&root->fs_info->pinned_mutex); 2338 &must_clean);
2559 if (ret > 0) 2339 if (ret > 0)
2560 mark_free = 1; 2340 mark_free = 1;
2561 BUG_ON(ret < 0); 2341 BUG_ON(ret < 0);
2562 } 2342 }
2343
2563 /* block accounting for super block */ 2344 /* block accounting for super block */
2564 spin_lock(&info->delalloc_lock); 2345 spin_lock(&info->delalloc_lock);
2565 super_used = btrfs_super_bytes_used(&info->super_copy); 2346 super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2571,14 +2352,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2571 btrfs_set_root_used(&root->root_item, 2352 btrfs_set_root_used(&root->root_item,
2572 root_used - num_bytes); 2353 root_used - num_bytes);
2573 spin_unlock(&info->delalloc_lock); 2354 spin_unlock(&info->delalloc_lock);
2355
2356 /*
2357 * it is going to be very rare for someone to be waiting
2358 * on the block we're freeing. del_items might need to
2359 * schedule, so rather than get fancy, just force it
2360 * to blocking here
2361 */
2362 if (must_clean)
2363 btrfs_set_lock_blocking(must_clean);
2364
2574 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2365 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2575 num_to_del); 2366 num_to_del);
2576 BUG_ON(ret); 2367 BUG_ON(ret);
2577 btrfs_release_path(extent_root, path); 2368 btrfs_release_path(extent_root, path);
2578 2369
2370 if (must_clean) {
2371 clean_tree_block(NULL, root, must_clean);
2372 btrfs_tree_unlock(must_clean);
2373 free_extent_buffer(must_clean);
2374 }
2375
2579 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2376 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2580 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2377 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2581 BUG_ON(ret); 2378 BUG_ON(ret);
2379 } else {
2380 invalidate_mapping_pages(info->btree_inode->i_mapping,
2381 bytenr >> PAGE_CACHE_SHIFT,
2382 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2582 } 2383 }
2583 2384
2584 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2385 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2586,216 +2387,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2586 BUG_ON(ret); 2387 BUG_ON(ret);
2587 } 2388 }
2588 btrfs_free_path(path); 2389 btrfs_free_path(path);
2589 finish_current_insert(trans, extent_root, 0);
2590 return ret; 2390 return ret;
2591} 2391}
2592 2392
2593/* 2393/*
2594 * find all the blocks marked as pending in the radix tree and remove 2394 * remove an extent from the root, returns 0 on success
2595 * them from the extent map
2596 */ 2395 */
2597static int del_pending_extents(struct btrfs_trans_handle *trans, 2396static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2598 struct btrfs_root *extent_root, int all) 2397 struct btrfs_root *root,
2398 u64 bytenr, u64 num_bytes, u64 parent,
2399 u64 root_objectid, u64 ref_generation,
2400 u64 owner_objectid, int pin,
2401 int refs_to_drop)
2599{ 2402{
2600 int ret; 2403 WARN_ON(num_bytes < root->sectorsize);
2601 int err = 0;
2602 u64 start;
2603 u64 end;
2604 u64 priv;
2605 u64 search = 0;
2606 int nr = 0, skipped = 0;
2607 struct extent_io_tree *pending_del;
2608 struct extent_io_tree *extent_ins;
2609 struct pending_extent_op *extent_op;
2610 struct btrfs_fs_info *info = extent_root->fs_info;
2611 struct list_head delete_list;
2612
2613 INIT_LIST_HEAD(&delete_list);
2614 extent_ins = &extent_root->fs_info->extent_ins;
2615 pending_del = &extent_root->fs_info->pending_del;
2616
2617again:
2618 mutex_lock(&info->extent_ins_mutex);
2619 while (1) {
2620 ret = find_first_extent_bit(pending_del, search, &start, &end,
2621 EXTENT_WRITEBACK);
2622 if (ret) {
2623 if (all && skipped && !nr) {
2624 search = 0;
2625 skipped = 0;
2626 continue;
2627 }
2628 mutex_unlock(&info->extent_ins_mutex);
2629 break;
2630 }
2631
2632 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2633 if (!ret) {
2634 search = end+1;
2635 skipped = 1;
2636
2637 if (need_resched()) {
2638 mutex_unlock(&info->extent_ins_mutex);
2639 cond_resched();
2640 mutex_lock(&info->extent_ins_mutex);
2641 }
2642
2643 continue;
2644 }
2645 BUG_ON(ret < 0);
2646
2647 ret = get_state_private(pending_del, start, &priv);
2648 BUG_ON(ret);
2649 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2650
2651 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2652 GFP_NOFS);
2653 if (!test_range_bit(extent_ins, start, end,
2654 EXTENT_WRITEBACK, 0)) {
2655 list_add_tail(&extent_op->list, &delete_list);
2656 nr++;
2657 } else {
2658 kfree(extent_op);
2659
2660 ret = get_state_private(&info->extent_ins, start,
2661 &priv);
2662 BUG_ON(ret);
2663 extent_op = (struct pending_extent_op *)
2664 (unsigned long)priv;
2665
2666 clear_extent_bits(&info->extent_ins, start, end,
2667 EXTENT_WRITEBACK, GFP_NOFS);
2668
2669 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2670 list_add_tail(&extent_op->list, &delete_list);
2671 search = end + 1;
2672 nr++;
2673 continue;
2674 }
2675
2676 mutex_lock(&extent_root->fs_info->pinned_mutex);
2677 ret = pin_down_bytes(trans, extent_root, start,
2678 end + 1 - start, 0);
2679 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2680
2681 ret = update_block_group(trans, extent_root, start,
2682 end + 1 - start, 0, ret > 0);
2683
2684 unlock_extent(extent_ins, start, end, GFP_NOFS);
2685 BUG_ON(ret);
2686 kfree(extent_op);
2687 }
2688 if (ret)
2689 err = ret;
2690
2691 search = end + 1;
2692
2693 if (need_resched()) {
2694 mutex_unlock(&info->extent_ins_mutex);
2695 cond_resched();
2696 mutex_lock(&info->extent_ins_mutex);
2697 }
2698 }
2699 2404
2700 if (nr) { 2405 /*
2701 ret = free_extents(trans, extent_root, &delete_list); 2406 * if metadata always pin
2702 BUG_ON(ret); 2407 * if data pin when any transaction has committed this
2703 } 2408 */
2409 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2410 ref_generation != trans->transid)
2411 pin = 1;
2704 2412
2705 if (all && skipped) { 2413 if (ref_generation != trans->transid)
2706 INIT_LIST_HEAD(&delete_list); 2414 pin = 1;
2707 search = 0;
2708 nr = 0;
2709 goto again;
2710 }
2711 2415
2712 return err; 2416 return __free_extent(trans, root, bytenr, num_bytes, parent,
2417 root_objectid, ref_generation,
2418 owner_objectid, pin, pin == 0, refs_to_drop);
2713} 2419}
2714 2420
2715/* 2421/*
2716 * remove an extent from the root, returns 0 on success 2422 * when we free an extent, it is possible (and likely) that we free the last
2423 * delayed ref for that extent as well. This searches the delayed ref tree for
2424 * a given extent, and if there are no other delayed refs to be processed, it
2425 * removes it from the tree.
2717 */ 2426 */
2718static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2427static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2719 struct btrfs_root *root, 2428 struct btrfs_root *root, u64 bytenr)
2720 u64 bytenr, u64 num_bytes, u64 parent,
2721 u64 root_objectid, u64 ref_generation,
2722 u64 owner_objectid, int pin)
2723{ 2429{
2724 struct btrfs_root *extent_root = root->fs_info->extent_root; 2430 struct btrfs_delayed_ref_head *head;
2725 int pending_ret; 2431 struct btrfs_delayed_ref_root *delayed_refs;
2432 struct btrfs_delayed_ref_node *ref;
2433 struct rb_node *node;
2726 int ret; 2434 int ret;
2727 2435
2728 WARN_ON(num_bytes < root->sectorsize); 2436 delayed_refs = &trans->transaction->delayed_refs;
2729 if (root == extent_root) { 2437 spin_lock(&delayed_refs->lock);
2730 struct pending_extent_op *extent_op = NULL; 2438 head = btrfs_find_delayed_ref_head(trans, bytenr);
2731 2439 if (!head)
2732 mutex_lock(&root->fs_info->extent_ins_mutex); 2440 goto out;
2733 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2734 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2735 u64 priv;
2736 ret = get_state_private(&root->fs_info->extent_ins,
2737 bytenr, &priv);
2738 BUG_ON(ret);
2739 extent_op = (struct pending_extent_op *)
2740 (unsigned long)priv;
2741 2441
2742 extent_op->del = 1; 2442 node = rb_prev(&head->node.rb_node);
2743 if (extent_op->type == PENDING_EXTENT_INSERT) { 2443 if (!node)
2744 mutex_unlock(&root->fs_info->extent_ins_mutex); 2444 goto out;
2745 return 0;
2746 }
2747 }
2748 2445
2749 if (extent_op) { 2446 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2750 ref_generation = extent_op->orig_generation;
2751 parent = extent_op->orig_parent;
2752 }
2753 2447
2754 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2448 /* there are still entries for this ref, we can't drop it */
2755 BUG_ON(!extent_op); 2449 if (ref->bytenr == bytenr)
2756 2450 goto out;
2757 extent_op->type = PENDING_EXTENT_DELETE;
2758 extent_op->bytenr = bytenr;
2759 extent_op->num_bytes = num_bytes;
2760 extent_op->parent = parent;
2761 extent_op->orig_parent = parent;
2762 extent_op->generation = ref_generation;
2763 extent_op->orig_generation = ref_generation;
2764 extent_op->level = (int)owner_objectid;
2765 INIT_LIST_HEAD(&extent_op->list);
2766 extent_op->del = 0;
2767
2768 set_extent_bits(&root->fs_info->pending_del,
2769 bytenr, bytenr + num_bytes - 1,
2770 EXTENT_WRITEBACK, GFP_NOFS);
2771 set_state_private(&root->fs_info->pending_del,
2772 bytenr, (unsigned long)extent_op);
2773 mutex_unlock(&root->fs_info->extent_ins_mutex);
2774 return 0;
2775 }
2776 /* if metadata always pin */
2777 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2778 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2779 mutex_lock(&root->fs_info->pinned_mutex);
2780 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2781 mutex_unlock(&root->fs_info->pinned_mutex);
2782 update_reserved_extents(root, bytenr, num_bytes, 0);
2783 return 0;
2784 }
2785 pin = 1;
2786 }
2787 2451
2788 /* if data pin when any transaction has committed this */ 2452 /*
2789 if (ref_generation != trans->transid) 2453 * waiting for the lock here would deadlock. If someone else has it
2790 pin = 1; 2454 * locked they are already in the process of dropping it anyway
2455 */
2456 if (!mutex_trylock(&head->mutex))
2457 goto out;
2791 2458
2792 ret = __free_extent(trans, root, bytenr, num_bytes, parent, 2459 /*
2793 root_objectid, ref_generation, 2460 * at this point we have a head with no other entries. Go
2794 owner_objectid, pin, pin == 0); 2461 * ahead and process it.
2462 */
2463 head->node.in_tree = 0;
2464 rb_erase(&head->node.rb_node, &delayed_refs->root);
2795 2465
2796 finish_current_insert(trans, root->fs_info->extent_root, 0); 2466 delayed_refs->num_entries--;
2797 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); 2467
2798 return ret ? ret : pending_ret; 2468 /*
2469 * we don't take a ref on the node because we're removing it from the
2470 * tree, so we just steal the ref the tree was holding.
2471 */
2472 delayed_refs->num_heads--;
2473 if (list_empty(&head->cluster))
2474 delayed_refs->num_heads_ready--;
2475
2476 list_del_init(&head->cluster);
2477 spin_unlock(&delayed_refs->lock);
2478
2479 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2480 &head->node, head->must_insert_reserved);
2481 BUG_ON(ret);
2482 btrfs_put_delayed_ref(&head->node);
2483 return 0;
2484out:
2485 spin_unlock(&delayed_refs->lock);
2486 return 0;
2799} 2487}
2800 2488
2801int btrfs_free_extent(struct btrfs_trans_handle *trans, 2489int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -2806,9 +2494,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2806{ 2494{
2807 int ret; 2495 int ret;
2808 2496
2809 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, 2497 /*
2810 root_objectid, ref_generation, 2498 * tree log blocks never actually go into the extent allocation
2811 owner_objectid, pin); 2499 * tree, just update pinning info and exit early.
2500 *
2501 * data extents referenced by the tree log do need to have
2502 * their reference counts bumped.
2503 */
2504 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2505 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2506 /* unlocks the pinned mutex */
2507 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2508 update_reserved_extents(root, bytenr, num_bytes, 0);
2509 ret = 0;
2510 } else {
2511 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2512 root_objectid, ref_generation,
2513 owner_objectid,
2514 BTRFS_DROP_DELAYED_REF, 1);
2515 BUG_ON(ret);
2516 ret = check_ref_cleanup(trans, root, bytenr);
2517 BUG_ON(ret);
2518 }
2812 return ret; 2519 return ret;
2813} 2520}
2814 2521
@@ -2837,227 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2837{ 2544{
2838 int ret = 0; 2545 int ret = 0;
2839 struct btrfs_root *root = orig_root->fs_info->extent_root; 2546 struct btrfs_root *root = orig_root->fs_info->extent_root;
2840 u64 total_needed = num_bytes; 2547 struct btrfs_free_cluster *last_ptr = NULL;
2841 u64 *last_ptr = NULL;
2842 u64 last_wanted = 0;
2843 struct btrfs_block_group_cache *block_group = NULL; 2548 struct btrfs_block_group_cache *block_group = NULL;
2844 int chunk_alloc_done = 0;
2845 int empty_cluster = 2 * 1024 * 1024; 2549 int empty_cluster = 2 * 1024 * 1024;
2846 int allowed_chunk_alloc = 0; 2550 int allowed_chunk_alloc = 0;
2847 struct list_head *head = NULL, *cur = NULL;
2848 int loop = 0;
2849 int extra_loop = 0;
2850 struct btrfs_space_info *space_info; 2551 struct btrfs_space_info *space_info;
2552 int last_ptr_loop = 0;
2553 int loop = 0;
2851 2554
2852 WARN_ON(num_bytes < root->sectorsize); 2555 WARN_ON(num_bytes < root->sectorsize);
2853 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2556 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2854 ins->objectid = 0; 2557 ins->objectid = 0;
2855 ins->offset = 0; 2558 ins->offset = 0;
2856 2559
2560 space_info = __find_space_info(root->fs_info, data);
2561
2857 if (orig_root->ref_cows || empty_size) 2562 if (orig_root->ref_cows || empty_size)
2858 allowed_chunk_alloc = 1; 2563 allowed_chunk_alloc = 1;
2859 2564
2860 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2565 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2861 last_ptr = &root->fs_info->last_alloc; 2566 last_ptr = &root->fs_info->meta_alloc_cluster;
2862 empty_cluster = 64 * 1024; 2567 if (!btrfs_test_opt(root, SSD))
2568 empty_cluster = 64 * 1024;
2863 } 2569 }
2864 2570
2865 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2571 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
2866 last_ptr = &root->fs_info->last_data_alloc; 2572 last_ptr = &root->fs_info->data_alloc_cluster;
2573 }
2867 2574
2868 if (last_ptr) { 2575 if (last_ptr) {
2869 if (*last_ptr) { 2576 spin_lock(&last_ptr->lock);
2870 hint_byte = *last_ptr; 2577 if (last_ptr->block_group)
2871 last_wanted = *last_ptr; 2578 hint_byte = last_ptr->window_start;
2872 } else 2579 spin_unlock(&last_ptr->lock);
2873 empty_size += empty_cluster;
2874 } else {
2875 empty_cluster = 0;
2876 } 2580 }
2581
2877 search_start = max(search_start, first_logical_byte(root, 0)); 2582 search_start = max(search_start, first_logical_byte(root, 0));
2878 search_start = max(search_start, hint_byte); 2583 search_start = max(search_start, hint_byte);
2879 2584
2880 if (last_wanted && search_start != last_wanted) { 2585 if (!last_ptr) {
2881 last_wanted = 0; 2586 empty_cluster = 0;
2882 empty_size += empty_cluster; 2587 loop = 1;
2883 } 2588 }
2884 2589
2885 total_needed += empty_size; 2590 if (search_start == hint_byte) {
2886 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2591 block_group = btrfs_lookup_block_group(root->fs_info,
2887 if (!block_group) 2592 search_start);
2888 block_group = btrfs_lookup_first_block_group(root->fs_info, 2593 if (block_group && block_group_bits(block_group, data)) {
2889 search_start); 2594 down_read(&space_info->groups_sem);
2890 space_info = __find_space_info(root->fs_info, data); 2595 goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
2891 2600
2601search:
2892 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
2893 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
2894 struct btrfs_free_space *free_space; 2604 u64 offset;
2895 /*
2896 * the only way this happens is if our hint points to a block
2897 * group that's not of the proper type, while looping this
2898 * should never happen
2899 */
2900 if (empty_size)
2901 extra_loop = 1;
2902 2605
2903 if (!block_group) 2606 atomic_inc(&block_group->count);
2904 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
2905 2608
2609have_block_group:
2906 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
2907 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
2908 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
2909 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
2910 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
2911 break; 2616 break;
2617 }
2912 } 2618 }
2913 2619
2914 mutex_lock(&block_group->alloc_mutex);
2915 if (unlikely(!block_group_bits(block_group, data)))
2916 goto new_group;
2917
2918 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
2919 goto new_group; 2621 goto loop;
2920 2622
2921 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
2922 total_needed); 2624 /*
2923 if (free_space) { 2625 * the refill lock keeps out other
2924 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
2925 u64 end = block_group->key.objectid + 2627 */
2926 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
2927 2636
2928 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group is does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
2929 2657
2930 /* move on to the next group */ 2658 /*
2931 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
2932 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2933 2663
2934 /* move on to the next group */ 2664 last_ptr_loop = 0;
2935 if (search_start + num_bytes > end)
2936 goto new_group;
2937 2665
2938 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
2939 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
2940 empty_size += empty_cluster; 2668 block_group, last_ptr,
2941 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
2942 /* 2672 /*
2943 * if search_start is still in this block group 2673 * now pull our allocation out of this
2944 * then we just re-search this block group 2674 * cluster
2945 */ 2675 */
2946 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
2947 search_start < end) { 2677 last_ptr, num_bytes,
2948 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
2949 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2950 } 2683 }
2951
2952 /* else we go to the next block group */
2953 goto new_group;
2954 } 2684 }
2955 2685 /*
2956 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
2957 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
2958 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
2959 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
2960 /* 2690 */
2961 * if search_start is still in this block group 2691 if (loop < 2) {
2962 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
2963 */ 2693 last_ptr);
2964 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
2965 search_start < end) { 2695 goto loop;
2966 mutex_unlock(&block_group->alloc_mutex);
2967 last_wanted = 0;
2968 continue;
2969 }
2970
2971 /* else we go to the next block group */
2972 goto new_group;
2973 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
2974 2699
2975 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
2976 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
2977 2706
2978 btrfs_remove_free_space_lock(block_group, search_start, 2707 /* move on to the next group */
2979 num_bytes); 2708 if (search_start + num_bytes >= search_end) {
2980 /* we are all good, lets return */ 2709 btrfs_add_free_space(block_group, offset, num_bytes);
2981 mutex_unlock(&block_group->alloc_mutex); 2710 goto loop;
2982 break;
2983 } 2711 }
2984new_group:
2985 mutex_unlock(&block_group->alloc_mutex);
2986 put_block_group(block_group);
2987 block_group = NULL;
2988new_group_no_lock:
2989 /* don't try to compare new allocations against the
2990 * last allocation any more
2991 */
2992 last_wanted = 0;
2993 2712
2994 /* 2713 /* move on to the next group */
2995 * Here's how this works. 2714 if (search_start + num_bytes >
2996 * loop == 0: we were searching a block group via a hint 2715 block_group->key.objectid + block_group->key.offset) {
2997 * and didn't find anything, so we start at 2716 btrfs_add_free_space(block_group, offset, num_bytes);
2998 * the head of the block groups and keep searching 2717 goto loop;
2999 * loop == 1: we're searching through all of the block groups 2718 }
3000 * if we hit the head again we have searched 2719
3001 * all of the block groups for this space and we 2720 if (exclude_nr > 0 &&
3002 * need to try and allocate, if we cant error out. 2721 (search_start + num_bytes > exclude_start &&
3003 * loop == 2: we allocated more space and are looping through 2722 search_start < exclude_start + exclude_nr)) {
3004 * all of the block groups again. 2723 search_start = exclude_start + exclude_nr;
3005 */ 2724
3006 if (loop == 0) { 2725 btrfs_add_free_space(block_group, offset, num_bytes);
3007 head = &space_info->block_groups; 2726 /*
3008 cur = head->next; 2727 * if search_start is still in this block group
3009 loop++; 2728 * then we just re-search this block group
3010 } else if (loop == 1 && cur == head) {
3011 int keep_going;
3012
3013 /* at this point we give up on the empty_size
3014 * allocations and just try to allocate the min
3015 * space.
3016 *
3017 * The extra_loop field was set if an empty_size
3018 * allocation was attempted above, and if this
3019 * is try we need to try the loop again without
3020 * the additional empty_size.
3021 */ 2729 */
3022 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
3023 empty_size = 0; 2731 search_start < (block_group->key.objectid +
3024 keep_going = extra_loop; 2732 block_group->key.offset))
3025 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
3026 2736
3027 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
3028 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
3029 ret = do_chunk_alloc(trans, root, num_bytes + 2739
3030 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
3031 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
3032 if (ret < 0) 2742 search_start - offset);
3033 goto loop_check; 2743 BUG_ON(offset > search_start);
3034 head = &space_info->block_groups; 2744
3035 /* 2745 /* we are all good, lets return */
3036 * we've allocated a new chunk, keep 2746 break;
3037 * trying 2747loop:
3038 */ 2748 btrfs_put_block_group(block_group);
3039 keep_going = 1; 2749 }
3040 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
3041 } else if (!allowed_chunk_alloc) { 2751
3042 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
3043 } 2753 * loop == 1, try again after forcing a chunk allocation
3044loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
3045 if (keep_going) { 2755 */
3046 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
3047 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
3048 } else { 2758 if (loop >= 2) {
3049 break; 2759 empty_size = 0;
3050 } 2760 empty_cluster = 0;
3051 } else if (cur == head) {
3052 break;
3053 } 2761 }
3054 2762
3055 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
3056 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
3057 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
3058 2770
3059 search_start = block_group->key.objectid; 2771 if (loop < 3) {
3060 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
3061 } 2778 }
3062 2779
3063 /* we found what we needed */ 2780 /* we found what we needed */
@@ -3065,21 +2782,10 @@ loop_check:
3065 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3066 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
3067 2784
3068 if (last_ptr) 2785 btrfs_put_block_group(block_group);
3069 *last_ptr = ins->objectid + ins->offset;
3070 ret = 0; 2786 ret = 0;
3071 } else if (!ret) {
3072 printk(KERN_ERR "btrfs searching for %llu bytes, "
3073 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3074 (unsigned long long)total_needed,
3075 (unsigned long long)num_bytes,
3076 loop, allowed_chunk_alloc);
3077 ret = -ENOSPC;
3078 } 2787 }
3079 if (block_group)
3080 put_block_group(block_group);
3081 2788
3082 up_read(&space_info->groups_sem);
3083 return ret; 2789 return ret;
3084} 2790}
3085 2791
@@ -3091,6 +2797,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3091 (unsigned long long)(info->total_bytes - info->bytes_used - 2797 (unsigned long long)(info->total_bytes - info->bytes_used -
3092 info->bytes_pinned - info->bytes_reserved), 2798 info->bytes_pinned - info->bytes_reserved),
3093 (info->full) ? "" : "not "); 2799 (info->full) ? "" : "not ");
2800 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
2801 " may_use=%llu, used=%llu\n", info->total_bytes,
2802 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
2803 info->bytes_used);
3094 2804
3095 down_read(&info->groups_sem); 2805 down_read(&info->groups_sem);
3096 list_for_each_entry(cache, &info->block_groups, list) { 2806 list_for_each_entry(cache, &info->block_groups, list) {
@@ -3117,24 +2827,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3117{ 2827{
3118 int ret; 2828 int ret;
3119 u64 search_start = 0; 2829 u64 search_start = 0;
3120 u64 alloc_profile;
3121 struct btrfs_fs_info *info = root->fs_info; 2830 struct btrfs_fs_info *info = root->fs_info;
3122 2831
3123 if (data) { 2832 data = btrfs_get_alloc_profile(root, data);
3124 alloc_profile = info->avail_data_alloc_bits &
3125 info->data_alloc_profile;
3126 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3127 } else if (root == root->fs_info->chunk_root) {
3128 alloc_profile = info->avail_system_alloc_bits &
3129 info->system_alloc_profile;
3130 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3131 } else {
3132 alloc_profile = info->avail_metadata_alloc_bits &
3133 info->metadata_alloc_profile;
3134 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3135 }
3136again: 2833again:
3137 data = btrfs_reduce_alloc_profile(root, data);
3138 /* 2834 /*
3139 * the only place that sets empty_size is btrfs_realloc_node, which 2835 * the only place that sets empty_size is btrfs_realloc_node, which
3140 * is not called recursively on allocations 2836 * is not called recursively on allocations
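The open-coded data/system/metadata branches removed above are folded into btrfs_get_alloc_profile(), so every caller derives the block-group type and RAID profile bits the same way. Below is a hedged sketch of that selection; the struct, field, and macro names are invented for illustration, while the kernel reads the equivalent avail_*_alloc_bits and *_alloc_profile fields from fs_info and still reduces the result with btrfs_reduce_alloc_profile().

#include <stdint.h>

#define BLOCK_GROUP_DATA     (1ULL << 0)   /* stand-ins for the BTRFS_BLOCK_GROUP_* flags */
#define BLOCK_GROUP_SYSTEM   (1ULL << 1)
#define BLOCK_GROUP_METADATA (1ULL << 2)

struct fs_profiles {                       /* invented mirror of the fs_info fields */
        uint64_t avail_data, data_profile;
        uint64_t avail_system, system_profile;
        uint64_t avail_metadata, metadata_profile;
};

/* Pick "block group type | RAID profile bits" in one place instead of
 * repeating the three-way branch at every allocation call site.
 */
static uint64_t get_alloc_profile(const struct fs_profiles *p,
                                  int is_data, int is_chunk_root)
{
        if (is_data)
                return BLOCK_GROUP_DATA | (p->avail_data & p->data_profile);
        if (is_chunk_root)
                return BLOCK_GROUP_SYSTEM | (p->avail_system & p->system_profile);
        return BLOCK_GROUP_METADATA | (p->avail_metadata & p->metadata_profile);
}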
@@ -3194,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3194 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
3195 2891
3196 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
3197 put_block_group(cache); 2893 btrfs_put_block_group(cache);
3198 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
3199 2895
3200 return ret; 2896 return ret;
@@ -3218,10 +2914,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3218static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2914static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3219 struct btrfs_root *root, u64 parent, 2915 struct btrfs_root *root, u64 parent,
3220 u64 root_objectid, u64 ref_generation, 2916 u64 root_objectid, u64 ref_generation,
3221 u64 owner, struct btrfs_key *ins) 2917 u64 owner, struct btrfs_key *ins,
2918 int ref_mod)
3222{ 2919{
3223 int ret; 2920 int ret;
3224 int pending_ret;
3225 u64 super_used; 2921 u64 super_used;
3226 u64 root_used; 2922 u64 root_used;
3227 u64 num_bytes = ins->offset; 2923 u64 num_bytes = ins->offset;
@@ -3246,33 +2942,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3246 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 2942 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3247 spin_unlock(&info->delalloc_lock); 2943 spin_unlock(&info->delalloc_lock);
3248 2944
3249 if (root == extent_root) {
3250 struct pending_extent_op *extent_op;
3251
3252 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3253 BUG_ON(!extent_op);
3254
3255 extent_op->type = PENDING_EXTENT_INSERT;
3256 extent_op->bytenr = ins->objectid;
3257 extent_op->num_bytes = ins->offset;
3258 extent_op->parent = parent;
3259 extent_op->orig_parent = 0;
3260 extent_op->generation = ref_generation;
3261 extent_op->orig_generation = 0;
3262 extent_op->level = (int)owner;
3263 INIT_LIST_HEAD(&extent_op->list);
3264 extent_op->del = 0;
3265
3266 mutex_lock(&root->fs_info->extent_ins_mutex);
3267 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3268 ins->objectid + ins->offset - 1,
3269 EXTENT_WRITEBACK, GFP_NOFS);
3270 set_state_private(&root->fs_info->extent_ins,
3271 ins->objectid, (unsigned long)extent_op);
3272 mutex_unlock(&root->fs_info->extent_ins_mutex);
3273 goto update_block;
3274 }
3275
3276 memcpy(&keys[0], ins, sizeof(*ins)); 2945 memcpy(&keys[0], ins, sizeof(*ins));
3277 keys[1].objectid = ins->objectid; 2946 keys[1].objectid = ins->objectid;
3278 keys[1].type = BTRFS_EXTENT_REF_KEY; 2947 keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3283,37 +2952,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3283 path = btrfs_alloc_path(); 2952 path = btrfs_alloc_path();
3284 BUG_ON(!path); 2953 BUG_ON(!path);
3285 2954
2955 path->leave_spinning = 1;
3286 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 2956 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3287 sizes, 2); 2957 sizes, 2);
3288 BUG_ON(ret); 2958 BUG_ON(ret);
3289 2959
3290 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2960 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3291 struct btrfs_extent_item); 2961 struct btrfs_extent_item);
3292 btrfs_set_extent_refs(path->nodes[0], extent_item, 1); 2962 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
3293 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 2963 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3294 struct btrfs_extent_ref); 2964 struct btrfs_extent_ref);
3295 2965
3296 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 2966 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3297 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 2967 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3298 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 2968 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3299 btrfs_set_ref_num_refs(path->nodes[0], ref, 1); 2969 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
3300 2970
3301 btrfs_mark_buffer_dirty(path->nodes[0]); 2971 btrfs_mark_buffer_dirty(path->nodes[0]);
3302 2972
3303 trans->alloc_exclude_start = 0; 2973 trans->alloc_exclude_start = 0;
3304 trans->alloc_exclude_nr = 0; 2974 trans->alloc_exclude_nr = 0;
3305 btrfs_free_path(path); 2975 btrfs_free_path(path);
3306 finish_current_insert(trans, extent_root, 0);
3307 pending_ret = del_pending_extents(trans, extent_root, 0);
3308 2976
3309 if (ret) 2977 if (ret)
3310 goto out; 2978 goto out;
3311 if (pending_ret) {
3312 ret = pending_ret;
3313 goto out;
3314 }
3315 2979
3316update_block:
3317 ret = update_block_group(trans, root, ins->objectid, 2980 ret = update_block_group(trans, root, ins->objectid,
3318 ins->offset, 1, 0); 2981 ins->offset, 1, 0);
3319 if (ret) { 2982 if (ret) {
@@ -3335,9 +2998,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3335 2998
3336 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 2999 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3337 return 0; 3000 return 0;
3338 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3001
3339 ref_generation, owner, ins); 3002 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3340 update_reserved_extents(root, ins->objectid, ins->offset, 0); 3003 ins->offset, parent, root_objectid,
3004 ref_generation, owner,
3005 BTRFS_ADD_DELAYED_EXTENT, 0);
3006 BUG_ON(ret);
3341 return ret; 3007 return ret;
3342} 3008}
3343 3009
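btrfs_alloc_reserved_extent() no longer inserts the extent item and backref inline; it queues a BTRFS_ADD_DELAYED_EXTENT record that btrfs_run_delayed_refs() applies later in batches. Below is only a toy userspace sketch of that deferral pattern with invented names; the kernel keeps the records in a per-transaction structure, merges updates for the same extent, and counts them in trans->delayed_ref_updates.

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

struct pending_ref {                    /* toy version of a delayed ref record */
        uint64_t bytenr;
        uint64_t num_bytes;
        int refs_to_add;                /* +1 for an added ref, -1 for a drop */
        struct pending_ref *next;
};

struct toy_trans {
        struct pending_ref *head;
        unsigned long pending;          /* mirrors trans->delayed_ref_updates */
};

static int queue_ref(struct toy_trans *t, uint64_t bytenr,
                     uint64_t num_bytes, int refs_to_add)
{
        struct pending_ref *r = malloc(sizeof(*r));

        if (!r)
                return -ENOMEM;
        r->bytenr = bytenr;
        r->num_bytes = num_bytes;
        r->refs_to_add = refs_to_add;
        r->next = t->head;
        t->head = r;
        t->pending++;
        return 0;
}

/* Drain the queue, applying and freeing every record.  The kernel's
 * btrfs_run_delayed_refs() does this against the extent tree, merging
 * entries for the same extent first.
 */
static void run_refs(struct toy_trans *t,
                     void (*apply)(uint64_t bytenr, uint64_t num_bytes, int delta))
{
        while (t->head) {
                struct pending_ref *r = t->head;

                t->head = r->next;
                apply(r->bytenr, r->num_bytes, r->refs_to_add);
                free(r);
                t->pending--;
        }
}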
@@ -3362,9 +3028,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3362 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
3363 ins->offset); 3029 ins->offset);
3364 BUG_ON(ret); 3030 BUG_ON(ret);
3365 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
3366 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3367 ref_generation, owner, ins); 3033 ref_generation, owner, ins, 1);
3368 return ret; 3034 return ret;
3369} 3035}
3370 3036
@@ -3383,26 +3049,25 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3383 u64 search_end, struct btrfs_key *ins, u64 data) 3049 u64 search_end, struct btrfs_key *ins, u64 data)
3384{ 3050{
3385 int ret; 3051 int ret;
3386
3387 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3052 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3388 min_alloc_size, empty_size, hint_byte, 3053 min_alloc_size, empty_size, hint_byte,
3389 search_end, ins, data); 3054 search_end, ins, data);
3390 BUG_ON(ret); 3055 BUG_ON(ret);
3391 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3056 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3392 ret = __btrfs_alloc_reserved_extent(trans, root, parent, 3057 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3393 root_objectid, ref_generation, 3058 ins->offset, parent, root_objectid,
3394 owner_objectid, ins); 3059 ref_generation, owner_objectid,
3060 BTRFS_ADD_DELAYED_EXTENT, 0);
3395 BUG_ON(ret); 3061 BUG_ON(ret);
3396
3397 } else {
3398 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3399 } 3062 }
3063 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3400 return ret; 3064 return ret;
3401} 3065}
3402 3066
3403struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3067struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3404 struct btrfs_root *root, 3068 struct btrfs_root *root,
3405 u64 bytenr, u32 blocksize) 3069 u64 bytenr, u32 blocksize,
3070 int level)
3406{ 3071{
3407 struct extent_buffer *buf; 3072 struct extent_buffer *buf;
3408 3073
@@ -3410,6 +3075,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3410 if (!buf) 3075 if (!buf)
3411 return ERR_PTR(-ENOMEM); 3076 return ERR_PTR(-ENOMEM);
3412 btrfs_set_header_generation(buf, trans->transid); 3077 btrfs_set_header_generation(buf, trans->transid);
3078 btrfs_set_buffer_lockdep_class(buf, level);
3413 btrfs_tree_lock(buf); 3079 btrfs_tree_lock(buf);
3414 clean_tree_block(trans, root, buf); 3080 clean_tree_block(trans, root, buf);
3415 3081
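btrfs_init_new_buffer() now takes the tree level so btrfs_set_buffer_lockdep_class() can give each buffer a level-specific lockdep class; without that, locking a child node while holding its parent looks to lockdep like recursive locking of a single class. A sketch of how such per-level classes are commonly wired up; the array and function names here are illustrative, not the kernel's actual symbols.

#include <linux/bug.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

#define MAX_TREE_LEVEL 8

/* one lockdep class per btree level */
static struct lock_class_key buffer_level_keys[MAX_TREE_LEVEL];

static void set_buffer_lockdep_class(spinlock_t *lock, int level)
{
        BUG_ON(level < 0 || level >= MAX_TREE_LEVEL);
        /* parent/child locking now reads as "level N, then level N-1" */
        lockdep_set_class(lock, &buffer_level_keys[level]);
}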
@@ -3453,7 +3119,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3453 return ERR_PTR(ret); 3119 return ERR_PTR(ret);
3454 } 3120 }
3455 3121
3456 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3122 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3123 blocksize, level);
3457 return buf; 3124 return buf;
3458} 3125}
3459 3126
@@ -3529,7 +3196,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3529 3196
3530 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3197 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3531 3198
3532 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3199 ret = btrfs_free_extent(trans, root, disk_bytenr,
3533 btrfs_file_extent_disk_num_bytes(leaf, fi), 3200 btrfs_file_extent_disk_num_bytes(leaf, fi),
3534 leaf->start, leaf_owner, leaf_generation, 3201 leaf->start, leaf_owner, leaf_generation,
3535 key.objectid, 0); 3202 key.objectid, 0);
@@ -3569,7 +3236,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3569 */ 3236 */
3570 for (i = 0; i < ref->nritems; i++) { 3237 for (i = 0; i < ref->nritems; i++) {
3571 info = ref->extents + sorted[i].slot; 3238 info = ref->extents + sorted[i].slot;
3572 ret = __btrfs_free_extent(trans, root, info->bytenr, 3239 ret = btrfs_free_extent(trans, root, info->bytenr,
3573 info->num_bytes, ref->bytenr, 3240 info->num_bytes, ref->bytenr,
3574 ref->owner, ref->generation, 3241 ref->owner, ref->generation,
3575 info->objectid, 0); 3242 info->objectid, 0);
@@ -3586,12 +3253,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3586 return 0; 3253 return 0;
3587} 3254}
3588 3255
3589static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, 3256static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3257 struct btrfs_root *root, u64 start,
3590 u64 len, u32 *refs) 3258 u64 len, u32 *refs)
3591{ 3259{
3592 int ret; 3260 int ret;
3593 3261
3594 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); 3262 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3595 BUG_ON(ret); 3263 BUG_ON(ret);
3596 3264
3597#if 0 /* some debugging code in case we see problems here */ 3265#if 0 /* some debugging code in case we see problems here */
@@ -3699,7 +3367,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3699 * we just decrement it below and don't update any 3367 * we just decrement it below and don't update any
3700 * of the refs the leaf points to. 3368 * of the refs the leaf points to.
3701 */ 3369 */
3702 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3370 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3371 blocksize, &refs);
3703 BUG_ON(ret); 3372 BUG_ON(ret);
3704 if (refs != 1) 3373 if (refs != 1)
3705 continue; 3374 continue;
@@ -3750,7 +3419,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3750 */ 3419 */
3751 for (i = 0; i < refi; i++) { 3420 for (i = 0; i < refi; i++) {
3752 bytenr = sorted[i].bytenr; 3421 bytenr = sorted[i].bytenr;
3753 ret = __btrfs_free_extent(trans, root, bytenr, 3422 ret = btrfs_free_extent(trans, root, bytenr,
3754 blocksize, eb->start, 3423 blocksize, eb->start,
3755 root_owner, root_gen, 0, 1); 3424 root_owner, root_gen, 0, 1);
3756 BUG_ON(ret); 3425 BUG_ON(ret);
@@ -3793,7 +3462,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3793 3462
3794 WARN_ON(*level < 0); 3463 WARN_ON(*level < 0);
3795 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3464 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3796 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, 3465 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
3797 path->nodes[*level]->len, &refs); 3466 path->nodes[*level]->len, &refs);
3798 BUG_ON(ret); 3467 BUG_ON(ret);
3799 if (refs > 1) 3468 if (refs > 1)
@@ -3844,7 +3513,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3844 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3513 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3845 blocksize = btrfs_level_size(root, *level - 1); 3514 blocksize = btrfs_level_size(root, *level - 1);
3846 3515
3847 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3516 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3517 blocksize, &refs);
3848 BUG_ON(ret); 3518 BUG_ON(ret);
3849 3519
3850 /* 3520 /*
@@ -3859,7 +3529,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3859 root_gen = btrfs_header_generation(parent); 3529 root_gen = btrfs_header_generation(parent);
3860 path->slots[*level]++; 3530 path->slots[*level]++;
3861 3531
3862 ret = __btrfs_free_extent(trans, root, bytenr, 3532 ret = btrfs_free_extent(trans, root, bytenr,
3863 blocksize, parent->start, 3533 blocksize, parent->start,
3864 root_owner, root_gen, 3534 root_owner, root_gen,
3865 *level - 1, 1); 3535 *level - 1, 1);
@@ -3905,7 +3575,7 @@ out:
3905 * cleanup and free the reference on the last node 3575 * cleanup and free the reference on the last node
3906 * we processed 3576 * we processed
3907 */ 3577 */
3908 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3578 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3909 parent->start, root_owner, root_gen, 3579 parent->start, root_owner, root_gen,
3910 *level, 1); 3580 *level, 1);
3911 free_extent_buffer(path->nodes[*level]); 3581 free_extent_buffer(path->nodes[*level]);
@@ -4094,6 +3764,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4094 struct btrfs_path *path; 3764 struct btrfs_path *path;
4095 int i; 3765 int i;
4096 int orig_level; 3766 int orig_level;
3767 int update_count;
4097 struct btrfs_root_item *root_item = &root->root_item; 3768 struct btrfs_root_item *root_item = &root->root_item;
4098 3769
4099 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3770 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4135,6 +3806,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4135 } 3806 }
4136 } 3807 }
4137 while (1) { 3808 while (1) {
3809 unsigned long update;
4138 wret = walk_down_tree(trans, root, path, &level); 3810 wret = walk_down_tree(trans, root, path, &level);
4139 if (wret > 0) 3811 if (wret > 0)
4140 break; 3812 break;
@@ -4147,12 +3819,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4147 break; 3819 break;
4148 if (wret < 0) 3820 if (wret < 0)
4149 ret = wret; 3821 ret = wret;
4150 if (trans->transaction->in_commit) { 3822 if (trans->transaction->in_commit ||
3823 trans->transaction->delayed_refs.flushing) {
4151 ret = -EAGAIN; 3824 ret = -EAGAIN;
4152 break; 3825 break;
4153 } 3826 }
4154 atomic_inc(&root->fs_info->throttle_gen); 3827 atomic_inc(&root->fs_info->throttle_gen);
4155 wake_up(&root->fs_info->transaction_throttle); 3828 wake_up(&root->fs_info->transaction_throttle);
3829 for (update_count = 0; update_count < 16; update_count++) {
3830 update = trans->delayed_ref_updates;
3831 trans->delayed_ref_updates = 0;
3832 if (update)
3833 btrfs_run_delayed_refs(trans, root, update);
3834 else
3835 break;
3836 }
4156 } 3837 }
4157 for (i = 0; i <= orig_level; i++) { 3838 for (i = 0; i <= orig_level; i++) {
4158 if (path->nodes[i]) { 3839 if (path->nodes[i]) {
@@ -4179,13 +3860,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4179 path = btrfs_alloc_path(); 3860 path = btrfs_alloc_path();
4180 BUG_ON(!path); 3861 BUG_ON(!path);
4181 3862
4182 BUG_ON(!btrfs_tree_locked(parent)); 3863 btrfs_assert_tree_locked(parent);
4183 parent_level = btrfs_header_level(parent); 3864 parent_level = btrfs_header_level(parent);
4184 extent_buffer_get(parent); 3865 extent_buffer_get(parent);
4185 path->nodes[parent_level] = parent; 3866 path->nodes[parent_level] = parent;
4186 path->slots[parent_level] = btrfs_header_nritems(parent); 3867 path->slots[parent_level] = btrfs_header_nritems(parent);
4187 3868
4188 BUG_ON(!btrfs_tree_locked(node)); 3869 btrfs_assert_tree_locked(node);
4189 level = btrfs_header_level(node); 3870 level = btrfs_header_level(node);
4190 extent_buffer_get(node); 3871 extent_buffer_get(node);
4191 path->nodes[level] = node; 3872 path->nodes[level] = node;
@@ -5197,6 +4878,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5197 root->root_key.objectid, 4878 root->root_key.objectid,
5198 trans->transid, key.objectid); 4879 trans->transid, key.objectid);
5199 BUG_ON(ret); 4880 BUG_ON(ret);
4881
5200 ret = btrfs_free_extent(trans, root, 4882 ret = btrfs_free_extent(trans, root,
5201 bytenr, num_bytes, leaf->start, 4883 bytenr, num_bytes, leaf->start,
5202 btrfs_header_owner(leaf), 4884 btrfs_header_owner(leaf),
@@ -5508,9 +5190,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5508 ref_path, NULL, NULL); 5190 ref_path, NULL, NULL);
5509 BUG_ON(ret); 5191 BUG_ON(ret);
5510 5192
5511 if (root == root->fs_info->extent_root)
5512 btrfs_extent_post_op(trans, root);
5513
5514 return 0; 5193 return 0;
5515} 5194}
5516 5195
@@ -5641,7 +5320,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5641 prev_block = block_start; 5320 prev_block = block_start;
5642 } 5321 }
5643 5322
5323 mutex_lock(&extent_root->fs_info->trans_mutex);
5644 btrfs_record_root_in_trans(found_root); 5324 btrfs_record_root_in_trans(found_root);
5325 mutex_unlock(&extent_root->fs_info->trans_mutex);
5645 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5326 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5646 /* 5327 /*
5647 * try to update data extent references while 5328 * try to update data extent references while
@@ -5776,6 +5457,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5776 if (!path) 5457 if (!path)
5777 return -ENOMEM; 5458 return -ENOMEM;
5778 5459
5460 path->leave_spinning = 1;
5779 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5461 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5780 if (ret) 5462 if (ret)
5781 goto out; 5463 goto out;
@@ -5946,6 +5628,9 @@ again:
5946 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5628 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5947 mutex_unlock(&root->fs_info->cleaner_mutex); 5629 mutex_unlock(&root->fs_info->cleaner_mutex);
5948 5630
5631 trans = btrfs_start_transaction(info->tree_root, 1);
5632 btrfs_commit_transaction(trans, info->tree_root);
5633
5949 while (1) { 5634 while (1) {
5950 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5635 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5951 if (ret < 0) 5636 if (ret < 0)
@@ -6032,7 +5717,7 @@ next:
6032 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
6033 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6034 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
6035 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
6036 ret = 0; 5721 ret = 0;
6037out: 5722out:
6038 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -6079,6 +5764,7 @@ out:
6079int btrfs_free_block_groups(struct btrfs_fs_info *info) 5764int btrfs_free_block_groups(struct btrfs_fs_info *info)
6080{ 5765{
6081 struct btrfs_block_group_cache *block_group; 5766 struct btrfs_block_group_cache *block_group;
5767 struct btrfs_space_info *space_info;
6082 struct rb_node *n; 5768 struct rb_node *n;
6083 5769
6084 spin_lock(&info->block_group_cache_lock); 5770 spin_lock(&info->block_group_cache_lock);
@@ -6100,6 +5786,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6100 spin_lock(&info->block_group_cache_lock); 5786 spin_lock(&info->block_group_cache_lock);
6101 } 5787 }
6102 spin_unlock(&info->block_group_cache_lock); 5788 spin_unlock(&info->block_group_cache_lock);
5789
5790 /* now that all the block groups are freed, go through and
5791 * free all the space_info structs. This is only called during
5792 * the final stages of unmount, and so we know nobody is
5793 * using them. We call synchronize_rcu() once before we start,
5794 * just to be on the safe side.
5795 */
5796 synchronize_rcu();
5797
5798 while(!list_empty(&info->space_info)) {
5799 space_info = list_entry(info->space_info.next,
5800 struct btrfs_space_info,
5801 list);
5802
5803 list_del(&space_info->list);
5804 kfree(space_info);
5805 }
6103 return 0; 5806 return 0;
6104} 5807}
6105 5808
@@ -6141,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6141 5844
6142 atomic_set(&cache->count, 1); 5845 atomic_set(&cache->count, 1);
6143 spin_lock_init(&cache->lock); 5846 spin_lock_init(&cache->lock);
6144 mutex_init(&cache->alloc_mutex); 5847 spin_lock_init(&cache->tree_lock);
6145 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
6146 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
6147 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
6148 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
6149 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -6186,7 +5890,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6186 5890
6187 extent_root = root->fs_info->extent_root; 5891 extent_root = root->fs_info->extent_root;
6188 5892
6189 root->fs_info->last_trans_new_blockgroup = trans->transid; 5893 root->fs_info->last_trans_log_full_commit = trans->transid;
6190 5894
6191 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5895 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6192 if (!cache) 5896 if (!cache)
@@ -6197,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6197 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5901 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6198 atomic_set(&cache->count, 1); 5902 atomic_set(&cache->count, 1);
6199 spin_lock_init(&cache->lock); 5903 spin_lock_init(&cache->lock);
6200 mutex_init(&cache->alloc_mutex); 5904 spin_lock_init(&cache->tree_lock);
6201 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
6202 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
6203 5908
6204 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
6205 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6220,9 +5925,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6220 sizeof(cache->item)); 5925 sizeof(cache->item));
6221 BUG_ON(ret); 5926 BUG_ON(ret);
6222 5927
6223 finish_current_insert(trans, extent_root, 0);
6224 ret = del_pending_extents(trans, extent_root, 0);
6225 BUG_ON(ret);
6226 set_avail_alloc_bits(extent_root->fs_info, type); 5928 set_avail_alloc_bits(extent_root->fs_info, type);
6227 5929
6228 return 0; 5930 return 0;
@@ -6262,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6262 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
6263 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
6264 5966
6265 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
6266 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
6267 5969
6268 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6269 if (ret > 0) 5971 if (ret > 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b516b79..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
415 415
416 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 416 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
417 if (node) { 417 if (node) {
418 struct extent_state *found;
419 found = rb_entry(node, struct extent_state, rb_node);
420 free_extent_state(prealloc); 418 free_extent_state(prealloc);
421 return -EEXIST; 419 return -EEXIST;
422 } 420 }
@@ -2886,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2886 disko = 0; 2884 disko = 0;
2887 flags = 0; 2885 flags = 0;
2888 2886
2889 switch (em->block_start) { 2887 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2890 case EXTENT_MAP_LAST_BYTE:
2891 end = 1; 2888 end = 1;
2892 flags |= FIEMAP_EXTENT_LAST; 2889 flags |= FIEMAP_EXTENT_LAST;
2893 break; 2890 } else if (em->block_start == EXTENT_MAP_HOLE) {
2894 case EXTENT_MAP_HOLE:
2895 flags |= FIEMAP_EXTENT_UNWRITTEN; 2891 flags |= FIEMAP_EXTENT_UNWRITTEN;
2896 break; 2892 } else if (em->block_start == EXTENT_MAP_INLINE) {
2897 case EXTENT_MAP_INLINE:
2898 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2899 FIEMAP_EXTENT_NOT_ALIGNED); 2894 FIEMAP_EXTENT_NOT_ALIGNED);
2900 break; 2895 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2901 case EXTENT_MAP_DELALLOC:
2902 flags |= (FIEMAP_EXTENT_DELALLOC | 2896 flags |= (FIEMAP_EXTENT_DELALLOC |
2903 FIEMAP_EXTENT_UNKNOWN); 2897 FIEMAP_EXTENT_UNKNOWN);
2904 break; 2898 } else {
2905 default:
2906 disko = em->block_start; 2899 disko = em->block_start;
2907 break;
2908 } 2900 }
2909 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2910 flags |= FIEMAP_EXTENT_ENCODED; 2902 flags |= FIEMAP_EXTENT_ENCODED;
@@ -3126,20 +3118,15 @@ void free_extent_buffer(struct extent_buffer *eb)
3126int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3118int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3127 struct extent_buffer *eb) 3119 struct extent_buffer *eb)
3128{ 3120{
3129 int set;
3130 unsigned long i; 3121 unsigned long i;
3131 unsigned long num_pages; 3122 unsigned long num_pages;
3132 struct page *page; 3123 struct page *page;
3133 3124
3134 u64 start = eb->start;
3135 u64 end = start + eb->len - 1;
3136
3137 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3138 num_pages = num_extent_pages(eb->start, eb->len); 3125 num_pages = num_extent_pages(eb->start, eb->len);
3139 3126
3140 for (i = 0; i < num_pages; i++) { 3127 for (i = 0; i < num_pages; i++) {
3141 page = extent_buffer_page(eb, i); 3128 page = extent_buffer_page(eb, i);
3142 if (!set && !PageDirty(page)) 3129 if (!PageDirty(page))
3143 continue; 3130 continue;
3144 3131
3145 lock_page(page); 3132 lock_page(page);
@@ -3148,22 +3135,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3148 else 3135 else
3149 set_page_private(page, EXTENT_PAGE_PRIVATE); 3136 set_page_private(page, EXTENT_PAGE_PRIVATE);
3150 3137
3151 /*
3152 * if we're on the last page or the first page and the
3153 * block isn't aligned on a page boundary, do extra checks
3154 * to make sure we don't clean page that is partially dirty
3155 */
3156 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3157 ((i == num_pages - 1) &&
3158 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3159 start = (u64)page->index << PAGE_CACHE_SHIFT;
3160 end = start + PAGE_CACHE_SIZE - 1;
3161 if (test_range_bit(tree, start, end,
3162 EXTENT_DIRTY, 0)) {
3163 unlock_page(page);
3164 continue;
3165 }
3166 }
3167 clear_page_dirty_for_io(page); 3138 clear_page_dirty_for_io(page);
3168 spin_lock_irq(&page->mapping->tree_lock); 3139 spin_lock_irq(&page->mapping->tree_lock);
3169 if (!PageDirty(page)) { 3140 if (!PageDirty(page)) {
@@ -3189,29 +3160,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3189{ 3160{
3190 unsigned long i; 3161 unsigned long i;
3191 unsigned long num_pages; 3162 unsigned long num_pages;
3163 int was_dirty = 0;
3192 3164
3165 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3193 num_pages = num_extent_pages(eb->start, eb->len); 3166 num_pages = num_extent_pages(eb->start, eb->len);
3194 for (i = 0; i < num_pages; i++) { 3167 for (i = 0; i < num_pages; i++)
3195 struct page *page = extent_buffer_page(eb, i);
3196 /* writepage may need to do something special for the
3197 * first page, we have to make sure page->private is
3198 * properly set. releasepage may drop page->private
3199 * on us if the page isn't already dirty.
3200 */
3201 lock_page(page);
3202 if (i == 0) {
3203 set_page_extent_head(page, eb->len);
3204 } else if (PagePrivate(page) &&
3205 page->private != EXTENT_PAGE_PRIVATE) {
3206 set_page_extent_mapped(page);
3207 }
3208 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3168 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3209 set_extent_dirty(tree, page_offset(page), 3169 return was_dirty;
3210 page_offset(page) + PAGE_CACHE_SIZE - 1,
3211 GFP_NOFS);
3212 unlock_page(page);
3213 }
3214 return 0;
3215} 3170}
3216 3171
3217int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3172int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3791,6 +3746,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3791 ret = 0; 3746 ret = 0;
3792 goto out; 3747 goto out;
3793 } 3748 }
3749 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3750 ret = 0;
3751 goto out;
3752 }
3794 /* at this point we can safely release the extent buffer */ 3753 /* at this point we can safely release the extent buffer */
3795 num_pages = num_extent_pages(eb->start, eb->len); 3754 num_pages = num_extent_pages(eb->start, eb->len);
3796 for (i = 0; i < num_pages; i++) 3755 for (i = 0; i < num_pages; i++)
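With the new EXTENT_BUFFER_DIRTY bit, dirtiness becomes a single flag on the buffer itself: set_extent_buffer_dirty() reports whether the buffer was already dirty via test_and_set_bit(), clear_extent_buffer_dirty() only touches pages that are actually dirty, and try_release_extent_buffer() refuses to free a buffer whose bit is still set. A small sketch of that pattern with an invented toy_buffer type; the bitops calls are the standard kernel ones.

#include <linux/bitops.h>
#include <linux/types.h>

#define TOY_BUFFER_DIRTY 0              /* stand-in for EXTENT_BUFFER_DIRTY */

struct toy_buffer {
        unsigned long bflags;
};

/* Returns nonzero if the buffer was already dirty, mirroring the new
 * return value of set_extent_buffer_dirty().
 */
static int toy_set_dirty(struct toy_buffer *b)
{
        return test_and_set_bit(TOY_BUFFER_DIRTY, &b->bflags);
}

static void toy_clear_dirty(struct toy_buffer *b)
{
        clear_bit(TOY_BUFFER_DIRTY, &b->bflags);
}

/* try_release_extent_buffer() style check: dirty buffers must not be freed. */
static int toy_can_release(const struct toy_buffer *b)
{
        return !test_bit(TOY_BUFFER_DIRTY, &b->bflags);
}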
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
25/* these are bit numbers for test/set bit */ 25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0 26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1 27#define EXTENT_BUFFER_BLOCKING 1
28#define EXTENT_BUFFER_DIRTY 2
28 29
29/* 30/*
30 * page->private values. Every page that is controlled by the extent 31 * page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
254 struct extent_buffer *eb); 255 struct extent_buffer *eb);
255int set_extent_buffer_dirty(struct extent_io_tree *tree, 256int set_extent_buffer_dirty(struct extent_io_tree *tree,
256 struct extent_buffer *eb); 257 struct extent_buffer *eb);
258int test_extent_buffer_dirty(struct extent_io_tree *tree,
259 struct extent_buffer *eb);
257int set_extent_buffer_uptodate(struct extent_io_tree *tree, 260int set_extent_buffer_uptodate(struct extent_io_tree *tree,
258 struct extent_buffer *eb); 261 struct extent_buffer *eb);
259int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 262int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
52 file_key.offset = pos; 52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54 54
55 path->leave_spinning = 1;
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 56 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item)); 57 sizeof(*item));
57 if (ret < 0) 58 if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
523 key.offset = end_byte - 1; 524 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY; 525 key.type = BTRFS_EXTENT_CSUM_KEY;
525 526
527 path->leave_spinning = 1;
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 528 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) { 529 if (ret > 0) {
528 if (path->slots[0] == 0) 530 if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
757 } else { 759 } else {
758 ins_size = csum_size; 760 ins_size = csum_size;
759 } 761 }
762 path->leave_spinning = 1;
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 763 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size); 764 ins_size);
765 path->leave_spinning = 0;
762 if (ret < 0) 766 if (ret < 0)
763 goto fail_unlock; 767 goto fail_unlock;
764 if (ret != 0) { 768 if (ret != 0) {
@@ -776,7 +780,6 @@ found:
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 780 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0])); 781 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL; 782 eb_token = NULL;
779 cond_resched();
780next_sector: 783next_sector:
781 784
782 if (!eb_token || 785 if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
817 eb_token = NULL; 820 eb_token = NULL;
818 } 821 }
819 btrfs_mark_buffer_dirty(path->nodes[0]); 822 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) { 823 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path); 824 btrfs_release_path(root, path);
825 cond_resched();
823 goto again; 826 goto again;
824 } 827 }
825out: 828out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023efaff7..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
607 607
608 btrfs_release_path(root, path); 608 btrfs_release_path(root, path);
609 path->leave_spinning = 1;
609 ret = btrfs_insert_empty_item(trans, root, path, &ins, 610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
610 sizeof(*extent)); 611 sizeof(*extent));
611 BUG_ON(ret); 612 BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
639 ram_bytes); 640 ram_bytes);
640 btrfs_set_file_extent_type(leaf, extent, found_type); 641 btrfs_set_file_extent_type(leaf, extent, found_type);
641 642
643 btrfs_unlock_up_safe(path, 1);
642 btrfs_mark_buffer_dirty(path->nodes[0]); 644 btrfs_mark_buffer_dirty(path->nodes[0]);
645 btrfs_set_lock_blocking(path->nodes[0]);
643 646
644 if (disk_bytenr != 0) { 647 if (disk_bytenr != 0) {
645 ret = btrfs_update_extent_ref(trans, root, 648 ret = btrfs_update_extent_ref(trans, root,
646 disk_bytenr, orig_parent, 649 disk_bytenr,
650 le64_to_cpu(old.disk_num_bytes),
651 orig_parent,
647 leaf->start, 652 leaf->start,
648 root->root_key.objectid, 653 root->root_key.objectid,
649 trans->transid, ins.objectid); 654 trans->transid, ins.objectid);
650 655
651 BUG_ON(ret); 656 BUG_ON(ret);
652 } 657 }
658 path->leave_spinning = 0;
653 btrfs_release_path(root, path); 659 btrfs_release_path(root, path);
654 if (disk_bytenr != 0) 660 if (disk_bytenr != 0)
655 inode_add_bytes(inode, extent_end - end); 661 inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
912 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 918 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
913 919
914 if (orig_parent != leaf->start) { 920 if (orig_parent != leaf->start) {
915 ret = btrfs_update_extent_ref(trans, root, bytenr, 921 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
916 orig_parent, leaf->start, 922 orig_parent, leaf->start,
917 root->root_key.objectid, 923 root->root_key.objectid,
918 trans->transid, inode->i_ino); 924 trans->transid, inode->i_ino);
@@ -1091,19 +1097,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1091 WARN_ON(num_pages > nrptrs); 1097 WARN_ON(num_pages > nrptrs);
1092 memset(pages, 0, sizeof(struct page *) * nrptrs); 1098 memset(pages, 0, sizeof(struct page *) * nrptrs);
1093 1099
1094 ret = btrfs_check_free_space(root, write_bytes, 0); 1100 ret = btrfs_check_data_free_space(root, inode, write_bytes);
1095 if (ret) 1101 if (ret)
1096 goto out; 1102 goto out;
1097 1103
1098 ret = prepare_pages(root, file, pages, num_pages, 1104 ret = prepare_pages(root, file, pages, num_pages,
1099 pos, first_index, last_index, 1105 pos, first_index, last_index,
1100 write_bytes); 1106 write_bytes);
1101 if (ret) 1107 if (ret) {
1108 btrfs_free_reserved_data_space(root, inode,
1109 write_bytes);
1102 goto out; 1110 goto out;
1111 }
1103 1112
1104 ret = btrfs_copy_from_user(pos, num_pages, 1113 ret = btrfs_copy_from_user(pos, num_pages,
1105 write_bytes, pages, buf); 1114 write_bytes, pages, buf);
1106 if (ret) { 1115 if (ret) {
1116 btrfs_free_reserved_data_space(root, inode,
1117 write_bytes);
1107 btrfs_drop_pages(pages, num_pages); 1118 btrfs_drop_pages(pages, num_pages);
1108 goto out; 1119 goto out;
1109 } 1120 }
@@ -1111,8 +1122,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1111 ret = dirty_and_release_pages(NULL, root, file, pages, 1122 ret = dirty_and_release_pages(NULL, root, file, pages,
1112 num_pages, pos, write_bytes); 1123 num_pages, pos, write_bytes);
1113 btrfs_drop_pages(pages, num_pages); 1124 btrfs_drop_pages(pages, num_pages);
1114 if (ret) 1125 if (ret) {
1126 btrfs_free_reserved_data_space(root, inode,
1127 write_bytes);
1115 goto out; 1128 goto out;
1129 }
1116 1130
1117 if (will_write) { 1131 if (will_write) {
1118 btrfs_fdatawrite_range(inode->i_mapping, pos, 1132 btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1136,6 +1150,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1136 } 1150 }
1137out: 1151out:
1138 mutex_unlock(&inode->i_mutex); 1152 mutex_unlock(&inode->i_mutex);
1153 if (ret)
1154 err = ret;
1139 1155
1140out_nolock: 1156out_nolock:
1141 kfree(pages); 1157 kfree(pages);
@@ -1145,6 +1161,20 @@ out_nolock:
1145 page_cache_release(pinned[1]); 1161 page_cache_release(pinned[1]);
1146 *ppos = pos; 1162 *ppos = pos;
1147 1163
1164 /*
1165 * we want to make sure fsync finds this change
1166 * but we haven't joined a transaction running right now.
1167 *
1168 * Later on, someone is sure to update the inode and get the
1169 * real transid recorded.
1170 *
 1171 * We set last_trans now to the fs_info generation + 1;
1172 * this will either be one more than the running transaction
1173 * or the generation used for the next transaction if there isn't
1174 * one running right now.
1175 */
1176 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1177
1148 if (num_written > 0 && will_write) { 1178 if (num_written > 0 && will_write) {
1149 struct btrfs_trans_handle *trans; 1179 struct btrfs_trans_handle *trans;
1150 1180
@@ -1157,8 +1187,11 @@ out_nolock:
1157 ret = btrfs_log_dentry_safe(trans, root, 1187 ret = btrfs_log_dentry_safe(trans, root,
1158 file->f_dentry); 1188 file->f_dentry);
1159 if (ret == 0) { 1189 if (ret == 0) {
1160 btrfs_sync_log(trans, root); 1190 ret = btrfs_sync_log(trans, root);
1161 btrfs_end_transaction(trans, root); 1191 if (ret == 0)
1192 btrfs_end_transaction(trans, root);
1193 else
1194 btrfs_commit_transaction(trans, root);
1162 } else { 1195 } else {
1163 btrfs_commit_transaction(trans, root); 1196 btrfs_commit_transaction(trans, root);
1164 } 1197 }
@@ -1175,6 +1208,18 @@ out_nolock:
1175 1208
1176int btrfs_release_file(struct inode *inode, struct file *filp) 1209int btrfs_release_file(struct inode *inode, struct file *filp)
1177{ 1210{
1211 /*
 1212 * ordered_data_close is set by setattr when we are about to truncate
1213 * a file from a non-zero size to a zero size. This tries to
1214 * flush down new bytes that may have been written if the
1215 * application were using truncate to replace a file in place.
1216 */
1217 if (BTRFS_I(inode)->ordered_data_close) {
1218 BTRFS_I(inode)->ordered_data_close = 0;
1219 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1220 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1221 filemap_flush(inode->i_mapping);
1222 }
1178 if (filp->private_data) 1223 if (filp->private_data)
1179 btrfs_ioctl_trans_end(filp); 1224 btrfs_ioctl_trans_end(filp);
1180 return 0; 1225 return 0;
@@ -1222,7 +1267,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1222 /* 1267 /*
1223 * ok we haven't committed the transaction yet, lets do a commit 1268 * ok we haven't committed the transaction yet, lets do a commit
1224 */ 1269 */
1225 if (file->private_data) 1270 if (file && file->private_data)
1226 btrfs_ioctl_trans_end(file); 1271 btrfs_ioctl_trans_end(file);
1227 1272
1228 trans = btrfs_start_transaction(root, 1); 1273 trans = btrfs_start_transaction(root, 1);
@@ -1231,7 +1276,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1231 goto out; 1276 goto out;
1232 } 1277 }
1233 1278
1234 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); 1279 ret = btrfs_log_dentry_safe(trans, root, dentry);
1235 if (ret < 0) 1280 if (ret < 0)
1236 goto out; 1281 goto out;
1237 1282
@@ -1245,15 +1290,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1245 * file again, but that will end up using the synchronization 1290 * file again, but that will end up using the synchronization
1246 * inside btrfs_sync_log to keep things safe. 1291 * inside btrfs_sync_log to keep things safe.
1247 */ 1292 */
1248 mutex_unlock(&file->f_dentry->d_inode->i_mutex); 1293 mutex_unlock(&dentry->d_inode->i_mutex);
1249 1294
1250 if (ret > 0) { 1295 if (ret > 0) {
1251 ret = btrfs_commit_transaction(trans, root); 1296 ret = btrfs_commit_transaction(trans, root);
1252 } else { 1297 } else {
1253 btrfs_sync_log(trans, root); 1298 ret = btrfs_sync_log(trans, root);
1254 ret = btrfs_end_transaction(trans, root); 1299 if (ret == 0)
1300 ret = btrfs_end_transaction(trans, root);
1301 else
1302 ret = btrfs_commit_transaction(trans, root);
1255 } 1303 }
1256 mutex_lock(&file->f_dentry->d_inode->i_mutex); 1304 mutex_lock(&dentry->d_inode->i_mutex);
1257out: 1305out:
1258 return ret > 0 ? EIO : ret; 1306 return ret > 0 ? EIO : ret;
1259} 1307}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
 91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
 92 * starts at the given offset and is at least bytes in size; if it's not there,
 93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
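The fuzzy flag described in the comment above turns tree_search_offset() from an exact-offset lookup into a hint-based one. Below is a simplified userspace illustration of the two behaviours over a sorted array; the kernel walks an rb-tree keyed on offset, and all names here are invented.

#include <stddef.h>
#include <stdint.h>

struct free_range { uint64_t offset, bytes; };

/* entries[] is sorted by offset and non-overlapping.
 * fuzzy == 0: only a range starting exactly at 'offset' (and big enough) matches.
 * fuzzy == 1: also accept a range that still contains 'offset', or the first
 *             big-enough range after it.
 */
static const struct free_range *
search_offset(const struct free_range *entries, size_t n,
              uint64_t offset, uint64_t bytes, int fuzzy)
{
        size_t i;

        for (i = 0; i < n; i++) {
                const struct free_range *e = &entries[i];

                /* exact hit: good enough for both modes if it is big enough */
                if (e->offset == offset && e->bytes >= bytes)
                        return e;
                if (!fuzzy)
                        continue;
                if (e->offset < offset) {
                        /* fuzzy: a range that still contains the hint counts */
                        if (e->offset + e->bytes - 1 >= offset &&
                            e->bytes >= bytes)
                                return e;
                } else if (e->offset > offset && e->bytes >= bytes) {
                        /* fuzzy: first big-enough range past the hint */
                        return e;
                }
        }
        return NULL;
}

In the hunks that follow, btrfs_add_free_space() uses the exact mode to find a neighbour starting at offset+bytes to merge with, and the fuzzy mode to find the entry to the left of the freed range.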
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while(node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
498 /* someone else has already freed it, don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while(1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least 'bytes' free and up to 'empty_size + bytes' free.
571 * We might not find them all in one contiguous area.
572 *
573 * returns zero and sets up cluster if things worked out, otherwise
574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
592 /* for metadata, allow allocations with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
596 * flushing out the delayed refs; it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while(1) {
628 /* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while(1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
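To make the min_bytes heuristic in btrfs_find_space_cluster() above concrete, here is a worked example with illustrative numbers (not taken from the patch): for bytes = 4K and empty_size = 60K, bytes + empty_size is 64K, so a metadata block group demands free-space entries of at least max(4K, 64K >> 4) = 4K in the common case, max(4K, 64K >> 1) = 32K while delayed refs are being flushed, and a data block group demands max(4K, 64K >> 2) = 16K. After every 256 window resets the minimum doubles (capped at bytes + empty_size); once it can grow no further the search gives up with -ENOSPC.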
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
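Taken together, the declarations above form the cluster allocation API. The sketch below is a hypothetical caller, not part of this patch; it assumes kernel context and omits the locking and retry policy a real allocator needs. It shows how the pieces could be strung together: allocate from an existing cluster, build a new one on a miss, and hand the extents back if the cluster turns out to be unusable.

static u64 alloc_bytes_with_cluster(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group_cache *block_group,
				    struct btrfs_free_cluster *cluster,
				    u64 search_start, u64 num_bytes,
				    u64 empty_size)
{
	u64 start;

	/* fast path: carve num_bytes out of the cluster we already have */
	start = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
					 search_start);
	if (start)
		return start;

	/* no usable cluster: try to build one from this block group */
	if (btrfs_find_space_cluster(trans, block_group, cluster,
				     search_start, num_bytes, empty_size))
		return 0;	/* -ENOSPC within this block group */

	start = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
					 search_start);
	if (!start)
		/* raced with another user: hand the extents back */
		btrfs_return_cluster_to_free_space(block_group, cluster);
	return start;
}

The non-clustered fallback is btrfs_find_space_for_alloc(), which simply carves the request out of the first free-space entry large enough to hold bytes + empty_size.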
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
73 if (!path) 73 if (!path)
74 return -ENOMEM; 74 return -ENOMEM;
75 75
76 path->leave_spinning = 1;
77
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 78 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) { 79 if (ret > 0) {
78 ret = -ENOENT; 80 ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
127 if (!path) 129 if (!path)
128 return -ENOMEM; 130 return -ENOMEM;
129 131
132 path->leave_spinning = 1;
130 ret = btrfs_insert_empty_item(trans, root, path, &key, 133 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len); 134 ins_len);
132 if (ret == -EEXIST) { 135 if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
86 86
87 btrfs_init_path(path);
88 start_found = 0; 87 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); 88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0) 89 if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f0706210a47..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -102,34 +102,6 @@ static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
102} 102}
103 103
104/* 104/*
105 * a very lame attempt at stopping writes when the FS is 85% full. There
106 * are countless ways this is incorrect, but it is better than nothing.
107 */
108int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
109 int for_del)
110{
111 u64 total;
112 u64 used;
113 u64 thresh;
114 int ret = 0;
115
116 spin_lock(&root->fs_info->delalloc_lock);
117 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
118 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
119 if (for_del)
120 thresh = total * 90;
121 else
122 thresh = total * 85;
123
124 do_div(thresh, 100);
125
126 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
127 ret = -ENOSPC;
128 spin_unlock(&root->fs_info->delalloc_lock);
129 return ret;
130}
131
132/*
133 * this does all the hard work for inserting an inline extent into 105 * this does all the hard work for inserting an inline extent into
134 * the btree. The caller should have done a btrfs_drop_extents so that 106 * the btree. The caller should have done a btrfs_drop_extents so that
135 * no overlapping inline items exist in the btree 107 * no overlapping inline items exist in the btree
@@ -162,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
162 if (!path) 134 if (!path)
163 return -ENOMEM; 135 return -ENOMEM;
164 136
137 path->leave_spinning = 1;
165 btrfs_set_trans_block_group(trans, inode); 138 btrfs_set_trans_block_group(trans, inode);
166 139
167 key.objectid = inode->i_ino; 140 key.objectid = inode->i_ino;
@@ -195,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
195 cur_size = min_t(unsigned long, compressed_size, 168 cur_size = min_t(unsigned long, compressed_size,
196 PAGE_CACHE_SIZE); 169 PAGE_CACHE_SIZE);
197 170
198 kaddr = kmap(cpage); 171 kaddr = kmap_atomic(cpage, KM_USER0);
199 write_extent_buffer(leaf, kaddr, ptr, cur_size); 172 write_extent_buffer(leaf, kaddr, ptr, cur_size);
200 kunmap(cpage); 173 kunmap_atomic(kaddr, KM_USER0);
201 174
202 i++; 175 i++;
203 ptr += cur_size; 176 ptr += cur_size;
@@ -232,7 +205,7 @@ fail:
232 * does the checks required to make sure the data is small enough 205 * does the checks required to make sure the data is small enough
233 * to fit as an inline extent. 206 * to fit as an inline extent.
234 */ 207 */
235static int cow_file_range_inline(struct btrfs_trans_handle *trans, 208static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
236 struct btrfs_root *root, 209 struct btrfs_root *root,
237 struct inode *inode, u64 start, u64 end, 210 struct inode *inode, u64 start, u64 end,
238 size_t compressed_size, 211 size_t compressed_size,
@@ -882,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
882 u64 cur_end; 855 u64 cur_end;
883 int limit = 10 * 1024 * 1042; 856 int limit = 10 * 1024 * 1042;
884 857
885 if (!btrfs_test_opt(root, COMPRESS)) {
886 return cow_file_range(inode, locked_page, start, end,
887 page_started, nr_written, 1);
888 }
889
890 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 858 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
891 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 859 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
892 while (start < end) { 860 while (start < end) {
@@ -963,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
963 * If no cow copies or snapshots exist, we write directly to the existing 931 * If no cow copies or snapshots exist, we write directly to the existing
964 * blocks on disk 932 * blocks on disk
965 */ 933 */
966static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 934static noinline int run_delalloc_nocow(struct inode *inode,
935 struct page *locked_page,
967 u64 start, u64 end, int *page_started, int force, 936 u64 start, u64 end, int *page_started, int force,
968 unsigned long *nr_written) 937 unsigned long *nr_written)
969{ 938{
@@ -1161,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1161 unsigned long *nr_written) 1130 unsigned long *nr_written)
1162{ 1131{
1163 int ret; 1132 int ret;
1133 struct btrfs_root *root = BTRFS_I(inode)->root;
1164 1134
1165 if (btrfs_test_flag(inode, NODATACOW)) 1135 if (btrfs_test_flag(inode, NODATACOW))
1166 ret = run_delalloc_nocow(inode, locked_page, start, end, 1136 ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1168,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1168 else if (btrfs_test_flag(inode, PREALLOC)) 1138 else if (btrfs_test_flag(inode, PREALLOC))
1169 ret = run_delalloc_nocow(inode, locked_page, start, end, 1139 ret = run_delalloc_nocow(inode, locked_page, start, end,
1170 page_started, 0, nr_written); 1140 page_started, 0, nr_written);
1141 else if (!btrfs_test_opt(root, COMPRESS))
1142 ret = cow_file_range(inode, locked_page, start, end,
1143 page_started, nr_written, 1);
1171 else 1144 else
1172 ret = cow_file_range_async(inode, locked_page, start, end, 1145 ret = cow_file_range_async(inode, locked_page, start, end,
1173 page_started, nr_written); 1146 page_started, nr_written);
1174
1175 return ret; 1147 return ret;
1176} 1148}
1177 1149
@@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1190 */ 1162 */
1191 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1163 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1192 struct btrfs_root *root = BTRFS_I(inode)->root; 1164 struct btrfs_root *root = BTRFS_I(inode)->root;
1165 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1193 spin_lock(&root->fs_info->delalloc_lock); 1166 spin_lock(&root->fs_info->delalloc_lock);
1194 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1167 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1195 root->fs_info->delalloc_bytes += end - start + 1; 1168 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1223 (unsigned long long)end - start + 1, 1196 (unsigned long long)end - start + 1,
1224 (unsigned long long) 1197 (unsigned long long)
1225 root->fs_info->delalloc_bytes); 1198 root->fs_info->delalloc_bytes);
1199 btrfs_delalloc_free_space(root, inode, (u64)-1);
1226 root->fs_info->delalloc_bytes = 0; 1200 root->fs_info->delalloc_bytes = 0;
1227 BTRFS_I(inode)->delalloc_bytes = 0; 1201 BTRFS_I(inode)->delalloc_bytes = 0;
1228 } else { 1202 } else {
1203 btrfs_delalloc_free_space(root, inode,
1204 end - start + 1);
1229 root->fs_info->delalloc_bytes -= end - start + 1; 1205 root->fs_info->delalloc_bytes -= end - start + 1;
1230 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1231 } 1207 }
@@ -1477,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1477 path = btrfs_alloc_path(); 1453 path = btrfs_alloc_path();
1478 BUG_ON(!path); 1454 BUG_ON(!path);
1479 1455
1456 path->leave_spinning = 1;
1480 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1481 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, file_pos, &hint);
1482 BUG_ON(ret); 1459 BUG_ON(ret);
@@ -1499,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1499 btrfs_set_file_extent_compression(leaf, fi, compression); 1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1500 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1501 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479
1480 btrfs_unlock_up_safe(path, 1);
1481 btrfs_set_lock_blocking(leaf);
1482
1502 btrfs_mark_buffer_dirty(leaf); 1483 btrfs_mark_buffer_dirty(leaf);
1503 1484
1504 inode_add_bytes(inode, num_bytes); 1485 inode_add_bytes(inode, num_bytes);
@@ -1511,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1511 root->root_key.objectid, 1492 root->root_key.objectid,
1512 trans->transid, inode->i_ino, &ins); 1493 trans->transid, inode->i_ino, &ins);
1513 BUG_ON(ret); 1494 BUG_ON(ret);
1514
1515 btrfs_free_path(path); 1495 btrfs_free_path(path);
1496
1516 return 0; 1497 return 0;
1517} 1498}
1518 1499
1500/*
1501 * helper function for btrfs_finish_ordered_io, this
1502 * just reads in some of the csum leaves to prime them into ram
1503 * before we start the transaction. It limits the amount of btree
1504 * reads required while inside the transaction.
1505 */
1506static noinline void reada_csum(struct btrfs_root *root,
1507 struct btrfs_path *path,
1508 struct btrfs_ordered_extent *ordered_extent)
1509{
1510 struct btrfs_ordered_sum *sum;
1511 u64 bytenr;
1512
1513 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1514 list);
1515 bytenr = sum->sums[0].bytenr;
1516
1517 /*
1518 * we don't care about the results, the point of this search is
1519 * just to get the btree leaves into ram
1520 */
1521 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1522}
1523
1519/* as ordered data IO finishes, this gets called so we can finish 1524/* as ordered data IO finishes, this gets called so we can finish
1520 * an ordered extent if the range of bytes in the file it covers are 1525 * an ordered extent if the range of bytes in the file it covers are
1521 * fully written. 1526 * fully written.
@@ -1524,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1524{ 1529{
1525 struct btrfs_root *root = BTRFS_I(inode)->root; 1530 struct btrfs_root *root = BTRFS_I(inode)->root;
1526 struct btrfs_trans_handle *trans; 1531 struct btrfs_trans_handle *trans;
1527 struct btrfs_ordered_extent *ordered_extent; 1532 struct btrfs_ordered_extent *ordered_extent = NULL;
1528 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1533 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1534 struct btrfs_path *path;
1529 int compressed = 0; 1535 int compressed = 0;
1530 int ret; 1536 int ret;
1531 1537
@@ -1533,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1533 if (!ret) 1539 if (!ret)
1534 return 0; 1540 return 0;
1535 1541
1542 /*
1543 * before we join the transaction, try to do some of our IO.
1544 * This will limit the amount of IO that we have to do with
1545 * the transaction running. We're unlikely to need to do any
1546 * IO if the file extents are new; the disk_i_size check
1547 * covers the most common case.
1548 */
1549 if (start < BTRFS_I(inode)->disk_i_size) {
1550 path = btrfs_alloc_path();
1551 if (path) {
1552 ret = btrfs_lookup_file_extent(NULL, root, path,
1553 inode->i_ino,
1554 start, 0);
1555 ordered_extent = btrfs_lookup_ordered_extent(inode,
1556 start);
1557 if (!list_empty(&ordered_extent->list)) {
1558 btrfs_release_path(root, path);
1559 reada_csum(root, path, ordered_extent);
1560 }
1561 btrfs_free_path(path);
1562 }
1563 }
1564
1536 trans = btrfs_join_transaction(root, 1); 1565 trans = btrfs_join_transaction(root, 1);
1537 1566
1538 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1567 if (!ordered_extent)
1568 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1539 BUG_ON(!ordered_extent); 1569 BUG_ON(!ordered_extent);
1540 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1570 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1541 goto nocow; 1571 goto nocow;
@@ -2125,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2125 2155
2126 path = btrfs_alloc_path(); 2156 path = btrfs_alloc_path();
2127 BUG_ON(!path); 2157 BUG_ON(!path);
2158 path->leave_spinning = 1;
2128 ret = btrfs_lookup_inode(trans, root, path, 2159 ret = btrfs_lookup_inode(trans, root, path,
2129 &BTRFS_I(inode)->location, 1); 2160 &BTRFS_I(inode)->location, 1);
2130 if (ret) { 2161 if (ret) {
@@ -2171,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2171 goto err; 2202 goto err;
2172 } 2203 }
2173 2204
2205 path->leave_spinning = 1;
2174 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2206 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2175 name, name_len, -1); 2207 name, name_len, -1);
2176 if (IS_ERR(di)) { 2208 if (IS_ERR(di)) {
@@ -2214,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2214 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2215 inode, dir->i_ino); 2247 inode, dir->i_ino);
2216 BUG_ON(ret != 0 && ret != -ENOENT); 2248 BUG_ON(ret != 0 && ret != -ENOENT);
2217 if (ret != -ENOENT)
2218 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2219 2249
2220 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2250 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2221 dir, index); 2251 dir, index);
@@ -2245,13 +2275,12 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2245 2275
2246 root = BTRFS_I(dir)->root; 2276 root = BTRFS_I(dir)->root;
2247 2277
2248 ret = btrfs_check_free_space(root, 1, 1);
2249 if (ret)
2250 goto fail;
2251
2252 trans = btrfs_start_transaction(root, 1); 2278 trans = btrfs_start_transaction(root, 1);
2253 2279
2254 btrfs_set_trans_block_group(trans, dir); 2280 btrfs_set_trans_block_group(trans, dir);
2281
2282 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283
2255 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2284 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2256 dentry->d_name.name, dentry->d_name.len); 2285 dentry->d_name.name, dentry->d_name.len);
2257 2286
@@ -2261,7 +2290,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2261 nr = trans->blocks_used; 2290 nr = trans->blocks_used;
2262 2291
2263 btrfs_end_transaction_throttle(trans, root); 2292 btrfs_end_transaction_throttle(trans, root);
2264fail:
2265 btrfs_btree_balance_dirty(root, nr); 2293 btrfs_btree_balance_dirty(root, nr);
2266 return ret; 2294 return ret;
2267} 2295}
@@ -2284,10 +2312,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2284 return -ENOTEMPTY; 2312 return -ENOTEMPTY;
2285 } 2313 }
2286 2314
2287 ret = btrfs_check_free_space(root, 1, 1);
2288 if (ret)
2289 goto fail;
2290
2291 trans = btrfs_start_transaction(root, 1); 2315 trans = btrfs_start_transaction(root, 1);
2292 btrfs_set_trans_block_group(trans, dir); 2316 btrfs_set_trans_block_group(trans, dir);
2293 2317
@@ -2304,7 +2328,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2304fail_trans: 2328fail_trans:
2305 nr = trans->blocks_used; 2329 nr = trans->blocks_used;
2306 ret = btrfs_end_transaction_throttle(trans, root); 2330 ret = btrfs_end_transaction_throttle(trans, root);
2307fail:
2308 btrfs_btree_balance_dirty(root, nr); 2331 btrfs_btree_balance_dirty(root, nr);
2309 2332
2310 if (ret && !err) 2333 if (ret && !err)
@@ -2531,9 +2554,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2531 key.offset = (u64)-1; 2554 key.offset = (u64)-1;
2532 key.type = (u8)-1; 2555 key.type = (u8)-1;
2533 2556
2534 btrfs_init_path(path);
2535
2536search_again: 2557search_again:
2558 path->leave_spinning = 1;
2537 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2559 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2538 if (ret < 0) 2560 if (ret < 0)
2539 goto error; 2561 goto error;
@@ -2680,6 +2702,7 @@ delete:
2680 break; 2702 break;
2681 } 2703 }
2682 if (found_extent) { 2704 if (found_extent) {
2705 btrfs_set_path_blocking(path);
2683 ret = btrfs_free_extent(trans, root, extent_start, 2706 ret = btrfs_free_extent(trans, root, extent_start,
2684 extent_num_bytes, 2707 extent_num_bytes,
2685 leaf->start, root_owner, 2708 leaf->start, root_owner,
@@ -2820,7 +2843,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2820 if (size <= hole_start) 2843 if (size <= hole_start)
2821 return 0; 2844 return 0;
2822 2845
2823 err = btrfs_check_free_space(root, 1, 0); 2846 err = btrfs_check_metadata_free_space(root);
2824 if (err) 2847 if (err)
2825 return err; 2848 return err;
2826 2849
@@ -2884,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2884 if (err) 2907 if (err)
2885 return err; 2908 return err;
2886 2909
2887 if (S_ISREG(inode->i_mode) && 2910 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2888 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2911 if (attr->ia_size > inode->i_size) {
2889 err = btrfs_cont_expand(inode, attr->ia_size); 2912 err = btrfs_cont_expand(inode, attr->ia_size);
2890 if (err) 2913 if (err)
2891 return err; 2914 return err;
2915 } else if (inode->i_size > 0 &&
2916 attr->ia_size == 0) {
2917
2918 /* we're truncating a file that used to have good
2919 * data down to zero. Make sure it gets into
2920 * the ordered flush list so that any new writes
2921 * get down to disk quickly.
2922 */
2923 BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2892 } 2925 }
2893 2926
2894 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
@@ -3016,16 +3049,18 @@ static noinline void init_btrfs_i(struct inode *inode)
3016 bi->last_trans = 0; 3049 bi->last_trans = 0;
3017 bi->logged_trans = 0; 3050 bi->logged_trans = 0;
3018 bi->delalloc_bytes = 0; 3051 bi->delalloc_bytes = 0;
3052 bi->reserved_bytes = 0;
3019 bi->disk_i_size = 0; 3053 bi->disk_i_size = 0;
3020 bi->flags = 0; 3054 bi->flags = 0;
3021 bi->index_cnt = (u64)-1; 3055 bi->index_cnt = (u64)-1;
3022 bi->log_dirty_trans = 0; 3056 bi->last_unlink_trans = 0;
3023 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3057 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3024 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3058 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3025 inode->i_mapping, GFP_NOFS); 3059 inode->i_mapping, GFP_NOFS);
3026 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3027 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
3028 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3029 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3030 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
3031 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3037,6 +3072,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
3037 inode->i_ino = args->ino; 3072 inode->i_ino = args->ino;
3038 init_btrfs_i(inode); 3073 init_btrfs_i(inode);
3039 BTRFS_I(inode)->root = args->root; 3074 BTRFS_I(inode)->root = args->root;
3075 btrfs_set_inode_space_info(args->root, inode);
3040 return 0; 3076 return 0;
3041} 3077}
3042 3078
@@ -3445,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3445 3481
3446 if (dir) { 3482 if (dir) {
3447 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3448 if (ret) 3484 if (ret) {
3485 iput(inode);
3449 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3450 } 3488 }
3451 /* 3489 /*
3452 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3457,6 +3495,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3457 BTRFS_I(inode)->index_cnt = 2; 3495 BTRFS_I(inode)->index_cnt = 2;
3458 BTRFS_I(inode)->root = root; 3496 BTRFS_I(inode)->root = root;
3459 BTRFS_I(inode)->generation = trans->transid; 3497 BTRFS_I(inode)->generation = trans->transid;
3498 btrfs_set_inode_space_info(root, inode);
3460 3499
3461 if (mode & S_IFDIR) 3500 if (mode & S_IFDIR)
3462 owner = 0; 3501 owner = 0;
@@ -3482,6 +3521,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3482 sizes[0] = sizeof(struct btrfs_inode_item); 3521 sizes[0] = sizeof(struct btrfs_inode_item);
3483 sizes[1] = name_len + sizeof(*ref); 3522 sizes[1] = name_len + sizeof(*ref);
3484 3523
3524 path->leave_spinning = 1;
3485 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3525 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3486 if (ret != 0) 3526 if (ret != 0)
3487 goto fail; 3527 goto fail;
@@ -3527,6 +3567,7 @@ fail:
3527 if (dir) 3567 if (dir)
3528 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3529 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3530 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3531} 3572}
3532 3573
@@ -3604,7 +3645,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3604 if (!new_valid_dev(rdev)) 3645 if (!new_valid_dev(rdev))
3605 return -EINVAL; 3646 return -EINVAL;
3606 3647
3607 err = btrfs_check_free_space(root, 1, 0); 3648 err = btrfs_check_metadata_free_space(root);
3608 if (err) 3649 if (err)
3609 goto fail; 3650 goto fail;
3610 3651
@@ -3667,7 +3708,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3667 u64 objectid; 3708 u64 objectid;
3668 u64 index = 0; 3709 u64 index = 0;
3669 3710
3670 err = btrfs_check_free_space(root, 1, 0); 3711 err = btrfs_check_metadata_free_space(root);
3671 if (err) 3712 if (err)
3672 goto fail; 3713 goto fail;
3673 trans = btrfs_start_transaction(root, 1); 3714 trans = btrfs_start_transaction(root, 1);
@@ -3735,7 +3776,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3735 return -ENOENT; 3776 return -ENOENT;
3736 3777
3737 btrfs_inc_nlink(inode); 3778 btrfs_inc_nlink(inode);
3738 err = btrfs_check_free_space(root, 1, 0); 3779 err = btrfs_check_metadata_free_space(root);
3739 if (err) 3780 if (err)
3740 goto fail; 3781 goto fail;
3741 err = btrfs_set_inode_index(dir, &index); 3782 err = btrfs_set_inode_index(dir, &index);
@@ -3760,6 +3801,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3760 drop_inode = 1; 3801 drop_inode = 1;
3761 3802
3762 nr = trans->blocks_used; 3803 nr = trans->blocks_used;
3804
3805 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3763 btrfs_end_transaction_throttle(trans, root); 3806 btrfs_end_transaction_throttle(trans, root);
3764fail: 3807fail:
3765 if (drop_inode) { 3808 if (drop_inode) {
@@ -3781,7 +3824,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3781 u64 index = 0; 3824 u64 index = 0;
3782 unsigned long nr = 1; 3825 unsigned long nr = 1;
3783 3826
3784 err = btrfs_check_free_space(root, 1, 0); 3827 err = btrfs_check_metadata_free_space(root);
3785 if (err) 3828 if (err)
3786 goto out_unlock; 3829 goto out_unlock;
3787 3830
@@ -4263,7 +4306,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4263{ 4306{
4264 if (PageWriteback(page) || PageDirty(page)) 4307 if (PageWriteback(page) || PageDirty(page))
4265 return 0; 4308 return 0;
4266 return __btrfs_releasepage(page, gfp_flags); 4309 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
4267} 4310}
4268 4311
4269static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4312static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4325,8 +4368,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4325 * beyond EOF, then the page is guaranteed safe against truncation until we 4368 * beyond EOF, then the page is guaranteed safe against truncation until we
4326 * unlock the page. 4369 * unlock the page.
4327 */ 4370 */
4328int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4371int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4329{ 4372{
4373 struct page *page = vmf->page;
4330 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4374 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4331 struct btrfs_root *root = BTRFS_I(inode)->root; 4375 struct btrfs_root *root = BTRFS_I(inode)->root;
4332 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4376 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4338,11 +4382,16 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4338 u64 page_start; 4382 u64 page_start;
4339 u64 page_end; 4383 u64 page_end;
4340 4384
4341 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); 4385 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4342 if (ret) 4386 if (ret) {
4387 if (ret == -ENOMEM)
4388 ret = VM_FAULT_OOM;
4389 else /* -ENOSPC, -EIO, etc */
4390 ret = VM_FAULT_SIGBUS;
4343 goto out; 4391 goto out;
4392 }
4344 4393
4345 ret = -EINVAL; 4394 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4346again: 4395again:
4347 lock_page(page); 4396 lock_page(page);
4348 size = i_size_read(inode); 4397 size = i_size_read(inode);
@@ -4351,6 +4400,7 @@ again:
4351 4400
4352 if ((page->mapping != inode->i_mapping) || 4401 if ((page->mapping != inode->i_mapping) ||
4353 (page_start >= size)) { 4402 (page_start >= size)) {
4403 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4354 /* page got truncated out from underneath us */ 4404 /* page got truncated out from underneath us */
4355 goto out_unlock; 4405 goto out_unlock;
4356 } 4406 }
@@ -4389,6 +4439,8 @@ again:
4389 } 4439 }
4390 ClearPageChecked(page); 4440 ClearPageChecked(page);
4391 set_page_dirty(page); 4441 set_page_dirty(page);
4442
4443 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4392 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4444 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4393 4445
4394out_unlock: 4446out_unlock:
@@ -4414,6 +4466,27 @@ static void btrfs_truncate(struct inode *inode)
4414 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4466 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4415 4467
4416 trans = btrfs_start_transaction(root, 1); 4468 trans = btrfs_start_transaction(root, 1);
4469
4470 /*
4471 * setattr is responsible for setting the ordered_data_close flag,
4472 * but that is only tested during the last file release. That
4473 * could happen well after the next commit, leaving a great big
4474 * window where new writes may get lost if someone chooses to write
4475 * to this file after truncating to zero
4476 *
4477 * The inode doesn't have any dirty data here, and so if we commit
4478 * this is a noop. If someone immediately starts writing to the inode
4479 * it is very likely we'll catch some of their writes in this
4480 * transaction, and the commit will find this file on the ordered
4481 * data list with good things to send down.
4482 *
4483 * This is a best-effort solution; there is still a window where
4484 * using truncate to replace the contents of the file will
4485 * end up with a zero length file after a crash.
4486 */
4487 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4488 btrfs_add_ordered_operation(trans, root, inode);
4489
4417 btrfs_set_trans_block_group(trans, inode); 4490 btrfs_set_trans_block_group(trans, inode);
4418 btrfs_i_size_write(inode, inode->i_size); 4491 btrfs_i_size_write(inode, inode->i_size);
4419 4492
@@ -4490,12 +4563,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4490 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4563 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4491 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4564 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4492 INIT_LIST_HEAD(&ei->i_orphan); 4565 INIT_LIST_HEAD(&ei->i_orphan);
4566 INIT_LIST_HEAD(&ei->ordered_operations);
4493 return &ei->vfs_inode; 4567 return &ei->vfs_inode;
4494} 4568}
4495 4569
4496void btrfs_destroy_inode(struct inode *inode) 4570void btrfs_destroy_inode(struct inode *inode)
4497{ 4571{
4498 struct btrfs_ordered_extent *ordered; 4572 struct btrfs_ordered_extent *ordered;
4573 struct btrfs_root *root = BTRFS_I(inode)->root;
4574
4499 WARN_ON(!list_empty(&inode->i_dentry)); 4575 WARN_ON(!list_empty(&inode->i_dentry));
4500 WARN_ON(inode->i_data.nrpages); 4576 WARN_ON(inode->i_data.nrpages);
4501 4577
@@ -4506,13 +4582,24 @@ void btrfs_destroy_inode(struct inode *inode)
4506 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4582 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4507 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4583 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4508 4584
4509 spin_lock(&BTRFS_I(inode)->root->list_lock); 4585 /*
4586 * Make sure we're properly removed from the ordered operation
4587 * lists.
4588 */
4589 smp_mb();
4590 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4591 spin_lock(&root->fs_info->ordered_extent_lock);
4592 list_del_init(&BTRFS_I(inode)->ordered_operations);
4593 spin_unlock(&root->fs_info->ordered_extent_lock);
4594 }
4595
4596 spin_lock(&root->list_lock);
4510 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4597 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4511 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4598 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4512 " list\n", inode->i_ino); 4599 " list\n", inode->i_ino);
4513 dump_stack(); 4600 dump_stack();
4514 } 4601 }
4515 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4602 spin_unlock(&root->list_lock);
4516 4603
4517 while (1) { 4604 while (1) {
4518 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4605 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4633,12 +4720,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4633 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 4720 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4634 return -EXDEV; 4721 return -EXDEV;
4635 4722
4636 ret = btrfs_check_free_space(root, 1, 0); 4723 ret = btrfs_check_metadata_free_space(root);
4637 if (ret) 4724 if (ret)
4638 goto out_unlock; 4725 goto out_unlock;
4639 4726
4727 /*
4728 * we're using rename to replace one file with another,
4729 * and the replacement file is large. Start IO on it now so
4730 * we don't add too much work to the end of the transaction
4731 */
4732 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4733 new_inode->i_size &&
4734 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4735 filemap_flush(old_inode->i_mapping);
4736
4640 trans = btrfs_start_transaction(root, 1); 4737 trans = btrfs_start_transaction(root, 1);
4641 4738
4739 /*
4740 * make sure the inode gets flushed if it is replacing
4741 * something.
4742 */
4743 if (new_inode && new_inode->i_size &&
4744 old_inode && S_ISREG(old_inode->i_mode)) {
4745 btrfs_add_ordered_operation(trans, root, old_inode);
4746 }
4747
4748 /*
4749 * this is an ugly little race, but the rename is required to make
4750 * sure that if we crash, the inode is either at the old name
4751 * or the new one. pinning the log transaction lets us make sure
4752 * we don't allow a log commit to come in after we unlink the
4753 * name but before we add the new name back in.
4754 */
4755 btrfs_pin_log_trans(root);
4756
4642 btrfs_set_trans_block_group(trans, new_dir); 4757 btrfs_set_trans_block_group(trans, new_dir);
4643 4758
4644 btrfs_inc_nlink(old_dentry->d_inode); 4759 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4646,6 +4761,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4646 new_dir->i_ctime = new_dir->i_mtime = ctime; 4761 new_dir->i_ctime = new_dir->i_mtime = ctime;
4647 old_inode->i_ctime = ctime; 4762 old_inode->i_ctime = ctime;
4648 4763
4764 if (old_dentry->d_parent != new_dentry->d_parent)
4765 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4766
4649 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4767 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4650 old_dentry->d_name.name, 4768 old_dentry->d_name.name,
4651 old_dentry->d_name.len); 4769 old_dentry->d_name.len);
@@ -4677,7 +4795,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4677 if (ret) 4795 if (ret)
4678 goto out_fail; 4796 goto out_fail;
4679 4797
4798 btrfs_log_new_name(trans, old_inode, old_dir,
4799 new_dentry->d_parent);
4680out_fail: 4800out_fail:
4801
4802 /* this btrfs_end_log_trans just allows the current
4803 * log-sub transaction to complete
4804 */
4805 btrfs_end_log_trans(root);
4681 btrfs_end_transaction_throttle(trans, root); 4806 btrfs_end_transaction_throttle(trans, root);
4682out_unlock: 4807out_unlock:
4683 return ret; 4808 return ret;
@@ -4751,7 +4876,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4751 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 4876 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4752 return -ENAMETOOLONG; 4877 return -ENAMETOOLONG;
4753 4878
4754 err = btrfs_check_free_space(root, 1, 0); 4879 err = btrfs_check_metadata_free_space(root);
4755 if (err) 4880 if (err)
4756 goto out_fail; 4881 goto out_fail;
4757 4882
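A pattern worth noting in the inode.c changes above: every data-space reservation is paired with a release on the failure path, and the delalloc set/clear bit hooks do their own reserve/free bookkeeping once a range is tagged. The helper below is a minimal sketch of that pairing; it is a hypothetical function, not part of the patch, and the name and -EAGAIN convention are illustrative only.

static int reserve_data_for_page(struct btrfs_root *root, struct inode *inode,
				 struct page *page)
{
	int ret;

	/* claim one page worth of data space before dirtying anything */
	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
	if (ret)
		return ret;	/* typically -ENOSPC */

	if (page->mapping != inode->i_mapping) {
		/* the page was truncated away underneath us: undo the claim */
		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
		return -EAGAIN;
	}

	/*
	 * once the range is marked delalloc, the bit hooks shown earlier
	 * (btrfs_delalloc_reserve_space on set, btrfs_delalloc_free_space
	 * on clear) handle the bookkeeping for it
	 */
	return 0;
}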
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 988fdc8b49eb..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
70 u64 index = 0; 70 u64 index = 0;
71 unsigned long nr = 1; 71 unsigned long nr = 1;
72 72
73 ret = btrfs_check_free_space(root, 1, 0); 73 ret = btrfs_check_metadata_free_space(root);
74 if (ret) 74 if (ret)
75 goto fail_commit; 75 goto fail_commit;
76 76
@@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
203 if (!root->ref_cows) 203 if (!root->ref_cows)
204 return -EINVAL; 204 return -EINVAL;
205 205
206 ret = btrfs_check_free_space(root, 1, 0); 206 ret = btrfs_check_metadata_free_space(root);
207 if (ret) 207 if (ret)
208 goto fail_unlock; 208 goto fail_unlock;
209 209
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
@@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
374 unsigned long i; 374 unsigned long i;
375 int ret; 375 int ret;
376 376
377 ret = btrfs_check_free_space(root, inode->i_size, 0); 377 ret = btrfs_check_data_free_space(root, inode, inode->i_size);
378 if (ret) 378 if (ret)
379 return -ENOSPC; 379 return -ENOSPC;
380 380
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe9385129b..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
25#include "extent_io.h" 25#include "extent_io.h"
26#include "locking.h" 26#include "locking.h"
27 27
28/*
29 * btrfs_header_level() isn't free, so don't call it when lockdep isn't
30 * on
31 */
32#ifdef CONFIG_DEBUG_LOCK_ALLOC
33static inline void spin_nested(struct extent_buffer *eb)
34{
35 spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
36}
37#else
38static inline void spin_nested(struct extent_buffer *eb) 28static inline void spin_nested(struct extent_buffer *eb)
39{ 29{
40 spin_lock(&eb->lock); 30 spin_lock(&eb->lock);
41} 31}
42#endif
43 32
44/* 33/*
45 * Setting a lock to blocking will drop the spinlock and set the 34 * Setting a lock to blocking will drop the spinlock and set the
@@ -71,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71 60
72/* 61/*
73 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
74 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
75 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
76 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
77 * 66 *
78 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
@@ -82,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
82static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
83{ 72{
84 int i; 73 int i;
74
85 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
86 cpu_relax();
87 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
88 return 1; 77 return 1;
89 if (need_resched()) 78 if (need_resched())
90 break; 79 break;
80 cpu_relax();
91 } 81 }
92 return 0; 82 return 0;
93} 83}
@@ -106,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
106{ 96{
107 int i; 97 int i;
108 98
109 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
110 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
111 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112 spin_unlock(&eb->lock); 102 return 1;
113 103 spin_unlock(&eb->lock);
104 }
114 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
115 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
116 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
117 break; 109 break;
118 110
@@ -159,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
159 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
160 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
161 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
162 while(1) { 157 while(1) {
163 spin_nested(eb); 158 spin_nested(eb);
164 159
@@ -176,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
176 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
177 * loop around 172 * loop around
178 */ 173 */
174 cpu_relax();
179 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
180 continue; 176 continue;
181 177sleep:
182 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
183 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
184 180
@@ -231,8 +227,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
231 return 0; 227 return 0;
232} 228}
233 229
234int btrfs_tree_locked(struct extent_buffer *eb) 230void btrfs_assert_tree_locked(struct extent_buffer *eb)
235{ 231{
236 return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || 232 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
237 spin_is_locked(&eb->lock); 233 assert_spin_locked(&eb->lock);
238} 234}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6bb0afbff928..6c4ce457168c 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,11 +21,11 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25 24
26int btrfs_try_tree_lock(struct extent_buffer *eb); 25int btrfs_try_tree_lock(struct extent_buffer *eb);
27int btrfs_try_spin_lock(struct extent_buffer *eb); 26int btrfs_try_spin_lock(struct extent_buffer *eb);
28 27
29void btrfs_set_lock_blocking(struct extent_buffer *eb); 28void btrfs_set_lock_blocking(struct extent_buffer *eb);
30void btrfs_clear_lock_blocking(struct extent_buffer *eb); 29void btrfs_clear_lock_blocking(struct extent_buffer *eb);
30void btrfs_assert_tree_locked(struct extent_buffer *eb);
31#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered extents
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
387 * we have two modes here: one is to just start the IO via filemap_flush,
388 * and the other is to wait for all the IO. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc38..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -379,13 +390,47 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
379 btrfs_start_delalloc_inodes(root); 390 btrfs_start_delalloc_inodes(root);
380 btrfs_wait_ordered_extents(root, 0); 391 btrfs_wait_ordered_extents(root, 0);
381 392
382 btrfs_clean_old_snapshots(root);
383 trans = btrfs_start_transaction(root, 1); 393 trans = btrfs_start_transaction(root, 1);
384 ret = btrfs_commit_transaction(trans, root); 394 ret = btrfs_commit_transaction(trans, root);
385 sb->s_dirt = 0; 395 sb->s_dirt = 0;
386 return ret; 396 return ret;
387} 397}
388 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
389static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
390{ 435{
391 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -511,6 +556,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511 struct btrfs_root *root = btrfs_sb(sb); 556 struct btrfs_root *root = btrfs_sb(sb);
512 int ret; 557 int ret;
513 558
559 ret = btrfs_parse_options(root, data);
560 if (ret)
561 return -EINVAL;
562
514 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 563 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
515 return 0; 564 return 0;
516 565
@@ -627,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
627 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
628 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
629 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
630 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
631 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
632 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
633 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
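
The super.c hunks add two mount options (notreelog, flushoncommit) to the token table and replace generic_show_options with btrfs_show_options(), which reports whatever is currently set. The fragment below is a plain userspace model of that flag-style option handling; parse_opts(), show_opts() and the OPT_* names are made up for illustration and only show how a comma-separated option string becomes bits that a show-options pass can print back out.

#include <stdio.h>
#include <string.h>

#define OPT_NOTREELOG     (1 << 0)
#define OPT_FLUSHONCOMMIT (1 << 1)

/* turn a comma-separated option string into a bitmask of flags */
static unsigned long parse_opts(char *options)
{
        unsigned long opts = 0;
        char *tok;

        for (tok = strtok(options, ","); tok; tok = strtok(NULL, ",")) {
                if (strcmp(tok, "notreelog") == 0)
                        opts |= OPT_NOTREELOG;
                else if (strcmp(tok, "flushoncommit") == 0)
                        opts |= OPT_FLUSHONCOMMIT;
                /* anything else is simply skipped in this sketch */
        }
        return opts;
}

/* the show_options counterpart prints back the bits that are set */
static void show_opts(unsigned long opts)
{
        if (opts & OPT_NOTREELOG)
                printf(",notreelog");
        if (opts & OPT_FLUSHONCOMMIT)
                printf(",flushoncommit");
        printf("\n");
}

int main(void)
{
        char options[] = "notreelog,flushoncommit";

        show_opts(parse_opts(options));
        return 0;
}

Worth noting: the show_options hunk above prints the hyphenated strings ",no-treelog" and ",flush-on-commit" even though the parse table accepts "notreelog" and "flushoncommit", so the reported options are not directly round-trippable back into a mount command.
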
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de5c9a..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 63 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 64 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 65 cur_trans->start_time = get_seconds();
66
67 cur_trans->delayed_refs.root.rb_node = NULL;
68 cur_trans->delayed_refs.num_entries = 0;
69 cur_trans->delayed_refs.num_heads_ready = 0;
70 cur_trans->delayed_refs.num_heads = 0;
71 cur_trans->delayed_refs.flushing = 0;
72 cur_trans->delayed_refs.run_delayed_start = 0;
73 spin_lock_init(&cur_trans->delayed_refs.lock);
74
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 75 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 76 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 77 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 189 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 190 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 191 h->alloc_exclude_start = 0;
192 h->delayed_ref_updates = 0;
193
185 root->fs_info->running_transaction->use_count++; 194 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 195 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 196 return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 280 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 281 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 282 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 283 throttle_on_drops(root);
276} 284}
277 285
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 288{
281 struct btrfs_transaction *cur_trans; 289 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 290 struct btrfs_fs_info *info = root->fs_info;
291 int count = 0;
292
293 while (count < 4) {
294 unsigned long cur = trans->delayed_ref_updates;
295 trans->delayed_ref_updates = 0;
296 if (cur &&
297 trans->transaction->delayed_refs.num_heads_ready > 64) {
298 trans->delayed_ref_updates = 0;
299
300 /*
301 * do a full flush if the transaction is trying
302 * to close
303 */
304 if (trans->transaction->delayed_refs.flushing)
305 cur = 0;
306 btrfs_run_delayed_refs(trans, root, cur);
307 } else {
308 break;
309 }
310 count++;
311 }
283 312
284 mutex_lock(&info->trans_mutex); 313 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 314 cur_trans = info->running_transaction;
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 453 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 454 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 455
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 456 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 457
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
430 460
431 while (1) { 461 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 462 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 468 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 469 btrfs_set_root_generation(&root->root_item, trans->transid);
440 470
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 471 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 472 &root->root_key,
445 &root->root_item); 473 &root->root_item);
446 BUG_ON(ret); 474 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 475 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 476
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
478 BUG_ON(ret);
449 } 479 }
450 return 0; 480 return 0;
451} 481}
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 489 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 490 struct list_head *next;
461 struct extent_buffer *eb; 491 struct extent_buffer *eb;
492 int ret;
462 493
463 btrfs_extent_post_op(trans, fs_info->tree_root); 494 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
495 BUG_ON(ret);
464 496
465 eb = btrfs_lock_root_node(fs_info->tree_root); 497 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 498 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 499 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 500 free_extent_buffer(eb);
469 501
470 btrfs_extent_post_op(trans, fs_info->tree_root); 502 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
503 BUG_ON(ret);
471 504
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 505 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 506 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 508 root = list_entry(next, struct btrfs_root, dirty_list);
476 509
477 update_cowonly_root(trans, root); 510 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
478 } 514 }
479 return 0; 515 return 0;
480} 516}
@@ -635,6 +671,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 671}
636 672
637/* 673/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current
676 * queue of delayed refs out.
677 *
678 * This is used by the drop snapshot code only
679 */
680static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
681{
682 DEFINE_WAIT(wait);
683
684 mutex_lock(&info->trans_mutex);
685 while (info->running_transaction &&
686 info->running_transaction->delayed_refs.flushing) {
687 prepare_to_wait(&info->transaction_wait, &wait,
688 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex);
690 schedule();
691 mutex_lock(&info->trans_mutex);
692 finish_wait(&info->transaction_wait, &wait);
693 }
694 mutex_unlock(&info->trans_mutex);
695 return 0;
696}
697
698/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 699 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 700 * all of them
640 */ 701 */
@@ -661,7 +722,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 722 atomic_inc(&root->fs_info->throttles);
662 723
663 while (1) { 724 while (1) {
725 /*
726 * we don't want to jump in and create a bunch of
727 * delayed refs if the transaction is starting to close
728 */
729 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 730 trans = btrfs_start_transaction(tree_root, 1);
731
732 /*
733 * we've joined a transaction, make sure it isn't
734 * closing right now
735 */
736 if (trans->transaction->delayed_refs.flushing) {
737 btrfs_end_transaction(trans, tree_root);
738 continue;
739 }
740
665 mutex_lock(&root->fs_info->drop_mutex); 741 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 742 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 743 if (ret != -EAGAIN)
@@ -688,7 +764,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
688 num_bytes -= btrfs_root_used(&dirty->root->root_item); 764 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item); 765 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) { 766 if (num_bytes) {
767 mutex_lock(&root->fs_info->trans_mutex);
691 btrfs_record_root_in_trans(root); 768 btrfs_record_root_in_trans(root);
769 mutex_unlock(&root->fs_info->trans_mutex);
692 btrfs_set_root_used(&root->root_item, 770 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes); 771 bytes_used - num_bytes);
694 } 772 }
@@ -764,7 +842,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
764 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 842 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
765 843
766 old = btrfs_lock_root_node(root); 844 old = btrfs_lock_root_node(root);
767 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 845 btrfs_cow_block(trans, root, old, NULL, 0, &old);
768 846
769 btrfs_copy_root(trans, root, old, &tmp, objectid); 847 btrfs_copy_root(trans, root, old, &tmp, objectid);
770 btrfs_tree_unlock(old); 848 btrfs_tree_unlock(old);
@@ -892,12 +970,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
892 struct extent_io_tree *pinned_copy; 970 struct extent_io_tree *pinned_copy;
893 DEFINE_WAIT(wait); 971 DEFINE_WAIT(wait);
894 int ret; 972 int ret;
973 int should_grow = 0;
974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
976
977 btrfs_run_ordered_operations(root, 0);
978
979 /* make a pass through all the delayed refs we have so far;
980 * any running procs may add more while we are here
981 */
982 ret = btrfs_run_delayed_refs(trans, root, 0);
983 BUG_ON(ret);
984
985 cur_trans = trans->transaction;
986 /*
987 * set the flushing flag so procs in this transaction have to
988 * start sending their work down.
989 */
990 cur_trans->delayed_refs.flushing = 1;
991
992 ret = btrfs_run_delayed_refs(trans, root, 0);
993 BUG_ON(ret);
895 994
896 INIT_LIST_HEAD(&dirty_fs_roots);
897 mutex_lock(&root->fs_info->trans_mutex); 995 mutex_lock(&root->fs_info->trans_mutex);
898 if (trans->transaction->in_commit) { 996 INIT_LIST_HEAD(&dirty_fs_roots);
899 cur_trans = trans->transaction; 997 if (cur_trans->in_commit) {
900 trans->transaction->use_count++; 998 cur_trans->use_count++;
901 mutex_unlock(&root->fs_info->trans_mutex); 999 mutex_unlock(&root->fs_info->trans_mutex);
902 btrfs_end_transaction(trans, root); 1000 btrfs_end_transaction(trans, root);
903 1001
@@ -920,7 +1018,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
920 1018
921 trans->transaction->in_commit = 1; 1019 trans->transaction->in_commit = 1;
922 trans->transaction->blocked = 1; 1020 trans->transaction->blocked = 1;
923 cur_trans = trans->transaction;
924 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1021 if (cur_trans->list.prev != &root->fs_info->trans_list) {
925 prev_trans = list_entry(cur_trans->list.prev, 1022 prev_trans = list_entry(cur_trans->list.prev,
926 struct btrfs_transaction, list); 1023 struct btrfs_transaction, list);
@@ -935,6 +1032,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
935 } 1032 }
936 } 1033 }
937 1034
1035 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1036 should_grow = 1;
1037
938 do { 1038 do {
939 int snap_pending = 0; 1039 int snap_pending = 0;
940 joined = cur_trans->num_joined; 1040 joined = cur_trans->num_joined;
@@ -947,26 +1047,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
947 1047
948 if (cur_trans->num_writers > 1) 1048 if (cur_trans->num_writers > 1)
949 timeout = MAX_SCHEDULE_TIMEOUT; 1049 timeout = MAX_SCHEDULE_TIMEOUT;
950 else 1050 else if (should_grow)
951 timeout = 1; 1051 timeout = 1;
952 1052
953 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
954 1054
955 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
956 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
957 BUG_ON(ret); 1059 BUG_ON(ret);
958 } 1060 }
959 1061
960 schedule_timeout(timeout); 1062 /*
1063 * rename doesn't use btrfs_join_transaction, so, once we
1064 * set the transaction to blocked above, we aren't going
1065 * to get any new ordered operations. We can safely run
1066 * it here and know for sure that nothing new will be added
1067 * to the list
1068 */
1069 btrfs_run_ordered_operations(root, 1);
1070
1071 smp_mb();
1072 if (cur_trans->num_writers > 1 || should_grow)
1073 schedule_timeout(timeout);
961 1074
962 mutex_lock(&root->fs_info->trans_mutex); 1075 mutex_lock(&root->fs_info->trans_mutex);
963 finish_wait(&cur_trans->writer_wait, &wait); 1076 finish_wait(&cur_trans->writer_wait, &wait);
964 } while (cur_trans->num_writers > 1 || 1077 } while (cur_trans->num_writers > 1 ||
965 (cur_trans->num_joined != joined)); 1078 (should_grow && cur_trans->num_joined != joined));
966 1079
967 ret = create_pending_snapshots(trans, root->fs_info); 1080 ret = create_pending_snapshots(trans, root->fs_info);
968 BUG_ON(ret); 1081 BUG_ON(ret);
969 1082
1083 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1084 BUG_ON(ret);
1085
970 WARN_ON(cur_trans != trans->transaction); 1086 WARN_ON(cur_trans != trans->transaction);
971 1087
972 /* btrfs_commit_tree_roots is responsible for getting the 1088 /* btrfs_commit_tree_roots is responsible for getting the
@@ -1030,6 +1146,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1030 btrfs_copy_pinned(root, pinned_copy); 1146 btrfs_copy_pinned(root, pinned_copy);
1031 1147
1032 trans->transaction->blocked = 0; 1148 trans->transaction->blocked = 0;
1149
1033 wake_up(&root->fs_info->transaction_throttle); 1150 wake_up(&root->fs_info->transaction_throttle);
1034 wake_up(&root->fs_info->transaction_wait); 1151 wake_up(&root->fs_info->transaction_wait);
1035 1152
@@ -1056,6 +1173,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1056 mutex_lock(&root->fs_info->trans_mutex); 1173 mutex_lock(&root->fs_info->trans_mutex);
1057 1174
1058 cur_trans->commit_done = 1; 1175 cur_trans->commit_done = 1;
1176
1059 root->fs_info->last_trans_committed = cur_trans->transid; 1177 root->fs_info->last_trans_committed = cur_trans->transid;
1060 wake_up(&cur_trans->commit_wait); 1178 wake_up(&cur_trans->commit_wait);
1061 1179
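
Two patterns in these transaction.c changes carry most of the reasoning: __btrfs_end_transaction() now makes up to four bounded passes over the delayed-ref backlog whenever more than 64 ref heads are ready, running everything if the transaction is already closing, and wait_transaction_pre_flush() parks callers on transaction_wait until delayed_refs.flushing clears. The fragment below restates only the throttling loop in simplified form; struct txn, struct handle, run_delayed_refs() and throttle_delayed_refs() are placeholders, not the btrfs types or APIs.

/* illustrative stand-ins for the delayed-ref state on a transaction */
struct txn {
        unsigned long num_heads_ready;     /* ref heads queued and ready to run */
        int flushing;                      /* commit has started closing        */
};

struct handle {
        struct txn *txn;
        unsigned long delayed_ref_updates; /* refs this handle queued */
};

/* stub for btrfs_run_delayed_refs(); count == 0 means "run everything" here */
static int run_delayed_refs(struct txn *txn, unsigned long count)
{
        (void)txn;
        (void)count;
        return 0;
}

static void throttle_delayed_refs(struct handle *h)
{
        int pass = 0;

        while (pass < 4) {
                unsigned long cur = h->delayed_ref_updates;

                h->delayed_ref_updates = 0;
                if (!cur || h->txn->num_heads_ready <= 64)
                        break;             /* backlog is small enough, stop early */

                /* as the comment above says: full flush if the transaction is closing */
                if (h->txn->flushing)
                        cur = 0;
                run_delayed_refs(h->txn, cur);
                pass++;
        }
}

The intent, as the comments put it, is that ordinary end-transaction callers pay down the ref backlog a little at a time, while the unbounded flush is reserved for the transaction that is actually committing.
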
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
27 * total writers in this transaction; it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 20794290256b..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. With the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1 was fully removed from the FS, but fsync was never
74 * called on f1, only its parent dir. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
199 struct extent_buffer *eb, 262 struct extent_buffer *eb,
200 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
201{ 264{
202 if (wc->pin) { 265 if (wc->pin)
203 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 }
208 268
209 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
210 if (wc->write) 270 if (wc->write)
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 663
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 664 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 665 BUG_ON(ret);
666
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 668 BUG_ON(ret);
608 kfree(name); 669 kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
804 victim_name_len)) { 865 victim_name_len)) {
805 btrfs_inc_nlink(inode); 866 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 867 btrfs_release_path(root, path);
868
807 ret = btrfs_unlink_inode(trans, root, dir, 869 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 870 inode, victim_name,
809 victim_name_len); 871 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 984 key.offset--;
923 btrfs_release_path(root, path); 985 btrfs_release_path(root, path);
924 } 986 }
925 btrfs_free_path(path); 987 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 988 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 989 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 990 btrfs_update_inode(trans, root, inode);
929 } 991 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 992 BTRFS_I(inode)->index_cnt = (u64)-1;
931 993
994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
995 ret = replay_dir_deletes(trans, root, NULL, path,
996 inode->i_ino, 1);
997 BUG_ON(ret);
998 }
999 btrfs_free_path(path);
1000
932 return 0; 1001 return 0;
933} 1002}
934 1003
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1040
972 iput(inode); 1041 iput(inode);
973 1042
974 if (key.offset == 0) 1043 /*
975 break; 1044 * fixup on a directory may create new entries,
976 key.offset--; 1045 * make sure we always look for the highset possible
1046 * offset
1047 */
1048 key.offset = (u64)-1;
977 } 1049 }
978 btrfs_release_path(root, path); 1050 btrfs_release_path(root, path);
979 return 0; 1051 return 0;
@@ -1150,8 +1222,7 @@ insert:
1150 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1151 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1152 1224
1153 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1154 BUG();
1155 goto out; 1226 goto out;
1156} 1227}
1157 1228
@@ -1313,11 +1384,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1385 name_len);
1315 log_di = NULL; 1386 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1389 dir_key->objectid,
1319 name, name_len, 0); 1390 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1392 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1393 log_path,
1323 dir_key->objectid, 1394 dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1449 struct btrfs_root *root,
1379 struct btrfs_root *log, 1450 struct btrfs_root *log,
1380 struct btrfs_path *path, 1451 struct btrfs_path *path,
1381 u64 dirid) 1452 u64 dirid, int del_all)
1382{ 1453{
1383 u64 range_start; 1454 u64 range_start;
1384 u64 range_end; 1455 u64 range_end;
@@ -1408,10 +1479,14 @@ again:
1408 range_start = 0; 1479 range_start = 0;
1409 range_end = 0; 1480 range_end = 0;
1410 while (1) { 1481 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1482 if (del_all)
1412 &range_start, &range_end); 1483 range_end = (u64)-1;
1413 if (ret != 0) 1484 else {
1414 break; 1485 ret = find_dir_range(log, path, dirid, key_type,
1486 &range_start, &range_end);
1487 if (ret != 0)
1488 break;
1489 }
1415 1490
1416 dir_key.offset = range_start; 1491 dir_key.offset = range_start;
1417 while (1) { 1492 while (1) {
@@ -1437,7 +1512,8 @@ again:
1437 break; 1512 break;
1438 1513
1439 ret = check_item_in_log(trans, root, log, path, 1514 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1515 log_path, dir,
1516 &found_key);
1441 BUG_ON(ret); 1517 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1518 if (found_key.offset == (u64)-1)
1443 break; 1519 break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1591 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1592 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1593 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1594 BUG_ON(ret);
1519 } 1595 }
1520 ret = overwrite_item(wc->trans, root, path, 1596 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1609 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1610 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1611 BUG_ON(ret);
1612
1613 /* if the nlink count is zero here, the iput
1614 * will free the inode. We bump it to make
1615 * sure it doesn't get freed until the link
1616 * count fixup is done
1617 */
1618 if (inode->i_nlink == 0) {
1619 btrfs_inc_nlink(inode);
1620 btrfs_update_inode(wc->trans,
1621 root, inode);
1622 }
1536 iput(inode); 1623 iput(inode);
1537 } 1624 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1625 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1927 return ret;
1841} 1928}
1842 1929
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1930static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 struct btrfs_root *root, unsigned long transid)
1844{ 1932{
1845 DEFINE_WAIT(wait); 1933 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1934 int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1942 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1943 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1944 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1945
1946 if (root->fs_info->last_trans_log_full_commit !=
1947 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1948 atomic_read(&root->log_commit[index]))
1859 schedule(); 1949 schedule();
1950
1860 finish_wait(&root->log_commit_wait[index], &wait); 1951 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1952 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1953 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1955 return 0;
1865} 1956}
1866 1957
1867static int wait_for_writer(struct btrfs_root *root) 1958static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *root)
1868{ 1960{
1869 DEFINE_WAIT(wait); 1961 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1962 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1963 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1964 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1965 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1966 if (root->fs_info->last_trans_log_full_commit !=
1967 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1968 schedule();
1876 mutex_lock(&root->log_mutex); 1969 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1970 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1975/*
1883 * btrfs_sync_log does sends a given tree log down to the disk and 1976 * btrfs_sync_log does sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1977 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1978 * you know that any inodes previously logged are safely on disk only
1979 * if it returns 0.
1980 *
1981 * Any other return value means you need to call btrfs_commit_transaction.
1982 * Some of the edge cases for fsyncing directories that have had unlinks
1983 * or renames done in the past mean that sometimes the only safe
1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1985 * that has happened.
1886 */ 1986 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1987int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1988 struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1996 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 1997 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 1998 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 1999 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2000 mutex_unlock(&root->log_mutex);
1901 return 0; 2001 return 0;
1902 } 2002 }
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2004
1905 /* wait for previous tree log sync to complete */ 2005 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2007 wait_log_commit(trans, root, root->log_transid - 1);
1908 2008
1909 while (1) { 2009 while (1) {
1910 unsigned long batch = root->log_batch; 2010 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2011 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2012 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2013 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2014
2015 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2016 if (batch == root->log_batch)
1916 break; 2017 break;
1917 } 2018 }
1918 2019
2020 /* bail out if we need to do a full commit */
2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 ret = -EAGAIN;
2023 mutex_unlock(&root->log_mutex);
2024 goto out;
2025 }
2026
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2028 BUG_ON(ret);
1921 2029
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2059
1952 index2 = log_root_tree->log_transid % 2; 2060 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2061 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2062 wait_log_commit(trans, log_root_tree,
2063 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2065 goto out;
1957 } 2066 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2068
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2070 wait_log_commit(trans, log_root_tree,
2071 log_root_tree->log_transid - 1);
2072 }
2073
2074 wait_for_writer(trans, log_root_tree);
1962 2075
1963 wait_for_writer(log_root_tree); 2076 /*
2077 * now that we've moved on to the tree of log tree roots,
2078 * check the full commit flag again
2079 */
2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 mutex_unlock(&log_root_tree->log_mutex);
2082 ret = -EAGAIN;
2083 goto out_wake_log_root;
2084 }
1964 2085
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2106 * in and cause problems either.
1986 */ 2107 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2108 write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 ret = 0;
1988 2110
2111out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2112 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2113 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
1998 return 0; 2121 return 0;
1999} 2122}
2000 2123
2001/* * free all the extents used by the tree log. This should be called 2124/*
2125 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2126 * at commit time of the full transaction
2003 */ 2127 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2128int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2256
2133 btrfs_free_path(path); 2257 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2259 btrfs_end_log_trans(root);
2136 2260
2137 return 0; 2261 return 0;
2138} 2262}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2284 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2286 btrfs_end_log_trans(root);
2163 2287
2164 return ret; 2288 return ret;
2165} 2289}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2683 *
2560 * This handles both files and directories. 2684 * This handles both files and directories.
2561 */ 2685 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2686static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2687 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2688 int inode_only)
2565{ 2689{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2709 min_key.offset = 0;
2586 2710
2587 max_key.objectid = inode->i_ino; 2711 max_key.objectid = inode->i_ino;
2712
2713 /* today the code can only do partial logging of directories */
2714 if (!S_ISDIR(inode->i_mode))
2715 inode_only = LOG_INODE_ALL;
2716
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2718 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2719 else
2591 max_key.type = (u8)-1; 2720 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2721 max_key.offset = (u64)-1;
2593 2722
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2723 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2724
2612 /* 2725 /*
@@ -2693,7 +2806,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2807 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2808 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2810 BUG_ON(ret);
2699 } 2811 }
@@ -2702,19 +2814,69 @@ next_slot:
2702 2814
2703 btrfs_free_path(path); 2815 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2816 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2817 return 0;
2707} 2818}
2708 2819
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2820/*
2710 struct btrfs_root *root, struct inode *inode, 2821 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2822 * of the directories in it require a full commit before they can
2823 * be logged. Returns zero if nothing special needs to be done or 1 if
2824 * a full commit is required.
2825 */
2826static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 struct inode *inode,
2828 struct dentry *parent,
2829 struct super_block *sb,
2830 u64 last_committed)
2712{ 2831{
2713 int ret; 2832 int ret = 0;
2833 struct btrfs_root *root;
2714 2834
2715 start_log_trans(trans, root); 2835 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2836 * for regular files, if the inode is already on disk, we don't
2717 end_log_trans(root); 2837 * have to worry about the parents at all. This is because
2838 * we can use the last_unlink_trans field to record renames
2839 * and other fun in this file.
2840 */
2841 if (S_ISREG(inode->i_mode) &&
2842 BTRFS_I(inode)->generation <= last_committed &&
2843 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 goto out;
2845
2846 if (!S_ISDIR(inode->i_mode)) {
2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 goto out;
2849 inode = parent->d_inode;
2850 }
2851
2852 while (1) {
2853 BTRFS_I(inode)->logged_trans = trans->transid;
2854 smp_mb();
2855
2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 root = BTRFS_I(inode)->root;
2858
2859 /*
2860 * make sure any commits to the log are forced
2861 * to be full commits
2862 */
2863 root->fs_info->last_trans_log_full_commit =
2864 trans->transid;
2865 ret = 1;
2866 break;
2867 }
2868
2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 break;
2871
2872 if (parent == sb->s_root)
2873 break;
2874
2875 parent = parent->d_parent;
2876 inode = parent->d_inode;
2877
2878 }
2879out:
2718 return ret; 2880 return ret;
2719} 2881}
2720 2882
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2886 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2887 * the last committed transaction
2726 */ 2888 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2889int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2890 struct btrfs_root *root, struct inode *inode,
2891 struct dentry *parent, int exists_only)
2729{ 2892{
2730 int inode_only = LOG_INODE_ALL; 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2894 struct super_block *sb;
2732 int ret; 2895 int ret = 0;
2896 u64 last_committed = root->fs_info->last_trans_committed;
2897
2898 sb = inode->i_sb;
2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2905 if (root->fs_info->last_trans_log_full_commit >
2906 root->fs_info->last_trans_committed) {
2907 ret = 1;
2908 goto end_no_trans;
2909 }
2910
2911 ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 sb, last_committed);
2913 if (ret)
2914 goto end_no_trans;
2733 2915
2734 start_log_trans(trans, root); 2916 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2917
2742 dentry = dentry->d_parent; 2918 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2919 BUG_ON(ret);
2920
2921 /*
2922 * for regular files, if the inode is already on disk, we don't
2923 * have to worry about the parents at all. This is because
2924 * we can use the last_unlink_trans field to record renames
2925 * and other fun in this file.
2926 */
2927 if (S_ISREG(inode->i_mode) &&
2928 BTRFS_I(inode)->generation <= last_committed &&
2929 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 goto no_parent;
2931
2932 inode_only = LOG_INODE_EXISTS;
2933 while (1) {
2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2935 break;
2745 2936
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2937 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2938 if (BTRFS_I(inode)->generation >
2939 root->fs_info->last_trans_committed) {
2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 BUG_ON(ret);
2942 }
2943 if (parent == sb->s_root)
2748 break; 2944 break;
2945
2946 parent = parent->d_parent;
2749 } 2947 }
2750 end_log_trans(root); 2948no_parent:
2751 return 0; 2949 ret = 0;
2950 btrfs_end_log_trans(root);
2951end_no_trans:
2952 return ret;
2752} 2953}
2753 2954
2754/* 2955/*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2961int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2962 struct btrfs_root *root, struct dentry *dentry)
2762{ 2963{
2763 u64 gen; 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2965 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2966}
2770 2967
2771/* 2968/*
@@ -2832,7 +3029,9 @@ again:
2832 BUG_ON(!wc.replay_dest); 3029 BUG_ON(!wc.replay_dest);
2833 3030
2834 wc.replay_dest->log_root = log; 3031 wc.replay_dest->log_root = log;
3032 mutex_lock(&fs_info->trans_mutex);
2835 btrfs_record_root_in_trans(wc.replay_dest); 3033 btrfs_record_root_in_trans(wc.replay_dest);
3034 mutex_unlock(&fs_info->trans_mutex);
2836 ret = walk_log_tree(trans, log, &wc); 3035 ret = walk_log_tree(trans, log, &wc);
2837 BUG_ON(ret); 3036 BUG_ON(ret);
2838 3037
@@ -2882,3 +3081,94 @@ again:
2882 kfree(log_root_tree); 3081 kfree(log_root_tree);
2883 return 0; 3082 return 0;
2884} 3083}
3084
3085/*
3086 * there are some corner cases where we want to force a full
3087 * commit instead of allowing a directory to be logged.
3088 *
3089 * They revolve around files that were unlinked from the directory, and
3090 * this function updates the parent directory so that a full commit is
3091 * properly done if it is fsync'd later after the unlinks are done.
3092 */
3093void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 struct inode *dir, struct inode *inode,
3095 int for_rename)
3096{
3097 /*
3098 * when we're logging a file, if it hasn't been renamed
3099 * or unlinked, and its inode is fully committed on disk,
3100 * we don't have to worry about walking up the directory chain
3101 * to log its parents.
3102 *
3103 * So, we use the last_unlink_trans field to put this transid
3104 * into the file. When the file is logged we check it and
3105 * don't log the parents if the file is fully on disk.
3106 */
3107 if (S_ISREG(inode->i_mode))
3108 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109
3110 /*
3111 * if this directory was already logged any new
3112 * names for this file/dir will get recorded
3113 */
3114 smp_mb();
3115 if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 return;
3117
3118 /*
3119 * if the inode we're about to unlink was logged,
3120 * the log will be properly updated for any new names
3121 */
3122 if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 return;
3124
3125 /*
3126 * when renaming files across directories, if the directory
3127 * we're unlinking from gets fsync'd later on, there's
3128 * no way to find the destination directory later and fsync it
3129 * properly. So, we have to be conservative and force commits
3130 * so the new name gets discovered.
3131 */
3132 if (for_rename)
3133 goto record;
3134
3135 /* we can safely do the unlink without any special recording */
3136 return;
3137
3138record:
3139 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140}
3141
3142/*
3143 * Call this after adding a new name for a file and it will properly
3144 * update the log to reflect the new name.
3145 *
3146 * It will return zero if all goes well, and it will return 1 if a
3147 * full transaction commit is required.
3148 */
3149int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 struct inode *inode, struct inode *old_dir,
3151 struct dentry *parent)
3152{
3153 struct btrfs_root *root = BTRFS_I(inode)->root;
3154
3155 /*
3156 * this will force the logging code to walk the dentry chain
3157 * up for the file
3158 */
3159 if (S_ISREG(inode->i_mode))
3160 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161
3162 /*
3163 * if this inode hasn't been logged and the directory we're renaming it
3164 * from hasn't been logged, we don't need to log it
3165 */
3166 if (BTRFS_I(inode)->logged_trans <=
3167 root->fs_info->last_trans_committed &&
3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 root->fs_info->last_trans_committed))
3170 return 0;
3171
3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173}
3174
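A hedged sketch of the intended ordering of the two new hooks around a rename; the function and the elided middle are illustrative, only the placement of the calls and the meaning of the return value (1 = caller must arrange a full transaction commit) come from the comments above:

#include "ctree.h"
#include "transaction.h"
#include "tree-log.h"
#include "btrfs_inode.h"

/* illustrative only, not the real btrfs_rename() */
static int example_rename_hooks(struct btrfs_trans_handle *trans,
				struct inode *old_dir,
				struct dentry *old_dentry,
				struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;

	/* for_rename == 1: be conservative about the source directory */
	btrfs_record_unlink_dir(trans, old_dir, inode, 1);

	/* ... unlink the old name, link the new name ... */

	/* once the new name exists, bring the log up to date */
	return btrfs_log_new_name(trans, inode, old_dir,
				  new_dentry->d_parent);
}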
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
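To see how the new prototypes fit together, here is a hedged sketch loosely modeled on an fsync-style caller; the structure is an assumption (ordered-extent waiting, error paths, and locking are omitted) and only the return-value convention of btrfs_log_dentry_safe() is taken from the patch:

#include "ctree.h"
#include "transaction.h"
#include "tree-log.h"
#include "btrfs_inode.h"

/* illustrative only, not the real btrfs_sync_file() */
static int example_fsync(struct dentry *dentry)
{
	struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
	struct btrfs_trans_handle *trans;
	int ret;

	trans = btrfs_start_transaction(root, 1);

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret > 0) {
		/* unlink/rename history forces a full commit */
		ret = btrfs_commit_transaction(trans, root);
	} else {
		/* the tree log is enough; write it and end the transaction */
		btrfs_sync_log(trans, root);
		ret = btrfs_end_transaction(trans, root);
	}
	return ret;
}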
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14ebccae1..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
221 * This code does two great things: it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
268 * This will help anyone who is waiting on the IO; they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
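The batching test above is easy to misread inline, so here is a self-contained restatement of the same decision as a standalone helper; the helper name and its use as a separate function are hypothetical, while the io_context fields and the HZ/50 window come straight from the hunk above:

#include <linux/iocontext.h>
#include <linux/jiffies.h>

/*
 * Illustrative only: return 1 if the submitting task is still inside an
 * elevator batching window and should keep submitting rather than hand
 * the queue off, mirroring the "continue" branch in run_scheduled_bios().
 */
static int still_in_batch(struct io_context *ioc, unsigned long *last_waited)
{
	if (!ioc || ioc->nr_batch_requests <= 0)
		return 0;

	/* the batch window is taken to be roughly HZ/50, as above */
	if (!time_before(jiffies, ioc->last_waited + HZ / 50UL))
		return 0;

	/* only keep going while last_waited hasn't moved under us */
	if (*last_waited != 0 && ioc->last_waited != *last_waited)
		return 0;

	*last_waited = ioc->last_waited;
	return 1;
}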
@@ -1374,6 +1413,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1374 ret = btrfs_add_device(trans, root, device); 1413 ret = btrfs_add_device(trans, root, device);
1375 } 1414 }
1376 1415
1416 /*
1417 * we've got more storage, clear any full flags on the space
1418 * infos
1419 */
1420 btrfs_clear_space_info_full(root->fs_info);
1421
1377 unlock_chunks(root); 1422 unlock_chunks(root);
1378 btrfs_commit_transaction(trans, root); 1423 btrfs_commit_transaction(trans, root);
1379 1424
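btrfs_clear_space_info_full() itself is defined in extent-tree.c, outside this hunk; a hedged sketch of what the comment implies it does (an assumed implementation, not copied from the patch) is:

#include <linux/list.h>
#include "ctree.h"

/*
 * Sketch only: walk every cached space_info and clear its "full" flag so
 * the allocator will consider creating new chunks on the enlarged storage.
 */
static void example_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct btrfs_space_info *found;

	list_for_each_entry(found, &info->space_info, list)
		found->full = 0;
}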
@@ -1459,6 +1504,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1459 device->fs_devices->total_rw_bytes += diff; 1504 device->fs_devices->total_rw_bytes += diff;
1460 1505
1461 device->total_bytes = new_size; 1506 device->total_bytes = new_size;
1507 btrfs_clear_space_info_full(device->dev_root->fs_info);
1508
1462 return btrfs_update_device(trans, device); 1509 return btrfs_update_device(trans, device);
1463} 1510}
1464 1511
@@ -2894,10 +2941,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2894 free_extent_map(em); 2941 free_extent_map(em);
2895 } 2942 }
2896 2943
2897 map = kzalloc(sizeof(*map), GFP_NOFS);
2898 if (!map)
2899 return -ENOMEM;
2900
2901 em = alloc_extent_map(GFP_NOFS); 2944 em = alloc_extent_map(GFP_NOFS);
2902 if (!em) 2945 if (!em)
2903 return -ENOMEM; 2946 return -ENOMEM;
@@ -3106,6 +3149,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3106 if (!sb) 3149 if (!sb)
3107 return -ENOMEM; 3150 return -ENOMEM;
3108 btrfs_set_buffer_uptodate(sb); 3151 btrfs_set_buffer_uptodate(sb);
3152 btrfs_set_buffer_lockdep_class(sb, 0);
3153
3109 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3154 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3110 array_size = btrfs_super_sys_array_size(super_copy); 3155 array_size = btrfs_super_sys_array_size(super_copy);
3111 3156
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;