aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig13
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c325
-rw-r--r--fs/btrfs/ctree.h86
-rw-r--r--fs/btrfs/disk-io.c170
-rw-r--r--fs/btrfs/disk-io.h12
-rw-r--r--fs/btrfs/extent-tree.c818
-rw-r--r--fs/btrfs/extent_io.c134
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file.c29
-rw-r--r--fs/btrfs/inode-map.c1
-rw-r--r--fs/btrfs/inode.c142
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/locking.c207
-rw-r--r--fs/btrfs/locking.h8
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
-rw-r--r--fs/btrfs/super.c11
-rw-r--r--fs/btrfs/transaction.c6
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c356
-rw-r--r--fs/btrfs/volumes.c63
-rw-r--r--fs/btrfs/xattr.c48
-rw-r--r--fs/btrfs/xattr.h2
28 files changed, 1916 insertions, 618 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf999ea1b..7bb3c020e570 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
16 module will be called btrfs. 16 module will be called btrfs.
17 17
18 If unsure, say N. 18 If unsure, say N.
19
20config BTRFS_FS_POSIX_ACL
21 bool "Btrfs POSIX Access Control Lists"
22 depends on BTRFS_FS
23 select FS_POSIX_ACL
24 help
25 POSIX Access Control Lists (ACLs) support permissions for users and
26 groups beyond the owner/group/world scheme.
27
28 To learn more about Access Control Lists, visit the POSIX ACLs for
29 Linux website <http://acl.bestbits.at/>.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe0..c84ca1f5259a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
16 * Boston, MA 02111-1307, USA. 16 * Boston, MA 02111-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/kthread.h> 19#include <linux/kthread.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/spinlock.h> 21#include <linux/spinlock.h>
23# include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 24#include "async-thread.h"
25 25
26#define WORK_QUEUED_BIT 0 26#define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
143 struct btrfs_work *work; 143 struct btrfs_work *work;
144 do { 144 do {
145 spin_lock_irq(&worker->lock); 145 spin_lock_irq(&worker->lock);
146again_locked:
146 while (!list_empty(&worker->pending)) { 147 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next; 148 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list); 149 work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
165 check_idle_worker(worker); 166 check_idle_worker(worker);
166 167
167 } 168 }
168 worker->working = 0;
169 if (freezing(current)) { 169 if (freezing(current)) {
170 worker->working = 0;
171 spin_unlock_irq(&worker->lock);
170 refrigerator(); 172 refrigerator();
171 } else { 173 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock); 174 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop()) 175 if (!kthread_should_stop()) {
176 cpu_relax();
177 /*
178 * we've dropped the lock, did someone else
179 * jump in?
180 */
181 smp_mb();
182 if (!list_empty(&worker->pending))
183 continue;
184
185 /*
186 * this short schedule allows more work to
187 * come in without the queue functions
188 * needing to go through wake_up_process()
189 *
190 * worker->working is still 1, so nobody
191 * is going to try and wake us up
192 */
193 schedule_timeout(1);
194 smp_mb();
195 if (!list_empty(&worker->pending))
196 continue;
197
198 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending))
202 goto again_locked;
203
204 /*
205 * this makes sure we get a wakeup when someone
206 * adds something new to the queue
207 */
208 worker->working = 0;
209 spin_unlock_irq(&worker->lock);
210
175 schedule(); 211 schedule();
212 }
176 __set_current_state(TASK_RUNNING); 213 __set_current_state(TASK_RUNNING);
177 } 214 }
178 } while (!kthread_should_stop()); 215 } while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
350{ 387{
351 struct btrfs_worker_thread *worker = work->worker; 388 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags; 389 unsigned long flags;
390 int wake = 0;
353 391
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 392 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out; 393 goto out;
356 394
357 spin_lock_irqsave(&worker->lock, flags); 395 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending); 396 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending);
360 398
361 /* by definition we're busy, take ourselves off the idle 399 /* by definition we're busy, take ourselves off the idle
362 * list 400 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
368 &worker->workers->worker_list); 406 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags); 407 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 } 408 }
409 if (!worker->working) {
410 wake = 1;
411 worker->working = 1;
412 }
371 413
372 spin_unlock_irqrestore(&worker->lock, flags); 414 spin_unlock_irqrestore(&worker->lock, flags);
373 415 if (wake)
416 wake_up_process(worker->task);
374out: 417out:
418
375 return 0; 419 return 0;
376} 420}
377 421
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
398 } 442 }
399 443
400 spin_lock_irqsave(&worker->lock, flags); 444 spin_lock_irqsave(&worker->lock, flags);
445
446 list_add_tail(&work->list, &worker->pending);
401 atomic_inc(&worker->num_pending); 447 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker); 448 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404 449
405 /* 450 /*
406 * avoid calling into wake_up_process if this thread has already 451 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75ac..72677ce2b74f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info;
71
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big 72 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this. 73 * enough field for this.
71 */ 74 */
@@ -94,6 +97,11 @@ struct btrfs_inode {
94 */ 97 */
95 u64 delalloc_bytes; 98 u64 delalloc_bytes;
96 99
100 /* total number of bytes that may be used for this inode for
101 * delalloc
102 */
103 u64 reserved_bytes;
104
97 /* 105 /*
98 * the size of the file stored in the metadata on disk. data=ordered 106 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk 107 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d9..ab07627084f1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h> 35#include <linux/pagevec.h>
37#include "compat.h" 36#include "compat.h"
38#include "ctree.h" 37#include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c0776816..37f31b5529aa 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot); 39 struct btrfs_path *path, int level, int slot);
40 40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void) 41struct btrfs_path *btrfs_alloc_path(void)
47{ 42{
48 struct btrfs_path *path; 43 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); 44 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) { 45 if (path)
51 btrfs_init_path(path);
52 path->reada = 1; 46 path->reada = 1;
53 }
54 return path; 47 return path;
55} 48}
56 49
50/*
51 * set all locked nodes in the path to blocking locks. This should
52 * be done before scheduling
53 */
54noinline void btrfs_set_path_blocking(struct btrfs_path *p)
55{
56 int i;
57 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
58 if (p->nodes[i] && p->locks[i])
59 btrfs_set_lock_blocking(p->nodes[i]);
60 }
61}
62
63/*
64 * reset all the locked nodes in the patch to spinning locks.
65 *
66 * held is used to keep lockdep happy, when lockdep is enabled
67 * we set held to a blocking lock before we go around and
68 * retake all the spinlocks in the path. You can safely use NULL
69 * for held
70 */
71noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72 struct extent_buffer *held)
73{
74 int i;
75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77 /* lockdep really cares that we take all of these spinlocks
78 * in the right order. If any of the locks in the path are not
79 * currently blocking, it is going to complain. So, make really
80 * really sure by forcing the path to blocking before we clear
81 * the path blocking.
82 */
83 if (held)
84 btrfs_set_lock_blocking(held);
85 btrfs_set_path_blocking(p);
86#endif
87
88 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 if (p->nodes[i] && p->locks[i])
90 btrfs_clear_lock_blocking(p->nodes[i]);
91 }
92
93#ifdef CONFIG_DEBUG_LOCK_ALLOC
94 if (held)
95 btrfs_clear_lock_blocking(held);
96#endif
97}
98
57/* this also releases the path */ 99/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 100void btrfs_free_path(struct btrfs_path *p)
59{ 101{
@@ -235,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
235 if (*cow_ret == buf) 277 if (*cow_ret == buf)
236 unlock_orig = 1; 278 unlock_orig = 1;
237 279
238 WARN_ON(!btrfs_tree_locked(buf)); 280 btrfs_assert_tree_locked(buf);
239 281
240 if (parent) 282 if (parent)
241 parent_start = parent->start; 283 parent_start = parent->start;
@@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
261 trans->transid, level, &ins); 303 trans->transid, level, &ins);
262 BUG_ON(ret); 304 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest, 305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len); 306 buf->len, level);
265 } else { 307 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len, 308 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start, 309 parent_start,
@@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 314 if (IS_ERR(cow))
273 return PTR_ERR(cow); 315 return PTR_ERR(cow);
274 316
317 /* cow is set to blocking by btrfs_init_new_buffer */
318
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 319 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 320 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 321 btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
388 WARN_ON(1); 432 WARN_ON(1);
389 } 433 }
390 434
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid && 435 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid && 436 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf; 438 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest); 439 WARN_ON(prealloc_dest);
398 return 0; 440 return 0;
399 } 441 }
400 spin_unlock(&root->fs_info->hash_lock); 442
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 443 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
444
445 if (parent)
446 btrfs_set_lock_blocking(parent);
447 btrfs_set_lock_blocking(buf);
448
402 ret = __btrfs_cow_block(trans, root, buf, parent, 449 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0, 450 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest); 451 prealloc_dest);
@@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
504 if (parent_nritems == 1) 551 if (parent_nritems == 1)
505 return 0; 552 return 0;
506 553
554 btrfs_set_lock_blocking(parent);
555
507 for (i = start_slot; i < end_slot; i++) { 556 for (i = start_slot; i < end_slot; i++) {
508 int close = 1; 557 int close = 1;
509 558
@@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
564 search_start = last_block; 613 search_start = last_block;
565 614
566 btrfs_tree_lock(cur); 615 btrfs_tree_lock(cur);
616 btrfs_set_lock_blocking(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i, 617 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start, 618 &cur, search_start,
569 min(16 * blocksize, 619 min(16 * blocksize,
@@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 return 0; 912 return 0;
863 913
864 mid = path->nodes[level]; 914 mid = path->nodes[level];
915
865 WARN_ON(!path->locks[level]); 916 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 917 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 918
@@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
883 934
884 /* promote the child to a root */ 935 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0); 936 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child); 937 BUG_ON(!child);
938 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret); 941 BUG_ON(ret);
890 942
@@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
900 952
901 add_root_to_dirty_list(root); 953 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child); 954 btrfs_tree_unlock(child);
955
903 path->locks[level] = 0; 956 path->locks[level] = 0;
904 path->nodes[level] = NULL; 957 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid); 958 clean_tree_block(trans, root, mid);
@@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
924 left = read_node_slot(root, parent, pslot - 1); 977 left = read_node_slot(root, parent, pslot - 1);
925 if (left) { 978 if (left) {
926 btrfs_tree_lock(left); 979 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left);
927 wret = btrfs_cow_block(trans, root, left, 981 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0); 982 parent, pslot - 1, &left, 0);
929 if (wret) { 983 if (wret) {
@@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
934 right = read_node_slot(root, parent, pslot + 1); 988 right = read_node_slot(root, parent, pslot + 1);
935 if (right) { 989 if (right) {
936 btrfs_tree_lock(right); 990 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right);
937 wret = btrfs_cow_block(trans, root, right, 992 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0); 993 parent, pslot + 1, &right, 0);
939 if (wret) { 994 if (wret) {
@@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 u32 left_nr; 1164 u32 left_nr;
1110 1165
1111 btrfs_tree_lock(left); 1166 btrfs_tree_lock(left);
1167 btrfs_set_lock_blocking(left);
1168
1112 left_nr = btrfs_header_nritems(left); 1169 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1170 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1; 1171 wret = 1;
@@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1155 */ 1212 */
1156 if (right) { 1213 if (right) {
1157 u32 right_nr; 1214 u32 right_nr;
1215
1158 btrfs_tree_lock(right); 1216 btrfs_tree_lock(right);
1217 btrfs_set_lock_blocking(right);
1218
1159 right_nr = btrfs_header_nritems(right); 1219 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1220 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1; 1221 wret = 1;
@@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1210 struct btrfs_disk_key disk_key; 1270 struct btrfs_disk_key disk_key;
1211 u32 nritems; 1271 u32 nritems;
1212 u64 search; 1272 u64 search;
1213 u64 lowest_read; 1273 u64 target;
1214 u64 highest_read;
1215 u64 nread = 0; 1274 u64 nread = 0;
1216 int direction = path->reada; 1275 int direction = path->reada;
1217 struct extent_buffer *eb; 1276 struct extent_buffer *eb;
@@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1235 return; 1294 return;
1236 } 1295 }
1237 1296
1238 highest_read = search; 1297 target = search;
1239 lowest_read = search;
1240 1298
1241 nritems = btrfs_header_nritems(node); 1299 nritems = btrfs_header_nritems(node);
1242 nr = slot; 1300 nr = slot;
@@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
1256 break; 1314 break;
1257 } 1315 }
1258 search = btrfs_node_blockptr(node, nr); 1316 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) || 1317 if ((search <= target && target - search <= 65536) ||
1260 (search < lowest_read && lowest_read - search <= 16384) || 1318 (search > target && search - target <= 65536)) {
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize, 1319 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr)); 1320 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize; 1321 nread += blocksize;
1265 } 1322 }
1266 nscan++; 1323 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) 1324 if ((nread > 65536 || nscan > 32))
1268 break; 1325 break;
1326 }
1327}
1269 1328
1270 if (nread > (256 * 1024) || nscan > 128) 1329/*
1271 break; 1330 * returns -EAGAIN if it had to drop the path, or zero if everything was in
1331 * cache
1332 */
1333static noinline int reada_for_balance(struct btrfs_root *root,
1334 struct btrfs_path *path, int level)
1335{
1336 int slot;
1337 int nritems;
1338 struct extent_buffer *parent;
1339 struct extent_buffer *eb;
1340 u64 gen;
1341 u64 block1 = 0;
1342 u64 block2 = 0;
1343 int ret = 0;
1344 int blocksize;
1345
1346 parent = path->nodes[level - 1];
1347 if (!parent)
1348 return 0;
1272 1349
1273 if (search < lowest_read) 1350 nritems = btrfs_header_nritems(parent);
1274 lowest_read = search; 1351 slot = path->slots[level];
1275 if (search > highest_read) 1352 blocksize = btrfs_level_size(root, level);
1276 highest_read = search; 1353
1354 if (slot > 0) {
1355 block1 = btrfs_node_blockptr(parent, slot - 1);
1356 gen = btrfs_node_ptr_generation(parent, slot - 1);
1357 eb = btrfs_find_tree_block(root, block1, blocksize);
1358 if (eb && btrfs_buffer_uptodate(eb, gen))
1359 block1 = 0;
1360 free_extent_buffer(eb);
1361 }
1362 if (slot < nritems) {
1363 block2 = btrfs_node_blockptr(parent, slot + 1);
1364 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365 eb = btrfs_find_tree_block(root, block2, blocksize);
1366 if (eb && btrfs_buffer_uptodate(eb, gen))
1367 block2 = 0;
1368 free_extent_buffer(eb);
1277 } 1369 }
1370 if (block1 || block2) {
1371 ret = -EAGAIN;
1372 btrfs_release_path(root, path);
1373 if (block1)
1374 readahead_tree_block(root, block1, blocksize, 0);
1375 if (block2)
1376 readahead_tree_block(root, block2, blocksize, 0);
1377
1378 if (block1) {
1379 eb = read_tree_block(root, block1, blocksize, 0);
1380 free_extent_buffer(eb);
1381 }
1382 if (block1) {
1383 eb = read_tree_block(root, block2, blocksize, 0);
1384 free_extent_buffer(eb);
1385 }
1386 }
1387 return ret;
1278} 1388}
1279 1389
1390
1280/* 1391/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers 1392 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because 1393 * in the tree. The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1328} 1439}
1329 1440
1330/* 1441/*
1442 * This releases any locks held in the path starting at level and
1443 * going all the way up to the root.
1444 *
1445 * btrfs_search_slot will keep the lock held on higher nodes in a few
1446 * corner cases, such as COW of the block at slot zero in the node. This
1447 * ignores those rules, and it should only be called when there are no
1448 * more updates to be done higher up in the tree.
1449 */
1450noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1451{
1452 int i;
1453
1454 if (path->keep_locks || path->lowest_level)
1455 return;
1456
1457 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1458 if (!path->nodes[i])
1459 continue;
1460 if (!path->locks[i])
1461 continue;
1462 btrfs_tree_unlock(path->nodes[i]);
1463 path->locks[i] = 0;
1464 }
1465}
1466
1467/*
1331 * look for key in the tree. path is filled in with nodes along the way 1468 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf 1469 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0) 1470 * level of the path (level 0)
@@ -1387,32 +1524,30 @@ again:
1387 int wret; 1524 int wret;
1388 1525
1389 /* is a cow on this block not required */ 1526 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid && 1527 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid && 1528 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done; 1530 goto cow_done;
1396 } 1531 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398 1532
1399 /* ok, we have to cow, is our old prealloc the right 1533 /* ok, we have to cow, is our old prealloc the right
1400 * size? 1534 * size?
1401 */ 1535 */
1402 if (prealloc_block.objectid && 1536 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) { 1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1404 btrfs_free_reserved_extent(root, 1539 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid, 1540 prealloc_block.objectid,
1406 prealloc_block.offset); 1541 prealloc_block.offset);
1407 prealloc_block.objectid = 0; 1542 prealloc_block.objectid = 0;
1543 goto again;
1408 } 1544 }
1409 1545
1410 /* 1546 /*
1411 * for higher level blocks, try not to allocate blocks 1547 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held. 1548 * with the block and the parent locks held.
1413 */ 1549 */
1414 if (level > 1 && !prealloc_block.objectid && 1550 if (level > 0 && !prealloc_block.objectid) {
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len; 1551 u32 size = b->len;
1417 u64 hint = b->start; 1552 u64 hint = b->start;
1418 1553
@@ -1425,6 +1560,8 @@ again:
1425 goto again; 1560 goto again;
1426 } 1561 }
1427 1562
1563 btrfs_set_path_blocking(p);
1564
1428 wret = btrfs_cow_block(trans, root, b, 1565 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1], 1566 p->nodes[level + 1],
1430 p->slots[level + 1], 1567 p->slots[level + 1],
@@ -1446,6 +1583,22 @@ cow_done:
1446 if (!p->skip_locking) 1583 if (!p->skip_locking)
1447 p->locks[level] = 1; 1584 p->locks[level] = 1;
1448 1585
1586 btrfs_clear_path_blocking(p, NULL);
1587
1588 /*
1589 * we have a lock on b and as long as we aren't changing
1590 * the tree, there is no way for the items in b to change.
1591 * It is safe to drop the lock on our parent before we
1592 * go through the expensive btree search on b.
1593 *
1594 * If cow is true, then we might be changing slot zero,
1595 * which may require changing the parent. So, we can't
1596 * drop the lock until after we know which slot we're
1597 * operating on.
1598 */
1599 if (!cow)
1600 btrfs_unlock_up_safe(p, level + 1);
1601
1449 ret = check_block(root, p, level); 1602 ret = check_block(root, p, level);
1450 if (ret) { 1603 if (ret) {
1451 ret = -1; 1604 ret = -1;
@@ -1453,6 +1606,7 @@ cow_done:
1453 } 1606 }
1454 1607
1455 ret = bin_search(b, key, level, &slot); 1608 ret = bin_search(b, key, level, &slot);
1609
1456 if (level != 0) { 1610 if (level != 0) {
1457 if (ret && slot > 0) 1611 if (ret && slot > 0)
1458 slot -= 1; 1612 slot -= 1;
@@ -1460,7 +1614,16 @@ cow_done:
1460 if ((p->search_for_split || ins_len > 0) && 1614 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >= 1615 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level); 1617 int sret;
1618
1619 sret = reada_for_balance(root, p, level);
1620 if (sret)
1621 goto again;
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1464 BUG_ON(sret > 0); 1627 BUG_ON(sret > 0);
1465 if (sret) { 1628 if (sret) {
1466 ret = sret; 1629 ret = sret;
@@ -1468,9 +1631,19 @@ cow_done:
1468 } 1631 }
1469 b = p->nodes[level]; 1632 b = p->nodes[level];
1470 slot = p->slots[level]; 1633 slot = p->slots[level];
1471 } else if (ins_len < 0) { 1634 } else if (ins_len < 0 &&
1472 int sret = balance_level(trans, root, p, 1635 btrfs_header_nritems(b) <
1473 level); 1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646
1474 if (sret) { 1647 if (sret) {
1475 ret = sret; 1648 ret = sret;
1476 goto done; 1649 goto done;
@@ -1504,7 +1677,7 @@ cow_done:
1504 * of the btree by dropping locks before 1677 * of the btree by dropping locks before
1505 * we read. 1678 * we read.
1506 */ 1679 */
1507 if (level > 1) { 1680 if (level > 0) {
1508 btrfs_release_path(NULL, p); 1681 btrfs_release_path(NULL, p);
1509 if (tmp) 1682 if (tmp)
1510 free_extent_buffer(tmp); 1683 free_extent_buffer(tmp);
@@ -1519,6 +1692,7 @@ cow_done:
1519 free_extent_buffer(tmp); 1692 free_extent_buffer(tmp);
1520 goto again; 1693 goto again;
1521 } else { 1694 } else {
1695 btrfs_set_path_blocking(p);
1522 if (tmp) 1696 if (tmp)
1523 free_extent_buffer(tmp); 1697 free_extent_buffer(tmp);
1524 if (should_reada) 1698 if (should_reada)
@@ -1528,14 +1702,29 @@ cow_done:
1528 b = read_node_slot(root, b, slot); 1702 b = read_node_slot(root, b, slot);
1529 } 1703 }
1530 } 1704 }
1531 if (!p->skip_locking) 1705 if (!p->skip_locking) {
1532 btrfs_tree_lock(b); 1706 int lret;
1707
1708 btrfs_clear_path_blocking(p, NULL);
1709 lret = btrfs_try_spin_lock(b);
1710
1711 if (!lret) {
1712 btrfs_set_path_blocking(p);
1713 btrfs_tree_lock(b);
1714 btrfs_clear_path_blocking(p, b);
1715 }
1716 }
1533 } else { 1717 } else {
1534 p->slots[level] = slot; 1718 p->slots[level] = slot;
1535 if (ins_len > 0 && 1719 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) { 1720 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key, 1721 int sret;
1722
1723 btrfs_set_path_blocking(p);
1724 sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0); 1725 p, ins_len, ret == 0);
1726 btrfs_clear_path_blocking(p, NULL);
1727
1539 BUG_ON(sret > 0); 1728 BUG_ON(sret > 0);
1540 if (sret) { 1729 if (sret) {
1541 ret = sret; 1730 ret = sret;
@@ -1549,12 +1738,16 @@ cow_done:
1549 } 1738 }
1550 ret = 1; 1739 ret = 1;
1551done: 1740done:
1741 /*
1742 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking
1744 */
1745 btrfs_set_path_blocking(p);
1552 if (prealloc_block.objectid) { 1746 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root, 1747 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid, 1748 prealloc_block.objectid,
1555 prealloc_block.offset); 1749 prealloc_block.offset);
1556 } 1750 }
1557
1558 return ret; 1751 return ret;
1559} 1752}
1560 1753
@@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret); 1772 BUG_ON(ret);
1580 1773
1774 btrfs_set_lock_blocking(eb);
1775
1581 parent = eb; 1776 parent = eb;
1582 while (1) { 1777 while (1) {
1583 level = btrfs_header_level(parent); 1778 level = btrfs_header_level(parent);
@@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1602 eb = read_tree_block(root, bytenr, blocksize, 1797 eb = read_tree_block(root, bytenr, blocksize,
1603 generation); 1798 generation);
1604 btrfs_tree_lock(eb); 1799 btrfs_tree_lock(eb);
1800 btrfs_set_lock_blocking(eb);
1605 } 1801 }
1606 1802
1607 /* 1803 /*
@@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1626 eb = read_tree_block(root, bytenr, blocksize, 1822 eb = read_tree_block(root, bytenr, blocksize,
1627 generation); 1823 generation);
1628 btrfs_tree_lock(eb); 1824 btrfs_tree_lock(eb);
1825 btrfs_set_lock_blocking(eb);
1629 } 1826 }
1630 1827
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1828 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2168,10 +2365,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2168 if (slot >= btrfs_header_nritems(upper) - 1) 2365 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1; 2366 return 1;
2170 2367
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1])); 2368 btrfs_assert_tree_locked(path->nodes[1]);
2172 2369
2173 right = read_node_slot(root, upper, slot + 1); 2370 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right); 2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2175 free_space = btrfs_leaf_free_space(root, right); 2374 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size) 2375 if (free_space < data_size)
2177 goto out_unlock; 2376 goto out_unlock;
@@ -2363,10 +2562,12 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2363 if (right_nritems == 0) 2562 if (right_nritems == 0)
2364 return 1; 2563 return 1;
2365 2564
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1])); 2565 btrfs_assert_tree_locked(path->nodes[1]);
2367 2566
2368 left = read_node_slot(root, path->nodes[1], slot - 1); 2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left); 2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2370 free_space = btrfs_leaf_free_space(root, left); 2571 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) { 2572 if (free_space < data_size) {
2372 ret = 1; 2573 ret = 1;
@@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2825 path->keep_locks = 0; 3026 path->keep_locks = 0;
2826 BUG_ON(ret); 3027 BUG_ON(ret);
2827 3028
3029 /*
3030 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state
3032 */
3033 btrfs_set_path_blocking(path);
3034
2828 leaf = path->nodes[0]; 3035 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 3037
@@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3354 BUG(); 3561 BUG();
3355 } 3562 }
3356out: 3563out:
3564 btrfs_unlock_up_safe(path, 1);
3357 return ret; 3565 return ret;
3358} 3566}
3359 3567
@@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3441{ 3649{
3442 int ret; 3650 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3651 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3652 u64 parent_start = path->nodes[1]->start;
3653 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3444 3654
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3655 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret) 3656 if (ret)
3447 return ret; 3657 return ret;
3448 3658
3659 /*
3660 * btrfs_free_extent is expensive, we want to make sure we
3661 * aren't holding any locks when we call it
3662 */
3663 btrfs_unlock_up_safe(path, 0);
3664
3449 ret = btrfs_free_extent(trans, root, bytenr, 3665 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0), 3666 btrfs_level_size(root, 0),
3451 path->nodes[1]->start, 3667 parent_start, parent_owner,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1); 3668 root_gen, 0, 1);
3454 return ret; 3669 return ret;
3455} 3670}
@@ -3721,6 +3936,7 @@ find_next_key:
3721 */ 3936 */
3722 if (slot >= nritems) { 3937 if (slot >= nritems) {
3723 path->slots[level] = slot; 3938 path->slots[level] = slot;
3939 btrfs_set_path_blocking(path);
3724 sret = btrfs_find_next_key(root, path, min_key, level, 3940 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans); 3941 cache_only, min_trans);
3726 if (sret == 0) { 3942 if (sret == 0) {
@@ -3738,16 +3954,20 @@ find_next_key:
3738 unlock_up(path, level, 1); 3954 unlock_up(path, level, 1);
3739 goto out; 3955 goto out;
3740 } 3956 }
3957 btrfs_set_path_blocking(path);
3741 cur = read_node_slot(root, cur, slot); 3958 cur = read_node_slot(root, cur, slot);
3742 3959
3743 btrfs_tree_lock(cur); 3960 btrfs_tree_lock(cur);
3961
3744 path->locks[level - 1] = 1; 3962 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur; 3963 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1); 3964 unlock_up(path, level, 1);
3965 btrfs_clear_path_blocking(path, NULL);
3747 } 3966 }
3748out: 3967out:
3749 if (ret == 0) 3968 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key)); 3969 memcpy(min_key, &found_key, sizeof(found_key));
3970 btrfs_set_path_blocking(path);
3751 return ret; 3971 return ret;
3752} 3972}
3753 3973
@@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3843 if (ret < 0) 4063 if (ret < 0)
3844 return ret; 4064 return ret;
3845 4065
4066 btrfs_set_path_blocking(path);
3846 nritems = btrfs_header_nritems(path->nodes[0]); 4067 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /* 4068 /*
3848 * by releasing the path above we dropped all our locks. A balance 4069 * by releasing the path above we dropped all our locks. A balance
@@ -3873,14 +4094,16 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3873 free_extent_buffer(next); 4094 free_extent_buffer(next);
3874 } 4095 }
3875 4096
4097 /* the path was set to blocking above */
3876 if (level == 1 && (path->locks[1] || path->skip_locking) && 4098 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada) 4099 path->reada)
3878 reada_for_search(root, path, level, slot, 0); 4100 reada_for_search(root, path, level, slot, 0);
3879 4101
3880 next = read_node_slot(root, c, slot); 4102 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) { 4103 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c)); 4104 btrfs_assert_tree_locked(c);
3883 btrfs_tree_lock(next); 4105 btrfs_tree_lock(next);
4106 btrfs_set_lock_blocking(next);
3884 } 4107 }
3885 break; 4108 break;
3886 } 4109 }
@@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3897 path->locks[level] = 1; 4120 path->locks[level] = 1;
3898 if (!level) 4121 if (!level)
3899 break; 4122 break;
4123
4124 btrfs_set_path_blocking(path);
3900 if (level == 1 && path->locks[1] && path->reada) 4125 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0); 4126 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0); 4127 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) { 4128 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4129 btrfs_assert_tree_locked(path->nodes[level]);
3905 btrfs_tree_lock(next); 4130 btrfs_tree_lock(next);
4131 btrfs_set_lock_blocking(next);
3906 } 4132 }
3907 } 4133 }
3908done: 4134done:
@@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3927 4153
3928 while (1) { 4154 while (1) {
3929 if (path->slots[0] == 0) { 4155 if (path->slots[0] == 0) {
4156 btrfs_set_path_blocking(path);
3930 ret = btrfs_prev_leaf(root, path); 4157 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0) 4158 if (ret != 0)
3932 return ret; 4159 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f88113..5e1d4e30e9d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1) 44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45 45
46#ifdef CONFIG_LOCKDEP 46#define BTRFS_MAX_LEVEL 8
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51 47
52/* holds pointers to all of the tree roots */ 48/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL 49#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -454,17 +450,11 @@ struct btrfs_timespec {
454 __le32 nsec; 450 __le32 nsec;
455} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
456 452
457typedef enum { 453enum btrfs_compression_type {
458 BTRFS_COMPRESS_NONE = 0, 454 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1, 455 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2, 456 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type; 457};
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468 458
469struct btrfs_inode_item { 459struct btrfs_inode_item {
470 /* nfs style generation number */ 460 /* nfs style generation number */
@@ -606,13 +596,27 @@ struct btrfs_block_group_item {
606 596
607struct btrfs_space_info { 597struct btrfs_space_info {
608 u64 flags; 598 u64 flags;
609 u64 total_bytes; 599
610 u64 bytes_used; 600 u64 total_bytes; /* total bytes in the space */
611 u64 bytes_pinned; 601 u64 bytes_used; /* total bytes used on disk */
612 u64 bytes_reserved; 602 u64 bytes_pinned; /* total bytes pinned, will be freed when the
613 u64 bytes_readonly; 603 transaction finishes */
614 int full; 604 u64 bytes_reserved; /* total bytes the allocator has reserved for
615 int force_alloc; 605 current allocations */
606 u64 bytes_readonly; /* total bytes that are read only */
607
608 /* delalloc accounting */
609 u64 bytes_delalloc; /* number of bytes reserved for allocation,
610 this space is not necessarily reserved yet
611 by the allocator */
612 u64 bytes_may_use; /* number of bytes that may be used for
613 delalloc */
614
615 int full; /* indicates that we cannot allocate any more
616 chunks for this space */
617 int force_alloc; /* set if we need to force a chunk alloc for
618 this space */
619
616 struct list_head list; 620 struct list_head list;
617 621
618 /* for block groups in our same type */ 622 /* for block groups in our same type */
@@ -701,9 +705,7 @@ struct btrfs_fs_info {
701 struct btrfs_transaction *running_transaction; 705 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle; 706 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait; 707 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait; 708 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707 709
708 struct btrfs_super_block super_copy; 710 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit; 711 struct btrfs_super_block super_for_commit;
@@ -711,7 +713,6 @@ struct btrfs_fs_info {
711 struct super_block *sb; 713 struct super_block *sb;
712 struct inode *btree_inode; 714 struct inode *btree_inode;
713 struct backing_dev_info bdi; 715 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex; 716 struct mutex trans_mutex;
716 struct mutex tree_log_mutex; 717 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex; 718 struct mutex transaction_kthread_mutex;
@@ -730,10 +731,6 @@ struct btrfs_fs_info {
730 atomic_t async_submit_draining; 731 atomic_t async_submit_draining;
731 atomic_t nr_async_bios; 732 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages; 733 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737 734
738 /* 735 /*
739 * this is used by the balancing code to wait for all the pending 736 * this is used by the balancing code to wait for all the pending
@@ -787,7 +784,14 @@ struct btrfs_fs_info {
787 struct list_head dirty_cowonly_roots; 784 struct list_head dirty_cowonly_roots;
788 785
789 struct btrfs_fs_devices *fs_devices; 786 struct btrfs_fs_devices *fs_devices;
787
788 /*
789 * the space_info list is almost entirely read only. It only changes
790 * when we add a new raid type to the FS, and that happens
791 * very rarely. RCU is used to protect it.
792 */
790 struct list_head space_info; 793 struct list_head space_info;
794
791 spinlock_t delalloc_lock; 795 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock; 796 spinlock_t new_trans_lock;
793 u64 delalloc_bytes; 797 u64 delalloc_bytes;
@@ -833,7 +837,14 @@ struct btrfs_root {
833 struct kobject root_kobj; 837 struct kobject root_kobj;
834 struct completion kobj_unregister; 838 struct completion kobj_unregister;
835 struct mutex objectid_mutex; 839 struct mutex objectid_mutex;
840
836 struct mutex log_mutex; 841 struct mutex log_mutex;
842 wait_queue_head_t log_writer_wait;
843 wait_queue_head_t log_commit_wait[2];
844 atomic_t log_writers;
845 atomic_t log_commit[2];
846 unsigned long log_transid;
847 unsigned long log_batch;
837 848
838 u64 objectid; 849 u64 objectid;
839 u64 last_trans; 850 u64 last_trans;
@@ -1721,7 +1732,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1721 u64 empty_size); 1732 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1733struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root, 1734 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize); 1735 u64 bytenr, u32 blocksize,
1736 int level);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1737int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root, 1738 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes, 1739 u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1791,6 +1803,18 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root); 1803int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 1804int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1805u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1806void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1807void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
1808
1809int btrfs_check_metadata_free_space(struct btrfs_root *root);
1810int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1811 u64 bytes);
1812void btrfs_free_reserved_data_space(struct btrfs_root *root,
1813 struct inode *inode, u64 bytes);
1814void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1815 u64 bytes);
1816void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1817 u64 bytes);
1794/* ctree.c */ 1818/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root, 1819int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid, 1820 struct btrfs_path *path, u64 min_objectid,
@@ -1840,7 +1864,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 1864void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void); 1865struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p); 1866void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p); 1867void btrfs_set_path_blocking(struct btrfs_path *p);
1868void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1869
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1870int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr); 1871 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1872int btrfs_del_leaf(struct btrfs_trans_handle *trans,
@@ -2034,8 +2060,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2034unsigned long btrfs_force_ra(struct address_space *mapping, 2060unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file, 2061 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index); 2062 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2063int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page); 2064int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode); 2065void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae5..3e18175248e0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/scatterlist.h> 21#include <linux/scatterlist.h>
@@ -76,6 +75,40 @@ struct async_submit_bio {
76 struct btrfs_work work; 75 struct btrfs_work work;
77}; 76};
78 77
78/* These are used to set the lockdep class on the extent buffer locks.
79 * The class is set by the readpage_end_io_hook after the buffer has
80 * passed csum validation but before the pages are unlocked.
81 *
82 * The lockdep class is also set by btrfs_init_new_buffer on freshly
83 * allocated blocks.
84 *
85 * The class is based on the level in the tree block, which allows lockdep
86 * to know that lower nodes nest inside the locks of higher nodes.
87 *
88 * We also add a check to make sure the highest level of the tree is
89 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
90 * code needs update as well.
91 */
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93# if BTRFS_MAX_LEVEL != 8
94# error
95# endif
96static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
97static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
98 /* leaf */
99 "btrfs-extent-00",
100 "btrfs-extent-01",
101 "btrfs-extent-02",
102 "btrfs-extent-03",
103 "btrfs-extent-04",
104 "btrfs-extent-05",
105 "btrfs-extent-06",
106 "btrfs-extent-07",
107 /* highest possible level */
108 "btrfs-extent-08",
109};
110#endif
111
79/* 112/*
80 * extents on the btree inode are pretty simple, there's one extent 113 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device 114 * that covers the entire device
@@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
348 return ret; 381 return ret;
349} 382}
350 383
384#ifdef CONFIG_DEBUG_LOCK_ALLOC
385void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
386{
387 lockdep_set_class_and_name(&eb->lock,
388 &btrfs_eb_class[level],
389 btrfs_eb_name[level]);
390}
391#endif
392
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 393static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state) 394 struct extent_state *state)
353{ 395{
@@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
393 } 435 }
394 found_level = btrfs_header_level(eb); 436 found_level = btrfs_header_level(eb);
395 437
438 btrfs_set_buffer_lockdep_class(eb, found_level);
439
396 ret = csum_tree_block(root, eb, 1); 440 ret = csum_tree_block(root, eb, 1);
397 if (ret) 441 if (ret)
398 ret = -EIO; 442 ret = -EIO;
@@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 844 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801 845
802 if (ret == 0) 846 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE; 847 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
804 else 848 else
805 WARN_ON(1); 849 WARN_ON(1);
806 return buf; 850 return buf;
@@ -813,7 +857,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
813 struct inode *btree_inode = root->fs_info->btree_inode; 857 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) == 858 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) { 859 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf)); 860 btrfs_assert_tree_locked(buf);
861
862 /* ugh, clear_extent_buffer_dirty can be expensive */
863 btrfs_set_lock_blocking(buf);
864
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf); 866 buf);
819 } 867 }
@@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
850 spin_lock_init(&root->list_lock); 898 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex); 899 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex); 900 mutex_init(&root->log_mutex);
901 init_waitqueue_head(&root->log_writer_wait);
902 init_waitqueue_head(&root->log_commit_wait[0]);
903 init_waitqueue_head(&root->log_commit_wait[1]);
904 atomic_set(&root->log_commit[0], 0);
905 atomic_set(&root->log_commit[1], 0);
906 atomic_set(&root->log_writers, 0);
907 root->log_batch = 0;
908 root->log_transid = 0;
853 extent_io_tree_init(&root->dirty_log_pages, 909 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS); 910 fs_info->btree_inode->i_mapping, GFP_NOFS);
855 911
@@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
934 return 0; 990 return 0;
935} 991}
936 992
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 993static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info) 994 struct btrfs_fs_info *fs_info)
939{ 995{
940 struct btrfs_root *root; 996 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root; 997 struct btrfs_root *tree_root = fs_info->tree_root;
998 struct extent_buffer *leaf;
942 999
943 root = kzalloc(sizeof(*root), GFP_NOFS); 1000 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root) 1001 if (!root)
945 return -ENOMEM; 1002 return ERR_PTR(-ENOMEM);
946 1003
947 __setup_root(tree_root->nodesize, tree_root->leafsize, 1004 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize, 1005 tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1008 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1009 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1010 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1011 /*
1012 * log trees do not get reference counted because they go away
1013 * before a real commit is actually done. They do store pointers
1014 * to file data extents, and those reference counts still get
1015 * updated (along with back refs to the log tree).
1016 */
954 root->ref_cows = 0; 1017 root->ref_cows = 0;
955 1018
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize, 1019 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID, 1020 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0); 1021 trans->transid, 0, 0, 0);
1022 if (IS_ERR(leaf)) {
1023 kfree(root);
1024 return ERR_CAST(leaf);
1025 }
959 1026
1027 root->node = leaf;
960 btrfs_set_header_nritems(root->node, 0); 1028 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0); 1029 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start); 1030 btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
968 BTRFS_FSID_SIZE); 1036 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node); 1037 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node); 1038 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root; 1039 return root;
1040}
1041
1042int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1043 struct btrfs_fs_info *fs_info)
1044{
1045 struct btrfs_root *log_root;
1046
1047 log_root = alloc_log_tree(trans, fs_info);
1048 if (IS_ERR(log_root))
1049 return PTR_ERR(log_root);
1050 WARN_ON(fs_info->log_root_tree);
1051 fs_info->log_root_tree = log_root;
1052 return 0;
1053}
1054
1055int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1056 struct btrfs_root *root)
1057{
1058 struct btrfs_root *log_root;
1059 struct btrfs_inode_item *inode_item;
1060
1061 log_root = alloc_log_tree(trans, root->fs_info);
1062 if (IS_ERR(log_root))
1063 return PTR_ERR(log_root);
1064
1065 log_root->last_trans = trans->transid;
1066 log_root->root_key.offset = root->root_key.objectid;
1067
1068 inode_item = &log_root->root_item.inode;
1069 inode_item->generation = cpu_to_le64(1);
1070 inode_item->size = cpu_to_le64(3);
1071 inode_item->nlink = cpu_to_le32(1);
1072 inode_item->nbytes = cpu_to_le64(root->leafsize);
1073 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1074
1075 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
1076 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1077
1078 WARN_ON(root->log_root);
1079 root->log_root = log_root;
1080 root->log_transid = 0;
972 return 0; 1081 return 0;
973} 1082}
974 1083
@@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{ 1245{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1246 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0; 1247 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device; 1248 struct btrfs_device *device;
1141 struct backing_dev_info *bdi; 1249 struct backing_dev_info *bdi;
1142#if 0 1250#if 0
@@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1144 btrfs_congested_async(info, 0)) 1252 btrfs_congested_async(info, 0))
1145 return 1; 1253 return 1;
1146#endif 1254#endif
1147 list_for_each(cur, &info->fs_devices->devices) { 1255 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev) 1256 if (!device->bdev)
1150 continue; 1257 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev); 1258 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1163 */ 1270 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1271static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{ 1272{
1166 struct list_head *cur;
1167 struct btrfs_device *device; 1273 struct btrfs_device *device;
1168 struct btrfs_fs_info *info; 1274 struct btrfs_fs_info *info;
1169 1275
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1276 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) { 1277 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev) 1278 if (!device->bdev)
1174 continue; 1279 continue;
1175 1280
@@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1447 INIT_LIST_HEAD(&fs_info->dead_roots); 1552 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers); 1553 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock); 1555 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock); 1556 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock); 1557 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1535 init_waitqueue_head(&fs_info->transaction_throttle); 1639 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait); 1640 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait); 1641 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542 1642
1543 __setup_root(4096, 4096, 4096, 4096, tree_root, 1643 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1644 fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 * low idle thresh 1727 * low idle thresh
1628 */ 1728 */
1629 fs_info->endio_workers.idle_thresh = 4; 1729 fs_info->endio_workers.idle_thresh = 4;
1730 fs_info->endio_meta_workers.idle_thresh = 4;
1731
1630 fs_info->endio_write_workers.idle_thresh = 64; 1732 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64; 1733 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632 1734
@@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1720 ret = find_and_setup_root(tree_root, fs_info, 1822 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root); 1823 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1; 1824 dev_root->track_dirty = 1;
1723
1724 if (ret) 1825 if (ret)
1725 goto fail_extent_root; 1826 goto fail_extent_root;
1726 1827
@@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1841 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1842 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner"); 1843 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread) 1844 if (IS_ERR(fs_info->cleaner_kthread))
1744 goto fail_csum_root; 1845 goto fail_csum_root;
1745 1846
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1847 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root, 1848 tree_root,
1748 "btrfs-transaction"); 1849 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread) 1850 if (IS_ERR(fs_info->transaction_kthread))
1750 goto fail_cleaner; 1851 goto fail_cleaner;
1751 1852
1752 if (btrfs_super_log_root(disk_super) != 0) { 1853 if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1929,14 @@ fail_sb_buffer:
1828fail_iput: 1929fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1930 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode); 1931 iput(fs_info->btree_inode);
1831fail: 1932
1832 btrfs_close_devices(fs_info->fs_devices); 1933 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1934 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1935 bdi_destroy(&fs_info->bdi);
1834 1936
1937fail:
1835 kfree(extent_root); 1938 kfree(extent_root);
1836 kfree(tree_root); 1939 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info); 1940 kfree(fs_info);
1839 kfree(chunk_root); 1941 kfree(chunk_root);
1840 kfree(dev_root); 1942 kfree(dev_root);
@@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device,
1995 2097
1996int write_all_supers(struct btrfs_root *root, int max_mirrors) 2098int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{ 2099{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices; 2100 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev; 2101 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb; 2102 struct btrfs_super_block *sb;
@@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2011 2112
2012 sb = &root->fs_info->super_for_commit; 2113 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item; 2114 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) { 2115 list_for_each_entry(dev, head, dev_list) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) { 2116 if (!dev->bdev) {
2017 total_errors++; 2117 total_errors++;
2018 continue; 2118 continue;
@@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2045 } 2145 }
2046 2146
2047 total_errors = 0; 2147 total_errors = 0;
2048 list_for_each(cur, head) { 2148 list_for_each_entry(dev, head, dev_list) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev) 2149 if (!dev->bdev)
2051 continue; 2150 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable) 2151 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,7 +2359,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2260 u64 transid = btrfs_header_generation(buf); 2359 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode; 2360 struct inode *btree_inode = root->fs_info->btree_inode;
2262 2361
2263 WARN_ON(!btrfs_tree_locked(buf)); 2362 btrfs_set_lock_blocking(buf);
2363
2364 btrfs_assert_tree_locked(buf);
2264 if (transid != root->fs_info->generation) { 2365 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2366 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n", 2367 "found %llu running %llu\n",
@@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2302 int ret; 2403 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2404 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0) 2405 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE; 2406 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2306 return ret; 2407 return ret;
2307} 2408}
2308 2409
2309int btree_lock_page_hook(struct page *page) 2410int btree_lock_page_hook(struct page *page)
2310{ 2411{
2311 struct inode *inode = page->mapping->host; 2412 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb; 2414 struct extent_buffer *eb;
2315 unsigned long len; 2415 unsigned long len;
@@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page)
2324 goto out; 2424 goto out;
2325 2425
2326 btrfs_tree_lock(eb); 2426 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb); 2428 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb); 2429 free_extent_buffer(eb);
2332out: 2430out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b7..95029db227be 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info); 98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root);
101int btree_lock_page_hook(struct page *page); 103int btree_lock_page_hook(struct page *page);
104
105
106#ifdef CONFIG_DEBUG_LOCK_ALLOC
107void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
108#else
109static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
110 int level)
111{
112}
113#endif
102#endif 114#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..fefe83ad2059 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,8 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h>
23#include "compat.h" 24#include "compat.h"
24#include "hash.h" 25#include "hash.h"
25#include "crc32c.h" 26#include "crc32c.h"
@@ -30,7 +31,6 @@
30#include "volumes.h" 31#include "volumes.h"
31#include "locking.h" 32#include "locking.h"
32#include "ref-cache.h" 33#include "ref-cache.h"
33#include "compat.h"
34 34
35#define PENDING_EXTENT_INSERT 0 35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 36#define PENDING_EXTENT_DELETE 1
@@ -61,6 +61,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
61 u64 bytenr, u64 num_bytes, int alloc, 61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 62 int mark_free);
63 63
64static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes,
66 u64 flags, int force);
67
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 68static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 69{
66 return (cache->flags & bits) == bits; 70 return (cache->flags & bits) == bits;
@@ -326,16 +330,34 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 330 u64 flags)
327{ 331{
328 struct list_head *head = &info->space_info; 332 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 333 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 334
332 found = list_entry(cur, struct btrfs_space_info, list); 335 rcu_read_lock();
333 if (found->flags == flags) 336 list_for_each_entry_rcu(found, head, list) {
337 if (found->flags == flags) {
338 rcu_read_unlock();
334 return found; 339 return found;
340 }
335 } 341 }
342 rcu_read_unlock();
336 return NULL; 343 return NULL;
337} 344}
338 345
346/*
347 * after adding space to the filesystem, we need to clear the full flags
348 * on all the space infos.
349 */
350void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
351{
352 struct list_head *head = &info->space_info;
353 struct btrfs_space_info *found;
354
355 rcu_read_lock();
356 list_for_each_entry_rcu(found, head, list)
357 found->full = 0;
358 rcu_read_unlock();
359}
360
339static u64 div_factor(u64 num, int factor) 361static u64 div_factor(u64 num, int factor)
340{ 362{
341 if (factor == 10) 363 if (factor == 10)
@@ -1326,8 +1348,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root) 1349 struct btrfs_root *root)
1328{ 1350{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1); 1351 u64 start;
1330 del_pending_extents(trans, root->fs_info->extent_root, 1); 1352 u64 end;
1353 int ret;
1354
1355 while(1) {
1356 finish_current_insert(trans, root->fs_info->extent_root, 1);
1357 del_pending_extents(trans, root->fs_info->extent_root, 1);
1358
1359 /* is there more work to do? */
1360 ret = find_first_extent_bit(&root->fs_info->pending_del,
1361 0, &start, &end, EXTENT_WRITEBACK);
1362 if (!ret)
1363 continue;
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins,
1365 0, &start, &end, EXTENT_WRITEBACK);
1366 if (!ret)
1367 continue;
1368 break;
1369 }
1331 return 0; 1370 return 0;
1332} 1371}
1333 1372
@@ -1525,15 +1564,55 @@ out:
1525 return ret; 1564 return ret;
1526} 1565}
1527 1566
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1567/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1568 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1569 * can be in just about any order, and it is likely to have clusters of
1570 * things that are close together and clusters of things that are not.
1571 *
1572 * To help reduce the seeks that come with updating all of these reference
1573 * counts, sort them by byte number before actual updates are done.
1574 *
1575 * struct refsort is used to match byte number to slot in the btree block.
1576 * we sort based on the byte number and then use the slot to actually
1577 * find the item.
1578 *
1579 * struct refsort is smaller than struct btrfs_item and smaller than
1580 * struct btrfs_key_ptr. Since we're currently limited to the page size
1581 * for a btree block, there's no way for a kmalloc of refsorts for a
1582 * single node to be bigger than a page.
1583 */
1584struct refsort {
1585 u64 bytenr;
1586 u32 slot;
1587};
1588
1589/*
1590 * for passing into sort()
1591 */
1592static int refsort_cmp(const void *a_void, const void *b_void)
1593{
1594 const struct refsort *a = a_void;
1595 const struct refsort *b = b_void;
1596
1597 if (a->bytenr < b->bytenr)
1598 return -1;
1599 if (a->bytenr > b->bytenr)
1600 return 1;
1601 return 0;
1602}
1603
1604
1605noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1606 struct btrfs_root *root,
1607 struct extent_buffer *orig_buf,
1608 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1609{
1532 u64 bytenr; 1610 u64 bytenr;
1533 u64 ref_root; 1611 u64 ref_root;
1534 u64 orig_root; 1612 u64 orig_root;
1535 u64 ref_generation; 1613 u64 ref_generation;
1536 u64 orig_generation; 1614 u64 orig_generation;
1615 struct refsort *sorted;
1537 u32 nritems; 1616 u32 nritems;
1538 u32 nr_file_extents = 0; 1617 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1618 struct btrfs_key key;
@@ -1542,6 +1621,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1621 int level;
1543 int ret = 0; 1622 int ret = 0;
1544 int faili = 0; 1623 int faili = 0;
1624 int refi = 0;
1625 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1627 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1628
@@ -1553,6 +1634,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1634 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1635 level = btrfs_header_level(buf);
1555 1636
1637 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1638 BUG_ON(!sorted);
1639
1556 if (root->ref_cows) { 1640 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1641 process_func = __btrfs_inc_extent_ref;
1558 } else { 1642 } else {
@@ -1565,6 +1649,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1649 process_func = __btrfs_update_extent_ref;
1566 } 1650 }
1567 1651
1652 /*
1653 * we make two passes through the items. In the first pass we
1654 * only record the byte number and slot. Then we sort based on
1655 * byte number and do the actual work based on the sorted results
1656 */
1568 for (i = 0; i < nritems; i++) { 1657 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1658 cond_resched();
1570 if (level == 0) { 1659 if (level == 0) {
@@ -1581,6 +1670,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1670 continue;
1582 1671
1583 nr_file_extents++; 1672 nr_file_extents++;
1673 sorted[refi].bytenr = bytenr;
1674 sorted[refi].slot = i;
1675 refi++;
1676 } else {
1677 bytenr = btrfs_node_blockptr(buf, i);
1678 sorted[refi].bytenr = bytenr;
1679 sorted[refi].slot = i;
1680 refi++;
1681 }
1682 }
1683 /*
1684 * if refi == 0, we didn't actually put anything into the sorted
1685 * array and we're done
1686 */
1687 if (refi == 0)
1688 goto out;
1689
1690 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1691
1692 for (i = 0; i < refi; i++) {
1693 cond_resched();
1694 slot = sorted[i].slot;
1695 bytenr = sorted[i].bytenr;
1696
1697 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1699
1585 ret = process_func(trans, root, bytenr, 1700 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1701 orig_buf->start, buf->start,
@@ -1589,25 +1704,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1704 key.objectid);
1590 1705
1591 if (ret) { 1706 if (ret) {
1592 faili = i; 1707 faili = slot;
1593 WARN_ON(1); 1708 WARN_ON(1);
1594 goto fail; 1709 goto fail;
1595 } 1710 }
1596 } else { 1711 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1712 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1713 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1714 orig_root, ref_root,
1601 orig_generation, ref_generation, 1715 orig_generation, ref_generation,
1602 level - 1); 1716 level - 1);
1603 if (ret) { 1717 if (ret) {
1604 faili = i; 1718 faili = slot;
1605 WARN_ON(1); 1719 WARN_ON(1);
1606 goto fail; 1720 goto fail;
1607 } 1721 }
1608 } 1722 }
1609 } 1723 }
1610out: 1724out:
1725 kfree(sorted);
1611 if (nr_extents) { 1726 if (nr_extents) {
1612 if (level == 0) 1727 if (level == 0)
1613 *nr_extents = nr_file_extents; 1728 *nr_extents = nr_file_extents;
@@ -1616,6 +1731,7 @@ out:
1616 } 1731 }
1617 return 0; 1732 return 0;
1618fail: 1733fail:
1734 kfree(sorted);
1619 WARN_ON(1); 1735 WARN_ON(1);
1620 return ret; 1736 return ret;
1621} 1737}
@@ -1808,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1808 if (!found) 1924 if (!found)
1809 return -ENOMEM; 1925 return -ENOMEM;
1810 1926
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups); 1927 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem); 1928 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock); 1929 spin_lock_init(&found->lock);
@@ -1818,9 +1933,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1818 found->bytes_pinned = 0; 1933 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0; 1934 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0; 1935 found->bytes_readonly = 0;
1936 found->bytes_delalloc = 0;
1821 found->full = 0; 1937 found->full = 0;
1822 found->force_alloc = 0; 1938 found->force_alloc = 0;
1823 *space_info = found; 1939 *space_info = found;
1940 list_add_rcu(&found->list, &info->space_info);
1824 return 0; 1941 return 0;
1825} 1942}
1826 1943
@@ -1881,6 +1998,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1881 return flags; 1998 return flags;
1882} 1999}
1883 2000
2001static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2002{
2003 struct btrfs_fs_info *info = root->fs_info;
2004 u64 alloc_profile;
2005
2006 if (data) {
2007 alloc_profile = info->avail_data_alloc_bits &
2008 info->data_alloc_profile;
2009 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2010 } else if (root == root->fs_info->chunk_root) {
2011 alloc_profile = info->avail_system_alloc_bits &
2012 info->system_alloc_profile;
2013 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2014 } else {
2015 alloc_profile = info->avail_metadata_alloc_bits &
2016 info->metadata_alloc_profile;
2017 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2018 }
2019
2020 return btrfs_reduce_alloc_profile(root, data);
2021}
2022
2023void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2024{
2025 u64 alloc_target;
2026
2027 alloc_target = btrfs_get_alloc_profile(root, 1);
2028 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2029 alloc_target);
2030}
2031
2032/*
2033 * for now this just makes sure we have at least 5% of our metadata space free
2034 * for use.
2035 */
2036int btrfs_check_metadata_free_space(struct btrfs_root *root)
2037{
2038 struct btrfs_fs_info *info = root->fs_info;
2039 struct btrfs_space_info *meta_sinfo;
2040 u64 alloc_target, thresh;
2041 int committed = 0, ret;
2042
2043 /* get the space info for where the metadata will live */
2044 alloc_target = btrfs_get_alloc_profile(root, 0);
2045 meta_sinfo = __find_space_info(info, alloc_target);
2046
2047again:
2048 spin_lock(&meta_sinfo->lock);
2049 if (!meta_sinfo->full)
2050 thresh = meta_sinfo->total_bytes * 80;
2051 else
2052 thresh = meta_sinfo->total_bytes * 95;
2053
2054 do_div(thresh, 100);
2055
2056 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2057 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2058 struct btrfs_trans_handle *trans;
2059 if (!meta_sinfo->full) {
2060 meta_sinfo->force_alloc = 1;
2061 spin_unlock(&meta_sinfo->lock);
2062
2063 trans = btrfs_start_transaction(root, 1);
2064 if (!trans)
2065 return -ENOMEM;
2066
2067 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2068 2 * 1024 * 1024, alloc_target, 0);
2069 btrfs_end_transaction(trans, root);
2070 goto again;
2071 }
2072 spin_unlock(&meta_sinfo->lock);
2073
2074 if (!committed) {
2075 committed = 1;
2076 trans = btrfs_join_transaction(root, 1);
2077 if (!trans)
2078 return -ENOMEM;
2079 ret = btrfs_commit_transaction(trans, root);
2080 if (ret)
2081 return ret;
2082 goto again;
2083 }
2084 return -ENOSPC;
2085 }
2086 spin_unlock(&meta_sinfo->lock);
2087
2088 return 0;
2089}
2090
2091/*
2092 * This will check the space that the inode allocates from to make sure we have
2093 * enough space for bytes.
2094 */
2095int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2096 u64 bytes)
2097{
2098 struct btrfs_space_info *data_sinfo;
2099 int ret = 0, committed = 0;
2100
2101 /* make sure bytes are sectorsize aligned */
2102 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2103
2104 data_sinfo = BTRFS_I(inode)->space_info;
2105again:
2106 /* make sure we have enough space to handle the data first */
2107 spin_lock(&data_sinfo->lock);
2108 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2109 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2110 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2111 data_sinfo->bytes_may_use < bytes) {
2112 struct btrfs_trans_handle *trans;
2113
2114 /*
2115 * if we don't have enough free bytes in this space then we need
2116 * to alloc a new chunk.
2117 */
2118 if (!data_sinfo->full) {
2119 u64 alloc_target;
2120
2121 data_sinfo->force_alloc = 1;
2122 spin_unlock(&data_sinfo->lock);
2123
2124 alloc_target = btrfs_get_alloc_profile(root, 1);
2125 trans = btrfs_start_transaction(root, 1);
2126 if (!trans)
2127 return -ENOMEM;
2128
2129 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2130 bytes + 2 * 1024 * 1024,
2131 alloc_target, 0);
2132 btrfs_end_transaction(trans, root);
2133 if (ret)
2134 return ret;
2135 goto again;
2136 }
2137 spin_unlock(&data_sinfo->lock);
2138
2139 /* commit the current transaction and try again */
2140 if (!committed) {
2141 committed = 1;
2142 trans = btrfs_join_transaction(root, 1);
2143 if (!trans)
2144 return -ENOMEM;
2145 ret = btrfs_commit_transaction(trans, root);
2146 if (ret)
2147 return ret;
2148 goto again;
2149 }
2150
2151 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2152 ", %llu bytes_used, %llu bytes_reserved, "
2153 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2154 "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2155 data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2156 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2157 data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2158 return -ENOSPC;
2159 }
2160 data_sinfo->bytes_may_use += bytes;
2161 BTRFS_I(inode)->reserved_bytes += bytes;
2162 spin_unlock(&data_sinfo->lock);
2163
2164 return btrfs_check_metadata_free_space(root);
2165}
2166
2167/*
2168 * if there was an error for whatever reason after calling
2169 * btrfs_check_data_free_space, call this so we can cleanup the counters.
2170 */
2171void btrfs_free_reserved_data_space(struct btrfs_root *root,
2172 struct inode *inode, u64 bytes)
2173{
2174 struct btrfs_space_info *data_sinfo;
2175
2176 /* make sure bytes are sectorsize aligned */
2177 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2178
2179 data_sinfo = BTRFS_I(inode)->space_info;
2180 spin_lock(&data_sinfo->lock);
2181 data_sinfo->bytes_may_use -= bytes;
2182 BTRFS_I(inode)->reserved_bytes -= bytes;
2183 spin_unlock(&data_sinfo->lock);
2184}
2185
2186/* called when we are adding a delalloc extent to the inode's io_tree */
2187void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2188 u64 bytes)
2189{
2190 struct btrfs_space_info *data_sinfo;
2191
2192 /* get the space info for where this inode will be storing its data */
2193 data_sinfo = BTRFS_I(inode)->space_info;
2194
2195 /* make sure we have enough space to handle the data first */
2196 spin_lock(&data_sinfo->lock);
2197 data_sinfo->bytes_delalloc += bytes;
2198
2199 /*
2200 * we are adding a delalloc extent without calling
2201 * btrfs_check_data_free_space first. This happens on a weird
2202 * writepage condition, but shouldn't hurt our accounting
2203 */
2204 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2205 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2206 BTRFS_I(inode)->reserved_bytes = 0;
2207 } else {
2208 data_sinfo->bytes_may_use -= bytes;
2209 BTRFS_I(inode)->reserved_bytes -= bytes;
2210 }
2211
2212 spin_unlock(&data_sinfo->lock);
2213}
2214
2215/* called when we are clearing a delalloc extent from the inode's io_tree */
2216void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2217 u64 bytes)
2218{
2219 struct btrfs_space_info *info;
2220
2221 info = BTRFS_I(inode)->space_info;
2222
2223 spin_lock(&info->lock);
2224 info->bytes_delalloc -= bytes;
2225 spin_unlock(&info->lock);
2226}
2227
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans, 2228static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes, 2229 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force) 2230 u64 flags, int force)
@@ -2137,13 +2481,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
2137 u64 end; 2481 u64 end;
2138 u64 priv; 2482 u64 priv;
2139 u64 search = 0; 2483 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info; 2484 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path; 2485 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp; 2486 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list; 2487 struct list_head insert_list, update_list;
2145 int ret; 2488 int ret;
2146 int num_inserts = 0, max_inserts; 2489 int num_inserts = 0, max_inserts, restart = 0;
2147 2490
2148 path = btrfs_alloc_path(); 2491 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list); 2492 INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2502,19 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2503 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2504 if (ret) {
2162 if (skipped && all && !num_inserts) { 2505 if (restart && !num_inserts &&
2163 skipped = 0; 2506 list_empty(&update_list)) {
2507 restart = 0;
2164 search = 0; 2508 search = 0;
2165 continue; 2509 continue;
2166 } 2510 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break; 2511 break;
2169 } 2512 }
2170 2513
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); 2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) { 2515 if (!ret) {
2173 skipped = 1; 2516 if (all)
2517 restart = 1;
2174 search = end + 1; 2518 search = end + 1;
2175 if (need_resched()) { 2519 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex); 2520 mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2533,7 @@ again:
2189 list_add_tail(&extent_op->list, &insert_list); 2533 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1; 2534 search = end + 1;
2191 if (num_inserts == max_inserts) { 2535 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex); 2536 restart = 1;
2193 break; 2537 break;
2194 } 2538 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) { 2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2549,6 @@ again:
2205 * somebody marked this thing for deletion then just unlock it and be 2549 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it 2550 * done, the free_extents will handle it
2207 */ 2551 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) { 2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr, 2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1, 2554 extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2570,10 @@ again:
2227 if (!list_empty(&update_list)) { 2570 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list); 2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret); 2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so let's start over */
2575 if (all)
2576 restart = 1;
2230 } 2577 }
2231 2578
2232 /* 2579 /*
@@ -2234,9 +2581,9 @@ again:
2234 * need to make sure everything is cleaned then reset everything and 2581 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning 2582 * go back to the beginning
2236 */ 2583 */
2237 if (!num_inserts && all && skipped) { 2584 if (!num_inserts && restart) {
2238 search = 0; 2585 search = 0;
2239 skipped = 0; 2586 restart = 0;
2240 INIT_LIST_HEAD(&update_list); 2587 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list); 2588 INIT_LIST_HEAD(&insert_list);
2242 goto again; 2589 goto again;
@@ -2293,27 +2640,19 @@ again:
2293 BUG_ON(ret); 2640 BUG_ON(ret);
2294 2641
2295 /* 2642 /*
2296 * if we broke out of the loop in order to insert stuff because we hit 2643 * if restart is set for whatever reason we need to go back and start
2297 * the maximum number of inserts at a time we can handle, then loop 2644 * searching through the pending list again.
2298 * back and pick up where we left off 2645 *
2299 */ 2646 * We just inserted some extents, which could have resulted in new
2300 if (num_inserts == max_inserts) { 2647 * blocks being allocated, which would result in new blocks needing
2301 INIT_LIST_HEAD(&insert_list); 2648 * updates, so if all is set we _must_ restart to get the updated
2302 INIT_LIST_HEAD(&update_list); 2649 * blocks.
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */ 2650 */
2312 if (all && skipped) { 2651 if (restart || all) {
2313 INIT_LIST_HEAD(&insert_list); 2652 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list); 2653 INIT_LIST_HEAD(&update_list);
2315 search = 0; 2654 search = 0;
2316 skipped = 0; 2655 restart = 0;
2317 num_inserts = 0; 2656 num_inserts = 0;
2318 goto again; 2657 goto again;
2319 } 2658 }
@@ -2547,6 +2886,7 @@ again:
2547 if (ret) { 2886 if (ret) {
2548 if (all && skipped && !nr) { 2887 if (all && skipped && !nr) {
2549 search = 0; 2888 search = 0;
2889 skipped = 0;
2550 continue; 2890 continue;
2551 } 2891 }
2552 mutex_unlock(&info->extent_ins_mutex); 2892 mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2973,8 @@ again:
2633 goto again; 2973 goto again;
2634 } 2974 }
2635 2975
2976 if (!err)
2977 finish_current_insert(trans, extent_root, 0);
2636 return err; 2978 return err;
2637} 2979}
2638 2980
@@ -2700,13 +3042,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 3042 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 3045 mutex_lock(&root->fs_info->pinned_mutex);
2704 3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 3047 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 3048 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 3049 return 0;
2712 } 3050 }
@@ -2787,7 +3125,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2787 3125
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) { 3126 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc; 3127 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024; 3128 if (!btrfs_test_opt(root, SSD))
3129 empty_cluster = 64 * 1024;
2791 } 3130 }
2792 3131
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 3132 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +3353,18 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3353static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3354{
3016 struct btrfs_block_group_cache *cache; 3355 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3356
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3357 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3358 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved), 3359 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not "); 3360 (info->full) ? "" : "not ");
3361 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3362 " may_use=%llu, used=%llu\n", info->total_bytes,
3363 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3364 info->bytes_used);
3023 3365
3024 down_read(&info->groups_sem); 3366 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3367 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3368 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3369 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3370 "%llu pinned %llu reserved\n",
@@ -3047,24 +3388,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3047{ 3388{
3048 int ret; 3389 int ret;
3049 u64 search_start = 0; 3390 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info; 3391 struct btrfs_fs_info *info = root->fs_info;
3052 3392
3053 if (data) { 3393 data = btrfs_get_alloc_profile(root, data);
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again: 3394again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /* 3395 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which 3396 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations 3397 * is not called recursively on allocations
@@ -3332,7 +3659,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3332 3659
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3660struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, 3661 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize) 3662 u64 bytenr, u32 blocksize,
3663 int level)
3336{ 3664{
3337 struct extent_buffer *buf; 3665 struct extent_buffer *buf;
3338 3666
@@ -3340,9 +3668,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3340 if (!buf) 3668 if (!buf)
3341 return ERR_PTR(-ENOMEM); 3669 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid); 3670 btrfs_set_header_generation(buf, trans->transid);
3671 btrfs_set_buffer_lockdep_class(buf, level);
3343 btrfs_tree_lock(buf); 3672 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3673 clean_tree_block(trans, root, buf);
3674
3675 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3676 btrfs_set_buffer_uptodate(buf);
3677
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3678 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3679 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3680 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3683,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3683 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3684 }
3353 trans->blocks_used++; 3685 trans->blocks_used++;
3686 /* this returns a buffer locked for blocking */
3354 return buf; 3687 return buf;
3355} 3688}
3356 3689
@@ -3379,7 +3712,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3379 return ERR_PTR(ret); 3712 return ERR_PTR(ret);
3380 } 3713 }
3381 3714
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3715 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3716 blocksize, level);
3383 return buf; 3717 return buf;
3384} 3718}
3385 3719
@@ -3388,36 +3722,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3722{
3389 u64 leaf_owner; 3723 u64 leaf_owner;
3390 u64 leaf_generation; 3724 u64 leaf_generation;
3725 struct refsort *sorted;
3391 struct btrfs_key key; 3726 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3727 struct btrfs_file_extent_item *fi;
3393 int i; 3728 int i;
3394 int nritems; 3729 int nritems;
3395 int ret; 3730 int ret;
3731 int refi = 0;
3732 int slot;
3396 3733
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3734 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3735 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3736 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3737 leaf_generation = btrfs_header_generation(leaf);
3401 3738
3739 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3740 /* we do this loop twice. The first time we build a list
3741 * of the extents we have a reference on, then we sort the list
3742 * by bytenr. The second time around we actually do the
3743 * extent freeing.
3744 */
3402 for (i = 0; i < nritems; i++) { 3745 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3746 u64 disk_bytenr;
3404 cond_resched(); 3747 cond_resched();
3405 3748
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3749 btrfs_item_key_to_cpu(leaf, &key, i);
3750
3751 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3752 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3753 continue;
3754
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3755 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3756
3757 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3758 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3759 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3760 continue;
3413 /* 3761
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3762 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3763
3764 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3765 if (disk_bytenr == 0)
3419 continue; 3766 continue;
3420 3767
3768 sorted[refi].bytenr = disk_bytenr;
3769 sorted[refi].slot = i;
3770 refi++;
3771 }
3772
3773 if (refi == 0)
3774 goto out;
3775
3776 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3777
3778 for (i = 0; i < refi; i++) {
3779 u64 disk_bytenr;
3780
3781 disk_bytenr = sorted[i].bytenr;
3782 slot = sorted[i].slot;
3783
3784 cond_resched();
3785
3786 btrfs_item_key_to_cpu(leaf, &key, slot);
3787 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3788 continue;
3789
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3792 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3793 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3794 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3799,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3799 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3800 cond_resched();
3430 } 3801 }
3802out:
3803 kfree(sorted);
3431 return 0; 3804 return 0;
3432} 3805}
3433 3806
@@ -3437,9 +3810,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3810{
3438 int i; 3811 int i;
3439 int ret; 3812 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3813 struct btrfs_extent_info *info;
3814 struct refsort *sorted;
3815
3816 if (ref->nritems == 0)
3817 return 0;
3441 3818
3819 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3820 for (i = 0; i < ref->nritems; i++) {
3821 sorted[i].bytenr = ref->extents[i].bytenr;
3822 sorted[i].slot = i;
3823 }
3824 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3825
3826 /*
3827 * the items in the ref were sorted when the ref was inserted
3828 * into the ref cache, so this is already in order
3829 */
3830 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3832 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3833 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3834 ref->owner, ref->generation,
@@ -3453,6 +3842,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3842 info++;
3454 } 3843 }
3455 3844
3845 kfree(sorted);
3456 return 0; 3846 return 0;
3457} 3847}
3458 3848
@@ -3497,6 +3887,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3887}
3498 3888
3499/* 3889/*
3890 * this is used while deleting old snapshots, and it drops the refs
3891 * on a whole subtree starting from a level 1 node.
3892 *
3893 * The idea is to sort all the leaf pointers, and then drop the
3894 * ref on all the leaves in order. Most of the time the leaves
3895 * will have ref cache entries, so no leaf IOs will be required to
3896 * find the extents they have references on.
3897 *
3898 * For each leaf, any references it has are also dropped in order
3899 *
3900 * This ends up dropping the references in something close to optimal
3901 * order for reading and modifying the extent allocation tree.
3902 */
3903static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3904 struct btrfs_root *root,
3905 struct btrfs_path *path)
3906{
3907 u64 bytenr;
3908 u64 root_owner;
3909 u64 root_gen;
3910 struct extent_buffer *eb = path->nodes[1];
3911 struct extent_buffer *leaf;
3912 struct btrfs_leaf_ref *ref;
3913 struct refsort *sorted = NULL;
3914 int nritems = btrfs_header_nritems(eb);
3915 int ret;
3916 int i;
3917 int refi = 0;
3918 int slot = path->slots[1];
3919 u32 blocksize = btrfs_level_size(root, 0);
3920 u32 refs;
3921
3922 if (nritems == 0)
3923 goto out;
3924
3925 root_owner = btrfs_header_owner(eb);
3926 root_gen = btrfs_header_generation(eb);
3927 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3928
3929 /*
3930 * step one, sort all the leaf pointers so we don't scribble
3931 * randomly into the extent allocation tree
3932 */
3933 for (i = slot; i < nritems; i++) {
3934 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3935 sorted[refi].slot = i;
3936 refi++;
3937 }
3938
3939 /*
3940 * nritems won't be zero, but if we're picking up drop_snapshot
3941 * after a crash, slot might be > 0, so double check things
3942 * just in case.
3943 */
3944 if (refi == 0)
3945 goto out;
3946
3947 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3948
3949 /*
3950 * the first loop frees everything the leaves point to
3951 */
3952 for (i = 0; i < refi; i++) {
3953 u64 ptr_gen;
3954
3955 bytenr = sorted[i].bytenr;
3956
3957 /*
3958 * check the reference count on this leaf. If it is > 1
3959 * we just decrement it below and don't update any
3960 * of the refs the leaf points to.
3961 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3963 BUG_ON(ret);
3964 if (refs != 1)
3965 continue;
3966
3967 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3968
3969 /*
3970 * the leaf only had one reference, which means the
3971 * only thing pointing to this leaf is the snapshot
3972 * we're deleting. It isn't possible for the reference
3973 * count to increase again later
3974 *
3975 * The reference cache is checked for the leaf,
3976 * and if found we'll be able to drop any refs held by
3977 * the leaf without needing to read it in.
3978 */
3979 ref = btrfs_lookup_leaf_ref(root, bytenr);
3980 if (ref && ref->generation != ptr_gen) {
3981 btrfs_free_leaf_ref(root, ref);
3982 ref = NULL;
3983 }
3984 if (ref) {
3985 ret = cache_drop_leaf_ref(trans, root, ref);
3986 BUG_ON(ret);
3987 btrfs_remove_leaf_ref(root, ref);
3988 btrfs_free_leaf_ref(root, ref);
3989 } else {
3990 /*
3991 * the leaf wasn't in the reference cache, so
3992 * we have to read it.
3993 */
3994 leaf = read_tree_block(root, bytenr, blocksize,
3995 ptr_gen);
3996 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3997 BUG_ON(ret);
3998 free_extent_buffer(leaf);
3999 }
4000 atomic_inc(&root->fs_info->throttle_gen);
4001 wake_up(&root->fs_info->transaction_throttle);
4002 cond_resched();
4003 }
4004
4005 /*
4006 * run through the loop again to free the refs on the leaves.
4007 * This is faster than doing it in the loop above because
4008 * the leaves are likely to be clustered together. We end up
4009 * working in nice chunks on the extent allocation tree.
4010 */
4011 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret);
4017
4018 atomic_inc(&root->fs_info->throttle_gen);
4019 wake_up(&root->fs_info->transaction_throttle);
4020 cond_resched();
4021 }
4022out:
4023 kfree(sorted);
4024
4025 /*
4026 * update the path to show we've processed the entire level 1
4027 * node. This will get saved into the root's drop_snapshot_progress
4028 * field so these drops are not repeated again if this transaction
4029 * commits.
4030 */
4031 path->slots[1] = nritems;
4032 return 0;
4033}
4034
4035/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 4036 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 4037 * counts as it goes.
3502 */ 4038 */
@@ -3511,7 +4047,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 4047 struct extent_buffer *next;
3512 struct extent_buffer *cur; 4048 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 4049 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 4050 u32 blocksize;
3516 int ret; 4051 int ret;
3517 u32 refs; 4052 u32 refs;
@@ -3538,17 +4073,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 4073 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 4074 btrfs_header_nritems(cur))
3540 break; 4075 break;
4076
4077 /* the new code goes down to level 1 and does all the
4078 * leaves pointed to that node in bulk. So, this check
4079 * for level 0 will always be false.
4080 *
4081 * But, the disk format allows the drop_snapshot_progress
4082 * field in the root to leave things in a state where
4083 * a leaf will need cleaning up here. If someone crashes
4084 * with the old code and then boots with the new code,
4085 * we might find a leaf here.
4086 */
3541 if (*level == 0) { 4087 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 4088 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 4089 BUG_ON(ret);
3544 break; 4090 break;
3545 } 4091 }
4092
4093 /*
4094 * once we get to level one, process the whole node
4095 * at once, including everything below it.
4096 */
4097 if (*level == 1) {
4098 ret = drop_level_one_refs(trans, root, path);
4099 BUG_ON(ret);
4100 break;
4101 }
4102
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 4103 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 4105 blocksize = btrfs_level_size(root, *level - 1);
3549 4106
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 4108 BUG_ON(ret);
4109
4110 /*
4111 * if there is more than one reference, we don't need
4112 * to read that node to drop any references it has. We
4113 * just drop the ref we hold on that node and move on to the
4114 * next slot in this level.
4115 */
3552 if (refs != 1) { 4116 if (refs != 1) {
3553 parent = path->nodes[*level]; 4117 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 4118 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +4131,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 4131
3568 continue; 4132 continue;
3569 } 4133 }
4134
3570 /* 4135 /*
3571 * at this point, we have a single ref, and since the 4136 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 4137 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 4138 */
3576 if (*level == 1) { 4139 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 4140 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 4141 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 4142 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +4161,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 4161 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 4162 root_gen = btrfs_header_generation(parent);
3633 4163
4164 /*
4165 * cleanup and free the reference on the last node
4166 * we processed
4167 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 4169 parent->start, root_owner, root_gen,
3636 *level, 1); 4170 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 4171 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 4172 path->nodes[*level] = NULL;
4173
3639 *level += 1; 4174 *level += 1;
3640 BUG_ON(ret); 4175 BUG_ON(ret);
3641 4176
@@ -3687,6 +4222,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 4222
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 4223 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 4224 btrfs_tree_lock(next);
4225 btrfs_set_lock_blocking(next);
3690 4226
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 4227 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 4228 &refs);
@@ -3754,6 +4290,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4290 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4291 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4292 struct btrfs_disk_key disk_key;
4293
4294 /*
4295 * there is more work to do in this level.
4296 * Update the drop_progress marker to reflect
4297 * the work we've done so far, and then bump
4298 * the slot number
4299 */
3757 node = path->nodes[i]; 4300 node = path->nodes[i];
3758 path->slots[i]++; 4301 path->slots[i]++;
3759 *level = i; 4302 *level = i;
@@ -3765,6 +4308,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4308 return 0;
3766 } else { 4309 } else {
3767 struct extent_buffer *parent; 4310 struct extent_buffer *parent;
4311
4312 /*
4313 * this whole node is done, free our reference
4314 * on it and go up one level
4315 */
3768 if (path->nodes[*level] == root->node) 4316 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4317 parent = path->nodes[*level];
3770 else 4318 else
@@ -3891,13 +4439,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3891 path = btrfs_alloc_path(); 4439 path = btrfs_alloc_path();
3892 BUG_ON(!path); 4440 BUG_ON(!path);
3893 4441
3894 BUG_ON(!btrfs_tree_locked(parent)); 4442 btrfs_assert_tree_locked(parent);
3895 parent_level = btrfs_header_level(parent); 4443 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent); 4444 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent; 4445 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent); 4446 path->slots[parent_level] = btrfs_header_nritems(parent);
3899 4447
3900 BUG_ON(!btrfs_tree_locked(node)); 4448 btrfs_assert_tree_locked(node);
3901 level = btrfs_header_level(node); 4449 level = btrfs_header_level(node);
3902 extent_buffer_get(node); 4450 extent_buffer_get(node);
3903 path->nodes[level] = node; 4451 path->nodes[level] = node;
@@ -4444,7 +4992,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4992 u64 lock_end = 0;
4445 u64 num_bytes; 4993 u64 num_bytes;
4446 u64 ext_offset; 4994 u64 ext_offset;
4447 u64 first_pos; 4995 u64 search_end = (u64)-1;
4448 u32 nritems; 4996 u32 nritems;
4449 int nr_scaned = 0; 4997 int nr_scaned = 0;
4450 int extent_locked = 0; 4998 int extent_locked = 0;
@@ -4452,7 +5000,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 5000 int ret;
4453 5001
4454 memcpy(&key, leaf_key, sizeof(key)); 5002 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 5003 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 5004 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 5005 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +5048,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 5048 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 5049 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 5050 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 5051 key.offset >= search_end)
4505 break; 5052 break;
4506 } 5053 }
4507 5054
@@ -4534,8 +5081,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 5081 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 5082 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 5083
4537 if (first_pos > key.offset - ext_offset) 5084 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 5085 search_end = key.offset - ext_offset +
5086 btrfs_file_extent_ram_bytes(leaf, fi);
5087 }
4539 5088
4540 if (!extent_locked) { 5089 if (!extent_locked) {
4541 lock_start = key.offset; 5090 lock_start = key.offset;
@@ -4724,7 +5273,7 @@ next:
4724 } 5273 }
4725skip: 5274skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5275 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5276 key.offset >= search_end)
4728 break; 5277 break;
4729 5278
4730 cond_resched(); 5279 cond_resched();
@@ -4778,6 +5327,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5327 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5328 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5329 ref->generation = btrfs_header_generation(buf);
5330
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5331 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5332 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5333 btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5901,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5351 prev_block = block_start; 5901 prev_block = block_start;
5352 } 5902 }
5353 5903
5904 mutex_lock(&extent_root->fs_info->trans_mutex);
5354 btrfs_record_root_in_trans(found_root); 5905 btrfs_record_root_in_trans(found_root);
5906 mutex_unlock(&extent_root->fs_info->trans_mutex);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5907 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /* 5908 /*
5357 * try to update data extent references while 5909 * try to update data extent references while
@@ -5789,6 +6341,7 @@ out:
5789int btrfs_free_block_groups(struct btrfs_fs_info *info) 6341int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{ 6342{
5791 struct btrfs_block_group_cache *block_group; 6343 struct btrfs_block_group_cache *block_group;
6344 struct btrfs_space_info *space_info;
5792 struct rb_node *n; 6345 struct rb_node *n;
5793 6346
5794 spin_lock(&info->block_group_cache_lock); 6347 spin_lock(&info->block_group_cache_lock);
@@ -5810,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
5810 spin_lock(&info->block_group_cache_lock); 6363 spin_lock(&info->block_group_cache_lock);
5811 } 6364 }
5812 spin_unlock(&info->block_group_cache_lock); 6365 spin_unlock(&info->block_group_cache_lock);
6366
6367 /* now that all the block groups are freed, go through and
6368 * free all the space_info structs. This is only called during
6369 * the final stages of unmount, and so we know nobody is
6370 * using them. We call synchronize_rcu() once before we start,
6371 * just to be on the safe side.
6372 */
6373 synchronize_rcu();
6374
6375 while(!list_empty(&info->space_info)) {
6376 space_info = list_entry(info->space_info.next,
6377 struct btrfs_space_info,
6378 list);
6379
6380 list_del(&space_info->list);
6381 kfree(space_info);
6382 }
5813 return 0; 6383 return 0;
5814} 6384}
5815 6385
@@ -5957,9 +6527,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6527 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6528 BUG_ON(!path);
5959 6529
5960 btrfs_remove_free_space_cache(block_group); 6530 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6531 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6532 &root->fs_info->block_group_cache_tree);
6533 spin_unlock(&root->fs_info->block_group_cache_lock);
6534 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6535 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6536 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6537 up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1fa..ebe6b29e6069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/pagevec.h> 13#include <linux/pagevec.h>
15#include "extent_io.h" 14#include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
31static LIST_HEAD(states); 30static LIST_HEAD(states);
32 31
33#define LEAK_DEBUG 0 32#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG 33#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock); 34static DEFINE_SPINLOCK(leak_lock);
36#endif 35#endif
37 36
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
120static struct extent_state *alloc_extent_state(gfp_t mask) 119static struct extent_state *alloc_extent_state(gfp_t mask)
121{ 120{
122 struct extent_state *state; 121 struct extent_state *state;
123#ifdef LEAK_DEBUG 122#if LEAK_DEBUG
124 unsigned long flags; 123 unsigned long flags;
125#endif 124#endif
126 125
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
130 state->state = 0; 129 state->state = 0;
131 state->private = 0; 130 state->private = 0;
132 state->tree = NULL; 131 state->tree = NULL;
133#ifdef LEAK_DEBUG 132#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags); 133 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states); 134 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags); 135 spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
145 if (!state) 144 if (!state)
146 return; 145 return;
147 if (atomic_dec_and_test(&state->refs)) { 146 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG 147#if LEAK_DEBUG
149 unsigned long flags; 148 unsigned long flags;
150#endif 149#endif
151 WARN_ON(state->tree); 150 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG 151#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags); 152 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list); 153 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags); 154 spin_unlock_irqrestore(&leak_lock, flags);
@@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
416 415
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 416 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) { 417 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc); 418 free_extent_state(prealloc);
422 return -EEXIST; 419 return -EEXIST;
423 } 420 }
@@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2378 int scanned = 0; 2375 int scanned = 0;
2379 int range_whole = 0; 2376 int range_whole = 0;
2380 2377
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0); 2378 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) { 2379 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */ 2380 index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2847,98 @@ out:
2855 return sector; 2847 return sector;
2856} 2848}
2857 2849
2850int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2851 __u64 start, __u64 len, get_extent_t *get_extent)
2852{
2853 int ret;
2854 u64 off = start;
2855 u64 max = start + len;
2856 u32 flags = 0;
2857 u64 disko = 0;
2858 struct extent_map *em = NULL;
2859 int end = 0;
2860 u64 em_start = 0, em_len = 0;
2861 unsigned long emflags;
2862 ret = 0;
2863
2864 if (len == 0)
2865 return -EINVAL;
2866
2867 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2868 GFP_NOFS);
2869 em = get_extent(inode, NULL, 0, off, max - off, 0);
2870 if (!em)
2871 goto out;
2872 if (IS_ERR(em)) {
2873 ret = PTR_ERR(em);
2874 goto out;
2875 }
2876 while (!end) {
2877 off = em->start + em->len;
2878 if (off >= max)
2879 end = 1;
2880
2881 em_start = em->start;
2882 em_len = em->len;
2883
2884 disko = 0;
2885 flags = 0;
2886
2887 switch (em->block_start) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST;
2891 break;
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break;
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break;
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN);
2902 break;
2903 default:
2904 disko = em->block_start;
2905 break;
2906 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED;
2909
2910 emflags = em->flags;
2911 free_extent_map(em);
2912 em = NULL;
2913
2914 if (!end) {
2915 em = get_extent(inode, NULL, 0, off, max - off, 0);
2916 if (!em)
2917 goto out;
2918 if (IS_ERR(em)) {
2919 ret = PTR_ERR(em);
2920 goto out;
2921 }
2922 emflags = em->flags;
2923 }
2924 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2925 flags |= FIEMAP_EXTENT_LAST;
2926 end = 1;
2927 }
2928
2929 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2930 em_len, flags);
2931 if (ret)
2932 goto out_free;
2933 }
2934out_free:
2935 free_extent_map(em);
2936out:
2937 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2938 GFP_NOFS);
2939 return ret;
2940}
2941
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2942static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i) 2943 unsigned long i)
2860{ 2944{
@@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2892 gfp_t mask) 2976 gfp_t mask)
2893{ 2977{
2894 struct extent_buffer *eb = NULL; 2978 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG 2979#if LEAK_DEBUG
2896 unsigned long flags; 2980 unsigned long flags;
2897#endif 2981#endif
2898 2982
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2983 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start; 2984 eb->start = start;
2901 eb->len = len; 2985 eb->len = len;
2902 mutex_init(&eb->mutex); 2986 spin_lock_init(&eb->lock);
2903#ifdef LEAK_DEBUG 2987 init_waitqueue_head(&eb->lock_wq);
2988
2989#if LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags); 2990 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers); 2991 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags); 2992 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2912 2998
2913static void __free_extent_buffer(struct extent_buffer *eb) 2999static void __free_extent_buffer(struct extent_buffer *eb)
2914{ 3000{
2915#ifdef LEAK_DEBUG 3001#if LEAK_DEBUG
2916 unsigned long flags; 3002 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags); 3003 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list); 3004 list_del(&eb->leak_list);
@@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2980 unlock_page(p); 3066 unlock_page(p);
2981 } 3067 }
2982 if (uptodate) 3068 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE; 3069 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985 3070
2986 spin_lock(&tree->buffer_lock); 3071 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3072 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3135 unsigned long num_pages; 3220 unsigned long num_pages;
3136 3221
3137 num_pages = num_extent_pages(eb->start, eb->len); 3222 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE; 3223 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3139 3224
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3225 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS); 3226 GFP_NOFS);
@@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3206 struct page *page; 3291 struct page *page;
3207 int pg_uptodate = 1; 3292 int pg_uptodate = 1;
3208 3293
3209 if (eb->flags & EXTENT_UPTODATE) 3294 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3210 return 1; 3295 return 1;
3211 3296
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3297 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 struct bio *bio = NULL; 3327 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0; 3328 unsigned long bio_flags = 0;
3244 3329
3245 if (eb->flags & EXTENT_UPTODATE) 3330 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3246 return 0; 3331 return 0;
3247 3332
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3333 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3273 } 3358 }
3274 if (all_uptodate) { 3359 if (all_uptodate) {
3275 if (start_i == 0) 3360 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE; 3361 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 goto unlock_exit; 3362 goto unlock_exit;
3278 } 3363 }
3279 3364
@@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3309 } 3394 }
3310 3395
3311 if (!ret) 3396 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE; 3397 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3313 return ret; 3398 return ret;
3314 3399
3315unlock_exit: 3400unlock_exit:
@@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3406 unmap_extent_buffer(eb, eb->map_token, km); 3491 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL; 3492 eb->map_token = NULL;
3408 save = 1; 3493 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 } 3494 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map, 3495 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km); 3496 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a79137..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
193 unsigned from, unsigned to); 205 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 206sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent); 207 get_extent_t *get_extent);
208int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
209 __u64 start, __u64 len, get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 210int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 211int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 212int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada32..50da69da20ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h> 6#include <linux/hardirq.h>
8#include "extent_map.h" 7#include "extent_map.h"
9 8
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145e..dc78954861b3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
29#include <linux/writeback.h> 29#include <linux/writeback.h>
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h" 32#include "ctree.h"
34#include "disk-io.h" 33#include "disk-io.h"
35#include "transaction.h" 34#include "transaction.h"
@@ -1092,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1092 WARN_ON(num_pages > nrptrs); 1091 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs); 1092 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094 1093
1095 ret = btrfs_check_free_space(root, write_bytes, 0); 1094 ret = btrfs_check_data_free_space(root, inode, write_bytes);
1096 if (ret) 1095 if (ret)
1097 goto out; 1096 goto out;
1098 1097
1099 ret = prepare_pages(root, file, pages, num_pages, 1098 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index, 1099 pos, first_index, last_index,
1101 write_bytes); 1100 write_bytes);
1102 if (ret) 1101 if (ret) {
1102 btrfs_free_reserved_data_space(root, inode,
1103 write_bytes);
1103 goto out; 1104 goto out;
1105 }
1104 1106
1105 ret = btrfs_copy_from_user(pos, num_pages, 1107 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf); 1108 write_bytes, pages, buf);
1107 if (ret) { 1109 if (ret) {
1110 btrfs_free_reserved_data_space(root, inode,
1111 write_bytes);
1108 btrfs_drop_pages(pages, num_pages); 1112 btrfs_drop_pages(pages, num_pages);
1109 goto out; 1113 goto out;
1110 } 1114 }
@@ -1112,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1112 ret = dirty_and_release_pages(NULL, root, file, pages, 1116 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes); 1117 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages); 1118 btrfs_drop_pages(pages, num_pages);
1115 if (ret) 1119 if (ret) {
1120 btrfs_free_reserved_data_space(root, inode,
1121 write_bytes);
1116 goto out; 1122 goto out;
1123 }
1117 1124
1118 if (will_write) { 1125 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos, 1126 btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1137,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1137 } 1144 }
1138out: 1145out:
1139 mutex_unlock(&inode->i_mutex); 1146 mutex_unlock(&inode->i_mutex);
1147 if (ret)
1148 err = ret;
1140 1149
1141out_nolock: 1150out_nolock:
1142 kfree(pages); 1151 kfree(pages);
@@ -1215,15 +1224,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1215 } 1224 }
1216 mutex_unlock(&root->fs_info->trans_mutex); 1225 mutex_unlock(&root->fs_info->trans_mutex);
1217 1226
1218 root->fs_info->tree_log_batch++; 1227 root->log_batch++;
1219 filemap_fdatawrite(inode->i_mapping); 1228 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1229 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++; 1230 root->log_batch++;
1222 1231
1223 /* 1232 /*
1224 * ok we haven't committed the transaction yet, lets do a commit 1233 * ok we haven't committed the transaction yet, lets do a commit
1225 */ 1234 */
1226 if (file->private_data) 1235 if (file && file->private_data)
1227 btrfs_ioctl_trans_end(file); 1236 btrfs_ioctl_trans_end(file);
1228 1237
1229 trans = btrfs_start_transaction(root, 1); 1238 trans = btrfs_start_transaction(root, 1);
@@ -1232,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1232 goto out; 1241 goto out;
1233 } 1242 }
1234 1243
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); 1244 ret = btrfs_log_dentry_safe(trans, root, dentry);
1236 if (ret < 0) 1245 if (ret < 0)
1237 goto out; 1246 goto out;
1238 1247
@@ -1246,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1246 * file again, but that will end up using the synchronization 1255 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe. 1256 * inside btrfs_sync_log to keep things safe.
1248 */ 1257 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex); 1258 mutex_unlock(&dentry->d_inode->i_mutex);
1250 1259
1251 if (ret > 0) { 1260 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root); 1261 ret = btrfs_commit_transaction(trans, root);
@@ -1254,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1254 btrfs_sync_log(trans, root); 1263 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root); 1264 ret = btrfs_end_transaction(trans, root);
1256 } 1265 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex); 1266 mutex_lock(&dentry->d_inode->i_mutex);
1258out: 1267out:
1259 return ret > 0 ? EIO : ret; 1268 return ret > 0 ? EIO : ret;
1260} 1269}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
86 86
87 btrfs_init_path(path);
88 start_found = 0; 87 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); 88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0) 89 if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab41..7d4f948bc22a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/statfs.h> 34#include <linux/statfs.h>
35#include <linux/compat.h> 35#include <linux/compat.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h> 37#include <linux/xattr.h>
39#include <linux/posix_acl.h> 38#include <linux/posix_acl.h>
40#include <linux/falloc.h> 39#include <linux/falloc.h>
@@ -51,6 +50,7 @@
51#include "tree-log.h" 50#include "tree-log.h"
52#include "ref-cache.h" 51#include "ref-cache.h"
53#include "compression.h" 52#include "compression.h"
53#include "locking.h"
54 54
55struct btrfs_iget_args { 55struct btrfs_iget_args {
56 u64 ino; 56 u64 ino;
@@ -91,32 +91,14 @@ static noinline int cow_file_range(struct inode *inode,
91 u64 start, u64 end, int *page_started, 91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock); 92 unsigned long *nr_written, int unlock);
93 93
94/* 94static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{ 95{
101 u64 total; 96 int err;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115 97
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh) 98 err = btrfs_init_acl(inode, dir);
117 ret = -ENOSPC; 99 if (!err)
118 spin_unlock(&root->fs_info->delalloc_lock); 100 err = btrfs_xattr_security_init(inode, dir);
119 return ret; 101 return err;
120} 102}
121 103
122/* 104/*
@@ -350,6 +332,19 @@ again:
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 332 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 333 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352 334
335 /*
336 * we don't want to send crud past the end of i_size through
337 * compression, that's just a waste of CPU time. So, if the
338 * end of the file is before the start of our current
339 * requested range of bytes, we bail out to the uncompressed
340 * cleanup code that can deal with all of this.
341 *
342 * It isn't really the fastest way to fix things, but this is a
343 * very uncommon corner.
344 */
345 if (actual_end <= start)
346 goto cleanup_and_bail_uncompressed;
347
353 total_compressed = actual_end - start; 348 total_compressed = actual_end - start;
354 349
355 /* we want to make sure that amount of ram required to uncompress 350 /* we want to make sure that amount of ram required to uncompress
@@ -494,6 +489,7 @@ again:
494 goto again; 489 goto again;
495 } 490 }
496 } else { 491 } else {
492cleanup_and_bail_uncompressed:
497 /* 493 /*
498 * No compression, but we still need to write the pages in 494 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked 495 * the file we've been given so far. redirty the locked
@@ -1166,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1166 */ 1162 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1163 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root; 1164 struct btrfs_root *root = BTRFS_I(inode)->root;
1165 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1169 spin_lock(&root->fs_info->delalloc_lock); 1166 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1167 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1; 1168 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1199,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1199 (unsigned long long)end - start + 1, 1196 (unsigned long long)end - start + 1,
1200 (unsigned long long) 1197 (unsigned long long)
1201 root->fs_info->delalloc_bytes); 1198 root->fs_info->delalloc_bytes);
1199 btrfs_delalloc_free_space(root, inode, (u64)-1);
1202 root->fs_info->delalloc_bytes = 0; 1200 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0; 1201 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else { 1202 } else {
1203 btrfs_delalloc_free_space(root, inode,
1204 end - start + 1);
1205 root->fs_info->delalloc_bytes -= end - start + 1; 1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 } 1207 }
@@ -1324,12 +1324,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset, 1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list) 1325 struct list_head *list)
1326{ 1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum; 1327 struct btrfs_ordered_sum *sum;
1329 1328
1330 btrfs_set_trans_block_group(trans, inode); 1329 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) { 1330
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1331 list_for_each_entry(sum, list, list) {
1333 btrfs_csum_file_blocks(trans, 1332 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1333 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 } 1334 }
@@ -2013,6 +2012,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2012 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014 2013
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2014 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2015
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2017 alloc_group_block, 0);
2018 btrfs_free_path(path); 2018 btrfs_free_path(path);
@@ -2039,6 +2039,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break; 2040 break;
2041 default: 2041 default:
2042 inode->i_op = &btrfs_special_inode_operations;
2042 init_special_inode(inode, inode->i_mode, rdev); 2043 init_special_inode(inode, inode->i_mode, rdev);
2043 break; 2044 break;
2044 } 2045 }
@@ -2108,6 +2109,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2108 goto failed; 2109 goto failed;
2109 } 2110 }
2110 2111
2112 btrfs_unlock_up_safe(path, 1);
2111 leaf = path->nodes[0]; 2113 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2114 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item); 2115 struct btrfs_inode_item);
@@ -2219,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2219 2221
2220 root = BTRFS_I(dir)->root; 2222 root = BTRFS_I(dir)->root;
2221 2223
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1); 2224 trans = btrfs_start_transaction(root, 1);
2227 2225
2228 btrfs_set_trans_block_group(trans, dir); 2226 btrfs_set_trans_block_group(trans, dir);
@@ -2235,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2235 nr = trans->blocks_used; 2233 nr = trans->blocks_used;
2236 2234
2237 btrfs_end_transaction_throttle(trans, root); 2235 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr); 2236 btrfs_btree_balance_dirty(root, nr);
2240 return ret; 2237 return ret;
2241} 2238}
@@ -2258,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2258 return -ENOTEMPTY; 2255 return -ENOTEMPTY;
2259 } 2256 }
2260 2257
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1); 2258 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir); 2259 btrfs_set_trans_block_group(trans, dir);
2267 2260
@@ -2278,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2278fail_trans: 2271fail_trans:
2279 nr = trans->blocks_used; 2272 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root); 2273 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr); 2274 btrfs_btree_balance_dirty(root, nr);
2283 2275
2284 if (ret && !err) 2276 if (ret && !err)
@@ -2429,6 +2421,8 @@ next_node:
2429 ref->generation = leaf_gen; 2421 ref->generation = leaf_gen;
2430 ref->nritems = 0; 2422 ref->nritems = 0;
2431 2423
2424 btrfs_sort_leaf_ref(ref);
2425
2432 ret = btrfs_add_leaf_ref(root, ref, 0); 2426 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret); 2427 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref); 2428 btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2470,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_path *path; 2470 struct btrfs_path *path;
2477 struct btrfs_key key; 2471 struct btrfs_key key;
2478 struct btrfs_key found_key; 2472 struct btrfs_key found_key;
2479 u32 found_type; 2473 u32 found_type = (u8)-1;
2480 struct extent_buffer *leaf; 2474 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi; 2475 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0; 2476 u64 extent_start = 0;
@@ -2503,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2503 key.offset = (u64)-1; 2497 key.offset = (u64)-1;
2504 key.type = (u8)-1; 2498 key.type = (u8)-1;
2505 2499
2506 btrfs_init_path(path);
2507
2508search_again: 2500search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0) 2502 if (ret < 0)
@@ -2663,6 +2655,8 @@ next:
2663 if (pending_del_nr) 2655 if (pending_del_nr)
2664 goto del_pending; 2656 goto del_pending;
2665 btrfs_release_path(root, path); 2657 btrfs_release_path(root, path);
2658 if (found_type == BTRFS_INODE_ITEM_KEY)
2659 break;
2666 goto search_again; 2660 goto search_again;
2667 } 2661 }
2668 2662
@@ -2679,6 +2673,8 @@ del_pending:
2679 BUG_ON(ret); 2673 BUG_ON(ret);
2680 pending_del_nr = 0; 2674 pending_del_nr = 0;
2681 btrfs_release_path(root, path); 2675 btrfs_release_path(root, path);
2676 if (found_type == BTRFS_INODE_ITEM_KEY)
2677 break;
2682 goto search_again; 2678 goto search_again;
2683 } 2679 }
2684 } 2680 }
@@ -2788,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2788 if (size <= hole_start) 2784 if (size <= hole_start)
2789 return 0; 2785 return 0;
2790 2786
2791 err = btrfs_check_free_space(root, 1, 0); 2787 err = btrfs_check_metadata_free_space(root);
2792 if (err) 2788 if (err)
2793 return err; 2789 return err;
2794 2790
@@ -2984,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->last_trans = 0; 2980 bi->last_trans = 0;
2985 bi->logged_trans = 0; 2981 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0; 2982 bi->delalloc_bytes = 0;
2983 bi->reserved_bytes = 0;
2987 bi->disk_i_size = 0; 2984 bi->disk_i_size = 0;
2988 bi->flags = 0; 2985 bi->flags = 0;
2989 bi->index_cnt = (u64)-1; 2986 bi->index_cnt = (u64)-1;
@@ -3005,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
3005 inode->i_ino = args->ino; 3002 inode->i_ino = args->ino;
3006 init_btrfs_i(inode); 3003 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root; 3004 BTRFS_I(inode)->root = args->root;
3005 btrfs_set_inode_space_info(args->root, inode);
3008 return 0; 3006 return 0;
3009} 3007}
3010 3008
@@ -3265,7 +3263,7 @@ skip:
3265 3263
3266 /* Reached end of directory/root. Bump pos past the last item. */ 3264 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3265 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3266 filp->f_pos = INT_LIMIT(off_t);
3269 else 3267 else
3270 filp->f_pos++; 3268 filp->f_pos++;
3271nopos: 3269nopos:
@@ -3425,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3425 BTRFS_I(inode)->index_cnt = 2; 3423 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root; 3424 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid; 3425 BTRFS_I(inode)->generation = trans->transid;
3426 btrfs_set_inode_space_info(root, inode);
3428 3427
3429 if (mode & S_IFDIR) 3428 if (mode & S_IFDIR)
3430 owner = 0; 3429 owner = 0;
@@ -3458,7 +3457,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3458 root->highest_inode = objectid; 3457 root->highest_inode = objectid;
3459 3458
3460 inode->i_uid = current_fsuid(); 3459 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid(); 3460
3461 if (dir && (dir->i_mode & S_ISGID)) {
3462 inode->i_gid = dir->i_gid;
3463 if (S_ISDIR(mode))
3464 mode |= S_ISGID;
3465 } else
3466 inode->i_gid = current_fsgid();
3467
3462 inode->i_mode = mode; 3468 inode->i_mode = mode;
3463 inode->i_ino = objectid; 3469 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0); 3470 inode_set_bytes(inode, 0);
@@ -3565,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3565 if (!new_valid_dev(rdev)) 3571 if (!new_valid_dev(rdev))
3566 return -EINVAL; 3572 return -EINVAL;
3567 3573
3568 err = btrfs_check_free_space(root, 1, 0); 3574 err = btrfs_check_metadata_free_space(root);
3569 if (err) 3575 if (err)
3570 goto fail; 3576 goto fail;
3571 3577
@@ -3586,7 +3592,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3586 if (IS_ERR(inode)) 3592 if (IS_ERR(inode))
3587 goto out_unlock; 3593 goto out_unlock;
3588 3594
3589 err = btrfs_init_acl(inode, dir); 3595 err = btrfs_init_inode_security(inode, dir);
3590 if (err) { 3596 if (err) {
3591 drop_inode = 1; 3597 drop_inode = 1;
3592 goto out_unlock; 3598 goto out_unlock;
@@ -3628,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3628 u64 objectid; 3634 u64 objectid;
3629 u64 index = 0; 3635 u64 index = 0;
3630 3636
3631 err = btrfs_check_free_space(root, 1, 0); 3637 err = btrfs_check_metadata_free_space(root);
3632 if (err) 3638 if (err)
3633 goto fail; 3639 goto fail;
3634 trans = btrfs_start_transaction(root, 1); 3640 trans = btrfs_start_transaction(root, 1);
@@ -3649,7 +3655,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3649 if (IS_ERR(inode)) 3655 if (IS_ERR(inode))
3650 goto out_unlock; 3656 goto out_unlock;
3651 3657
3652 err = btrfs_init_acl(inode, dir); 3658 err = btrfs_init_inode_security(inode, dir);
3653 if (err) { 3659 if (err) {
3654 drop_inode = 1; 3660 drop_inode = 1;
3655 goto out_unlock; 3661 goto out_unlock;
@@ -3696,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3696 return -ENOENT; 3702 return -ENOENT;
3697 3703
3698 btrfs_inc_nlink(inode); 3704 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0); 3705 err = btrfs_check_metadata_free_space(root);
3700 if (err) 3706 if (err)
3701 goto fail; 3707 goto fail;
3702 err = btrfs_set_inode_index(dir, &index); 3708 err = btrfs_set_inode_index(dir, &index);
@@ -3742,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3742 u64 index = 0; 3748 u64 index = 0;
3743 unsigned long nr = 1; 3749 unsigned long nr = 1;
3744 3750
3745 err = btrfs_check_free_space(root, 1, 0); 3751 err = btrfs_check_metadata_free_space(root);
3746 if (err) 3752 if (err)
3747 goto out_unlock; 3753 goto out_unlock;
3748 3754
@@ -3772,7 +3778,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3772 3778
3773 drop_on_err = 1; 3779 drop_on_err = 1;
3774 3780
3775 err = btrfs_init_acl(inode, dir); 3781 err = btrfs_init_inode_security(inode, dir);
3776 if (err) 3782 if (err)
3777 goto out_fail; 3783 goto out_fail;
3778 3784
@@ -4158,9 +4164,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4158 return -EINVAL; 4164 return -EINVAL;
4159} 4165}
4160 4166
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4167static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4168 __u64 start, __u64 len)
4162{ 4169{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent); 4170 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4164} 4171}
4165 4172
4166int btrfs_readpage(struct file *file, struct page *page) 4173int btrfs_readpage(struct file *file, struct page *page)
@@ -4223,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{ 4230{
4224 if (PageWriteback(page) || PageDirty(page)) 4231 if (PageWriteback(page) || PageDirty(page))
4225 return 0; 4232 return 0;
4226 return __btrfs_releasepage(page, gfp_flags); 4233 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
4227} 4234}
4228 4235
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4236static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4298,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4298 u64 page_start; 4305 u64 page_start;
4299 u64 page_end; 4306 u64 page_end;
4300 4307
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); 4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4302 if (ret) 4309 if (ret)
4303 goto out; 4310 goto out;
4304 4311
@@ -4311,6 +4318,7 @@ again:
4311 4318
4312 if ((page->mapping != inode->i_mapping) || 4319 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) { 4320 (page_start >= size)) {
4321 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4314 /* page got truncated out from underneath us */ 4322 /* page got truncated out from underneath us */
4315 goto out_unlock; 4323 goto out_unlock;
4316 } 4324 }
@@ -4593,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 4601 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV; 4602 return -EXDEV;
4595 4603
4596 ret = btrfs_check_free_space(root, 1, 0); 4604 ret = btrfs_check_metadata_free_space(root);
4597 if (ret) 4605 if (ret)
4598 goto out_unlock; 4606 goto out_unlock;
4599 4607
@@ -4711,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 4719 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG; 4720 return -ENAMETOOLONG;
4713 4721
4714 err = btrfs_check_free_space(root, 1, 0); 4722 err = btrfs_check_metadata_free_space(root);
4715 if (err) 4723 if (err)
4716 goto out_fail; 4724 goto out_fail;
4717 4725
@@ -4733,7 +4741,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4733 if (IS_ERR(inode)) 4741 if (IS_ERR(inode))
4734 goto out_unlock; 4742 goto out_unlock;
4735 4743
4736 err = btrfs_init_acl(inode, dir); 4744 err = btrfs_init_inode_security(inode, dir);
4737 if (err) { 4745 if (err) {
4738 drop_inode = 1; 4746 drop_inode = 1;
4739 goto out_unlock; 4747 goto out_unlock;
@@ -4987,13 +4995,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
4987 .clear_bit_hook = btrfs_clear_bit_hook, 4995 .clear_bit_hook = btrfs_clear_bit_hook,
4988}; 4996};
4989 4997
4998/*
4999 * btrfs doesn't support the bmap operation because swapfiles
5000 * use bmap to make a mapping of extents in the file. They assume
5001 * these extents won't change over the life of the file and they
5002 * use the bmap result to do IO directly to the drive.
5003 *
5004 * the btrfs bmap call would return logical addresses that aren't
5005 * suitable for IO and they also will change frequently as COW
5006 * operations happen. So, swapfile + btrfs == corruption.
5007 *
5008 * For now we're avoiding this by dropping bmap.
5009 */
4990static struct address_space_operations btrfs_aops = { 5010static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage, 5011 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage, 5012 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages, 5013 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages, 5014 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page, 5015 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO, 5016 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage, 5017 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage, 5018 .releasepage = btrfs_releasepage,
@@ -5017,6 +5036,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5017 .removexattr = btrfs_removexattr, 5036 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission, 5037 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate, 5038 .fallocate = btrfs_fallocate,
5039 .fiemap = btrfs_fiemap,
5020}; 5040};
5021static struct inode_operations btrfs_special_inode_operations = { 5041static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr, 5042 .getattr = btrfs_getattr,
@@ -5032,4 +5052,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5032 .follow_link = page_follow_link_light, 5052 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link, 5053 .put_link = page_put_link,
5034 .permission = btrfs_permission, 5054 .permission = btrfs_permission,
5055 .setxattr = btrfs_setxattr,
5056 .getxattr = btrfs_getxattr,
5057 .listxattr = btrfs_listxattr,
5058 .removexattr = btrfs_removexattr,
5035}; 5059};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb5..bca729fc80c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
38#include <linux/compat.h> 38#include <linux/compat.h>
39#include <linux/bit_spinlock.h> 39#include <linux/bit_spinlock.h>
40#include <linux/security.h> 40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h> 41#include <linux/xattr.h>
43#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
44#include "compat.h" 43#include "compat.h"
@@ -71,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
71 u64 index = 0; 70 u64 index = 0;
72 unsigned long nr = 1; 71 unsigned long nr = 1;
73 72
74 ret = btrfs_check_free_space(root, 1, 0); 73 ret = btrfs_check_metadata_free_space(root);
75 if (ret) 74 if (ret)
76 goto fail_commit; 75 goto fail_commit;
77 76
@@ -204,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
204 if (!root->ref_cows) 203 if (!root->ref_cows)
205 return -EINVAL; 204 return -EINVAL;
206 205
207 ret = btrfs_check_free_space(root, 1, 0); 206 ret = btrfs_check_metadata_free_space(root);
208 if (ret) 207 if (ret)
209 goto fail_unlock; 208 goto fail_unlock;
210 209
@@ -375,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
375 unsigned long i; 374 unsigned long i;
376 int ret; 375 int ret;
377 376
378 ret = btrfs_check_free_space(root, inode->i_size, 0); 377 ret = btrfs_check_data_free_space(root, inode, inode->i_size);
379 if (ret) 378 if (ret)
380 return -ENOSPC; 379 return -ENOSPC;
381 380
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..47b0a88c12a2 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,64 +25,203 @@
25#include "extent_io.h" 25#include "extent_io.h"
26#include "locking.h" 26#include "locking.h"
27 27
28static inline void spin_nested(struct extent_buffer *eb)
29{
30 spin_lock(&eb->lock);
31}
32
28/* 33/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 34 * Setting a lock to blocking will drop the spinlock and set the
30 * and the spin is not tuned very extensively. The spinning does make a big 35 * flag that forces other procs who want the lock to wait. After
31 * difference in almost every workload, but spinning for the right amount of 36 * this you can safely schedule with the lock held.
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 37 */
38void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{
40 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
41 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
42 spin_unlock(&eb->lock);
43 }
44 /* exit with the spin lock released and the bit set */
45}
37 46
38int btrfs_tree_lock(struct extent_buffer *eb) 47/*
48 * clearing the blocking flag will take the spinlock again.
49 * After this you can't safely schedule
50 */
51void btrfs_clear_lock_blocking(struct extent_buffer *eb)
39{ 52{
40 int i; 53 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
54 spin_nested(eb);
55 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
56 smp_mb__after_clear_bit();
57 }
58 /* exit with the spin lock held */
59}
41 60
42 if (mutex_trylock(&eb->mutex)) 61/*
43 return 0; 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 *
67 * So, we're still stuck with this crummy spin on the blocking bit,
68 * at least until the most common causes of the short blocks
69 * can be dealt with.
70 */
71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{
73 int i;
44 for (i = 0; i < 512; i++) { 74 for (i = 0; i < 512; i++) {
45 cpu_relax(); 75 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 77 return 1;
78 if (need_resched())
79 break;
48 } 80 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 81 return 0;
52} 82}
53 83
54int btrfs_try_tree_lock(struct extent_buffer *eb) 84/*
85 * This is somewhat different from trylock. It will take the
86 * spinlock but if it finds the lock is set to blocking, it will
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */
94int btrfs_try_spin_lock(struct extent_buffer *eb)
55{ 95{
56 return mutex_trylock(&eb->mutex); 96 int i;
97
98 spin_nested(eb);
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
100 return 1;
101 spin_unlock(&eb->lock);
102
103 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) {
105 if (!btrfs_spin_on_block(eb))
106 break;
107
108 spin_nested(eb);
109 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
110 return 1;
111 spin_unlock(&eb->lock);
112 }
113 return 0;
57} 114}
58 115
59int btrfs_tree_unlock(struct extent_buffer *eb) 116/*
117 * the autoremove wake function will return 0 if it tried to wake up
118 * a process that was already awake, which means that process won't
119 * count as an exclusive wakeup. The waitq code will continue waking
120 * procs until it finds one that was actually sleeping.
121 *
122 * For btrfs, this isn't quite what we want. We want a single proc
123 * to be notified that the lock is ready for taking. If that proc
124 * already happens to be awake, great, it will loop around and try for
125 * the lock.
126 *
127 * So, btrfs_wake_function always returns 1, even when the proc that we
128 * tried to wake up was already awake.
129 */
130static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
131 int sync, void *key)
60{ 132{
61 mutex_unlock(&eb->mutex); 133 autoremove_wake_function(wait, mode, sync, key);
62 return 0; 134 return 1;
63} 135}
64 136
65int btrfs_tree_locked(struct extent_buffer *eb) 137/*
138 * returns with the extent buffer spinlocked.
139 *
140 * This will spin and/or wait as required to take the lock, and then
141 * return with the spinlock held.
142 *
143 * After this call, scheduling is not safe without first calling
144 * btrfs_set_lock_blocking()
145 */
146int btrfs_tree_lock(struct extent_buffer *eb)
66{ 147{
67 return mutex_is_locked(&eb->mutex); 148 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function;
150
151 while(1) {
152 spin_nested(eb);
153
154 /* nobody is blocking, exit with the spinlock held */
155 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
156 return 0;
157
158 /*
159 * we have the spinlock, but the real owner is blocking.
160 * wait for them
161 */
162 spin_unlock(&eb->lock);
163
164 /*
165 * spin for a bit, and if the blocking flag goes away,
166 * loop around
167 */
168 if (btrfs_spin_on_block(eb))
169 continue;
170
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE);
173
174 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
175 schedule();
176
177 finish_wait(&eb->lock_wq, &wait);
178 }
179 return 0;
68} 180}
69 181
70/* 182/*
71 * btrfs_search_slot uses this to decide if it should drop its locks 183 * Very quick trylock, this does not spin or schedule. It returns
72 * before doing something expensive like allocating free blocks for cow. 184 * 1 with the spinlock held if it was able to take the lock, or it
185 * returns zero if it was unable to take the lock.
186 *
187 * After this call, scheduling is not safe without first calling
188 * btrfs_set_lock_blocking()
73 */ 189 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 190int btrfs_try_tree_lock(struct extent_buffer *eb)
75{ 191{
76 int i; 192 if (spin_trylock(&eb->lock)) {
77 struct extent_buffer *eb; 193 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 194 /*
79 eb = path->nodes[i]; 195 * we've got the spinlock, but the real owner is
80 if (!eb) 196 * blocking. Drop the spinlock and return failure
81 break; 197 */
82 smp_mb(); 198 spin_unlock(&eb->lock);
83 if (!list_empty(&eb->mutex.wait_list)) 199 return 0;
84 return 1; 200 }
201 return 1;
85 } 202 }
203 /* someone else has the spinlock, give up */
86 return 0; 204 return 0;
87} 205}
88 206
207int btrfs_tree_unlock(struct extent_buffer *eb)
208{
209 /*
210 * if we were a blocking owner, we don't have the spinlock held
211 * just clear the bit and look for waiters
212 */
213 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
214 smp_mb__after_clear_bit();
215 else
216 spin_unlock(&eb->lock);
217
218 if (waitqueue_active(&eb->lock_wq))
219 wake_up(&eb->lock_wq);
220 return 0;
221}
222
223void btrfs_assert_tree_locked(struct extent_buffer *eb)
224{
225 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
226 assert_spin_locked(&eb->lock);
227}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..6c4ce457168c 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,7 +21,11 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24
25int btrfs_try_tree_lock(struct extent_buffer *eb); 25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 26int btrfs_try_spin_lock(struct extent_buffer *eb);
27
28void btrfs_set_lock_blocking(struct extent_buffer *eb);
29void btrfs_clear_lock_blocking(struct extent_buffer *eb);
30void btrfs_assert_tree_locked(struct extent_buffer *eb);
27#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a20940170274..77c2411a5f0f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
613 struct btrfs_sector_sum *sector_sums; 613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered; 614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors; 616 unsigned long num_sectors;
618 unsigned long i; 617 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
624 return 1; 623 return 1;
625 624
626 mutex_lock(&tree->mutex); 625 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) { 626 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) { 627 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize; 628 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums; 629 sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9eab..d0cc62bccb94 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "ref-cache.h" 22#include "ref-cache.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c59..bc283ad2db73 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared); 74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif 76#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db9fb3bc1e33..19a4daf03ccb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
37#include <linux/ctype.h> 37#include <linux/ctype.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h> 40#include <linux/magic.h>
42#include "compat.h" 41#include "compat.h"
43#include "ctree.h" 42#include "ctree.h"
@@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
380 btrfs_start_delalloc_inodes(root); 379 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0); 380 btrfs_wait_ordered_extents(root, 0);
382 381
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1); 382 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root); 383 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0; 384 sb->s_dirt = 0;
@@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
512 struct btrfs_root *root = btrfs_sb(sb); 510 struct btrfs_root *root = btrfs_sb(sb);
513 int ret; 511 int ret;
514 512
513 ret = btrfs_parse_options(root, data);
514 if (ret)
515 return -EINVAL;
516
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0; 518 return 0;
517 519
@@ -583,17 +585,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
583 struct btrfs_ioctl_vol_args *vol; 585 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices; 586 struct btrfs_fs_devices *fs_devices;
585 int ret = -ENOTTY; 587 int ret = -ENOTTY;
586 int len;
587 588
588 if (!capable(CAP_SYS_ADMIN)) 589 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM; 590 return -EPERM;
590 591
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 592 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
593 if (!vol)
594 return -ENOMEM;
595
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 596 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT; 597 ret = -EFAULT;
594 goto out; 598 goto out;
595 } 599 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 600
598 switch (cmd) { 601 switch (cmd) {
599 case BTRFS_IOC_SCAN_DEV: 602 case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f9443340..4112d53d4f4d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
688 num_bytes -= btrfs_root_used(&dirty->root->root_item); 688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item); 689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) { 690 if (num_bytes) {
691 mutex_lock(&root->fs_info->trans_mutex);
691 btrfs_record_root_in_trans(root); 692 btrfs_record_root_in_trans(root);
693 mutex_unlock(&root->fs_info->trans_mutex);
692 btrfs_set_root_used(&root->root_item, 694 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes); 695 bytes_used - num_bytes);
694 } 696 }
@@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
852{ 854{
853 struct btrfs_pending_snapshot *pending; 855 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots; 856 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret; 857 int ret;
857 858
858 list_for_each(cur, head) { 859 list_for_each_entry(pending, head, list) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending); 860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret); 861 BUG_ON(ret);
862 } 862 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..9c462fbd60fa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1704 1615
1705 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1707 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1709 1621
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1750 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1751 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1753 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1755 1668
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1807 1720
1808 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1810 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1812 1726
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1879 1793
1880 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1882 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1884 1799
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1817 }
1903 } 1818 }
1904 btrfs_free_path(path); 1819 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1820 return ret;
1908} 1821}
1909 1822
1910static int wait_log_commit(struct btrfs_root *log) 1823/*
1824 * helper function to update the item for a given subvolume's log root
1825 * in the tree of log roots
1826 */
1827static int update_log_root(struct btrfs_trans_handle *trans,
1828 struct btrfs_root *log)
1829{
1830 int ret;
1831
1832 if (log->log_transid == 1) {
1833 /* insert root item on the first sync */
1834 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1835 &log->root_key, &log->root_item);
1836 } else {
1837 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1838 &log->root_key, &log->root_item);
1839 }
1840 return ret;
1841}
1842
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1844{
1912 DEFINE_WAIT(wait); 1845 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1846 int index = transid % 2;
1914 1847
1848 /*
1849 * we only allow two pending log transactions at a time,
1850 * so we know that if ours is more than 2 older than the
1851 * current transaction, we're done
1852 */
1915 do { 1853 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1854 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1855 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1856 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1857 if (root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index]))
1920 schedule(); 1859 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1860 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1861 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1862 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1863 atomic_read(&root->log_commit[index]));
1864 return 0;
1865}
1866
1867static int wait_for_writer(struct btrfs_root *root)
1868{
1869 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers))
1875 schedule();
1876 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait);
1878 }
1925 return 0; 1879 return 0;
1926} 1880}
1927 1881
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1887int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1888 struct btrfs_root *root)
1935{ 1889{
1890 int index1;
1891 int index2;
1936 int ret; 1892 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1893 struct btrfs_root *log = root->log_root;
1894 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1895
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1896 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1897 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1898 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1899 wait_log_commit(root, root->log_transid);
1900 mutex_unlock(&root->log_mutex);
1901 return 0;
1944 } 1902 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1903 atomic_set(&root->log_commit[index1], 1);
1904
1905 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1);
1946 1908
1947 while (1) { 1909 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1910 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1911 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1912 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1913 mutex_lock(&root->log_mutex);
1952 1914 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1915 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1916 break;
1965 } 1917 }
1966 1918
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1920 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1921
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1922 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1923 btrfs_set_root_generation(&log->root_item, trans->transid);
1924 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1925
1926 root->log_batch = 0;
1927 root->log_transid++;
1928 log->log_transid = root->log_transid;
1929 smp_mb();
1930 /*
1931 * log tree has been flushed to disk, new modifications of
1932 * the log will be written to new positions, so it's safe to
1933 * allow log writers to go in.
1934 */
1935 mutex_unlock(&root->log_mutex);
1936
1937 mutex_lock(&log_root_tree->log_mutex);
1938 log_root_tree->log_batch++;
1939 atomic_inc(&log_root_tree->log_writers);
1940 mutex_unlock(&log_root_tree->log_mutex);
1941
1942 ret = update_log_root(trans, log);
1943 BUG_ON(ret);
1944
1945 mutex_lock(&log_root_tree->log_mutex);
1946 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1947 smp_mb();
1948 if (waitqueue_active(&log_root_tree->log_writer_wait))
1949 wake_up(&log_root_tree->log_writer_wait);
1950 }
1951
1952 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out;
1957 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1);
1959
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1962
1963 wait_for_writer(log_root_tree);
1964
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1967 BUG_ON(ret);
1972 1968
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1969 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1970 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1971 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1972 btrfs_header_level(log_root_tree->node));
1973
1974 log_root_tree->log_batch = 0;
1975 log_root_tree->log_transid++;
1976 smp_mb();
1977
1978 mutex_unlock(&log_root_tree->log_mutex);
1979
1980 /*
1981 * nobody else is going to jump in and write the ctree
1982 * super here because the log_commit atomic below is protecting
1983 * us. We must be called with a transaction handle pinning
1984 * the running transaction open, so a full commit can't hop
1985 * in and cause problems either.
1986 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1988
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1989 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1990 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1992 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1993out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1994 atomic_set(&root->log_commit[index1], 0);
1995 smp_mb();
1996 if (waitqueue_active(&root->log_commit_wait[index1]))
1997 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1998 return 0;
1988} 1999}
1989 2000
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2030 start, end, GFP_NOFS);
2020 } 2031 }
2021 2032
2022 log = root->log_root; 2033 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2034 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2035 &log->root_key);
2025 BUG_ON(ret); 2036 BUG_ON(ret);
2037 }
2026 root->log_root = NULL; 2038 root->log_root = NULL;
2027 kfree(root->log_root); 2039 free_extent_buffer(log->node);
2040 kfree(log);
2028 return 0; 2041 return 0;
2029} 2042}
2030 2043
2031/* 2044/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2045 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2046 * mixed in, we have a few interesting corners:
2056 * 2047 *
@@ -2711,11 +2702,6 @@ next_slot:
2711 2702
2712 btrfs_free_path(path); 2703 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2704 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2705out:
2720 return 0; 2706 return 0;
2721} 2707}
@@ -2846,7 +2832,9 @@ again:
2846 BUG_ON(!wc.replay_dest); 2832 BUG_ON(!wc.replay_dest);
2847 2833
2848 wc.replay_dest->log_root = log; 2834 wc.replay_dest->log_root = log;
2835 mutex_lock(&fs_info->trans_mutex);
2849 btrfs_record_root_in_trans(wc.replay_dest); 2836 btrfs_record_root_in_trans(wc.replay_dest);
2837 mutex_unlock(&fs_info->trans_mutex);
2850 ret = walk_log_tree(trans, log, &wc); 2838 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret); 2839 BUG_ON(ret);
2852 2840
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1cca2b5..dd06e18e5aac 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h> 23#include <asm/div64.h>
25#include "compat.h" 24#include "compat.h"
26#include "ctree.h" 25#include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid) 103 u64 devid, u8 *uuid)
105{ 104{
106 struct btrfs_device *dev; 105 struct btrfs_device *dev;
107 struct list_head *cur;
108 106
109 list_for_each(cur, head) { 107 list_for_each_entry(dev, head, dev_list) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid && 108 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 109 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev; 110 return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
118 115
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 116static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{ 117{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices; 118 struct btrfs_fs_devices *fs_devices;
123 119
124 list_for_each(cur, &fs_uuids) { 120 list_for_each_entry(fs_devices, &fs_uuids, list) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 121 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices; 122 return fs_devices;
128 } 123 }
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
159loop: 154loop:
160 spin_lock(&device->io_lock); 155 spin_lock(&device->io_lock);
161 156
157loop_lock:
162 /* take all the bios off the list at once and process them 158 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the 159 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted 160 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
208 * is now congested. Back off and let other work structs 204 * is now congested. Back off and let other work structs
209 * run instead 205 * run instead
210 */ 206 */
211 if (pending && bdi_write_congested(bdi) && 207 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
212 fs_info->fs_devices->open_devices > 1) { 208 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head; 209 struct bio *old_head;
214 210
@@ -220,7 +216,8 @@ loop:
220 tail->bi_next = old_head; 216 tail->bi_next = old_head;
221 else 217 else
222 device->pending_bio_tail = tail; 218 device->pending_bio_tail = tail;
223 device->running_pending = 0; 219
220 device->running_pending = 1;
224 221
225 spin_unlock(&device->io_lock); 222 spin_unlock(&device->io_lock);
226 btrfs_requeue_work(&device->work); 223 btrfs_requeue_work(&device->work);
@@ -229,6 +226,11 @@ loop:
229 } 226 }
230 if (again) 227 if (again)
231 goto loop; 228 goto loop;
229
230 spin_lock(&device->io_lock);
231 if (device->pending_bios)
232 goto loop_lock;
233 spin_unlock(&device->io_lock);
232done: 234done:
233 return 0; 235 return 0;
234} 236}
@@ -345,14 +347,11 @@ error:
345 347
346int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 348int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
347{ 349{
348 struct list_head *tmp; 350 struct btrfs_device *device, *next;
349 struct list_head *cur;
350 struct btrfs_device *device;
351 351
352 mutex_lock(&uuid_mutex); 352 mutex_lock(&uuid_mutex);
353again: 353again:
354 list_for_each_safe(cur, tmp, &fs_devices->devices) { 354 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
355 device = list_entry(cur, struct btrfs_device, dev_list);
356 if (device->in_fs_metadata) 355 if (device->in_fs_metadata)
357 continue; 356 continue;
358 357
@@ -383,14 +382,12 @@ again:
383 382
384static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
385{ 384{
386 struct list_head *cur;
387 struct btrfs_device *device; 385 struct btrfs_device *device;
388 386
389 if (--fs_devices->opened > 0) 387 if (--fs_devices->opened > 0)
390 return 0; 388 return 0;
391 389
392 list_for_each(cur, &fs_devices->devices) { 390 list_for_each_entry(device, &fs_devices->devices, dev_list) {
393 device = list_entry(cur, struct btrfs_device, dev_list);
394 if (device->bdev) { 391 if (device->bdev) {
395 close_bdev_exclusive(device->bdev, device->mode); 392 close_bdev_exclusive(device->bdev, device->mode);
396 fs_devices->open_devices--; 393 fs_devices->open_devices--;
@@ -439,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
439{ 436{
440 struct block_device *bdev; 437 struct block_device *bdev;
441 struct list_head *head = &fs_devices->devices; 438 struct list_head *head = &fs_devices->devices;
442 struct list_head *cur;
443 struct btrfs_device *device; 439 struct btrfs_device *device;
444 struct block_device *latest_bdev = NULL; 440 struct block_device *latest_bdev = NULL;
445 struct buffer_head *bh; 441 struct buffer_head *bh;
@@ -450,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
450 int seeding = 1; 446 int seeding = 1;
451 int ret = 0; 447 int ret = 0;
452 448
453 list_for_each(cur, head) { 449 list_for_each_entry(device, head, dev_list) {
454 device = list_entry(cur, struct btrfs_device, dev_list);
455 if (device->bdev) 450 if (device->bdev)
456 continue; 451 continue;
457 if (!device->name) 452 if (!device->name)
@@ -578,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
578 *(unsigned long long *)disk_super->fsid, 573 *(unsigned long long *)disk_super->fsid,
579 *(unsigned long long *)(disk_super->fsid + 8)); 574 *(unsigned long long *)(disk_super->fsid + 8));
580 } 575 }
581 printk(KERN_INFO "devid %llu transid %llu %s\n", 576 printk(KERN_CONT "devid %llu transid %llu %s\n",
582 (unsigned long long)devid, (unsigned long long)transid, path); 577 (unsigned long long)devid, (unsigned long long)transid, path);
583 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 578 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
584 579
@@ -1017,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1017 } 1012 }
1018 1013
1019 if (strcmp(device_path, "missing") == 0) { 1014 if (strcmp(device_path, "missing") == 0) {
1020 struct list_head *cur;
1021 struct list_head *devices; 1015 struct list_head *devices;
1022 struct btrfs_device *tmp; 1016 struct btrfs_device *tmp;
1023 1017
1024 device = NULL; 1018 device = NULL;
1025 devices = &root->fs_info->fs_devices->devices; 1019 devices = &root->fs_info->fs_devices->devices;
1026 list_for_each(cur, devices) { 1020 list_for_each_entry(tmp, devices, dev_list) {
1027 tmp = list_entry(cur, struct btrfs_device, dev_list);
1028 if (tmp->in_fs_metadata && !tmp->bdev) { 1021 if (tmp->in_fs_metadata && !tmp->bdev) {
1029 device = tmp; 1022 device = tmp;
1030 break; 1023 break;
@@ -1280,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1280 struct btrfs_trans_handle *trans; 1273 struct btrfs_trans_handle *trans;
1281 struct btrfs_device *device; 1274 struct btrfs_device *device;
1282 struct block_device *bdev; 1275 struct block_device *bdev;
1283 struct list_head *cur;
1284 struct list_head *devices; 1276 struct list_head *devices;
1285 struct super_block *sb = root->fs_info->sb; 1277 struct super_block *sb = root->fs_info->sb;
1286 u64 total_bytes; 1278 u64 total_bytes;
@@ -1304,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1304 mutex_lock(&root->fs_info->volume_mutex); 1296 mutex_lock(&root->fs_info->volume_mutex);
1305 1297
1306 devices = &root->fs_info->fs_devices->devices; 1298 devices = &root->fs_info->fs_devices->devices;
1307 list_for_each(cur, devices) { 1299 list_for_each_entry(device, devices, dev_list) {
1308 device = list_entry(cur, struct btrfs_device, dev_list);
1309 if (device->bdev == bdev) { 1300 if (device->bdev == bdev) {
1310 ret = -EEXIST; 1301 ret = -EEXIST;
1311 goto error; 1302 goto error;
@@ -1383,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1383 ret = btrfs_add_device(trans, root, device); 1374 ret = btrfs_add_device(trans, root, device);
1384 } 1375 }
1385 1376
1377 /*
1378 * we've got more storage, clear any full flags on the space
1379 * infos
1380 */
1381 btrfs_clear_space_info_full(root->fs_info);
1382
1386 unlock_chunks(root); 1383 unlock_chunks(root);
1387 btrfs_commit_transaction(trans, root); 1384 btrfs_commit_transaction(trans, root);
1388 1385
@@ -1468,6 +1465,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1468 device->fs_devices->total_rw_bytes += diff; 1465 device->fs_devices->total_rw_bytes += diff;
1469 1466
1470 device->total_bytes = new_size; 1467 device->total_bytes = new_size;
1468 btrfs_clear_space_info_full(device->dev_root->fs_info);
1469
1471 return btrfs_update_device(trans, device); 1470 return btrfs_update_device(trans, device);
1472} 1471}
1473 1472
@@ -1704,7 +1703,6 @@ static u64 div_factor(u64 num, int factor)
1704int btrfs_balance(struct btrfs_root *dev_root) 1703int btrfs_balance(struct btrfs_root *dev_root)
1705{ 1704{
1706 int ret; 1705 int ret;
1707 struct list_head *cur;
1708 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1706 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1709 struct btrfs_device *device; 1707 struct btrfs_device *device;
1710 u64 old_size; 1708 u64 old_size;
@@ -1723,8 +1721,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1723 dev_root = dev_root->fs_info->dev_root; 1721 dev_root = dev_root->fs_info->dev_root;
1724 1722
1725 /* step one make some room on all the devices */ 1723 /* step one make some room on all the devices */
1726 list_for_each(cur, devices) { 1724 list_for_each_entry(device, devices, dev_list) {
1727 device = list_entry(cur, struct btrfs_device, dev_list);
1728 old_size = device->total_bytes; 1725 old_size = device->total_bytes;
1729 size_to_free = div_factor(old_size, 1); 1726 size_to_free = div_factor(old_size, 1);
1730 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1727 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
@@ -2905,10 +2902,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2905 free_extent_map(em); 2902 free_extent_map(em);
2906 } 2903 }
2907 2904
2908 map = kzalloc(sizeof(*map), GFP_NOFS);
2909 if (!map)
2910 return -ENOMEM;
2911
2912 em = alloc_extent_map(GFP_NOFS); 2905 em = alloc_extent_map(GFP_NOFS);
2913 if (!em) 2906 if (!em)
2914 return -ENOMEM; 2907 return -ENOMEM;
@@ -3117,6 +3110,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3117 if (!sb) 3110 if (!sb)
3118 return -ENOMEM; 3111 return -ENOMEM;
3119 btrfs_set_buffer_uptodate(sb); 3112 btrfs_set_buffer_uptodate(sb);
3113 btrfs_set_buffer_lockdep_class(sb, 0);
3114
3120 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3115 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3121 array_size = btrfs_super_sys_array_size(super_copy); 3116 array_size = btrfs_super_sys_array_size(super_copy);
3122 3117
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e270894..a9d3bf4d2689 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "btrfs_inode.h" 26#include "btrfs_inode.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
45 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0); 48 strlen(name), 0);
48 if (!di || IS_ERR(di)) { 49 if (!di) {
49 ret = -ENODATA; 50 ret = -ENODATA;
50 goto out; 51 goto out;
52 } else if (IS_ERR(di)) {
53 ret = PTR_ERR(di);
54 goto out;
51 } 55 }
52 56
53 leaf = path->nodes[0]; 57 leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
62 ret = -ERANGE; 66 ret = -ERANGE;
63 goto out; 67 goto out;
64 } 68 }
69
70 /*
71 * The way things are packed into the leaf is like this
72 * |struct btrfs_dir_item|name|data|
73 * where name is the xattr name, so security.foo, and data is the
74 * content of the xattr. data_ptr points to the location in memory
75 * where the data starts in the in memory leaf
76 */
65 data_ptr = (unsigned long)((char *)(di + 1) + 77 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di)); 78 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr, 79 read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
86 if (!path) 98 if (!path)
87 return -ENOMEM; 99 return -ENOMEM;
88 100
89 trans = btrfs_start_transaction(root, 1); 101 trans = btrfs_join_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode); 102 btrfs_set_trans_block_group(trans, inode);
91 103
92 /* first lets see if we already have this xattr */ 104 /* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0) 189 if (ret < 0)
178 goto err; 190 goto err;
179 ret = 0;
180 advance = 0; 191 advance = 0;
181 while (1) { 192 while (1) {
182 leaf = path->nodes[0]; 193 leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
320 return -EOPNOTSUPP; 331 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322} 333}
334
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
336{
337 int err;
338 size_t len;
339 void *value;
340 char *suffix;
341 char *name;
342
343 err = security_inode_init_security(inode, dir, &suffix, &value, &len);
344 if (err) {
345 if (err == -EOPNOTSUPP)
346 return 0;
347 return err;
348 }
349
350 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
351 GFP_NOFS);
352 if (!name) {
353 err = -ENOMEM;
354 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0);
358 kfree(name);
359 }
360
361 kfree(suffix);
362 kfree(value);
363 return err;
364}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68d..c71e9c3cf3f7 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
40
39#endif /* __XATTR__ */ 41#endif /* __XATTR__ */