aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig13
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c315
-rw-r--r--fs/btrfs/ctree.h37
-rw-r--r--fs/btrfs/disk-io.c166
-rw-r--r--fs/btrfs/disk-io.h12
-rw-r--r--fs/btrfs/extent-tree.c519
-rw-r--r--fs/btrfs/extent_io.c134
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file.c13
-rw-r--r--fs/btrfs/inode-map.c1
-rw-r--r--fs/btrfs/inode.c88
-rw-r--r--fs/btrfs/ioctl.c1
-rw-r--r--fs/btrfs/locking.c207
-rw-r--r--fs/btrfs/locking.h6
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
-rw-r--r--fs/btrfs/super.c11
-rw-r--r--fs/btrfs/transaction.c6
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c356
-rw-r--r--fs/btrfs/volumes.c55
-rw-r--r--fs/btrfs/xattr.c48
-rw-r--r--fs/btrfs/xattr.h2
27 files changed, 1544 insertions, 534 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf999ea1b..7bb3c020e570 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
16 module will be called btrfs. 16 module will be called btrfs.
17 17
18 If unsure, say N. 18 If unsure, say N.
19
20config BTRFS_FS_POSIX_ACL
21 bool "Btrfs POSIX Access Control Lists"
22 depends on BTRFS_FS
23 select FS_POSIX_ACL
24 help
25 POSIX Access Control Lists (ACLs) support permissions for users and
26 groups beyond the owner/group/world scheme.
27
28 To learn more about Access Control Lists, visit the POSIX ACLs for
29 Linux website <http://acl.bestbits.at/>.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe0..c84ca1f5259a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/kthread.h> 19#include <linux/kthread.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/spinlock.h> 21#include <linux/spinlock.h>
23# include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 24#include "async-thread.h"
25 25
26#define WORK_QUEUED_BIT 0 26#define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
143 struct btrfs_work *work; 143 struct btrfs_work *work;
144 do { 144 do {
145 spin_lock_irq(&worker->lock); 145 spin_lock_irq(&worker->lock);
146again_locked:
146 while (!list_empty(&worker->pending)) { 147 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next; 148 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list); 149 work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
165 check_idle_worker(worker); 166 check_idle_worker(worker);
166 167
167 } 168 }
168 worker->working = 0;
169 if (freezing(current)) { 169 if (freezing(current)) {
170 worker->working = 0;
171 spin_unlock_irq(&worker->lock);
170 refrigerator(); 172 refrigerator();
171 } else { 173 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock); 174 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop()) 175 if (!kthread_should_stop()) {
176 cpu_relax();
177 /*
178 * we've dropped the lock, did someone else
179 * jump_in?
180 */
181 smp_mb();
182 if (!list_empty(&worker->pending))
183 continue;
184
185 /*
186 * this short schedule allows more work to
187 * come in without the queue functions
188 * needing to go through wake_up_process()
189 *
190 * worker->working is still 1, so nobody
191 * is going to try and wake us up
192 */
193 schedule_timeout(1);
194 smp_mb();
195 if (!list_empty(&worker->pending))
196 continue;
197
198 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending))
202 goto again_locked;
203
204 /*
205 * this makes sure we get a wakeup when someone
206 * adds something new to the queue
207 */
208 worker->working = 0;
209 spin_unlock_irq(&worker->lock);
210
175 schedule(); 211 schedule();
212 }
176 __set_current_state(TASK_RUNNING); 213 __set_current_state(TASK_RUNNING);
177 } 214 }
178 } while (!kthread_should_stop()); 215 } while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
350{ 387{
351 struct btrfs_worker_thread *worker = work->worker; 388 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags; 389 unsigned long flags;
390 int wake = 0;
353 391
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 392 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out; 393 goto out;
356 394
357 spin_lock_irqsave(&worker->lock, flags); 395 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending); 396 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending);
360 398
361 /* by definition we're busy, take ourselves off the idle 399 /* by definition we're busy, take ourselves off the idle
362 * list 400 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
368 &worker->workers->worker_list); 406 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags); 407 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 } 408 }
409 if (!worker->working) {
410 wake = 1;
411 worker->working = 1;
412 }
371 413
372 spin_unlock_irqrestore(&worker->lock, flags); 414 spin_unlock_irqrestore(&worker->lock, flags);
373 415 if (wake)
416 wake_up_process(worker->task);
374out: 417out:
418
375 return 0; 419 return 0;
376} 420}
377 421
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
398 } 442 }
399 443
400 spin_lock_irqsave(&worker->lock, flags); 444 spin_lock_irqsave(&worker->lock, flags);
445
446 list_add_tail(&work->list, &worker->pending);
401 atomic_inc(&worker->num_pending); 447 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker); 448 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404 449
405 /* 450 /*
406 * avoid calling into wake_up_process if this thread has already 451 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d9..ab07627084f1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h> 35#include <linux/pagevec.h>
37#include "compat.h" 36#include "compat.h"
38#include "ctree.h" 37#include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c0776816..42491d728e99 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot); 39 struct btrfs_path *path, int level, int slot);
40 40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void) 41struct btrfs_path *btrfs_alloc_path(void)
47{ 42{
48 struct btrfs_path *path; 43 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); 44 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) { 45 if (path)
51 btrfs_init_path(path);
52 path->reada = 1; 46 path->reada = 1;
53 }
54 return path; 47 return path;
55} 48}
56 49
50/*
51 * set all locked nodes in the path to blocking locks. This should
52 * be done before scheduling
53 */
54noinline void btrfs_set_path_blocking(struct btrfs_path *p)
55{
56 int i;
57 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
58 if (p->nodes[i] && p->locks[i])
59 btrfs_set_lock_blocking(p->nodes[i]);
60 }
61}
62
63/*
64 * reset all the locked nodes in the path to spinning locks.
65 *
66 * held is used to keep lockdep happy, when lockdep is enabled
67 * we set held to a blocking lock before we go around and
68 * retake all the spinlocks in the path. You can safely use NULL
69 * for held
70 */
71noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72 struct extent_buffer *held)
73{
74 int i;
75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77 /* lockdep really cares that we take all of these spinlocks
78 * in the right order. If any of the locks in the path are not
79 * currently blocking, it is going to complain. So, make really
80 * really sure by forcing the path to blocking before we clear
81 * the path blocking.
82 */
83 if (held)
84 btrfs_set_lock_blocking(held);
85 btrfs_set_path_blocking(p);
86#endif
87
88 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 if (p->nodes[i] && p->locks[i])
90 btrfs_clear_lock_blocking(p->nodes[i]);
91 }
92
93#ifdef CONFIG_DEBUG_LOCK_ALLOC
94 if (held)
95 btrfs_clear_lock_blocking(held);
96#endif
97}
98
57/* this also releases the path */ 99/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 100void btrfs_free_path(struct btrfs_path *p)
59{ 101{
@@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
261 trans->transid, level, &ins); 303 trans->transid, level, &ins);
262 BUG_ON(ret); 304 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest, 305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len); 306 buf->len, level);
265 } else { 307 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len, 308 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start, 309 parent_start,
@@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 314 if (IS_ERR(cow))
273 return PTR_ERR(cow); 315 return PTR_ERR(cow);
274 316
317 /* cow is set to blocking by btrfs_init_new_buffer */
318
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 319 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 320 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 321 btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
388 WARN_ON(1); 432 WARN_ON(1);
389 } 433 }
390 434
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid && 435 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid && 436 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf; 438 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest); 439 WARN_ON(prealloc_dest);
398 return 0; 440 return 0;
399 } 441 }
400 spin_unlock(&root->fs_info->hash_lock); 442
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 443 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
444
445 if (parent)
446 btrfs_set_lock_blocking(parent);
447 btrfs_set_lock_blocking(buf);
448
402 ret = __btrfs_cow_block(trans, root, buf, parent, 449 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0, 450 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest); 451 prealloc_dest);
@@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
504 if (parent_nritems == 1) 551 if (parent_nritems == 1)
505 return 0; 552 return 0;
506 553
554 btrfs_set_lock_blocking(parent);
555
507 for (i = start_slot; i < end_slot; i++) { 556 for (i = start_slot; i < end_slot; i++) {
508 int close = 1; 557 int close = 1;
509 558
@@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
564 search_start = last_block; 613 search_start = last_block;
565 614
566 btrfs_tree_lock(cur); 615 btrfs_tree_lock(cur);
616 btrfs_set_lock_blocking(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i, 617 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start, 618 &cur, search_start,
569 min(16 * blocksize, 619 min(16 * blocksize,
@@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 return 0; 912 return 0;
863 913
864 mid = path->nodes[level]; 914 mid = path->nodes[level];
915
865 WARN_ON(!path->locks[level]); 916 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 917 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 918
@@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
883 934
884 /* promote the child to a root */ 935 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0); 936 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child); 937 BUG_ON(!child);
938 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret); 941 BUG_ON(ret);
890 942
@@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
900 952
901 add_root_to_dirty_list(root); 953 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child); 954 btrfs_tree_unlock(child);
955
903 path->locks[level] = 0; 956 path->locks[level] = 0;
904 path->nodes[level] = NULL; 957 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid); 958 clean_tree_block(trans, root, mid);
@@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
924 left = read_node_slot(root, parent, pslot - 1); 977 left = read_node_slot(root, parent, pslot - 1);
925 if (left) { 978 if (left) {
926 btrfs_tree_lock(left); 979 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left);
927 wret = btrfs_cow_block(trans, root, left, 981 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0); 982 parent, pslot - 1, &left, 0);
929 if (wret) { 983 if (wret) {
@@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
934 right = read_node_slot(root, parent, pslot + 1); 988 right = read_node_slot(root, parent, pslot + 1);
935 if (right) { 989 if (right) {
936 btrfs_tree_lock(right); 990 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right);
937 wret = btrfs_cow_block(trans, root, right, 992 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0); 993 parent, pslot + 1, &right, 0);
939 if (wret) { 994 if (wret) {
@@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 u32 left_nr; 1164 u32 left_nr;
1110 1165
1111 btrfs_tree_lock(left); 1166 btrfs_tree_lock(left);
1167 btrfs_set_lock_blocking(left);
1168
1112 left_nr = btrfs_header_nritems(left); 1169 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1170 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1; 1171 wret = 1;
@@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1155 */ 1212 */
1156 if (right) { 1213 if (right) {
1157 u32 right_nr; 1214 u32 right_nr;
1215
1158 btrfs_tree_lock(right); 1216 btrfs_tree_lock(right);
1217 btrfs_set_lock_blocking(right);
1218
1159 right_nr = btrfs_header_nritems(right); 1219 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1220 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1; 1221 wret = 1;
@@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1210 struct btrfs_disk_key disk_key; 1270 struct btrfs_disk_key disk_key;
1211 u32 nritems; 1271 u32 nritems;
1212 u64 search; 1272 u64 search;
1213 u64 lowest_read; 1273 u64 target;
1214 u64 highest_read;
1215 u64 nread = 0; 1274 u64 nread = 0;
1216 int direction = path->reada; 1275 int direction = path->reada;
1217 struct extent_buffer *eb; 1276 struct extent_buffer *eb;
@@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1235 return; 1294 return;
1236 } 1295 }
1237 1296
1238 highest_read = search; 1297 target = search;
1239 lowest_read = search;
1240 1298
1241 nritems = btrfs_header_nritems(node); 1299 nritems = btrfs_header_nritems(node);
1242 nr = slot; 1300 nr = slot;
@@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
1256 break; 1314 break;
1257 } 1315 }
1258 search = btrfs_node_blockptr(node, nr); 1316 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) || 1317 if ((search <= target && target - search <= 65536) ||
1260 (search < lowest_read && lowest_read - search <= 16384) || 1318 (search > target && search - target <= 65536)) {
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize, 1319 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr)); 1320 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize; 1321 nread += blocksize;
1265 } 1322 }
1266 nscan++; 1323 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) 1324 if ((nread > 65536 || nscan > 32))
1268 break; 1325 break;
1326 }
1327}
1269 1328
1270 if (nread > (256 * 1024) || nscan > 128) 1329/*
1271 break; 1330 * returns -EAGAIN if it had to drop the path, or zero if everything was in
1331 * cache
1332 */
1333static noinline int reada_for_balance(struct btrfs_root *root,
1334 struct btrfs_path *path, int level)
1335{
1336 int slot;
1337 int nritems;
1338 struct extent_buffer *parent;
1339 struct extent_buffer *eb;
1340 u64 gen;
1341 u64 block1 = 0;
1342 u64 block2 = 0;
1343 int ret = 0;
1344 int blocksize;
1345
1346 parent = path->nodes[level - 1];
1347 if (!parent)
1348 return 0;
1272 1349
1273 if (search < lowest_read) 1350 nritems = btrfs_header_nritems(parent);
1274 lowest_read = search; 1351 slot = path->slots[level];
1275 if (search > highest_read) 1352 blocksize = btrfs_level_size(root, level);
1276 highest_read = search; 1353
1354 if (slot > 0) {
1355 block1 = btrfs_node_blockptr(parent, slot - 1);
1356 gen = btrfs_node_ptr_generation(parent, slot - 1);
1357 eb = btrfs_find_tree_block(root, block1, blocksize);
1358 if (eb && btrfs_buffer_uptodate(eb, gen))
1359 block1 = 0;
1360 free_extent_buffer(eb);
1361 }
1362 if (slot < nritems) {
1363 block2 = btrfs_node_blockptr(parent, slot + 1);
1364 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365 eb = btrfs_find_tree_block(root, block2, blocksize);
1366 if (eb && btrfs_buffer_uptodate(eb, gen))
1367 block2 = 0;
1368 free_extent_buffer(eb);
1277 } 1369 }
1370 if (block1 || block2) {
1371 ret = -EAGAIN;
1372 btrfs_release_path(root, path);
1373 if (block1)
1374 readahead_tree_block(root, block1, blocksize, 0);
1375 if (block2)
1376 readahead_tree_block(root, block2, blocksize, 0);
1377
1378 if (block1) {
1379 eb = read_tree_block(root, block1, blocksize, 0);
1380 free_extent_buffer(eb);
1381 }
1382 if (block1) {
1383 eb = read_tree_block(root, block2, blocksize, 0);
1384 free_extent_buffer(eb);
1385 }
1386 }
1387 return ret;
1278} 1388}
1279 1389
1390
1280/* 1391/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers 1392 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because 1393 * in the tree. The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1328} 1439}
1329 1440
1330/* 1441/*
1442 * This releases any locks held in the path starting at level and
1443 * going all the way up to the root.
1444 *
1445 * btrfs_search_slot will keep the lock held on higher nodes in a few
1446 * corner cases, such as COW of the block at slot zero in the node. This
1447 * ignores those rules, and it should only be called when there are no
1448 * more updates to be done higher up in the tree.
1449 */
1450noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1451{
1452 int i;
1453
1454 if (path->keep_locks || path->lowest_level)
1455 return;
1456
1457 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1458 if (!path->nodes[i])
1459 continue;
1460 if (!path->locks[i])
1461 continue;
1462 btrfs_tree_unlock(path->nodes[i]);
1463 path->locks[i] = 0;
1464 }
1465}
1466
1467/*
1331 * look for key in the tree. path is filled in with nodes along the way 1468 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf 1469 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0) 1470 * level of the path (level 0)
@@ -1387,32 +1524,30 @@ again:
1387 int wret; 1524 int wret;
1388 1525
1389 /* is a cow on this block not required */ 1526 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid && 1527 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid && 1528 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done; 1530 goto cow_done;
1396 } 1531 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398 1532
1399 /* ok, we have to cow, is our old prealloc the right 1533 /* ok, we have to cow, is our old prealloc the right
1400 * size? 1534 * size?
1401 */ 1535 */
1402 if (prealloc_block.objectid && 1536 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) { 1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1404 btrfs_free_reserved_extent(root, 1539 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid, 1540 prealloc_block.objectid,
1406 prealloc_block.offset); 1541 prealloc_block.offset);
1407 prealloc_block.objectid = 0; 1542 prealloc_block.objectid = 0;
1543 goto again;
1408 } 1544 }
1409 1545
1410 /* 1546 /*
1411 * for higher level blocks, try not to allocate blocks 1547 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held. 1548 * with the block and the parent locks held.
1413 */ 1549 */
1414 if (level > 1 && !prealloc_block.objectid && 1550 if (level > 0 && !prealloc_block.objectid) {
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len; 1551 u32 size = b->len;
1417 u64 hint = b->start; 1552 u64 hint = b->start;
1418 1553
@@ -1425,6 +1560,8 @@ again:
1425 goto again; 1560 goto again;
1426 } 1561 }
1427 1562
1563 btrfs_set_path_blocking(p);
1564
1428 wret = btrfs_cow_block(trans, root, b, 1565 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1], 1566 p->nodes[level + 1],
1430 p->slots[level + 1], 1567 p->slots[level + 1],
@@ -1446,6 +1583,22 @@ cow_done:
1446 if (!p->skip_locking) 1583 if (!p->skip_locking)
1447 p->locks[level] = 1; 1584 p->locks[level] = 1;
1448 1585
1586 btrfs_clear_path_blocking(p, NULL);
1587
1588 /*
1589 * we have a lock on b and as long as we aren't changing
1590 * the tree, there is no way for the items in b to change.
1591 * It is safe to drop the lock on our parent before we
1592 * go through the expensive btree search on b.
1593 *
1594 * If cow is true, then we might be changing slot zero,
1595 * which may require changing the parent. So, we can't
1596 * drop the lock until after we know which slot we're
1597 * operating on.
1598 */
1599 if (!cow)
1600 btrfs_unlock_up_safe(p, level + 1);
1601
1449 ret = check_block(root, p, level); 1602 ret = check_block(root, p, level);
1450 if (ret) { 1603 if (ret) {
1451 ret = -1; 1604 ret = -1;
@@ -1453,6 +1606,7 @@ cow_done:
1453 } 1606 }
1454 1607
1455 ret = bin_search(b, key, level, &slot); 1608 ret = bin_search(b, key, level, &slot);
1609
1456 if (level != 0) { 1610 if (level != 0) {
1457 if (ret && slot > 0) 1611 if (ret && slot > 0)
1458 slot -= 1; 1612 slot -= 1;
@@ -1460,7 +1614,16 @@ cow_done:
1460 if ((p->search_for_split || ins_len > 0) && 1614 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >= 1615 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level); 1617 int sret;
1618
1619 sret = reada_for_balance(root, p, level);
1620 if (sret)
1621 goto again;
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1464 BUG_ON(sret > 0); 1627 BUG_ON(sret > 0);
1465 if (sret) { 1628 if (sret) {
1466 ret = sret; 1629 ret = sret;
@@ -1468,9 +1631,19 @@ cow_done:
1468 } 1631 }
1469 b = p->nodes[level]; 1632 b = p->nodes[level];
1470 slot = p->slots[level]; 1633 slot = p->slots[level];
1471 } else if (ins_len < 0) { 1634 } else if (ins_len < 0 &&
1472 int sret = balance_level(trans, root, p, 1635 btrfs_header_nritems(b) <
1473 level); 1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646
1474 if (sret) { 1647 if (sret) {
1475 ret = sret; 1648 ret = sret;
1476 goto done; 1649 goto done;
@@ -1504,7 +1677,7 @@ cow_done:
1504 * of the btree by dropping locks before 1677 * of the btree by dropping locks before
1505 * we read. 1678 * we read.
1506 */ 1679 */
1507 if (level > 1) { 1680 if (level > 0) {
1508 btrfs_release_path(NULL, p); 1681 btrfs_release_path(NULL, p);
1509 if (tmp) 1682 if (tmp)
1510 free_extent_buffer(tmp); 1683 free_extent_buffer(tmp);
@@ -1519,6 +1692,7 @@ cow_done:
1519 free_extent_buffer(tmp); 1692 free_extent_buffer(tmp);
1520 goto again; 1693 goto again;
1521 } else { 1694 } else {
1695 btrfs_set_path_blocking(p);
1522 if (tmp) 1696 if (tmp)
1523 free_extent_buffer(tmp); 1697 free_extent_buffer(tmp);
1524 if (should_reada) 1698 if (should_reada)
@@ -1528,14 +1702,29 @@ cow_done:
1528 b = read_node_slot(root, b, slot); 1702 b = read_node_slot(root, b, slot);
1529 } 1703 }
1530 } 1704 }
1531 if (!p->skip_locking) 1705 if (!p->skip_locking) {
1532 btrfs_tree_lock(b); 1706 int lret;
1707
1708 btrfs_clear_path_blocking(p, NULL);
1709 lret = btrfs_try_spin_lock(b);
1710
1711 if (!lret) {
1712 btrfs_set_path_blocking(p);
1713 btrfs_tree_lock(b);
1714 btrfs_clear_path_blocking(p, b);
1715 }
1716 }
1533 } else { 1717 } else {
1534 p->slots[level] = slot; 1718 p->slots[level] = slot;
1535 if (ins_len > 0 && 1719 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) { 1720 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key, 1721 int sret;
1722
1723 btrfs_set_path_blocking(p);
1724 sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0); 1725 p, ins_len, ret == 0);
1726 btrfs_clear_path_blocking(p, NULL);
1727
1539 BUG_ON(sret > 0); 1728 BUG_ON(sret > 0);
1540 if (sret) { 1729 if (sret) {
1541 ret = sret; 1730 ret = sret;
@@ -1549,12 +1738,16 @@ cow_done:
1549 } 1738 }
1550 ret = 1; 1739 ret = 1;
1551done: 1740done:
1741 /*
1742 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking
1744 */
1745 btrfs_set_path_blocking(p);
1552 if (prealloc_block.objectid) { 1746 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root, 1747 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid, 1748 prealloc_block.objectid,
1555 prealloc_block.offset); 1749 prealloc_block.offset);
1556 } 1750 }
1557
1558 return ret; 1751 return ret;
1559} 1752}
1560 1753
@@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret); 1772 BUG_ON(ret);
1580 1773
1774 btrfs_set_lock_blocking(eb);
1775
1581 parent = eb; 1776 parent = eb;
1582 while (1) { 1777 while (1) {
1583 level = btrfs_header_level(parent); 1778 level = btrfs_header_level(parent);
@@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1602 eb = read_tree_block(root, bytenr, blocksize, 1797 eb = read_tree_block(root, bytenr, blocksize,
1603 generation); 1798 generation);
1604 btrfs_tree_lock(eb); 1799 btrfs_tree_lock(eb);
1800 btrfs_set_lock_blocking(eb);
1605 } 1801 }
1606 1802
1607 /* 1803 /*
@@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1626 eb = read_tree_block(root, bytenr, blocksize, 1822 eb = read_tree_block(root, bytenr, blocksize,
1627 generation); 1823 generation);
1628 btrfs_tree_lock(eb); 1824 btrfs_tree_lock(eb);
1825 btrfs_set_lock_blocking(eb);
1629 } 1826 }
1630 1827
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1828 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2172,6 +2369,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2172 2369
2173 right = read_node_slot(root, upper, slot + 1); 2370 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right); 2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2175 free_space = btrfs_leaf_free_space(root, right); 2374 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size) 2375 if (free_space < data_size)
2177 goto out_unlock; 2376 goto out_unlock;
@@ -2367,6 +2566,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2367 2566
2368 left = read_node_slot(root, path->nodes[1], slot - 1); 2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left); 2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2370 free_space = btrfs_leaf_free_space(root, left); 2571 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) { 2572 if (free_space < data_size) {
2372 ret = 1; 2573 ret = 1;
@@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2825 path->keep_locks = 0; 3026 path->keep_locks = 0;
2826 BUG_ON(ret); 3027 BUG_ON(ret);
2827 3028
3029 /*
3030 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state
3032 */
3033 btrfs_set_path_blocking(path);
3034
2828 leaf = path->nodes[0]; 3035 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 3037
@@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3354 BUG(); 3561 BUG();
3355 } 3562 }
3356out: 3563out:
3564 btrfs_unlock_up_safe(path, 1);
3357 return ret; 3565 return ret;
3358} 3566}
3359 3567
@@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3441{ 3649{
3442 int ret; 3650 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3651 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3652 u64 parent_start = path->nodes[1]->start;
3653 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3444 3654
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3655 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret) 3656 if (ret)
3447 return ret; 3657 return ret;
3448 3658
3659 /*
3660 * btrfs_free_extent is expensive, we want to make sure we
3661 * aren't holding any locks when we call it
3662 */
3663 btrfs_unlock_up_safe(path, 0);
3664
3449 ret = btrfs_free_extent(trans, root, bytenr, 3665 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0), 3666 btrfs_level_size(root, 0),
3451 path->nodes[1]->start, 3667 parent_start, parent_owner,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1); 3668 root_gen, 0, 1);
3454 return ret; 3669 return ret;
3455} 3670}
@@ -3721,6 +3936,7 @@ find_next_key:
3721 */ 3936 */
3722 if (slot >= nritems) { 3937 if (slot >= nritems) {
3723 path->slots[level] = slot; 3938 path->slots[level] = slot;
3939 btrfs_set_path_blocking(path);
3724 sret = btrfs_find_next_key(root, path, min_key, level, 3940 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans); 3941 cache_only, min_trans);
3726 if (sret == 0) { 3942 if (sret == 0) {
@@ -3738,16 +3954,20 @@ find_next_key:
3738 unlock_up(path, level, 1); 3954 unlock_up(path, level, 1);
3739 goto out; 3955 goto out;
3740 } 3956 }
3957 btrfs_set_path_blocking(path);
3741 cur = read_node_slot(root, cur, slot); 3958 cur = read_node_slot(root, cur, slot);
3742 3959
3743 btrfs_tree_lock(cur); 3960 btrfs_tree_lock(cur);
3961
3744 path->locks[level - 1] = 1; 3962 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur; 3963 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1); 3964 unlock_up(path, level, 1);
3965 btrfs_clear_path_blocking(path, NULL);
3747 } 3966 }
3748out: 3967out:
3749 if (ret == 0) 3968 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key)); 3969 memcpy(min_key, &found_key, sizeof(found_key));
3970 btrfs_set_path_blocking(path);
3751 return ret; 3971 return ret;
3752} 3972}
3753 3973
@@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3843 if (ret < 0) 4063 if (ret < 0)
3844 return ret; 4064 return ret;
3845 4065
4066 btrfs_set_path_blocking(path);
3846 nritems = btrfs_header_nritems(path->nodes[0]); 4067 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /* 4068 /*
3848 * by releasing the path above we dropped all our locks. A balance 4069 * by releasing the path above we dropped all our locks. A balance
@@ -3873,6 +4094,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3873 free_extent_buffer(next); 4094 free_extent_buffer(next);
3874 } 4095 }
3875 4096
4097 /* the path was set to blocking above */
3876 if (level == 1 && (path->locks[1] || path->skip_locking) && 4098 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada) 4099 path->reada)
3878 reada_for_search(root, path, level, slot, 0); 4100 reada_for_search(root, path, level, slot, 0);
@@ -3881,6 +4103,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3881 if (!path->skip_locking) { 4103 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c)); 4104 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next); 4105 btrfs_tree_lock(next);
4106 btrfs_set_lock_blocking(next);
3884 } 4107 }
3885 break; 4108 break;
3886 } 4109 }
@@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3897 path->locks[level] = 1; 4120 path->locks[level] = 1;
3898 if (!level) 4121 if (!level)
3899 break; 4122 break;
4123
4124 btrfs_set_path_blocking(path);
3900 if (level == 1 && path->locks[1] && path->reada) 4125 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0); 4126 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0); 4127 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) { 4128 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4129 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next); 4130 btrfs_tree_lock(next);
4131 btrfs_set_lock_blocking(next);
3906 } 4132 }
3907 } 4133 }
3908done: 4134done:
@@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3927 4153
3928 while (1) { 4154 while (1) {
3929 if (path->slots[0] == 0) { 4155 if (path->slots[0] == 0) {
4156 btrfs_set_path_blocking(path);
3930 ret = btrfs_prev_leaf(root, path); 4157 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0) 4158 if (ret != 0)
3932 return ret; 4159 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f88113..766b31ae3186 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1) 44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45 45
46#ifdef CONFIG_LOCKDEP 46#define BTRFS_MAX_LEVEL 8
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51 47
52/* holds pointers to all of the tree roots */ 48/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL 49#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -454,17 +450,11 @@ struct btrfs_timespec {
454 __le32 nsec; 450 __le32 nsec;
455} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
456 452
457typedef enum { 453enum btrfs_compression_type {
458 BTRFS_COMPRESS_NONE = 0, 454 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1, 455 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2, 456 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type; 457};
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468 458
469struct btrfs_inode_item { 459struct btrfs_inode_item {
470 /* nfs style generation number */ 460 /* nfs style generation number */
@@ -701,9 +691,7 @@ struct btrfs_fs_info {
701 struct btrfs_transaction *running_transaction; 691 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle; 692 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait; 693 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait; 694 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707 695
708 struct btrfs_super_block super_copy; 696 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit; 697 struct btrfs_super_block super_for_commit;
@@ -711,7 +699,6 @@ struct btrfs_fs_info {
711 struct super_block *sb; 699 struct super_block *sb;
712 struct inode *btree_inode; 700 struct inode *btree_inode;
713 struct backing_dev_info bdi; 701 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex; 702 struct mutex trans_mutex;
716 struct mutex tree_log_mutex; 703 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex; 704 struct mutex transaction_kthread_mutex;
@@ -730,10 +717,6 @@ struct btrfs_fs_info {
730 atomic_t async_submit_draining; 717 atomic_t async_submit_draining;
731 atomic_t nr_async_bios; 718 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages; 719 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737 720
738 /* 721 /*
739 * this is used by the balancing code to wait for all the pending 722 * this is used by the balancing code to wait for all the pending
@@ -833,7 +816,14 @@ struct btrfs_root {
833 struct kobject root_kobj; 816 struct kobject root_kobj;
834 struct completion kobj_unregister; 817 struct completion kobj_unregister;
835 struct mutex objectid_mutex; 818 struct mutex objectid_mutex;
819
836 struct mutex log_mutex; 820 struct mutex log_mutex;
821 wait_queue_head_t log_writer_wait;
822 wait_queue_head_t log_commit_wait[2];
823 atomic_t log_writers;
824 atomic_t log_commit[2];
825 unsigned long log_transid;
826 unsigned long log_batch;
837 827
838 u64 objectid; 828 u64 objectid;
839 u64 last_trans; 829 u64 last_trans;
@@ -1721,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1721 u64 empty_size); 1711 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1712struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root, 1713 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize); 1714 u64 bytenr, u32 blocksize,
1715 int level);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1716int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root, 1717 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes, 1718 u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1840,7 +1831,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 1831void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void); 1832struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p); 1833void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p); 1834void btrfs_set_path_blocking(struct btrfs_path *p);
1835void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1836
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1837int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr); 1838 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1839int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae5..adda739a0215 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/scatterlist.h> 21#include <linux/scatterlist.h>
@@ -76,6 +75,40 @@ struct async_submit_bio {
76 struct btrfs_work work; 75 struct btrfs_work work;
77}; 76};
78 77
78/* These are used to set the lockdep class on the extent buffer locks.
79 * The class is set by the readpage_end_io_hook after the buffer has
80 * passed csum validation but before the pages are unlocked.
81 *
82 * The lockdep class is also set by btrfs_init_new_buffer on freshly
83 * allocated blocks.
84 *
85 * The class is based on the level in the tree block, which allows lockdep
86 * to know that lower nodes nest inside the locks of higher nodes.
87 *
88 * We also add a check to make sure the highest level of the tree is
89 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
91 * code needs to be updated as well.
91 */
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93# if BTRFS_MAX_LEVEL != 8
94# error
95# endif
96static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
97static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
98 /* leaf */
99 "btrfs-extent-00",
100 "btrfs-extent-01",
101 "btrfs-extent-02",
102 "btrfs-extent-03",
103 "btrfs-extent-04",
104 "btrfs-extent-05",
105 "btrfs-extent-06",
106 "btrfs-extent-07",
107 /* highest possible level */
108 "btrfs-extent-08",
109};
110#endif
111
79/* 112/*
80 * extents on the btree inode are pretty simple, there's one extent 113 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device 114 * that covers the entire device
@@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
348 return ret; 381 return ret;
349} 382}
350 383
384#ifdef CONFIG_DEBUG_LOCK_ALLOC
385void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
386{
387 lockdep_set_class_and_name(&eb->lock,
388 &btrfs_eb_class[level],
389 btrfs_eb_name[level]);
390}
391#endif
392
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 393static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state) 394 struct extent_state *state)
353{ 395{
@@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
393 } 435 }
394 found_level = btrfs_header_level(eb); 436 found_level = btrfs_header_level(eb);
395 437
438 btrfs_set_buffer_lockdep_class(eb, found_level);
439
396 ret = csum_tree_block(root, eb, 1); 440 ret = csum_tree_block(root, eb, 1);
397 if (ret) 441 if (ret)
398 ret = -EIO; 442 ret = -EIO;
@@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 844 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801 845
802 if (ret == 0) 846 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE; 847 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
804 else 848 else
805 WARN_ON(1); 849 WARN_ON(1);
806 return buf; 850 return buf;
@@ -814,6 +858,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
814 if (btrfs_header_generation(buf) == 858 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) { 859 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf)); 860 WARN_ON(!btrfs_tree_locked(buf));
861
862 /* ugh, clear_extent_buffer_dirty can be expensive */
863 btrfs_set_lock_blocking(buf);
864
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf); 866 buf);
819 } 867 }
@@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
850 spin_lock_init(&root->list_lock); 898 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex); 899 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex); 900 mutex_init(&root->log_mutex);
901 init_waitqueue_head(&root->log_writer_wait);
902 init_waitqueue_head(&root->log_commit_wait[0]);
903 init_waitqueue_head(&root->log_commit_wait[1]);
904 atomic_set(&root->log_commit[0], 0);
905 atomic_set(&root->log_commit[1], 0);
906 atomic_set(&root->log_writers, 0);
907 root->log_batch = 0;
908 root->log_transid = 0;
853 extent_io_tree_init(&root->dirty_log_pages, 909 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS); 910 fs_info->btree_inode->i_mapping, GFP_NOFS);
855 911
@@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
934 return 0; 990 return 0;
935} 991}
936 992
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 993static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info) 994 struct btrfs_fs_info *fs_info)
939{ 995{
940 struct btrfs_root *root; 996 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root; 997 struct btrfs_root *tree_root = fs_info->tree_root;
998 struct extent_buffer *leaf;
942 999
943 root = kzalloc(sizeof(*root), GFP_NOFS); 1000 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root) 1001 if (!root)
945 return -ENOMEM; 1002 return ERR_PTR(-ENOMEM);
946 1003
947 __setup_root(tree_root->nodesize, tree_root->leafsize, 1004 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize, 1005 tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1008 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1009 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1010 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1011 /*
1012 * log trees do not get reference counted because they go away
1013 * before a real commit is actually done. They do store pointers
1014 * to file data extents, and those reference counts still get
1015 * updated (along with back refs to the log tree).
1016 */
954 root->ref_cows = 0; 1017 root->ref_cows = 0;
955 1018
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize, 1019 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID, 1020 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0); 1021 trans->transid, 0, 0, 0);
1022 if (IS_ERR(leaf)) {
1023 kfree(root);
1024 return ERR_CAST(leaf);
1025 }
959 1026
1027 root->node = leaf;
960 btrfs_set_header_nritems(root->node, 0); 1028 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0); 1029 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start); 1030 btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
968 BTRFS_FSID_SIZE); 1036 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node); 1037 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node); 1038 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root; 1039 return root;
1040}
1041
1042int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1043 struct btrfs_fs_info *fs_info)
1044{
1045 struct btrfs_root *log_root;
1046
1047 log_root = alloc_log_tree(trans, fs_info);
1048 if (IS_ERR(log_root))
1049 return PTR_ERR(log_root);
1050 WARN_ON(fs_info->log_root_tree);
1051 fs_info->log_root_tree = log_root;
1052 return 0;
1053}
1054
1055int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1056 struct btrfs_root *root)
1057{
1058 struct btrfs_root *log_root;
1059 struct btrfs_inode_item *inode_item;
1060
1061 log_root = alloc_log_tree(trans, root->fs_info);
1062 if (IS_ERR(log_root))
1063 return PTR_ERR(log_root);
1064
1065 log_root->last_trans = trans->transid;
1066 log_root->root_key.offset = root->root_key.objectid;
1067
1068 inode_item = &log_root->root_item.inode;
1069 inode_item->generation = cpu_to_le64(1);
1070 inode_item->size = cpu_to_le64(3);
1071 inode_item->nlink = cpu_to_le32(1);
1072 inode_item->nbytes = cpu_to_le64(root->leafsize);
1073 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1074
1075 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
1076 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1077
1078 WARN_ON(root->log_root);
1079 root->log_root = log_root;
1080 root->log_transid = 0;
972 return 0; 1081 return 0;
973} 1082}
974 1083
@@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{ 1245{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1246 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0; 1247 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device; 1248 struct btrfs_device *device;
1141 struct backing_dev_info *bdi; 1249 struct backing_dev_info *bdi;
1142#if 0 1250#if 0
@@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1144 btrfs_congested_async(info, 0)) 1252 btrfs_congested_async(info, 0))
1145 return 1; 1253 return 1;
1146#endif 1254#endif
1147 list_for_each(cur, &info->fs_devices->devices) { 1255 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev) 1256 if (!device->bdev)
1150 continue; 1257 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev); 1258 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1163 */ 1270 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1271static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{ 1272{
1166 struct list_head *cur;
1167 struct btrfs_device *device; 1273 struct btrfs_device *device;
1168 struct btrfs_fs_info *info; 1274 struct btrfs_fs_info *info;
1169 1275
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1276 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) { 1277 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev) 1278 if (!device->bdev)
1174 continue; 1279 continue;
1175 1280
@@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1447 INIT_LIST_HEAD(&fs_info->dead_roots); 1552 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers); 1553 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock); 1555 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock); 1556 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock); 1557 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1535 init_waitqueue_head(&fs_info->transaction_throttle); 1639 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait); 1640 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait); 1641 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542 1642
1543 __setup_root(4096, 4096, 4096, 4096, tree_root, 1643 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1644 fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 * low idle thresh 1727 * low idle thresh
1628 */ 1728 */
1629 fs_info->endio_workers.idle_thresh = 4; 1729 fs_info->endio_workers.idle_thresh = 4;
1730 fs_info->endio_meta_workers.idle_thresh = 4;
1731
1630 fs_info->endio_write_workers.idle_thresh = 64; 1732 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64; 1733 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632 1734
@@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1720 ret = find_and_setup_root(tree_root, fs_info, 1822 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root); 1823 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1; 1824 dev_root->track_dirty = 1;
1723
1724 if (ret) 1825 if (ret)
1725 goto fail_extent_root; 1826 goto fail_extent_root;
1726 1827
@@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1841 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1842 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner"); 1843 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread) 1844 if (IS_ERR(fs_info->cleaner_kthread))
1744 goto fail_csum_root; 1845 goto fail_csum_root;
1745 1846
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1847 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root, 1848 tree_root,
1748 "btrfs-transaction"); 1849 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread) 1850 if (IS_ERR(fs_info->transaction_kthread))
1750 goto fail_cleaner; 1851 goto fail_cleaner;
1751 1852
1752 if (btrfs_super_log_root(disk_super) != 0) { 1853 if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1929,14 @@ fail_sb_buffer:
1828fail_iput: 1929fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1930 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode); 1931 iput(fs_info->btree_inode);
1831fail: 1932
1832 btrfs_close_devices(fs_info->fs_devices); 1933 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1934 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1935 bdi_destroy(&fs_info->bdi);
1834 1936
1937fail:
1835 kfree(extent_root); 1938 kfree(extent_root);
1836 kfree(tree_root); 1939 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info); 1940 kfree(fs_info);
1839 kfree(chunk_root); 1941 kfree(chunk_root);
1840 kfree(dev_root); 1942 kfree(dev_root);
@@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device,
1995 2097
1996int write_all_supers(struct btrfs_root *root, int max_mirrors) 2098int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{ 2099{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices; 2100 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev; 2101 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb; 2102 struct btrfs_super_block *sb;
@@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2011 2112
2012 sb = &root->fs_info->super_for_commit; 2113 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item; 2114 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) { 2115 list_for_each_entry(dev, head, dev_list) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) { 2116 if (!dev->bdev) {
2017 total_errors++; 2117 total_errors++;
2018 continue; 2118 continue;
@@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2045 } 2145 }
2046 2146
2047 total_errors = 0; 2147 total_errors = 0;
2048 list_for_each(cur, head) { 2148 list_for_each_entry(dev, head, dev_list) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev) 2149 if (!dev->bdev)
2051 continue; 2150 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable) 2151 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,6 +2359,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2260 u64 transid = btrfs_header_generation(buf); 2359 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode; 2360 struct inode *btree_inode = root->fs_info->btree_inode;
2262 2361
2362 btrfs_set_lock_blocking(buf);
2363
2263 WARN_ON(!btrfs_tree_locked(buf)); 2364 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) { 2365 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2366 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2302 int ret; 2403 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2404 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0) 2405 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE; 2406 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2306 return ret; 2407 return ret;
2307} 2408}
2308 2409
2309int btree_lock_page_hook(struct page *page) 2410int btree_lock_page_hook(struct page *page)
2310{ 2411{
2311 struct inode *inode = page->mapping->host; 2412 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb; 2414 struct extent_buffer *eb;
2315 unsigned long len; 2415 unsigned long len;
@@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page)
2324 goto out; 2424 goto out;
2325 2425
2326 btrfs_tree_lock(eb); 2426 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb); 2428 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb); 2429 free_extent_buffer(eb);
2332out: 2430out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b7..95029db227be 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info); 98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root);
101int btree_lock_page_hook(struct page *page); 103int btree_lock_page_hook(struct page *page);
104
105
106#ifdef CONFIG_DEBUG_LOCK_ALLOC
107void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
108#else
109static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
110 int level)
111{
112}
113#endif
102#endif 114#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..0a5d796c9f7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include "compat.h" 23#include "compat.h"
24#include "hash.h" 24#include "hash.h"
25#include "crc32c.h" 25#include "crc32c.h"
@@ -30,7 +30,6 @@
30#include "volumes.h" 30#include "volumes.h"
31#include "locking.h" 31#include "locking.h"
32#include "ref-cache.h" 32#include "ref-cache.h"
33#include "compat.h"
34 33
35#define PENDING_EXTENT_INSERT 0 34#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 35#define PENDING_EXTENT_DELETE 1
@@ -326,10 +325,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 325 u64 flags)
327{ 326{
328 struct list_head *head = &info->space_info; 327 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 328 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 329 list_for_each_entry(found, head, list) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags) 330 if (found->flags == flags)
334 return found; 331 return found;
335 } 332 }
@@ -1326,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 1323int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root) 1324 struct btrfs_root *root)
1328{ 1325{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1); 1326 u64 start;
1330 del_pending_extents(trans, root->fs_info->extent_root, 1); 1327 u64 end;
1328 int ret;
1329
1330 while(1) {
1331 finish_current_insert(trans, root->fs_info->extent_root, 1);
1332 del_pending_extents(trans, root->fs_info->extent_root, 1);
1333
1334 /* is there more work to do? */
1335 ret = find_first_extent_bit(&root->fs_info->pending_del,
1336 0, &start, &end, EXTENT_WRITEBACK);
1337 if (!ret)
1338 continue;
1339 ret = find_first_extent_bit(&root->fs_info->extent_ins,
1340 0, &start, &end, EXTENT_WRITEBACK);
1341 if (!ret)
1342 continue;
1343 break;
1344 }
1331 return 0; 1345 return 0;
1332} 1346}
1333 1347
@@ -1525,15 +1539,55 @@ out:
1525 return ret; 1539 return ret;
1526} 1540}
1527 1541
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1542/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1543 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1544 * can be in just about any order, and it is likely to have clusters of
1545 * things that are close together and clusters of things that are not.
1546 *
1547 * To help reduce the seeks that come with updating all of these reference
1548 * counts, sort them by byte number before actual updates are done.
1549 *
1550 * struct refsort is used to match byte number to slot in the btree block.
1551 * we sort based on the byte number and then use the slot to actually
1552 * find the item.
1553 *
1554 * struct refsort is smaller than struct btrfs_item and smaller than
1555 * struct btrfs_key_ptr. Since we're currently limited to the page size
1556 * for a btree block, there's no way for a kmalloc of refsorts for a
1557 * single node to be bigger than a page.
1558 */
1559struct refsort {
1560 u64 bytenr;
1561 u32 slot;
1562};
1563
1564/*
1565 * for passing into sort()
1566 */
1567static int refsort_cmp(const void *a_void, const void *b_void)
1568{
1569 const struct refsort *a = a_void;
1570 const struct refsort *b = b_void;
1571
1572 if (a->bytenr < b->bytenr)
1573 return -1;
1574 if (a->bytenr > b->bytenr)
1575 return 1;
1576 return 0;
1577}
1578
1579
1580noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1581 struct btrfs_root *root,
1582 struct extent_buffer *orig_buf,
1583 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1584{
1532 u64 bytenr; 1585 u64 bytenr;
1533 u64 ref_root; 1586 u64 ref_root;
1534 u64 orig_root; 1587 u64 orig_root;
1535 u64 ref_generation; 1588 u64 ref_generation;
1536 u64 orig_generation; 1589 u64 orig_generation;
1590 struct refsort *sorted;
1537 u32 nritems; 1591 u32 nritems;
1538 u32 nr_file_extents = 0; 1592 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1593 struct btrfs_key key;
@@ -1542,6 +1596,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1596 int level;
1543 int ret = 0; 1597 int ret = 0;
1544 int faili = 0; 1598 int faili = 0;
1599 int refi = 0;
1600 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1601 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1602 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1603
@@ -1553,6 +1609,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1609 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1610 level = btrfs_header_level(buf);
1555 1611
1612 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1613 BUG_ON(!sorted);
1614
1556 if (root->ref_cows) { 1615 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1616 process_func = __btrfs_inc_extent_ref;
1558 } else { 1617 } else {
@@ -1565,6 +1624,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1624 process_func = __btrfs_update_extent_ref;
1566 } 1625 }
1567 1626
1627 /*
1628 * we make two passes through the items. In the first pass we
1629 * only record the byte number and slot. Then we sort based on
1630 * byte number and do the actual work based on the sorted results
1631 */
1568 for (i = 0; i < nritems; i++) { 1632 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1633 cond_resched();
1570 if (level == 0) { 1634 if (level == 0) {
@@ -1581,6 +1645,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1645 continue;
1582 1646
1583 nr_file_extents++; 1647 nr_file_extents++;
1648 sorted[refi].bytenr = bytenr;
1649 sorted[refi].slot = i;
1650 refi++;
1651 } else {
1652 bytenr = btrfs_node_blockptr(buf, i);
1653 sorted[refi].bytenr = bytenr;
1654 sorted[refi].slot = i;
1655 refi++;
1656 }
1657 }
1658 /*
1659 * if refi == 0, we didn't actually put anything into the sorted
1660 * array and we're done
1661 */
1662 if (refi == 0)
1663 goto out;
1664
1665 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1666
1667 for (i = 0; i < refi; i++) {
1668 cond_resched();
1669 slot = sorted[i].slot;
1670 bytenr = sorted[i].bytenr;
1671
1672 if (level == 0) {
1673 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1674
1585 ret = process_func(trans, root, bytenr, 1675 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1676 orig_buf->start, buf->start,
@@ -1589,25 +1679,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1679 key.objectid);
1590 1680
1591 if (ret) { 1681 if (ret) {
1592 faili = i; 1682 faili = slot;
1593 WARN_ON(1); 1683 WARN_ON(1);
1594 goto fail; 1684 goto fail;
1595 } 1685 }
1596 } else { 1686 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1687 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1688 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1689 orig_root, ref_root,
1601 orig_generation, ref_generation, 1690 orig_generation, ref_generation,
1602 level - 1); 1691 level - 1);
1603 if (ret) { 1692 if (ret) {
1604 faili = i; 1693 faili = slot;
1605 WARN_ON(1); 1694 WARN_ON(1);
1606 goto fail; 1695 goto fail;
1607 } 1696 }
1608 } 1697 }
1609 } 1698 }
1610out: 1699out:
1700 kfree(sorted);
1611 if (nr_extents) { 1701 if (nr_extents) {
1612 if (level == 0) 1702 if (level == 0)
1613 *nr_extents = nr_file_extents; 1703 *nr_extents = nr_file_extents;
@@ -1616,6 +1706,7 @@ out:
1616 } 1706 }
1617 return 0; 1707 return 0;
1618fail: 1708fail:
1709 kfree(sorted);
1619 WARN_ON(1); 1710 WARN_ON(1);
1620 return ret; 1711 return ret;
1621} 1712}
@@ -2137,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
2137 u64 end; 2228 u64 end;
2138 u64 priv; 2229 u64 priv;
2139 u64 search = 0; 2230 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info; 2231 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path; 2232 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp; 2233 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list; 2234 struct list_head insert_list, update_list;
2145 int ret; 2235 int ret;
2146 int num_inserts = 0, max_inserts; 2236 int num_inserts = 0, max_inserts, restart = 0;
2147 2237
2148 path = btrfs_alloc_path(); 2238 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list); 2239 INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2249,19 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2249 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2250 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2251 if (ret) {
2162 if (skipped && all && !num_inserts) { 2252 if (restart && !num_inserts &&
2163 skipped = 0; 2253 list_empty(&update_list)) {
2254 restart = 0;
2164 search = 0; 2255 search = 0;
2165 continue; 2256 continue;
2166 } 2257 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break; 2258 break;
2169 } 2259 }
2170 2260
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); 2261 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) { 2262 if (!ret) {
2173 skipped = 1; 2263 if (all)
2264 restart = 1;
2174 search = end + 1; 2265 search = end + 1;
2175 if (need_resched()) { 2266 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex); 2267 mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2280,7 @@ again:
2189 list_add_tail(&extent_op->list, &insert_list); 2280 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1; 2281 search = end + 1;
2191 if (num_inserts == max_inserts) { 2282 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex); 2283 restart = 1;
2193 break; 2284 break;
2194 } 2285 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) { 2286 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2296,6 @@ again:
2205 * somebody marked this thing for deletion then just unlock it and be 2296 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it 2297 * done, the free_extents will handle it
2207 */ 2298 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) { 2299 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr, 2300 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1, 2301 extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2317,10 @@ again:
2227 if (!list_empty(&update_list)) { 2317 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list); 2318 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret); 2319 BUG_ON(ret);
2320
2321 /* we may have COW'ed new blocks, so let's start over */
2322 if (all)
2323 restart = 1;
2230 } 2324 }
2231 2325
2232 /* 2326 /*
@@ -2234,9 +2328,9 @@ again:
2234 * need to make sure everything is cleaned then reset everything and 2328 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning 2329 * go back to the beginning
2236 */ 2330 */
2237 if (!num_inserts && all && skipped) { 2331 if (!num_inserts && restart) {
2238 search = 0; 2332 search = 0;
2239 skipped = 0; 2333 restart = 0;
2240 INIT_LIST_HEAD(&update_list); 2334 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list); 2335 INIT_LIST_HEAD(&insert_list);
2242 goto again; 2336 goto again;
@@ -2293,27 +2387,19 @@ again:
2293 BUG_ON(ret); 2387 BUG_ON(ret);
2294 2388
2295 /* 2389 /*
2296 * if we broke out of the loop in order to insert stuff because we hit 2390 * if restart is set for whatever reason we need to go back and start
2297 * the maximum number of inserts at a time we can handle, then loop 2391 * searching through the pending list again.
2298 * back and pick up where we left off 2392 *
2299 */ 2393 * We just inserted some extents, which could have resulted in new
2300 if (num_inserts == max_inserts) { 2394 * blocks being allocated, which would result in new blocks needing
2301 INIT_LIST_HEAD(&insert_list); 2395 * updates, so if all is set we _must_ restart to get the updated
2302 INIT_LIST_HEAD(&update_list); 2396 * blocks.
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */ 2397 */
2312 if (all && skipped) { 2398 if (restart || all) {
2313 INIT_LIST_HEAD(&insert_list); 2399 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list); 2400 INIT_LIST_HEAD(&update_list);
2315 search = 0; 2401 search = 0;
2316 skipped = 0; 2402 restart = 0;
2317 num_inserts = 0; 2403 num_inserts = 0;
2318 goto again; 2404 goto again;
2319 } 2405 }
@@ -2547,6 +2633,7 @@ again:
2547 if (ret) { 2633 if (ret) {
2548 if (all && skipped && !nr) { 2634 if (all && skipped && !nr) {
2549 search = 0; 2635 search = 0;
2636 skipped = 0;
2550 continue; 2637 continue;
2551 } 2638 }
2552 mutex_unlock(&info->extent_ins_mutex); 2639 mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2720,8 @@ again:
2633 goto again; 2720 goto again;
2634 } 2721 }
2635 2722
2723 if (!err)
2724 finish_current_insert(trans, extent_root, 0);
2636 return err; 2725 return err;
2637} 2726}
2638 2727
@@ -2700,13 +2789,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 2789 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 2790 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 2791 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 2792 mutex_lock(&root->fs_info->pinned_mutex);
2704 2793 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 2794 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 2795 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 2796 return 0;
2712 } 2797 }
@@ -2787,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2787 2872
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2873 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc; 2874 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024; 2875 if (!btrfs_test_opt(root, SSD))
2876 empty_cluster = 64 * 1024;
2791 } 2877 }
2792 2878
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2879 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,7 +3100,6 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3100static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3101{
3016 struct btrfs_block_group_cache *cache; 3102 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3103
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3104 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3105 (unsigned long long)(info->total_bytes - info->bytes_used -
@@ -3022,8 +3107,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3022 (info->full) ? "" : "not "); 3107 (info->full) ? "" : "not ");
3023 3108
3024 down_read(&info->groups_sem); 3109 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3110 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3111 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3112 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3113 "%llu pinned %llu reserved\n",
@@ -3332,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3332 3416
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3417struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, 3418 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize) 3419 u64 bytenr, u32 blocksize,
3420 int level)
3336{ 3421{
3337 struct extent_buffer *buf; 3422 struct extent_buffer *buf;
3338 3423
@@ -3340,9 +3425,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3340 if (!buf) 3425 if (!buf)
3341 return ERR_PTR(-ENOMEM); 3426 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid); 3427 btrfs_set_header_generation(buf, trans->transid);
3428 btrfs_set_buffer_lockdep_class(buf, level);
3343 btrfs_tree_lock(buf); 3429 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3430 clean_tree_block(trans, root, buf);
3431
3432 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3433 btrfs_set_buffer_uptodate(buf);
3434
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3435 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3436 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3437 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3440,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3440 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3441 }
3353 trans->blocks_used++; 3442 trans->blocks_used++;
3443 /* this returns a buffer locked for blocking */
3354 return buf; 3444 return buf;
3355} 3445}
3356 3446
@@ -3379,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3379 return ERR_PTR(ret); 3469 return ERR_PTR(ret);
3380 } 3470 }
3381 3471
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3472 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3473 blocksize, level);
3383 return buf; 3474 return buf;
3384} 3475}
3385 3476
@@ -3388,36 +3479,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3479{
3389 u64 leaf_owner; 3480 u64 leaf_owner;
3390 u64 leaf_generation; 3481 u64 leaf_generation;
3482 struct refsort *sorted;
3391 struct btrfs_key key; 3483 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3484 struct btrfs_file_extent_item *fi;
3393 int i; 3485 int i;
3394 int nritems; 3486 int nritems;
3395 int ret; 3487 int ret;
3488 int refi = 0;
3489 int slot;
3396 3490
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3491 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3492 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3493 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3494 leaf_generation = btrfs_header_generation(leaf);
3401 3495
3496 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3497 /* we do this loop twice. The first time we build a list
3498 * of the extents we have a reference on, then we sort the list
3499 * by bytenr. The second time around we actually do the
3500 * extent freeing.
3501 */
3402 for (i = 0; i < nritems; i++) { 3502 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3503 u64 disk_bytenr;
3404 cond_resched(); 3504 cond_resched();
3405 3505
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3506 btrfs_item_key_to_cpu(leaf, &key, i);
3507
3508 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3509 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3510 continue;
3511
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3512 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3513
3514 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3515 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3516 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3517 continue;
3413 /* 3518
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3519 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3520
3521 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3522 if (disk_bytenr == 0)
3419 continue; 3523 continue;
3420 3524
3525 sorted[refi].bytenr = disk_bytenr;
3526 sorted[refi].slot = i;
3527 refi++;
3528 }
3529
3530 if (refi == 0)
3531 goto out;
3532
3533 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3534
3535 for (i = 0; i < refi; i++) {
3536 u64 disk_bytenr;
3537
3538 disk_bytenr = sorted[i].bytenr;
3539 slot = sorted[i].slot;
3540
3541 cond_resched();
3542
3543 btrfs_item_key_to_cpu(leaf, &key, slot);
3544 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3545 continue;
3546
3547 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3548
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3549 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3550 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3551 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3556,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3556 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3557 cond_resched();
3430 } 3558 }
3559out:
3560 kfree(sorted);
3431 return 0; 3561 return 0;
3432} 3562}
3433 3563
@@ -3437,9 +3567,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3567{
3438 int i; 3568 int i;
3439 int ret; 3569 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3570 struct btrfs_extent_info *info;
3571 struct refsort *sorted;
3572
3573 if (ref->nritems == 0)
3574 return 0;
3575
3576 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3577 for (i = 0; i < ref->nritems; i++) {
3578 sorted[i].bytenr = ref->extents[i].bytenr;
3579 sorted[i].slot = i;
3580 }
3581 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3441 3582
3583 /*
3584 * the items in the ref were sorted when the ref was inserted
3585 * into the ref cache, so this is already in order
3586 */
3442 for (i = 0; i < ref->nritems; i++) { 3587 for (i = 0; i < ref->nritems; i++) {
3588 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3589 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3590 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3591 ref->owner, ref->generation,
@@ -3453,6 +3599,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3599 info++;
3454 } 3600 }
3455 3601
3602 kfree(sorted);
3456 return 0; 3603 return 0;
3457} 3604}
3458 3605
@@ -3497,6 +3644,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3644}
3498 3645
3499/* 3646/*
3647 * this is used while deleting old snapshots, and it drops the refs
3648 * on a whole subtree starting from a level 1 node.
3649 *
3650 * The idea is to sort all the leaf pointers, and then drop the
3651 * ref on all the leaves in order. Most of the time the leaves
3652 * will have ref cache entries, so no leaf IOs will be required to
3653 * find the extents they have references on.
3654 *
3655 * For each leaf, any references it has are also dropped in order
3656 *
3657 * This ends up dropping the references in something close to optimal
3658 * order for reading and modifying the extent allocation tree.
3659 */
3660static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3661 struct btrfs_root *root,
3662 struct btrfs_path *path)
3663{
3664 u64 bytenr;
3665 u64 root_owner;
3666 u64 root_gen;
3667 struct extent_buffer *eb = path->nodes[1];
3668 struct extent_buffer *leaf;
3669 struct btrfs_leaf_ref *ref;
3670 struct refsort *sorted = NULL;
3671 int nritems = btrfs_header_nritems(eb);
3672 int ret;
3673 int i;
3674 int refi = 0;
3675 int slot = path->slots[1];
3676 u32 blocksize = btrfs_level_size(root, 0);
3677 u32 refs;
3678
3679 if (nritems == 0)
3680 goto out;
3681
3682 root_owner = btrfs_header_owner(eb);
3683 root_gen = btrfs_header_generation(eb);
3684 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3685
3686 /*
3687 * step one, sort all the leaf pointers so we don't scribble
3688 * randomly into the extent allocation tree
3689 */
3690 for (i = slot; i < nritems; i++) {
3691 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3692 sorted[refi].slot = i;
3693 refi++;
3694 }
3695
3696 /*
3697 * nritems won't be zero, but if we're picking up drop_snapshot
3698 * after a crash, slot might be > 0, so double check things
3699 * just in case.
3700 */
3701 if (refi == 0)
3702 goto out;
3703
3704 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3705
3706 /*
3707 * the first loop frees everything the leaves point to
3708 */
3709 for (i = 0; i < refi; i++) {
3710 u64 ptr_gen;
3711
3712 bytenr = sorted[i].bytenr;
3713
3714 /*
3715 * check the reference count on this leaf. If it is > 1
3716 * we just decrement it below and don't update any
3717 * of the refs the leaf points to.
3718 */
3719 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3720 BUG_ON(ret);
3721 if (refs != 1)
3722 continue;
3723
3724 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3725
3726 /*
3727 * the leaf only had one reference, which means the
3728 * only thing pointing to this leaf is the snapshot
3729 * we're deleting. It isn't possible for the reference
3730 * count to increase again later
3731 *
3732 * The reference cache is checked for the leaf,
3733 * and if found we'll be able to drop any refs held by
3734 * the leaf without needing to read it in.
3735 */
3736 ref = btrfs_lookup_leaf_ref(root, bytenr);
3737 if (ref && ref->generation != ptr_gen) {
3738 btrfs_free_leaf_ref(root, ref);
3739 ref = NULL;
3740 }
3741 if (ref) {
3742 ret = cache_drop_leaf_ref(trans, root, ref);
3743 BUG_ON(ret);
3744 btrfs_remove_leaf_ref(root, ref);
3745 btrfs_free_leaf_ref(root, ref);
3746 } else {
3747 /*
3748 * the leaf wasn't in the reference cache, so
3749 * we have to read it.
3750 */
3751 leaf = read_tree_block(root, bytenr, blocksize,
3752 ptr_gen);
3753 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3754 BUG_ON(ret);
3755 free_extent_buffer(leaf);
3756 }
3757 atomic_inc(&root->fs_info->throttle_gen);
3758 wake_up(&root->fs_info->transaction_throttle);
3759 cond_resched();
3760 }
3761
3762 /*
3763 * run through the loop again to free the refs on the leaves.
3764 * This is faster than doing it in the loop above because
3765 * the leaves are likely to be clustered together. We end up
3766 * working in nice chunks on the extent allocation tree.
3767 */
3768 for (i = 0; i < refi; i++) {
3769 bytenr = sorted[i].bytenr;
3770 ret = __btrfs_free_extent(trans, root, bytenr,
3771 blocksize, eb->start,
3772 root_owner, root_gen, 0, 1);
3773 BUG_ON(ret);
3774
3775 atomic_inc(&root->fs_info->throttle_gen);
3776 wake_up(&root->fs_info->transaction_throttle);
3777 cond_resched();
3778 }
3779out:
3780 kfree(sorted);
3781
3782 /*
3783 * update the path to show we've processed the entire level 1
3784 * node. This will get saved into the root's drop_snapshot_progress
3785 * field so these drops are not repeated again if this transaction
3786 * commits.
3787 */
3788 path->slots[1] = nritems;
3789 return 0;
3790}
3791
3792/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 3793 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 3794 * counts as it goes.
3502 */ 3795 */
@@ -3511,7 +3804,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 3804 struct extent_buffer *next;
3512 struct extent_buffer *cur; 3805 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 3806 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 3807 u32 blocksize;
3516 int ret; 3808 int ret;
3517 u32 refs; 3809 u32 refs;
@@ -3538,17 +3830,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 3830 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 3831 btrfs_header_nritems(cur))
3540 break; 3832 break;
3833
3834 /* the new code goes down to level 1 and does all the
3835 * leaves pointed to by that node in bulk. So, this check
3836 * for level 0 will always be false.
3837 *
3838 * But, the disk format allows the drop_snapshot_progress
3839 * field in the root to leave things in a state where
3840 * a leaf will need cleaning up here. If someone crashes
3841 * with the old code and then boots with the new code,
3842 * we might find a leaf here.
3843 */
3541 if (*level == 0) { 3844 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 3845 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 3846 BUG_ON(ret);
3544 break; 3847 break;
3545 } 3848 }
3849
3850 /*
3851 * once we get to level one, process the whole node
3852 * at once, including everything below it.
3853 */
3854 if (*level == 1) {
3855 ret = drop_level_one_refs(trans, root, path);
3856 BUG_ON(ret);
3857 break;
3858 }
3859
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 3860 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3861 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 3862 blocksize = btrfs_level_size(root, *level - 1);
3549 3863
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3864 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 3865 BUG_ON(ret);
3866
3867 /*
3868 * if there is more than one reference, we don't need
3869 * to read that node to drop any references it has. We
3870 * just drop the ref we hold on that node and move on to the
3871 * next slot in this level.
3872 */
3552 if (refs != 1) { 3873 if (refs != 1) {
3553 parent = path->nodes[*level]; 3874 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 3875 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +3888,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 3888
3568 continue; 3889 continue;
3569 } 3890 }
3891
3570 /* 3892 /*
3571 * at this point, we have a single ref, and since the 3893 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 3894 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 3895 */
3576 if (*level == 1) { 3896 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 3897 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 3898 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 3899 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +3918,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 3918 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 3919 root_gen = btrfs_header_generation(parent);
3633 3920
3921 /*
3922 * cleanup and free the reference on the last node
3923 * we processed
3924 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3925 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 3926 parent->start, root_owner, root_gen,
3636 *level, 1); 3927 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 3928 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 3929 path->nodes[*level] = NULL;
3930
3639 *level += 1; 3931 *level += 1;
3640 BUG_ON(ret); 3932 BUG_ON(ret);
3641 3933
@@ -3687,6 +3979,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 3979
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3980 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 3981 btrfs_tree_lock(next);
3982 btrfs_set_lock_blocking(next);
3690 3983
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 3984 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 3985 &refs);
@@ -3754,6 +4047,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4047 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4048 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4049 struct btrfs_disk_key disk_key;
4050
4051 /*
4052 * there is more work to do in this level.
4053 * Update the drop_progress marker to reflect
4054 * the work we've done so far, and then bump
4055 * the slot number
4056 */
3757 node = path->nodes[i]; 4057 node = path->nodes[i];
3758 path->slots[i]++; 4058 path->slots[i]++;
3759 *level = i; 4059 *level = i;
@@ -3765,6 +4065,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4065 return 0;
3766 } else { 4066 } else {
3767 struct extent_buffer *parent; 4067 struct extent_buffer *parent;
4068
4069 /*
4070 * this whole node is done, free our reference
4071 * on it and go up one level
4072 */
3768 if (path->nodes[*level] == root->node) 4073 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4074 parent = path->nodes[*level];
3770 else 4075 else
@@ -4444,7 +4749,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4749 u64 lock_end = 0;
4445 u64 num_bytes; 4750 u64 num_bytes;
4446 u64 ext_offset; 4751 u64 ext_offset;
4447 u64 first_pos; 4752 u64 search_end = (u64)-1;
4448 u32 nritems; 4753 u32 nritems;
4449 int nr_scaned = 0; 4754 int nr_scaned = 0;
4450 int extent_locked = 0; 4755 int extent_locked = 0;
@@ -4452,7 +4757,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 4757 int ret;
4453 4758
4454 memcpy(&key, leaf_key, sizeof(key)); 4759 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4760 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 4761 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 4762 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +4805,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 4805 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 4806 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 4807 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 4808 key.offset >= search_end)
4505 break; 4809 break;
4506 } 4810 }
4507 4811
@@ -4534,8 +4838,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 4838 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 4839 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 4840
4537 if (first_pos > key.offset - ext_offset) 4841 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 4842 search_end = key.offset - ext_offset +
4843 btrfs_file_extent_ram_bytes(leaf, fi);
4844 }
4539 4845
4540 if (!extent_locked) { 4846 if (!extent_locked) {
4541 lock_start = key.offset; 4847 lock_start = key.offset;
@@ -4724,7 +5030,7 @@ next:
4724 } 5030 }
4725skip: 5031skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5032 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5033 key.offset >= search_end)
4728 break; 5034 break;
4729 5035
4730 cond_resched(); 5036 cond_resched();
@@ -4778,6 +5084,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5084 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5085 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5086 ref->generation = btrfs_header_generation(buf);
5087
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5088 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5089 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5090 btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5351 prev_block = block_start; 5658 prev_block = block_start;
5352 } 5659 }
5353 5660
5661 mutex_lock(&extent_root->fs_info->trans_mutex);
5354 btrfs_record_root_in_trans(found_root); 5662 btrfs_record_root_in_trans(found_root);
5663 mutex_unlock(&extent_root->fs_info->trans_mutex);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5664 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /* 5665 /*
5357 * try to update data extent references while 5666 * try to update data extent references while
@@ -5957,9 +6266,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6266 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6267 BUG_ON(!path);
5959 6268
5960 btrfs_remove_free_space_cache(block_group); 6269 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6270 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6271 &root->fs_info->block_group_cache_tree);
6272 spin_unlock(&root->fs_info->block_group_cache_lock);
6273 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6274 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6275 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6276 up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1fa..ebe6b29e6069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/pagevec.h> 13#include <linux/pagevec.h>
15#include "extent_io.h" 14#include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
31static LIST_HEAD(states); 30static LIST_HEAD(states);
32 31
33#define LEAK_DEBUG 0 32#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG 33#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock); 34static DEFINE_SPINLOCK(leak_lock);
36#endif 35#endif
37 36
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
120static struct extent_state *alloc_extent_state(gfp_t mask) 119static struct extent_state *alloc_extent_state(gfp_t mask)
121{ 120{
122 struct extent_state *state; 121 struct extent_state *state;
123#ifdef LEAK_DEBUG 122#if LEAK_DEBUG
124 unsigned long flags; 123 unsigned long flags;
125#endif 124#endif
126 125
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
130 state->state = 0; 129 state->state = 0;
131 state->private = 0; 130 state->private = 0;
132 state->tree = NULL; 131 state->tree = NULL;
133#ifdef LEAK_DEBUG 132#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags); 133 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states); 134 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags); 135 spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
145 if (!state) 144 if (!state)
146 return; 145 return;
147 if (atomic_dec_and_test(&state->refs)) { 146 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG 147#if LEAK_DEBUG
149 unsigned long flags; 148 unsigned long flags;
150#endif 149#endif
151 WARN_ON(state->tree); 150 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG 151#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags); 152 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list); 153 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags); 154 spin_unlock_irqrestore(&leak_lock, flags);
@@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
416 415
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 416 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) { 417 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc); 418 free_extent_state(prealloc);
422 return -EEXIST; 419 return -EEXIST;
423 } 420 }
@@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2378 int scanned = 0; 2375 int scanned = 0;
2379 int range_whole = 0; 2376 int range_whole = 0;
2380 2377
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0); 2378 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) { 2379 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */ 2380 index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2847,98 @@ out:
2855 return sector; 2847 return sector;
2856} 2848}
2857 2849
2850int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2851 __u64 start, __u64 len, get_extent_t *get_extent)
2852{
2853 int ret;
2854 u64 off = start;
2855 u64 max = start + len;
2856 u32 flags = 0;
2857 u64 disko = 0;
2858 struct extent_map *em = NULL;
2859 int end = 0;
2860 u64 em_start = 0, em_len = 0;
2861 unsigned long emflags;
2862 ret = 0;
2863
2864 if (len == 0)
2865 return -EINVAL;
2866
2867 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2868 GFP_NOFS);
2869 em = get_extent(inode, NULL, 0, off, max - off, 0);
2870 if (!em)
2871 goto out;
2872 if (IS_ERR(em)) {
2873 ret = PTR_ERR(em);
2874 goto out;
2875 }
2876 while (!end) {
2877 off = em->start + em->len;
2878 if (off >= max)
2879 end = 1;
2880
2881 em_start = em->start;
2882 em_len = em->len;
2883
2884 disko = 0;
2885 flags = 0;
2886
2887 switch (em->block_start) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST;
2891 break;
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break;
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break;
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN);
2902 break;
2903 default:
2904 disko = em->block_start;
2905 break;
2906 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED;
2909
2910 emflags = em->flags;
2911 free_extent_map(em);
2912 em = NULL;
2913
2914 if (!end) {
2915 em = get_extent(inode, NULL, 0, off, max - off, 0);
2916 if (!em)
2917 goto out;
2918 if (IS_ERR(em)) {
2919 ret = PTR_ERR(em);
2920 goto out;
2921 }
2922 emflags = em->flags;
2923 }
2924 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2925 flags |= FIEMAP_EXTENT_LAST;
2926 end = 1;
2927 }
2928
2929 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2930 em_len, flags);
2931 if (ret)
2932 goto out_free;
2933 }
2934out_free:
2935 free_extent_map(em);
2936out:
2937 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2938 GFP_NOFS);
2939 return ret;
2940}
2941
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2942static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i) 2943 unsigned long i)
2860{ 2944{
@@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2892 gfp_t mask) 2976 gfp_t mask)
2893{ 2977{
2894 struct extent_buffer *eb = NULL; 2978 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG 2979#if LEAK_DEBUG
2896 unsigned long flags; 2980 unsigned long flags;
2897#endif 2981#endif
2898 2982
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2983 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start; 2984 eb->start = start;
2901 eb->len = len; 2985 eb->len = len;
2902 mutex_init(&eb->mutex); 2986 spin_lock_init(&eb->lock);
2903#ifdef LEAK_DEBUG 2987 init_waitqueue_head(&eb->lock_wq);
2988
2989#if LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags); 2990 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers); 2991 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags); 2992 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2912 2998
2913static void __free_extent_buffer(struct extent_buffer *eb) 2999static void __free_extent_buffer(struct extent_buffer *eb)
2914{ 3000{
2915#ifdef LEAK_DEBUG 3001#if LEAK_DEBUG
2916 unsigned long flags; 3002 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags); 3003 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list); 3004 list_del(&eb->leak_list);
@@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2980 unlock_page(p); 3066 unlock_page(p);
2981 } 3067 }
2982 if (uptodate) 3068 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE; 3069 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985 3070
2986 spin_lock(&tree->buffer_lock); 3071 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3072 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3135 unsigned long num_pages; 3220 unsigned long num_pages;
3136 3221
3137 num_pages = num_extent_pages(eb->start, eb->len); 3222 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE; 3223 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3139 3224
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3225 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS); 3226 GFP_NOFS);
@@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3206 struct page *page; 3291 struct page *page;
3207 int pg_uptodate = 1; 3292 int pg_uptodate = 1;
3208 3293
3209 if (eb->flags & EXTENT_UPTODATE) 3294 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3210 return 1; 3295 return 1;
3211 3296
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3297 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 struct bio *bio = NULL; 3327 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0; 3328 unsigned long bio_flags = 0;
3244 3329
3245 if (eb->flags & EXTENT_UPTODATE) 3330 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3246 return 0; 3331 return 0;
3247 3332
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3333 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3273 } 3358 }
3274 if (all_uptodate) { 3359 if (all_uptodate) {
3275 if (start_i == 0) 3360 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE; 3361 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 goto unlock_exit; 3362 goto unlock_exit;
3278 } 3363 }
3279 3364
@@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3309 } 3394 }
3310 3395
3311 if (!ret) 3396 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE; 3397 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3313 return ret; 3398 return ret;
3314 3399
3315unlock_exit: 3400unlock_exit:
@@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3406 unmap_extent_buffer(eb, eb->map_token, km); 3491 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL; 3492 eb->map_token = NULL;
3408 save = 1; 3493 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 } 3494 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map, 3495 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km); 3496 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a79137..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
193 unsigned from, unsigned to); 205 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 206sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent); 207 get_extent_t *get_extent);
208int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
209 __u64 start, __u64 len, get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 210int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 211int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 212int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada32..50da69da20ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h> 6#include <linux/hardirq.h>
8#include "extent_map.h" 7#include "extent_map.h"
9 8
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145e..872f104576e5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
29#include <linux/writeback.h> 29#include <linux/writeback.h>
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h" 32#include "ctree.h"
34#include "disk-io.h" 33#include "disk-io.h"
35#include "transaction.h" 34#include "transaction.h"
@@ -1215,15 +1214,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1215 } 1214 }
1216 mutex_unlock(&root->fs_info->trans_mutex); 1215 mutex_unlock(&root->fs_info->trans_mutex);
1217 1216
1218 root->fs_info->tree_log_batch++; 1217 root->log_batch++;
1219 filemap_fdatawrite(inode->i_mapping); 1218 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1219 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++; 1220 root->log_batch++;
1222 1221
1223 /* 1222 /*
1224 * ok we haven't committed the transaction yet, lets do a commit 1223 * ok we haven't committed the transaction yet, lets do a commit
1225 */ 1224 */
1226 if (file->private_data) 1225 if (file && file->private_data)
1227 btrfs_ioctl_trans_end(file); 1226 btrfs_ioctl_trans_end(file);
1228 1227
1229 trans = btrfs_start_transaction(root, 1); 1228 trans = btrfs_start_transaction(root, 1);
@@ -1232,7 +1231,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1232 goto out; 1231 goto out;
1233 } 1232 }
1234 1233
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); 1234 ret = btrfs_log_dentry_safe(trans, root, dentry);
1236 if (ret < 0) 1235 if (ret < 0)
1237 goto out; 1236 goto out;
1238 1237
@@ -1246,7 +1245,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1246 * file again, but that will end up using the synchronization 1245 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe. 1246 * inside btrfs_sync_log to keep things safe.
1248 */ 1247 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex); 1248 mutex_unlock(&dentry->d_inode->i_mutex);
1250 1249
1251 if (ret > 0) { 1250 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root); 1251 ret = btrfs_commit_transaction(trans, root);
@@ -1254,7 +1253,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1254 btrfs_sync_log(trans, root); 1253 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root); 1254 ret = btrfs_end_transaction(trans, root);
1256 } 1255 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex); 1256 mutex_lock(&dentry->d_inode->i_mutex);
1258out: 1257out:
1259 return ret > 0 ? EIO : ret; 1258 return ret > 0 ? EIO : ret;
1260} 1259}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
86 86
87 btrfs_init_path(path);
88 start_found = 0; 87 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); 88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0) 89 if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab41..3cee77ae03c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/statfs.h> 34#include <linux/statfs.h>
35#include <linux/compat.h> 35#include <linux/compat.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h> 37#include <linux/xattr.h>
39#include <linux/posix_acl.h> 38#include <linux/posix_acl.h>
40#include <linux/falloc.h> 39#include <linux/falloc.h>
@@ -51,6 +50,7 @@
51#include "tree-log.h" 50#include "tree-log.h"
52#include "ref-cache.h" 51#include "ref-cache.h"
53#include "compression.h" 52#include "compression.h"
53#include "locking.h"
54 54
55struct btrfs_iget_args { 55struct btrfs_iget_args {
56 u64 ino; 56 u64 ino;
@@ -91,6 +91,16 @@ static noinline int cow_file_range(struct inode *inode,
91 u64 start, u64 end, int *page_started, 91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock); 92 unsigned long *nr_written, int unlock);
93 93
94static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
95{
96 int err;
97
98 err = btrfs_init_acl(inode, dir);
99 if (!err)
100 err = btrfs_xattr_security_init(inode, dir);
101 return err;
102}
103
94/* 104/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There 105 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing. 106 * are countless ways this is incorrect, but it is better than nothing.
@@ -350,6 +360,19 @@ again:
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 360 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 361 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352 362
363 /*
364 * we don't want to send crud past the end of i_size through
365 * compression, that's just a waste of CPU time. So, if the
366 * end of the file is before the start of our current
367 * requested range of bytes, we bail out to the uncompressed
368 * cleanup code that can deal with all of this.
369 *
370 * It isn't really the fastest way to fix things, but this is a
371 * very uncommon corner.
372 */
373 if (actual_end <= start)
374 goto cleanup_and_bail_uncompressed;
375
353 total_compressed = actual_end - start; 376 total_compressed = actual_end - start;
354 377
355 /* we want to make sure that amount of ram required to uncompress 378 /* we want to make sure that amount of ram required to uncompress
@@ -494,6 +517,7 @@ again:
494 goto again; 517 goto again;
495 } 518 }
496 } else { 519 } else {
520cleanup_and_bail_uncompressed:
497 /* 521 /*
498 * No compression, but we still need to write the pages in 522 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked 523 * the file we've been given so far. redirty the locked
@@ -1324,12 +1348,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset, 1348 struct inode *inode, u64 file_offset,
1325 struct list_head *list) 1349 struct list_head *list)
1326{ 1350{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum; 1351 struct btrfs_ordered_sum *sum;
1329 1352
1330 btrfs_set_trans_block_group(trans, inode); 1353 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) { 1354
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1355 list_for_each_entry(sum, list, list) {
1333 btrfs_csum_file_blocks(trans, 1356 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1357 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 } 1358 }
@@ -2013,6 +2036,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2036 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014 2037
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2038 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2039
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2040 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2041 alloc_group_block, 0);
2018 btrfs_free_path(path); 2042 btrfs_free_path(path);
@@ -2039,6 +2063,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2063 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break; 2064 break;
2041 default: 2065 default:
2066 inode->i_op = &btrfs_special_inode_operations;
2042 init_special_inode(inode, inode->i_mode, rdev); 2067 init_special_inode(inode, inode->i_mode, rdev);
2043 break; 2068 break;
2044 } 2069 }
@@ -2108,6 +2133,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2108 goto failed; 2133 goto failed;
2109 } 2134 }
2110 2135
2136 btrfs_unlock_up_safe(path, 1);
2111 leaf = path->nodes[0]; 2137 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2138 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item); 2139 struct btrfs_inode_item);
@@ -2429,6 +2455,8 @@ next_node:
2429 ref->generation = leaf_gen; 2455 ref->generation = leaf_gen;
2430 ref->nritems = 0; 2456 ref->nritems = 0;
2431 2457
2458 btrfs_sort_leaf_ref(ref);
2459
2432 ret = btrfs_add_leaf_ref(root, ref, 0); 2460 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret); 2461 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref); 2462 btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2504,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_path *path; 2504 struct btrfs_path *path;
2477 struct btrfs_key key; 2505 struct btrfs_key key;
2478 struct btrfs_key found_key; 2506 struct btrfs_key found_key;
2479 u32 found_type; 2507 u32 found_type = (u8)-1;
2480 struct extent_buffer *leaf; 2508 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi; 2509 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0; 2510 u64 extent_start = 0;
@@ -2503,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2503 key.offset = (u64)-1; 2531 key.offset = (u64)-1;
2504 key.type = (u8)-1; 2532 key.type = (u8)-1;
2505 2533
2506 btrfs_init_path(path);
2507
2508search_again: 2534search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2535 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0) 2536 if (ret < 0)
@@ -2663,6 +2689,8 @@ next:
2663 if (pending_del_nr) 2689 if (pending_del_nr)
2664 goto del_pending; 2690 goto del_pending;
2665 btrfs_release_path(root, path); 2691 btrfs_release_path(root, path);
2692 if (found_type == BTRFS_INODE_ITEM_KEY)
2693 break;
2666 goto search_again; 2694 goto search_again;
2667 } 2695 }
2668 2696
@@ -2679,6 +2707,8 @@ del_pending:
2679 BUG_ON(ret); 2707 BUG_ON(ret);
2680 pending_del_nr = 0; 2708 pending_del_nr = 0;
2681 btrfs_release_path(root, path); 2709 btrfs_release_path(root, path);
2710 if (found_type == BTRFS_INODE_ITEM_KEY)
2711 break;
2682 goto search_again; 2712 goto search_again;
2683 } 2713 }
2684 } 2714 }
@@ -3265,7 +3295,7 @@ skip:
3265 3295
3266 /* Reached end of directory/root. Bump pos past the last item. */ 3296 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3297 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3298 filp->f_pos = INT_LIMIT(off_t);
3269 else 3299 else
3270 filp->f_pos++; 3300 filp->f_pos++;
3271nopos: 3301nopos:
@@ -3458,7 +3488,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3458 root->highest_inode = objectid; 3488 root->highest_inode = objectid;
3459 3489
3460 inode->i_uid = current_fsuid(); 3490 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid(); 3491
3492 if (dir && (dir->i_mode & S_ISGID)) {
3493 inode->i_gid = dir->i_gid;
3494 if (S_ISDIR(mode))
3495 mode |= S_ISGID;
3496 } else
3497 inode->i_gid = current_fsgid();
3498
3462 inode->i_mode = mode; 3499 inode->i_mode = mode;
3463 inode->i_ino = objectid; 3500 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0); 3501 inode_set_bytes(inode, 0);
@@ -3586,7 +3623,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3586 if (IS_ERR(inode)) 3623 if (IS_ERR(inode))
3587 goto out_unlock; 3624 goto out_unlock;
3588 3625
3589 err = btrfs_init_acl(inode, dir); 3626 err = btrfs_init_inode_security(inode, dir);
3590 if (err) { 3627 if (err) {
3591 drop_inode = 1; 3628 drop_inode = 1;
3592 goto out_unlock; 3629 goto out_unlock;
@@ -3649,7 +3686,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3649 if (IS_ERR(inode)) 3686 if (IS_ERR(inode))
3650 goto out_unlock; 3687 goto out_unlock;
3651 3688
3652 err = btrfs_init_acl(inode, dir); 3689 err = btrfs_init_inode_security(inode, dir);
3653 if (err) { 3690 if (err) {
3654 drop_inode = 1; 3691 drop_inode = 1;
3655 goto out_unlock; 3692 goto out_unlock;
@@ -3772,7 +3809,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3772 3809
3773 drop_on_err = 1; 3810 drop_on_err = 1;
3774 3811
3775 err = btrfs_init_acl(inode, dir); 3812 err = btrfs_init_inode_security(inode, dir);
3776 if (err) 3813 if (err)
3777 goto out_fail; 3814 goto out_fail;
3778 3815
@@ -4158,9 +4195,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4158 return -EINVAL; 4195 return -EINVAL;
4159} 4196}
4160 4197
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4198static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4199 __u64 start, __u64 len)
4162{ 4200{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent); 4201 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4164} 4202}
4165 4203
4166int btrfs_readpage(struct file *file, struct page *page) 4204int btrfs_readpage(struct file *file, struct page *page)
@@ -4223,7 +4261,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{ 4261{
4224 if (PageWriteback(page) || PageDirty(page)) 4262 if (PageWriteback(page) || PageDirty(page))
4225 return 0; 4263 return 0;
4226 return __btrfs_releasepage(page, gfp_flags); 4264 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
4227} 4265}
4228 4266
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4267static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4733,7 +4771,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4733 if (IS_ERR(inode)) 4771 if (IS_ERR(inode))
4734 goto out_unlock; 4772 goto out_unlock;
4735 4773
4736 err = btrfs_init_acl(inode, dir); 4774 err = btrfs_init_inode_security(inode, dir);
4737 if (err) { 4775 if (err) {
4738 drop_inode = 1; 4776 drop_inode = 1;
4739 goto out_unlock; 4777 goto out_unlock;
@@ -4987,13 +5025,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
4987 .clear_bit_hook = btrfs_clear_bit_hook, 5025 .clear_bit_hook = btrfs_clear_bit_hook,
4988}; 5026};
4989 5027
5028/*
5029 * btrfs doesn't support the bmap operation because swapfiles
5030 * use bmap to make a mapping of extents in the file. They assume
5031 * these extents won't change over the life of the file and they
5032 * use the bmap result to do IO directly to the drive.
5033 *
5034 * the btrfs bmap call would return logical addresses that aren't
5035 * suitable for IO and they also will change frequently as COW
5036 * operations happen. So, swapfile + btrfs == corruption.
5037 *
5038 * For now we're avoiding this by dropping bmap.
5039 */
4990static struct address_space_operations btrfs_aops = { 5040static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage, 5041 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage, 5042 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages, 5043 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages, 5044 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page, 5045 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO, 5046 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage, 5047 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage, 5048 .releasepage = btrfs_releasepage,
@@ -5017,6 +5066,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5017 .removexattr = btrfs_removexattr, 5066 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission, 5067 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate, 5068 .fallocate = btrfs_fallocate,
5069 .fiemap = btrfs_fiemap,
5020}; 5070};
5021static struct inode_operations btrfs_special_inode_operations = { 5071static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr, 5072 .getattr = btrfs_getattr,
@@ -5032,4 +5082,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5032 .follow_link = page_follow_link_light, 5082 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link, 5083 .put_link = page_put_link,
5034 .permission = btrfs_permission, 5084 .permission = btrfs_permission,
5085 .setxattr = btrfs_setxattr,
5086 .getxattr = btrfs_getxattr,
5087 .listxattr = btrfs_listxattr,
5088 .removexattr = btrfs_removexattr,
5035}; 5089};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb5..988fdc8b49eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
38#include <linux/compat.h> 38#include <linux/compat.h>
39#include <linux/bit_spinlock.h> 39#include <linux/bit_spinlock.h>
40#include <linux/security.h> 40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h> 41#include <linux/xattr.h>
43#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
44#include "compat.h" 43#include "compat.h"
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..85506c4a3af7 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,64 +25,203 @@
25#include "extent_io.h" 25#include "extent_io.h"
26#include "locking.h" 26#include "locking.h"
27 27
28static inline void spin_nested(struct extent_buffer *eb)
29{
30 spin_lock(&eb->lock);
31}
32
28/* 33/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 34 * Setting a lock to blocking will drop the spinlock and set the
30 * and the spin is not tuned very extensively. The spinning does make a big 35 * flag that forces other procs who want the lock to wait. After
31 * difference in almost every workload, but spinning for the right amount of 36 * this you can safely schedule with the lock held.
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 37 */
38void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{
40 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
41 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
42 spin_unlock(&eb->lock);
43 }
44 /* exit with the spin lock released and the bit set */
45}
37 46
38int btrfs_tree_lock(struct extent_buffer *eb) 47/*
48 * clearing the blocking flag will take the spinlock again.
49 * After this you can't safely schedule
50 */
51void btrfs_clear_lock_blocking(struct extent_buffer *eb)
39{ 52{
40 int i; 53 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
54 spin_nested(eb);
55 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
56 smp_mb__after_clear_bit();
57 }
58 /* exit with the spin lock held */
59}
41 60
42 if (mutex_trylock(&eb->mutex)) 61/*
43 return 0; 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 *
67 * So, we're still stuck with this crummy spin on the blocking bit,
68 * at least until the most common causes of the short blocks
69 * can be dealt with.
70 */
71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{
73 int i;
44 for (i = 0; i < 512; i++) { 74 for (i = 0; i < 512; i++) {
45 cpu_relax(); 75 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 77 return 1;
78 if (need_resched())
79 break;
48 } 80 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 81 return 0;
52} 82}
53 83
54int btrfs_try_tree_lock(struct extent_buffer *eb) 84/*
85 * This is somewhat different from trylock. It will take the
86 * spinlock but if it finds the lock is set to blocking, it will
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */
94int btrfs_try_spin_lock(struct extent_buffer *eb)
55{ 95{
56 return mutex_trylock(&eb->mutex); 96 int i;
97
98 spin_nested(eb);
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
100 return 1;
101 spin_unlock(&eb->lock);
102
103 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) {
105 if (!btrfs_spin_on_block(eb))
106 break;
107
108 spin_nested(eb);
109 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
110 return 1;
111 spin_unlock(&eb->lock);
112 }
113 return 0;
57} 114}
58 115
59int btrfs_tree_unlock(struct extent_buffer *eb) 116/*
117 * the autoremove wake function will return 0 if it tried to wake up
118 * a process that was already awake, which means that process won't
119 * count as an exclusive wakeup. The waitq code will continue waking
120 * procs until it finds one that was actually sleeping.
121 *
122 * For btrfs, this isn't quite what we want. We want a single proc
123 * to be notified that the lock is ready for taking. If that proc
124 * already happens to be awake, great, it will loop around and try for
125 * the lock.
126 *
127 * So, btrfs_wake_function always returns 1, even when the proc that we
128 * tried to wake up was already awake.
129 */
130static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
131 int sync, void *key)
60{ 132{
61 mutex_unlock(&eb->mutex); 133 autoremove_wake_function(wait, mode, sync, key);
62 return 0; 134 return 1;
63} 135}
64 136
65int btrfs_tree_locked(struct extent_buffer *eb) 137/*
138 * returns with the extent buffer spinlocked.
139 *
140 * This will spin and/or wait as required to take the lock, and then
141 * return with the spinlock held.
142 *
143 * After this call, scheduling is not safe without first calling
144 * btrfs_set_lock_blocking()
145 */
146int btrfs_tree_lock(struct extent_buffer *eb)
66{ 147{
67 return mutex_is_locked(&eb->mutex); 148 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function;
150
151 while(1) {
152 spin_nested(eb);
153
154 /* nobody is blocking, exit with the spinlock held */
155 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
156 return 0;
157
158 /*
159 * we have the spinlock, but the real owner is blocking.
160 * wait for them
161 */
162 spin_unlock(&eb->lock);
163
164 /*
165 * spin for a bit, and if the blocking flag goes away,
166 * loop around
167 */
168 if (btrfs_spin_on_block(eb))
169 continue;
170
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE);
173
174 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
175 schedule();
176
177 finish_wait(&eb->lock_wq, &wait);
178 }
179 return 0;
68} 180}
69 181
70/* 182/*
71 * btrfs_search_slot uses this to decide if it should drop its locks 183 * Very quick trylock, this does not spin or schedule. It returns
72 * before doing something expensive like allocating free blocks for cow. 184 * 1 with the spinlock held if it was able to take the lock, or it
185 * returns zero if it was unable to take the lock.
186 *
187 * After this call, scheduling is not safe without first calling
188 * btrfs_set_lock_blocking()
73 */ 189 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 190int btrfs_try_tree_lock(struct extent_buffer *eb)
75{ 191{
76 int i; 192 if (spin_trylock(&eb->lock)) {
77 struct extent_buffer *eb; 193 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 194 /*
79 eb = path->nodes[i]; 195 * we've got the spinlock, but the real owner is
80 if (!eb) 196 * blocking. Drop the spinlock and return failure
81 break; 197 */
82 smp_mb(); 198 spin_unlock(&eb->lock);
83 if (!list_empty(&eb->mutex.wait_list)) 199 return 0;
84 return 1; 200 }
201 return 1;
85 } 202 }
203 /* someone else has the spinlock, give up */
86 return 0; 204 return 0;
87} 205}
88 206
207int btrfs_tree_unlock(struct extent_buffer *eb)
208{
209 /*
210 * if we were a blocking owner, we don't have the spinlock held
211 * just clear the bit and look for waiters
212 */
213 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
214 smp_mb__after_clear_bit();
215 else
216 spin_unlock(&eb->lock);
217
218 if (waitqueue_active(&eb->lock_wq))
219 wake_up(&eb->lock_wq);
220 return 0;
221}
222
223int btrfs_tree_locked(struct extent_buffer *eb)
224{
225 return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
226 spin_is_locked(&eb->lock);
227}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..6bb0afbff928 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,10 @@
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24int btrfs_tree_locked(struct extent_buffer *eb);
25
25int btrfs_try_tree_lock(struct extent_buffer *eb); 26int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 27int btrfs_try_spin_lock(struct extent_buffer *eb);
28
29void btrfs_set_lock_blocking(struct extent_buffer *eb);
30void btrfs_clear_lock_blocking(struct extent_buffer *eb);
27#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a20940170274..77c2411a5f0f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
613 struct btrfs_sector_sum *sector_sums; 613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered; 614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors; 616 unsigned long num_sectors;
618 unsigned long i; 617 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
624 return 1; 623 return 1;
625 624
626 mutex_lock(&tree->mutex); 625 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) { 626 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) { 627 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize; 628 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums; 629 sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9eab..d0cc62bccb94 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "ref-cache.h" 22#include "ref-cache.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c59..bc283ad2db73 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared); 74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif 76#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db9fb3bc1e33..19a4daf03ccb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
37#include <linux/ctype.h> 37#include <linux/ctype.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h> 40#include <linux/magic.h>
42#include "compat.h" 41#include "compat.h"
43#include "ctree.h" 42#include "ctree.h"
@@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
380 btrfs_start_delalloc_inodes(root); 379 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0); 380 btrfs_wait_ordered_extents(root, 0);
382 381
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1); 382 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root); 383 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0; 384 sb->s_dirt = 0;
@@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
512 struct btrfs_root *root = btrfs_sb(sb); 510 struct btrfs_root *root = btrfs_sb(sb);
513 int ret; 511 int ret;
514 512
513 ret = btrfs_parse_options(root, data);
514 if (ret)
515 return -EINVAL;
516
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0; 518 return 0;
517 519
@@ -583,17 +585,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
583 struct btrfs_ioctl_vol_args *vol; 585 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices; 586 struct btrfs_fs_devices *fs_devices;
585 int ret = -ENOTTY; 587 int ret = -ENOTTY;
586 int len;
587 588
588 if (!capable(CAP_SYS_ADMIN)) 589 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM; 590 return -EPERM;
590 591
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 592 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
593 if (!vol)
594 return -ENOMEM;
595
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 596 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT; 597 ret = -EFAULT;
594 goto out; 598 goto out;
595 } 599 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 600
598 switch (cmd) { 601 switch (cmd) {
599 case BTRFS_IOC_SCAN_DEV: 602 case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f9443340..4112d53d4f4d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
688 num_bytes -= btrfs_root_used(&dirty->root->root_item); 688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item); 689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) { 690 if (num_bytes) {
691 mutex_lock(&root->fs_info->trans_mutex);
691 btrfs_record_root_in_trans(root); 692 btrfs_record_root_in_trans(root);
693 mutex_unlock(&root->fs_info->trans_mutex);
692 btrfs_set_root_used(&root->root_item, 694 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes); 695 bytes_used - num_bytes);
694 } 696 }
@@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
852{ 854{
853 struct btrfs_pending_snapshot *pending; 855 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots; 856 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret; 857 int ret;
857 858
858 list_for_each(cur, head) { 859 list_for_each_entry(pending, head, list) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending); 860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret); 861 BUG_ON(ret);
862 } 862 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..9c462fbd60fa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1704 1615
1705 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1707 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1709 1621
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1750 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1751 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1753 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1755 1668
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1807 1720
1808 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1810 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1812 1726
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1879 1793
1880 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1882 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1884 1799
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1817 }
1903 } 1818 }
1904 btrfs_free_path(path); 1819 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1820 return ret;
1908} 1821}
1909 1822
1910static int wait_log_commit(struct btrfs_root *log) 1823/*
1824 * helper function to update the item for a given subvolumes log root
1825 * in the tree of log roots
1826 */
1827static int update_log_root(struct btrfs_trans_handle *trans,
1828 struct btrfs_root *log)
1829{
1830 int ret;
1831
1832 if (log->log_transid == 1) {
1833 /* insert root item on the first sync */
1834 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1835 &log->root_key, &log->root_item);
1836 } else {
1837 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1838 &log->root_key, &log->root_item);
1839 }
1840 return ret;
1841}
1842
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1844{
1912 DEFINE_WAIT(wait); 1845 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1846 int index = transid % 2;
1914 1847
1848 /*
1849 * we only allow two pending log transactions at a time,
1850 * so we know that if ours is more than 2 older than the
1851 * current transaction, we're done
1852 */
1915 do { 1853 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1854 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1855 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1856 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1857 if (root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index]))
1920 schedule(); 1859 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1860 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1861 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1862 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1863 atomic_read(&root->log_commit[index]));
1864 return 0;
1865}
1866
1867static int wait_for_writer(struct btrfs_root *root)
1868{
1869 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers))
1875 schedule();
1876 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait);
1878 }
1925 return 0; 1879 return 0;
1926} 1880}
1927 1881
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1887int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1888 struct btrfs_root *root)
1935{ 1889{
1890 int index1;
1891 int index2;
1936 int ret; 1892 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1893 struct btrfs_root *log = root->log_root;
1894 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1895
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1896 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1897 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1898 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1899 wait_log_commit(root, root->log_transid);
1900 mutex_unlock(&root->log_mutex);
1901 return 0;
1944 } 1902 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1903 atomic_set(&root->log_commit[index1], 1);
1904
1905 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1);
1946 1908
1947 while (1) { 1909 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1910 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1911 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1912 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1913 mutex_lock(&root->log_mutex);
1952 1914 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1915 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1916 break;
1965 } 1917 }
1966 1918
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1920 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1921
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1922 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1923 btrfs_set_root_generation(&log->root_item, trans->transid);
1924 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1925
1926 root->log_batch = 0;
1927 root->log_transid++;
1928 log->log_transid = root->log_transid;
1929 smp_mb();
1930 /*
1931 * log tree has been flushed to disk, new modifications of
1932 * the log will be written to new positions, so it's safe to
1933 * allow log writers to go in.
1934 */
1935 mutex_unlock(&root->log_mutex);
1936
1937 mutex_lock(&log_root_tree->log_mutex);
1938 log_root_tree->log_batch++;
1939 atomic_inc(&log_root_tree->log_writers);
1940 mutex_unlock(&log_root_tree->log_mutex);
1941
1942 ret = update_log_root(trans, log);
1943 BUG_ON(ret);
1944
1945 mutex_lock(&log_root_tree->log_mutex);
1946 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1947 smp_mb();
1948 if (waitqueue_active(&log_root_tree->log_writer_wait))
1949 wake_up(&log_root_tree->log_writer_wait);
1950 }
1951
1952 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out;
1957 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1);
1959
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1962
1963 wait_for_writer(log_root_tree);
1964
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1967 BUG_ON(ret);
1972 1968
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1969 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1970 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1971 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1972 btrfs_header_level(log_root_tree->node));
1973
1974 log_root_tree->log_batch = 0;
1975 log_root_tree->log_transid++;
1976 smp_mb();
1977
1978 mutex_unlock(&log_root_tree->log_mutex);
1979
1980 /*
1981 * nobody else is going to jump in and write the ctree
1982 * super here because the log_commit atomic below is protecting
1983 * us. We must be called with a transaction handle pinning
1984 * the running transaction open, so a full commit can't hop
1985 * in and cause problems either.
1986 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1988
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1989 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1990 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1992 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1993out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1994 atomic_set(&root->log_commit[index1], 0);
1995 smp_mb();
1996 if (waitqueue_active(&root->log_commit_wait[index1]))
1997 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1998 return 0;
1988} 1999}
1989 2000
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2030 start, end, GFP_NOFS);
2020 } 2031 }
2021 2032
2022 log = root->log_root; 2033 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2034 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2035 &log->root_key);
2025 BUG_ON(ret); 2036 BUG_ON(ret);
2037 }
2026 root->log_root = NULL; 2038 root->log_root = NULL;
2027 kfree(root->log_root); 2039 free_extent_buffer(log->node);
2040 kfree(log);
2028 return 0; 2041 return 0;
2029} 2042}
2030 2043
2031/* 2044/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2045 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2046 * mixed in, we have a few interesting corners:
2056 * 2047 *
@@ -2711,11 +2702,6 @@ next_slot:
2711 2702
2712 btrfs_free_path(path); 2703 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2704 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2705out:
2720 return 0; 2706 return 0;
2721} 2707}
@@ -2846,7 +2832,9 @@ again:
2846 BUG_ON(!wc.replay_dest); 2832 BUG_ON(!wc.replay_dest);
2847 2833
2848 wc.replay_dest->log_root = log; 2834 wc.replay_dest->log_root = log;
2835 mutex_lock(&fs_info->trans_mutex);
2849 btrfs_record_root_in_trans(wc.replay_dest); 2836 btrfs_record_root_in_trans(wc.replay_dest);
2837 mutex_unlock(&fs_info->trans_mutex);
2850 ret = walk_log_tree(trans, log, &wc); 2838 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret); 2839 BUG_ON(ret);
2852 2840
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1cca2b5..1316139bf9e8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h> 23#include <asm/div64.h>
25#include "compat.h" 24#include "compat.h"
26#include "ctree.h" 25#include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid) 103 u64 devid, u8 *uuid)
105{ 104{
106 struct btrfs_device *dev; 105 struct btrfs_device *dev;
107 struct list_head *cur;
108 106
109 list_for_each(cur, head) { 107 list_for_each_entry(dev, head, dev_list) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid && 108 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 109 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev; 110 return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
118 115
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 116static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{ 117{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices; 118 struct btrfs_fs_devices *fs_devices;
123 119
124 list_for_each(cur, &fs_uuids) { 120 list_for_each_entry(fs_devices, &fs_uuids, list) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 121 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices; 122 return fs_devices;
128 } 123 }
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
159loop: 154loop:
160 spin_lock(&device->io_lock); 155 spin_lock(&device->io_lock);
161 156
157loop_lock:
162 /* take all the bios off the list at once and process them 158 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the 159 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted 160 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
208 * is now congested. Back off and let other work structs 204 * is now congested. Back off and let other work structs
209 * run instead 205 * run instead
210 */ 206 */
211 if (pending && bdi_write_congested(bdi) && 207 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
212 fs_info->fs_devices->open_devices > 1) { 208 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head; 209 struct bio *old_head;
214 210
@@ -220,7 +216,8 @@ loop:
220 tail->bi_next = old_head; 216 tail->bi_next = old_head;
221 else 217 else
222 device->pending_bio_tail = tail; 218 device->pending_bio_tail = tail;
223 device->running_pending = 0; 219
220 device->running_pending = 1;
224 221
225 spin_unlock(&device->io_lock); 222 spin_unlock(&device->io_lock);
226 btrfs_requeue_work(&device->work); 223 btrfs_requeue_work(&device->work);
@@ -229,6 +226,11 @@ loop:
229 } 226 }
230 if (again) 227 if (again)
231 goto loop; 228 goto loop;
229
230 spin_lock(&device->io_lock);
231 if (device->pending_bios)
232 goto loop_lock;
233 spin_unlock(&device->io_lock);
232done: 234done:
233 return 0; 235 return 0;
234} 236}
@@ -345,14 +347,11 @@ error:
345 347
346int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 348int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
347{ 349{
348 struct list_head *tmp; 350 struct btrfs_device *device, *next;
349 struct list_head *cur;
350 struct btrfs_device *device;
351 351
352 mutex_lock(&uuid_mutex); 352 mutex_lock(&uuid_mutex);
353again: 353again:
354 list_for_each_safe(cur, tmp, &fs_devices->devices) { 354 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
355 device = list_entry(cur, struct btrfs_device, dev_list);
356 if (device->in_fs_metadata) 355 if (device->in_fs_metadata)
357 continue; 356 continue;
358 357
@@ -383,14 +382,12 @@ again:
383 382
384static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
385{ 384{
386 struct list_head *cur;
387 struct btrfs_device *device; 385 struct btrfs_device *device;
388 386
389 if (--fs_devices->opened > 0) 387 if (--fs_devices->opened > 0)
390 return 0; 388 return 0;
391 389
392 list_for_each(cur, &fs_devices->devices) { 390 list_for_each_entry(device, &fs_devices->devices, dev_list) {
393 device = list_entry(cur, struct btrfs_device, dev_list);
394 if (device->bdev) { 391 if (device->bdev) {
395 close_bdev_exclusive(device->bdev, device->mode); 392 close_bdev_exclusive(device->bdev, device->mode);
396 fs_devices->open_devices--; 393 fs_devices->open_devices--;
@@ -439,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
439{ 436{
440 struct block_device *bdev; 437 struct block_device *bdev;
441 struct list_head *head = &fs_devices->devices; 438 struct list_head *head = &fs_devices->devices;
442 struct list_head *cur;
443 struct btrfs_device *device; 439 struct btrfs_device *device;
444 struct block_device *latest_bdev = NULL; 440 struct block_device *latest_bdev = NULL;
445 struct buffer_head *bh; 441 struct buffer_head *bh;
@@ -450,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
450 int seeding = 1; 446 int seeding = 1;
451 int ret = 0; 447 int ret = 0;
452 448
453 list_for_each(cur, head) { 449 list_for_each_entry(device, head, dev_list) {
454 device = list_entry(cur, struct btrfs_device, dev_list);
455 if (device->bdev) 450 if (device->bdev)
456 continue; 451 continue;
457 if (!device->name) 452 if (!device->name)
@@ -578,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
578 *(unsigned long long *)disk_super->fsid, 573 *(unsigned long long *)disk_super->fsid,
579 *(unsigned long long *)(disk_super->fsid + 8)); 574 *(unsigned long long *)(disk_super->fsid + 8));
580 } 575 }
581 printk(KERN_INFO "devid %llu transid %llu %s\n", 576 printk(KERN_CONT "devid %llu transid %llu %s\n",
582 (unsigned long long)devid, (unsigned long long)transid, path); 577 (unsigned long long)devid, (unsigned long long)transid, path);
583 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 578 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
584 579
@@ -1017,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1017 } 1012 }
1018 1013
1019 if (strcmp(device_path, "missing") == 0) { 1014 if (strcmp(device_path, "missing") == 0) {
1020 struct list_head *cur;
1021 struct list_head *devices; 1015 struct list_head *devices;
1022 struct btrfs_device *tmp; 1016 struct btrfs_device *tmp;
1023 1017
1024 device = NULL; 1018 device = NULL;
1025 devices = &root->fs_info->fs_devices->devices; 1019 devices = &root->fs_info->fs_devices->devices;
1026 list_for_each(cur, devices) { 1020 list_for_each_entry(tmp, devices, dev_list) {
1027 tmp = list_entry(cur, struct btrfs_device, dev_list);
1028 if (tmp->in_fs_metadata && !tmp->bdev) { 1021 if (tmp->in_fs_metadata && !tmp->bdev) {
1029 device = tmp; 1022 device = tmp;
1030 break; 1023 break;
@@ -1280,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1280 struct btrfs_trans_handle *trans; 1273 struct btrfs_trans_handle *trans;
1281 struct btrfs_device *device; 1274 struct btrfs_device *device;
1282 struct block_device *bdev; 1275 struct block_device *bdev;
1283 struct list_head *cur;
1284 struct list_head *devices; 1276 struct list_head *devices;
1285 struct super_block *sb = root->fs_info->sb; 1277 struct super_block *sb = root->fs_info->sb;
1286 u64 total_bytes; 1278 u64 total_bytes;
@@ -1304,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1304 mutex_lock(&root->fs_info->volume_mutex); 1296 mutex_lock(&root->fs_info->volume_mutex);
1305 1297
1306 devices = &root->fs_info->fs_devices->devices; 1298 devices = &root->fs_info->fs_devices->devices;
1307 list_for_each(cur, devices) { 1299 list_for_each_entry(device, devices, dev_list) {
1308 device = list_entry(cur, struct btrfs_device, dev_list);
1309 if (device->bdev == bdev) { 1300 if (device->bdev == bdev) {
1310 ret = -EEXIST; 1301 ret = -EEXIST;
1311 goto error; 1302 goto error;
@@ -1704,7 +1695,6 @@ static u64 div_factor(u64 num, int factor)
1704int btrfs_balance(struct btrfs_root *dev_root) 1695int btrfs_balance(struct btrfs_root *dev_root)
1705{ 1696{
1706 int ret; 1697 int ret;
1707 struct list_head *cur;
1708 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1698 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1709 struct btrfs_device *device; 1699 struct btrfs_device *device;
1710 u64 old_size; 1700 u64 old_size;
@@ -1723,8 +1713,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1723 dev_root = dev_root->fs_info->dev_root; 1713 dev_root = dev_root->fs_info->dev_root;
1724 1714
1725 /* step one make some room on all the devices */ 1715 /* step one make some room on all the devices */
1726 list_for_each(cur, devices) { 1716 list_for_each_entry(device, devices, dev_list) {
1727 device = list_entry(cur, struct btrfs_device, dev_list);
1728 old_size = device->total_bytes; 1717 old_size = device->total_bytes;
1729 size_to_free = div_factor(old_size, 1); 1718 size_to_free = div_factor(old_size, 1);
1730 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1719 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
@@ -2905,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2905 free_extent_map(em); 2894 free_extent_map(em);
2906 } 2895 }
2907 2896
2908 map = kzalloc(sizeof(*map), GFP_NOFS);
2909 if (!map)
2910 return -ENOMEM;
2911
2912 em = alloc_extent_map(GFP_NOFS); 2897 em = alloc_extent_map(GFP_NOFS);
2913 if (!em) 2898 if (!em)
2914 return -ENOMEM; 2899 return -ENOMEM;
@@ -3117,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3117 if (!sb) 3102 if (!sb)
3118 return -ENOMEM; 3103 return -ENOMEM;
3119 btrfs_set_buffer_uptodate(sb); 3104 btrfs_set_buffer_uptodate(sb);
3105 btrfs_set_buffer_lockdep_class(sb, 0);
3106
3120 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3107 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3121 array_size = btrfs_super_sys_array_size(super_copy); 3108 array_size = btrfs_super_sys_array_size(super_copy);
3122 3109
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e270894..a9d3bf4d2689 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "btrfs_inode.h" 26#include "btrfs_inode.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
45 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0); 48 strlen(name), 0);
48 if (!di || IS_ERR(di)) { 49 if (!di) {
49 ret = -ENODATA; 50 ret = -ENODATA;
50 goto out; 51 goto out;
52 } else if (IS_ERR(di)) {
53 ret = PTR_ERR(di);
54 goto out;
51 } 55 }
52 56
53 leaf = path->nodes[0]; 57 leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
62 ret = -ERANGE; 66 ret = -ERANGE;
63 goto out; 67 goto out;
64 } 68 }
69
70 /*
71 * The way things are packed into the leaf is like this
72 * |struct btrfs_dir_item|name|data|
73 * where name is the xattr name (e.g. security.foo) and data is the
74 * content of the xattr. data_ptr points to the location in memory
75 * where the data starts in the in-memory leaf
76 */
65 data_ptr = (unsigned long)((char *)(di + 1) + 77 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di)); 78 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr, 79 read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
86 if (!path) 98 if (!path)
87 return -ENOMEM; 99 return -ENOMEM;
88 100
89 trans = btrfs_start_transaction(root, 1); 101 trans = btrfs_join_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode); 102 btrfs_set_trans_block_group(trans, inode);
91 103
92 /* first lets see if we already have this xattr */ 104 /* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0) 189 if (ret < 0)
178 goto err; 190 goto err;
179 ret = 0;
180 advance = 0; 191 advance = 0;
181 while (1) { 192 while (1) {
182 leaf = path->nodes[0]; 193 leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
320 return -EOPNOTSUPP; 331 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322} 333}
334
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
336{
337 int err;
338 size_t len;
339 void *value;
340 char *suffix;
341 char *name;
342
343 err = security_inode_init_security(inode, dir, &suffix, &value, &len);
344 if (err) {
345 if (err == -EOPNOTSUPP)
346 return 0;
347 return err;
348 }
349
350 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
351 GFP_NOFS);
352 if (!name) {
353 err = -ENOMEM;
354 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0);
358 kfree(name);
359 }
360
361 kfree(suffix);
362 kfree(value);
363 return err;
364}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68d..c71e9c3cf3f7 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
40
39#endif /* __XATTR__ */ 41#endif /* __XATTR__ */