aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_elf.c14
-rw-r--r--fs/btrfs/Kconfig13
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c277
-rw-r--r--fs/btrfs/ctree.h28
-rw-r--r--fs/btrfs/disk-io.c120
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c438
-rw-r--r--fs/btrfs/extent_io.c132
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file.c5
-rw-r--r--fs/btrfs/inode.c84
-rw-r--r--fs/btrfs/ioctl.c1
-rw-r--r--fs/btrfs/locking.c218
-rw-r--r--fs/btrfs/locking.h6
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/transaction.c4
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c354
-rw-r--r--fs/btrfs/volumes.c49
-rw-r--r--fs/btrfs/xattr.c48
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/compat.c2
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/ecryptfs/crypto.c4
-rw-r--r--fs/exec.c28
-rw-r--r--fs/ext2/super.c9
-rw-r--r--fs/ext3/super.c11
-rw-r--r--fs/hugetlbfs/inode.c8
-rw-r--r--fs/internal.h2
-rw-r--r--fs/jbd/journal.c17
-rw-r--r--fs/lockd/svclock.c6
-rw-r--r--fs/ocfs2/alloc.c3
-rw-r--r--fs/ocfs2/dcache.c42
-rw-r--r--fs/ocfs2/dcache.h9
-rw-r--r--fs/ocfs2/dlmglue.c4
-rw-r--r--fs/ocfs2/ocfs2.h6
-rw-r--r--fs/ocfs2/quota_global.c4
-rw-r--r--fs/ocfs2/super.c3
-rw-r--r--fs/ocfs2/xattr.c17
-rw-r--r--fs/seq_file.c115
-rw-r--r--fs/super.c4
-rw-r--r--fs/ubifs/budget.c35
-rw-r--r--fs/ubifs/debug.c122
-rw-r--r--fs/ubifs/debug.h36
-rw-r--r--fs/ubifs/dir.c96
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/ubifs/gc.c28
-rw-r--r--fs/ubifs/io.c22
-rw-r--r--fs/ubifs/journal.c2
-rw-r--r--fs/ubifs/lprops.c12
-rw-r--r--fs/ubifs/lpt_commit.c44
-rw-r--r--fs/ubifs/master.c2
-rw-r--r--fs/ubifs/orphan.c38
-rw-r--r--fs/ubifs/super.c195
-rw-r--r--fs/ubifs/tnc.c12
-rw-r--r--fs/ubifs/ubifs.h26
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c6
-rw-r--r--fs/xfs/xfs_dfrag.c10
-rw-r--r--fs/xfs/xfs_log_recover.c31
66 files changed, 2037 insertions, 876 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e3ff2b9e602..33b7235f853 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1208 * check for an ELF header. If we find one, dump the first page to 1208 * check for an ELF header. If we find one, dump the first page to
1209 * aid in determining what was mapped here. 1209 * aid in determining what was mapped here.
1210 */ 1210 */
1211 if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) { 1211 if (FILTER(ELF_HEADERS) &&
1212 vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
1212 u32 __user *header = (u32 __user *) vma->vm_start; 1213 u32 __user *header = (u32 __user *) vma->vm_start;
1213 u32 word; 1214 u32 word;
1215 mm_segment_t fs = get_fs();
1214 /* 1216 /*
1215 * Doing it this way gets the constant folded by GCC. 1217 * Doing it this way gets the constant folded by GCC.
1216 */ 1218 */
@@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1223 magic.elfmag[EI_MAG1] = ELFMAG1; 1225 magic.elfmag[EI_MAG1] = ELFMAG1;
1224 magic.elfmag[EI_MAG2] = ELFMAG2; 1226 magic.elfmag[EI_MAG2] = ELFMAG2;
1225 magic.elfmag[EI_MAG3] = ELFMAG3; 1227 magic.elfmag[EI_MAG3] = ELFMAG3;
1226 if (get_user(word, header) == 0 && word == magic.cmp) 1228 /*
1229 * Switch to the user "segment" for get_user(),
1230 * then put back what elf_core_dump() had in place.
1231 */
1232 set_fs(USER_DS);
1233 if (unlikely(get_user(word, header)))
1234 word = 0;
1235 set_fs(fs);
1236 if (word == magic.cmp)
1227 return PAGE_SIZE; 1237 return PAGE_SIZE;
1228 } 1238 }
1229 1239
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf999ea1..7bb3c020e57 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
16 module will be called btrfs. 16 module will be called btrfs.
17 17
18 If unsure, say N. 18 If unsure, say N.
19
20config BTRFS_FS_POSIX_ACL
21 bool "Btrfs POSIX Access Control Lists"
22 depends on BTRFS_FS
23 select FS_POSIX_ACL
24 help
25 POSIX Access Control Lists (ACLs) support permissions for users and
26 groups beyond the owner/group/world scheme.
27
28 To learn more about Access Control Lists, visit the POSIX ACLs for
29 Linux website <http://acl.bestbits.at/>.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe..c84ca1f5259 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/kthread.h> 19#include <linux/kthread.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/spinlock.h> 21#include <linux/spinlock.h>
23# include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 24#include "async-thread.h"
25 25
26#define WORK_QUEUED_BIT 0 26#define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
143 struct btrfs_work *work; 143 struct btrfs_work *work;
144 do { 144 do {
145 spin_lock_irq(&worker->lock); 145 spin_lock_irq(&worker->lock);
146again_locked:
146 while (!list_empty(&worker->pending)) { 147 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next; 148 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list); 149 work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
165 check_idle_worker(worker); 166 check_idle_worker(worker);
166 167
167 } 168 }
168 worker->working = 0;
169 if (freezing(current)) { 169 if (freezing(current)) {
170 worker->working = 0;
171 spin_unlock_irq(&worker->lock);
170 refrigerator(); 172 refrigerator();
171 } else { 173 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock); 174 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop()) 175 if (!kthread_should_stop()) {
176 cpu_relax();
177 /*
178 * we've dropped the lock, did someone else
179 * jump_in?
180 */
181 smp_mb();
182 if (!list_empty(&worker->pending))
183 continue;
184
185 /*
186 * this short schedule allows more work to
187 * come in without the queue functions
188 * needing to go through wake_up_process()
189 *
190 * worker->working is still 1, so nobody
191 * is going to try and wake us up
192 */
193 schedule_timeout(1);
194 smp_mb();
195 if (!list_empty(&worker->pending))
196 continue;
197
198 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending))
202 goto again_locked;
203
204 /*
205 * this makes sure we get a wakeup when someone
206 * adds something new to the queue
207 */
208 worker->working = 0;
209 spin_unlock_irq(&worker->lock);
210
175 schedule(); 211 schedule();
212 }
176 __set_current_state(TASK_RUNNING); 213 __set_current_state(TASK_RUNNING);
177 } 214 }
178 } while (!kthread_should_stop()); 215 } while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
350{ 387{
351 struct btrfs_worker_thread *worker = work->worker; 388 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags; 389 unsigned long flags;
390 int wake = 0;
353 391
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 392 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out; 393 goto out;
356 394
357 spin_lock_irqsave(&worker->lock, flags); 395 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending); 396 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending);
360 398
361 /* by definition we're busy, take ourselves off the idle 399 /* by definition we're busy, take ourselves off the idle
362 * list 400 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
368 &worker->workers->worker_list); 406 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags); 407 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 } 408 }
409 if (!worker->working) {
410 wake = 1;
411 worker->working = 1;
412 }
371 413
372 spin_unlock_irqrestore(&worker->lock, flags); 414 spin_unlock_irqrestore(&worker->lock, flags);
373 415 if (wake)
416 wake_up_process(worker->task);
374out: 417out:
418
375 return 0; 419 return 0;
376} 420}
377 421
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
398 } 442 }
399 443
400 spin_lock_irqsave(&worker->lock, flags); 444 spin_lock_irqsave(&worker->lock, flags);
445
446 list_add_tail(&work->list, &worker->pending);
401 atomic_inc(&worker->num_pending); 447 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker); 448 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404 449
405 /* 450 /*
406 * avoid calling into wake_up_process if this thread has already 451 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d..ab07627084f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h> 35#include <linux/pagevec.h>
37#include "compat.h" 36#include "compat.h"
38#include "ctree.h" 37#include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c077681..35443cc4b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
54 return path; 54 return path;
55} 55}
56 56
57/*
58 * set all locked nodes in the path to blocking locks. This should
59 * be done before scheduling
60 */
61noinline void btrfs_set_path_blocking(struct btrfs_path *p)
62{
63 int i;
64 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
65 if (p->nodes[i] && p->locks[i])
66 btrfs_set_lock_blocking(p->nodes[i]);
67 }
68}
69
70/*
71 * reset all the locked nodes in the patch to spinning locks.
72 */
73noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
74{
75 int i;
76 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
77 if (p->nodes[i] && p->locks[i])
78 btrfs_clear_lock_blocking(p->nodes[i]);
79 }
80}
81
57/* this also releases the path */ 82/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 83void btrfs_free_path(struct btrfs_path *p)
59{ 84{
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 297 if (IS_ERR(cow))
273 return PTR_ERR(cow); 298 return PTR_ERR(cow);
274 299
300 /* cow is set to blocking by btrfs_init_new_buffer */
301
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 302 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 303 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 304 btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +415,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
388 WARN_ON(1); 415 WARN_ON(1);
389 } 416 }
390 417
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid && 418 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid && 419 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 420 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf; 421 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest); 422 WARN_ON(prealloc_dest);
398 return 0; 423 return 0;
399 } 424 }
400 spin_unlock(&root->fs_info->hash_lock); 425
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 426 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
427
428 if (parent)
429 btrfs_set_lock_blocking(parent);
430 btrfs_set_lock_blocking(buf);
431
402 ret = __btrfs_cow_block(trans, root, buf, parent, 432 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0, 433 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest); 434 prealloc_dest);
@@ -504,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
504 if (parent_nritems == 1) 534 if (parent_nritems == 1)
505 return 0; 535 return 0;
506 536
537 btrfs_set_lock_blocking(parent);
538
507 for (i = start_slot; i < end_slot; i++) { 539 for (i = start_slot; i < end_slot; i++) {
508 int close = 1; 540 int close = 1;
509 541
@@ -564,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
564 search_start = last_block; 596 search_start = last_block;
565 597
566 btrfs_tree_lock(cur); 598 btrfs_tree_lock(cur);
599 btrfs_set_lock_blocking(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i, 600 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start, 601 &cur, search_start,
569 min(16 * blocksize, 602 min(16 * blocksize,
@@ -862,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 return 0; 895 return 0;
863 896
864 mid = path->nodes[level]; 897 mid = path->nodes[level];
898
865 WARN_ON(!path->locks[level]); 899 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 900 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 901
@@ -884,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
884 /* promote the child to a root */ 918 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0); 919 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child); 920 btrfs_tree_lock(child);
921 btrfs_set_lock_blocking(child);
887 BUG_ON(!child); 922 BUG_ON(!child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 923 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret); 924 BUG_ON(ret);
@@ -900,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
900 935
901 add_root_to_dirty_list(root); 936 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child); 937 btrfs_tree_unlock(child);
938
903 path->locks[level] = 0; 939 path->locks[level] = 0;
904 path->nodes[level] = NULL; 940 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid); 941 clean_tree_block(trans, root, mid);
@@ -924,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
924 left = read_node_slot(root, parent, pslot - 1); 960 left = read_node_slot(root, parent, pslot - 1);
925 if (left) { 961 if (left) {
926 btrfs_tree_lock(left); 962 btrfs_tree_lock(left);
963 btrfs_set_lock_blocking(left);
927 wret = btrfs_cow_block(trans, root, left, 964 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0); 965 parent, pslot - 1, &left, 0);
929 if (wret) { 966 if (wret) {
@@ -934,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
934 right = read_node_slot(root, parent, pslot + 1); 971 right = read_node_slot(root, parent, pslot + 1);
935 if (right) { 972 if (right) {
936 btrfs_tree_lock(right); 973 btrfs_tree_lock(right);
974 btrfs_set_lock_blocking(right);
937 wret = btrfs_cow_block(trans, root, right, 975 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0); 976 parent, pslot + 1, &right, 0);
939 if (wret) { 977 if (wret) {
@@ -1109,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 u32 left_nr; 1147 u32 left_nr;
1110 1148
1111 btrfs_tree_lock(left); 1149 btrfs_tree_lock(left);
1150 btrfs_set_lock_blocking(left);
1151
1112 left_nr = btrfs_header_nritems(left); 1152 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1153 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1; 1154 wret = 1;
@@ -1155,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1155 */ 1195 */
1156 if (right) { 1196 if (right) {
1157 u32 right_nr; 1197 u32 right_nr;
1198
1158 btrfs_tree_lock(right); 1199 btrfs_tree_lock(right);
1200 btrfs_set_lock_blocking(right);
1201
1159 right_nr = btrfs_header_nritems(right); 1202 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1203 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1; 1204 wret = 1;
@@ -1210,8 +1253,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1210 struct btrfs_disk_key disk_key; 1253 struct btrfs_disk_key disk_key;
1211 u32 nritems; 1254 u32 nritems;
1212 u64 search; 1255 u64 search;
1213 u64 lowest_read; 1256 u64 target;
1214 u64 highest_read;
1215 u64 nread = 0; 1257 u64 nread = 0;
1216 int direction = path->reada; 1258 int direction = path->reada;
1217 struct extent_buffer *eb; 1259 struct extent_buffer *eb;
@@ -1235,8 +1277,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1235 return; 1277 return;
1236 } 1278 }
1237 1279
1238 highest_read = search; 1280 target = search;
1239 lowest_read = search;
1240 1281
1241 nritems = btrfs_header_nritems(node); 1282 nritems = btrfs_header_nritems(node);
1242 nr = slot; 1283 nr = slot;
@@ -1256,27 +1297,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
1256 break; 1297 break;
1257 } 1298 }
1258 search = btrfs_node_blockptr(node, nr); 1299 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) || 1300 if ((search <= target && target - search <= 65536) ||
1260 (search < lowest_read && lowest_read - search <= 16384) || 1301 (search > target && search - target <= 65536)) {
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize, 1302 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr)); 1303 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize; 1304 nread += blocksize;
1265 } 1305 }
1266 nscan++; 1306 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) 1307 if ((nread > 65536 || nscan > 32))
1268 break; 1308 break;
1309 }
1310}
1269 1311
1270 if (nread > (256 * 1024) || nscan > 128) 1312/*
1271 break; 1313 * returns -EAGAIN if it had to drop the path, or zero if everything was in
1314 * cache
1315 */
1316static noinline int reada_for_balance(struct btrfs_root *root,
1317 struct btrfs_path *path, int level)
1318{
1319 int slot;
1320 int nritems;
1321 struct extent_buffer *parent;
1322 struct extent_buffer *eb;
1323 u64 gen;
1324 u64 block1 = 0;
1325 u64 block2 = 0;
1326 int ret = 0;
1327 int blocksize;
1272 1328
1273 if (search < lowest_read) 1329 parent = path->nodes[level - 1];
1274 lowest_read = search; 1330 if (!parent)
1275 if (search > highest_read) 1331 return 0;
1276 highest_read = search; 1332
1333 nritems = btrfs_header_nritems(parent);
1334 slot = path->slots[level];
1335 blocksize = btrfs_level_size(root, level);
1336
1337 if (slot > 0) {
1338 block1 = btrfs_node_blockptr(parent, slot - 1);
1339 gen = btrfs_node_ptr_generation(parent, slot - 1);
1340 eb = btrfs_find_tree_block(root, block1, blocksize);
1341 if (eb && btrfs_buffer_uptodate(eb, gen))
1342 block1 = 0;
1343 free_extent_buffer(eb);
1344 }
1345 if (slot < nritems) {
1346 block2 = btrfs_node_blockptr(parent, slot + 1);
1347 gen = btrfs_node_ptr_generation(parent, slot + 1);
1348 eb = btrfs_find_tree_block(root, block2, blocksize);
1349 if (eb && btrfs_buffer_uptodate(eb, gen))
1350 block2 = 0;
1351 free_extent_buffer(eb);
1352 }
1353 if (block1 || block2) {
1354 ret = -EAGAIN;
1355 btrfs_release_path(root, path);
1356 if (block1)
1357 readahead_tree_block(root, block1, blocksize, 0);
1358 if (block2)
1359 readahead_tree_block(root, block2, blocksize, 0);
1360
1361 if (block1) {
1362 eb = read_tree_block(root, block1, blocksize, 0);
1363 free_extent_buffer(eb);
1364 }
1365 if (block1) {
1366 eb = read_tree_block(root, block2, blocksize, 0);
1367 free_extent_buffer(eb);
1368 }
1277 } 1369 }
1370 return ret;
1278} 1371}
1279 1372
1373
1280/* 1374/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers 1375 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because 1376 * in the tree. The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1328} 1422}
1329 1423
1330/* 1424/*
1425 * This releases any locks held in the path starting at level and
1426 * going all the way up to the root.
1427 *
1428 * btrfs_search_slot will keep the lock held on higher nodes in a few
1429 * corner cases, such as COW of the block at slot zero in the node. This
1430 * ignores those rules, and it should only be called when there are no
1431 * more updates to be done higher up in the tree.
1432 */
1433noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1434{
1435 int i;
1436
1437 if (path->keep_locks || path->lowest_level)
1438 return;
1439
1440 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1441 if (!path->nodes[i])
1442 continue;
1443 if (!path->locks[i])
1444 continue;
1445 btrfs_tree_unlock(path->nodes[i]);
1446 path->locks[i] = 0;
1447 }
1448}
1449
1450/*
1331 * look for key in the tree. path is filled in with nodes along the way 1451 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf 1452 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0) 1453 * level of the path (level 0)
@@ -1387,32 +1507,30 @@ again:
1387 int wret; 1507 int wret;
1388 1508
1389 /* is a cow on this block not required */ 1509 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid && 1510 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid && 1511 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1512 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done; 1513 goto cow_done;
1396 } 1514 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398 1515
1399 /* ok, we have to cow, is our old prealloc the right 1516 /* ok, we have to cow, is our old prealloc the right
1400 * size? 1517 * size?
1401 */ 1518 */
1402 if (prealloc_block.objectid && 1519 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) { 1520 prealloc_block.offset != b->len) {
1521 btrfs_release_path(root, p);
1404 btrfs_free_reserved_extent(root, 1522 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid, 1523 prealloc_block.objectid,
1406 prealloc_block.offset); 1524 prealloc_block.offset);
1407 prealloc_block.objectid = 0; 1525 prealloc_block.objectid = 0;
1526 goto again;
1408 } 1527 }
1409 1528
1410 /* 1529 /*
1411 * for higher level blocks, try not to allocate blocks 1530 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held. 1531 * with the block and the parent locks held.
1413 */ 1532 */
1414 if (level > 1 && !prealloc_block.objectid && 1533 if (level > 0 && !prealloc_block.objectid) {
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len; 1534 u32 size = b->len;
1417 u64 hint = b->start; 1535 u64 hint = b->start;
1418 1536
@@ -1425,6 +1543,8 @@ again:
1425 goto again; 1543 goto again;
1426 } 1544 }
1427 1545
1546 btrfs_set_path_blocking(p);
1547
1428 wret = btrfs_cow_block(trans, root, b, 1548 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1], 1549 p->nodes[level + 1],
1430 p->slots[level + 1], 1550 p->slots[level + 1],
@@ -1446,6 +1566,22 @@ cow_done:
1446 if (!p->skip_locking) 1566 if (!p->skip_locking)
1447 p->locks[level] = 1; 1567 p->locks[level] = 1;
1448 1568
1569 btrfs_clear_path_blocking(p);
1570
1571 /*
1572 * we have a lock on b and as long as we aren't changing
1573 * the tree, there is no way to for the items in b to change.
1574 * It is safe to drop the lock on our parent before we
1575 * go through the expensive btree search on b.
1576 *
1577 * If cow is true, then we might be changing slot zero,
1578 * which may require changing the parent. So, we can't
1579 * drop the lock until after we know which slot we're
1580 * operating on.
1581 */
1582 if (!cow)
1583 btrfs_unlock_up_safe(p, level + 1);
1584
1449 ret = check_block(root, p, level); 1585 ret = check_block(root, p, level);
1450 if (ret) { 1586 if (ret) {
1451 ret = -1; 1587 ret = -1;
@@ -1453,6 +1589,7 @@ cow_done:
1453 } 1589 }
1454 1590
1455 ret = bin_search(b, key, level, &slot); 1591 ret = bin_search(b, key, level, &slot);
1592
1456 if (level != 0) { 1593 if (level != 0) {
1457 if (ret && slot > 0) 1594 if (ret && slot > 0)
1458 slot -= 1; 1595 slot -= 1;
@@ -1460,7 +1597,16 @@ cow_done:
1460 if ((p->search_for_split || ins_len > 0) && 1597 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >= 1598 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1599 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level); 1600 int sret;
1601
1602 sret = reada_for_balance(root, p, level);
1603 if (sret)
1604 goto again;
1605
1606 btrfs_set_path_blocking(p);
1607 sret = split_node(trans, root, p, level);
1608 btrfs_clear_path_blocking(p);
1609
1464 BUG_ON(sret > 0); 1610 BUG_ON(sret > 0);
1465 if (sret) { 1611 if (sret) {
1466 ret = sret; 1612 ret = sret;
@@ -1468,9 +1614,19 @@ cow_done:
1468 } 1614 }
1469 b = p->nodes[level]; 1615 b = p->nodes[level];
1470 slot = p->slots[level]; 1616 slot = p->slots[level];
1471 } else if (ins_len < 0) { 1617 } else if (ins_len < 0 &&
1472 int sret = balance_level(trans, root, p, 1618 btrfs_header_nritems(b) <
1473 level); 1619 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1620 int sret;
1621
1622 sret = reada_for_balance(root, p, level);
1623 if (sret)
1624 goto again;
1625
1626 btrfs_set_path_blocking(p);
1627 sret = balance_level(trans, root, p, level);
1628 btrfs_clear_path_blocking(p);
1629
1474 if (sret) { 1630 if (sret) {
1475 ret = sret; 1631 ret = sret;
1476 goto done; 1632 goto done;
@@ -1504,7 +1660,7 @@ cow_done:
1504 * of the btree by dropping locks before 1660 * of the btree by dropping locks before
1505 * we read. 1661 * we read.
1506 */ 1662 */
1507 if (level > 1) { 1663 if (level > 0) {
1508 btrfs_release_path(NULL, p); 1664 btrfs_release_path(NULL, p);
1509 if (tmp) 1665 if (tmp)
1510 free_extent_buffer(tmp); 1666 free_extent_buffer(tmp);
@@ -1519,6 +1675,7 @@ cow_done:
1519 free_extent_buffer(tmp); 1675 free_extent_buffer(tmp);
1520 goto again; 1676 goto again;
1521 } else { 1677 } else {
1678 btrfs_set_path_blocking(p);
1522 if (tmp) 1679 if (tmp)
1523 free_extent_buffer(tmp); 1680 free_extent_buffer(tmp);
1524 if (should_reada) 1681 if (should_reada)
@@ -1528,14 +1685,29 @@ cow_done:
1528 b = read_node_slot(root, b, slot); 1685 b = read_node_slot(root, b, slot);
1529 } 1686 }
1530 } 1687 }
1531 if (!p->skip_locking) 1688 if (!p->skip_locking) {
1532 btrfs_tree_lock(b); 1689 int lret;
1690
1691 btrfs_clear_path_blocking(p);
1692 lret = btrfs_try_spin_lock(b);
1693
1694 if (!lret) {
1695 btrfs_set_path_blocking(p);
1696 btrfs_tree_lock(b);
1697 btrfs_clear_path_blocking(p);
1698 }
1699 }
1533 } else { 1700 } else {
1534 p->slots[level] = slot; 1701 p->slots[level] = slot;
1535 if (ins_len > 0 && 1702 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) { 1703 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key, 1704 int sret;
1705
1706 btrfs_set_path_blocking(p);
1707 sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0); 1708 p, ins_len, ret == 0);
1709 btrfs_clear_path_blocking(p);
1710
1539 BUG_ON(sret > 0); 1711 BUG_ON(sret > 0);
1540 if (sret) { 1712 if (sret) {
1541 ret = sret; 1713 ret = sret;
@@ -1549,12 +1721,16 @@ cow_done:
1549 } 1721 }
1550 ret = 1; 1722 ret = 1;
1551done: 1723done:
1724 /*
1725 * we don't really know what they plan on doing with the path
1726 * from here on, so for now just mark it as blocking
1727 */
1728 btrfs_set_path_blocking(p);
1552 if (prealloc_block.objectid) { 1729 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root, 1730 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid, 1731 prealloc_block.objectid,
1555 prealloc_block.offset); 1732 prealloc_block.offset);
1556 } 1733 }
1557
1558 return ret; 1734 return ret;
1559} 1735}
1560 1736
@@ -1578,6 +1754,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1754 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret); 1755 BUG_ON(ret);
1580 1756
1757 btrfs_set_lock_blocking(eb);
1758
1581 parent = eb; 1759 parent = eb;
1582 while (1) { 1760 while (1) {
1583 level = btrfs_header_level(parent); 1761 level = btrfs_header_level(parent);
@@ -1602,6 +1780,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1602 eb = read_tree_block(root, bytenr, blocksize, 1780 eb = read_tree_block(root, bytenr, blocksize,
1603 generation); 1781 generation);
1604 btrfs_tree_lock(eb); 1782 btrfs_tree_lock(eb);
1783 btrfs_set_lock_blocking(eb);
1605 } 1784 }
1606 1785
1607 /* 1786 /*
@@ -1626,6 +1805,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1626 eb = read_tree_block(root, bytenr, blocksize, 1805 eb = read_tree_block(root, bytenr, blocksize,
1627 generation); 1806 generation);
1628 btrfs_tree_lock(eb); 1807 btrfs_tree_lock(eb);
1808 btrfs_set_lock_blocking(eb);
1629 } 1809 }
1630 1810
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1811 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2172,6 +2352,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2172 2352
2173 right = read_node_slot(root, upper, slot + 1); 2353 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right); 2354 btrfs_tree_lock(right);
2355 btrfs_set_lock_blocking(right);
2356
2175 free_space = btrfs_leaf_free_space(root, right); 2357 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size) 2358 if (free_space < data_size)
2177 goto out_unlock; 2359 goto out_unlock;
@@ -2367,6 +2549,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2367 2549
2368 left = read_node_slot(root, path->nodes[1], slot - 1); 2550 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left); 2551 btrfs_tree_lock(left);
2552 btrfs_set_lock_blocking(left);
2553
2370 free_space = btrfs_leaf_free_space(root, left); 2554 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) { 2555 if (free_space < data_size) {
2372 ret = 1; 2556 ret = 1;
@@ -2825,6 +3009,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2825 path->keep_locks = 0; 3009 path->keep_locks = 0;
2826 BUG_ON(ret); 3010 BUG_ON(ret);
2827 3011
3012 /*
3013 * make sure any changes to the path from split_leaf leave it
3014 * in a blocking state
3015 */
3016 btrfs_set_path_blocking(path);
3017
2828 leaf = path->nodes[0]; 3018 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3019 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 3020
@@ -3354,6 +3544,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3354 BUG(); 3544 BUG();
3355 } 3545 }
3356out: 3546out:
3547 btrfs_unlock_up_safe(path, 1);
3357 return ret; 3548 return ret;
3358} 3549}
3359 3550
@@ -3441,15 +3632,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3441{ 3632{
3442 int ret; 3633 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3634 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3635 u64 parent_start = path->nodes[1]->start;
3636 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3444 3637
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3638 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret) 3639 if (ret)
3447 return ret; 3640 return ret;
3448 3641
3642 /*
3643 * btrfs_free_extent is expensive, we want to make sure we
3644 * aren't holding any locks when we call it
3645 */
3646 btrfs_unlock_up_safe(path, 0);
3647
3449 ret = btrfs_free_extent(trans, root, bytenr, 3648 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0), 3649 btrfs_level_size(root, 0),
3451 path->nodes[1]->start, 3650 parent_start, parent_owner,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1); 3651 root_gen, 0, 1);
3454 return ret; 3652 return ret;
3455} 3653}
@@ -3721,12 +3919,14 @@ find_next_key:
3721 */ 3919 */
3722 if (slot >= nritems) { 3920 if (slot >= nritems) {
3723 path->slots[level] = slot; 3921 path->slots[level] = slot;
3922 btrfs_set_path_blocking(path);
3724 sret = btrfs_find_next_key(root, path, min_key, level, 3923 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans); 3924 cache_only, min_trans);
3726 if (sret == 0) { 3925 if (sret == 0) {
3727 btrfs_release_path(root, path); 3926 btrfs_release_path(root, path);
3728 goto again; 3927 goto again;
3729 } else { 3928 } else {
3929 btrfs_clear_path_blocking(path);
3730 goto out; 3930 goto out;
3731 } 3931 }
3732 } 3932 }
@@ -3738,16 +3938,20 @@ find_next_key:
3738 unlock_up(path, level, 1); 3938 unlock_up(path, level, 1);
3739 goto out; 3939 goto out;
3740 } 3940 }
3941 btrfs_set_path_blocking(path);
3741 cur = read_node_slot(root, cur, slot); 3942 cur = read_node_slot(root, cur, slot);
3742 3943
3743 btrfs_tree_lock(cur); 3944 btrfs_tree_lock(cur);
3945
3744 path->locks[level - 1] = 1; 3946 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur; 3947 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1); 3948 unlock_up(path, level, 1);
3949 btrfs_clear_path_blocking(path);
3747 } 3950 }
3748out: 3951out:
3749 if (ret == 0) 3952 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key)); 3953 memcpy(min_key, &found_key, sizeof(found_key));
3954 btrfs_set_path_blocking(path);
3751 return ret; 3955 return ret;
3752} 3956}
3753 3957
@@ -3843,6 +4047,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3843 if (ret < 0) 4047 if (ret < 0)
3844 return ret; 4048 return ret;
3845 4049
4050 btrfs_set_path_blocking(path);
3846 nritems = btrfs_header_nritems(path->nodes[0]); 4051 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /* 4052 /*
3848 * by releasing the path above we dropped all our locks. A balance 4053 * by releasing the path above we dropped all our locks. A balance
@@ -3873,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3873 free_extent_buffer(next); 4078 free_extent_buffer(next);
3874 } 4079 }
3875 4080
4081 /* the path was set to blocking above */
3876 if (level == 1 && (path->locks[1] || path->skip_locking) && 4082 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada) 4083 path->reada)
3878 reada_for_search(root, path, level, slot, 0); 4084 reada_for_search(root, path, level, slot, 0);
@@ -3881,6 +4087,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3881 if (!path->skip_locking) { 4087 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c)); 4088 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next); 4089 btrfs_tree_lock(next);
4090 btrfs_set_lock_blocking(next);
3884 } 4091 }
3885 break; 4092 break;
3886 } 4093 }
@@ -3897,12 +4104,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3897 path->locks[level] = 1; 4104 path->locks[level] = 1;
3898 if (!level) 4105 if (!level)
3899 break; 4106 break;
4107
4108 btrfs_set_path_blocking(path);
3900 if (level == 1 && path->locks[1] && path->reada) 4109 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0); 4110 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0); 4111 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) { 4112 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4113 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next); 4114 btrfs_tree_lock(next);
4115 btrfs_set_lock_blocking(next);
3906 } 4116 }
3907 } 4117 }
3908done: 4118done:
@@ -3927,6 +4137,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3927 4137
3928 while (1) { 4138 while (1) {
3929 if (path->slots[0] == 0) { 4139 if (path->slots[0] == 0) {
4140 btrfs_set_path_blocking(path);
3930 ret = btrfs_prev_leaf(root, path); 4141 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0) 4142 if (ret != 0)
3932 return ret; 4143 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f8811..531db112c8b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -454,17 +454,11 @@ struct btrfs_timespec {
454 __le32 nsec; 454 __le32 nsec;
455} __attribute__ ((__packed__)); 455} __attribute__ ((__packed__));
456 456
457typedef enum { 457enum btrfs_compression_type {
458 BTRFS_COMPRESS_NONE = 0, 458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1, 459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2, 460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type; 461};
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468 462
469struct btrfs_inode_item { 463struct btrfs_inode_item {
470 /* nfs style generation number */ 464 /* nfs style generation number */
@@ -701,9 +695,7 @@ struct btrfs_fs_info {
701 struct btrfs_transaction *running_transaction; 695 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle; 696 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait; 697 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait; 698 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707 699
708 struct btrfs_super_block super_copy; 700 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit; 701 struct btrfs_super_block super_for_commit;
@@ -711,7 +703,6 @@ struct btrfs_fs_info {
711 struct super_block *sb; 703 struct super_block *sb;
712 struct inode *btree_inode; 704 struct inode *btree_inode;
713 struct backing_dev_info bdi; 705 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex; 706 struct mutex trans_mutex;
716 struct mutex tree_log_mutex; 707 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex; 708 struct mutex transaction_kthread_mutex;
@@ -730,10 +721,6 @@ struct btrfs_fs_info {
730 atomic_t async_submit_draining; 721 atomic_t async_submit_draining;
731 atomic_t nr_async_bios; 722 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages; 723 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737 724
738 /* 725 /*
739 * this is used by the balancing code to wait for all the pending 726 * this is used by the balancing code to wait for all the pending
@@ -833,7 +820,14 @@ struct btrfs_root {
833 struct kobject root_kobj; 820 struct kobject root_kobj;
834 struct completion kobj_unregister; 821 struct completion kobj_unregister;
835 struct mutex objectid_mutex; 822 struct mutex objectid_mutex;
823
836 struct mutex log_mutex; 824 struct mutex log_mutex;
825 wait_queue_head_t log_writer_wait;
826 wait_queue_head_t log_commit_wait[2];
827 atomic_t log_writers;
828 atomic_t log_commit[2];
829 unsigned long log_transid;
830 unsigned long log_batch;
837 831
838 u64 objectid; 832 u64 objectid;
839 u64 last_trans; 833 u64 last_trans;
@@ -1841,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void); 1835struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p); 1836void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p); 1837void btrfs_init_path(struct btrfs_path *p);
1838void btrfs_set_path_blocking(struct btrfs_path *p);
1839void btrfs_clear_path_blocking(struct btrfs_path *p);
1840void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1841
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1842int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr); 1843 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1844int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae..5aebddd7119 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/scatterlist.h> 21#include <linux/scatterlist.h>
@@ -800,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 799 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801 800
802 if (ret == 0) 801 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE; 802 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
804 else 803 else
805 WARN_ON(1); 804 WARN_ON(1);
806 return buf; 805 return buf;
@@ -814,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
814 if (btrfs_header_generation(buf) == 813 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) { 814 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf)); 815 WARN_ON(!btrfs_tree_locked(buf));
816
817 /* ugh, clear_extent_buffer_dirty can be expensive */
818 btrfs_set_lock_blocking(buf);
819
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 820 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf); 821 buf);
819 } 822 }
@@ -850,6 +853,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
850 spin_lock_init(&root->list_lock); 853 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex); 854 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex); 855 mutex_init(&root->log_mutex);
856 init_waitqueue_head(&root->log_writer_wait);
857 init_waitqueue_head(&root->log_commit_wait[0]);
858 init_waitqueue_head(&root->log_commit_wait[1]);
859 atomic_set(&root->log_commit[0], 0);
860 atomic_set(&root->log_commit[1], 0);
861 atomic_set(&root->log_writers, 0);
862 root->log_batch = 0;
863 root->log_transid = 0;
853 extent_io_tree_init(&root->dirty_log_pages, 864 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS); 865 fs_info->btree_inode->i_mapping, GFP_NOFS);
855 866
@@ -934,15 +945,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
934 return 0; 945 return 0;
935} 946}
936 947
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 948static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info) 949 struct btrfs_fs_info *fs_info)
939{ 950{
940 struct btrfs_root *root; 951 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root; 952 struct btrfs_root *tree_root = fs_info->tree_root;
953 struct extent_buffer *leaf;
942 954
943 root = kzalloc(sizeof(*root), GFP_NOFS); 955 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root) 956 if (!root)
945 return -ENOMEM; 957 return ERR_PTR(-ENOMEM);
946 958
947 __setup_root(tree_root->nodesize, tree_root->leafsize, 959 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize, 960 tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +963,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 963 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 964 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 965 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
966 /*
967 * log trees do not get reference counted because they go away
968 * before a real commit is actually done. They do store pointers
969 * to file data extents, and those reference counts still get
970 * updated (along with back refs to the log tree).
971 */
954 root->ref_cows = 0; 972 root->ref_cows = 0;
955 973
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize, 974 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID, 975 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0); 976 trans->transid, 0, 0, 0);
977 if (IS_ERR(leaf)) {
978 kfree(root);
979 return ERR_CAST(leaf);
980 }
959 981
982 root->node = leaf;
960 btrfs_set_header_nritems(root->node, 0); 983 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0); 984 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start); 985 btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +991,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
968 BTRFS_FSID_SIZE); 991 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node); 992 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node); 993 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root; 994 return root;
995}
996
997int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
998 struct btrfs_fs_info *fs_info)
999{
1000 struct btrfs_root *log_root;
1001
1002 log_root = alloc_log_tree(trans, fs_info);
1003 if (IS_ERR(log_root))
1004 return PTR_ERR(log_root);
1005 WARN_ON(fs_info->log_root_tree);
1006 fs_info->log_root_tree = log_root;
1007 return 0;
1008}
1009
1010int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1011 struct btrfs_root *root)
1012{
1013 struct btrfs_root *log_root;
1014 struct btrfs_inode_item *inode_item;
1015
1016 log_root = alloc_log_tree(trans, root->fs_info);
1017 if (IS_ERR(log_root))
1018 return PTR_ERR(log_root);
1019
1020 log_root->last_trans = trans->transid;
1021 log_root->root_key.offset = root->root_key.objectid;
1022
1023 inode_item = &log_root->root_item.inode;
1024 inode_item->generation = cpu_to_le64(1);
1025 inode_item->size = cpu_to_le64(3);
1026 inode_item->nlink = cpu_to_le32(1);
1027 inode_item->nbytes = cpu_to_le64(root->leafsize);
1028 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1029
1030 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
1031 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1032
1033 WARN_ON(root->log_root);
1034 root->log_root = log_root;
1035 root->log_transid = 0;
972 return 0; 1036 return 0;
973} 1037}
974 1038
@@ -1136,7 +1200,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{ 1200{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1201 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0; 1202 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device; 1203 struct btrfs_device *device;
1141 struct backing_dev_info *bdi; 1204 struct backing_dev_info *bdi;
1142#if 0 1205#if 0
@@ -1144,8 +1207,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1144 btrfs_congested_async(info, 0)) 1207 btrfs_congested_async(info, 0))
1145 return 1; 1208 return 1;
1146#endif 1209#endif
1147 list_for_each(cur, &info->fs_devices->devices) { 1210 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev) 1211 if (!device->bdev)
1150 continue; 1212 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev); 1213 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1225,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1163 */ 1225 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1226static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{ 1227{
1166 struct list_head *cur;
1167 struct btrfs_device *device; 1228 struct btrfs_device *device;
1168 struct btrfs_fs_info *info; 1229 struct btrfs_fs_info *info;
1169 1230
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1231 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) { 1232 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev) 1233 if (!device->bdev)
1174 continue; 1234 continue;
1175 1235
@@ -1447,7 +1507,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1447 INIT_LIST_HEAD(&fs_info->dead_roots); 1507 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers); 1508 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1509 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock); 1510 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock); 1511 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock); 1512 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1594,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1535 init_waitqueue_head(&fs_info->transaction_throttle); 1594 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait); 1595 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait); 1596 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542 1597
1543 __setup_root(4096, 4096, 4096, 4096, tree_root, 1598 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1599 fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 * low idle thresh 1682 * low idle thresh
1628 */ 1683 */
1629 fs_info->endio_workers.idle_thresh = 4; 1684 fs_info->endio_workers.idle_thresh = 4;
1685 fs_info->endio_meta_workers.idle_thresh = 4;
1686
1630 fs_info->endio_write_workers.idle_thresh = 64; 1687 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64; 1688 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632 1689
@@ -1740,13 +1797,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1797 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1798 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner"); 1799 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread) 1800 if (IS_ERR(fs_info->cleaner_kthread))
1744 goto fail_csum_root; 1801 goto fail_csum_root;
1745 1802
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1803 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root, 1804 tree_root,
1748 "btrfs-transaction"); 1805 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread) 1806 if (IS_ERR(fs_info->transaction_kthread))
1750 goto fail_cleaner; 1807 goto fail_cleaner;
1751 1808
1752 if (btrfs_super_log_root(disk_super) != 0) { 1809 if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1885,14 @@ fail_sb_buffer:
1828fail_iput: 1885fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1886 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode); 1887 iput(fs_info->btree_inode);
1831fail: 1888
1832 btrfs_close_devices(fs_info->fs_devices); 1889 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1890 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1891 bdi_destroy(&fs_info->bdi);
1834 1892
1893fail:
1835 kfree(extent_root); 1894 kfree(extent_root);
1836 kfree(tree_root); 1895 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info); 1896 kfree(fs_info);
1839 kfree(chunk_root); 1897 kfree(chunk_root);
1840 kfree(dev_root); 1898 kfree(dev_root);
@@ -1995,7 +2053,6 @@ static int write_dev_supers(struct btrfs_device *device,
1995 2053
1996int write_all_supers(struct btrfs_root *root, int max_mirrors) 2054int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{ 2055{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices; 2056 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev; 2057 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb; 2058 struct btrfs_super_block *sb;
@@ -2011,8 +2068,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2011 2068
2012 sb = &root->fs_info->super_for_commit; 2069 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item; 2070 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) { 2071 list_for_each_entry(dev, head, dev_list) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) { 2072 if (!dev->bdev) {
2017 total_errors++; 2073 total_errors++;
2018 continue; 2074 continue;
@@ -2045,8 +2101,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2045 } 2101 }
2046 2102
2047 total_errors = 0; 2103 total_errors = 0;
2048 list_for_each(cur, head) { 2104 list_for_each_entry(dev, head, dev_list) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev) 2105 if (!dev->bdev)
2051 continue; 2106 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable) 2107 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2260 u64 transid = btrfs_header_generation(buf); 2315 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode; 2316 struct inode *btree_inode = root->fs_info->btree_inode;
2262 2317
2318 btrfs_set_lock_blocking(buf);
2319
2263 WARN_ON(!btrfs_tree_locked(buf)); 2320 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) { 2321 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2322 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2302,14 +2359,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2302 int ret; 2359 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2360 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0) 2361 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE; 2362 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2306 return ret; 2363 return ret;
2307} 2364}
2308 2365
2309int btree_lock_page_hook(struct page *page) 2366int btree_lock_page_hook(struct page *page)
2310{ 2367{
2311 struct inode *inode = page->mapping->host; 2368 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2369 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb; 2370 struct extent_buffer *eb;
2315 unsigned long len; 2371 unsigned long len;
@@ -2324,9 +2380,7 @@ int btree_lock_page_hook(struct page *page)
2324 goto out; 2380 goto out;
2325 2381
2326 btrfs_tree_lock(eb); 2382 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2383 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb); 2384 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb); 2385 free_extent_buffer(eb);
2332out: 2386out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b..494a56eb298 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info); 98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root);
101int btree_lock_page_hook(struct page *page); 103int btree_lock_page_hook(struct page *page);
102#endif 104#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873..7527523c2d2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include "compat.h" 23#include "compat.h"
24#include "hash.h" 24#include "hash.h"
25#include "crc32c.h" 25#include "crc32c.h"
@@ -30,7 +30,6 @@
30#include "volumes.h" 30#include "volumes.h"
31#include "locking.h" 31#include "locking.h"
32#include "ref-cache.h" 32#include "ref-cache.h"
33#include "compat.h"
34 33
35#define PENDING_EXTENT_INSERT 0 34#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 35#define PENDING_EXTENT_DELETE 1
@@ -326,10 +325,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 325 u64 flags)
327{ 326{
328 struct list_head *head = &info->space_info; 327 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 328 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 329 list_for_each_entry(found, head, list) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags) 330 if (found->flags == flags)
334 return found; 331 return found;
335 } 332 }
@@ -1525,15 +1522,55 @@ out:
1525 return ret; 1522 return ret;
1526} 1523}
1527 1524
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1525/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1526 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1527 * can be in just about any order, and it is likely to have clusters of
1528 * things that are close together and clusters of things that are not.
1529 *
1530 * To help reduce the seeks that come with updating all of these reference
1531 * counts, sort them by byte number before actual updates are done.
1532 *
1533 * struct refsort is used to match byte number to slot in the btree block.
1534 * we sort based on the byte number and then use the slot to actually
1535 * find the item.
1536 *
1537 * struct refsort is smaller than strcut btrfs_item and smaller than
1538 * struct btrfs_key_ptr. Since we're currently limited to the page size
1539 * for a btree block, there's no way for a kmalloc of refsorts for a
1540 * single node to be bigger than a page.
1541 */
1542struct refsort {
1543 u64 bytenr;
1544 u32 slot;
1545};
1546
1547/*
1548 * for passing into sort()
1549 */
1550static int refsort_cmp(const void *a_void, const void *b_void)
1551{
1552 const struct refsort *a = a_void;
1553 const struct refsort *b = b_void;
1554
1555 if (a->bytenr < b->bytenr)
1556 return -1;
1557 if (a->bytenr > b->bytenr)
1558 return 1;
1559 return 0;
1560}
1561
1562
1563noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1564 struct btrfs_root *root,
1565 struct extent_buffer *orig_buf,
1566 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1567{
1532 u64 bytenr; 1568 u64 bytenr;
1533 u64 ref_root; 1569 u64 ref_root;
1534 u64 orig_root; 1570 u64 orig_root;
1535 u64 ref_generation; 1571 u64 ref_generation;
1536 u64 orig_generation; 1572 u64 orig_generation;
1573 struct refsort *sorted;
1537 u32 nritems; 1574 u32 nritems;
1538 u32 nr_file_extents = 0; 1575 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1576 struct btrfs_key key;
@@ -1542,6 +1579,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1579 int level;
1543 int ret = 0; 1580 int ret = 0;
1544 int faili = 0; 1581 int faili = 0;
1582 int refi = 0;
1583 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1584 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1585 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1586
@@ -1553,6 +1592,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1592 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1593 level = btrfs_header_level(buf);
1555 1594
1595 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1596 BUG_ON(!sorted);
1597
1556 if (root->ref_cows) { 1598 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1599 process_func = __btrfs_inc_extent_ref;
1558 } else { 1600 } else {
@@ -1565,6 +1607,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1607 process_func = __btrfs_update_extent_ref;
1566 } 1608 }
1567 1609
1610 /*
1611 * we make two passes through the items. In the first pass we
1612 * only record the byte number and slot. Then we sort based on
1613 * byte number and do the actual work based on the sorted results
1614 */
1568 for (i = 0; i < nritems; i++) { 1615 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1616 cond_resched();
1570 if (level == 0) { 1617 if (level == 0) {
@@ -1581,6 +1628,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1628 continue;
1582 1629
1583 nr_file_extents++; 1630 nr_file_extents++;
1631 sorted[refi].bytenr = bytenr;
1632 sorted[refi].slot = i;
1633 refi++;
1634 } else {
1635 bytenr = btrfs_node_blockptr(buf, i);
1636 sorted[refi].bytenr = bytenr;
1637 sorted[refi].slot = i;
1638 refi++;
1639 }
1640 }
1641 /*
1642 * if refi == 0, we didn't actually put anything into the sorted
1643 * array and we're done
1644 */
1645 if (refi == 0)
1646 goto out;
1647
1648 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1649
1650 for (i = 0; i < refi; i++) {
1651 cond_resched();
1652 slot = sorted[i].slot;
1653 bytenr = sorted[i].bytenr;
1654
1655 if (level == 0) {
1656 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1657
1585 ret = process_func(trans, root, bytenr, 1658 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1659 orig_buf->start, buf->start,
@@ -1589,25 +1662,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1662 key.objectid);
1590 1663
1591 if (ret) { 1664 if (ret) {
1592 faili = i; 1665 faili = slot;
1593 WARN_ON(1); 1666 WARN_ON(1);
1594 goto fail; 1667 goto fail;
1595 } 1668 }
1596 } else { 1669 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1670 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1671 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1672 orig_root, ref_root,
1601 orig_generation, ref_generation, 1673 orig_generation, ref_generation,
1602 level - 1); 1674 level - 1);
1603 if (ret) { 1675 if (ret) {
1604 faili = i; 1676 faili = slot;
1605 WARN_ON(1); 1677 WARN_ON(1);
1606 goto fail; 1678 goto fail;
1607 } 1679 }
1608 } 1680 }
1609 } 1681 }
1610out: 1682out:
1683 kfree(sorted);
1611 if (nr_extents) { 1684 if (nr_extents) {
1612 if (level == 0) 1685 if (level == 0)
1613 *nr_extents = nr_file_extents; 1686 *nr_extents = nr_file_extents;
@@ -1616,6 +1689,7 @@ out:
1616 } 1689 }
1617 return 0; 1690 return 0;
1618fail: 1691fail:
1692 kfree(sorted);
1619 WARN_ON(1); 1693 WARN_ON(1);
1620 return ret; 1694 return ret;
1621} 1695}
@@ -2159,7 +2233,8 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2233 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2234 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2235 if (ret) {
2162 if (skipped && all && !num_inserts) { 2236 if (skipped && all && !num_inserts &&
2237 list_empty(&update_list)) {
2163 skipped = 0; 2238 skipped = 0;
2164 search = 0; 2239 search = 0;
2165 continue; 2240 continue;
@@ -2547,6 +2622,7 @@ again:
2547 if (ret) { 2622 if (ret) {
2548 if (all && skipped && !nr) { 2623 if (all && skipped && !nr) {
2549 search = 0; 2624 search = 0;
2625 skipped = 0;
2550 continue; 2626 continue;
2551 } 2627 }
2552 mutex_unlock(&info->extent_ins_mutex); 2628 mutex_unlock(&info->extent_ins_mutex);
@@ -2700,13 +2776,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 2776 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 2777 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 2778 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 2779 mutex_lock(&root->fs_info->pinned_mutex);
2704 2780 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 2781 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 2782 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 2783 return 0;
2712 } 2784 }
@@ -3014,7 +3086,6 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3086static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3087{
3016 struct btrfs_block_group_cache *cache; 3088 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3089
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3090 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3091 (unsigned long long)(info->total_bytes - info->bytes_used -
@@ -3022,8 +3093,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3022 (info->full) ? "" : "not "); 3093 (info->full) ? "" : "not ");
3023 3094
3024 down_read(&info->groups_sem); 3095 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3096 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3097 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3098 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3099 "%llu pinned %llu reserved\n",
@@ -3342,7 +3412,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3342 btrfs_set_header_generation(buf, trans->transid); 3412 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf); 3413 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3414 clean_tree_block(trans, root, buf);
3415
3416 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3417 btrfs_set_buffer_uptodate(buf);
3418
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3419 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3420 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3421 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3424,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3424 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3425 }
3353 trans->blocks_used++; 3426 trans->blocks_used++;
3427 /* this returns a buffer locked for blocking */
3354 return buf; 3428 return buf;
3355} 3429}
3356 3430
@@ -3388,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3462{
3389 u64 leaf_owner; 3463 u64 leaf_owner;
3390 u64 leaf_generation; 3464 u64 leaf_generation;
3465 struct refsort *sorted;
3391 struct btrfs_key key; 3466 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3467 struct btrfs_file_extent_item *fi;
3393 int i; 3468 int i;
3394 int nritems; 3469 int nritems;
3395 int ret; 3470 int ret;
3471 int refi = 0;
3472 int slot;
3396 3473
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3474 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3475 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3476 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3477 leaf_generation = btrfs_header_generation(leaf);
3401 3478
3479 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3480 /* we do this loop twice. The first time we build a list
3481 * of the extents we have a reference on, then we sort the list
3482 * by bytenr. The second time around we actually do the
3483 * extent freeing.
3484 */
3402 for (i = 0; i < nritems; i++) { 3485 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3486 u64 disk_bytenr;
3404 cond_resched(); 3487 cond_resched();
3405 3488
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3489 btrfs_item_key_to_cpu(leaf, &key, i);
3490
3491 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3492 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3493 continue;
3494
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3495 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3496
3497 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3498 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3499 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3500 continue;
3413 /* 3501
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3502 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3503
3504 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3505 if (disk_bytenr == 0)
3419 continue; 3506 continue;
3420 3507
3508 sorted[refi].bytenr = disk_bytenr;
3509 sorted[refi].slot = i;
3510 refi++;
3511 }
3512
3513 if (refi == 0)
3514 goto out;
3515
3516 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3517
3518 for (i = 0; i < refi; i++) {
3519 u64 disk_bytenr;
3520
3521 disk_bytenr = sorted[i].bytenr;
3522 slot = sorted[i].slot;
3523
3524 cond_resched();
3525
3526 btrfs_item_key_to_cpu(leaf, &key, slot);
3527 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3528 continue;
3529
3530 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3531
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3532 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3533 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3534 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3539 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3540 cond_resched();
3430 } 3541 }
3542out:
3543 kfree(sorted);
3431 return 0; 3544 return 0;
3432} 3545}
3433 3546
@@ -3437,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3550{
3438 int i; 3551 int i;
3439 int ret; 3552 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3553 struct btrfs_extent_info *info;
3554 struct refsort *sorted;
3555
3556 if (ref->nritems == 0)
3557 return 0;
3441 3558
3559 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3560 for (i = 0; i < ref->nritems; i++) {
3561 sorted[i].bytenr = ref->extents[i].bytenr;
3562 sorted[i].slot = i;
3563 }
3564 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3565
3566 /*
3567 * the items in the ref were sorted when the ref was inserted
3568 * into the ref cache, so this is already in order
3569 */
3570 for (i = 0; i < ref->nritems; i++) {
3571 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3572 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3573 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3574 ref->owner, ref->generation,
@@ -3453,6 +3582,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3582 info++;
3454 } 3583 }
3455 3584
3585 kfree(sorted);
3456 return 0; 3586 return 0;
3457} 3587}
3458 3588
@@ -3497,6 +3627,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3627}
3498 3628
3499/* 3629/*
3630 * this is used while deleting old snapshots, and it drops the refs
3631 * on a whole subtree starting from a level 1 node.
3632 *
3633 * The idea is to sort all the leaf pointers, and then drop the
3634 * ref on all the leaves in order. Most of the time the leaves
3635 * will have ref cache entries, so no leaf IOs will be required to
3636 * find the extents they have references on.
3637 *
3638 * For each leaf, any references it has are also dropped in order
3639 *
3640 * This ends up dropping the references in something close to optimal
3641 * order for reading and modifying the extent allocation tree.
3642 */
3643static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3644 struct btrfs_root *root,
3645 struct btrfs_path *path)
3646{
3647 u64 bytenr;
3648 u64 root_owner;
3649 u64 root_gen;
3650 struct extent_buffer *eb = path->nodes[1];
3651 struct extent_buffer *leaf;
3652 struct btrfs_leaf_ref *ref;
3653 struct refsort *sorted = NULL;
3654 int nritems = btrfs_header_nritems(eb);
3655 int ret;
3656 int i;
3657 int refi = 0;
3658 int slot = path->slots[1];
3659 u32 blocksize = btrfs_level_size(root, 0);
3660 u32 refs;
3661
3662 if (nritems == 0)
3663 goto out;
3664
3665 root_owner = btrfs_header_owner(eb);
3666 root_gen = btrfs_header_generation(eb);
3667 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3668
3669 /*
3670 * step one, sort all the leaf pointers so we don't scribble
3671 * randomly into the extent allocation tree
3672 */
3673 for (i = slot; i < nritems; i++) {
3674 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3675 sorted[refi].slot = i;
3676 refi++;
3677 }
3678
3679 /*
3680 * nritems won't be zero, but if we're picking up drop_snapshot
3681 * after a crash, slot might be > 0, so double check things
3682 * just in case.
3683 */
3684 if (refi == 0)
3685 goto out;
3686
3687 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3688
3689 /*
3690 * the first loop frees everything the leaves point to
3691 */
3692 for (i = 0; i < refi; i++) {
3693 u64 ptr_gen;
3694
3695 bytenr = sorted[i].bytenr;
3696
3697 /*
3698 * check the reference count on this leaf. If it is > 1
3699 * we just decrement it below and don't update any
3700 * of the refs the leaf points to.
3701 */
3702 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3703 BUG_ON(ret);
3704 if (refs != 1)
3705 continue;
3706
3707 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3708
3709 /*
3710 * the leaf only had one reference, which means the
3711 * only thing pointing to this leaf is the snapshot
3712 * we're deleting. It isn't possible for the reference
3713 * count to increase again later
3714 *
3715 * The reference cache is checked for the leaf,
3716 * and if found we'll be able to drop any refs held by
3717 * the leaf without needing to read it in.
3718 */
3719 ref = btrfs_lookup_leaf_ref(root, bytenr);
3720 if (ref && ref->generation != ptr_gen) {
3721 btrfs_free_leaf_ref(root, ref);
3722 ref = NULL;
3723 }
3724 if (ref) {
3725 ret = cache_drop_leaf_ref(trans, root, ref);
3726 BUG_ON(ret);
3727 btrfs_remove_leaf_ref(root, ref);
3728 btrfs_free_leaf_ref(root, ref);
3729 } else {
3730 /*
3731 * the leaf wasn't in the reference cache, so
3732 * we have to read it.
3733 */
3734 leaf = read_tree_block(root, bytenr, blocksize,
3735 ptr_gen);
3736 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3737 BUG_ON(ret);
3738 free_extent_buffer(leaf);
3739 }
3740 atomic_inc(&root->fs_info->throttle_gen);
3741 wake_up(&root->fs_info->transaction_throttle);
3742 cond_resched();
3743 }
3744
3745 /*
3746 * run through the loop again to free the refs on the leaves.
3747 * This is faster than doing it in the loop above because
3748 * the leaves are likely to be clustered together. We end up
3749 * working in nice chunks on the extent allocation tree.
3750 */
3751 for (i = 0; i < refi; i++) {
3752 bytenr = sorted[i].bytenr;
3753 ret = __btrfs_free_extent(trans, root, bytenr,
3754 blocksize, eb->start,
3755 root_owner, root_gen, 0, 1);
3756 BUG_ON(ret);
3757
3758 atomic_inc(&root->fs_info->throttle_gen);
3759 wake_up(&root->fs_info->transaction_throttle);
3760 cond_resched();
3761 }
3762out:
3763 kfree(sorted);
3764
3765 /*
3766 * update the path to show we've processed the entire level 1
3767 * node. This will get saved into the root's drop_snapshot_progress
3768 * field so these drops are not repeated again if this transaction
3769 * commits.
3770 */
3771 path->slots[1] = nritems;
3772 return 0;
3773}
3774
3775/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 3776 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 3777 * counts as it goes.
3502 */ 3778 */
@@ -3511,7 +3787,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 3787 struct extent_buffer *next;
3512 struct extent_buffer *cur; 3788 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 3789 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 3790 u32 blocksize;
3516 int ret; 3791 int ret;
3517 u32 refs; 3792 u32 refs;
@@ -3538,17 +3813,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 3813 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 3814 btrfs_header_nritems(cur))
3540 break; 3815 break;
3816
3817 /* the new code goes down to level 1 and does all the
3818 * leaves pointed to that node in bulk. So, this check
3819 * for level 0 will always be false.
3820 *
3821 * But, the disk format allows the drop_snapshot_progress
3822 * field in the root to leave things in a state where
3823 * a leaf will need cleaning up here. If someone crashes
3824 * with the old code and then boots with the new code,
3825 * we might find a leaf here.
3826 */
3541 if (*level == 0) { 3827 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 3828 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 3829 BUG_ON(ret);
3544 break; 3830 break;
3545 } 3831 }
3832
3833 /*
3834 * once we get to level one, process the whole node
3835 * at once, including everything below it.
3836 */
3837 if (*level == 1) {
3838 ret = drop_level_one_refs(trans, root, path);
3839 BUG_ON(ret);
3840 break;
3841 }
3842
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 3843 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3844 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 3845 blocksize = btrfs_level_size(root, *level - 1);
3549 3846
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3847 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 3848 BUG_ON(ret);
3849
3850 /*
3851 * if there is more than one reference, we don't need
3852 * to read that node to drop any references it has. We
3853 * just drop the ref we hold on that node and move on to the
3854 * next slot in this level.
3855 */
3552 if (refs != 1) { 3856 if (refs != 1) {
3553 parent = path->nodes[*level]; 3857 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 3858 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +3871,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 3871
3568 continue; 3872 continue;
3569 } 3873 }
3874
3570 /* 3875 /*
3571 * at this point, we have a single ref, and since the 3876 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 3877 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 3878 */
3576 if (*level == 1) { 3879 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 3880 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 3881 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 3882 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +3901,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 3901 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 3902 root_gen = btrfs_header_generation(parent);
3633 3903
3904 /*
3905 * cleanup and free the reference on the last node
3906 * we processed
3907 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3908 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 3909 parent->start, root_owner, root_gen,
3636 *level, 1); 3910 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 3911 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 3912 path->nodes[*level] = NULL;
3913
3639 *level += 1; 3914 *level += 1;
3640 BUG_ON(ret); 3915 BUG_ON(ret);
3641 3916
@@ -3687,6 +3962,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 3962
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3963 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 3964 btrfs_tree_lock(next);
3965 btrfs_set_lock_blocking(next);
3690 3966
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 3967 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 3968 &refs);
@@ -3754,6 +4030,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4030 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4031 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4032 struct btrfs_disk_key disk_key;
4033
4034 /*
4035 * there is more work to do in this level.
4036 * Update the drop_progress marker to reflect
4037 * the work we've done so far, and then bump
4038 * the slot number
4039 */
3757 node = path->nodes[i]; 4040 node = path->nodes[i];
3758 path->slots[i]++; 4041 path->slots[i]++;
3759 *level = i; 4042 *level = i;
@@ -3765,6 +4048,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4048 return 0;
3766 } else { 4049 } else {
3767 struct extent_buffer *parent; 4050 struct extent_buffer *parent;
4051
4052 /*
4053 * this whole node is done, free our reference
4054 * on it and go up one level
4055 */
3768 if (path->nodes[*level] == root->node) 4056 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4057 parent = path->nodes[*level];
3770 else 4058 else
@@ -4444,7 +4732,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4732 u64 lock_end = 0;
4445 u64 num_bytes; 4733 u64 num_bytes;
4446 u64 ext_offset; 4734 u64 ext_offset;
4447 u64 first_pos; 4735 u64 search_end = (u64)-1;
4448 u32 nritems; 4736 u32 nritems;
4449 int nr_scaned = 0; 4737 int nr_scaned = 0;
4450 int extent_locked = 0; 4738 int extent_locked = 0;
@@ -4452,7 +4740,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 4740 int ret;
4453 4741
4454 memcpy(&key, leaf_key, sizeof(key)); 4742 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4743 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 4744 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 4745 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +4788,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 4788 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 4789 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 4790 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 4791 key.offset >= search_end)
4505 break; 4792 break;
4506 } 4793 }
4507 4794
@@ -4534,8 +4821,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 4821 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 4822 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 4823
4537 if (first_pos > key.offset - ext_offset) 4824 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 4825 search_end = key.offset - ext_offset +
4826 btrfs_file_extent_ram_bytes(leaf, fi);
4827 }
4539 4828
4540 if (!extent_locked) { 4829 if (!extent_locked) {
4541 lock_start = key.offset; 4830 lock_start = key.offset;
@@ -4724,7 +5013,7 @@ next:
4724 } 5013 }
4725skip: 5014skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5015 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5016 key.offset >= search_end)
4728 break; 5017 break;
4729 5018
4730 cond_resched(); 5019 cond_resched();
@@ -4778,6 +5067,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5067 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5068 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5069 ref->generation = btrfs_header_generation(buf);
5070
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5071 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5072 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5073 btrfs_free_leaf_ref(root, ref);
@@ -5957,9 +6247,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6247 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6248 BUG_ON(!path);
5959 6249
5960 btrfs_remove_free_space_cache(block_group); 6250 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6251 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6252 &root->fs_info->block_group_cache_tree);
6253 spin_unlock(&root->fs_info->block_group_cache_lock);
6254 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6255 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6256 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6257 up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1f..37d43b516b7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/pagevec.h> 13#include <linux/pagevec.h>
15#include "extent_io.h" 14#include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
31static LIST_HEAD(states); 30static LIST_HEAD(states);
32 31
33#define LEAK_DEBUG 0 32#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG 33#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock); 34static DEFINE_SPINLOCK(leak_lock);
36#endif 35#endif
37 36
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
120static struct extent_state *alloc_extent_state(gfp_t mask) 119static struct extent_state *alloc_extent_state(gfp_t mask)
121{ 120{
122 struct extent_state *state; 121 struct extent_state *state;
123#ifdef LEAK_DEBUG 122#if LEAK_DEBUG
124 unsigned long flags; 123 unsigned long flags;
125#endif 124#endif
126 125
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
130 state->state = 0; 129 state->state = 0;
131 state->private = 0; 130 state->private = 0;
132 state->tree = NULL; 131 state->tree = NULL;
133#ifdef LEAK_DEBUG 132#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags); 133 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states); 134 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags); 135 spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
145 if (!state) 144 if (!state)
146 return; 145 return;
147 if (atomic_dec_and_test(&state->refs)) { 146 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG 147#if LEAK_DEBUG
149 unsigned long flags; 148 unsigned long flags;
150#endif 149#endif
151 WARN_ON(state->tree); 150 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG 151#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags); 152 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list); 153 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags); 154 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2378,11 +2377,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2378 int scanned = 0; 2377 int scanned = 0;
2379 int range_whole = 0; 2378 int range_whole = 0;
2380 2379
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0); 2380 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) { 2381 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */ 2382 index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2849,98 @@ out:
2855 return sector; 2849 return sector;
2856} 2850}
2857 2851
2852int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2853 __u64 start, __u64 len, get_extent_t *get_extent)
2854{
2855 int ret;
2856 u64 off = start;
2857 u64 max = start + len;
2858 u32 flags = 0;
2859 u64 disko = 0;
2860 struct extent_map *em = NULL;
2861 int end = 0;
2862 u64 em_start = 0, em_len = 0;
2863 unsigned long emflags;
2864 ret = 0;
2865
2866 if (len == 0)
2867 return -EINVAL;
2868
2869 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2870 GFP_NOFS);
2871 em = get_extent(inode, NULL, 0, off, max - off, 0);
2872 if (!em)
2873 goto out;
2874 if (IS_ERR(em)) {
2875 ret = PTR_ERR(em);
2876 goto out;
2877 }
2878 while (!end) {
2879 off = em->start + em->len;
2880 if (off >= max)
2881 end = 1;
2882
2883 em_start = em->start;
2884 em_len = em->len;
2885
2886 disko = 0;
2887 flags = 0;
2888
2889 switch (em->block_start) {
2890 case EXTENT_MAP_LAST_BYTE:
2891 end = 1;
2892 flags |= FIEMAP_EXTENT_LAST;
2893 break;
2894 case EXTENT_MAP_HOLE:
2895 flags |= FIEMAP_EXTENT_UNWRITTEN;
2896 break;
2897 case EXTENT_MAP_INLINE:
2898 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2899 FIEMAP_EXTENT_NOT_ALIGNED);
2900 break;
2901 case EXTENT_MAP_DELALLOC:
2902 flags |= (FIEMAP_EXTENT_DELALLOC |
2903 FIEMAP_EXTENT_UNKNOWN);
2904 break;
2905 default:
2906 disko = em->block_start;
2907 break;
2908 }
2909 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2910 flags |= FIEMAP_EXTENT_ENCODED;
2911
2912 emflags = em->flags;
2913 free_extent_map(em);
2914 em = NULL;
2915
2916 if (!end) {
2917 em = get_extent(inode, NULL, 0, off, max - off, 0);
2918 if (!em)
2919 goto out;
2920 if (IS_ERR(em)) {
2921 ret = PTR_ERR(em);
2922 goto out;
2923 }
2924 emflags = em->flags;
2925 }
2926 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2927 flags |= FIEMAP_EXTENT_LAST;
2928 end = 1;
2929 }
2930
2931 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2932 em_len, flags);
2933 if (ret)
2934 goto out_free;
2935 }
2936out_free:
2937 free_extent_map(em);
2938out:
2939 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2940 GFP_NOFS);
2941 return ret;
2942}
2943
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2944static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i) 2945 unsigned long i)
2860{ 2946{
@@ -2892,15 +2978,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2892 gfp_t mask) 2978 gfp_t mask)
2893{ 2979{
2894 struct extent_buffer *eb = NULL; 2980 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG 2981#if LEAK_DEBUG
2896 unsigned long flags; 2982 unsigned long flags;
2897#endif 2983#endif
2898 2984
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2985 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start; 2986 eb->start = start;
2901 eb->len = len; 2987 eb->len = len;
2902 mutex_init(&eb->mutex); 2988 spin_lock_init(&eb->lock);
2903#ifdef LEAK_DEBUG 2989 init_waitqueue_head(&eb->lock_wq);
2990
2991#if LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags); 2992 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers); 2993 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags); 2994 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +3000,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2912 3000
2913static void __free_extent_buffer(struct extent_buffer *eb) 3001static void __free_extent_buffer(struct extent_buffer *eb)
2914{ 3002{
2915#ifdef LEAK_DEBUG 3003#if LEAK_DEBUG
2916 unsigned long flags; 3004 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags); 3005 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list); 3006 list_del(&eb->leak_list);
@@ -2980,8 +3068,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2980 unlock_page(p); 3068 unlock_page(p);
2981 } 3069 }
2982 if (uptodate) 3070 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE; 3071 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985 3072
2986 spin_lock(&tree->buffer_lock); 3073 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3074 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3222,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3135 unsigned long num_pages; 3222 unsigned long num_pages;
3136 3223
3137 num_pages = num_extent_pages(eb->start, eb->len); 3224 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE; 3225 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3139 3226
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3227 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS); 3228 GFP_NOFS);
@@ -3206,7 +3293,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3206 struct page *page; 3293 struct page *page;
3207 int pg_uptodate = 1; 3294 int pg_uptodate = 1;
3208 3295
3209 if (eb->flags & EXTENT_UPTODATE) 3296 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3210 return 1; 3297 return 1;
3211 3298
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3299 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3329,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 struct bio *bio = NULL; 3329 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0; 3330 unsigned long bio_flags = 0;
3244 3331
3245 if (eb->flags & EXTENT_UPTODATE) 3332 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3246 return 0; 3333 return 0;
3247 3334
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3335 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3360,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3273 } 3360 }
3274 if (all_uptodate) { 3361 if (all_uptodate) {
3275 if (start_i == 0) 3362 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE; 3363 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 goto unlock_exit; 3364 goto unlock_exit;
3278 } 3365 }
3279 3366
@@ -3309,7 +3396,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3309 } 3396 }
3310 3397
3311 if (!ret) 3398 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE; 3399 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3313 return ret; 3400 return ret;
3314 3401
3315unlock_exit: 3402unlock_exit:
@@ -3406,7 +3493,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3406 unmap_extent_buffer(eb, eb->map_token, km); 3493 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL; 3494 eb->map_token = NULL;
3408 save = 1; 3495 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 } 3496 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map, 3497 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km); 3498 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a7913..1f9df88afbf 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
193 unsigned from, unsigned to); 205 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 206sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent); 207 get_extent_t *get_extent);
208int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
209 __u64 start, __u64 len, get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 210int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 211int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 212int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada3..50da69da20c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h> 6#include <linux/hardirq.h>
8#include "extent_map.h" 7#include "extent_map.h"
9 8
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145..3e8023efaff 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
29#include <linux/writeback.h> 29#include <linux/writeback.h>
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h" 32#include "ctree.h"
34#include "disk-io.h" 33#include "disk-io.h"
35#include "transaction.h" 34#include "transaction.h"
@@ -1215,10 +1214,10 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1215 } 1214 }
1216 mutex_unlock(&root->fs_info->trans_mutex); 1215 mutex_unlock(&root->fs_info->trans_mutex);
1217 1216
1218 root->fs_info->tree_log_batch++; 1217 root->log_batch++;
1219 filemap_fdatawrite(inode->i_mapping); 1218 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1219 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++; 1220 root->log_batch++;
1222 1221
1223 /* 1222 /*
1224 * ok we haven't committed the transaction yet, lets do a commit 1223 * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab4..8f0706210a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/statfs.h> 34#include <linux/statfs.h>
35#include <linux/compat.h> 35#include <linux/compat.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h> 37#include <linux/xattr.h>
39#include <linux/posix_acl.h> 38#include <linux/posix_acl.h>
40#include <linux/falloc.h> 39#include <linux/falloc.h>
@@ -51,6 +50,7 @@
51#include "tree-log.h" 50#include "tree-log.h"
52#include "ref-cache.h" 51#include "ref-cache.h"
53#include "compression.h" 52#include "compression.h"
53#include "locking.h"
54 54
55struct btrfs_iget_args { 55struct btrfs_iget_args {
56 u64 ino; 56 u64 ino;
@@ -91,6 +91,16 @@ static noinline int cow_file_range(struct inode *inode,
91 u64 start, u64 end, int *page_started, 91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock); 92 unsigned long *nr_written, int unlock);
93 93
94static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
95{
96 int err;
97
98 err = btrfs_init_acl(inode, dir);
99 if (!err)
100 err = btrfs_xattr_security_init(inode, dir);
101 return err;
102}
103
94/* 104/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There 105 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing. 106 * are countless ways this is incorrect, but it is better than nothing.
@@ -350,6 +360,19 @@ again:
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 360 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 361 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352 362
363 /*
364 * we don't want to send crud past the end of i_size through
365 * compression, that's just a waste of CPU time. So, if the
366 * end of the file is before the start of our current
367 * requested range of bytes, we bail out to the uncompressed
368 * cleanup code that can deal with all of this.
369 *
370 * It isn't really the fastest way to fix things, but this is a
371 * very uncommon corner.
372 */
373 if (actual_end <= start)
374 goto cleanup_and_bail_uncompressed;
375
353 total_compressed = actual_end - start; 376 total_compressed = actual_end - start;
354 377
355 /* we want to make sure that amount of ram required to uncompress 378 /* we want to make sure that amount of ram required to uncompress
@@ -494,6 +517,7 @@ again:
494 goto again; 517 goto again;
495 } 518 }
496 } else { 519 } else {
520cleanup_and_bail_uncompressed:
497 /* 521 /*
498 * No compression, but we still need to write the pages in 522 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked 523 * the file we've been given so far. redirty the locked
@@ -1324,12 +1348,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset, 1348 struct inode *inode, u64 file_offset,
1325 struct list_head *list) 1349 struct list_head *list)
1326{ 1350{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum; 1351 struct btrfs_ordered_sum *sum;
1329 1352
1330 btrfs_set_trans_block_group(trans, inode); 1353 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) { 1354
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1355 list_for_each_entry(sum, list, list) {
1333 btrfs_csum_file_blocks(trans, 1356 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1357 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 } 1358 }
@@ -2013,6 +2036,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2036 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014 2037
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2038 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2039
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2040 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2041 alloc_group_block, 0);
2018 btrfs_free_path(path); 2042 btrfs_free_path(path);
@@ -2039,6 +2063,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2063 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break; 2064 break;
2041 default: 2065 default:
2066 inode->i_op = &btrfs_special_inode_operations;
2042 init_special_inode(inode, inode->i_mode, rdev); 2067 init_special_inode(inode, inode->i_mode, rdev);
2043 break; 2068 break;
2044 } 2069 }
@@ -2108,6 +2133,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2108 goto failed; 2133 goto failed;
2109 } 2134 }
2110 2135
2136 btrfs_unlock_up_safe(path, 1);
2111 leaf = path->nodes[0]; 2137 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2138 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item); 2139 struct btrfs_inode_item);
@@ -2429,6 +2455,8 @@ next_node:
2429 ref->generation = leaf_gen; 2455 ref->generation = leaf_gen;
2430 ref->nritems = 0; 2456 ref->nritems = 0;
2431 2457
2458 btrfs_sort_leaf_ref(ref);
2459
2432 ret = btrfs_add_leaf_ref(root, ref, 0); 2460 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret); 2461 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref); 2462 btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2504,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_path *path; 2504 struct btrfs_path *path;
2477 struct btrfs_key key; 2505 struct btrfs_key key;
2478 struct btrfs_key found_key; 2506 struct btrfs_key found_key;
2479 u32 found_type; 2507 u32 found_type = (u8)-1;
2480 struct extent_buffer *leaf; 2508 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi; 2509 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0; 2510 u64 extent_start = 0;
@@ -2663,6 +2691,8 @@ next:
2663 if (pending_del_nr) 2691 if (pending_del_nr)
2664 goto del_pending; 2692 goto del_pending;
2665 btrfs_release_path(root, path); 2693 btrfs_release_path(root, path);
2694 if (found_type == BTRFS_INODE_ITEM_KEY)
2695 break;
2666 goto search_again; 2696 goto search_again;
2667 } 2697 }
2668 2698
@@ -2679,6 +2709,8 @@ del_pending:
2679 BUG_ON(ret); 2709 BUG_ON(ret);
2680 pending_del_nr = 0; 2710 pending_del_nr = 0;
2681 btrfs_release_path(root, path); 2711 btrfs_release_path(root, path);
2712 if (found_type == BTRFS_INODE_ITEM_KEY)
2713 break;
2682 goto search_again; 2714 goto search_again;
2683 } 2715 }
2684 } 2716 }
@@ -3265,7 +3297,7 @@ skip:
3265 3297
3266 /* Reached end of directory/root. Bump pos past the last item. */ 3298 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3299 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3300 filp->f_pos = INT_LIMIT(off_t);
3269 else 3301 else
3270 filp->f_pos++; 3302 filp->f_pos++;
3271nopos: 3303nopos:
@@ -3458,7 +3490,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3458 root->highest_inode = objectid; 3490 root->highest_inode = objectid;
3459 3491
3460 inode->i_uid = current_fsuid(); 3492 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid(); 3493
3494 if (dir && (dir->i_mode & S_ISGID)) {
3495 inode->i_gid = dir->i_gid;
3496 if (S_ISDIR(mode))
3497 mode |= S_ISGID;
3498 } else
3499 inode->i_gid = current_fsgid();
3500
3462 inode->i_mode = mode; 3501 inode->i_mode = mode;
3463 inode->i_ino = objectid; 3502 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0); 3503 inode_set_bytes(inode, 0);
@@ -3586,7 +3625,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3586 if (IS_ERR(inode)) 3625 if (IS_ERR(inode))
3587 goto out_unlock; 3626 goto out_unlock;
3588 3627
3589 err = btrfs_init_acl(inode, dir); 3628 err = btrfs_init_inode_security(inode, dir);
3590 if (err) { 3629 if (err) {
3591 drop_inode = 1; 3630 drop_inode = 1;
3592 goto out_unlock; 3631 goto out_unlock;
@@ -3649,7 +3688,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3649 if (IS_ERR(inode)) 3688 if (IS_ERR(inode))
3650 goto out_unlock; 3689 goto out_unlock;
3651 3690
3652 err = btrfs_init_acl(inode, dir); 3691 err = btrfs_init_inode_security(inode, dir);
3653 if (err) { 3692 if (err) {
3654 drop_inode = 1; 3693 drop_inode = 1;
3655 goto out_unlock; 3694 goto out_unlock;
@@ -3772,7 +3811,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3772 3811
3773 drop_on_err = 1; 3812 drop_on_err = 1;
3774 3813
3775 err = btrfs_init_acl(inode, dir); 3814 err = btrfs_init_inode_security(inode, dir);
3776 if (err) 3815 if (err)
3777 goto out_fail; 3816 goto out_fail;
3778 3817
@@ -4158,9 +4197,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4158 return -EINVAL; 4197 return -EINVAL;
4159} 4198}
4160 4199
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4200static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4201 __u64 start, __u64 len)
4162{ 4202{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent); 4203 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4164} 4204}
4165 4205
4166int btrfs_readpage(struct file *file, struct page *page) 4206int btrfs_readpage(struct file *file, struct page *page)
@@ -4733,7 +4773,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4733 if (IS_ERR(inode)) 4773 if (IS_ERR(inode))
4734 goto out_unlock; 4774 goto out_unlock;
4735 4775
4736 err = btrfs_init_acl(inode, dir); 4776 err = btrfs_init_inode_security(inode, dir);
4737 if (err) { 4777 if (err) {
4738 drop_inode = 1; 4778 drop_inode = 1;
4739 goto out_unlock; 4779 goto out_unlock;
@@ -4987,13 +5027,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
4987 .clear_bit_hook = btrfs_clear_bit_hook, 5027 .clear_bit_hook = btrfs_clear_bit_hook,
4988}; 5028};
4989 5029
5030/*
5031 * btrfs doesn't support the bmap operation because swapfiles
5032 * use bmap to make a mapping of extents in the file. They assume
5033 * these extents won't change over the life of the file and they
5034 * use the bmap result to do IO directly to the drive.
5035 *
5036 * the btrfs bmap call would return logical addresses that aren't
5037 * suitable for IO and they also will change frequently as COW
5038 * operations happen. So, swapfile + btrfs == corruption.
5039 *
5040 * For now we're avoiding this by dropping bmap.
5041 */
4990static struct address_space_operations btrfs_aops = { 5042static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage, 5043 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage, 5044 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages, 5045 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages, 5046 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page, 5047 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO, 5048 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage, 5049 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage, 5050 .releasepage = btrfs_releasepage,
@@ -5017,6 +5068,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5017 .removexattr = btrfs_removexattr, 5068 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission, 5069 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate, 5070 .fallocate = btrfs_fallocate,
5071 .fiemap = btrfs_fiemap,
5020}; 5072};
5021static struct inode_operations btrfs_special_inode_operations = { 5073static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr, 5074 .getattr = btrfs_getattr,
@@ -5032,4 +5084,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5032 .follow_link = page_follow_link_light, 5084 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link, 5085 .put_link = page_put_link,
5034 .permission = btrfs_permission, 5086 .permission = btrfs_permission,
5087 .setxattr = btrfs_setxattr,
5088 .getxattr = btrfs_getxattr,
5089 .listxattr = btrfs_listxattr,
5090 .removexattr = btrfs_removexattr,
5035}; 5091};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb..988fdc8b49e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
38#include <linux/compat.h> 38#include <linux/compat.h>
39#include <linux/bit_spinlock.h> 39#include <linux/bit_spinlock.h>
40#include <linux/security.h> 40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h> 41#include <linux/xattr.h>
43#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
44#include "compat.h" 43#include "compat.h"
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db..9ebe9385129 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,63 +26,213 @@
26#include "locking.h" 26#include "locking.h"
27 27
28/* 28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 29 * btrfs_header_level() isn't free, so don't call it when lockdep isn't
30 * and the spin is not tuned very extensively. The spinning does make a big 30 * on
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 31 */
32#ifdef CONFIG_DEBUG_LOCK_ALLOC
33static inline void spin_nested(struct extent_buffer *eb)
34{
35 spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
36}
37#else
38static inline void spin_nested(struct extent_buffer *eb)
39{
40 spin_lock(&eb->lock);
41}
42#endif
37 43
38int btrfs_tree_lock(struct extent_buffer *eb) 44/*
45 * Setting a lock to blocking will drop the spinlock and set the
46 * flag that forces other procs who want the lock to wait. After
47 * this you can safely schedule with the lock held.
48 */
49void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{ 50{
40 int i; 51 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
52 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
53 spin_unlock(&eb->lock);
54 }
55 /* exit with the spin lock released and the bit set */
56}
41 57
42 if (mutex_trylock(&eb->mutex)) 58/*
43 return 0; 59 * clearing the blocking flag will take the spinlock again.
60 * After this you can't safely schedule
61 */
62void btrfs_clear_lock_blocking(struct extent_buffer *eb)
63{
64 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
65 spin_nested(eb);
66 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
67 smp_mb__after_clear_bit();
68 }
69 /* exit with the spin lock held */
70}
71
72/*
73 * unfortunately, many of the places that currently set a lock to blocking
74 * don't end up blocking for every long, and often they don't block
75 * at all. For a dbench 50 run, if we don't spin one the blocking bit
76 * at all, the context switch rate can jump up to 400,000/sec or more.
77 *
78 * So, we're still stuck with this crummy spin on the blocking bit,
79 * at least until the most common causes of the short blocks
80 * can be dealt with.
81 */
82static int btrfs_spin_on_block(struct extent_buffer *eb)
83{
84 int i;
44 for (i = 0; i < 512; i++) { 85 for (i = 0; i < 512; i++) {
45 cpu_relax(); 86 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 87 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 88 return 1;
89 if (need_resched())
90 break;
48 } 91 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 92 return 0;
52} 93}
53 94
54int btrfs_try_tree_lock(struct extent_buffer *eb) 95/*
96 * This is somewhat different from trylock. It will take the
97 * spinlock but if it finds the lock is set to blocking, it will
98 * return without the lock held.
99 *
100 * returns 1 if it was able to take the lock and zero otherwise
101 *
102 * After this call, scheduling is not safe without first calling
103 * btrfs_set_lock_blocking()
104 */
105int btrfs_try_spin_lock(struct extent_buffer *eb)
55{ 106{
56 return mutex_trylock(&eb->mutex); 107 int i;
108
109 spin_nested(eb);
110 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
111 return 1;
112 spin_unlock(&eb->lock);
113
114 /* spin for a bit on the BLOCKING flag */
115 for (i = 0; i < 2; i++) {
116 if (!btrfs_spin_on_block(eb))
117 break;
118
119 spin_nested(eb);
120 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
121 return 1;
122 spin_unlock(&eb->lock);
123 }
124 return 0;
57} 125}
58 126
59int btrfs_tree_unlock(struct extent_buffer *eb) 127/*
128 * the autoremove wake function will return 0 if it tried to wake up
129 * a process that was already awake, which means that process won't
130 * count as an exclusive wakeup. The waitq code will continue waking
131 * procs until it finds one that was actually sleeping.
132 *
133 * For btrfs, this isn't quite what we want. We want a single proc
134 * to be notified that the lock is ready for taking. If that proc
135 * already happen to be awake, great, it will loop around and try for
136 * the lock.
137 *
138 * So, btrfs_wake_function always returns 1, even when the proc that we
139 * tried to wake up was already awake.
140 */
141static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
142 int sync, void *key)
60{ 143{
61 mutex_unlock(&eb->mutex); 144 autoremove_wake_function(wait, mode, sync, key);
62 return 0; 145 return 1;
63} 146}
64 147
65int btrfs_tree_locked(struct extent_buffer *eb) 148/*
149 * returns with the extent buffer spinlocked.
150 *
151 * This will spin and/or wait as required to take the lock, and then
152 * return with the spinlock held.
153 *
154 * After this call, scheduling is not safe without first calling
155 * btrfs_set_lock_blocking()
156 */
157int btrfs_tree_lock(struct extent_buffer *eb)
66{ 158{
67 return mutex_is_locked(&eb->mutex); 159 DEFINE_WAIT(wait);
160 wait.func = btrfs_wake_function;
161
162 while(1) {
163 spin_nested(eb);
164
165 /* nobody is blocking, exit with the spinlock held */
166 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
167 return 0;
168
169 /*
170 * we have the spinlock, but the real owner is blocking.
171 * wait for them
172 */
173 spin_unlock(&eb->lock);
174
175 /*
176 * spin for a bit, and if the blocking flag goes away,
177 * loop around
178 */
179 if (btrfs_spin_on_block(eb))
180 continue;
181
182 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
183 TASK_UNINTERRUPTIBLE);
184
185 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
186 schedule();
187
188 finish_wait(&eb->lock_wq, &wait);
189 }
190 return 0;
68} 191}
69 192
70/* 193/*
71 * btrfs_search_slot uses this to decide if it should drop its locks 194 * Very quick trylock, this does not spin or schedule. It returns
72 * before doing something expensive like allocating free blocks for cow. 195 * 1 with the spinlock held if it was able to take the lock, or it
196 * returns zero if it was unable to take the lock.
197 *
198 * After this call, scheduling is not safe without first calling
199 * btrfs_set_lock_blocking()
73 */ 200 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 201int btrfs_try_tree_lock(struct extent_buffer *eb)
75{ 202{
76 int i; 203 if (spin_trylock(&eb->lock)) {
77 struct extent_buffer *eb; 204 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 205 /*
79 eb = path->nodes[i]; 206 * we've got the spinlock, but the real owner is
80 if (!eb) 207 * blocking. Drop the spinlock and return failure
81 break; 208 */
82 smp_mb(); 209 spin_unlock(&eb->lock);
83 if (!list_empty(&eb->mutex.wait_list)) 210 return 0;
84 return 1; 211 }
212 return 1;
85 } 213 }
214 /* someone else has the spinlock giveup */
86 return 0; 215 return 0;
87} 216}
88 217
218int btrfs_tree_unlock(struct extent_buffer *eb)
219{
220 /*
221 * if we were a blocking owner, we don't have the spinlock held
222 * just clear the bit and look for waiters
223 */
224 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
225 smp_mb__after_clear_bit();
226 else
227 spin_unlock(&eb->lock);
228
229 if (waitqueue_active(&eb->lock_wq))
230 wake_up(&eb->lock_wq);
231 return 0;
232}
233
234int btrfs_tree_locked(struct extent_buffer *eb)
235{
236 return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
237 spin_is_locked(&eb->lock);
238}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef1251..6bb0afbff92 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,10 @@
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24int btrfs_tree_locked(struct extent_buffer *eb);
25
25int btrfs_try_tree_lock(struct extent_buffer *eb); 26int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 27int btrfs_try_spin_lock(struct extent_buffer *eb);
28
29void btrfs_set_lock_blocking(struct extent_buffer *eb);
30void btrfs_clear_lock_blocking(struct extent_buffer *eb);
27#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a2094017027..77c2411a5f0 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
613 struct btrfs_sector_sum *sector_sums; 613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered; 614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors; 616 unsigned long num_sectors;
618 unsigned long i; 617 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
624 return 1; 623 return 1;
625 624
626 mutex_lock(&tree->mutex); 625 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) { 626 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) { 627 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize; 628 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums; 629 sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9ea..d0cc62bccb9 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "ref-cache.h" 22#include "ref-cache.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c5..bc283ad2db7 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared); 74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif 76#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db9fb3bc1e3..f3fd7e2cbc3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
37#include <linux/ctype.h> 37#include <linux/ctype.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h> 40#include <linux/magic.h>
42#include "compat.h" 41#include "compat.h"
43#include "ctree.h" 42#include "ctree.h"
@@ -583,17 +582,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
583 struct btrfs_ioctl_vol_args *vol; 582 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices; 583 struct btrfs_fs_devices *fs_devices;
585 int ret = -ENOTTY; 584 int ret = -ENOTTY;
586 int len;
587 585
588 if (!capable(CAP_SYS_ADMIN)) 586 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM; 587 return -EPERM;
590 588
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 589 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
590 if (!vol)
591 return -ENOMEM;
592
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 593 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT; 594 ret = -EFAULT;
594 goto out; 595 goto out;
595 } 596 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 597
598 switch (cmd) { 598 switch (cmd) {
599 case BTRFS_IOC_SCAN_DEV: 599 case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f944334..919172de5c9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -852,11 +852,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
852{ 852{
853 struct btrfs_pending_snapshot *pending; 853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots; 854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret; 855 int ret;
857 856
858 list_for_each(cur, head) { 857 list_for_each_entry(pending, head, list) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending); 858 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret); 859 BUG_ON(ret);
862 } 860 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c3616..98d25fa4570 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077..20794290256 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1704 1615
1705 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1707 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1709 1621
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1750 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1751 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1753 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1755 1668
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1807 1720
1808 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1810 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1812 1726
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1879 1793
1880 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1882 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1884 1799
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1817 }
1903 } 1818 }
1904 btrfs_free_path(path); 1819 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1820 return ret;
1908} 1821}
1909 1822
1910static int wait_log_commit(struct btrfs_root *log) 1823/*
1824 * helper function to update the item for a given subvolumes log root
1825 * in the tree of log roots
1826 */
1827static int update_log_root(struct btrfs_trans_handle *trans,
1828 struct btrfs_root *log)
1829{
1830 int ret;
1831
1832 if (log->log_transid == 1) {
1833 /* insert root item on the first sync */
1834 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1835 &log->root_key, &log->root_item);
1836 } else {
1837 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1838 &log->root_key, &log->root_item);
1839 }
1840 return ret;
1841}
1842
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1844{
1912 DEFINE_WAIT(wait); 1845 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1846 int index = transid % 2;
1914 1847
1848 /*
1849 * we only allow two pending log transactions at a time,
1850 * so we know that if ours is more than 2 older than the
1851 * current transaction, we're done
1852 */
1915 do { 1853 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1854 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1855 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1856 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1857 if (root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index]))
1920 schedule(); 1859 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1860 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1861 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1862 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1863 atomic_read(&root->log_commit[index]));
1864 return 0;
1865}
1866
1867static int wait_for_writer(struct btrfs_root *root)
1868{
1869 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers))
1875 schedule();
1876 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait);
1878 }
1925 return 0; 1879 return 0;
1926} 1880}
1927 1881
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1887int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1888 struct btrfs_root *root)
1935{ 1889{
1890 int index1;
1891 int index2;
1936 int ret; 1892 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1893 struct btrfs_root *log = root->log_root;
1894 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1895
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1896 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1897 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1898 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1899 wait_log_commit(root, root->log_transid);
1900 mutex_unlock(&root->log_mutex);
1901 return 0;
1944 } 1902 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1903 atomic_set(&root->log_commit[index1], 1);
1904
1905 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1);
1946 1908
1947 while (1) { 1909 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1910 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1911 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1912 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1913 mutex_lock(&root->log_mutex);
1952 1914 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1915 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1916 break;
1965 } 1917 }
1966 1918
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1920 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1921
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1922 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1923 btrfs_set_root_generation(&log->root_item, trans->transid);
1924 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1925
1926 root->log_batch = 0;
1927 root->log_transid++;
1928 log->log_transid = root->log_transid;
1929 smp_mb();
1930 /*
1931 * log tree has been flushed to disk, new modifications of
1932 * the log will be written to new positions. so it's safe to
1933 * allow log writers to go in.
1934 */
1935 mutex_unlock(&root->log_mutex);
1936
1937 mutex_lock(&log_root_tree->log_mutex);
1938 log_root_tree->log_batch++;
1939 atomic_inc(&log_root_tree->log_writers);
1940 mutex_unlock(&log_root_tree->log_mutex);
1941
1942 ret = update_log_root(trans, log);
1943 BUG_ON(ret);
1944
1945 mutex_lock(&log_root_tree->log_mutex);
1946 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1947 smp_mb();
1948 if (waitqueue_active(&log_root_tree->log_writer_wait))
1949 wake_up(&log_root_tree->log_writer_wait);
1950 }
1951
1952 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out;
1957 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1);
1959
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1962
1963 wait_for_writer(log_root_tree);
1964
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1967 BUG_ON(ret);
1972 1968
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1969 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1970 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1971 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1972 btrfs_header_level(log_root_tree->node));
1973
1974 log_root_tree->log_batch = 0;
1975 log_root_tree->log_transid++;
1976 smp_mb();
1977
1978 mutex_unlock(&log_root_tree->log_mutex);
1979
1980 /*
1981 * nobody else is going to jump in and write the the ctree
1982 * super here because the log_commit atomic below is protecting
1983 * us. We must be called with a transaction handle pinning
1984 * the running transaction open, so a full commit can't hop
1985 * in and cause problems either.
1986 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1988
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1989 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1990 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1992 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1993out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1994 atomic_set(&root->log_commit[index1], 0);
1995 smp_mb();
1996 if (waitqueue_active(&root->log_commit_wait[index1]))
1997 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1998 return 0;
1988} 1999}
1989 2000
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2030 start, end, GFP_NOFS);
2020 } 2031 }
2021 2032
2022 log = root->log_root; 2033 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2034 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2035 &log->root_key);
2025 BUG_ON(ret); 2036 BUG_ON(ret);
2037 }
2026 root->log_root = NULL; 2038 root->log_root = NULL;
2027 kfree(root->log_root); 2039 free_extent_buffer(log->node);
2040 kfree(log);
2028 return 0; 2041 return 0;
2029} 2042}
2030 2043
2031/* 2044/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2045 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2046 * mixed in, we have a few interesting corners:
2056 * 2047 *
@@ -2711,11 +2702,6 @@ next_slot:
2711 2702
2712 btrfs_free_path(path); 2703 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2704 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2705out:
2720 return 0; 2706 return 0;
2721} 2707}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1cca2b..bcd14ebccae 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h> 23#include <asm/div64.h>
25#include "compat.h" 24#include "compat.h"
26#include "ctree.h" 25#include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid) 103 u64 devid, u8 *uuid)
105{ 104{
106 struct btrfs_device *dev; 105 struct btrfs_device *dev;
107 struct list_head *cur;
108 106
109 list_for_each(cur, head) { 107 list_for_each_entry(dev, head, dev_list) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid && 108 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 109 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev; 110 return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
118 115
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 116static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{ 117{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices; 118 struct btrfs_fs_devices *fs_devices;
123 119
124 list_for_each(cur, &fs_uuids) { 120 list_for_each_entry(fs_devices, &fs_uuids, list) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 121 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices; 122 return fs_devices;
128 } 123 }
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
159loop: 154loop:
160 spin_lock(&device->io_lock); 155 spin_lock(&device->io_lock);
161 156
157loop_lock:
162 /* take all the bios off the list at once and process them 158 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the 159 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted 160 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
208 * is now congested. Back off and let other work structs 204 * is now congested. Back off and let other work structs
209 * run instead 205 * run instead
210 */ 206 */
211 if (pending && bdi_write_congested(bdi) && 207 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
212 fs_info->fs_devices->open_devices > 1) { 208 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head; 209 struct bio *old_head;
214 210
@@ -220,7 +216,8 @@ loop:
220 tail->bi_next = old_head; 216 tail->bi_next = old_head;
221 else 217 else
222 device->pending_bio_tail = tail; 218 device->pending_bio_tail = tail;
223 device->running_pending = 0; 219
220 device->running_pending = 1;
224 221
225 spin_unlock(&device->io_lock); 222 spin_unlock(&device->io_lock);
226 btrfs_requeue_work(&device->work); 223 btrfs_requeue_work(&device->work);
@@ -229,6 +226,11 @@ loop:
229 } 226 }
230 if (again) 227 if (again)
231 goto loop; 228 goto loop;
229
230 spin_lock(&device->io_lock);
231 if (device->pending_bios)
232 goto loop_lock;
233 spin_unlock(&device->io_lock);
232done: 234done:
233 return 0; 235 return 0;
234} 236}
@@ -345,14 +347,11 @@ error:
345 347
346int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 348int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
347{ 349{
348 struct list_head *tmp; 350 struct btrfs_device *device, *next;
349 struct list_head *cur;
350 struct btrfs_device *device;
351 351
352 mutex_lock(&uuid_mutex); 352 mutex_lock(&uuid_mutex);
353again: 353again:
354 list_for_each_safe(cur, tmp, &fs_devices->devices) { 354 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
355 device = list_entry(cur, struct btrfs_device, dev_list);
356 if (device->in_fs_metadata) 355 if (device->in_fs_metadata)
357 continue; 356 continue;
358 357
@@ -383,14 +382,12 @@ again:
383 382
384static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
385{ 384{
386 struct list_head *cur;
387 struct btrfs_device *device; 385 struct btrfs_device *device;
388 386
389 if (--fs_devices->opened > 0) 387 if (--fs_devices->opened > 0)
390 return 0; 388 return 0;
391 389
392 list_for_each(cur, &fs_devices->devices) { 390 list_for_each_entry(device, &fs_devices->devices, dev_list) {
393 device = list_entry(cur, struct btrfs_device, dev_list);
394 if (device->bdev) { 391 if (device->bdev) {
395 close_bdev_exclusive(device->bdev, device->mode); 392 close_bdev_exclusive(device->bdev, device->mode);
396 fs_devices->open_devices--; 393 fs_devices->open_devices--;
@@ -439,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
439{ 436{
440 struct block_device *bdev; 437 struct block_device *bdev;
441 struct list_head *head = &fs_devices->devices; 438 struct list_head *head = &fs_devices->devices;
442 struct list_head *cur;
443 struct btrfs_device *device; 439 struct btrfs_device *device;
444 struct block_device *latest_bdev = NULL; 440 struct block_device *latest_bdev = NULL;
445 struct buffer_head *bh; 441 struct buffer_head *bh;
@@ -450,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
450 int seeding = 1; 446 int seeding = 1;
451 int ret = 0; 447 int ret = 0;
452 448
453 list_for_each(cur, head) { 449 list_for_each_entry(device, head, dev_list) {
454 device = list_entry(cur, struct btrfs_device, dev_list);
455 if (device->bdev) 450 if (device->bdev)
456 continue; 451 continue;
457 if (!device->name) 452 if (!device->name)
@@ -578,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
578 *(unsigned long long *)disk_super->fsid, 573 *(unsigned long long *)disk_super->fsid,
579 *(unsigned long long *)(disk_super->fsid + 8)); 574 *(unsigned long long *)(disk_super->fsid + 8));
580 } 575 }
581 printk(KERN_INFO "devid %llu transid %llu %s\n", 576 printk(KERN_CONT "devid %llu transid %llu %s\n",
582 (unsigned long long)devid, (unsigned long long)transid, path); 577 (unsigned long long)devid, (unsigned long long)transid, path);
583 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 578 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
584 579
@@ -1017,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1017 } 1012 }
1018 1013
1019 if (strcmp(device_path, "missing") == 0) { 1014 if (strcmp(device_path, "missing") == 0) {
1020 struct list_head *cur;
1021 struct list_head *devices; 1015 struct list_head *devices;
1022 struct btrfs_device *tmp; 1016 struct btrfs_device *tmp;
1023 1017
1024 device = NULL; 1018 device = NULL;
1025 devices = &root->fs_info->fs_devices->devices; 1019 devices = &root->fs_info->fs_devices->devices;
1026 list_for_each(cur, devices) { 1020 list_for_each_entry(tmp, devices, dev_list) {
1027 tmp = list_entry(cur, struct btrfs_device, dev_list);
1028 if (tmp->in_fs_metadata && !tmp->bdev) { 1021 if (tmp->in_fs_metadata && !tmp->bdev) {
1029 device = tmp; 1022 device = tmp;
1030 break; 1023 break;
@@ -1280,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1280 struct btrfs_trans_handle *trans; 1273 struct btrfs_trans_handle *trans;
1281 struct btrfs_device *device; 1274 struct btrfs_device *device;
1282 struct block_device *bdev; 1275 struct block_device *bdev;
1283 struct list_head *cur;
1284 struct list_head *devices; 1276 struct list_head *devices;
1285 struct super_block *sb = root->fs_info->sb; 1277 struct super_block *sb = root->fs_info->sb;
1286 u64 total_bytes; 1278 u64 total_bytes;
@@ -1304,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1304 mutex_lock(&root->fs_info->volume_mutex); 1296 mutex_lock(&root->fs_info->volume_mutex);
1305 1297
1306 devices = &root->fs_info->fs_devices->devices; 1298 devices = &root->fs_info->fs_devices->devices;
1307 list_for_each(cur, devices) { 1299 list_for_each_entry(device, devices, dev_list) {
1308 device = list_entry(cur, struct btrfs_device, dev_list);
1309 if (device->bdev == bdev) { 1300 if (device->bdev == bdev) {
1310 ret = -EEXIST; 1301 ret = -EEXIST;
1311 goto error; 1302 goto error;
@@ -1704,7 +1695,6 @@ static u64 div_factor(u64 num, int factor)
1704int btrfs_balance(struct btrfs_root *dev_root) 1695int btrfs_balance(struct btrfs_root *dev_root)
1705{ 1696{
1706 int ret; 1697 int ret;
1707 struct list_head *cur;
1708 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1698 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1709 struct btrfs_device *device; 1699 struct btrfs_device *device;
1710 u64 old_size; 1700 u64 old_size;
@@ -1723,8 +1713,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1723 dev_root = dev_root->fs_info->dev_root; 1713 dev_root = dev_root->fs_info->dev_root;
1724 1714
1725 /* step one make some room on all the devices */ 1715 /* step one make some room on all the devices */
1726 list_for_each(cur, devices) { 1716 list_for_each_entry(device, devices, dev_list) {
1727 device = list_entry(cur, struct btrfs_device, dev_list);
1728 old_size = device->total_bytes; 1717 old_size = device->total_bytes;
1729 size_to_free = div_factor(old_size, 1); 1718 size_to_free = div_factor(old_size, 1);
1730 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1719 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e27089..a9d3bf4d268 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "btrfs_inode.h" 26#include "btrfs_inode.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
45 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0); 48 strlen(name), 0);
48 if (!di || IS_ERR(di)) { 49 if (!di) {
49 ret = -ENODATA; 50 ret = -ENODATA;
50 goto out; 51 goto out;
52 } else if (IS_ERR(di)) {
53 ret = PTR_ERR(di);
54 goto out;
51 } 55 }
52 56
53 leaf = path->nodes[0]; 57 leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
62 ret = -ERANGE; 66 ret = -ERANGE;
63 goto out; 67 goto out;
64 } 68 }
69
70 /*
71 * The way things are packed into the leaf is like this
72 * |struct btrfs_dir_item|name|data|
73 * where name is the xattr name, so security.foo, and data is the
74 * content of the xattr. data_ptr points to the location in memory
75 * where the data starts in the in memory leaf
76 */
65 data_ptr = (unsigned long)((char *)(di + 1) + 77 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di)); 78 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr, 79 read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
86 if (!path) 98 if (!path)
87 return -ENOMEM; 99 return -ENOMEM;
88 100
89 trans = btrfs_start_transaction(root, 1); 101 trans = btrfs_join_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode); 102 btrfs_set_trans_block_group(trans, inode);
91 103
92 /* first lets see if we already have this xattr */ 104 /* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0) 189 if (ret < 0)
178 goto err; 190 goto err;
179 ret = 0;
180 advance = 0; 191 advance = 0;
181 while (1) { 192 while (1) {
182 leaf = path->nodes[0]; 193 leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
320 return -EOPNOTSUPP; 331 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322} 333}
334
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
336{
337 int err;
338 size_t len;
339 void *value;
340 char *suffix;
341 char *name;
342
343 err = security_inode_init_security(inode, dir, &suffix, &value, &len);
344 if (err) {
345 if (err == -EOPNOTSUPP)
346 return 0;
347 return err;
348 }
349
350 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
351 GFP_NOFS);
352 if (!name) {
353 err = -ENOMEM;
354 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0);
358 kfree(name);
359 }
360
361 kfree(suffix);
362 kfree(value);
363 return err;
364}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68..c71e9c3cf3f 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
40
39#endif /* __XATTR__ */ 41#endif /* __XATTR__ */
diff --git a/fs/buffer.c b/fs/buffer.c
index b58208f1640..665d446b25b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2688,7 +2688,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
2688 struct buffer_head *bh; 2688 struct buffer_head *bh;
2689 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2689 BUG_ON(fsdata != NULL && page_has_buffers(page));
2690 2690
2691 if (unlikely(copied < len) && !page_has_buffers(page)) 2691 if (unlikely(copied < len) && head)
2692 attach_nobh_buffers(page, head); 2692 attach_nobh_buffers(page, head);
2693 if (page_has_buffers(page)) 2693 if (page_has_buffers(page))
2694 return generic_write_end(file, mapping, pos, len, 2694 return generic_write_end(file, mapping, pos, len,
diff --git a/fs/compat.c b/fs/compat.c
index 65a070e705a..d0145ca2757 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename,
1407 bprm->cred = prepare_exec_creds(); 1407 bprm->cred = prepare_exec_creds();
1408 if (!bprm->cred) 1408 if (!bprm->cred)
1409 goto out_unlock; 1409 goto out_unlock;
1410 check_unsafe_exec(bprm); 1410 check_unsafe_exec(bprm, current->files);
1411 1411
1412 file = open_exec(filename); 1412 file = open_exec(filename);
1413 retval = PTR_ERR(file); 1413 retval = PTR_ERR(file);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c8f8d5904f5..9c6d815dd19 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -785,7 +785,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
785 785
786 if (copy_in_user(&sgio->status, &sgio32->status, 786 if (copy_in_user(&sgio->status, &sgio32->status,
787 (4 * sizeof(unsigned char)) + 787 (4 * sizeof(unsigned char)) +
788 (2 * sizeof(unsigned (short))) + 788 (2 * sizeof(unsigned short)) +
789 (3 * sizeof(int)))) 789 (3 * sizeof(int))))
790 return -EFAULT; 790 return -EFAULT;
791 791
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c01e043670e..f6caeb1d110 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1716,7 +1716,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1716{ 1716{
1717 int rc = 0; 1717 int rc = 0;
1718 1718
1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); 1719 (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
1720 if (!(*copied_name)) { 1720 if (!(*copied_name)) {
1721 rc = -ENOMEM; 1721 rc = -ENOMEM;
1722 goto out; 1722 goto out;
@@ -1726,7 +1726,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1726 * in printing out the 1726 * in printing out the
1727 * string in debug 1727 * string in debug
1728 * messages */ 1728 * messages */
1729 (*copied_name_size) = (name_size + 1); 1729 (*copied_name_size) = name_size;
1730out: 1730out:
1731 return rc; 1731 return rc;
1732} 1732}
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a01f1b..929b58004b7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds);
1049 * - the caller must hold current->cred_exec_mutex to protect against 1049 * - the caller must hold current->cred_exec_mutex to protect against
1050 * PTRACE_ATTACH 1050 * PTRACE_ATTACH
1051 */ 1051 */
1052void check_unsafe_exec(struct linux_binprm *bprm) 1052void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
1053{ 1053{
1054 struct task_struct *p = current; 1054 struct task_struct *p = current, *t;
1055 unsigned long flags;
1056 unsigned n_fs, n_files, n_sighand;
1055 1057
1056 bprm->unsafe = tracehook_unsafe_exec(p); 1058 bprm->unsafe = tracehook_unsafe_exec(p);
1057 1059
1058 if (atomic_read(&p->fs->count) > 1 || 1060 n_fs = 1;
1059 atomic_read(&p->files->count) > 1 || 1061 n_files = 1;
1060 atomic_read(&p->sighand->count) > 1) 1062 n_sighand = 1;
1063 lock_task_sighand(p, &flags);
1064 for (t = next_thread(p); t != p; t = next_thread(t)) {
1065 if (t->fs == p->fs)
1066 n_fs++;
1067 if (t->files == files)
1068 n_files++;
1069 n_sighand++;
1070 }
1071
1072 if (atomic_read(&p->fs->count) > n_fs ||
1073 atomic_read(&p->files->count) > n_files ||
1074 atomic_read(&p->sighand->count) > n_sighand)
1061 bprm->unsafe |= LSM_UNSAFE_SHARE; 1075 bprm->unsafe |= LSM_UNSAFE_SHARE;
1076
1077 unlock_task_sighand(p, &flags);
1062} 1078}
1063 1079
1064/* 1080/*
@@ -1273,7 +1289,7 @@ int do_execve(char * filename,
1273 bprm->cred = prepare_exec_creds(); 1289 bprm->cred = prepare_exec_creds();
1274 if (!bprm->cred) 1290 if (!bprm->cred)
1275 goto out_unlock; 1291 goto out_unlock;
1276 check_unsafe_exec(bprm); 1292 check_unsafe_exec(bprm, displaced);
1277 1293
1278 file = open_exec(filename); 1294 file = open_exec(filename);
1279 retval = PTR_ERR(file); 1295 retval = PTR_ERR(file);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6..7c6e3606f0e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1185 es = sbi->s_es; 1185 es = sbi->s_es;
1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1187 (old_mount_opt & EXT2_MOUNT_XIP)) && 1187 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1188 invalidate_inodes(sb)) 1188 invalidate_inodes(sb)) {
1189 ext2_warning(sb, __func__, "busy inodes while remounting "\ 1189 ext2_warning(sb, __func__, "refusing change of xip flag "
1190 "xip remain in cache (no functional problem)"); 1190 "with busy inodes while remounting");
1191 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1192 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1193 }
1191 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1194 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1192 return 0; 1195 return 0;
1193 if (*flags & MS_RDONLY) { 1196 if (*flags & MS_RDONLY) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e08a3..4a970411a45 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb)
2428 2428
2429static int ext3_sync_fs(struct super_block *sb, int wait) 2429static int ext3_sync_fs(struct super_block *sb, int wait)
2430{ 2430{
2431 sb->s_dirt = 0; 2431 tid_t target;
2432 if (wait)
2433 ext3_force_commit(sb);
2434 else
2435 journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
2436 2432
2433 sb->s_dirt = 0;
2434 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2435 if (wait)
2436 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2437 }
2437 return 0; 2438 return 0;
2438} 2439}
2439 2440
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af03..9b800d97a68 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
108 108
109 if (hugetlb_reserve_pages(inode, 109 if (hugetlb_reserve_pages(inode,
110 vma->vm_pgoff >> huge_page_order(h), 110 vma->vm_pgoff >> huge_page_order(h),
111 len >> huge_page_shift(h), vma)) 111 len >> huge_page_shift(h), vma,
112 vma->vm_flags))
112 goto out; 113 goto out;
113 114
114 ret = 0; 115 ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
947 can_do_mlock()); 948 can_do_mlock());
948} 949}
949 950
950struct file *hugetlb_file_setup(const char *name, size_t size) 951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951{ 952{
952 int error = -ENOMEM; 953 int error = -ENOMEM;
953 struct file *file; 954 struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
981 982
982 error = -ENOMEM; 983 error = -ENOMEM;
983 if (hugetlb_reserve_pages(inode, 0, 984 if (hugetlb_reserve_pages(inode, 0,
984 size >> huge_page_shift(hstate_inode(inode)), NULL)) 985 size >> huge_page_shift(hstate_inode(inode)), NULL,
986 acctflag))
985 goto out_inode; 987 goto out_inode;
986 988
987 d_instantiate(dentry, inode); 989 d_instantiate(dentry, inode);
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f173..0d8ac497b3d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,7 +43,7 @@ extern void __init chrdev_init(void);
43/* 43/*
44 * exec.c 44 * exec.c
45 */ 45 */
46extern void check_unsafe_exec(struct linux_binprm *); 46extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);
47 47
48/* 48/*
49 * namespace.c 49 * namespace.c
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52d7dc..e79c07812af 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
427} 427}
428 428
429/* 429/*
430 * Called under j_state_lock. Returns true if a transaction was started. 430 * Called under j_state_lock. Returns true if a transaction commit was started.
431 */ 431 */
432int __log_start_commit(journal_t *journal, tid_t target) 432int __log_start_commit(journal_t *journal, tid_t target)
433{ 433{
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
495 495
496/* 496/*
497 * Start a commit of the current running transaction (if any). Returns true 497 * Start a commit of the current running transaction (if any). Returns true
498 * if a transaction was started, and fills its tid in at *ptid 498 * if a transaction is going to be committed (or is currently already
499 * committing), and fills its tid in at *ptid
499 */ 500 */
500int journal_start_commit(journal_t *journal, tid_t *ptid) 501int journal_start_commit(journal_t *journal, tid_t *ptid)
501{ 502{
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
505 if (journal->j_running_transaction) { 506 if (journal->j_running_transaction) {
506 tid_t tid = journal->j_running_transaction->t_tid; 507 tid_t tid = journal->j_running_transaction->t_tid;
507 508
508 ret = __log_start_commit(journal, tid); 509 __log_start_commit(journal, tid);
509 if (ret && ptid) 510 /* There's a running transaction and we've just made sure
511 * it's commit has been scheduled. */
512 if (ptid)
510 *ptid = tid; 513 *ptid = tid;
511 } else if (journal->j_committing_transaction && ptid) { 514 ret = 1;
515 } else if (journal->j_committing_transaction) {
512 /* 516 /*
513 * If ext3_write_super() recently started a commit, then we 517 * If ext3_write_super() recently started a commit, then we
514 * have to wait for completion of that transaction 518 * have to wait for completion of that transaction
515 */ 519 */
516 *ptid = journal->j_committing_transaction->t_tid; 520 if (ptid)
521 *ptid = journal->j_committing_transaction->t_tid;
517 ret = 1; 522 ret = 1;
518 } 523 }
519 spin_unlock(&journal->j_state_lock); 524 spin_unlock(&journal->j_state_lock);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6063a8e4b9f..763b78a6e9d 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 ret = nlm_lck_denied; 429 ret = nlm_lck_denied;
430 goto out; 430 break;
431 case FILE_LOCK_DEFERRED: 431 case FILE_LOCK_DEFERRED:
432 if (wait) 432 if (wait)
433 break; 433 break;
@@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 443 goto out;
444 } 444 }
445 445
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
446 ret = nlm_lck_blocked; 450 ret = nlm_lck_blocked;
447 451
448 /* Append to list of blocked */ 452 /* Append to list of blocked */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d861096c9d8..60fe74035db 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5390,6 +5390,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
5390 goto out; 5390 goto out;
5391 } 5391 }
5392 5392
5393 vfs_dq_free_space_nodirty(inode,
5394 ocfs2_clusters_to_bytes(inode->i_sb, len));
5395
5393 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5396 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5394 dealloc); 5397 dealloc);
5395 if (ret) { 5398 if (ret) {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e8..e9d7c2038c0 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
38#include "dlmglue.h" 38#include "dlmglue.h"
39#include "file.h" 39#include "file.h"
40#include "inode.h" 40#include "inode.h"
41#include "super.h"
41 42
42 43
43static int ocfs2_dentry_revalidate(struct dentry *dentry, 44static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
294 return ret; 295 return ret;
295} 296}
296 297
298static DEFINE_SPINLOCK(dentry_list_lock);
299
300/* We limit the number of dentry locks to drop in one go. We have
301 * this limit so that we don't starve other users of ocfs2_wq. */
302#define DL_INODE_DROP_COUNT 64
303
304/* Drop inode references from dentry locks */
305void ocfs2_drop_dl_inodes(struct work_struct *work)
306{
307 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
308 dentry_lock_work);
309 struct ocfs2_dentry_lock *dl;
310 int drop_count = DL_INODE_DROP_COUNT;
311
312 spin_lock(&dentry_list_lock);
313 while (osb->dentry_lock_list && drop_count--) {
314 dl = osb->dentry_lock_list;
315 osb->dentry_lock_list = dl->dl_next;
316 spin_unlock(&dentry_list_lock);
317 iput(dl->dl_inode);
318 kfree(dl);
319 spin_lock(&dentry_list_lock);
320 }
321 if (osb->dentry_lock_list)
322 queue_work(ocfs2_wq, &osb->dentry_lock_work);
323 spin_unlock(&dentry_list_lock);
324}
325
297/* 326/*
298 * ocfs2_dentry_iput() and friends. 327 * ocfs2_dentry_iput() and friends.
299 * 328 *
@@ -318,16 +347,23 @@ out_attach:
318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 347static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
319 struct ocfs2_dentry_lock *dl) 348 struct ocfs2_dentry_lock *dl)
320{ 349{
321 iput(dl->dl_inode);
322 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 350 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
323 ocfs2_lock_res_free(&dl->dl_lockres); 351 ocfs2_lock_res_free(&dl->dl_lockres);
324 kfree(dl); 352
353 /* We leave dropping of inode reference to ocfs2_wq as that can
354 * possibly lead to inode deletion which gets tricky */
355 spin_lock(&dentry_list_lock);
356 if (!osb->dentry_lock_list)
357 queue_work(ocfs2_wq, &osb->dentry_lock_work);
358 dl->dl_next = osb->dentry_lock_list;
359 osb->dentry_lock_list = dl;
360 spin_unlock(&dentry_list_lock);
325} 361}
326 362
327void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 363void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
328 struct ocfs2_dentry_lock *dl) 364 struct ocfs2_dentry_lock *dl)
329{ 365{
330 int unlock = 0; 366 int unlock;
331 367
332 BUG_ON(dl->dl_count == 0); 368 BUG_ON(dl->dl_count == 0);
333 369
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d988..d06e16c0664 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
29extern struct dentry_operations ocfs2_dentry_ops; 29extern struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */
32 unsigned int dl_count; 33 unsigned int dl_count;
33 u64 dl_parent_blkno; 34 union {
35 /* Linked list of dentry locks to release */
36 struct ocfs2_dentry_lock *dl_next;
37 u64 dl_parent_blkno;
38 };
34 39
35 /* 40 /*
36 * The ocfs2_dentry_lock keeps an inode reference until 41 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
47void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 52void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
48 struct ocfs2_dentry_lock *dl); 53 struct ocfs2_dentry_lock *dl);
49 54
55void ocfs2_drop_dl_inodes(struct work_struct *work);
56
50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, 57struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
51 int skip_unhashed); 58 int skip_unhashed);
52 59
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b0c4cadd4c4..206a2370876 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2860,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
2860 case OCFS2_UNLOCK_CANCEL_CONVERT: 2860 case OCFS2_UNLOCK_CANCEL_CONVERT:
2861 mlog(0, "Cancel convert success for %s\n", lockres->l_name); 2861 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2862 lockres->l_action = OCFS2_AST_INVALID; 2862 lockres->l_action = OCFS2_AST_INVALID;
2863 /* Downconvert thread may have requeued this lock, we
2864 * need to wake it. */
2865 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2866 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
2863 break; 2867 break;
2864 case OCFS2_UNLOCK_DROP_LOCK: 2868 case OCFS2_UNLOCK_DROP_LOCK:
2865 lockres->l_level = DLM_LOCK_IV; 2869 lockres->l_level = DLM_LOCK_IV;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ad5c24a29ed..077384135f4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -210,6 +210,7 @@ struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery; 212struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock;
213struct ocfs2_super 214struct ocfs2_super
214{ 215{
215 struct task_struct *commit_task; 216 struct task_struct *commit_task;
@@ -325,6 +326,11 @@ struct ocfs2_super
325 struct list_head blocked_lock_list; 326 struct list_head blocked_lock_list;
326 unsigned long blocked_lock_count; 327 unsigned long blocked_lock_count;
327 328
329 /* List of dentry locks to release. Anyone can add locks to
330 * the list, ocfs2_wq processes the list */
331 struct ocfs2_dentry_lock *dentry_lock_list;
332 struct work_struct dentry_lock_work;
333
328 wait_queue_head_t osb_mount_event; 334 wait_queue_head_t osb_mount_event;
329 335
330 /* Truncate log info */ 336 /* Truncate log info */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f4efa89baee..1ed0f7c8686 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
754 if (dquot->dq_flags & mask) 754 if (dquot->dq_flags & mask)
755 sync = 1; 755 sync = 1;
756 spin_unlock(&dq_data_lock); 756 spin_unlock(&dq_data_lock);
757 if (!sync) { 757 /* This is a slight hack but we can't afford getting global quota
758 * lock if we already have a transaction started. */
759 if (!sync || journal_current_handle()) {
758 status = ocfs2_write_dquot(dquot); 760 status = ocfs2_write_dquot(dquot);
759 goto out; 761 goto out;
760 } 762 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 43ed11345b5..b1cb38fbe80 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1887,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1887 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 1887 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
1888 journal->j_state = OCFS2_JOURNAL_FREE; 1888 journal->j_state = OCFS2_JOURNAL_FREE;
1889 1889
1890 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
1891 osb->dentry_lock_list = NULL;
1892
1890 /* get some pseudo constants for clustersize bits */ 1893 /* get some pseudo constants for clustersize bits */
1891 osb->s_clustersize_bits = 1894 osb->s_clustersize_bits =
1892 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 1895 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e1d638af6ac..915039fffe6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4729,13 +4729,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4729 vb.vb_xv = (struct ocfs2_xattr_value_root *) 4729 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4730 (vb.vb_bh->b_data + offset % blocksize); 4730 (vb.vb_bh->b_data + offset % blocksize);
4731 4731
4732 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4733 OCFS2_JOURNAL_ACCESS_WRITE);
4734 if (ret) {
4735 mlog_errno(ret);
4736 goto out;
4737 }
4738
4739 /* 4732 /*
4740 * From here on out we have to dirty the bucket. The generic 4733 * From here on out we have to dirty the bucket. The generic
4741 * value calls only modify one of the bucket's bhs, but we need 4734 * value calls only modify one of the bucket's bhs, but we need
@@ -4748,12 +4741,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4748 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); 4741 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4749 if (ret) { 4742 if (ret) {
4750 mlog_errno(ret); 4743 mlog_errno(ret);
4751 goto out_dirty; 4744 goto out;
4745 }
4746
4747 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4748 OCFS2_JOURNAL_ACCESS_WRITE);
4749 if (ret) {
4750 mlog_errno(ret);
4751 goto out;
4752 } 4752 }
4753 4753
4754 xe->xe_value_size = cpu_to_le64(len); 4754 xe->xe_value_size = cpu_to_le64(len);
4755 4755
4756out_dirty:
4757 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); 4756 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4758 4757
4759out: 4758out:
diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1c4dc..5267098532b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -54,6 +54,64 @@ int seq_open(struct file *file, const struct seq_operations *op)
54} 54}
55EXPORT_SYMBOL(seq_open); 55EXPORT_SYMBOL(seq_open);
56 56
57static int traverse(struct seq_file *m, loff_t offset)
58{
59 loff_t pos = 0, index;
60 int error = 0;
61 void *p;
62
63 m->version = 0;
64 index = 0;
65 m->count = m->from = 0;
66 if (!offset) {
67 m->index = index;
68 return 0;
69 }
70 if (!m->buf) {
71 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
72 if (!m->buf)
73 return -ENOMEM;
74 }
75 p = m->op->start(m, &index);
76 while (p) {
77 error = PTR_ERR(p);
78 if (IS_ERR(p))
79 break;
80 error = m->op->show(m, p);
81 if (error < 0)
82 break;
83 if (unlikely(error)) {
84 error = 0;
85 m->count = 0;
86 }
87 if (m->count == m->size)
88 goto Eoverflow;
89 if (pos + m->count > offset) {
90 m->from = offset - pos;
91 m->count -= m->from;
92 m->index = index;
93 break;
94 }
95 pos += m->count;
96 m->count = 0;
97 if (pos == offset) {
98 index++;
99 m->index = index;
100 break;
101 }
102 p = m->op->next(m, p, &index);
103 }
104 m->op->stop(m, p);
105 m->index = index;
106 return error;
107
108Eoverflow:
109 m->op->stop(m, p);
110 kfree(m->buf);
111 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
112 return !m->buf ? -ENOMEM : -EAGAIN;
113}
114
57/** 115/**
58 * seq_read - ->read() method for sequential files. 116 * seq_read - ->read() method for sequential files.
59 * @file: the file to read from 117 * @file: the file to read from
@@ -186,63 +244,6 @@ Efault:
186} 244}
187EXPORT_SYMBOL(seq_read); 245EXPORT_SYMBOL(seq_read);
188 246
189static int traverse(struct seq_file *m, loff_t offset)
190{
191 loff_t pos = 0, index;
192 int error = 0;
193 void *p;
194
195 m->version = 0;
196 index = 0;
197 m->count = m->from = 0;
198 if (!offset) {
199 m->index = index;
200 return 0;
201 }
202 if (!m->buf) {
203 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
204 if (!m->buf)
205 return -ENOMEM;
206 }
207 p = m->op->start(m, &index);
208 while (p) {
209 error = PTR_ERR(p);
210 if (IS_ERR(p))
211 break;
212 error = m->op->show(m, p);
213 if (error < 0)
214 break;
215 if (unlikely(error)) {
216 error = 0;
217 m->count = 0;
218 }
219 if (m->count == m->size)
220 goto Eoverflow;
221 if (pos + m->count > offset) {
222 m->from = offset - pos;
223 m->count -= m->from;
224 m->index = index;
225 break;
226 }
227 pos += m->count;
228 m->count = 0;
229 if (pos == offset) {
230 index++;
231 m->index = index;
232 break;
233 }
234 p = m->op->next(m, p, &index);
235 }
236 m->op->stop(m, p);
237 return error;
238
239Eoverflow:
240 m->op->stop(m, p);
241 kfree(m->buf);
242 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
243 return !m->buf ? -ENOMEM : -EAGAIN;
244}
245
246/** 247/**
247 * seq_lseek - ->llseek() method for sequential files. 248 * seq_lseek - ->llseek() method for sequential files.
248 * @file: the file in question 249 * @file: the file in question
diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a..61dce001dd5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb)
301 /* 301 /*
302 * wait for asynchronous fs operations to finish before going further 302 * wait for asynchronous fs operations to finish before going further
303 */ 303 */
304 async_synchronize_full_special(&sb->s_async_list); 304 async_synchronize_full_domain(&sb->s_async_list);
305 305
306 /* bad name - it should be evict_inodes() */ 306 /* bad name - it should be evict_inodes() */
307 invalidate_inodes(sb); 307 invalidate_inodes(sb);
@@ -470,7 +470,7 @@ restart:
470 sb->s_count++; 470 sb->s_count++;
471 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
472 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list); 473 async_synchronize_full_domain(&sb->s_async_list);
474 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
475 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
476 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 175f9c590b7..f393620890e 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
689} 689}
690 690
691/** 691/**
692 * ubifs_get_free_space - return amount of free space. 692 * ubifs_get_free_space_nolock - return amount of free space.
693 * @c: UBIFS file-system description object 693 * @c: UBIFS file-system description object
694 * 694 *
695 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
@@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
704 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 705 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 706 */
707long long ubifs_get_free_space(struct ubifs_info *c) 707long long ubifs_get_free_space_nolock(struct ubifs_info *c)
708{ 708{
709 int min_idx_lebs, rsvd_idx_lebs, lebs; 709 int rsvd_idx_lebs, lebs;
710 long long available, outstanding, free; 710 long long available, outstanding, free;
711 711
712 spin_lock(&c->space_lock); 712 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
713 min_idx_lebs = c->min_idx_lebs;
714 ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
715 outstanding = c->budg_data_growth + c->budg_dd_growth; 713 outstanding = c->budg_data_growth + c->budg_dd_growth;
716 available = ubifs_calc_available(c, min_idx_lebs); 714 available = ubifs_calc_available(c, c->min_idx_lebs);
717 715
718 /* 716 /*
719 * When reporting free space to user-space, UBIFS guarantees that it is 717 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
726 * Note, the calculations below are similar to what we have in 724 * Note, the calculations below are similar to what we have in
727 * 'do_budget_space()', so refer there for comments. 725 * 'do_budget_space()', so refer there for comments.
728 */ 726 */
729 if (min_idx_lebs > c->lst.idx_lebs) 727 if (c->min_idx_lebs > c->lst.idx_lebs)
730 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 728 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
731 else 729 else
732 rsvd_idx_lebs = 0; 730 rsvd_idx_lebs = 0;
733 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 731 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
734 c->lst.taken_empty_lebs; 732 c->lst.taken_empty_lebs;
735 lebs -= rsvd_idx_lebs; 733 lebs -= rsvd_idx_lebs;
736 available += lebs * (c->dark_wm - c->leb_overhead); 734 available += lebs * (c->dark_wm - c->leb_overhead);
737 spin_unlock(&c->space_lock);
738 735
739 if (available > outstanding) 736 if (available > outstanding)
740 free = ubifs_reported_space(c, available - outstanding); 737 free = ubifs_reported_space(c, available - outstanding);
@@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
742 free = 0; 739 free = 0;
743 return free; 740 return free;
744} 741}
742
743/**
744 * ubifs_get_free_space - return amount of free space.
745 * @c: UBIFS file-system description object
746 *
747 * This function calculates and retuns amount of free space to report to
748 * user-space.
749 */
750long long ubifs_get_free_space(struct ubifs_info *c)
751{
752 long long free;
753
754 spin_lock(&c->space_lock);
755 free = ubifs_get_free_space_nolock(c);
756 spin_unlock(&c->space_lock);
757
758 return free;
759}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 792c5a16c18..e975bd82f38 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
620 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 620 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
622 c->gc_lnum, c->ihead_lnum); 622 c->gc_lnum, c->ihead_lnum);
623 for (i = 0; i < c->jhead_cnt; i++) 623 /* If we are in R/O mode, journal heads do not exist */
624 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", 624 if (c->jheads)
625 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); 625 for (i = 0; i < c->jhead_cnt; i++)
626 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
627 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
626 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 628 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
627 bud = rb_entry(rb, struct ubifs_bud, rb); 629 bud = rb_entry(rb, struct ubifs_bud, rb);
628 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 630 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c)
637 /* Print budgeting predictions */ 639 /* Print budgeting predictions */
638 available = ubifs_calc_available(c, c->min_idx_lebs); 640 available = ubifs_calc_available(c, c->min_idx_lebs);
639 outstanding = c->budg_data_growth + c->budg_dd_growth; 641 outstanding = c->budg_data_growth + c->budg_dd_growth;
640 if (available > outstanding) 642 free = ubifs_get_free_space_nolock(c);
641 free = ubifs_reported_space(c, available - outstanding);
642 else
643 free = 0;
644 printk(KERN_DEBUG "Budgeting predictions:\n"); 643 printk(KERN_DEBUG "Budgeting predictions:\n");
645 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 644 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
646 available, outstanding, free); 645 available, outstanding, free);
@@ -861,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c)
861} 860}
862 861
863/** 862/**
863 * dbg_save_space_info - save information about flash space.
864 * @c: UBIFS file-system description object
865 *
866 * This function saves information about UBIFS free space, dirty space, etc, in
867 * order to check it later.
868 */
869void dbg_save_space_info(struct ubifs_info *c)
870{
871 struct ubifs_debug_info *d = c->dbg;
872
873 ubifs_get_lp_stats(c, &d->saved_lst);
874
875 spin_lock(&c->space_lock);
876 d->saved_free = ubifs_get_free_space_nolock(c);
877 spin_unlock(&c->space_lock);
878}
879
880/**
881 * dbg_check_space_info - check flash space information.
882 * @c: UBIFS file-system description object
883 *
884 * This function compares current flash space information with the information
885 * which was saved when the 'dbg_save_space_info()' function was called.
886 * Returns zero if the information has not changed, and %-EINVAL it it has
887 * changed.
888 */
889int dbg_check_space_info(struct ubifs_info *c)
890{
891 struct ubifs_debug_info *d = c->dbg;
892 struct ubifs_lp_stats lst;
893 long long avail, free;
894
895 spin_lock(&c->space_lock);
896 avail = ubifs_calc_available(c, c->min_idx_lebs);
897 spin_unlock(&c->space_lock);
898 free = ubifs_get_free_space(c);
899
900 if (free != d->saved_free) {
901 ubifs_err("free space changed from %lld to %lld",
902 d->saved_free, free);
903 goto out;
904 }
905
906 return 0;
907
908out:
909 ubifs_msg("saved lprops statistics dump");
910 dbg_dump_lstats(&d->saved_lst);
911 ubifs_get_lp_stats(c, &lst);
912 ubifs_msg("current lprops statistics dump");
913 dbg_dump_lstats(&d->saved_lst);
914 spin_lock(&c->space_lock);
915 dbg_dump_budg(c);
916 spin_unlock(&c->space_lock);
917 dump_stack();
918 return -EINVAL;
919}
920
921/**
864 * dbg_check_synced_i_size - check synchronized inode size. 922 * dbg_check_synced_i_size - check synchronized inode size.
865 * @inode: inode to check 923 * @inode: inode to check
866 * 924 *
@@ -1349,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
1349 * @c: UBIFS file-system description object 1407 * @c: UBIFS file-system description object
1350 * @leaf_cb: called for each leaf node 1408 * @leaf_cb: called for each leaf node
1351 * @znode_cb: called for each indexing node 1409 * @znode_cb: called for each indexing node
1352 * @priv: private date which is passed to callbacks 1410 * @priv: private data which is passed to callbacks
1353 * 1411 *
1354 * This function walks the UBIFS index and calls the @leaf_cb for each leaf 1412 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
1355 * node and @znode_cb for each indexing node. Returns zero in case of success 1413 * node and @znode_cb for each indexing node. Returns zero in case of success
@@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c)
2409 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which 2467 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
2410 * contain the stuff specific to particular file-system mounts. 2468 * contain the stuff specific to particular file-system mounts.
2411 */ 2469 */
2412static struct dentry *debugfs_rootdir; 2470static struct dentry *dfs_rootdir;
2413 2471
2414/** 2472/**
2415 * dbg_debugfs_init - initialize debugfs file-system. 2473 * dbg_debugfs_init - initialize debugfs file-system.
@@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir;
2421 */ 2479 */
2422int dbg_debugfs_init(void) 2480int dbg_debugfs_init(void)
2423{ 2481{
2424 debugfs_rootdir = debugfs_create_dir("ubifs", NULL); 2482 dfs_rootdir = debugfs_create_dir("ubifs", NULL);
2425 if (IS_ERR(debugfs_rootdir)) { 2483 if (IS_ERR(dfs_rootdir)) {
2426 int err = PTR_ERR(debugfs_rootdir); 2484 int err = PTR_ERR(dfs_rootdir);
2427 ubifs_err("cannot create \"ubifs\" debugfs directory, " 2485 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2428 "error %d\n", err); 2486 "error %d\n", err);
2429 return err; 2487 return err;
@@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void)
2437 */ 2495 */
2438void dbg_debugfs_exit(void) 2496void dbg_debugfs_exit(void)
2439{ 2497{
2440 debugfs_remove(debugfs_rootdir); 2498 debugfs_remove(dfs_rootdir);
2441} 2499}
2442 2500
2443static int open_debugfs_file(struct inode *inode, struct file *file) 2501static int open_debugfs_file(struct inode *inode, struct file *file)
@@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2452 struct ubifs_info *c = file->private_data; 2510 struct ubifs_info *c = file->private_data;
2453 struct ubifs_debug_info *d = c->dbg; 2511 struct ubifs_debug_info *d = c->dbg;
2454 2512
2455 if (file->f_path.dentry == d->dump_lprops) 2513 if (file->f_path.dentry == d->dfs_dump_lprops)
2456 dbg_dump_lprops(c); 2514 dbg_dump_lprops(c);
2457 else if (file->f_path.dentry == d->dump_budg) { 2515 else if (file->f_path.dentry == d->dfs_dump_budg) {
2458 spin_lock(&c->space_lock); 2516 spin_lock(&c->space_lock);
2459 dbg_dump_budg(c); 2517 dbg_dump_budg(c);
2460 spin_unlock(&c->space_lock); 2518 spin_unlock(&c->space_lock);
2461 } else if (file->f_path.dentry == d->dump_tnc) { 2519 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2462 mutex_lock(&c->tnc_mutex); 2520 mutex_lock(&c->tnc_mutex);
2463 dbg_dump_tnc(c); 2521 dbg_dump_tnc(c);
2464 mutex_unlock(&c->tnc_mutex); 2522 mutex_unlock(&c->tnc_mutex);
@@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2469 return count; 2527 return count;
2470} 2528}
2471 2529
2472static const struct file_operations debugfs_fops = { 2530static const struct file_operations dfs_fops = {
2473 .open = open_debugfs_file, 2531 .open = open_debugfs_file,
2474 .write = write_debugfs_file, 2532 .write = write_debugfs_file,
2475 .owner = THIS_MODULE, 2533 .owner = THIS_MODULE,
@@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2494 struct dentry *dent; 2552 struct dentry *dent;
2495 struct ubifs_debug_info *d = c->dbg; 2553 struct ubifs_debug_info *d = c->dbg;
2496 2554
2497 sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); 2555 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2498 d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name, 2556 d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
2499 debugfs_rootdir); 2557 if (IS_ERR(d->dfs_dir)) {
2500 if (IS_ERR(d->debugfs_dir)) { 2558 err = PTR_ERR(d->dfs_dir);
2501 err = PTR_ERR(d->debugfs_dir);
2502 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", 2559 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2503 d->debugfs_dir_name, err); 2560 d->dfs_dir_name, err);
2504 goto out; 2561 goto out;
2505 } 2562 }
2506 2563
2507 fname = "dump_lprops"; 2564 fname = "dump_lprops";
2508 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2565 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2509 &debugfs_fops);
2510 if (IS_ERR(dent)) 2566 if (IS_ERR(dent))
2511 goto out_remove; 2567 goto out_remove;
2512 d->dump_lprops = dent; 2568 d->dfs_dump_lprops = dent;
2513 2569
2514 fname = "dump_budg"; 2570 fname = "dump_budg";
2515 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2571 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2516 &debugfs_fops);
2517 if (IS_ERR(dent)) 2572 if (IS_ERR(dent))
2518 goto out_remove; 2573 goto out_remove;
2519 d->dump_budg = dent; 2574 d->dfs_dump_budg = dent;
2520 2575
2521 fname = "dump_tnc"; 2576 fname = "dump_tnc";
2522 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2577 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2523 &debugfs_fops);
2524 if (IS_ERR(dent)) 2578 if (IS_ERR(dent))
2525 goto out_remove; 2579 goto out_remove;
2526 d->dump_tnc = dent; 2580 d->dfs_dump_tnc = dent;
2527 2581
2528 return 0; 2582 return 0;
2529 2583
@@ -2531,7 +2585,7 @@ out_remove:
2531 err = PTR_ERR(dent); 2585 err = PTR_ERR(dent);
2532 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", 2586 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2533 fname, err); 2587 fname, err);
2534 debugfs_remove_recursive(d->debugfs_dir); 2588 debugfs_remove_recursive(d->dfs_dir);
2535out: 2589out:
2536 return err; 2590 return err;
2537} 2591}
@@ -2542,7 +2596,7 @@ out:
2542 */ 2596 */
2543void dbg_debugfs_exit_fs(struct ubifs_info *c) 2597void dbg_debugfs_exit_fs(struct ubifs_info *c)
2544{ 2598{
2545 debugfs_remove_recursive(c->dbg->debugfs_dir); 2599 debugfs_remove_recursive(c->dbg->dfs_dir);
2546} 2600}
2547 2601
2548#endif /* CONFIG_UBIFS_FS_DEBUG */ 2602#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9820d6999f7..c1cd73b2e06 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -41,15 +41,17 @@
41 * @chk_lpt_wastage: used by LPT tree size checker 41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker 42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker 43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check ihead_lnum 44 * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
45 * @new_ihead_offs: used by debugging to check ihead_offs 45 * @new_ihead_offs: used by debugging to check @c->ihead_offs
46 * 46 *
47 * debugfs_dir_name: name of debugfs directory containing this file-system's 47 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
48 * files 48 * @saved_free: saved free space (used by 'dbg_save_space_info()')
49 * debugfs_dir: direntry object of the file-system debugfs directory 49 *
50 * dump_lprops: "dump lprops" debugfs knob 50 * dfs_dir_name: name of debugfs directory containing this file-system's files
51 * dump_budg: "dump budgeting information" debugfs knob 51 * dfs_dir: direntry object of the file-system debugfs directory
52 * dump_tnc: "dump TNC" debugfs knob 52 * dfs_dump_lprops: "dump lprops" debugfs knob
53 * dfs_dump_budg: "dump budgeting information" debugfs knob
54 * dfs_dump_tnc: "dump TNC" debugfs knob
53 */ 55 */
54struct ubifs_debug_info { 56struct ubifs_debug_info {
55 void *buf; 57 void *buf;
@@ -69,11 +71,14 @@ struct ubifs_debug_info {
69 int new_ihead_lnum; 71 int new_ihead_lnum;
70 int new_ihead_offs; 72 int new_ihead_offs;
71 73
72 char debugfs_dir_name[100]; 74 struct ubifs_lp_stats saved_lst;
73 struct dentry *debugfs_dir; 75 long long saved_free;
74 struct dentry *dump_lprops; 76
75 struct dentry *dump_budg; 77 char dfs_dir_name[100];
76 struct dentry *dump_tnc; 78 struct dentry *dfs_dir;
79 struct dentry *dfs_dump_lprops;
80 struct dentry *dfs_dump_budg;
81 struct dentry *dfs_dump_tnc;
77}; 82};
78 83
79#define ubifs_assert(expr) do { \ 84#define ubifs_assert(expr) do { \
@@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
297 dbg_znode_callback znode_cb, void *priv); 302 dbg_znode_callback znode_cb, void *priv);
298 303
299/* Checking functions */ 304/* Checking functions */
300 305void dbg_save_space_info(struct ubifs_info *c);
306int dbg_check_space_info(struct ubifs_info *c);
301int dbg_check_lprops(struct ubifs_info *c); 307int dbg_check_lprops(struct ubifs_info *c);
302int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); 308int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
303int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); 309int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
439 445
440#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 446#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
441#define dbg_old_index_check_init(c, zroot) 0 447#define dbg_old_index_check_init(c, zroot) 0
448#define dbg_save_space_info(c) ({})
449#define dbg_check_space_info(c) 0
442#define dbg_check_old_index(c, zroot) 0 450#define dbg_check_old_index(c, zroot) 0
443#define dbg_check_cats(c) 0 451#define dbg_check_cats(c) 0
444#define dbg_check_ltab(c) 0 452#define dbg_check_ltab(c) 0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c3..f55d523c52b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
482} 482}
483 483
484/** 484/**
485 * lock_2_inodes - lock two UBIFS inodes. 485 * lock_2_inodes - a wrapper for locking two UBIFS inodes.
486 * @inode1: first inode 486 * @inode1: first inode
487 * @inode2: second inode 487 * @inode2: second inode
488 *
489 * We do not implement any tricks to guarantee strict lock ordering, because
490 * VFS has already done it for us on the @i_mutex. So this is just a simple
491 * wrapper function.
488 */ 492 */
489static void lock_2_inodes(struct inode *inode1, struct inode *inode2) 493static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
490{ 494{
491 if (inode1->i_ino < inode2->i_ino) { 495 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
492 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); 496 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
493 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
494 } else {
495 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
496 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
497 }
498} 497}
499 498
500/** 499/**
501 * unlock_2_inodes - unlock two UBIFS inodes inodes. 500 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
502 * @inode1: first inode 501 * @inode1: first inode
503 * @inode2: second inode 502 * @inode2: second inode
504 */ 503 */
505static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) 504static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
506{ 505{
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
508 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 506 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
509} 508}
510 509
511static int ubifs_link(struct dentry *old_dentry, struct inode *dir, 510static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
527 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", 526 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
528 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 527 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
529 inode->i_nlink, dir->i_ino); 528 inode->i_nlink, dir->i_ino);
529 ubifs_assert(mutex_is_locked(&dir->i_mutex));
530 ubifs_assert(mutex_is_locked(&inode->i_mutex));
530 err = dbg_check_synced_i_size(inode); 531 err = dbg_check_synced_i_size(inode);
531 if (err) 532 if (err)
532 return err; 533 return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
580 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", 581 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
581 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 582 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
582 inode->i_nlink, dir->i_ino); 583 inode->i_nlink, dir->i_ino);
584 ubifs_assert(mutex_is_locked(&dir->i_mutex));
585 ubifs_assert(mutex_is_locked(&inode->i_mutex));
583 err = dbg_check_synced_i_size(inode); 586 err = dbg_check_synced_i_size(inode);
584 if (err) 587 if (err)
585 return err; 588 return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
667 670
668 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, 671 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
669 dentry->d_name.name, inode->i_ino, dir->i_ino); 672 dentry->d_name.name, inode->i_ino, dir->i_ino);
670 673 ubifs_assert(mutex_is_locked(&dir->i_mutex));
674 ubifs_assert(mutex_is_locked(&inode->i_mutex));
671 err = check_dir_empty(c, dentry->d_inode); 675 err = check_dir_empty(c, dentry->d_inode);
672 if (err) 676 if (err)
673 return err; 677 return err;
@@ -922,59 +926,30 @@ out_budg:
922} 926}
923 927
924/** 928/**
925 * lock_3_inodes - lock three UBIFS inodes for rename. 929 * lock_3_inodes - a wrapper for locking three UBIFS inodes.
926 * @inode1: first inode 930 * @inode1: first inode
927 * @inode2: second inode 931 * @inode2: second inode
928 * @inode3: third inode 932 * @inode3: third inode
929 * 933 *
930 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may 934 * This function is used for 'ubifs_rename()' and @inode1 may be the same as
931 * be null. 935 * @inode2 whereas @inode3 may be %NULL.
936 *
937 * We do not implement any tricks to guarantee strict lock ordering, because
938 * VFS has already done it for us on the @i_mutex. So this is just a simple
939 * wrapper function.
932 */ 940 */
933static void lock_3_inodes(struct inode *inode1, struct inode *inode2, 941static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
934 struct inode *inode3) 942 struct inode *inode3)
935{ 943{
936 struct inode *i1, *i2, *i3; 944 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
937 945 if (inode2 != inode1)
938 if (!inode3) { 946 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
939 if (inode1 != inode2) { 947 if (inode3)
940 lock_2_inodes(inode1, inode2); 948 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
941 return;
942 }
943 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
944 return;
945 }
946
947 if (inode1 == inode2) {
948 lock_2_inodes(inode1, inode3);
949 return;
950 }
951
952 /* 3 different inodes */
953 if (inode1 < inode2) {
954 i3 = inode2;
955 if (inode1 < inode3) {
956 i1 = inode1;
957 i2 = inode3;
958 } else {
959 i1 = inode3;
960 i2 = inode1;
961 }
962 } else {
963 i3 = inode1;
964 if (inode2 < inode3) {
965 i1 = inode2;
966 i2 = inode3;
967 } else {
968 i1 = inode3;
969 i2 = inode2;
970 }
971 }
972 mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
973 lock_2_inodes(i2, i3);
974} 949}
975 950
976/** 951/**
977 * unlock_3_inodes - unlock three UBIFS inodes for rename. 952 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
978 * @inode1: first inode 953 * @inode1: first inode
979 * @inode2: second inode 954 * @inode2: second inode
980 * @inode3: third inode 955 * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
982static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, 957static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
983 struct inode *inode3) 958 struct inode *inode3)
984{ 959{
985 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
986 if (inode1 != inode2)
987 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
988 if (inode3) 960 if (inode3)
989 mutex_unlock(&ubifs_inode(inode3)->ui_mutex); 961 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
962 if (inode1 != inode2)
963 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
964 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
990} 965}
991 966
992static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, 967static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1020 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, 995 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
1021 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 996 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
1022 new_dentry->d_name.name, new_dir->i_ino); 997 new_dentry->d_name.name, new_dir->i_ino);
998 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
999 ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
1000 if (unlink)
1001 ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
1002
1023 1003
1024 if (unlink && is_dir) { 1004 if (unlink && is_dir) {
1025 err = check_dir_empty(c, new_inode); 1005 err = check_dir_empty(c, new_inode);
@@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1199 return 0; 1179 return 0;
1200} 1180}
1201 1181
1202struct inode_operations ubifs_dir_inode_operations = { 1182const struct inode_operations ubifs_dir_inode_operations = {
1203 .lookup = ubifs_lookup, 1183 .lookup = ubifs_lookup,
1204 .create = ubifs_create, 1184 .create = ubifs_create,
1205 .link = ubifs_link, 1185 .link = ubifs_link,
@@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = {
1219#endif 1199#endif
1220}; 1200};
1221 1201
1222struct file_operations ubifs_dir_operations = { 1202const struct file_operations ubifs_dir_operations = {
1223 .llseek = ubifs_dir_llseek, 1203 .llseek = ubifs_dir_llseek,
1224 .release = ubifs_dir_release, 1204 .release = ubifs_dir_release,
1225 .read = generic_read_dir, 1205 .read = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf37374567f..93b6de51f26 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 struct page *page; 433 struct page *page;
434 434
435
436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
437 436
438 if (unlikely(c->ro_media)) 437 if (unlikely(c->ro_media))
@@ -1541,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1541 return 0; 1540 return 0;
1542} 1541}
1543 1542
1544struct address_space_operations ubifs_file_address_operations = { 1543const struct address_space_operations ubifs_file_address_operations = {
1545 .readpage = ubifs_readpage, 1544 .readpage = ubifs_readpage,
1546 .writepage = ubifs_writepage, 1545 .writepage = ubifs_writepage,
1547 .write_begin = ubifs_write_begin, 1546 .write_begin = ubifs_write_begin,
@@ -1551,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = {
1551 .releasepage = ubifs_releasepage, 1550 .releasepage = ubifs_releasepage,
1552}; 1551};
1553 1552
1554struct inode_operations ubifs_file_inode_operations = { 1553const struct inode_operations ubifs_file_inode_operations = {
1555 .setattr = ubifs_setattr, 1554 .setattr = ubifs_setattr,
1556 .getattr = ubifs_getattr, 1555 .getattr = ubifs_getattr,
1557#ifdef CONFIG_UBIFS_FS_XATTR 1556#ifdef CONFIG_UBIFS_FS_XATTR
@@ -1562,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = {
1562#endif 1561#endif
1563}; 1562};
1564 1563
1565struct inode_operations ubifs_symlink_inode_operations = { 1564const struct inode_operations ubifs_symlink_inode_operations = {
1566 .readlink = generic_readlink, 1565 .readlink = generic_readlink,
1567 .follow_link = ubifs_follow_link, 1566 .follow_link = ubifs_follow_link,
1568 .setattr = ubifs_setattr, 1567 .setattr = ubifs_setattr,
1569 .getattr = ubifs_getattr, 1568 .getattr = ubifs_getattr,
1570}; 1569};
1571 1570
1572struct file_operations ubifs_file_operations = { 1571const struct file_operations ubifs_file_operations = {
1573 .llseek = generic_file_llseek, 1572 .llseek = generic_file_llseek,
1574 .read = do_sync_read, 1573 .read = do_sync_read,
1575 .write = do_sync_write, 1574 .write = do_sync_write,
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9832f9abe28..a711d33b3d3 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
31 * to be reused. Garbage collection will cause the number of dirty index nodes 31 * to be reused. Garbage collection will cause the number of dirty index nodes
32 * to grow, however sufficient space is reserved for the index to ensure the 32 * to grow, however sufficient space is reserved for the index to ensure the
33 * commit will never run out of space. 33 * commit will never run out of space.
34 *
35 * Notes about dead watermark. At current UBIFS implementation we assume that
36 * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
37 * and not worth garbage-collecting. The dead watermark is one min. I/O unit
38 * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
39 * Garbage Collector has to synchronize the GC head's write buffer before
40 * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
41 * actually reclaim even very small pieces of dirty space by garbage collecting
42 * enough dirty LEBs, but we do not bother doing this at this implementation.
43 *
44 * Notes about dark watermark. The results of GC work depends on how big are
45 * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
46 * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
53 * good, and GC takes extra care when moving them.
34 */ 54 */
35 55
36#include <linux/pagemap.h> 56#include <linux/pagemap.h>
@@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
381 401
382 /* 402 /*
383 * Don't release the LEB until after the next commit, because 403 * Don't release the LEB until after the next commit, because
384 * it may contain date which is needed for recovery. So 404 * it may contain data which is needed for recovery. So
385 * although we freed this LEB, it will become usable only after 405 * although we freed this LEB, it will become usable only after
386 * the commit. 406 * the commit.
387 */ 407 */
@@ -810,8 +830,9 @@ out:
810 * ubifs_destroy_idx_gc - destroy idx_gc list. 830 * ubifs_destroy_idx_gc - destroy idx_gc list.
811 * @c: UBIFS file-system description object 831 * @c: UBIFS file-system description object
812 * 832 *
813 * This function destroys the idx_gc list. It is called when unmounting or 833 * This function destroys the @c->idx_gc list. It is called when unmounting
814 * remounting read-only so locks are not needed. 834 * so locks are not needed. Returns zero in case of success and a negative
835 * error code in case of failure.
815 */ 836 */
816void ubifs_destroy_idx_gc(struct ubifs_info *c) 837void ubifs_destroy_idx_gc(struct ubifs_info *c)
817{ 838{
@@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c)
824 list_del(&idx_gc->list); 845 list_del(&idx_gc->list);
825 kfree(idx_gc); 846 kfree(idx_gc);
826 } 847 }
827
828} 848}
829 849
830/** 850/**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af6..e8e632a1dcd 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
29 * would have been wasted for padding to the nearest minimal I/O unit boundary. 29 * would have been wasted for padding to the nearest minimal I/O unit boundary.
30 * Instead, data first goes to the write-buffer and is flushed when the 30 * Instead, data first goes to the write-buffer and is flushed when the
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similarto the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 35 * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
75 * @lnum: logical eraseblock number 75 * @lnum: logical eraseblock number
76 * @offs: offset within the logical eraseblock 76 * @offs: offset within the logical eraseblock
77 * @quiet: print no messages 77 * @quiet: print no messages
78 * @chk_crc: indicates whether to always check the CRC 78 * @must_chk_crc: indicates whether to always check the CRC
79 * 79 *
80 * This function checks node magic number and CRC checksum. This function also 80 * This function checks node magic number and CRC checksum. This function also
81 * validates node length to prevent UBIFS from becoming crazy when an attacker 81 * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
83 * node length in the common header could cause UBIFS to read memory outside of 83 * node length in the common header could cause UBIFS to read memory outside of
84 * allocated buffer when checking the CRC checksum. 84 * allocated buffer when checking the CRC checksum.
85 * 85 *
86 * This function returns zero in case of success %-EUCLEAN in case of bad CRC 86 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
87 * or magic. 87 * true, which is controlled by corresponding UBIFS mount option. However, if
88 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
89 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
90 * ignored and CRC is checked.
91 *
92 * This function returns zero in case of success and %-EUCLEAN in case of bad
93 * CRC or magic.
88 */ 94 */
89int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 95int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
90 int offs, int quiet, int chk_crc) 96 int offs, int quiet, int must_chk_crc)
91{ 97{
92 int err = -EINVAL, type, node_len; 98 int err = -EINVAL, type, node_len;
93 uint32_t crc, node_crc, magic; 99 uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
123 node_len > c->ranges[type].max_len) 129 node_len > c->ranges[type].max_len)
124 goto out_len; 130 goto out_len;
125 131
126 if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) 132 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
127 if (c->no_chk_data_crc) 133 c->no_chk_data_crc)
128 return 0; 134 return 0;
129 135
130 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 136 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
131 node_crc = le32_to_cpu(ch->crc); 137 node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 9b7c54e0cd2..a11ca0958a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -208,7 +208,7 @@ again:
208 offs = 0; 208 offs = 0;
209 209
210out: 210out:
211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); 211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
212 if (err) 212 if (err)
213 goto out_unlock; 213 goto out_unlock;
214 214
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index dfd2bcece27..4cdd284dea5 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
635 * @c: UBIFS file-system description object 635 * @c: UBIFS file-system description object
636 * @st: return statistics 636 * @st: return statistics
637 */ 637 */
638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) 638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
639{ 639{
640 spin_lock(&c->space_lock); 640 spin_lock(&c->space_lock);
641 memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); 641 memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
642 spin_unlock(&c->space_lock); 642 spin_unlock(&c->space_lock);
643} 643}
644 644
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
678 678
679out: 679out:
680 ubifs_release_lprops(c); 680 ubifs_release_lprops(c);
681 if (err)
682 ubifs_err("cannot change properties of LEB %d, error %d",
683 lnum, err);
681 return err; 684 return err;
682} 685}
683 686
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
714 717
715out: 718out:
716 ubifs_release_lprops(c); 719 ubifs_release_lprops(c);
720 if (err)
721 ubifs_err("cannot update properties of LEB %d, error %d",
722 lnum, err);
717 return err; 723 return err;
718} 724}
719 725
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
737 lpp = ubifs_lpt_lookup(c, lnum); 743 lpp = ubifs_lpt_lookup(c, lnum);
738 if (IS_ERR(lpp)) { 744 if (IS_ERR(lpp)) {
739 err = PTR_ERR(lpp); 745 err = PTR_ERR(lpp);
746 ubifs_err("cannot read properties of LEB %d, error %d",
747 lnum, err);
740 goto out; 748 goto out;
741 } 749 }
742 750
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 96ca9570717..3216a1f277f 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -556,23 +556,23 @@ no_space:
556} 556}
557 557
558/** 558/**
559 * next_pnode - find next pnode. 559 * next_pnode_to_dirty - find next pnode to dirty.
560 * @c: UBIFS file-system description object 560 * @c: UBIFS file-system description object
561 * @pnode: pnode 561 * @pnode: pnode
562 * 562 *
563 * This function returns the next pnode or %NULL if there are no more pnodes. 563 * This function returns the next pnode to dirty or %NULL if there are no more
564 * pnodes. Note that pnodes that have never been written (lnum == 0) are
565 * skipped.
564 */ 566 */
565static struct ubifs_pnode *next_pnode(struct ubifs_info *c, 567static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
566 struct ubifs_pnode *pnode) 568 struct ubifs_pnode *pnode)
567{ 569{
568 struct ubifs_nnode *nnode; 570 struct ubifs_nnode *nnode;
569 int iip; 571 int iip;
570 572
571 /* Try to go right */ 573 /* Try to go right */
572 nnode = pnode->parent; 574 nnode = pnode->parent;
573 iip = pnode->iip + 1; 575 for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
574 if (iip < UBIFS_LPT_FANOUT) {
575 /* We assume here that LEB zero is never an LPT LEB */
576 if (nnode->nbranch[iip].lnum) 576 if (nnode->nbranch[iip].lnum)
577 return ubifs_get_pnode(c, nnode, iip); 577 return ubifs_get_pnode(c, nnode, iip);
578 } 578 }
@@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
583 nnode = nnode->parent; 583 nnode = nnode->parent;
584 if (!nnode) 584 if (!nnode)
585 return NULL; 585 return NULL;
586 /* We assume here that LEB zero is never an LPT LEB */ 586 for (; iip < UBIFS_LPT_FANOUT; iip++) {
587 } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); 587 if (nnode->nbranch[iip].lnum)
588 break;
589 }
590 } while (iip >= UBIFS_LPT_FANOUT);
588 591
589 /* Go right */ 592 /* Go right */
590 nnode = ubifs_get_nnode(c, nnode, iip); 593 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
593 596
594 /* Go down to level 1 */ 597 /* Go down to level 1 */
595 while (nnode->level > 1) { 598 while (nnode->level > 1) {
596 nnode = ubifs_get_nnode(c, nnode, 0); 599 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
600 if (nnode->nbranch[iip].lnum)
601 break;
602 }
603 if (iip >= UBIFS_LPT_FANOUT) {
604 /*
605 * Should not happen, but we need to keep going
606 * if it does.
607 */
608 iip = 0;
609 }
610 nnode = ubifs_get_nnode(c, nnode, iip);
597 if (IS_ERR(nnode)) 611 if (IS_ERR(nnode))
598 return (void *)nnode; 612 return (void *)nnode;
599 } 613 }
600 614
601 return ubifs_get_pnode(c, nnode, 0); 615 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
616 if (nnode->nbranch[iip].lnum)
617 break;
618 if (iip >= UBIFS_LPT_FANOUT)
619 /* Should not happen, but we need to keep going if it does */
620 iip = 0;
621 return ubifs_get_pnode(c, nnode, iip);
602} 622}
603 623
604/** 624/**
@@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
688 pnode = pnode_lookup(c, 0); 708 pnode = pnode_lookup(c, 0);
689 while (pnode) { 709 while (pnode) {
690 do_make_pnode_dirty(c, pnode); 710 do_make_pnode_dirty(c, pnode);
691 pnode = next_pnode(c, pnode); 711 pnode = next_pnode_to_dirty(c, pnode);
692 if (IS_ERR(pnode)) 712 if (IS_ERR(pnode))
693 return PTR_ERR(pnode); 713 return PTR_ERR(pnode);
694 } 714 }
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf56..a88f33801b9 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
354 int err, lnum, offs, len; 354 int err, lnum, offs, len;
355 355
356 if (c->ro_media) 356 if (c->ro_media)
357 return -EINVAL; 357 return -EROFS;
358 358
359 lnum = UBIFS_MST_LNUM; 359 lnum = UBIFS_MST_LNUM;
360 offs = c->mst_offs + c->mst_node_alsz; 360 offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9e6f403f170..152a7b34a14 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to 46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
47 * zero, the inode number is added to the rb-tree. It is removed from the tree 47 * zero, the inode number is added to the rb-tree. It is removed from the tree
48 * when the inode is deleted. Any new orphans that are in the orphan tree when 48 * when the inode is deleted. Any new orphans that are in the orphan tree when
49 * the commit is run, are written to the orphan area in 1 or more orph nodes. 49 * the commit is run, are written to the orphan area in 1 or more orphan nodes.
50 * If the orphan area is full, it is consolidated to make space. There is 50 * If the orphan area is full, it is consolidated to make space. There is
51 * always enough space because validation prevents the user from creating more 51 * always enough space because validation prevents the user from creating more
52 * than the maximum number of orphans allowed. 52 * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
231} 231}
232 232
233/** 233/**
234 * do_write_orph_node - write a node 234 * do_write_orph_node - write a node to the orphan head.
235 * @c: UBIFS file-system description object 235 * @c: UBIFS file-system description object
236 * @len: length of node 236 * @len: length of node
237 * @atomic: write atomically 237 * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
264} 264}
265 265
266/** 266/**
267 * write_orph_node - write an orph node 267 * write_orph_node - write an orphan node.
268 * @c: UBIFS file-system description object 268 * @c: UBIFS file-system description object
269 * @atomic: write atomically 269 * @atomic: write atomically
270 * 270 *
271 * This function builds an orph node from the cnext list and writes it to the 271 * This function builds an orphan node from the cnext list and writes it to the
272 * orphan head. On success, %0 is returned, otherwise a negative error code 272 * orphan head. On success, %0 is returned, otherwise a negative error code
273 * is returned. 273 * is returned.
274 */ 274 */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
326} 326}
327 327
328/** 328/**
329 * write_orph_nodes - write orph nodes until there are no more to commit 329 * write_orph_nodes - write orphan nodes until there are no more to commit.
330 * @c: UBIFS file-system description object 330 * @c: UBIFS file-system description object
331 * @atomic: write atomically 331 * @atomic: write atomically
332 * 332 *
333 * This function writes orph nodes for all the orphans to commit. On success, 333 * This function writes orphan nodes for all the orphans to commit. On success,
334 * %0 is returned, otherwise a negative error code is returned. 334 * %0 is returned, otherwise a negative error code is returned.
335 */ 335 */
336static int write_orph_nodes(struct ubifs_info *c, int atomic) 336static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
478} 478}
479 479
480/** 480/**
481 * clear_orphans - erase all LEBs used for orphans. 481 * ubifs_clear_orphans - erase all LEBs used for orphans.
482 * @c: UBIFS file-system description object 482 * @c: UBIFS file-system description object
483 * 483 *
484 * If recovery is not required, then the orphans from the previous session 484 * If recovery is not required, then the orphans from the previous session
485 * are not needed. This function locates the LEBs used to record 485 * are not needed. This function locates the LEBs used to record
486 * orphans, and un-maps them. 486 * orphans, and un-maps them.
487 */ 487 */
488static int clear_orphans(struct ubifs_info *c) 488int ubifs_clear_orphans(struct ubifs_info *c)
489{ 489{
490 int lnum, err; 490 int lnum, err;
491 491
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
547 * do_kill_orphans - remove orphan inodes from the index. 547 * do_kill_orphans - remove orphan inodes from the index.
548 * @c: UBIFS file-system description object 548 * @c: UBIFS file-system description object
549 * @sleb: scanned LEB 549 * @sleb: scanned LEB
550 * @last_cmt_no: cmt_no of last orph node read is passed and returned here 550 * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
551 * @outofdate: whether the LEB is out of date is returned here 551 * @outofdate: whether the LEB is out of date is returned here
552 * @last_flagged: whether the end orph node is encountered 552 * @last_flagged: whether the end orphan node is encountered
553 * 553 *
554 * This function is a helper to the 'kill_orphans()' function. It goes through 554 * This function is a helper to the 'kill_orphans()' function. It goes through
555 * every orphan node in a LEB and for every inode number recorded, removes 555 * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
580 /* 580 /*
581 * The commit number on the master node may be less, because 581 * The commit number on the master node may be less, because
582 * of a failed commit. If there are several failed commits in a 582 * of a failed commit. If there are several failed commits in a
583 * row, the commit number written on orph nodes will continue to 583 * row, the commit number written on orphan nodes will continue
584 * increase (because the commit number is adjusted here) even 584 * to increase (because the commit number is adjusted here) even
585 * though the commit number on the master node stays the same 585 * though the commit number on the master node stays the same
586 * because the master node has not been re-written. 586 * because the master node has not been re-written.
587 */ 587 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
589 c->cmt_no = cmt_no; 589 c->cmt_no = cmt_no;
590 if (cmt_no < *last_cmt_no && *last_flagged) { 590 if (cmt_no < *last_cmt_no && *last_flagged) {
591 /* 591 /*
592 * The last orph node had a higher commit number and was 592 * The last orphan node had a higher commit number and
593 * flagged as the last written for that commit number. 593 * was flagged as the last written for that commit
594 * That makes this orph node, out of date. 594 * number. That makes this orphan node, out of date.
595 */ 595 */
596 if (!first) { 596 if (!first) {
597 ubifs_err("out of order commit number %llu in " 597 ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
658 /* 658 /*
659 * Orph nodes always start at c->orph_first and are written to each 659 * Orph nodes always start at c->orph_first and are written to each
660 * successive LEB in turn. Generally unused LEBs will have been unmapped 660 * successive LEB in turn. Generally unused LEBs will have been unmapped
661 * but may contain out of date orph nodes if the unmap didn't go 661 * but may contain out of date orphan nodes if the unmap didn't go
662 * through. In addition, the last orph node written for each commit is 662 * through. In addition, the last orphan node written for each commit is
663 * marked (top bit of orph->cmt_no is set to 1). It is possible that 663 * marked (top bit of orph->cmt_no is set to 1). It is possible that
664 * there are orph nodes from the next commit (i.e. the commit did not 664 * there are orphan nodes from the next commit (i.e. the commit did not
665 * complete successfully). In that case, no orphans will have been lost 665 * complete successfully). In that case, no orphans will have been lost
666 * due to the way that orphans are written, and any orphans added will 666 * due to the way that orphans are written, and any orphans added will
667 * be valid orphans anyway and so can be deleted. 667 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
718 if (unclean) 718 if (unclean)
719 err = kill_orphans(c); 719 err = kill_orphans(c);
720 else if (!read_only) 720 else if (!read_only)
721 err = clear_orphans(c); 721 err = ubifs_clear_orphans(c);
722 722
723 return err; 723 return err;
724} 724}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 89556ee7251..1182b66a549 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
397 buf->f_namelen = UBIFS_MAX_NLEN; 397 buf->f_namelen = UBIFS_MAX_NLEN;
398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); 398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); 399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
400 ubifs_assert(buf->f_bfree <= c->block_cnt);
400 return 0; 401 return 0;
401} 402}
402 403
@@ -432,33 +433,24 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
432 int i, err; 433 int i, err;
433 struct ubifs_info *c = sb->s_fs_info; 434 struct ubifs_info *c = sb->s_fs_info;
434 struct writeback_control wbc = { 435 struct writeback_control wbc = {
435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 436 .sync_mode = WB_SYNC_ALL,
436 .range_start = 0, 437 .range_start = 0,
437 .range_end = LLONG_MAX, 438 .range_end = LLONG_MAX,
438 .nr_to_write = LONG_MAX, 439 .nr_to_write = LONG_MAX,
439 }; 440 };
440 441
441 /* 442 /*
442 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an 443 * Zero @wait is just an advisory thing to help the file system shove
443 * advisory thing to help the file system shove lots of data into the 444 * lots of data into the queues, and there will be the second
444 * queues. If some gets missed then it'll be picked up on the second
445 * '->sync_fs()' call, with non-zero @wait. 445 * '->sync_fs()' call, with non-zero @wait.
446 */ 446 */
447 if (!wait)
448 return 0;
447 449
448 if (sb->s_flags & MS_RDONLY) 450 if (sb->s_flags & MS_RDONLY)
449 return 0; 451 return 0;
450 452
451 /* 453 /*
452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
453 * do this if it waits for an already running commit.
454 */
455 for (i = 0; i < c->jhead_cnt; i++) {
456 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
457 if (err)
458 return err;
459 }
460
461 /*
462 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and 454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
463 * pages, so synchronize them first, then commit the journal. Strictly 455 * pages, so synchronize them first, then commit the journal. Strictly
464 * speaking, it is not necessary to commit the journal here, 456 * speaking, it is not necessary to commit the journal here,
@@ -469,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
469 */ 461 */
470 generic_sync_sb_inodes(sb, &wbc); 462 generic_sync_sb_inodes(sb, &wbc);
471 463
464 /*
465 * Synchronize write buffers, because 'ubifs_run_commit()' does not
466 * do this if it waits for an already running commit.
467 */
468 for (i = 0; i < c->jhead_cnt; i++) {
469 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
470 if (err)
471 return err;
472 }
473
472 err = ubifs_run_commit(c); 474 err = ubifs_run_commit(c);
473 if (err) 475 if (err)
474 return err; 476 return err;
@@ -572,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c)
572 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; 574 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
573 575
574 /* 576 /*
575 * Initialize dead and dark LEB space watermarks. 577 * Initialize dead and dark LEB space watermarks. See gc.c for comments
576 * 578 * about these values.
577 * Dead space is the space which cannot be used. Its watermark is
578 * equivalent to min. I/O unit or minimum node size if it is greater
579 * then min. I/O unit.
580 *
581 * Dark space is the space which might be used, or might not, depending
582 * on which node should be written to the LEB. Its watermark is
583 * equivalent to maximum UBIFS node size.
584 */ 579 */
585 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 580 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
586 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 581 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
@@ -741,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c)
741 * take_gc_lnum - reserve GC LEB. 736 * take_gc_lnum - reserve GC LEB.
742 * @c: UBIFS file-system description object 737 * @c: UBIFS file-system description object
743 * 738 *
744 * This function ensures that the LEB reserved for garbage collection is 739 * This function ensures that the LEB reserved for garbage collection is marked
745 * unmapped and is marked as "taken" in lprops. We also have to set free space 740 * as "taken" in lprops. We also have to set free space to LEB size and dirty
746 * to LEB size and dirty space to zero, because lprops may contain out-of-date 741 * space to zero, because lprops may contain out-of-date information if the
747 * information if the file-system was un-mounted before it has been committed. 742 * file-system was un-mounted before it has been committed. This function
748 * This function returns zero in case of success and a negative error code in 743 * returns zero in case of success and a negative error code in case of
749 * case of failure. 744 * failure.
750 */ 745 */
751static int take_gc_lnum(struct ubifs_info *c) 746static int take_gc_lnum(struct ubifs_info *c)
752{ 747{
@@ -757,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
757 return -EINVAL; 752 return -EINVAL;
758 } 753 }
759 754
760 err = ubifs_leb_unmap(c, c->gc_lnum);
761 if (err)
762 return err;
763
764 /* And we have to tell lprops that this LEB is taken */ 755 /* And we have to tell lprops that this LEB is taken */
765 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, 756 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
766 LPROPS_TAKEN, 0, 0); 757 LPROPS_TAKEN, 0, 0);
@@ -966,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
966 957
967 token = match_token(p, tokens, args); 958 token = match_token(p, tokens, args);
968 switch (token) { 959 switch (token) {
960 /*
961 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
962 * We accepte them in order to be backware-compatible. But this
963 * should be removed at some point.
964 */
969 case Opt_fast_unmount: 965 case Opt_fast_unmount:
970 c->mount_opts.unmount_mode = 2; 966 c->mount_opts.unmount_mode = 2;
971 c->fast_unmount = 1;
972 break; 967 break;
973 case Opt_norm_unmount: 968 case Opt_norm_unmount:
974 c->mount_opts.unmount_mode = 1; 969 c->mount_opts.unmount_mode = 1;
975 c->fast_unmount = 0;
976 break; 970 break;
977 case Opt_bulk_read: 971 case Opt_bulk_read:
978 c->mount_opts.bulk_read = 2; 972 c->mount_opts.bulk_read = 2;
@@ -1094,12 +1088,7 @@ static int check_free_space(struct ubifs_info *c)
1094 ubifs_err("insufficient free space to mount in read/write mode"); 1088 ubifs_err("insufficient free space to mount in read/write mode");
1095 dbg_dump_budg(c); 1089 dbg_dump_budg(c);
1096 dbg_dump_lprops(c); 1090 dbg_dump_lprops(c);
1097 /* 1091 return -ENOSPC;
1098 * We return %-EINVAL instead of %-ENOSPC because it seems to
1099 * be the closest error code mentioned in the mount function
1100 * documentation.
1101 */
1102 return -EINVAL;
1103 } 1092 }
1104 return 0; 1093 return 0;
1105} 1094}
@@ -1286,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c)
1286 if (err) 1275 if (err)
1287 goto out_orphans; 1276 goto out_orphans;
1288 err = ubifs_rcvry_gc_commit(c); 1277 err = ubifs_rcvry_gc_commit(c);
1289 } else 1278 } else {
1290 err = take_gc_lnum(c); 1279 err = take_gc_lnum(c);
1291 if (err) 1280 if (err)
1292 goto out_orphans; 1281 goto out_orphans;
1282
1283 /*
1284 * GC LEB may contain garbage if there was an unclean
1285 * reboot, and it should be un-mapped.
1286 */
1287 err = ubifs_leb_unmap(c, c->gc_lnum);
1288 if (err)
1289 return err;
1290 }
1293 1291
1294 err = dbg_check_lprops(c); 1292 err = dbg_check_lprops(c);
1295 if (err) 1293 if (err)
@@ -1298,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c)
1298 err = ubifs_recover_size(c); 1296 err = ubifs_recover_size(c);
1299 if (err) 1297 if (err)
1300 goto out_orphans; 1298 goto out_orphans;
1299 } else {
1300 /*
1301 * Even if we mount read-only, we have to set space in GC LEB
1302 * to proper value because this affects UBIFS free space
1303 * reporting. We do not want to have a situation when
1304 * re-mounting from R/O to R/W changes amount of free space.
1305 */
1306 err = take_gc_lnum(c);
1307 if (err)
1308 goto out_orphans;
1301 } 1309 }
1302 1310
1303 spin_lock(&ubifs_infos_lock); 1311 spin_lock(&ubifs_infos_lock);
@@ -1310,14 +1318,17 @@ static int mount_ubifs(struct ubifs_info *c)
1310 else { 1318 else {
1311 c->need_recovery = 0; 1319 c->need_recovery = 0;
1312 ubifs_msg("recovery completed"); 1320 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */
1322 ubifs_assert(c->lst.taken_empty_lebs == 1);
1313 } 1323 }
1314 } 1324 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1);
1315 1326
1316 err = dbg_debugfs_init_fs(c); 1327 err = dbg_check_filesystem(c);
1317 if (err) 1328 if (err)
1318 goto out_infos; 1329 goto out_infos;
1319 1330
1320 err = dbg_check_filesystem(c); 1331 err = dbg_debugfs_init_fs(c);
1321 if (err) 1332 if (err)
1322 goto out_infos; 1333 goto out_infos;
1323 1334
@@ -1351,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
1351 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], 1362 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1352 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], 1363 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1353 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); 1364 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1354 dbg_msg("fast unmount: %d", c->fast_unmount);
1355 dbg_msg("big_lpt %d", c->big_lpt); 1365 dbg_msg("big_lpt %d", c->big_lpt);
1356 dbg_msg("log LEBs: %d (%d - %d)", 1366 dbg_msg("log LEBs: %d (%d - %d)",
1357 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1367 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1475,10 +1485,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1475{ 1485{
1476 int err, lnum; 1486 int err, lnum;
1477 1487
1478 if (c->ro_media)
1479 return -EINVAL;
1480
1481 mutex_lock(&c->umount_mutex); 1488 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c);
1482 c->remounting_rw = 1; 1490 c->remounting_rw = 1;
1483 c->always_chk_crc = 1; 1491 c->always_chk_crc = 1;
1484 1492
@@ -1514,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1514 err = ubifs_recover_inl_heads(c, c->sbuf); 1522 err = ubifs_recover_inl_heads(c, c->sbuf);
1515 if (err) 1523 if (err)
1516 goto out; 1524 goto out;
1525 } else {
1526 /* A readonly mount is not allowed to have orphans */
1527 ubifs_assert(c->tot_orphans == 0);
1528 err = ubifs_clear_orphans(c);
1529 if (err)
1530 goto out;
1517 } 1531 }
1518 1532
1519 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { 1533 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
@@ -1569,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1569 if (c->need_recovery) 1583 if (c->need_recovery)
1570 err = ubifs_rcvry_gc_commit(c); 1584 err = ubifs_rcvry_gc_commit(c);
1571 else 1585 else
1572 err = take_gc_lnum(c); 1586 err = ubifs_leb_unmap(c, c->gc_lnum);
1573 if (err) 1587 if (err)
1574 goto out; 1588 goto out;
1575 1589
@@ -1582,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1582 c->vfs_sb->s_flags &= ~MS_RDONLY; 1596 c->vfs_sb->s_flags &= ~MS_RDONLY;
1583 c->remounting_rw = 0; 1597 c->remounting_rw = 0;
1584 c->always_chk_crc = 0; 1598 c->always_chk_crc = 0;
1599 err = dbg_check_space_info(c);
1585 mutex_unlock(&c->umount_mutex); 1600 mutex_unlock(&c->umount_mutex);
1586 return 0; 1601 return err;
1587 1602
1588out: 1603out:
1589 vfree(c->orph_buf); 1604 vfree(c->orph_buf);
@@ -1603,43 +1618,18 @@ out:
1603} 1618}
1604 1619
1605/** 1620/**
1606 * commit_on_unmount - commit the journal when un-mounting.
1607 * @c: UBIFS file-system description object
1608 *
1609 * This function is called during un-mounting and re-mounting, and it commits
1610 * the journal unless the "fast unmount" mode is enabled.
1611 */
1612static void commit_on_unmount(struct ubifs_info *c)
1613{
1614 struct super_block *sb = c->vfs_sb;
1615 long long bud_bytes;
1616
1617 /*
1618 * This function is called before the background thread is stopped, so
1619 * we may race with ongoing commit, which means we have to take
1620 * @c->bud_lock to access @c->bud_bytes.
1621 */
1622 spin_lock(&c->buds_lock);
1623 bud_bytes = c->bud_bytes;
1624 spin_unlock(&c->buds_lock);
1625
1626 if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
1627 ubifs_run_commit(c);
1628}
1629
1630/**
1631 * ubifs_remount_ro - re-mount in read-only mode. 1621 * ubifs_remount_ro - re-mount in read-only mode.
1632 * @c: UBIFS file-system description object 1622 * @c: UBIFS file-system description object
1633 * 1623 *
1634 * We rely on VFS to have stopped writing. Possibly the background thread could 1624 * We assume VFS has stopped writing. Possibly the background thread could be
1635 * be running a commit, however kthread_stop will wait in that case. 1625 * running a commit, however kthread_stop will wait in that case.
1636 */ 1626 */
1637static void ubifs_remount_ro(struct ubifs_info *c) 1627static void ubifs_remount_ro(struct ubifs_info *c)
1638{ 1628{
1639 int i, err; 1629 int i, err;
1640 1630
1641 ubifs_assert(!c->need_recovery); 1631 ubifs_assert(!c->need_recovery);
1642 commit_on_unmount(c); 1632 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
1643 1633
1644 mutex_lock(&c->umount_mutex); 1634 mutex_lock(&c->umount_mutex);
1645 if (c->bgt) { 1635 if (c->bgt) {
@@ -1647,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1647 c->bgt = NULL; 1637 c->bgt = NULL;
1648 } 1638 }
1649 1639
1640 dbg_save_space_info(c);
1641
1650 for (i = 0; i < c->jhead_cnt; i++) { 1642 for (i = 0; i < c->jhead_cnt; i++) {
1651 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1643 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1652 del_timer_sync(&c->jheads[i].wbuf.timer); 1644 del_timer_sync(&c->jheads[i].wbuf.timer);
1653 } 1645 }
1654 1646
1655 if (!c->ro_media) { 1647 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1656 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1648 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1657 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1649 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1658 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1650 err = ubifs_write_master(c);
1659 err = ubifs_write_master(c); 1651 if (err)
1660 if (err) 1652 ubifs_ro_mode(c, err);
1661 ubifs_ro_mode(c, err);
1662 }
1663 1653
1664 ubifs_destroy_idx_gc(c);
1665 free_wbufs(c); 1654 free_wbufs(c);
1666 vfree(c->orph_buf); 1655 vfree(c->orph_buf);
1667 c->orph_buf = NULL; 1656 c->orph_buf = NULL;
1668 vfree(c->ileb_buf); 1657 vfree(c->ileb_buf);
1669 c->ileb_buf = NULL; 1658 c->ileb_buf = NULL;
1670 ubifs_lpt_free(c, 1); 1659 ubifs_lpt_free(c, 1);
1660 err = dbg_check_space_info(c);
1661 if (err)
1662 ubifs_ro_mode(c, err);
1671 mutex_unlock(&c->umount_mutex); 1663 mutex_unlock(&c->umount_mutex);
1672} 1664}
1673 1665
@@ -1760,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1760 } 1752 }
1761 1753
1762 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1754 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1755 if (c->ro_media) {
1756 ubifs_msg("cannot re-mount due to prior errors");
1757 return -EROFS;
1758 }
1763 err = ubifs_remount_rw(c); 1759 err = ubifs_remount_rw(c);
1764 if (err) 1760 if (err)
1765 return err; 1761 return err;
1766 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) 1762 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1763 if (c->ro_media) {
1764 ubifs_msg("cannot re-mount due to prior errors");
1765 return -EROFS;
1766 }
1767 ubifs_remount_ro(c); 1767 ubifs_remount_ro(c);
1768 }
1768 1769
1769 if (c->bulk_read == 1) 1770 if (c->bulk_read == 1)
1770 bu_init(c); 1771 bu_init(c);
@@ -1774,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1774 c->bu.buf = NULL; 1775 c->bu.buf = NULL;
1775 } 1776 }
1776 1777
1778 ubifs_assert(c->lst.taken_empty_lebs == 1);
1777 return 0; 1779 return 0;
1778} 1780}
1779 1781
1780struct super_operations ubifs_super_operations = { 1782const struct super_operations ubifs_super_operations = {
1781 .alloc_inode = ubifs_alloc_inode, 1783 .alloc_inode = ubifs_alloc_inode,
1782 .destroy_inode = ubifs_destroy_inode, 1784 .destroy_inode = ubifs_destroy_inode,
1783 .put_super = ubifs_put_super, 1785 .put_super = ubifs_put_super,
@@ -2044,15 +2046,6 @@ out_close:
2044 2046
2045static void ubifs_kill_sb(struct super_block *sb) 2047static void ubifs_kill_sb(struct super_block *sb)
2046{ 2048{
2047 struct ubifs_info *c = sb->s_fs_info;
2048
2049 /*
2050 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
2051 * in order to be outside BKL.
2052 */
2053 if (sb->s_root)
2054 commit_on_unmount(c);
2055 /* The un-mount routine is actually done in put_super() */
2056 generic_shutdown_super(sb); 2049 generic_shutdown_super(sb);
2057} 2050}
2058 2051
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f7e36f54552..fa28a84c6a1 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
443 * This function performs that same function as ubifs_read_node except that 443 * This function performs that same function as ubifs_read_node except that
444 * it does not require that there is actually a node present and instead 444 * it does not require that there is actually a node present and instead
445 * the return code indicates if a node was read. 445 * the return code indicates if a node was read.
446 *
447 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
448 * is true (it is controlled by corresponding mount option). However, if
449 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
450 * checked.
446 */ 451 */
447static int try_read_node(const struct ubifs_info *c, void *buf, int type, 452static int try_read_node(const struct ubifs_info *c, void *buf, int type,
448 int len, int lnum, int offs) 453 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
470 if (node_len != len) 475 if (node_len != len)
471 return 0; 476 return 0;
472 477
473 if (type == UBIFS_DATA_NODE && !c->always_chk_crc) 478 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
474 if (c->no_chk_data_crc) 479 return 1;
475 return 0;
476 480
477 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 481 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
478 node_crc = le32_to_cpu(ch->crc); 482 node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
1506 * 1510 *
1507 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function 1511 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
1508 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares 1512 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
1509 * maxumum possible amount of nodes for bulk-read. 1513 * maximum possible amount of nodes for bulk-read.
1510 */ 1514 */
1511int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) 1515int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
1512{ 1516{
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2a4cc66d0..039a68bee29 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -426,9 +426,9 @@ struct ubifs_unclean_leb {
426 * LEB properties flags. 426 * LEB properties flags.
427 * 427 *
428 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
429 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
431 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
433 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken 434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
@@ -961,7 +961,6 @@ struct ubifs_debug_info;
961 * @cs_lock: commit state lock 961 * @cs_lock: commit state lock
962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
963 * 963 *
964 * @fast_unmount: do not run journal commit before un-mounting
965 * @big_lpt: flag that LPT is too big to write whole during commit 964 * @big_lpt: flag that LPT is too big to write whole during commit
966 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 965 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
967 * recovery) 966 * recovery)
@@ -1202,7 +1201,6 @@ struct ubifs_info {
1202 spinlock_t cs_lock; 1201 spinlock_t cs_lock;
1203 wait_queue_head_t cmt_wq; 1202 wait_queue_head_t cmt_wq;
1204 1203
1205 unsigned int fast_unmount:1;
1206 unsigned int big_lpt:1; 1204 unsigned int big_lpt:1;
1207 unsigned int no_chk_data_crc:1; 1205 unsigned int no_chk_data_crc:1;
1208 unsigned int bulk_read:1; 1206 unsigned int bulk_read:1;
@@ -1405,13 +1403,13 @@ extern struct list_head ubifs_infos;
1405extern spinlock_t ubifs_infos_lock; 1403extern spinlock_t ubifs_infos_lock;
1406extern atomic_long_t ubifs_clean_zn_cnt; 1404extern atomic_long_t ubifs_clean_zn_cnt;
1407extern struct kmem_cache *ubifs_inode_slab; 1405extern struct kmem_cache *ubifs_inode_slab;
1408extern struct super_operations ubifs_super_operations; 1406extern const struct super_operations ubifs_super_operations;
1409extern struct address_space_operations ubifs_file_address_operations; 1407extern const struct address_space_operations ubifs_file_address_operations;
1410extern struct file_operations ubifs_file_operations; 1408extern const struct file_operations ubifs_file_operations;
1411extern struct inode_operations ubifs_file_inode_operations; 1409extern const struct inode_operations ubifs_file_inode_operations;
1412extern struct file_operations ubifs_dir_operations; 1410extern const struct file_operations ubifs_dir_operations;
1413extern struct inode_operations ubifs_dir_inode_operations; 1411extern const struct inode_operations ubifs_dir_inode_operations;
1414extern struct inode_operations ubifs_symlink_inode_operations; 1412extern const struct inode_operations ubifs_symlink_inode_operations;
1415extern struct backing_dev_info ubifs_backing_dev_info; 1413extern struct backing_dev_info ubifs_backing_dev_info;
1416extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; 1414extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1417 1415
@@ -1428,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
1428int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, 1426int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
1429 int offs, int dtype); 1427 int offs, int dtype);
1430int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 1428int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
1431 int offs, int quiet, int chk_crc); 1429 int offs, int quiet, int must_chk_crc);
1432void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); 1430void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
1433void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); 1431void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
1434int ubifs_io_init(struct ubifs_info *c); 1432int ubifs_io_init(struct ubifs_info *c);
@@ -1495,6 +1493,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1495void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1493void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1496 struct ubifs_budget_req *req); 1494 struct ubifs_budget_req *req);
1497long long ubifs_get_free_space(struct ubifs_info *c); 1495long long ubifs_get_free_space(struct ubifs_info *c);
1496long long ubifs_get_free_space_nolock(struct ubifs_info *c);
1498int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1497int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1499void ubifs_convert_page_budget(struct ubifs_info *c); 1498void ubifs_convert_page_budget(struct ubifs_info *c);
1500long long ubifs_reported_space(const struct ubifs_info *c, long long free); 1499long long ubifs_reported_space(const struct ubifs_info *c, long long free);
@@ -1603,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
1603int ubifs_orphan_start_commit(struct ubifs_info *c); 1602int ubifs_orphan_start_commit(struct ubifs_info *c);
1604int ubifs_orphan_end_commit(struct ubifs_info *c); 1603int ubifs_orphan_end_commit(struct ubifs_info *c);
1605int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); 1604int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
1605int ubifs_clear_orphans(struct ubifs_info *c);
1606 1606
1607/* lpt.c */ 1607/* lpt.c */
1608int ubifs_calc_lpt_geom(struct ubifs_info *c); 1608int ubifs_calc_lpt_geom(struct ubifs_info *c);
@@ -1646,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
1646 const struct ubifs_lprops *lp, 1646 const struct ubifs_lprops *lp,
1647 int free, int dirty, int flags, 1647 int free, int dirty, int flags,
1648 int idx_gc_cnt); 1648 int idx_gc_cnt);
1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); 1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, 1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
1651 int cat); 1651 int cat);
1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, 1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c2..a608e72fa40 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
371 /* flush inodes and push all remaining buffers out to disk */ 371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp); 372 xfs_quiesce_fs(mp);
373 373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); 374 /*
375 * Just warn here till VFS can correctly support
376 * read-only remount without racing.
377 */
378 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
375 379
376 /* Push the superblock and write an unmount record */ 380 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1); 381 error = xfs_log_sbcount(mp, 1);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee71349..f8278cfcc1d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
55 struct file *file, *target_file; 55 struct file *file, *target_file;
56 int error = 0; 56 int error = 0;
57 57
58 sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
59 if (!sxp) {
60 error = XFS_ERROR(ENOMEM);
61 goto out;
62 }
63
64 /* Pull information for the target fd */ 58 /* Pull information for the target fd */
65 file = fget((int)sxp->sx_fdtarget); 59 file = fget((int)sxp->sx_fdtarget);
66 if (!file) { 60 if (!file) {
67 error = XFS_ERROR(EINVAL); 61 error = XFS_ERROR(EINVAL);
68 goto out_free_sxp; 62 goto out;
69 } 63 }
70 64
71 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { 65 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
109 fput(target_file); 103 fput(target_file);
110 out_put_file: 104 out_put_file:
111 fput(file); 105 fput(file);
112 out_free_sxp:
113 kmem_free(sxp);
114 out: 106 out:
115 return error; 107 return error;
116} 108}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94..b1047de2fff 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void xlog_recover_check_summary(xlog_t *);
70xfs_buf_t * 70xfs_buf_t *
71xlog_get_bp( 71xlog_get_bp(
72 xlog_t *log, 72 xlog_t *log,
73 int num_bblks) 73 int nbblks)
74{ 74{
75 ASSERT(num_bblks > 0); 75 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
76 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
77 XFS_ERROR_REPORT("xlog_get_bp(1)",
78 XFS_ERRLEVEL_HIGH, log->l_mp);
79 return NULL;
80 }
76 81
77 if (log->l_sectbb_log) { 82 if (log->l_sectbb_log) {
78 if (num_bblks > 1) 83 if (nbblks > 1)
79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 84 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 85 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
81 } 86 }
82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 87 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
83} 88}
84 89
85void 90void
@@ -102,6 +107,13 @@ xlog_bread(
102{ 107{
103 int error; 108 int error;
104 109
110 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
111 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
112 XFS_ERROR_REPORT("xlog_bread(1)",
113 XFS_ERRLEVEL_HIGH, log->l_mp);
114 return EFSCORRUPTED;
115 }
116
105 if (log->l_sectbb_log) { 117 if (log->l_sectbb_log) {
106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 118 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 119 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
139{ 151{
140 int error; 152 int error;
141 153
154 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
155 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
156 XFS_ERROR_REPORT("xlog_bwrite(1)",
157 XFS_ERRLEVEL_HIGH, log->l_mp);
158 return EFSCORRUPTED;
159 }
160
142 if (log->l_sectbb_log) { 161 if (log->l_sectbb_log) {
143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 162 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 163 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);