aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_elf.c14
-rw-r--r--fs/bio-integrity.c26
-rw-r--r--fs/btrfs/Kconfig13
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c277
-rw-r--r--fs/btrfs/ctree.h28
-rw-r--r--fs/btrfs/disk-io.c120
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c438
-rw-r--r--fs/btrfs/extent_io.c132
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file.c5
-rw-r--r--fs/btrfs/inode.c84
-rw-r--r--fs/btrfs/ioctl.c1
-rw-r--r--fs/btrfs/locking.c218
-rw-r--r--fs/btrfs/locking.h6
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/transaction.c4
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c354
-rw-r--r--fs/btrfs/volumes.c49
-rw-r--r--fs/btrfs/xattr.c48
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/cifs/CHANGES4
-rw-r--r--fs/cifs/cifsencrypt.c18
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/connect.c24
-rw-r--r--fs/cifs/dir.c56
-rw-r--r--fs/cifs/inode.c5
-rw-r--r--fs/cifs/md5.c38
-rw-r--r--fs/cifs/md5.h6
-rw-r--r--fs/cifs/transport.c127
-rw-r--r--fs/compat.c2
-rw-r--r--fs/compat_ioctl.c9
-rw-r--r--fs/ecryptfs/crypto.c4
-rw-r--r--fs/eventpoll.c22
-rw-r--r--fs/exec.c28
-rw-r--r--fs/ext2/super.c9
-rw-r--r--fs/ext3/namei.c20
-rw-r--r--fs/ext3/super.c11
-rw-r--r--fs/ext4/balloc.c6
-rw-r--r--fs/ext4/ext4.h7
-rw-r--r--fs/ext4/extents.c2
-rw-r--r--fs/ext4/inode.c9
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/namei.c21
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/hugetlbfs/inode.c8
-rw-r--r--fs/internal.h2
-rw-r--r--fs/jbd/journal.c17
-rw-r--r--fs/jbd2/journal.c6
-rw-r--r--fs/lockd/svclock.c6
-rw-r--r--fs/ocfs2/alloc.c3
-rw-r--r--fs/ocfs2/dcache.c42
-rw-r--r--fs/ocfs2/dcache.h9
-rw-r--r--fs/ocfs2/dlmglue.c4
-rw-r--r--fs/ocfs2/ocfs2.h6
-rw-r--r--fs/ocfs2/quota_global.c4
-rw-r--r--fs/ocfs2/super.c3
-rw-r--r--fs/ocfs2/xattr.c17
-rw-r--r--fs/seq_file.c115
-rw-r--r--fs/super.c4
-rw-r--r--fs/ubifs/budget.c35
-rw-r--r--fs/ubifs/debug.c122
-rw-r--r--fs/ubifs/debug.h36
-rw-r--r--fs/ubifs/dir.c96
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/ubifs/gc.c28
-rw-r--r--fs/ubifs/io.c22
-rw-r--r--fs/ubifs/journal.c2
-rw-r--r--fs/ubifs/lprops.c12
-rw-r--r--fs/ubifs/lpt_commit.c44
-rw-r--r--fs/ubifs/master.c2
-rw-r--r--fs/ubifs/orphan.c38
-rw-r--r--fs/ubifs/super.c195
-rw-r--r--fs/ubifs/tnc.c12
-rw-r--r--fs/ubifs/ubifs.h26
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c6
-rw-r--r--fs/xfs/xfs_dfrag.c10
-rw-r--r--fs/xfs/xfs_log_recover.c31
86 files changed, 2231 insertions, 1095 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e3ff2b9e602f..33b7235f853b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1208 * check for an ELF header. If we find one, dump the first page to 1208 * check for an ELF header. If we find one, dump the first page to
1209 * aid in determining what was mapped here. 1209 * aid in determining what was mapped here.
1210 */ 1210 */
1211 if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) { 1211 if (FILTER(ELF_HEADERS) &&
1212 vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
1212 u32 __user *header = (u32 __user *) vma->vm_start; 1213 u32 __user *header = (u32 __user *) vma->vm_start;
1213 u32 word; 1214 u32 word;
1215 mm_segment_t fs = get_fs();
1214 /* 1216 /*
1215 * Doing it this way gets the constant folded by GCC. 1217 * Doing it this way gets the constant folded by GCC.
1216 */ 1218 */
@@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1223 magic.elfmag[EI_MAG1] = ELFMAG1; 1225 magic.elfmag[EI_MAG1] = ELFMAG1;
1224 magic.elfmag[EI_MAG2] = ELFMAG2; 1226 magic.elfmag[EI_MAG2] = ELFMAG2;
1225 magic.elfmag[EI_MAG3] = ELFMAG3; 1227 magic.elfmag[EI_MAG3] = ELFMAG3;
1226 if (get_user(word, header) == 0 && word == magic.cmp) 1228 /*
1229 * Switch to the user "segment" for get_user(),
1230 * then put back what elf_core_dump() had in place.
1231 */
1232 set_fs(USER_DS);
1233 if (unlikely(get_user(word, header)))
1234 word = 0;
1235 set_fs(fs);
1236 if (word == magic.cmp)
1227 return PAGE_SIZE; 1237 return PAGE_SIZE;
1228 } 1238 }
1229 1239
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d6..549b0144da11 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
140 140
141 iv = bip_vec_idx(bip, bip->bip_vcnt); 141 iv = bip_vec_idx(bip, bip->bip_vcnt);
142 BUG_ON(iv == NULL); 142 BUG_ON(iv == NULL);
143 BUG_ON(iv->bv_page != NULL);
144 143
145 iv->bv_page = page; 144 iv->bv_page = page;
146 iv->bv_len = len; 145 iv->bv_len = len;
@@ -465,7 +464,7 @@ static int bio_integrity_verify(struct bio *bio)
465 464
466 if (ret) { 465 if (ret) {
467 kunmap_atomic(kaddr, KM_USER0); 466 kunmap_atomic(kaddr, KM_USER0);
468 break; 467 return ret;
469 } 468 }
470 469
471 sectors = bv->bv_len / bi->sector_size; 470 sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +492,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
493 struct bio_integrity_payload *bip = 492 struct bio_integrity_payload *bip =
494 container_of(work, struct bio_integrity_payload, bip_work); 493 container_of(work, struct bio_integrity_payload, bip_work);
495 struct bio *bio = bip->bip_bio; 494 struct bio *bio = bip->bip_bio;
496 int error = bip->bip_error; 495 int error;
497 496
498 if (bio_integrity_verify(bio)) { 497 error = bio_integrity_verify(bio);
499 clear_bit(BIO_UPTODATE, &bio->bi_flags);
500 error = -EIO;
501 }
502 498
503 /* Restore original bio completion handler */ 499 /* Restore original bio completion handler */
504 bio->bi_end_io = bip->bip_end_io; 500 bio->bi_end_io = bip->bip_end_io;
505 501 bio_endio(bio, error);
506 if (bio->bi_end_io)
507 bio->bi_end_io(bio, error);
508} 502}
509 503
510/** 504/**
@@ -525,7 +519,17 @@ void bio_integrity_endio(struct bio *bio, int error)
525 519
526 BUG_ON(bip->bip_bio != bio); 520 BUG_ON(bip->bip_bio != bio);
527 521
528 bip->bip_error = error; 522 /* In case of an I/O error there is no point in verifying the
523 * integrity metadata. Restore original bio end_io handler
524 * and run it.
525 */
526 if (error) {
527 bio->bi_end_io = bip->bip_end_io;
528 bio_endio(bio, error);
529
530 return;
531 }
532
529 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); 533 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
530 queue_work(kintegrityd_wq, &bip->bip_work); 534 queue_work(kintegrityd_wq, &bip->bip_work);
531} 535}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf999ea1b..7bb3c020e570 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
16 module will be called btrfs. 16 module will be called btrfs.
17 17
18 If unsure, say N. 18 If unsure, say N.
19
20config BTRFS_FS_POSIX_ACL
21 bool "Btrfs POSIX Access Control Lists"
22 depends on BTRFS_FS
23 select FS_POSIX_ACL
24 help
25 POSIX Access Control Lists (ACLs) support permissions for users and
26 groups beyond the owner/group/world scheme.
27
28 To learn more about Access Control Lists, visit the POSIX ACLs for
29 Linux website <http://acl.bestbits.at/>.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe0..c84ca1f5259a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/kthread.h> 19#include <linux/kthread.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/spinlock.h> 21#include <linux/spinlock.h>
23# include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 24#include "async-thread.h"
25 25
26#define WORK_QUEUED_BIT 0 26#define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
143 struct btrfs_work *work; 143 struct btrfs_work *work;
144 do { 144 do {
145 spin_lock_irq(&worker->lock); 145 spin_lock_irq(&worker->lock);
146again_locked:
146 while (!list_empty(&worker->pending)) { 147 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next; 148 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list); 149 work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
165 check_idle_worker(worker); 166 check_idle_worker(worker);
166 167
167 } 168 }
168 worker->working = 0;
169 if (freezing(current)) { 169 if (freezing(current)) {
170 worker->working = 0;
171 spin_unlock_irq(&worker->lock);
170 refrigerator(); 172 refrigerator();
171 } else { 173 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock); 174 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop()) 175 if (!kthread_should_stop()) {
176 cpu_relax();
177 /*
178 * we've dropped the lock, did someone else
179 * jump_in?
180 */
181 smp_mb();
182 if (!list_empty(&worker->pending))
183 continue;
184
185 /*
186 * this short schedule allows more work to
187 * come in without the queue functions
188 * needing to go through wake_up_process()
189 *
190 * worker->working is still 1, so nobody
191 * is going to try and wake us up
192 */
193 schedule_timeout(1);
194 smp_mb();
195 if (!list_empty(&worker->pending))
196 continue;
197
198 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending))
202 goto again_locked;
203
204 /*
205 * this makes sure we get a wakeup when someone
206 * adds something new to the queue
207 */
208 worker->working = 0;
209 spin_unlock_irq(&worker->lock);
210
175 schedule(); 211 schedule();
212 }
176 __set_current_state(TASK_RUNNING); 213 __set_current_state(TASK_RUNNING);
177 } 214 }
178 } while (!kthread_should_stop()); 215 } while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
350{ 387{
351 struct btrfs_worker_thread *worker = work->worker; 388 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags; 389 unsigned long flags;
390 int wake = 0;
353 391
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 392 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out; 393 goto out;
356 394
357 spin_lock_irqsave(&worker->lock, flags); 395 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending); 396 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending);
360 398
361 /* by definition we're busy, take ourselves off the idle 399 /* by definition we're busy, take ourselves off the idle
362 * list 400 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
368 &worker->workers->worker_list); 406 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags); 407 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 } 408 }
409 if (!worker->working) {
410 wake = 1;
411 worker->working = 1;
412 }
371 413
372 spin_unlock_irqrestore(&worker->lock, flags); 414 spin_unlock_irqrestore(&worker->lock, flags);
373 415 if (wake)
416 wake_up_process(worker->task);
374out: 417out:
418
375 return 0; 419 return 0;
376} 420}
377 421
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
398 } 442 }
399 443
400 spin_lock_irqsave(&worker->lock, flags); 444 spin_lock_irqsave(&worker->lock, flags);
445
446 list_add_tail(&work->list, &worker->pending);
401 atomic_inc(&worker->num_pending); 447 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker); 448 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404 449
405 /* 450 /*
406 * avoid calling into wake_up_process if this thread has already 451 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d9..ab07627084f1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h> 35#include <linux/pagevec.h>
37#include "compat.h" 36#include "compat.h"
38#include "ctree.h" 37#include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c0776816..35443cc4b9a9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
54 return path; 54 return path;
55} 55}
56 56
57/*
58 * set all locked nodes in the path to blocking locks. This should
59 * be done before scheduling
60 */
61noinline void btrfs_set_path_blocking(struct btrfs_path *p)
62{
63 int i;
64 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
65 if (p->nodes[i] && p->locks[i])
66 btrfs_set_lock_blocking(p->nodes[i]);
67 }
68}
69
70/*
71 * reset all the locked nodes in the patch to spinning locks.
72 */
73noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
74{
75 int i;
76 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
77 if (p->nodes[i] && p->locks[i])
78 btrfs_clear_lock_blocking(p->nodes[i]);
79 }
80}
81
57/* this also releases the path */ 82/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 83void btrfs_free_path(struct btrfs_path *p)
59{ 84{
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 297 if (IS_ERR(cow))
273 return PTR_ERR(cow); 298 return PTR_ERR(cow);
274 299
300 /* cow is set to blocking by btrfs_init_new_buffer */
301
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 302 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 303 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 304 btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +415,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
388 WARN_ON(1); 415 WARN_ON(1);
389 } 416 }
390 417
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid && 418 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid && 419 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 420 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf; 421 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest); 422 WARN_ON(prealloc_dest);
398 return 0; 423 return 0;
399 } 424 }
400 spin_unlock(&root->fs_info->hash_lock); 425
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 426 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
427
428 if (parent)
429 btrfs_set_lock_blocking(parent);
430 btrfs_set_lock_blocking(buf);
431
402 ret = __btrfs_cow_block(trans, root, buf, parent, 432 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0, 433 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest); 434 prealloc_dest);
@@ -504,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
504 if (parent_nritems == 1) 534 if (parent_nritems == 1)
505 return 0; 535 return 0;
506 536
537 btrfs_set_lock_blocking(parent);
538
507 for (i = start_slot; i < end_slot; i++) { 539 for (i = start_slot; i < end_slot; i++) {
508 int close = 1; 540 int close = 1;
509 541
@@ -564,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
564 search_start = last_block; 596 search_start = last_block;
565 597
566 btrfs_tree_lock(cur); 598 btrfs_tree_lock(cur);
599 btrfs_set_lock_blocking(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i, 600 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start, 601 &cur, search_start,
569 min(16 * blocksize, 602 min(16 * blocksize,
@@ -862,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 return 0; 895 return 0;
863 896
864 mid = path->nodes[level]; 897 mid = path->nodes[level];
898
865 WARN_ON(!path->locks[level]); 899 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 900 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 901
@@ -884,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
884 /* promote the child to a root */ 918 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0); 919 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child); 920 btrfs_tree_lock(child);
921 btrfs_set_lock_blocking(child);
887 BUG_ON(!child); 922 BUG_ON(!child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 923 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret); 924 BUG_ON(ret);
@@ -900,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
900 935
901 add_root_to_dirty_list(root); 936 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child); 937 btrfs_tree_unlock(child);
938
903 path->locks[level] = 0; 939 path->locks[level] = 0;
904 path->nodes[level] = NULL; 940 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid); 941 clean_tree_block(trans, root, mid);
@@ -924,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
924 left = read_node_slot(root, parent, pslot - 1); 960 left = read_node_slot(root, parent, pslot - 1);
925 if (left) { 961 if (left) {
926 btrfs_tree_lock(left); 962 btrfs_tree_lock(left);
963 btrfs_set_lock_blocking(left);
927 wret = btrfs_cow_block(trans, root, left, 964 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0); 965 parent, pslot - 1, &left, 0);
929 if (wret) { 966 if (wret) {
@@ -934,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
934 right = read_node_slot(root, parent, pslot + 1); 971 right = read_node_slot(root, parent, pslot + 1);
935 if (right) { 972 if (right) {
936 btrfs_tree_lock(right); 973 btrfs_tree_lock(right);
974 btrfs_set_lock_blocking(right);
937 wret = btrfs_cow_block(trans, root, right, 975 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0); 976 parent, pslot + 1, &right, 0);
939 if (wret) { 977 if (wret) {
@@ -1109,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 u32 left_nr; 1147 u32 left_nr;
1110 1148
1111 btrfs_tree_lock(left); 1149 btrfs_tree_lock(left);
1150 btrfs_set_lock_blocking(left);
1151
1112 left_nr = btrfs_header_nritems(left); 1152 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1153 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1; 1154 wret = 1;
@@ -1155,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1155 */ 1195 */
1156 if (right) { 1196 if (right) {
1157 u32 right_nr; 1197 u32 right_nr;
1198
1158 btrfs_tree_lock(right); 1199 btrfs_tree_lock(right);
1200 btrfs_set_lock_blocking(right);
1201
1159 right_nr = btrfs_header_nritems(right); 1202 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1203 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1; 1204 wret = 1;
@@ -1210,8 +1253,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1210 struct btrfs_disk_key disk_key; 1253 struct btrfs_disk_key disk_key;
1211 u32 nritems; 1254 u32 nritems;
1212 u64 search; 1255 u64 search;
1213 u64 lowest_read; 1256 u64 target;
1214 u64 highest_read;
1215 u64 nread = 0; 1257 u64 nread = 0;
1216 int direction = path->reada; 1258 int direction = path->reada;
1217 struct extent_buffer *eb; 1259 struct extent_buffer *eb;
@@ -1235,8 +1277,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1235 return; 1277 return;
1236 } 1278 }
1237 1279
1238 highest_read = search; 1280 target = search;
1239 lowest_read = search;
1240 1281
1241 nritems = btrfs_header_nritems(node); 1282 nritems = btrfs_header_nritems(node);
1242 nr = slot; 1283 nr = slot;
@@ -1256,27 +1297,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
1256 break; 1297 break;
1257 } 1298 }
1258 search = btrfs_node_blockptr(node, nr); 1299 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) || 1300 if ((search <= target && target - search <= 65536) ||
1260 (search < lowest_read && lowest_read - search <= 16384) || 1301 (search > target && search - target <= 65536)) {
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize, 1302 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr)); 1303 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize; 1304 nread += blocksize;
1265 } 1305 }
1266 nscan++; 1306 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) 1307 if ((nread > 65536 || nscan > 32))
1268 break; 1308 break;
1309 }
1310}
1269 1311
1270 if (nread > (256 * 1024) || nscan > 128) 1312/*
1271 break; 1313 * returns -EAGAIN if it had to drop the path, or zero if everything was in
1314 * cache
1315 */
1316static noinline int reada_for_balance(struct btrfs_root *root,
1317 struct btrfs_path *path, int level)
1318{
1319 int slot;
1320 int nritems;
1321 struct extent_buffer *parent;
1322 struct extent_buffer *eb;
1323 u64 gen;
1324 u64 block1 = 0;
1325 u64 block2 = 0;
1326 int ret = 0;
1327 int blocksize;
1272 1328
1273 if (search < lowest_read) 1329 parent = path->nodes[level - 1];
1274 lowest_read = search; 1330 if (!parent)
1275 if (search > highest_read) 1331 return 0;
1276 highest_read = search; 1332
1333 nritems = btrfs_header_nritems(parent);
1334 slot = path->slots[level];
1335 blocksize = btrfs_level_size(root, level);
1336
1337 if (slot > 0) {
1338 block1 = btrfs_node_blockptr(parent, slot - 1);
1339 gen = btrfs_node_ptr_generation(parent, slot - 1);
1340 eb = btrfs_find_tree_block(root, block1, blocksize);
1341 if (eb && btrfs_buffer_uptodate(eb, gen))
1342 block1 = 0;
1343 free_extent_buffer(eb);
1344 }
1345 if (slot < nritems) {
1346 block2 = btrfs_node_blockptr(parent, slot + 1);
1347 gen = btrfs_node_ptr_generation(parent, slot + 1);
1348 eb = btrfs_find_tree_block(root, block2, blocksize);
1349 if (eb && btrfs_buffer_uptodate(eb, gen))
1350 block2 = 0;
1351 free_extent_buffer(eb);
1352 }
1353 if (block1 || block2) {
1354 ret = -EAGAIN;
1355 btrfs_release_path(root, path);
1356 if (block1)
1357 readahead_tree_block(root, block1, blocksize, 0);
1358 if (block2)
1359 readahead_tree_block(root, block2, blocksize, 0);
1360
1361 if (block1) {
1362 eb = read_tree_block(root, block1, blocksize, 0);
1363 free_extent_buffer(eb);
1364 }
1365 if (block1) {
1366 eb = read_tree_block(root, block2, blocksize, 0);
1367 free_extent_buffer(eb);
1368 }
1277 } 1369 }
1370 return ret;
1278} 1371}
1279 1372
1373
1280/* 1374/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers 1375 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because 1376 * in the tree. The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1328} 1422}
1329 1423
1330/* 1424/*
1425 * This releases any locks held in the path starting at level and
1426 * going all the way up to the root.
1427 *
1428 * btrfs_search_slot will keep the lock held on higher nodes in a few
1429 * corner cases, such as COW of the block at slot zero in the node. This
1430 * ignores those rules, and it should only be called when there are no
1431 * more updates to be done higher up in the tree.
1432 */
1433noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1434{
1435 int i;
1436
1437 if (path->keep_locks || path->lowest_level)
1438 return;
1439
1440 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1441 if (!path->nodes[i])
1442 continue;
1443 if (!path->locks[i])
1444 continue;
1445 btrfs_tree_unlock(path->nodes[i]);
1446 path->locks[i] = 0;
1447 }
1448}
1449
1450/*
1331 * look for key in the tree. path is filled in with nodes along the way 1451 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf 1452 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0) 1453 * level of the path (level 0)
@@ -1387,32 +1507,30 @@ again:
1387 int wret; 1507 int wret;
1388 1508
1389 /* is a cow on this block not required */ 1509 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid && 1510 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid && 1511 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1512 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done; 1513 goto cow_done;
1396 } 1514 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398 1515
1399 /* ok, we have to cow, is our old prealloc the right 1516 /* ok, we have to cow, is our old prealloc the right
1400 * size? 1517 * size?
1401 */ 1518 */
1402 if (prealloc_block.objectid && 1519 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) { 1520 prealloc_block.offset != b->len) {
1521 btrfs_release_path(root, p);
1404 btrfs_free_reserved_extent(root, 1522 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid, 1523 prealloc_block.objectid,
1406 prealloc_block.offset); 1524 prealloc_block.offset);
1407 prealloc_block.objectid = 0; 1525 prealloc_block.objectid = 0;
1526 goto again;
1408 } 1527 }
1409 1528
1410 /* 1529 /*
1411 * for higher level blocks, try not to allocate blocks 1530 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held. 1531 * with the block and the parent locks held.
1413 */ 1532 */
1414 if (level > 1 && !prealloc_block.objectid && 1533 if (level > 0 && !prealloc_block.objectid) {
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len; 1534 u32 size = b->len;
1417 u64 hint = b->start; 1535 u64 hint = b->start;
1418 1536
@@ -1425,6 +1543,8 @@ again:
1425 goto again; 1543 goto again;
1426 } 1544 }
1427 1545
1546 btrfs_set_path_blocking(p);
1547
1428 wret = btrfs_cow_block(trans, root, b, 1548 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1], 1549 p->nodes[level + 1],
1430 p->slots[level + 1], 1550 p->slots[level + 1],
@@ -1446,6 +1566,22 @@ cow_done:
1446 if (!p->skip_locking) 1566 if (!p->skip_locking)
1447 p->locks[level] = 1; 1567 p->locks[level] = 1;
1448 1568
1569 btrfs_clear_path_blocking(p);
1570
1571 /*
1572 * we have a lock on b and as long as we aren't changing
1573 * the tree, there is no way to for the items in b to change.
1574 * It is safe to drop the lock on our parent before we
1575 * go through the expensive btree search on b.
1576 *
1577 * If cow is true, then we might be changing slot zero,
1578 * which may require changing the parent. So, we can't
1579 * drop the lock until after we know which slot we're
1580 * operating on.
1581 */
1582 if (!cow)
1583 btrfs_unlock_up_safe(p, level + 1);
1584
1449 ret = check_block(root, p, level); 1585 ret = check_block(root, p, level);
1450 if (ret) { 1586 if (ret) {
1451 ret = -1; 1587 ret = -1;
@@ -1453,6 +1589,7 @@ cow_done:
1453 } 1589 }
1454 1590
1455 ret = bin_search(b, key, level, &slot); 1591 ret = bin_search(b, key, level, &slot);
1592
1456 if (level != 0) { 1593 if (level != 0) {
1457 if (ret && slot > 0) 1594 if (ret && slot > 0)
1458 slot -= 1; 1595 slot -= 1;
@@ -1460,7 +1597,16 @@ cow_done:
1460 if ((p->search_for_split || ins_len > 0) && 1597 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >= 1598 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1599 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level); 1600 int sret;
1601
1602 sret = reada_for_balance(root, p, level);
1603 if (sret)
1604 goto again;
1605
1606 btrfs_set_path_blocking(p);
1607 sret = split_node(trans, root, p, level);
1608 btrfs_clear_path_blocking(p);
1609
1464 BUG_ON(sret > 0); 1610 BUG_ON(sret > 0);
1465 if (sret) { 1611 if (sret) {
1466 ret = sret; 1612 ret = sret;
@@ -1468,9 +1614,19 @@ cow_done:
1468 } 1614 }
1469 b = p->nodes[level]; 1615 b = p->nodes[level];
1470 slot = p->slots[level]; 1616 slot = p->slots[level];
1471 } else if (ins_len < 0) { 1617 } else if (ins_len < 0 &&
1472 int sret = balance_level(trans, root, p, 1618 btrfs_header_nritems(b) <
1473 level); 1619 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1620 int sret;
1621
1622 sret = reada_for_balance(root, p, level);
1623 if (sret)
1624 goto again;
1625
1626 btrfs_set_path_blocking(p);
1627 sret = balance_level(trans, root, p, level);
1628 btrfs_clear_path_blocking(p);
1629
1474 if (sret) { 1630 if (sret) {
1475 ret = sret; 1631 ret = sret;
1476 goto done; 1632 goto done;
@@ -1504,7 +1660,7 @@ cow_done:
1504 * of the btree by dropping locks before 1660 * of the btree by dropping locks before
1505 * we read. 1661 * we read.
1506 */ 1662 */
1507 if (level > 1) { 1663 if (level > 0) {
1508 btrfs_release_path(NULL, p); 1664 btrfs_release_path(NULL, p);
1509 if (tmp) 1665 if (tmp)
1510 free_extent_buffer(tmp); 1666 free_extent_buffer(tmp);
@@ -1519,6 +1675,7 @@ cow_done:
1519 free_extent_buffer(tmp); 1675 free_extent_buffer(tmp);
1520 goto again; 1676 goto again;
1521 } else { 1677 } else {
1678 btrfs_set_path_blocking(p);
1522 if (tmp) 1679 if (tmp)
1523 free_extent_buffer(tmp); 1680 free_extent_buffer(tmp);
1524 if (should_reada) 1681 if (should_reada)
@@ -1528,14 +1685,29 @@ cow_done:
1528 b = read_node_slot(root, b, slot); 1685 b = read_node_slot(root, b, slot);
1529 } 1686 }
1530 } 1687 }
1531 if (!p->skip_locking) 1688 if (!p->skip_locking) {
1532 btrfs_tree_lock(b); 1689 int lret;
1690
1691 btrfs_clear_path_blocking(p);
1692 lret = btrfs_try_spin_lock(b);
1693
1694 if (!lret) {
1695 btrfs_set_path_blocking(p);
1696 btrfs_tree_lock(b);
1697 btrfs_clear_path_blocking(p);
1698 }
1699 }
1533 } else { 1700 } else {
1534 p->slots[level] = slot; 1701 p->slots[level] = slot;
1535 if (ins_len > 0 && 1702 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) { 1703 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key, 1704 int sret;
1705
1706 btrfs_set_path_blocking(p);
1707 sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0); 1708 p, ins_len, ret == 0);
1709 btrfs_clear_path_blocking(p);
1710
1539 BUG_ON(sret > 0); 1711 BUG_ON(sret > 0);
1540 if (sret) { 1712 if (sret) {
1541 ret = sret; 1713 ret = sret;
@@ -1549,12 +1721,16 @@ cow_done:
1549 } 1721 }
1550 ret = 1; 1722 ret = 1;
1551done: 1723done:
1724 /*
1725 * we don't really know what they plan on doing with the path
1726 * from here on, so for now just mark it as blocking
1727 */
1728 btrfs_set_path_blocking(p);
1552 if (prealloc_block.objectid) { 1729 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root, 1730 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid, 1731 prealloc_block.objectid,
1555 prealloc_block.offset); 1732 prealloc_block.offset);
1556 } 1733 }
1557
1558 return ret; 1734 return ret;
1559} 1735}
1560 1736
@@ -1578,6 +1754,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1754 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret); 1755 BUG_ON(ret);
1580 1756
1757 btrfs_set_lock_blocking(eb);
1758
1581 parent = eb; 1759 parent = eb;
1582 while (1) { 1760 while (1) {
1583 level = btrfs_header_level(parent); 1761 level = btrfs_header_level(parent);
@@ -1602,6 +1780,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1602 eb = read_tree_block(root, bytenr, blocksize, 1780 eb = read_tree_block(root, bytenr, blocksize,
1603 generation); 1781 generation);
1604 btrfs_tree_lock(eb); 1782 btrfs_tree_lock(eb);
1783 btrfs_set_lock_blocking(eb);
1605 } 1784 }
1606 1785
1607 /* 1786 /*
@@ -1626,6 +1805,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1626 eb = read_tree_block(root, bytenr, blocksize, 1805 eb = read_tree_block(root, bytenr, blocksize,
1627 generation); 1806 generation);
1628 btrfs_tree_lock(eb); 1807 btrfs_tree_lock(eb);
1808 btrfs_set_lock_blocking(eb);
1629 } 1809 }
1630 1810
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1811 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2172,6 +2352,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2172 2352
2173 right = read_node_slot(root, upper, slot + 1); 2353 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right); 2354 btrfs_tree_lock(right);
2355 btrfs_set_lock_blocking(right);
2356
2175 free_space = btrfs_leaf_free_space(root, right); 2357 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size) 2358 if (free_space < data_size)
2177 goto out_unlock; 2359 goto out_unlock;
@@ -2367,6 +2549,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2367 2549
2368 left = read_node_slot(root, path->nodes[1], slot - 1); 2550 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left); 2551 btrfs_tree_lock(left);
2552 btrfs_set_lock_blocking(left);
2553
2370 free_space = btrfs_leaf_free_space(root, left); 2554 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) { 2555 if (free_space < data_size) {
2372 ret = 1; 2556 ret = 1;
@@ -2825,6 +3009,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2825 path->keep_locks = 0; 3009 path->keep_locks = 0;
2826 BUG_ON(ret); 3010 BUG_ON(ret);
2827 3011
3012 /*
3013 * make sure any changes to the path from split_leaf leave it
3014 * in a blocking state
3015 */
3016 btrfs_set_path_blocking(path);
3017
2828 leaf = path->nodes[0]; 3018 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3019 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 3020
@@ -3354,6 +3544,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3354 BUG(); 3544 BUG();
3355 } 3545 }
3356out: 3546out:
3547 btrfs_unlock_up_safe(path, 1);
3357 return ret; 3548 return ret;
3358} 3549}
3359 3550
@@ -3441,15 +3632,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3441{ 3632{
3442 int ret; 3633 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3634 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3635 u64 parent_start = path->nodes[1]->start;
3636 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3444 3637
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3638 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret) 3639 if (ret)
3447 return ret; 3640 return ret;
3448 3641
3642 /*
3643 * btrfs_free_extent is expensive, we want to make sure we
3644 * aren't holding any locks when we call it
3645 */
3646 btrfs_unlock_up_safe(path, 0);
3647
3449 ret = btrfs_free_extent(trans, root, bytenr, 3648 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0), 3649 btrfs_level_size(root, 0),
3451 path->nodes[1]->start, 3650 parent_start, parent_owner,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1); 3651 root_gen, 0, 1);
3454 return ret; 3652 return ret;
3455} 3653}
@@ -3721,12 +3919,14 @@ find_next_key:
3721 */ 3919 */
3722 if (slot >= nritems) { 3920 if (slot >= nritems) {
3723 path->slots[level] = slot; 3921 path->slots[level] = slot;
3922 btrfs_set_path_blocking(path);
3724 sret = btrfs_find_next_key(root, path, min_key, level, 3923 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans); 3924 cache_only, min_trans);
3726 if (sret == 0) { 3925 if (sret == 0) {
3727 btrfs_release_path(root, path); 3926 btrfs_release_path(root, path);
3728 goto again; 3927 goto again;
3729 } else { 3928 } else {
3929 btrfs_clear_path_blocking(path);
3730 goto out; 3930 goto out;
3731 } 3931 }
3732 } 3932 }
@@ -3738,16 +3938,20 @@ find_next_key:
3738 unlock_up(path, level, 1); 3938 unlock_up(path, level, 1);
3739 goto out; 3939 goto out;
3740 } 3940 }
3941 btrfs_set_path_blocking(path);
3741 cur = read_node_slot(root, cur, slot); 3942 cur = read_node_slot(root, cur, slot);
3742 3943
3743 btrfs_tree_lock(cur); 3944 btrfs_tree_lock(cur);
3945
3744 path->locks[level - 1] = 1; 3946 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur; 3947 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1); 3948 unlock_up(path, level, 1);
3949 btrfs_clear_path_blocking(path);
3747 } 3950 }
3748out: 3951out:
3749 if (ret == 0) 3952 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key)); 3953 memcpy(min_key, &found_key, sizeof(found_key));
3954 btrfs_set_path_blocking(path);
3751 return ret; 3955 return ret;
3752} 3956}
3753 3957
@@ -3843,6 +4047,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3843 if (ret < 0) 4047 if (ret < 0)
3844 return ret; 4048 return ret;
3845 4049
4050 btrfs_set_path_blocking(path);
3846 nritems = btrfs_header_nritems(path->nodes[0]); 4051 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /* 4052 /*
3848 * by releasing the path above we dropped all our locks. A balance 4053 * by releasing the path above we dropped all our locks. A balance
@@ -3873,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3873 free_extent_buffer(next); 4078 free_extent_buffer(next);
3874 } 4079 }
3875 4080
4081 /* the path was set to blocking above */
3876 if (level == 1 && (path->locks[1] || path->skip_locking) && 4082 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada) 4083 path->reada)
3878 reada_for_search(root, path, level, slot, 0); 4084 reada_for_search(root, path, level, slot, 0);
@@ -3881,6 +4087,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3881 if (!path->skip_locking) { 4087 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c)); 4088 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next); 4089 btrfs_tree_lock(next);
4090 btrfs_set_lock_blocking(next);
3884 } 4091 }
3885 break; 4092 break;
3886 } 4093 }
@@ -3897,12 +4104,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3897 path->locks[level] = 1; 4104 path->locks[level] = 1;
3898 if (!level) 4105 if (!level)
3899 break; 4106 break;
4107
4108 btrfs_set_path_blocking(path);
3900 if (level == 1 && path->locks[1] && path->reada) 4109 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0); 4110 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0); 4111 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) { 4112 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4113 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next); 4114 btrfs_tree_lock(next);
4115 btrfs_set_lock_blocking(next);
3906 } 4116 }
3907 } 4117 }
3908done: 4118done:
@@ -3927,6 +4137,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3927 4137
3928 while (1) { 4138 while (1) {
3929 if (path->slots[0] == 0) { 4139 if (path->slots[0] == 0) {
4140 btrfs_set_path_blocking(path);
3930 ret = btrfs_prev_leaf(root, path); 4141 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0) 4142 if (ret != 0)
3932 return ret; 4143 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f88113..531db112c8bd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -454,17 +454,11 @@ struct btrfs_timespec {
454 __le32 nsec; 454 __le32 nsec;
455} __attribute__ ((__packed__)); 455} __attribute__ ((__packed__));
456 456
457typedef enum { 457enum btrfs_compression_type {
458 BTRFS_COMPRESS_NONE = 0, 458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1, 459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2, 460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type; 461};
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468 462
469struct btrfs_inode_item { 463struct btrfs_inode_item {
470 /* nfs style generation number */ 464 /* nfs style generation number */
@@ -701,9 +695,7 @@ struct btrfs_fs_info {
701 struct btrfs_transaction *running_transaction; 695 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle; 696 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait; 697 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait; 698 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707 699
708 struct btrfs_super_block super_copy; 700 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit; 701 struct btrfs_super_block super_for_commit;
@@ -711,7 +703,6 @@ struct btrfs_fs_info {
711 struct super_block *sb; 703 struct super_block *sb;
712 struct inode *btree_inode; 704 struct inode *btree_inode;
713 struct backing_dev_info bdi; 705 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex; 706 struct mutex trans_mutex;
716 struct mutex tree_log_mutex; 707 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex; 708 struct mutex transaction_kthread_mutex;
@@ -730,10 +721,6 @@ struct btrfs_fs_info {
730 atomic_t async_submit_draining; 721 atomic_t async_submit_draining;
731 atomic_t nr_async_bios; 722 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages; 723 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737 724
738 /* 725 /*
739 * this is used by the balancing code to wait for all the pending 726 * this is used by the balancing code to wait for all the pending
@@ -833,7 +820,14 @@ struct btrfs_root {
833 struct kobject root_kobj; 820 struct kobject root_kobj;
834 struct completion kobj_unregister; 821 struct completion kobj_unregister;
835 struct mutex objectid_mutex; 822 struct mutex objectid_mutex;
823
836 struct mutex log_mutex; 824 struct mutex log_mutex;
825 wait_queue_head_t log_writer_wait;
826 wait_queue_head_t log_commit_wait[2];
827 atomic_t log_writers;
828 atomic_t log_commit[2];
829 unsigned long log_transid;
830 unsigned long log_batch;
837 831
838 u64 objectid; 832 u64 objectid;
839 u64 last_trans; 833 u64 last_trans;
@@ -1841,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void); 1835struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p); 1836void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p); 1837void btrfs_init_path(struct btrfs_path *p);
1838void btrfs_set_path_blocking(struct btrfs_path *p);
1839void btrfs_clear_path_blocking(struct btrfs_path *p);
1840void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1841
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1842int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr); 1843 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1844int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae5..5aebddd71193 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/scatterlist.h> 21#include <linux/scatterlist.h>
@@ -800,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 799 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801 800
802 if (ret == 0) 801 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE; 802 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
804 else 803 else
805 WARN_ON(1); 804 WARN_ON(1);
806 return buf; 805 return buf;
@@ -814,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
814 if (btrfs_header_generation(buf) == 813 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) { 814 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf)); 815 WARN_ON(!btrfs_tree_locked(buf));
816
817 /* ugh, clear_extent_buffer_dirty can be expensive */
818 btrfs_set_lock_blocking(buf);
819
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 820 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf); 821 buf);
819 } 822 }
@@ -850,6 +853,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
850 spin_lock_init(&root->list_lock); 853 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex); 854 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex); 855 mutex_init(&root->log_mutex);
856 init_waitqueue_head(&root->log_writer_wait);
857 init_waitqueue_head(&root->log_commit_wait[0]);
858 init_waitqueue_head(&root->log_commit_wait[1]);
859 atomic_set(&root->log_commit[0], 0);
860 atomic_set(&root->log_commit[1], 0);
861 atomic_set(&root->log_writers, 0);
862 root->log_batch = 0;
863 root->log_transid = 0;
853 extent_io_tree_init(&root->dirty_log_pages, 864 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS); 865 fs_info->btree_inode->i_mapping, GFP_NOFS);
855 866
@@ -934,15 +945,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
934 return 0; 945 return 0;
935} 946}
936 947
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 948static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info) 949 struct btrfs_fs_info *fs_info)
939{ 950{
940 struct btrfs_root *root; 951 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root; 952 struct btrfs_root *tree_root = fs_info->tree_root;
953 struct extent_buffer *leaf;
942 954
943 root = kzalloc(sizeof(*root), GFP_NOFS); 955 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root) 956 if (!root)
945 return -ENOMEM; 957 return ERR_PTR(-ENOMEM);
946 958
947 __setup_root(tree_root->nodesize, tree_root->leafsize, 959 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize, 960 tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +963,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 963 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 964 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 965 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
966 /*
967 * log trees do not get reference counted because they go away
968 * before a real commit is actually done. They do store pointers
969 * to file data extents, and those reference counts still get
970 * updated (along with back refs to the log tree).
971 */
954 root->ref_cows = 0; 972 root->ref_cows = 0;
955 973
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize, 974 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID, 975 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0); 976 trans->transid, 0, 0, 0);
977 if (IS_ERR(leaf)) {
978 kfree(root);
979 return ERR_CAST(leaf);
980 }
959 981
982 root->node = leaf;
960 btrfs_set_header_nritems(root->node, 0); 983 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0); 984 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start); 985 btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +991,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
968 BTRFS_FSID_SIZE); 991 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node); 992 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node); 993 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root; 994 return root;
995}
996
997int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
998 struct btrfs_fs_info *fs_info)
999{
1000 struct btrfs_root *log_root;
1001
1002 log_root = alloc_log_tree(trans, fs_info);
1003 if (IS_ERR(log_root))
1004 return PTR_ERR(log_root);
1005 WARN_ON(fs_info->log_root_tree);
1006 fs_info->log_root_tree = log_root;
1007 return 0;
1008}
1009
1010int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1011 struct btrfs_root *root)
1012{
1013 struct btrfs_root *log_root;
1014 struct btrfs_inode_item *inode_item;
1015
1016 log_root = alloc_log_tree(trans, root->fs_info);
1017 if (IS_ERR(log_root))
1018 return PTR_ERR(log_root);
1019
1020 log_root->last_trans = trans->transid;
1021 log_root->root_key.offset = root->root_key.objectid;
1022
1023 inode_item = &log_root->root_item.inode;
1024 inode_item->generation = cpu_to_le64(1);
1025 inode_item->size = cpu_to_le64(3);
1026 inode_item->nlink = cpu_to_le32(1);
1027 inode_item->nbytes = cpu_to_le64(root->leafsize);
1028 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1029
1030 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
1031 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1032
1033 WARN_ON(root->log_root);
1034 root->log_root = log_root;
1035 root->log_transid = 0;
972 return 0; 1036 return 0;
973} 1037}
974 1038
@@ -1136,7 +1200,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{ 1200{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1201 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0; 1202 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device; 1203 struct btrfs_device *device;
1141 struct backing_dev_info *bdi; 1204 struct backing_dev_info *bdi;
1142#if 0 1205#if 0
@@ -1144,8 +1207,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1144 btrfs_congested_async(info, 0)) 1207 btrfs_congested_async(info, 0))
1145 return 1; 1208 return 1;
1146#endif 1209#endif
1147 list_for_each(cur, &info->fs_devices->devices) { 1210 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev) 1211 if (!device->bdev)
1150 continue; 1212 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev); 1213 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1225,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1163 */ 1225 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1226static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{ 1227{
1166 struct list_head *cur;
1167 struct btrfs_device *device; 1228 struct btrfs_device *device;
1168 struct btrfs_fs_info *info; 1229 struct btrfs_fs_info *info;
1169 1230
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1231 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) { 1232 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev) 1233 if (!device->bdev)
1174 continue; 1234 continue;
1175 1235
@@ -1447,7 +1507,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1447 INIT_LIST_HEAD(&fs_info->dead_roots); 1507 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers); 1508 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1509 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock); 1510 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock); 1511 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock); 1512 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1594,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1535 init_waitqueue_head(&fs_info->transaction_throttle); 1594 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait); 1595 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait); 1596 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542 1597
1543 __setup_root(4096, 4096, 4096, 4096, tree_root, 1598 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1599 fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 * low idle thresh 1682 * low idle thresh
1628 */ 1683 */
1629 fs_info->endio_workers.idle_thresh = 4; 1684 fs_info->endio_workers.idle_thresh = 4;
1685 fs_info->endio_meta_workers.idle_thresh = 4;
1686
1630 fs_info->endio_write_workers.idle_thresh = 64; 1687 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64; 1688 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632 1689
@@ -1740,13 +1797,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1797 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1798 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner"); 1799 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread) 1800 if (IS_ERR(fs_info->cleaner_kthread))
1744 goto fail_csum_root; 1801 goto fail_csum_root;
1745 1802
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1803 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root, 1804 tree_root,
1748 "btrfs-transaction"); 1805 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread) 1806 if (IS_ERR(fs_info->transaction_kthread))
1750 goto fail_cleaner; 1807 goto fail_cleaner;
1751 1808
1752 if (btrfs_super_log_root(disk_super) != 0) { 1809 if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1885,14 @@ fail_sb_buffer:
1828fail_iput: 1885fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1886 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode); 1887 iput(fs_info->btree_inode);
1831fail: 1888
1832 btrfs_close_devices(fs_info->fs_devices); 1889 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1890 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1891 bdi_destroy(&fs_info->bdi);
1834 1892
1893fail:
1835 kfree(extent_root); 1894 kfree(extent_root);
1836 kfree(tree_root); 1895 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info); 1896 kfree(fs_info);
1839 kfree(chunk_root); 1897 kfree(chunk_root);
1840 kfree(dev_root); 1898 kfree(dev_root);
@@ -1995,7 +2053,6 @@ static int write_dev_supers(struct btrfs_device *device,
1995 2053
1996int write_all_supers(struct btrfs_root *root, int max_mirrors) 2054int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{ 2055{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices; 2056 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev; 2057 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb; 2058 struct btrfs_super_block *sb;
@@ -2011,8 +2068,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2011 2068
2012 sb = &root->fs_info->super_for_commit; 2069 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item; 2070 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) { 2071 list_for_each_entry(dev, head, dev_list) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) { 2072 if (!dev->bdev) {
2017 total_errors++; 2073 total_errors++;
2018 continue; 2074 continue;
@@ -2045,8 +2101,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2045 } 2101 }
2046 2102
2047 total_errors = 0; 2103 total_errors = 0;
2048 list_for_each(cur, head) { 2104 list_for_each_entry(dev, head, dev_list) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev) 2105 if (!dev->bdev)
2051 continue; 2106 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable) 2107 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2260 u64 transid = btrfs_header_generation(buf); 2315 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode; 2316 struct inode *btree_inode = root->fs_info->btree_inode;
2262 2317
2318 btrfs_set_lock_blocking(buf);
2319
2263 WARN_ON(!btrfs_tree_locked(buf)); 2320 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) { 2321 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2322 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2302,14 +2359,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2302 int ret; 2359 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2360 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0) 2361 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE; 2362 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2306 return ret; 2363 return ret;
2307} 2364}
2308 2365
2309int btree_lock_page_hook(struct page *page) 2366int btree_lock_page_hook(struct page *page)
2310{ 2367{
2311 struct inode *inode = page->mapping->host; 2368 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2369 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb; 2370 struct extent_buffer *eb;
2315 unsigned long len; 2371 unsigned long len;
@@ -2324,9 +2380,7 @@ int btree_lock_page_hook(struct page *page)
2324 goto out; 2380 goto out;
2325 2381
2326 btrfs_tree_lock(eb); 2382 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2383 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb); 2384 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb); 2385 free_extent_buffer(eb);
2332out: 2386out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b7..494a56eb2986 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info); 98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root);
101int btree_lock_page_hook(struct page *page); 103int btree_lock_page_hook(struct page *page);
102#endif 104#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..7527523c2d2d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include "compat.h" 23#include "compat.h"
24#include "hash.h" 24#include "hash.h"
25#include "crc32c.h" 25#include "crc32c.h"
@@ -30,7 +30,6 @@
30#include "volumes.h" 30#include "volumes.h"
31#include "locking.h" 31#include "locking.h"
32#include "ref-cache.h" 32#include "ref-cache.h"
33#include "compat.h"
34 33
35#define PENDING_EXTENT_INSERT 0 34#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 35#define PENDING_EXTENT_DELETE 1
@@ -326,10 +325,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 325 u64 flags)
327{ 326{
328 struct list_head *head = &info->space_info; 327 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 328 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 329 list_for_each_entry(found, head, list) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags) 330 if (found->flags == flags)
334 return found; 331 return found;
335 } 332 }
@@ -1525,15 +1522,55 @@ out:
1525 return ret; 1522 return ret;
1526} 1523}
1527 1524
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1525/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1526 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1527 * can be in just about any order, and it is likely to have clusters of
1528 * things that are close together and clusters of things that are not.
1529 *
1530 * To help reduce the seeks that come with updating all of these reference
1531 * counts, sort them by byte number before actual updates are done.
1532 *
1533 * struct refsort is used to match byte number to slot in the btree block.
1534 * we sort based on the byte number and then use the slot to actually
1535 * find the item.
1536 *
1537 * struct refsort is smaller than strcut btrfs_item and smaller than
1538 * struct btrfs_key_ptr. Since we're currently limited to the page size
1539 * for a btree block, there's no way for a kmalloc of refsorts for a
1540 * single node to be bigger than a page.
1541 */
1542struct refsort {
1543 u64 bytenr;
1544 u32 slot;
1545};
1546
1547/*
1548 * for passing into sort()
1549 */
1550static int refsort_cmp(const void *a_void, const void *b_void)
1551{
1552 const struct refsort *a = a_void;
1553 const struct refsort *b = b_void;
1554
1555 if (a->bytenr < b->bytenr)
1556 return -1;
1557 if (a->bytenr > b->bytenr)
1558 return 1;
1559 return 0;
1560}
1561
1562
1563noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1564 struct btrfs_root *root,
1565 struct extent_buffer *orig_buf,
1566 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1567{
1532 u64 bytenr; 1568 u64 bytenr;
1533 u64 ref_root; 1569 u64 ref_root;
1534 u64 orig_root; 1570 u64 orig_root;
1535 u64 ref_generation; 1571 u64 ref_generation;
1536 u64 orig_generation; 1572 u64 orig_generation;
1573 struct refsort *sorted;
1537 u32 nritems; 1574 u32 nritems;
1538 u32 nr_file_extents = 0; 1575 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1576 struct btrfs_key key;
@@ -1542,6 +1579,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1579 int level;
1543 int ret = 0; 1580 int ret = 0;
1544 int faili = 0; 1581 int faili = 0;
1582 int refi = 0;
1583 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1584 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1585 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1586
@@ -1553,6 +1592,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1592 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1593 level = btrfs_header_level(buf);
1555 1594
1595 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1596 BUG_ON(!sorted);
1597
1556 if (root->ref_cows) { 1598 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1599 process_func = __btrfs_inc_extent_ref;
1558 } else { 1600 } else {
@@ -1565,6 +1607,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1607 process_func = __btrfs_update_extent_ref;
1566 } 1608 }
1567 1609
1610 /*
1611 * we make two passes through the items. In the first pass we
1612 * only record the byte number and slot. Then we sort based on
1613 * byte number and do the actual work based on the sorted results
1614 */
1568 for (i = 0; i < nritems; i++) { 1615 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1616 cond_resched();
1570 if (level == 0) { 1617 if (level == 0) {
@@ -1581,6 +1628,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1628 continue;
1582 1629
1583 nr_file_extents++; 1630 nr_file_extents++;
1631 sorted[refi].bytenr = bytenr;
1632 sorted[refi].slot = i;
1633 refi++;
1634 } else {
1635 bytenr = btrfs_node_blockptr(buf, i);
1636 sorted[refi].bytenr = bytenr;
1637 sorted[refi].slot = i;
1638 refi++;
1639 }
1640 }
1641 /*
1642 * if refi == 0, we didn't actually put anything into the sorted
1643 * array and we're done
1644 */
1645 if (refi == 0)
1646 goto out;
1647
1648 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1649
1650 for (i = 0; i < refi; i++) {
1651 cond_resched();
1652 slot = sorted[i].slot;
1653 bytenr = sorted[i].bytenr;
1654
1655 if (level == 0) {
1656 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1657
1585 ret = process_func(trans, root, bytenr, 1658 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1659 orig_buf->start, buf->start,
@@ -1589,25 +1662,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1662 key.objectid);
1590 1663
1591 if (ret) { 1664 if (ret) {
1592 faili = i; 1665 faili = slot;
1593 WARN_ON(1); 1666 WARN_ON(1);
1594 goto fail; 1667 goto fail;
1595 } 1668 }
1596 } else { 1669 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1670 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1671 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1672 orig_root, ref_root,
1601 orig_generation, ref_generation, 1673 orig_generation, ref_generation,
1602 level - 1); 1674 level - 1);
1603 if (ret) { 1675 if (ret) {
1604 faili = i; 1676 faili = slot;
1605 WARN_ON(1); 1677 WARN_ON(1);
1606 goto fail; 1678 goto fail;
1607 } 1679 }
1608 } 1680 }
1609 } 1681 }
1610out: 1682out:
1683 kfree(sorted);
1611 if (nr_extents) { 1684 if (nr_extents) {
1612 if (level == 0) 1685 if (level == 0)
1613 *nr_extents = nr_file_extents; 1686 *nr_extents = nr_file_extents;
@@ -1616,6 +1689,7 @@ out:
1616 } 1689 }
1617 return 0; 1690 return 0;
1618fail: 1691fail:
1692 kfree(sorted);
1619 WARN_ON(1); 1693 WARN_ON(1);
1620 return ret; 1694 return ret;
1621} 1695}
@@ -2159,7 +2233,8 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2233 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2234 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2235 if (ret) {
2162 if (skipped && all && !num_inserts) { 2236 if (skipped && all && !num_inserts &&
2237 list_empty(&update_list)) {
2163 skipped = 0; 2238 skipped = 0;
2164 search = 0; 2239 search = 0;
2165 continue; 2240 continue;
@@ -2547,6 +2622,7 @@ again:
2547 if (ret) { 2622 if (ret) {
2548 if (all && skipped && !nr) { 2623 if (all && skipped && !nr) {
2549 search = 0; 2624 search = 0;
2625 skipped = 0;
2550 continue; 2626 continue;
2551 } 2627 }
2552 mutex_unlock(&info->extent_ins_mutex); 2628 mutex_unlock(&info->extent_ins_mutex);
@@ -2700,13 +2776,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 2776 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 2777 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 2778 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 2779 mutex_lock(&root->fs_info->pinned_mutex);
2704 2780 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 2781 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 2782 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 2783 return 0;
2712 } 2784 }
@@ -3014,7 +3086,6 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3086static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3087{
3016 struct btrfs_block_group_cache *cache; 3088 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3089
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3090 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3091 (unsigned long long)(info->total_bytes - info->bytes_used -
@@ -3022,8 +3093,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3022 (info->full) ? "" : "not "); 3093 (info->full) ? "" : "not ");
3023 3094
3024 down_read(&info->groups_sem); 3095 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3096 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3097 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3098 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3099 "%llu pinned %llu reserved\n",
@@ -3342,7 +3412,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3342 btrfs_set_header_generation(buf, trans->transid); 3412 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf); 3413 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3414 clean_tree_block(trans, root, buf);
3415
3416 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3417 btrfs_set_buffer_uptodate(buf);
3418
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3419 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3420 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3421 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3424,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3424 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3425 }
3353 trans->blocks_used++; 3426 trans->blocks_used++;
3427 /* this returns a buffer locked for blocking */
3354 return buf; 3428 return buf;
3355} 3429}
3356 3430
@@ -3388,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3462{
3389 u64 leaf_owner; 3463 u64 leaf_owner;
3390 u64 leaf_generation; 3464 u64 leaf_generation;
3465 struct refsort *sorted;
3391 struct btrfs_key key; 3466 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3467 struct btrfs_file_extent_item *fi;
3393 int i; 3468 int i;
3394 int nritems; 3469 int nritems;
3395 int ret; 3470 int ret;
3471 int refi = 0;
3472 int slot;
3396 3473
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3474 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3475 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3476 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3477 leaf_generation = btrfs_header_generation(leaf);
3401 3478
3479 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3480 /* we do this loop twice. The first time we build a list
3481 * of the extents we have a reference on, then we sort the list
3482 * by bytenr. The second time around we actually do the
3483 * extent freeing.
3484 */
3402 for (i = 0; i < nritems; i++) { 3485 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3486 u64 disk_bytenr;
3404 cond_resched(); 3487 cond_resched();
3405 3488
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3489 btrfs_item_key_to_cpu(leaf, &key, i);
3490
3491 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3492 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3493 continue;
3494
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3495 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3496
3497 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3498 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3499 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3500 continue;
3413 /* 3501
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3502 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3503
3504 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3505 if (disk_bytenr == 0)
3419 continue; 3506 continue;
3420 3507
3508 sorted[refi].bytenr = disk_bytenr;
3509 sorted[refi].slot = i;
3510 refi++;
3511 }
3512
3513 if (refi == 0)
3514 goto out;
3515
3516 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3517
3518 for (i = 0; i < refi; i++) {
3519 u64 disk_bytenr;
3520
3521 disk_bytenr = sorted[i].bytenr;
3522 slot = sorted[i].slot;
3523
3524 cond_resched();
3525
3526 btrfs_item_key_to_cpu(leaf, &key, slot);
3527 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3528 continue;
3529
3530 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3531
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3532 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3533 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3534 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3539 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3540 cond_resched();
3430 } 3541 }
3542out:
3543 kfree(sorted);
3431 return 0; 3544 return 0;
3432} 3545}
3433 3546
@@ -3437,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3550{
3438 int i; 3551 int i;
3439 int ret; 3552 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3553 struct btrfs_extent_info *info;
3554 struct refsort *sorted;
3555
3556 if (ref->nritems == 0)
3557 return 0;
3441 3558
3559 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3560 for (i = 0; i < ref->nritems; i++) {
3561 sorted[i].bytenr = ref->extents[i].bytenr;
3562 sorted[i].slot = i;
3563 }
3564 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3565
3566 /*
3567 * the items in the ref were sorted when the ref was inserted
3568 * into the ref cache, so this is already in order
3569 */
3570 for (i = 0; i < ref->nritems; i++) {
3571 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3572 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3573 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3574 ref->owner, ref->generation,
@@ -3453,6 +3582,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3582 info++;
3454 } 3583 }
3455 3584
3585 kfree(sorted);
3456 return 0; 3586 return 0;
3457} 3587}
3458 3588
@@ -3497,6 +3627,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3627}
3498 3628
3499/* 3629/*
3630 * this is used while deleting old snapshots, and it drops the refs
3631 * on a whole subtree starting from a level 1 node.
3632 *
3633 * The idea is to sort all the leaf pointers, and then drop the
3634 * ref on all the leaves in order. Most of the time the leaves
3635 * will have ref cache entries, so no leaf IOs will be required to
3636 * find the extents they have references on.
3637 *
3638 * For each leaf, any references it has are also dropped in order
3639 *
3640 * This ends up dropping the references in something close to optimal
3641 * order for reading and modifying the extent allocation tree.
3642 */
3643static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3644 struct btrfs_root *root,
3645 struct btrfs_path *path)
3646{
3647 u64 bytenr;
3648 u64 root_owner;
3649 u64 root_gen;
3650 struct extent_buffer *eb = path->nodes[1];
3651 struct extent_buffer *leaf;
3652 struct btrfs_leaf_ref *ref;
3653 struct refsort *sorted = NULL;
3654 int nritems = btrfs_header_nritems(eb);
3655 int ret;
3656 int i;
3657 int refi = 0;
3658 int slot = path->slots[1];
3659 u32 blocksize = btrfs_level_size(root, 0);
3660 u32 refs;
3661
3662 if (nritems == 0)
3663 goto out;
3664
3665 root_owner = btrfs_header_owner(eb);
3666 root_gen = btrfs_header_generation(eb);
3667 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3668
3669 /*
3670 * step one, sort all the leaf pointers so we don't scribble
3671 * randomly into the extent allocation tree
3672 */
3673 for (i = slot; i < nritems; i++) {
3674 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3675 sorted[refi].slot = i;
3676 refi++;
3677 }
3678
3679 /*
3680 * nritems won't be zero, but if we're picking up drop_snapshot
3681 * after a crash, slot might be > 0, so double check things
3682 * just in case.
3683 */
3684 if (refi == 0)
3685 goto out;
3686
3687 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3688
3689 /*
3690 * the first loop frees everything the leaves point to
3691 */
3692 for (i = 0; i < refi; i++) {
3693 u64 ptr_gen;
3694
3695 bytenr = sorted[i].bytenr;
3696
3697 /*
3698 * check the reference count on this leaf. If it is > 1
3699 * we just decrement it below and don't update any
3700 * of the refs the leaf points to.
3701 */
3702 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3703 BUG_ON(ret);
3704 if (refs != 1)
3705 continue;
3706
3707 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3708
3709 /*
3710 * the leaf only had one reference, which means the
3711 * only thing pointing to this leaf is the snapshot
3712 * we're deleting. It isn't possible for the reference
3713 * count to increase again later
3714 *
3715 * The reference cache is checked for the leaf,
3716 * and if found we'll be able to drop any refs held by
3717 * the leaf without needing to read it in.
3718 */
3719 ref = btrfs_lookup_leaf_ref(root, bytenr);
3720 if (ref && ref->generation != ptr_gen) {
3721 btrfs_free_leaf_ref(root, ref);
3722 ref = NULL;
3723 }
3724 if (ref) {
3725 ret = cache_drop_leaf_ref(trans, root, ref);
3726 BUG_ON(ret);
3727 btrfs_remove_leaf_ref(root, ref);
3728 btrfs_free_leaf_ref(root, ref);
3729 } else {
3730 /*
3731 * the leaf wasn't in the reference cache, so
3732 * we have to read it.
3733 */
3734 leaf = read_tree_block(root, bytenr, blocksize,
3735 ptr_gen);
3736 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3737 BUG_ON(ret);
3738 free_extent_buffer(leaf);
3739 }
3740 atomic_inc(&root->fs_info->throttle_gen);
3741 wake_up(&root->fs_info->transaction_throttle);
3742 cond_resched();
3743 }
3744
3745 /*
3746 * run through the loop again to free the refs on the leaves.
3747 * This is faster than doing it in the loop above because
3748 * the leaves are likely to be clustered together. We end up
3749 * working in nice chunks on the extent allocation tree.
3750 */
3751 for (i = 0; i < refi; i++) {
3752 bytenr = sorted[i].bytenr;
3753 ret = __btrfs_free_extent(trans, root, bytenr,
3754 blocksize, eb->start,
3755 root_owner, root_gen, 0, 1);
3756 BUG_ON(ret);
3757
3758 atomic_inc(&root->fs_info->throttle_gen);
3759 wake_up(&root->fs_info->transaction_throttle);
3760 cond_resched();
3761 }
3762out:
3763 kfree(sorted);
3764
3765 /*
3766 * update the path to show we've processed the entire level 1
3767 * node. This will get saved into the root's drop_snapshot_progress
3768 * field so these drops are not repeated again if this transaction
3769 * commits.
3770 */
3771 path->slots[1] = nritems;
3772 return 0;
3773}
3774
3775/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 3776 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 3777 * counts as it goes.
3502 */ 3778 */
@@ -3511,7 +3787,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 3787 struct extent_buffer *next;
3512 struct extent_buffer *cur; 3788 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 3789 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 3790 u32 blocksize;
3516 int ret; 3791 int ret;
3517 u32 refs; 3792 u32 refs;
@@ -3538,17 +3813,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 3813 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 3814 btrfs_header_nritems(cur))
3540 break; 3815 break;
3816
3817 /* the new code goes down to level 1 and does all the
3818 * leaves pointed to that node in bulk. So, this check
3819 * for level 0 will always be false.
3820 *
3821 * But, the disk format allows the drop_snapshot_progress
3822 * field in the root to leave things in a state where
3823 * a leaf will need cleaning up here. If someone crashes
3824 * with the old code and then boots with the new code,
3825 * we might find a leaf here.
3826 */
3541 if (*level == 0) { 3827 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 3828 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 3829 BUG_ON(ret);
3544 break; 3830 break;
3545 } 3831 }
3832
3833 /*
3834 * once we get to level one, process the whole node
3835 * at once, including everything below it.
3836 */
3837 if (*level == 1) {
3838 ret = drop_level_one_refs(trans, root, path);
3839 BUG_ON(ret);
3840 break;
3841 }
3842
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 3843 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3844 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 3845 blocksize = btrfs_level_size(root, *level - 1);
3549 3846
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3847 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 3848 BUG_ON(ret);
3849
3850 /*
3851 * if there is more than one reference, we don't need
3852 * to read that node to drop any references it has. We
3853 * just drop the ref we hold on that node and move on to the
3854 * next slot in this level.
3855 */
3552 if (refs != 1) { 3856 if (refs != 1) {
3553 parent = path->nodes[*level]; 3857 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 3858 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +3871,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 3871
3568 continue; 3872 continue;
3569 } 3873 }
3874
3570 /* 3875 /*
3571 * at this point, we have a single ref, and since the 3876 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 3877 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 3878 */
3576 if (*level == 1) { 3879 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 3880 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 3881 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 3882 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +3901,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 3901 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 3902 root_gen = btrfs_header_generation(parent);
3633 3903
3904 /*
3905 * cleanup and free the reference on the last node
3906 * we processed
3907 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3908 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 3909 parent->start, root_owner, root_gen,
3636 *level, 1); 3910 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 3911 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 3912 path->nodes[*level] = NULL;
3913
3639 *level += 1; 3914 *level += 1;
3640 BUG_ON(ret); 3915 BUG_ON(ret);
3641 3916
@@ -3687,6 +3962,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 3962
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3963 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 3964 btrfs_tree_lock(next);
3965 btrfs_set_lock_blocking(next);
3690 3966
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 3967 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 3968 &refs);
@@ -3754,6 +4030,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4030 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4031 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4032 struct btrfs_disk_key disk_key;
4033
4034 /*
4035 * there is more work to do in this level.
4036 * Update the drop_progress marker to reflect
4037 * the work we've done so far, and then bump
4038 * the slot number
4039 */
3757 node = path->nodes[i]; 4040 node = path->nodes[i];
3758 path->slots[i]++; 4041 path->slots[i]++;
3759 *level = i; 4042 *level = i;
@@ -3765,6 +4048,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4048 return 0;
3766 } else { 4049 } else {
3767 struct extent_buffer *parent; 4050 struct extent_buffer *parent;
4051
4052 /*
4053 * this whole node is done, free our reference
4054 * on it and go up one level
4055 */
3768 if (path->nodes[*level] == root->node) 4056 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4057 parent = path->nodes[*level];
3770 else 4058 else
@@ -4444,7 +4732,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4732 u64 lock_end = 0;
4445 u64 num_bytes; 4733 u64 num_bytes;
4446 u64 ext_offset; 4734 u64 ext_offset;
4447 u64 first_pos; 4735 u64 search_end = (u64)-1;
4448 u32 nritems; 4736 u32 nritems;
4449 int nr_scaned = 0; 4737 int nr_scaned = 0;
4450 int extent_locked = 0; 4738 int extent_locked = 0;
@@ -4452,7 +4740,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 4740 int ret;
4453 4741
4454 memcpy(&key, leaf_key, sizeof(key)); 4742 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4743 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 4744 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 4745 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +4788,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 4788 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 4789 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 4790 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 4791 key.offset >= search_end)
4505 break; 4792 break;
4506 } 4793 }
4507 4794
@@ -4534,8 +4821,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 4821 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 4822 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 4823
4537 if (first_pos > key.offset - ext_offset) 4824 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 4825 search_end = key.offset - ext_offset +
4826 btrfs_file_extent_ram_bytes(leaf, fi);
4827 }
4539 4828
4540 if (!extent_locked) { 4829 if (!extent_locked) {
4541 lock_start = key.offset; 4830 lock_start = key.offset;
@@ -4724,7 +5013,7 @@ next:
4724 } 5013 }
4725skip: 5014skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5015 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5016 key.offset >= search_end)
4728 break; 5017 break;
4729 5018
4730 cond_resched(); 5019 cond_resched();
@@ -4778,6 +5067,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5067 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5068 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5069 ref->generation = btrfs_header_generation(buf);
5070
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5071 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5072 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5073 btrfs_free_leaf_ref(root, ref);
@@ -5957,9 +6247,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6247 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6248 BUG_ON(!path);
5959 6249
5960 btrfs_remove_free_space_cache(block_group); 6250 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6251 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6252 &root->fs_info->block_group_cache_tree);
6253 spin_unlock(&root->fs_info->block_group_cache_lock);
6254 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6255 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6256 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6257 up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1fa..37d43b516b79 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/pagevec.h> 13#include <linux/pagevec.h>
15#include "extent_io.h" 14#include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
31static LIST_HEAD(states); 30static LIST_HEAD(states);
32 31
33#define LEAK_DEBUG 0 32#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG 33#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock); 34static DEFINE_SPINLOCK(leak_lock);
36#endif 35#endif
37 36
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
120static struct extent_state *alloc_extent_state(gfp_t mask) 119static struct extent_state *alloc_extent_state(gfp_t mask)
121{ 120{
122 struct extent_state *state; 121 struct extent_state *state;
123#ifdef LEAK_DEBUG 122#if LEAK_DEBUG
124 unsigned long flags; 123 unsigned long flags;
125#endif 124#endif
126 125
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
130 state->state = 0; 129 state->state = 0;
131 state->private = 0; 130 state->private = 0;
132 state->tree = NULL; 131 state->tree = NULL;
133#ifdef LEAK_DEBUG 132#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags); 133 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states); 134 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags); 135 spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
145 if (!state) 144 if (!state)
146 return; 145 return;
147 if (atomic_dec_and_test(&state->refs)) { 146 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG 147#if LEAK_DEBUG
149 unsigned long flags; 148 unsigned long flags;
150#endif 149#endif
151 WARN_ON(state->tree); 150 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG 151#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags); 152 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list); 153 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags); 154 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2378,11 +2377,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2378 int scanned = 0; 2377 int scanned = 0;
2379 int range_whole = 0; 2378 int range_whole = 0;
2380 2379
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0); 2380 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) { 2381 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */ 2382 index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2849,98 @@ out:
2855 return sector; 2849 return sector;
2856} 2850}
2857 2851
2852int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2853 __u64 start, __u64 len, get_extent_t *get_extent)
2854{
2855 int ret;
2856 u64 off = start;
2857 u64 max = start + len;
2858 u32 flags = 0;
2859 u64 disko = 0;
2860 struct extent_map *em = NULL;
2861 int end = 0;
2862 u64 em_start = 0, em_len = 0;
2863 unsigned long emflags;
2864 ret = 0;
2865
2866 if (len == 0)
2867 return -EINVAL;
2868
2869 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2870 GFP_NOFS);
2871 em = get_extent(inode, NULL, 0, off, max - off, 0);
2872 if (!em)
2873 goto out;
2874 if (IS_ERR(em)) {
2875 ret = PTR_ERR(em);
2876 goto out;
2877 }
2878 while (!end) {
2879 off = em->start + em->len;
2880 if (off >= max)
2881 end = 1;
2882
2883 em_start = em->start;
2884 em_len = em->len;
2885
2886 disko = 0;
2887 flags = 0;
2888
2889 switch (em->block_start) {
2890 case EXTENT_MAP_LAST_BYTE:
2891 end = 1;
2892 flags |= FIEMAP_EXTENT_LAST;
2893 break;
2894 case EXTENT_MAP_HOLE:
2895 flags |= FIEMAP_EXTENT_UNWRITTEN;
2896 break;
2897 case EXTENT_MAP_INLINE:
2898 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2899 FIEMAP_EXTENT_NOT_ALIGNED);
2900 break;
2901 case EXTENT_MAP_DELALLOC:
2902 flags |= (FIEMAP_EXTENT_DELALLOC |
2903 FIEMAP_EXTENT_UNKNOWN);
2904 break;
2905 default:
2906 disko = em->block_start;
2907 break;
2908 }
2909 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2910 flags |= FIEMAP_EXTENT_ENCODED;
2911
2912 emflags = em->flags;
2913 free_extent_map(em);
2914 em = NULL;
2915
2916 if (!end) {
2917 em = get_extent(inode, NULL, 0, off, max - off, 0);
2918 if (!em)
2919 goto out;
2920 if (IS_ERR(em)) {
2921 ret = PTR_ERR(em);
2922 goto out;
2923 }
2924 emflags = em->flags;
2925 }
2926 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2927 flags |= FIEMAP_EXTENT_LAST;
2928 end = 1;
2929 }
2930
2931 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2932 em_len, flags);
2933 if (ret)
2934 goto out_free;
2935 }
2936out_free:
2937 free_extent_map(em);
2938out:
2939 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2940 GFP_NOFS);
2941 return ret;
2942}
2943
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2944static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i) 2945 unsigned long i)
2860{ 2946{
@@ -2892,15 +2978,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2892 gfp_t mask) 2978 gfp_t mask)
2893{ 2979{
2894 struct extent_buffer *eb = NULL; 2980 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG 2981#if LEAK_DEBUG
2896 unsigned long flags; 2982 unsigned long flags;
2897#endif 2983#endif
2898 2984
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2985 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start; 2986 eb->start = start;
2901 eb->len = len; 2987 eb->len = len;
2902 mutex_init(&eb->mutex); 2988 spin_lock_init(&eb->lock);
2903#ifdef LEAK_DEBUG 2989 init_waitqueue_head(&eb->lock_wq);
2990
2991#if LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags); 2992 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers); 2993 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags); 2994 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +3000,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2912 3000
2913static void __free_extent_buffer(struct extent_buffer *eb) 3001static void __free_extent_buffer(struct extent_buffer *eb)
2914{ 3002{
2915#ifdef LEAK_DEBUG 3003#if LEAK_DEBUG
2916 unsigned long flags; 3004 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags); 3005 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list); 3006 list_del(&eb->leak_list);
@@ -2980,8 +3068,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2980 unlock_page(p); 3068 unlock_page(p);
2981 } 3069 }
2982 if (uptodate) 3070 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE; 3071 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985 3072
2986 spin_lock(&tree->buffer_lock); 3073 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3074 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3222,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3135 unsigned long num_pages; 3222 unsigned long num_pages;
3136 3223
3137 num_pages = num_extent_pages(eb->start, eb->len); 3224 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE; 3225 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3139 3226
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3227 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS); 3228 GFP_NOFS);
@@ -3206,7 +3293,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3206 struct page *page; 3293 struct page *page;
3207 int pg_uptodate = 1; 3294 int pg_uptodate = 1;
3208 3295
3209 if (eb->flags & EXTENT_UPTODATE) 3296 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3210 return 1; 3297 return 1;
3211 3298
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3299 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3329,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 struct bio *bio = NULL; 3329 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0; 3330 unsigned long bio_flags = 0;
3244 3331
3245 if (eb->flags & EXTENT_UPTODATE) 3332 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3246 return 0; 3333 return 0;
3247 3334
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3335 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3360,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3273 } 3360 }
3274 if (all_uptodate) { 3361 if (all_uptodate) {
3275 if (start_i == 0) 3362 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE; 3363 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 goto unlock_exit; 3364 goto unlock_exit;
3278 } 3365 }
3279 3366
@@ -3309,7 +3396,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3309 } 3396 }
3310 3397
3311 if (!ret) 3398 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE; 3399 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3313 return ret; 3400 return ret;
3314 3401
3315unlock_exit: 3402unlock_exit:
@@ -3406,7 +3493,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3406 unmap_extent_buffer(eb, eb->map_token, km); 3493 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL; 3494 eb->map_token = NULL;
3408 save = 1; 3495 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 } 3496 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map, 3497 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km); 3498 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a79137..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
193 unsigned from, unsigned to); 205 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 206sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent); 207 get_extent_t *get_extent);
208int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
209 __u64 start, __u64 len, get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 210int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 211int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 212int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada32..50da69da20ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h> 6#include <linux/hardirq.h>
8#include "extent_map.h" 7#include "extent_map.h"
9 8
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145e..3e8023efaff7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
29#include <linux/writeback.h> 29#include <linux/writeback.h>
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h" 32#include "ctree.h"
34#include "disk-io.h" 33#include "disk-io.h"
35#include "transaction.h" 34#include "transaction.h"
@@ -1215,10 +1214,10 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1215 } 1214 }
1216 mutex_unlock(&root->fs_info->trans_mutex); 1215 mutex_unlock(&root->fs_info->trans_mutex);
1217 1216
1218 root->fs_info->tree_log_batch++; 1217 root->log_batch++;
1219 filemap_fdatawrite(inode->i_mapping); 1218 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1219 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++; 1220 root->log_batch++;
1222 1221
1223 /* 1222 /*
1224 * ok we haven't committed the transaction yet, lets do a commit 1223 * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab41..8f0706210a47 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/statfs.h> 34#include <linux/statfs.h>
35#include <linux/compat.h> 35#include <linux/compat.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h> 37#include <linux/xattr.h>
39#include <linux/posix_acl.h> 38#include <linux/posix_acl.h>
40#include <linux/falloc.h> 39#include <linux/falloc.h>
@@ -51,6 +50,7 @@
51#include "tree-log.h" 50#include "tree-log.h"
52#include "ref-cache.h" 51#include "ref-cache.h"
53#include "compression.h" 52#include "compression.h"
53#include "locking.h"
54 54
55struct btrfs_iget_args { 55struct btrfs_iget_args {
56 u64 ino; 56 u64 ino;
@@ -91,6 +91,16 @@ static noinline int cow_file_range(struct inode *inode,
91 u64 start, u64 end, int *page_started, 91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock); 92 unsigned long *nr_written, int unlock);
93 93
94static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
95{
96 int err;
97
98 err = btrfs_init_acl(inode, dir);
99 if (!err)
100 err = btrfs_xattr_security_init(inode, dir);
101 return err;
102}
103
94/* 104/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There 105 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing. 106 * are countless ways this is incorrect, but it is better than nothing.
@@ -350,6 +360,19 @@ again:
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 360 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 361 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352 362
363 /*
364 * we don't want to send crud past the end of i_size through
365 * compression, that's just a waste of CPU time. So, if the
366 * end of the file is before the start of our current
367 * requested range of bytes, we bail out to the uncompressed
368 * cleanup code that can deal with all of this.
369 *
370 * It isn't really the fastest way to fix things, but this is a
371 * very uncommon corner.
372 */
373 if (actual_end <= start)
374 goto cleanup_and_bail_uncompressed;
375
353 total_compressed = actual_end - start; 376 total_compressed = actual_end - start;
354 377
355 /* we want to make sure that amount of ram required to uncompress 378 /* we want to make sure that amount of ram required to uncompress
@@ -494,6 +517,7 @@ again:
494 goto again; 517 goto again;
495 } 518 }
496 } else { 519 } else {
520cleanup_and_bail_uncompressed:
497 /* 521 /*
498 * No compression, but we still need to write the pages in 522 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked 523 * the file we've been given so far. redirty the locked
@@ -1324,12 +1348,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset, 1348 struct inode *inode, u64 file_offset,
1325 struct list_head *list) 1349 struct list_head *list)
1326{ 1350{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum; 1351 struct btrfs_ordered_sum *sum;
1329 1352
1330 btrfs_set_trans_block_group(trans, inode); 1353 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) { 1354
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1355 list_for_each_entry(sum, list, list) {
1333 btrfs_csum_file_blocks(trans, 1356 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1357 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 } 1358 }
@@ -2013,6 +2036,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2036 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014 2037
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2038 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2039
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2040 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2041 alloc_group_block, 0);
2018 btrfs_free_path(path); 2042 btrfs_free_path(path);
@@ -2039,6 +2063,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2063 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break; 2064 break;
2041 default: 2065 default:
2066 inode->i_op = &btrfs_special_inode_operations;
2042 init_special_inode(inode, inode->i_mode, rdev); 2067 init_special_inode(inode, inode->i_mode, rdev);
2043 break; 2068 break;
2044 } 2069 }
@@ -2108,6 +2133,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2108 goto failed; 2133 goto failed;
2109 } 2134 }
2110 2135
2136 btrfs_unlock_up_safe(path, 1);
2111 leaf = path->nodes[0]; 2137 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2138 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item); 2139 struct btrfs_inode_item);
@@ -2429,6 +2455,8 @@ next_node:
2429 ref->generation = leaf_gen; 2455 ref->generation = leaf_gen;
2430 ref->nritems = 0; 2456 ref->nritems = 0;
2431 2457
2458 btrfs_sort_leaf_ref(ref);
2459
2432 ret = btrfs_add_leaf_ref(root, ref, 0); 2460 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret); 2461 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref); 2462 btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2504,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_path *path; 2504 struct btrfs_path *path;
2477 struct btrfs_key key; 2505 struct btrfs_key key;
2478 struct btrfs_key found_key; 2506 struct btrfs_key found_key;
2479 u32 found_type; 2507 u32 found_type = (u8)-1;
2480 struct extent_buffer *leaf; 2508 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi; 2509 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0; 2510 u64 extent_start = 0;
@@ -2663,6 +2691,8 @@ next:
2663 if (pending_del_nr) 2691 if (pending_del_nr)
2664 goto del_pending; 2692 goto del_pending;
2665 btrfs_release_path(root, path); 2693 btrfs_release_path(root, path);
2694 if (found_type == BTRFS_INODE_ITEM_KEY)
2695 break;
2666 goto search_again; 2696 goto search_again;
2667 } 2697 }
2668 2698
@@ -2679,6 +2709,8 @@ del_pending:
2679 BUG_ON(ret); 2709 BUG_ON(ret);
2680 pending_del_nr = 0; 2710 pending_del_nr = 0;
2681 btrfs_release_path(root, path); 2711 btrfs_release_path(root, path);
2712 if (found_type == BTRFS_INODE_ITEM_KEY)
2713 break;
2682 goto search_again; 2714 goto search_again;
2683 } 2715 }
2684 } 2716 }
@@ -3265,7 +3297,7 @@ skip:
3265 3297
3266 /* Reached end of directory/root. Bump pos past the last item. */ 3298 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3299 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3300 filp->f_pos = INT_LIMIT(off_t);
3269 else 3301 else
3270 filp->f_pos++; 3302 filp->f_pos++;
3271nopos: 3303nopos:
@@ -3458,7 +3490,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3458 root->highest_inode = objectid; 3490 root->highest_inode = objectid;
3459 3491
3460 inode->i_uid = current_fsuid(); 3492 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid(); 3493
3494 if (dir && (dir->i_mode & S_ISGID)) {
3495 inode->i_gid = dir->i_gid;
3496 if (S_ISDIR(mode))
3497 mode |= S_ISGID;
3498 } else
3499 inode->i_gid = current_fsgid();
3500
3462 inode->i_mode = mode; 3501 inode->i_mode = mode;
3463 inode->i_ino = objectid; 3502 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0); 3503 inode_set_bytes(inode, 0);
@@ -3586,7 +3625,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3586 if (IS_ERR(inode)) 3625 if (IS_ERR(inode))
3587 goto out_unlock; 3626 goto out_unlock;
3588 3627
3589 err = btrfs_init_acl(inode, dir); 3628 err = btrfs_init_inode_security(inode, dir);
3590 if (err) { 3629 if (err) {
3591 drop_inode = 1; 3630 drop_inode = 1;
3592 goto out_unlock; 3631 goto out_unlock;
@@ -3649,7 +3688,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3649 if (IS_ERR(inode)) 3688 if (IS_ERR(inode))
3650 goto out_unlock; 3689 goto out_unlock;
3651 3690
3652 err = btrfs_init_acl(inode, dir); 3691 err = btrfs_init_inode_security(inode, dir);
3653 if (err) { 3692 if (err) {
3654 drop_inode = 1; 3693 drop_inode = 1;
3655 goto out_unlock; 3694 goto out_unlock;
@@ -3772,7 +3811,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3772 3811
3773 drop_on_err = 1; 3812 drop_on_err = 1;
3774 3813
3775 err = btrfs_init_acl(inode, dir); 3814 err = btrfs_init_inode_security(inode, dir);
3776 if (err) 3815 if (err)
3777 goto out_fail; 3816 goto out_fail;
3778 3817
@@ -4158,9 +4197,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4158 return -EINVAL; 4197 return -EINVAL;
4159} 4198}
4160 4199
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4200static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4201 __u64 start, __u64 len)
4162{ 4202{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent); 4203 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4164} 4204}
4165 4205
4166int btrfs_readpage(struct file *file, struct page *page) 4206int btrfs_readpage(struct file *file, struct page *page)
@@ -4733,7 +4773,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4733 if (IS_ERR(inode)) 4773 if (IS_ERR(inode))
4734 goto out_unlock; 4774 goto out_unlock;
4735 4775
4736 err = btrfs_init_acl(inode, dir); 4776 err = btrfs_init_inode_security(inode, dir);
4737 if (err) { 4777 if (err) {
4738 drop_inode = 1; 4778 drop_inode = 1;
4739 goto out_unlock; 4779 goto out_unlock;
@@ -4987,13 +5027,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
4987 .clear_bit_hook = btrfs_clear_bit_hook, 5027 .clear_bit_hook = btrfs_clear_bit_hook,
4988}; 5028};
4989 5029
5030/*
5031 * btrfs doesn't support the bmap operation because swapfiles
5032 * use bmap to make a mapping of extents in the file. They assume
5033 * these extents won't change over the life of the file and they
5034 * use the bmap result to do IO directly to the drive.
5035 *
5036 * the btrfs bmap call would return logical addresses that aren't
5037 * suitable for IO and they also will change frequently as COW
5038 * operations happen. So, swapfile + btrfs == corruption.
5039 *
5040 * For now we're avoiding this by dropping bmap.
5041 */
4990static struct address_space_operations btrfs_aops = { 5042static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage, 5043 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage, 5044 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages, 5045 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages, 5046 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page, 5047 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO, 5048 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage, 5049 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage, 5050 .releasepage = btrfs_releasepage,
@@ -5017,6 +5068,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5017 .removexattr = btrfs_removexattr, 5068 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission, 5069 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate, 5070 .fallocate = btrfs_fallocate,
5071 .fiemap = btrfs_fiemap,
5020}; 5072};
5021static struct inode_operations btrfs_special_inode_operations = { 5073static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr, 5074 .getattr = btrfs_getattr,
@@ -5032,4 +5084,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5032 .follow_link = page_follow_link_light, 5084 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link, 5085 .put_link = page_put_link,
5034 .permission = btrfs_permission, 5086 .permission = btrfs_permission,
5087 .setxattr = btrfs_setxattr,
5088 .getxattr = btrfs_getxattr,
5089 .listxattr = btrfs_listxattr,
5090 .removexattr = btrfs_removexattr,
5035}; 5091};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb5..988fdc8b49eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
38#include <linux/compat.h> 38#include <linux/compat.h>
39#include <linux/bit_spinlock.h> 39#include <linux/bit_spinlock.h>
40#include <linux/security.h> 40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h> 41#include <linux/xattr.h>
43#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
44#include "compat.h" 43#include "compat.h"
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..9ebe9385129b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,63 +26,213 @@
26#include "locking.h" 26#include "locking.h"
27 27
28/* 28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 29 * btrfs_header_level() isn't free, so don't call it when lockdep isn't
30 * and the spin is not tuned very extensively. The spinning does make a big 30 * on
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 31 */
32#ifdef CONFIG_DEBUG_LOCK_ALLOC
33static inline void spin_nested(struct extent_buffer *eb)
34{
35 spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
36}
37#else
38static inline void spin_nested(struct extent_buffer *eb)
39{
40 spin_lock(&eb->lock);
41}
42#endif
37 43
38int btrfs_tree_lock(struct extent_buffer *eb) 44/*
45 * Setting a lock to blocking will drop the spinlock and set the
46 * flag that forces other procs who want the lock to wait. After
47 * this you can safely schedule with the lock held.
48 */
49void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{ 50{
40 int i; 51 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
52 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
53 spin_unlock(&eb->lock);
54 }
55 /* exit with the spin lock released and the bit set */
56}
41 57
42 if (mutex_trylock(&eb->mutex)) 58/*
43 return 0; 59 * clearing the blocking flag will take the spinlock again.
60 * After this you can't safely schedule
61 */
62void btrfs_clear_lock_blocking(struct extent_buffer *eb)
63{
64 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
65 spin_nested(eb);
66 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
67 smp_mb__after_clear_bit();
68 }
69 /* exit with the spin lock held */
70}
71
72/*
73 * unfortunately, many of the places that currently set a lock to blocking
74 * don't end up blocking for every long, and often they don't block
75 * at all. For a dbench 50 run, if we don't spin one the blocking bit
76 * at all, the context switch rate can jump up to 400,000/sec or more.
77 *
78 * So, we're still stuck with this crummy spin on the blocking bit,
79 * at least until the most common causes of the short blocks
80 * can be dealt with.
81 */
82static int btrfs_spin_on_block(struct extent_buffer *eb)
83{
84 int i;
44 for (i = 0; i < 512; i++) { 85 for (i = 0; i < 512; i++) {
45 cpu_relax(); 86 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 87 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 88 return 1;
89 if (need_resched())
90 break;
48 } 91 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 92 return 0;
52} 93}
53 94
54int btrfs_try_tree_lock(struct extent_buffer *eb) 95/*
96 * This is somewhat different from trylock. It will take the
97 * spinlock but if it finds the lock is set to blocking, it will
98 * return without the lock held.
99 *
100 * returns 1 if it was able to take the lock and zero otherwise
101 *
102 * After this call, scheduling is not safe without first calling
103 * btrfs_set_lock_blocking()
104 */
105int btrfs_try_spin_lock(struct extent_buffer *eb)
55{ 106{
56 return mutex_trylock(&eb->mutex); 107 int i;
108
109 spin_nested(eb);
110 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
111 return 1;
112 spin_unlock(&eb->lock);
113
114 /* spin for a bit on the BLOCKING flag */
115 for (i = 0; i < 2; i++) {
116 if (!btrfs_spin_on_block(eb))
117 break;
118
119 spin_nested(eb);
120 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
121 return 1;
122 spin_unlock(&eb->lock);
123 }
124 return 0;
57} 125}
58 126
59int btrfs_tree_unlock(struct extent_buffer *eb) 127/*
128 * the autoremove wake function will return 0 if it tried to wake up
129 * a process that was already awake, which means that process won't
130 * count as an exclusive wakeup. The waitq code will continue waking
131 * procs until it finds one that was actually sleeping.
132 *
133 * For btrfs, this isn't quite what we want. We want a single proc
134 * to be notified that the lock is ready for taking. If that proc
135 * already happen to be awake, great, it will loop around and try for
136 * the lock.
137 *
138 * So, btrfs_wake_function always returns 1, even when the proc that we
139 * tried to wake up was already awake.
140 */
141static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
142 int sync, void *key)
60{ 143{
61 mutex_unlock(&eb->mutex); 144 autoremove_wake_function(wait, mode, sync, key);
62 return 0; 145 return 1;
63} 146}
64 147
65int btrfs_tree_locked(struct extent_buffer *eb) 148/*
149 * returns with the extent buffer spinlocked.
150 *
151 * This will spin and/or wait as required to take the lock, and then
152 * return with the spinlock held.
153 *
154 * After this call, scheduling is not safe without first calling
155 * btrfs_set_lock_blocking()
156 */
157int btrfs_tree_lock(struct extent_buffer *eb)
66{ 158{
67 return mutex_is_locked(&eb->mutex); 159 DEFINE_WAIT(wait);
160 wait.func = btrfs_wake_function;
161
162 while(1) {
163 spin_nested(eb);
164
165 /* nobody is blocking, exit with the spinlock held */
166 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
167 return 0;
168
169 /*
170 * we have the spinlock, but the real owner is blocking.
171 * wait for them
172 */
173 spin_unlock(&eb->lock);
174
175 /*
176 * spin for a bit, and if the blocking flag goes away,
177 * loop around
178 */
179 if (btrfs_spin_on_block(eb))
180 continue;
181
182 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
183 TASK_UNINTERRUPTIBLE);
184
185 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
186 schedule();
187
188 finish_wait(&eb->lock_wq, &wait);
189 }
190 return 0;
68} 191}
69 192
70/* 193/*
71 * btrfs_search_slot uses this to decide if it should drop its locks 194 * Very quick trylock, this does not spin or schedule. It returns
72 * before doing something expensive like allocating free blocks for cow. 195 * 1 with the spinlock held if it was able to take the lock, or it
196 * returns zero if it was unable to take the lock.
197 *
198 * After this call, scheduling is not safe without first calling
199 * btrfs_set_lock_blocking()
73 */ 200 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 201int btrfs_try_tree_lock(struct extent_buffer *eb)
75{ 202{
76 int i; 203 if (spin_trylock(&eb->lock)) {
77 struct extent_buffer *eb; 204 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 205 /*
79 eb = path->nodes[i]; 206 * we've got the spinlock, but the real owner is
80 if (!eb) 207 * blocking. Drop the spinlock and return failure
81 break; 208 */
82 smp_mb(); 209 spin_unlock(&eb->lock);
83 if (!list_empty(&eb->mutex.wait_list)) 210 return 0;
84 return 1; 211 }
212 return 1;
85 } 213 }
214 /* someone else has the spinlock giveup */
86 return 0; 215 return 0;
87} 216}
88 217
218int btrfs_tree_unlock(struct extent_buffer *eb)
219{
220 /*
221 * if we were a blocking owner, we don't have the spinlock held
222 * just clear the bit and look for waiters
223 */
224 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
225 smp_mb__after_clear_bit();
226 else
227 spin_unlock(&eb->lock);
228
229 if (waitqueue_active(&eb->lock_wq))
230 wake_up(&eb->lock_wq);
231 return 0;
232}
233
234int btrfs_tree_locked(struct extent_buffer *eb)
235{
236 return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
237 spin_is_locked(&eb->lock);
238}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..6bb0afbff928 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,10 @@
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24int btrfs_tree_locked(struct extent_buffer *eb);
25
25int btrfs_try_tree_lock(struct extent_buffer *eb); 26int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 27int btrfs_try_spin_lock(struct extent_buffer *eb);
28
29void btrfs_set_lock_blocking(struct extent_buffer *eb);
30void btrfs_clear_lock_blocking(struct extent_buffer *eb);
27#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a20940170274..77c2411a5f0f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
613 struct btrfs_sector_sum *sector_sums; 613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered; 614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors; 616 unsigned long num_sectors;
618 unsigned long i; 617 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
624 return 1; 623 return 1;
625 624
626 mutex_lock(&tree->mutex); 625 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) { 626 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) { 627 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize; 628 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums; 629 sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9eab..d0cc62bccb94 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "ref-cache.h" 22#include "ref-cache.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c59..bc283ad2db73 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared); 74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif 76#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db9fb3bc1e33..f3fd7e2cbc38 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
37#include <linux/ctype.h> 37#include <linux/ctype.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h> 40#include <linux/magic.h>
42#include "compat.h" 41#include "compat.h"
43#include "ctree.h" 42#include "ctree.h"
@@ -583,17 +582,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
583 struct btrfs_ioctl_vol_args *vol; 582 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices; 583 struct btrfs_fs_devices *fs_devices;
585 int ret = -ENOTTY; 584 int ret = -ENOTTY;
586 int len;
587 585
588 if (!capable(CAP_SYS_ADMIN)) 586 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM; 587 return -EPERM;
590 588
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 589 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
590 if (!vol)
591 return -ENOMEM;
592
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 593 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT; 594 ret = -EFAULT;
594 goto out; 595 goto out;
595 } 596 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 597
598 switch (cmd) { 598 switch (cmd) {
599 case BTRFS_IOC_SCAN_DEV: 599 case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f9443340..919172de5c9a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -852,11 +852,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
852{ 852{
853 struct btrfs_pending_snapshot *pending; 853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots; 854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret; 855 int ret;
857 856
858 list_for_each(cur, head) { 857 list_for_each_entry(pending, head, list) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending); 858 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret); 859 BUG_ON(ret);
862 } 860 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..20794290256b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1704 1615
1705 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1707 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1709 1621
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1750 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1751 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1753 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1755 1668
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1807 1720
1808 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1810 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1812 1726
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1879 1793
1880 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1882 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1884 1799
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1817 }
1903 } 1818 }
1904 btrfs_free_path(path); 1819 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1820 return ret;
1908} 1821}
1909 1822
1910static int wait_log_commit(struct btrfs_root *log) 1823/*
1824 * helper function to update the item for a given subvolumes log root
1825 * in the tree of log roots
1826 */
1827static int update_log_root(struct btrfs_trans_handle *trans,
1828 struct btrfs_root *log)
1829{
1830 int ret;
1831
1832 if (log->log_transid == 1) {
1833 /* insert root item on the first sync */
1834 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1835 &log->root_key, &log->root_item);
1836 } else {
1837 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1838 &log->root_key, &log->root_item);
1839 }
1840 return ret;
1841}
1842
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1844{
1912 DEFINE_WAIT(wait); 1845 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1846 int index = transid % 2;
1914 1847
1848 /*
1849 * we only allow two pending log transactions at a time,
1850 * so we know that if ours is more than 2 older than the
1851 * current transaction, we're done
1852 */
1915 do { 1853 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1854 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1855 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1856 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1857 if (root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index]))
1920 schedule(); 1859 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1860 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1861 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1862 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1863 atomic_read(&root->log_commit[index]));
1864 return 0;
1865}
1866
1867static int wait_for_writer(struct btrfs_root *root)
1868{
1869 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers))
1875 schedule();
1876 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait);
1878 }
1925 return 0; 1879 return 0;
1926} 1880}
1927 1881
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1887int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1888 struct btrfs_root *root)
1935{ 1889{
1890 int index1;
1891 int index2;
1936 int ret; 1892 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1893 struct btrfs_root *log = root->log_root;
1894 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1895
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1896 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1897 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1898 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1899 wait_log_commit(root, root->log_transid);
1900 mutex_unlock(&root->log_mutex);
1901 return 0;
1944 } 1902 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1903 atomic_set(&root->log_commit[index1], 1);
1904
1905 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1);
1946 1908
1947 while (1) { 1909 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1910 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1911 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1912 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1913 mutex_lock(&root->log_mutex);
1952 1914 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1915 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1916 break;
1965 } 1917 }
1966 1918
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1920 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1921
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1922 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1923 btrfs_set_root_generation(&log->root_item, trans->transid);
1924 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1925
1926 root->log_batch = 0;
1927 root->log_transid++;
1928 log->log_transid = root->log_transid;
1929 smp_mb();
1930 /*
1931 * log tree has been flushed to disk, new modifications of
1932 * the log will be written to new positions. so it's safe to
1933 * allow log writers to go in.
1934 */
1935 mutex_unlock(&root->log_mutex);
1936
1937 mutex_lock(&log_root_tree->log_mutex);
1938 log_root_tree->log_batch++;
1939 atomic_inc(&log_root_tree->log_writers);
1940 mutex_unlock(&log_root_tree->log_mutex);
1941
1942 ret = update_log_root(trans, log);
1943 BUG_ON(ret);
1944
1945 mutex_lock(&log_root_tree->log_mutex);
1946 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1947 smp_mb();
1948 if (waitqueue_active(&log_root_tree->log_writer_wait))
1949 wake_up(&log_root_tree->log_writer_wait);
1950 }
1951
1952 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out;
1957 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1);
1959
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1962
1963 wait_for_writer(log_root_tree);
1964
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1967 BUG_ON(ret);
1972 1968
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1969 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1970 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1971 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1972 btrfs_header_level(log_root_tree->node));
1973
1974 log_root_tree->log_batch = 0;
1975 log_root_tree->log_transid++;
1976 smp_mb();
1977
1978 mutex_unlock(&log_root_tree->log_mutex);
1979
1980 /*
1981 * nobody else is going to jump in and write the the ctree
1982 * super here because the log_commit atomic below is protecting
1983 * us. We must be called with a transaction handle pinning
1984 * the running transaction open, so a full commit can't hop
1985 * in and cause problems either.
1986 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1988
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1989 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1990 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1992 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1993out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1994 atomic_set(&root->log_commit[index1], 0);
1995 smp_mb();
1996 if (waitqueue_active(&root->log_commit_wait[index1]))
1997 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1998 return 0;
1988} 1999}
1989 2000
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2030 start, end, GFP_NOFS);
2020 } 2031 }
2021 2032
2022 log = root->log_root; 2033 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2034 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2035 &log->root_key);
2025 BUG_ON(ret); 2036 BUG_ON(ret);
2037 }
2026 root->log_root = NULL; 2038 root->log_root = NULL;
2027 kfree(root->log_root); 2039 free_extent_buffer(log->node);
2040 kfree(log);
2028 return 0; 2041 return 0;
2029} 2042}
2030 2043
2031/* 2044/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2045 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2046 * mixed in, we have a few interesting corners:
2056 * 2047 *
@@ -2711,11 +2702,6 @@ next_slot:
2711 2702
2712 btrfs_free_path(path); 2703 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2704 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2705out:
2720 return 0; 2706 return 0;
2721} 2707}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1cca2b5..bcd14ebccae1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h> 23#include <asm/div64.h>
25#include "compat.h" 24#include "compat.h"
26#include "ctree.h" 25#include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid) 103 u64 devid, u8 *uuid)
105{ 104{
106 struct btrfs_device *dev; 105 struct btrfs_device *dev;
107 struct list_head *cur;
108 106
109 list_for_each(cur, head) { 107 list_for_each_entry(dev, head, dev_list) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid && 108 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 109 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev; 110 return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
118 115
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 116static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{ 117{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices; 118 struct btrfs_fs_devices *fs_devices;
123 119
124 list_for_each(cur, &fs_uuids) { 120 list_for_each_entry(fs_devices, &fs_uuids, list) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 121 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices; 122 return fs_devices;
128 } 123 }
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
159loop: 154loop:
160 spin_lock(&device->io_lock); 155 spin_lock(&device->io_lock);
161 156
157loop_lock:
162 /* take all the bios off the list at once and process them 158 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the 159 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted 160 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
208 * is now congested. Back off and let other work structs 204 * is now congested. Back off and let other work structs
209 * run instead 205 * run instead
210 */ 206 */
211 if (pending && bdi_write_congested(bdi) && 207 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
212 fs_info->fs_devices->open_devices > 1) { 208 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head; 209 struct bio *old_head;
214 210
@@ -220,7 +216,8 @@ loop:
220 tail->bi_next = old_head; 216 tail->bi_next = old_head;
221 else 217 else
222 device->pending_bio_tail = tail; 218 device->pending_bio_tail = tail;
223 device->running_pending = 0; 219
220 device->running_pending = 1;
224 221
225 spin_unlock(&device->io_lock); 222 spin_unlock(&device->io_lock);
226 btrfs_requeue_work(&device->work); 223 btrfs_requeue_work(&device->work);
@@ -229,6 +226,11 @@ loop:
229 } 226 }
230 if (again) 227 if (again)
231 goto loop; 228 goto loop;
229
230 spin_lock(&device->io_lock);
231 if (device->pending_bios)
232 goto loop_lock;
233 spin_unlock(&device->io_lock);
232done: 234done:
233 return 0; 235 return 0;
234} 236}
@@ -345,14 +347,11 @@ error:
345 347
346int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 348int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
347{ 349{
348 struct list_head *tmp; 350 struct btrfs_device *device, *next;
349 struct list_head *cur;
350 struct btrfs_device *device;
351 351
352 mutex_lock(&uuid_mutex); 352 mutex_lock(&uuid_mutex);
353again: 353again:
354 list_for_each_safe(cur, tmp, &fs_devices->devices) { 354 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
355 device = list_entry(cur, struct btrfs_device, dev_list);
356 if (device->in_fs_metadata) 355 if (device->in_fs_metadata)
357 continue; 356 continue;
358 357
@@ -383,14 +382,12 @@ again:
383 382
384static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
385{ 384{
386 struct list_head *cur;
387 struct btrfs_device *device; 385 struct btrfs_device *device;
388 386
389 if (--fs_devices->opened > 0) 387 if (--fs_devices->opened > 0)
390 return 0; 388 return 0;
391 389
392 list_for_each(cur, &fs_devices->devices) { 390 list_for_each_entry(device, &fs_devices->devices, dev_list) {
393 device = list_entry(cur, struct btrfs_device, dev_list);
394 if (device->bdev) { 391 if (device->bdev) {
395 close_bdev_exclusive(device->bdev, device->mode); 392 close_bdev_exclusive(device->bdev, device->mode);
396 fs_devices->open_devices--; 393 fs_devices->open_devices--;
@@ -439,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
439{ 436{
440 struct block_device *bdev; 437 struct block_device *bdev;
441 struct list_head *head = &fs_devices->devices; 438 struct list_head *head = &fs_devices->devices;
442 struct list_head *cur;
443 struct btrfs_device *device; 439 struct btrfs_device *device;
444 struct block_device *latest_bdev = NULL; 440 struct block_device *latest_bdev = NULL;
445 struct buffer_head *bh; 441 struct buffer_head *bh;
@@ -450,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
450 int seeding = 1; 446 int seeding = 1;
451 int ret = 0; 447 int ret = 0;
452 448
453 list_for_each(cur, head) { 449 list_for_each_entry(device, head, dev_list) {
454 device = list_entry(cur, struct btrfs_device, dev_list);
455 if (device->bdev) 450 if (device->bdev)
456 continue; 451 continue;
457 if (!device->name) 452 if (!device->name)
@@ -578,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
578 *(unsigned long long *)disk_super->fsid, 573 *(unsigned long long *)disk_super->fsid,
579 *(unsigned long long *)(disk_super->fsid + 8)); 574 *(unsigned long long *)(disk_super->fsid + 8));
580 } 575 }
581 printk(KERN_INFO "devid %llu transid %llu %s\n", 576 printk(KERN_CONT "devid %llu transid %llu %s\n",
582 (unsigned long long)devid, (unsigned long long)transid, path); 577 (unsigned long long)devid, (unsigned long long)transid, path);
583 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 578 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
584 579
@@ -1017,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1017 } 1012 }
1018 1013
1019 if (strcmp(device_path, "missing") == 0) { 1014 if (strcmp(device_path, "missing") == 0) {
1020 struct list_head *cur;
1021 struct list_head *devices; 1015 struct list_head *devices;
1022 struct btrfs_device *tmp; 1016 struct btrfs_device *tmp;
1023 1017
1024 device = NULL; 1018 device = NULL;
1025 devices = &root->fs_info->fs_devices->devices; 1019 devices = &root->fs_info->fs_devices->devices;
1026 list_for_each(cur, devices) { 1020 list_for_each_entry(tmp, devices, dev_list) {
1027 tmp = list_entry(cur, struct btrfs_device, dev_list);
1028 if (tmp->in_fs_metadata && !tmp->bdev) { 1021 if (tmp->in_fs_metadata && !tmp->bdev) {
1029 device = tmp; 1022 device = tmp;
1030 break; 1023 break;
@@ -1280,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1280 struct btrfs_trans_handle *trans; 1273 struct btrfs_trans_handle *trans;
1281 struct btrfs_device *device; 1274 struct btrfs_device *device;
1282 struct block_device *bdev; 1275 struct block_device *bdev;
1283 struct list_head *cur;
1284 struct list_head *devices; 1276 struct list_head *devices;
1285 struct super_block *sb = root->fs_info->sb; 1277 struct super_block *sb = root->fs_info->sb;
1286 u64 total_bytes; 1278 u64 total_bytes;
@@ -1304,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1304 mutex_lock(&root->fs_info->volume_mutex); 1296 mutex_lock(&root->fs_info->volume_mutex);
1305 1297
1306 devices = &root->fs_info->fs_devices->devices; 1298 devices = &root->fs_info->fs_devices->devices;
1307 list_for_each(cur, devices) { 1299 list_for_each_entry(device, devices, dev_list) {
1308 device = list_entry(cur, struct btrfs_device, dev_list);
1309 if (device->bdev == bdev) { 1300 if (device->bdev == bdev) {
1310 ret = -EEXIST; 1301 ret = -EEXIST;
1311 goto error; 1302 goto error;
@@ -1704,7 +1695,6 @@ static u64 div_factor(u64 num, int factor)
1704int btrfs_balance(struct btrfs_root *dev_root) 1695int btrfs_balance(struct btrfs_root *dev_root)
1705{ 1696{
1706 int ret; 1697 int ret;
1707 struct list_head *cur;
1708 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1698 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1709 struct btrfs_device *device; 1699 struct btrfs_device *device;
1710 u64 old_size; 1700 u64 old_size;
@@ -1723,8 +1713,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1723 dev_root = dev_root->fs_info->dev_root; 1713 dev_root = dev_root->fs_info->dev_root;
1724 1714
1725 /* step one make some room on all the devices */ 1715 /* step one make some room on all the devices */
1726 list_for_each(cur, devices) { 1716 list_for_each_entry(device, devices, dev_list) {
1727 device = list_entry(cur, struct btrfs_device, dev_list);
1728 old_size = device->total_bytes; 1717 old_size = device->total_bytes;
1729 size_to_free = div_factor(old_size, 1); 1718 size_to_free = div_factor(old_size, 1);
1730 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1719 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e270894..a9d3bf4d2689 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "btrfs_inode.h" 26#include "btrfs_inode.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
45 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0); 48 strlen(name), 0);
48 if (!di || IS_ERR(di)) { 49 if (!di) {
49 ret = -ENODATA; 50 ret = -ENODATA;
50 goto out; 51 goto out;
52 } else if (IS_ERR(di)) {
53 ret = PTR_ERR(di);
54 goto out;
51 } 55 }
52 56
53 leaf = path->nodes[0]; 57 leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
62 ret = -ERANGE; 66 ret = -ERANGE;
63 goto out; 67 goto out;
64 } 68 }
69
70 /*
71 * The way things are packed into the leaf is like this
72 * |struct btrfs_dir_item|name|data|
73 * where name is the xattr name, so security.foo, and data is the
74 * content of the xattr. data_ptr points to the location in memory
75 * where the data starts in the in memory leaf
76 */
65 data_ptr = (unsigned long)((char *)(di + 1) + 77 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di)); 78 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr, 79 read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
86 if (!path) 98 if (!path)
87 return -ENOMEM; 99 return -ENOMEM;
88 100
89 trans = btrfs_start_transaction(root, 1); 101 trans = btrfs_join_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode); 102 btrfs_set_trans_block_group(trans, inode);
91 103
92 /* first lets see if we already have this xattr */ 104 /* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0) 189 if (ret < 0)
178 goto err; 190 goto err;
179 ret = 0;
180 advance = 0; 191 advance = 0;
181 while (1) { 192 while (1) {
182 leaf = path->nodes[0]; 193 leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
320 return -EOPNOTSUPP; 331 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322} 333}
334
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
336{
337 int err;
338 size_t len;
339 void *value;
340 char *suffix;
341 char *name;
342
343 err = security_inode_init_security(inode, dir, &suffix, &value, &len);
344 if (err) {
345 if (err == -EOPNOTSUPP)
346 return 0;
347 return err;
348 }
349
350 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
351 GFP_NOFS);
352 if (!name) {
353 err = -ENOMEM;
354 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0);
358 kfree(name);
359 }
360
361 kfree(suffix);
362 kfree(value);
363 return err;
364}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68d..c71e9c3cf3f7 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
40
39#endif /* __XATTR__ */ 41#endif /* __XATTR__ */
diff --git a/fs/buffer.c b/fs/buffer.c
index b58208f1640a..665d446b25bc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2688,7 +2688,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
2688 struct buffer_head *bh; 2688 struct buffer_head *bh;
2689 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2689 BUG_ON(fsdata != NULL && page_has_buffers(page));
2690 2690
2691 if (unlikely(copied < len) && !page_has_buffers(page)) 2691 if (unlikely(copied < len) && head)
2692 attach_nobh_buffers(page, head); 2692 attach_nobh_buffers(page, head);
2693 if (page_has_buffers(page)) 2693 if (page_has_buffers(page))
2694 return generic_write_end(file, mapping, pos, len, 2694 return generic_write_end(file, mapping, pos, len,
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 080703a15f44..73ac7ebd1dfc 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ rather than posix (advisory) byte range locks, even though server would
5support posix byte range locks. Fix query of root inode when prefixpath 5support posix byte range locks. Fix query of root inode when prefixpath
6specified and user does not have access to query information about the 6specified and user does not have access to query information about the
7top of the share. Fix problem in 2.6.28 resolving DFS paths to 7top of the share. Fix problem in 2.6.28 resolving DFS paths to
8Samba servers (worked to Windows). 8Samba servers (worked to Windows). Fix rmdir so that pending search
9(readdir) requests do not get invalid results which include the now
10removed directory.
9 11
10Version 1.55 12Version 1.55
11------------ 13------------
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d4839cf0cb2c..7c9809523f42 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
48 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 48 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
49 return -EINVAL; 49 return -EINVAL;
50 50
51 MD5Init(&context); 51 cifs_MD5_init(&context);
52 MD5Update(&context, (char *)&key->data, key->len); 52 cifs_MD5_update(&context, (char *)&key->data, key->len);
53 MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 53 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
54 54
55 MD5Final(signature, &context); 55 cifs_MD5_final(signature, &context);
56 return 0; 56 return 0;
57} 57}
58 58
@@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
96 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 96 if ((iov == NULL) || (signature == NULL) || (key == NULL))
97 return -EINVAL; 97 return -EINVAL;
98 98
99 MD5Init(&context); 99 cifs_MD5_init(&context);
100 MD5Update(&context, (char *)&key->data, key->len); 100 cifs_MD5_update(&context, (char *)&key->data, key->len);
101 for (i = 0; i < n_vec; i++) { 101 for (i = 0; i < n_vec; i++) {
102 if (iov[i].iov_len == 0) 102 if (iov[i].iov_len == 0)
103 continue; 103 continue;
@@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
110 if (i == 0) { 110 if (i == 0) {
111 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 111 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
112 break; /* nothing to sign or corrupt header */ 112 break; /* nothing to sign or corrupt header */
113 MD5Update(&context, iov[0].iov_base+4, 113 cifs_MD5_update(&context, iov[0].iov_base+4,
114 iov[0].iov_len-4); 114 iov[0].iov_len-4);
115 } else 115 } else
116 MD5Update(&context, iov[i].iov_base, iov[i].iov_len); 116 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
117 } 117 }
118 118
119 MD5Final(signature, &context); 119 cifs_MD5_final(signature, &context);
120 120
121 return 0; 121 return 0;
122} 122}
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 06f6779988bf..382ba6298809 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -35,8 +35,8 @@ extern struct smb_hdr *cifs_buf_get(void);
35extern void cifs_buf_release(void *); 35extern void cifs_buf_release(void *);
36extern struct smb_hdr *cifs_small_buf_get(void); 36extern struct smb_hdr *cifs_small_buf_get(void);
37extern void cifs_small_buf_release(void *); 37extern void cifs_small_buf_release(void *);
38extern int smb_send(struct socket *, struct smb_hdr *, 38extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */ , struct sockaddr *, bool); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9ea394ee075..2209be943051 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1354,7 +1354,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1354} 1354}
1355 1355
1356static struct TCP_Server_Info * 1356static struct TCP_Server_Info *
1357cifs_find_tcp_session(struct sockaddr *addr) 1357cifs_find_tcp_session(struct sockaddr_storage *addr)
1358{ 1358{
1359 struct list_head *tmp; 1359 struct list_head *tmp;
1360 struct TCP_Server_Info *server; 1360 struct TCP_Server_Info *server;
@@ -1374,11 +1374,11 @@ cifs_find_tcp_session(struct sockaddr *addr)
1374 if (server->tcpStatus == CifsNew) 1374 if (server->tcpStatus == CifsNew)
1375 continue; 1375 continue;
1376 1376
1377 if (addr->sa_family == AF_INET && 1377 if (addr->ss_family == AF_INET &&
1378 (addr4->sin_addr.s_addr != 1378 (addr4->sin_addr.s_addr !=
1379 server->addr.sockAddr.sin_addr.s_addr)) 1379 server->addr.sockAddr.sin_addr.s_addr))
1380 continue; 1380 continue;
1381 else if (addr->sa_family == AF_INET6 && 1381 else if (addr->ss_family == AF_INET6 &&
1382 memcmp(&server->addr.sockAddr6.sin6_addr, 1382 memcmp(&server->addr.sockAddr6.sin6_addr,
1383 &addr6->sin6_addr, sizeof(addr6->sin6_addr))) 1383 &addr6->sin6_addr, sizeof(addr6->sin6_addr)))
1384 continue; 1384 continue;
@@ -1419,12 +1419,12 @@ static struct TCP_Server_Info *
1419cifs_get_tcp_session(struct smb_vol *volume_info) 1419cifs_get_tcp_session(struct smb_vol *volume_info)
1420{ 1420{
1421 struct TCP_Server_Info *tcp_ses = NULL; 1421 struct TCP_Server_Info *tcp_ses = NULL;
1422 struct sockaddr addr; 1422 struct sockaddr_storage addr;
1423 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; 1423 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
1424 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; 1424 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
1425 int rc; 1425 int rc;
1426 1426
1427 memset(&addr, 0, sizeof(struct sockaddr)); 1427 memset(&addr, 0, sizeof(struct sockaddr_storage));
1428 1428
1429 if (volume_info->UNCip && volume_info->UNC) { 1429 if (volume_info->UNCip && volume_info->UNC) {
1430 rc = cifs_inet_pton(AF_INET, volume_info->UNCip, 1430 rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
@@ -1435,9 +1435,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1435 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip, 1435 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
1436 &sin_server6->sin6_addr.in6_u); 1436 &sin_server6->sin6_addr.in6_u);
1437 if (rc > 0) 1437 if (rc > 0)
1438 addr.sa_family = AF_INET6; 1438 addr.ss_family = AF_INET6;
1439 } else { 1439 } else {
1440 addr.sa_family = AF_INET; 1440 addr.ss_family = AF_INET;
1441 } 1441 }
1442 1442
1443 if (rc <= 0) { 1443 if (rc <= 0) {
@@ -1502,7 +1502,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1502 tcp_ses->tcpStatus = CifsNew; 1502 tcp_ses->tcpStatus = CifsNew;
1503 ++tcp_ses->srv_count; 1503 ++tcp_ses->srv_count;
1504 1504
1505 if (addr.sa_family == AF_INET6) { 1505 if (addr.ss_family == AF_INET6) {
1506 cFYI(1, ("attempting ipv6 connect")); 1506 cFYI(1, ("attempting ipv6 connect"));
1507 /* BB should we allow ipv6 on port 139? */ 1507 /* BB should we allow ipv6 on port 139? */
1508 /* other OS never observed in Wild doing 139 with v6 */ 1508 /* other OS never observed in Wild doing 139 with v6 */
@@ -1802,7 +1802,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1802 * user space buffer 1802 * user space buffer
1803 */ 1803 */
1804 socket->sk->sk_rcvtimeo = 7 * HZ; 1804 socket->sk->sk_rcvtimeo = 7 * HZ;
1805 socket->sk->sk_sndtimeo = 3 * HZ; 1805 socket->sk->sk_sndtimeo = 5 * HZ;
1806 1806
1807 /* make the bufsizes depend on wsize/rsize and max requests */ 1807 /* make the bufsizes depend on wsize/rsize and max requests */
1808 if (server->noautotune) { 1808 if (server->noautotune) {
@@ -1860,9 +1860,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1860 smb_buf = (struct smb_hdr *)ses_init_buf; 1860 smb_buf = (struct smb_hdr *)ses_init_buf;
1861 /* sizeof RFC1002_SESSION_REQUEST with no scope */ 1861 /* sizeof RFC1002_SESSION_REQUEST with no scope */
1862 smb_buf->smb_buf_length = 0x81000044; 1862 smb_buf->smb_buf_length = 0x81000044;
1863 rc = smb_send(socket, smb_buf, 0x44, 1863 rc = smb_send(server, smb_buf, 0x44);
1864 (struct sockaddr *) &server->addr.sockAddr,
1865 server->noblocksnd);
1866 kfree(ses_init_buf); 1864 kfree(ses_init_buf);
1867 msleep(1); /* RFC1001 layer in at least one server 1865 msleep(1); /* RFC1001 layer in at least one server
1868 requires very short break before negprot 1866 requires very short break before negprot
@@ -1955,7 +1953,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1955 * user space buffer 1953 * user space buffer
1956 */ 1954 */
1957 socket->sk->sk_rcvtimeo = 7 * HZ; 1955 socket->sk->sk_rcvtimeo = 7 * HZ;
1958 socket->sk->sk_sndtimeo = 3 * HZ; 1956 socket->sk->sk_sndtimeo = 5 * HZ;
1959 server->ssocket = socket; 1957 server->ssocket = socket;
1960 1958
1961 return rc; 1959 return rc;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 838d9c720a5c..964aad03c5ad 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,6 +129,17 @@ cifs_bp_rename_retry:
129 return full_path; 129 return full_path;
130} 130}
131 131
132static void setup_cifs_dentry(struct cifsTconInfo *tcon,
133 struct dentry *direntry,
134 struct inode *newinode)
135{
136 if (tcon->nocase)
137 direntry->d_op = &cifs_ci_dentry_ops;
138 else
139 direntry->d_op = &cifs_dentry_ops;
140 d_instantiate(direntry, newinode);
141}
142
132/* Inode operations in similar order to how they appear in Linux file fs.h */ 143/* Inode operations in similar order to how they appear in Linux file fs.h */
133 144
134int 145int
@@ -139,14 +150,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
139 int xid; 150 int xid;
140 int create_options = CREATE_NOT_DIR; 151 int create_options = CREATE_NOT_DIR;
141 int oplock = 0; 152 int oplock = 0;
153 /* BB below access is too much for the mknod to request */
142 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 154 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
143 __u16 fileHandle; 155 __u16 fileHandle;
144 struct cifs_sb_info *cifs_sb; 156 struct cifs_sb_info *cifs_sb;
145 struct cifsTconInfo *pTcon; 157 struct cifsTconInfo *tcon;
146 char *full_path = NULL; 158 char *full_path = NULL;
147 FILE_ALL_INFO *buf = NULL; 159 FILE_ALL_INFO *buf = NULL;
148 struct inode *newinode = NULL; 160 struct inode *newinode = NULL;
149 struct cifsFileInfo *pCifsFile = NULL;
150 struct cifsInodeInfo *pCifsInode; 161 struct cifsInodeInfo *pCifsInode;
151 int disposition = FILE_OVERWRITE_IF; 162 int disposition = FILE_OVERWRITE_IF;
152 bool write_only = false; 163 bool write_only = false;
@@ -154,7 +165,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
154 xid = GetXid(); 165 xid = GetXid();
155 166
156 cifs_sb = CIFS_SB(inode->i_sb); 167 cifs_sb = CIFS_SB(inode->i_sb);
157 pTcon = cifs_sb->tcon; 168 tcon = cifs_sb->tcon;
158 169
159 full_path = build_path_from_dentry(direntry); 170 full_path = build_path_from_dentry(direntry);
160 if (full_path == NULL) { 171 if (full_path == NULL) {
@@ -162,6 +173,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
162 return -ENOMEM; 173 return -ENOMEM;
163 } 174 }
164 175
176 mode &= ~current->fs->umask;
177
165 if (nd && (nd->flags & LOOKUP_OPEN)) { 178 if (nd && (nd->flags & LOOKUP_OPEN)) {
166 int oflags = nd->intent.open.flags; 179 int oflags = nd->intent.open.flags;
167 180
@@ -196,17 +209,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
196 return -ENOMEM; 209 return -ENOMEM;
197 } 210 }
198 211
199 mode &= ~current->fs->umask;
200
201 /* 212 /*
202 * if we're not using unix extensions, see if we need to set 213 * if we're not using unix extensions, see if we need to set
203 * ATTR_READONLY on the create call 214 * ATTR_READONLY on the create call
204 */ 215 */
205 if (!pTcon->unix_ext && (mode & S_IWUGO) == 0) 216 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
206 create_options |= CREATE_OPTION_READONLY; 217 create_options |= CREATE_OPTION_READONLY;
207 218
208 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 219 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
209 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, 220 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
210 desiredAccess, create_options, 221 desiredAccess, create_options,
211 &fileHandle, &oplock, buf, cifs_sb->local_nls, 222 &fileHandle, &oplock, buf, cifs_sb->local_nls,
212 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 223 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -215,7 +226,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
215 226
216 if (rc == -EIO) { 227 if (rc == -EIO) {
217 /* old server, retry the open legacy style */ 228 /* old server, retry the open legacy style */
218 rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, 229 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
219 desiredAccess, create_options, 230 desiredAccess, create_options,
220 &fileHandle, &oplock, buf, cifs_sb->local_nls, 231 &fileHandle, &oplock, buf, cifs_sb->local_nls,
221 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 232 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -225,7 +236,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
225 } else { 236 } else {
226 /* If Open reported that we actually created a file 237 /* If Open reported that we actually created a file
227 then we now have to set the mode if possible */ 238 then we now have to set the mode if possible */
228 if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { 239 if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
229 struct cifs_unix_set_info_args args = { 240 struct cifs_unix_set_info_args args = {
230 .mode = mode, 241 .mode = mode,
231 .ctime = NO_CHANGE_64, 242 .ctime = NO_CHANGE_64,
@@ -244,20 +255,20 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
244 args.uid = NO_CHANGE_64; 255 args.uid = NO_CHANGE_64;
245 args.gid = NO_CHANGE_64; 256 args.gid = NO_CHANGE_64;
246 } 257 }
247 CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, 258 CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
248 cifs_sb->local_nls, 259 cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & 260 cifs_sb->mnt_cifs_flags &
250 CIFS_MOUNT_MAP_SPECIAL_CHR); 261 CIFS_MOUNT_MAP_SPECIAL_CHR);
251 } else { 262 } else {
252 /* BB implement mode setting via Windows security 263 /* BB implement mode setting via Windows security
253 descriptors e.g. */ 264 descriptors e.g. */
254 /* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/ 265 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
255 266
256 /* Could set r/o dos attribute if mode & 0222 == 0 */ 267 /* Could set r/o dos attribute if mode & 0222 == 0 */
257 } 268 }
258 269
259 /* server might mask mode so we have to query for it */ 270 /* server might mask mode so we have to query for it */
260 if (pTcon->unix_ext) 271 if (tcon->unix_ext)
261 rc = cifs_get_inode_info_unix(&newinode, full_path, 272 rc = cifs_get_inode_info_unix(&newinode, full_path,
262 inode->i_sb, xid); 273 inode->i_sb, xid);
263 else { 274 else {
@@ -283,22 +294,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
283 } 294 }
284 295
285 if (rc != 0) { 296 if (rc != 0) {
286 cFYI(1, 297 cFYI(1, ("Create worked, get_inode_info failed rc = %d",
287 ("Create worked but get_inode_info failed rc = %d", 298 rc));
288 rc)); 299 } else
289 } else { 300 setup_cifs_dentry(tcon, direntry, newinode);
290 if (pTcon->nocase) 301
291 direntry->d_op = &cifs_ci_dentry_ops;
292 else
293 direntry->d_op = &cifs_dentry_ops;
294 d_instantiate(direntry, newinode);
295 }
296 if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || 302 if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
297 (!(nd->flags & LOOKUP_OPEN))) { 303 (!(nd->flags & LOOKUP_OPEN))) {
298 /* mknod case - do not leave file open */ 304 /* mknod case - do not leave file open */
299 CIFSSMBClose(xid, pTcon, fileHandle); 305 CIFSSMBClose(xid, tcon, fileHandle);
300 } else if (newinode) { 306 } else if (newinode) {
301 pCifsFile = 307 struct cifsFileInfo *pCifsFile =
302 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 308 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
303 309
304 if (pCifsFile == NULL) 310 if (pCifsFile == NULL)
@@ -316,7 +322,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
316 /* set the following in open now 322 /* set the following in open now
317 pCifsFile->pfile = file; */ 323 pCifsFile->pfile = file; */
318 write_lock(&GlobalSMBSeslock); 324 write_lock(&GlobalSMBSeslock);
319 list_add(&pCifsFile->tlist, &pTcon->openFileList); 325 list_add(&pCifsFile->tlist, &tcon->openFileList);
320 pCifsInode = CIFS_I(newinode); 326 pCifsInode = CIFS_I(newinode);
321 if (pCifsInode) { 327 if (pCifsInode) {
322 /* if readable file instance put first in list*/ 328 /* if readable file instance put first in list*/
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5ab9896fdcb2..bcf7b5184664 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1285,6 +1285,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1285 cifsInode = CIFS_I(direntry->d_inode); 1285 cifsInode = CIFS_I(direntry->d_inode);
1286 cifsInode->time = 0; /* force revalidate to go get info when 1286 cifsInode->time = 0; /* force revalidate to go get info when
1287 needed */ 1287 needed */
1288
1289 cifsInode = CIFS_I(inode);
1290 cifsInode->time = 0; /* force revalidate to get parent dir info
1291 since cached search results now invalid */
1292
1288 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = 1293 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
1289 current_fs_time(inode->i_sb); 1294 current_fs_time(inode->i_sb);
1290 1295
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 462bbfefd4b6..98b66a54c319 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -10,8 +10,8 @@
10 * with every copy. 10 * with every copy.
11 * 11 *
12 * To compute the message digest of a chunk of bytes, declare an 12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to MD5Init, call MD5Update as 13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call MD5Final, which 14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest. 15 * will fill a supplied 16-byte array with the digest.
16 */ 16 */
17 17
@@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs)
45 * initialization constants. 45 * initialization constants.
46 */ 46 */
47void 47void
48MD5Init(struct MD5Context *ctx) 48cifs_MD5_init(struct MD5Context *ctx)
49{ 49{
50 ctx->buf[0] = 0x67452301; 50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89; 51 ctx->buf[1] = 0xefcdab89;
@@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx)
61 * of bytes. 61 * of bytes.
62 */ 62 */
63void 63void
64MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) 64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{ 65{
66 register __u32 t; 66 register __u32 t;
67 67
@@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
110 * 1 0* (64-bit count of bits processed, MSB-first) 110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */ 111 */
112void 112void
113MD5Final(unsigned char digest[16], struct MD5Context *ctx) 113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{ 114{
115 unsigned int count; 115 unsigned int count;
116 unsigned char *p; 116 unsigned char *p;
@@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
165 165
166/* 166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to 167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. MD5Update blocks 168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine. 169 * the data and converts bytes into longwords for this routine.
170 */ 170 */
171static void 171static void
@@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
267 unsigned char tk[16]; 267 unsigned char tk[16];
268 struct MD5Context tctx; 268 struct MD5Context tctx;
269 269
270 MD5Init(&tctx); 270 cifs_MD5_init(&tctx);
271 MD5Update(&tctx, key, key_len); 271 cifs_MD5_update(&tctx, key, key_len);
272 MD5Final(tk, &tctx); 272 cifs_MD5_final(tk, &tctx);
273 273
274 key = tk; 274 key = tk;
275 key_len = 16; 275 key_len = 16;
@@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
287 ctx->k_opad[i] ^= 0x5c; 287 ctx->k_opad[i] ^= 0x5c;
288 } 288 }
289 289
290 MD5Init(&ctx->ctx); 290 cifs_MD5_init(&ctx->ctx);
291 MD5Update(&ctx->ctx, ctx->k_ipad, 64); 291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292} 292}
293#endif 293#endif
294 294
@@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
317 ctx->k_opad[i] ^= 0x5c; 317 ctx->k_opad[i] ^= 0x5c;
318 } 318 }
319 319
320 MD5Init(&ctx->ctx); 320 cifs_MD5_init(&ctx->ctx);
321 MD5Update(&ctx->ctx, ctx->k_ipad, 64); 321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322} 322}
323 323
324/*********************************************************************** 324/***********************************************************************
@@ -328,7 +328,7 @@ void
328hmac_md5_update(const unsigned char *text, int text_len, 328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx) 329 struct HMACMD5Context *ctx)
330{ 330{
331 MD5Update(&ctx->ctx, text, text_len); /* then text of datagram */ 331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332} 332}
333 333
334/*********************************************************************** 334/***********************************************************************
@@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{ 339{
340 struct MD5Context ctx_o; 340 struct MD5Context ctx_o;
341 341
342 MD5Final(digest, &ctx->ctx); 342 cifs_MD5_final(digest, &ctx->ctx);
343 343
344 MD5Init(&ctx_o); 344 cifs_MD5_init(&ctx_o);
345 MD5Update(&ctx_o, ctx->k_opad, 64); 345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 MD5Update(&ctx_o, digest, 16); 346 cifs_MD5_update(&ctx_o, digest, 16);
347 MD5Final(digest, &ctx_o); 347 cifs_MD5_final(digest, &ctx_o);
348} 348}
349 349
350/*********************************************************** 350/***********************************************************
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index f7d4f4197bac..6fba8cb402fd 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -20,10 +20,10 @@ struct HMACMD5Context {
20}; 20};
21#endif /* _HMAC_MD5_H */ 21#endif /* _HMAC_MD5_H */
22 22
23void MD5Init(struct MD5Context *context); 23void cifs_MD5_init(struct MD5Context *context);
24void MD5Update(struct MD5Context *context, unsigned char const *buf, 24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len); 25 unsigned len);
26void MD5Final(unsigned char digest[16], struct MD5Context *context); 26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27 27
28/* The following definitions come from lib/hmacmd5.c */ 28/* The following definitions come from lib/hmacmd5.c */
29 29
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ebe6599ed3a..0ad3e2d116a6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
154 spin_unlock(&GlobalMid_Lock); 154 spin_unlock(&GlobalMid_Lock);
155} 155}
156 156
157int
158smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
159 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
160{
161 int rc = 0;
162 int i = 0;
163 struct msghdr smb_msg;
164 struct kvec iov;
165 unsigned len = smb_buf_length + 4;
166
167 if (ssocket == NULL)
168 return -ENOTSOCK; /* BB eventually add reconnect code here */
169 iov.iov_base = smb_buffer;
170 iov.iov_len = len;
171
172 smb_msg.msg_name = sin;
173 smb_msg.msg_namelen = sizeof(struct sockaddr);
174 smb_msg.msg_control = NULL;
175 smb_msg.msg_controllen = 0;
176 if (noblocksnd)
177 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
178 else
179 smb_msg.msg_flags = MSG_NOSIGNAL;
180
181 /* smb header is converted in header_assemble. bcc and rest of SMB word
182 area, and byte area if necessary, is converted to littleendian in
183 cifssmb.c and RFC1001 len is converted to bigendian in smb_send
184 Flags2 is converted in SendReceive */
185
186 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
187 cFYI(1, ("Sending smb of length %d", smb_buf_length));
188 dump_smb(smb_buffer, len);
189
190 while (len > 0) {
191 rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len);
192 if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
193 i++;
194 /* smaller timeout here than send2 since smaller size */
195 /* Although it may not be required, this also is smaller
196 oplock break time */
197 if (i > 12) {
198 cERROR(1,
199 ("sends on sock %p stuck for 7 seconds",
200 ssocket));
201 rc = -EAGAIN;
202 break;
203 }
204 msleep(1 << i);
205 continue;
206 }
207 if (rc < 0)
208 break;
209 else
210 i = 0; /* reset i after each successful send */
211 iov.iov_base += rc;
212 iov.iov_len -= rc;
213 len -= rc;
214 }
215
216 if (rc < 0) {
217 cERROR(1, ("Error %d sending data on socket to server", rc));
218 } else {
219 rc = 0;
220 }
221
222 /* Don't want to modify the buffer as a
223 side effect of this call. */
224 smb_buffer->smb_buf_length = smb_buf_length;
225
226 return rc;
227}
228
229static int 157static int
230smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, 158smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
231 struct sockaddr *sin, bool noblocksnd)
232{ 159{
233 int rc = 0; 160 int rc = 0;
234 int i = 0; 161 int i = 0;
@@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
243 if (ssocket == NULL) 170 if (ssocket == NULL)
244 return -ENOTSOCK; /* BB eventually add reconnect code here */ 171 return -ENOTSOCK; /* BB eventually add reconnect code here */
245 172
246 smb_msg.msg_name = sin; 173 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
247 smb_msg.msg_namelen = sizeof(struct sockaddr); 174 smb_msg.msg_namelen = sizeof(struct sockaddr);
248 smb_msg.msg_control = NULL; 175 smb_msg.msg_control = NULL;
249 smb_msg.msg_controllen = 0; 176 smb_msg.msg_controllen = 0;
250 if (noblocksnd) 177 if (server->noblocksnd)
251 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; 178 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
252 else 179 else
253 smb_msg.msg_flags = MSG_NOSIGNAL; 180 smb_msg.msg_flags = MSG_NOSIGNAL;
@@ -272,7 +199,25 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
272 n_vec - first_vec, total_len); 199 n_vec - first_vec, total_len);
273 if ((rc == -ENOSPC) || (rc == -EAGAIN)) { 200 if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
274 i++; 201 i++;
275 if (i >= 14) { 202 /* if blocking send we try 3 times, since each can block
203 for 5 seconds. For nonblocking we have to try more
204 but wait increasing amounts of time allowing time for
205 socket to clear. The overall time we wait in either
206 case to send on the socket is about 15 seconds.
207 Similarly we wait for 15 seconds for
208 a response from the server in SendReceive[2]
209 for the server to send a response back for
210 most types of requests (except SMB Write
211 past end of file which can be slow, and
212 blocking lock operations). NFS waits slightly longer
213 than CIFS, but this can make it take longer for
214 nonresponsive servers to be detected and 15 seconds
215 is more than enough time for modern networks to
216 send a packet. In most cases if we fail to send
217 after the retries we will kill the socket and
218 reconnect which may clear the network problem.
219 */
220 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
276 cERROR(1, 221 cERROR(1,
277 ("sends on sock %p stuck for 15 seconds", 222 ("sends on sock %p stuck for 15 seconds",
278 ssocket)); 223 ssocket));
@@ -339,6 +284,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
339 return rc; 284 return rc;
340} 285}
341 286
287int
288smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
289 unsigned int smb_buf_length)
290{
291 struct kvec iov;
292
293 iov.iov_base = smb_buffer;
294 iov.iov_len = smb_buf_length + 4;
295
296 return smb_sendv(server, &iov, 1);
297}
298
342static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 299static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
343{ 300{
344 if (long_op == CIFS_ASYNC_OP) { 301 if (long_op == CIFS_ASYNC_OP) {
@@ -540,9 +497,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
540#ifdef CONFIG_CIFS_STATS2 497#ifdef CONFIG_CIFS_STATS2
541 atomic_inc(&ses->server->inSend); 498 atomic_inc(&ses->server->inSend);
542#endif 499#endif
543 rc = smb_send2(ses->server, iov, n_vec, 500 rc = smb_sendv(ses->server, iov, n_vec);
544 (struct sockaddr *) &(ses->server->addr.sockAddr),
545 ses->server->noblocksnd);
546#ifdef CONFIG_CIFS_STATS2 501#ifdef CONFIG_CIFS_STATS2
547 atomic_dec(&ses->server->inSend); 502 atomic_dec(&ses->server->inSend);
548 midQ->when_sent = jiffies; 503 midQ->when_sent = jiffies;
@@ -736,9 +691,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
736#ifdef CONFIG_CIFS_STATS2 691#ifdef CONFIG_CIFS_STATS2
737 atomic_inc(&ses->server->inSend); 692 atomic_inc(&ses->server->inSend);
738#endif 693#endif
739 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 694 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
740 (struct sockaddr *) &(ses->server->addr.sockAddr),
741 ses->server->noblocksnd);
742#ifdef CONFIG_CIFS_STATS2 695#ifdef CONFIG_CIFS_STATS2
743 atomic_dec(&ses->server->inSend); 696 atomic_dec(&ses->server->inSend);
744 midQ->when_sent = jiffies; 697 midQ->when_sent = jiffies;
@@ -879,9 +832,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
879 mutex_unlock(&ses->server->srv_mutex); 832 mutex_unlock(&ses->server->srv_mutex);
880 return rc; 833 return rc;
881 } 834 }
882 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 835 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
883 (struct sockaddr *) &(ses->server->addr.sockAddr),
884 ses->server->noblocksnd);
885 mutex_unlock(&ses->server->srv_mutex); 836 mutex_unlock(&ses->server->srv_mutex);
886 return rc; 837 return rc;
887} 838}
@@ -973,9 +924,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
973#ifdef CONFIG_CIFS_STATS2 924#ifdef CONFIG_CIFS_STATS2
974 atomic_inc(&ses->server->inSend); 925 atomic_inc(&ses->server->inSend);
975#endif 926#endif
976 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 927 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
977 (struct sockaddr *) &(ses->server->addr.sockAddr),
978 ses->server->noblocksnd);
979#ifdef CONFIG_CIFS_STATS2 928#ifdef CONFIG_CIFS_STATS2
980 atomic_dec(&ses->server->inSend); 929 atomic_dec(&ses->server->inSend);
981 midQ->when_sent = jiffies; 930 midQ->when_sent = jiffies;
diff --git a/fs/compat.c b/fs/compat.c
index 65a070e705ab..d0145ca27572 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename,
1407 bprm->cred = prepare_exec_creds(); 1407 bprm->cred = prepare_exec_creds();
1408 if (!bprm->cred) 1408 if (!bprm->cred)
1409 goto out_unlock; 1409 goto out_unlock;
1410 check_unsafe_exec(bprm); 1410 check_unsafe_exec(bprm, current->files);
1411 1411
1412 file = open_exec(filename); 1412 file = open_exec(filename);
1413 retval = PTR_ERR(file); 1413 retval = PTR_ERR(file);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5235c67e7594..9c6d815dd191 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
538 * cannot be fixed without breaking all existing apps. 538 * cannot be fixed without breaking all existing apps.
539 */ 539 */
540 case TUNSETIFF: 540 case TUNSETIFF:
541 case TUNGETIFF:
541 case SIOCGIFFLAGS: 542 case SIOCGIFFLAGS:
542 case SIOCGIFMETRIC: 543 case SIOCGIFMETRIC:
543 case SIOCGIFMTU: 544 case SIOCGIFMTU:
@@ -784,7 +785,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
784 785
785 if (copy_in_user(&sgio->status, &sgio32->status, 786 if (copy_in_user(&sgio->status, &sgio32->status,
786 (4 * sizeof(unsigned char)) + 787 (4 * sizeof(unsigned char)) +
787 (2 * sizeof(unsigned (short))) + 788 (2 * sizeof(unsigned short)) +
788 (3 * sizeof(int)))) 789 (3 * sizeof(int))))
789 return -EFAULT; 790 return -EFAULT;
790 791
@@ -1982,6 +1983,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM)
1982COMPATIBLE_IOCTL(TUNSETDEBUG) 1983COMPATIBLE_IOCTL(TUNSETDEBUG)
1983COMPATIBLE_IOCTL(TUNSETPERSIST) 1984COMPATIBLE_IOCTL(TUNSETPERSIST)
1984COMPATIBLE_IOCTL(TUNSETOWNER) 1985COMPATIBLE_IOCTL(TUNSETOWNER)
1986COMPATIBLE_IOCTL(TUNSETLINK)
1987COMPATIBLE_IOCTL(TUNSETGROUP)
1988COMPATIBLE_IOCTL(TUNGETFEATURES)
1989COMPATIBLE_IOCTL(TUNSETOFFLOAD)
1990COMPATIBLE_IOCTL(TUNSETTXFILTER)
1985/* Big V */ 1991/* Big V */
1986COMPATIBLE_IOCTL(VT_SETMODE) 1992COMPATIBLE_IOCTL(VT_SETMODE)
1987COMPATIBLE_IOCTL(VT_GETMODE) 1993COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2573,6 +2579,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
2573HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc) 2579HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
2574HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc) 2580HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
2575HANDLE_IOCTL(TUNSETIFF, dev_ifsioc) 2581HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
2582HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
2576HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl) 2583HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
2577HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl) 2584HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
2578HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl) 2585HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c01e043670e2..f6caeb1d1106 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1716,7 +1716,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1716{ 1716{
1717 int rc = 0; 1717 int rc = 0;
1718 1718
1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); 1719 (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
1720 if (!(*copied_name)) { 1720 if (!(*copied_name)) {
1721 rc = -ENOMEM; 1721 rc = -ENOMEM;
1722 goto out; 1722 goto out;
@@ -1726,7 +1726,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1726 * in printing out the 1726 * in printing out the
1727 * string in debug 1727 * string in debug
1728 * messages */ 1728 * messages */
1729 (*copied_name_size) = (name_size + 1); 1729 (*copied_name_size) = name_size;
1730out: 1730out:
1731 return rc; 1731 return rc;
1732} 1732}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ba2f9ec71192..011b9b8c90c6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
234/* 234/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 235 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 236 */
237/* Maximum number of epoll devices, per user */
238static int max_user_instances __read_mostly;
239/* Maximum number of epoll watched descriptors, per user */ 237/* Maximum number of epoll watched descriptors, per user */
240static int max_user_watches __read_mostly; 238static int max_user_watches __read_mostly;
241 239
@@ -261,14 +259,6 @@ static int zero;
261 259
262ctl_table epoll_table[] = { 260ctl_table epoll_table[] = {
263 { 261 {
264 .procname = "max_user_instances",
265 .data = &max_user_instances,
266 .maxlen = sizeof(int),
267 .mode = 0644,
268 .proc_handler = &proc_dointvec_minmax,
269 .extra1 = &zero,
270 },
271 {
272 .procname = "max_user_watches", 262 .procname = "max_user_watches",
273 .data = &max_user_watches, 263 .data = &max_user_watches,
274 .maxlen = sizeof(int), 264 .maxlen = sizeof(int),
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
491 481
492 mutex_unlock(&epmutex); 482 mutex_unlock(&epmutex);
493 mutex_destroy(&ep->mtx); 483 mutex_destroy(&ep->mtx);
494 atomic_dec(&ep->user->epoll_devs);
495 free_uid(ep->user); 484 free_uid(ep->user);
496 kfree(ep); 485 kfree(ep);
497} 486}
@@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep)
581 struct eventpoll *ep; 570 struct eventpoll *ep;
582 571
583 user = get_current_user(); 572 user = get_current_user();
584 error = -EMFILE;
585 if (unlikely(atomic_read(&user->epoll_devs) >=
586 max_user_instances))
587 goto free_uid;
588 error = -ENOMEM; 573 error = -ENOMEM;
589 ep = kzalloc(sizeof(*ep), GFP_KERNEL); 574 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
590 if (unlikely(!ep)) 575 if (unlikely(!ep))
@@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1141 flags & O_CLOEXEC); 1126 flags & O_CLOEXEC);
1142 if (fd < 0) 1127 if (fd < 0)
1143 ep_free(ep); 1128 ep_free(ep);
1144 atomic_inc(&ep->user->epoll_devs);
1145 1129
1146error_return: 1130error_return:
1147 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1131 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void)
1366 struct sysinfo si; 1350 struct sysinfo si;
1367 1351
1368 si_meminfo(&si); 1352 si_meminfo(&si);
1369 max_user_instances = 128; 1353 /*
1370 max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / 1354 * Allows top 4% of lomem to be allocated for epoll watches (per user).
1355 */
1356 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1371 EP_ITEM_COST; 1357 EP_ITEM_COST;
1372 1358
1373 /* Initialize the structure used to perform safe poll wait head wake ups */ 1359 /* Initialize the structure used to perform safe poll wait head wake ups */
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a01f1b4..929b58004b7e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds);
1049 * - the caller must hold current->cred_exec_mutex to protect against 1049 * - the caller must hold current->cred_exec_mutex to protect against
1050 * PTRACE_ATTACH 1050 * PTRACE_ATTACH
1051 */ 1051 */
1052void check_unsafe_exec(struct linux_binprm *bprm) 1052void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
1053{ 1053{
1054 struct task_struct *p = current; 1054 struct task_struct *p = current, *t;
1055 unsigned long flags;
1056 unsigned n_fs, n_files, n_sighand;
1055 1057
1056 bprm->unsafe = tracehook_unsafe_exec(p); 1058 bprm->unsafe = tracehook_unsafe_exec(p);
1057 1059
1058 if (atomic_read(&p->fs->count) > 1 || 1060 n_fs = 1;
1059 atomic_read(&p->files->count) > 1 || 1061 n_files = 1;
1060 atomic_read(&p->sighand->count) > 1) 1062 n_sighand = 1;
1063 lock_task_sighand(p, &flags);
1064 for (t = next_thread(p); t != p; t = next_thread(t)) {
1065 if (t->fs == p->fs)
1066 n_fs++;
1067 if (t->files == files)
1068 n_files++;
1069 n_sighand++;
1070 }
1071
1072 if (atomic_read(&p->fs->count) > n_fs ||
1073 atomic_read(&p->files->count) > n_files ||
1074 atomic_read(&p->sighand->count) > n_sighand)
1061 bprm->unsafe |= LSM_UNSAFE_SHARE; 1075 bprm->unsafe |= LSM_UNSAFE_SHARE;
1076
1077 unlock_task_sighand(p, &flags);
1062} 1078}
1063 1079
1064/* 1080/*
@@ -1273,7 +1289,7 @@ int do_execve(char * filename,
1273 bprm->cred = prepare_exec_creds(); 1289 bprm->cred = prepare_exec_creds();
1274 if (!bprm->cred) 1290 if (!bprm->cred)
1275 goto out_unlock; 1291 goto out_unlock;
1276 check_unsafe_exec(bprm); 1292 check_unsafe_exec(bprm, displaced);
1277 1293
1278 file = open_exec(filename); 1294 file = open_exec(filename);
1279 retval = PTR_ERR(file); 1295 retval = PTR_ERR(file);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6d..7c6e3606f0ec 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1185 es = sbi->s_es; 1185 es = sbi->s_es;
1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1187 (old_mount_opt & EXT2_MOUNT_XIP)) && 1187 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1188 invalidate_inodes(sb)) 1188 invalidate_inodes(sb)) {
1189 ext2_warning(sb, __func__, "busy inodes while remounting "\ 1189 ext2_warning(sb, __func__, "refusing change of xip flag "
1190 "xip remain in cache (no functional problem)"); 1190 "with busy inodes while remounting");
1191 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1192 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1193 }
1191 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1194 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1192 return 0; 1195 return 0;
1193 if (*flags & MS_RDONLY) { 1196 if (*flags & MS_RDONLY) {
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 69a3d19ca9fd..4db4ffa1edad 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1358,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1358 struct fake_dirent *fde; 1358 struct fake_dirent *fde;
1359 1359
1360 blocksize = dir->i_sb->s_blocksize; 1360 blocksize = dir->i_sb->s_blocksize;
1361 dxtrace(printk("Creating index\n")); 1361 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1362 retval = ext3_journal_get_write_access(handle, bh); 1362 retval = ext3_journal_get_write_access(handle, bh);
1363 if (retval) { 1363 if (retval) {
1364 ext3_std_error(dir->i_sb, retval); 1364 ext3_std_error(dir->i_sb, retval);
@@ -1367,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1367 } 1367 }
1368 root = (struct dx_root *) bh->b_data; 1368 root = (struct dx_root *) bh->b_data;
1369 1369
1370 /* The 0th block becomes the root, move the dirents out */
1371 fde = &root->dotdot;
1372 de = (struct ext3_dir_entry_2 *)((char *)fde +
1373 ext3_rec_len_from_disk(fde->rec_len));
1374 if ((char *) de >= (((char *) root) + blocksize)) {
1375 ext3_error(dir->i_sb, __func__,
1376 "invalid rec_len for '..' in inode %lu",
1377 dir->i_ino);
1378 brelse(bh);
1379 return -EIO;
1380 }
1381 len = ((char *) root) + blocksize - (char *) de;
1382
1370 bh2 = ext3_append (handle, dir, &block, &retval); 1383 bh2 = ext3_append (handle, dir, &block, &retval);
1371 if (!(bh2)) { 1384 if (!(bh2)) {
1372 brelse(bh); 1385 brelse(bh);
@@ -1375,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1375 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; 1388 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1376 data1 = bh2->b_data; 1389 data1 = bh2->b_data;
1377 1390
1378 /* The 0th block becomes the root, move the dirents out */
1379 fde = &root->dotdot;
1380 de = (struct ext3_dir_entry_2 *)((char *)fde +
1381 ext3_rec_len_from_disk(fde->rec_len));
1382 len = ((char *) root) + blocksize - (char *) de;
1383 memcpy (data1, de, len); 1391 memcpy (data1, de, len);
1384 de = (struct ext3_dir_entry_2 *) data1; 1392 de = (struct ext3_dir_entry_2 *) data1;
1385 top = data1 + len; 1393 top = data1 + len;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e08a3c..4a970411a458 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb)
2428 2428
2429static int ext3_sync_fs(struct super_block *sb, int wait) 2429static int ext3_sync_fs(struct super_block *sb, int wait)
2430{ 2430{
2431 sb->s_dirt = 0; 2431 tid_t target;
2432 if (wait)
2433 ext3_force_commit(sb);
2434 else
2435 journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
2436 2432
2433 sb->s_dirt = 0;
2434 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2435 if (wait)
2436 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2437 }
2437 return 0; 2438 return 0;
2438} 2439}
2439 2440
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6bba06b09dd1..9a50b8052dcf 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -684,15 +684,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
684 gdp = ext4_get_group_desc(sb, i, NULL); 684 gdp = ext4_get_group_desc(sb, i, NULL);
685 if (!gdp) 685 if (!gdp)
686 continue; 686 continue;
687 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 687 desc_count += ext4_free_blks_count(sb, gdp);
688 brelse(bitmap_bh); 688 brelse(bitmap_bh);
689 bitmap_bh = ext4_read_block_bitmap(sb, i); 689 bitmap_bh = ext4_read_block_bitmap(sb, i);
690 if (bitmap_bh == NULL) 690 if (bitmap_bh == NULL)
691 continue; 691 continue;
692 692
693 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 693 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
694 printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", 694 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
695 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 695 i, ext4_free_blks_count(sb, gdp), x);
696 bitmap_count += x; 696 bitmap_count += x;
697 } 697 }
698 brelse(bitmap_bh); 698 brelse(bitmap_bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c668e4377d76..aafc9eba1c25 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1206,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
1206 1206
1207static inline loff_t ext4_isize(struct ext4_inode *raw_inode) 1207static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
1208{ 1208{
1209 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | 1209 if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1210 le32_to_cpu(raw_inode->i_size_lo); 1210 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
1211 le32_to_cpu(raw_inode->i_size_lo);
1212 else
1213 return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1211} 1214}
1212 1215
1213static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) 1216static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54bf0623a9ae..e2eab196875f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3048,7 +3048,7 @@ retry:
3048 WARN_ON(ret <= 0); 3048 WARN_ON(ret <= 0);
3049 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3049 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3050 "returned error inode#%lu, block=%u, " 3050 "returned error inode#%lu, block=%u, "
3051 "max_blocks=%lu", __func__, 3051 "max_blocks=%u", __func__,
3052 inode->i_ino, block, max_blocks); 3052 inode->i_ino, block, max_blocks);
3053#endif 3053#endif
3054 ext4_mark_inode_dirty(handle, inode); 3054 ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a6444cee0c7e..03ba20be1329 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -360,9 +360,9 @@ static int ext4_block_to_path(struct inode *inode,
360 final = ptrs; 360 final = ptrs;
361 } else { 361 } else {
362 ext4_warning(inode->i_sb, "ext4_block_to_path", 362 ext4_warning(inode->i_sb, "ext4_block_to_path",
363 "block %lu > max", 363 "block %lu > max in inode %lu",
364 i_block + direct_blocks + 364 i_block + direct_blocks +
365 indirect_blocks + double_blocks); 365 indirect_blocks + double_blocks, inode->i_ino);
366 } 366 }
367 if (boundary) 367 if (boundary)
368 *boundary = final - 1 - (i_block & (ptrs - 1)); 368 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -2821,9 +2821,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2821 filemap_write_and_wait(mapping); 2821 filemap_write_and_wait(mapping);
2822 } 2822 }
2823 2823
2824 BUG_ON(!EXT4_JOURNAL(inode) &&
2825 EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
2826
2827 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2824 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2828 /* 2825 /*
2829 * This is a REALLY heavyweight approach, but the use of 2826 * This is a REALLY heavyweight approach, but the use of
@@ -3622,7 +3619,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3622 * block pointed to itself, it would have been detached when 3619 * block pointed to itself, it would have been detached when
3623 * the block was cleared. Check for this instead of OOPSing. 3620 * the block was cleared. Check for this instead of OOPSing.
3624 */ 3621 */
3625 if (bh2jh(this_bh)) 3622 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
3626 ext4_handle_dirty_metadata(handle, inode, this_bh); 3623 ext4_handle_dirty_metadata(handle, inode, this_bh);
3627 else 3624 else
3628 ext4_error(inode->i_sb, __func__, 3625 ext4_error(inode->i_sb, __func__,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 918aec0c8a11..deba54f6cbed 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3025,7 +3025,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3025 goto out_err; 3025 goto out_err;
3026 3026
3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
3028 gdp->bg_free_blocks_count); 3028 ext4_free_blks_count(sb, gdp));
3029 3029
3030 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
3031 if (err) 3031 if (err)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fec0b4c2f5f1..ba702bd7910d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1368,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1368 struct fake_dirent *fde; 1368 struct fake_dirent *fde;
1369 1369
1370 blocksize = dir->i_sb->s_blocksize; 1370 blocksize = dir->i_sb->s_blocksize;
1371 dxtrace(printk(KERN_DEBUG "Creating index\n")); 1371 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1372 retval = ext4_journal_get_write_access(handle, bh); 1372 retval = ext4_journal_get_write_access(handle, bh);
1373 if (retval) { 1373 if (retval) {
1374 ext4_std_error(dir->i_sb, retval); 1374 ext4_std_error(dir->i_sb, retval);
@@ -1377,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 } 1377 }
1378 root = (struct dx_root *) bh->b_data; 1378 root = (struct dx_root *) bh->b_data;
1379 1379
1380 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len));
1384 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu",
1387 dir->i_ino);
1388 brelse(bh);
1389 return -EIO;
1390 }
1391 len = ((char *) root) + blocksize - (char *) de;
1392
1393 /* Allocate new block for the 0th block's dirents */
1380 bh2 = ext4_append(handle, dir, &block, &retval); 1394 bh2 = ext4_append(handle, dir, &block, &retval);
1381 if (!(bh2)) { 1395 if (!(bh2)) {
1382 brelse(bh); 1396 brelse(bh);
@@ -1385,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1385 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1399 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1386 data1 = bh2->b_data; 1400 data1 = bh2->b_data;
1387 1401
1388 /* The 0th block becomes the root, move the dirents out */
1389 fde = &root->dotdot;
1390 de = (struct ext4_dir_entry_2 *)((char *)fde +
1391 ext4_rec_len_from_disk(fde->rec_len));
1392 len = ((char *) root) + blocksize - (char *) de;
1393 memcpy (data1, de, len); 1402 memcpy (data1, de, len);
1394 de = (struct ext4_dir_entry_2 *) data1; 1403 de = (struct ext4_dir_entry_2 *) data1;
1395 top = data1 + len; 1404 top = data1 + len;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c328be5d6885..c06886abd658 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -861,12 +861,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
861 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
862 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
863 863
864 memset(gdp, 0, EXT4_DESC_SIZE(sb));
864 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 865 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
865 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 866 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
866 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 867 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
867 ext4_free_blks_set(sb, gdp, input->free_blocks_count); 868 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
868 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 869 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
869 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); 870 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
870 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 871 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
871 872
872 /* 873 /*
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
108 108
109 if (hugetlb_reserve_pages(inode, 109 if (hugetlb_reserve_pages(inode,
110 vma->vm_pgoff >> huge_page_order(h), 110 vma->vm_pgoff >> huge_page_order(h),
111 len >> huge_page_shift(h), vma)) 111 len >> huge_page_shift(h), vma,
112 vma->vm_flags))
112 goto out; 113 goto out;
113 114
114 ret = 0; 115 ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
947 can_do_mlock()); 948 can_do_mlock());
948} 949}
949 950
950struct file *hugetlb_file_setup(const char *name, size_t size) 951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951{ 952{
952 int error = -ENOMEM; 953 int error = -ENOMEM;
953 struct file *file; 954 struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
981 982
982 error = -ENOMEM; 983 error = -ENOMEM;
983 if (hugetlb_reserve_pages(inode, 0, 984 if (hugetlb_reserve_pages(inode, 0,
984 size >> huge_page_shift(hstate_inode(inode)), NULL)) 985 size >> huge_page_shift(hstate_inode(inode)), NULL,
986 acctflag))
985 goto out_inode; 987 goto out_inode;
986 988
987 d_instantiate(dentry, inode); 989 d_instantiate(dentry, inode);
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..0d8ac497b3d5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,7 +43,7 @@ extern void __init chrdev_init(void);
43/* 43/*
44 * exec.c 44 * exec.c
45 */ 45 */
46extern void check_unsafe_exec(struct linux_binprm *); 46extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);
47 47
48/* 48/*
49 * namespace.c 49 * namespace.c
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52d7dc8..e79c07812afa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
427} 427}
428 428
429/* 429/*
430 * Called under j_state_lock. Returns true if a transaction was started. 430 * Called under j_state_lock. Returns true if a transaction commit was started.
431 */ 431 */
432int __log_start_commit(journal_t *journal, tid_t target) 432int __log_start_commit(journal_t *journal, tid_t target)
433{ 433{
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
495 495
496/* 496/*
497 * Start a commit of the current running transaction (if any). Returns true 497 * Start a commit of the current running transaction (if any). Returns true
498 * if a transaction was started, and fills its tid in at *ptid 498 * if a transaction is going to be committed (or is currently already
499 * committing), and fills its tid in at *ptid
499 */ 500 */
500int journal_start_commit(journal_t *journal, tid_t *ptid) 501int journal_start_commit(journal_t *journal, tid_t *ptid)
501{ 502{
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
505 if (journal->j_running_transaction) { 506 if (journal->j_running_transaction) {
506 tid_t tid = journal->j_running_transaction->t_tid; 507 tid_t tid = journal->j_running_transaction->t_tid;
507 508
508 ret = __log_start_commit(journal, tid); 509 __log_start_commit(journal, tid);
509 if (ret && ptid) 510 /* There's a running transaction and we've just made sure
511 * it's commit has been scheduled. */
512 if (ptid)
510 *ptid = tid; 513 *ptid = tid;
511 } else if (journal->j_committing_transaction && ptid) { 514 ret = 1;
515 } else if (journal->j_committing_transaction) {
512 /* 516 /*
513 * If ext3_write_super() recently started a commit, then we 517 * If ext3_write_super() recently started a commit, then we
514 * have to wait for completion of that transaction 518 * have to wait for completion of that transaction
515 */ 519 */
516 *ptid = journal->j_committing_transaction->t_tid; 520 if (ptid)
521 *ptid = journal->j_committing_transaction->t_tid;
517 ret = 1; 522 ret = 1;
518 } 523 }
519 spin_unlock(&journal->j_state_lock); 524 spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 56675306ed81..eb343008eded 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -37,10 +37,10 @@
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/page.h> 43#include <asm/page.h>
43#include <asm/div64.h>
44 44
45EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
46EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -846,8 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
847 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n", 849 seq_printf(seq, " %lluus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000)); 850 div_u64(s->journal->j_average_commit_time, 1000));
851 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
852 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
853 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6063a8e4b9f3..763b78a6e9de 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 ret = nlm_lck_denied; 429 ret = nlm_lck_denied;
430 goto out; 430 break;
431 case FILE_LOCK_DEFERRED: 431 case FILE_LOCK_DEFERRED:
432 if (wait) 432 if (wait)
433 break; 433 break;
@@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 443 goto out;
444 } 444 }
445 445
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
446 ret = nlm_lck_blocked; 450 ret = nlm_lck_blocked;
447 451
448 /* Append to list of blocked */ 452 /* Append to list of blocked */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d861096c9d81..60fe74035db5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5390,6 +5390,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
5390 goto out; 5390 goto out;
5391 } 5391 }
5392 5392
5393 vfs_dq_free_space_nodirty(inode,
5394 ocfs2_clusters_to_bytes(inode->i_sb, len));
5395
5393 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5396 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5394 dealloc); 5397 dealloc);
5395 if (ret) { 5398 if (ret) {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e88..e9d7c2038c0f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
38#include "dlmglue.h" 38#include "dlmglue.h"
39#include "file.h" 39#include "file.h"
40#include "inode.h" 40#include "inode.h"
41#include "super.h"
41 42
42 43
43static int ocfs2_dentry_revalidate(struct dentry *dentry, 44static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
294 return ret; 295 return ret;
295} 296}
296 297
298static DEFINE_SPINLOCK(dentry_list_lock);
299
300/* We limit the number of dentry locks to drop in one go. We have
301 * this limit so that we don't starve other users of ocfs2_wq. */
302#define DL_INODE_DROP_COUNT 64
303
304/* Drop inode references from dentry locks */
305void ocfs2_drop_dl_inodes(struct work_struct *work)
306{
307 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
308 dentry_lock_work);
309 struct ocfs2_dentry_lock *dl;
310 int drop_count = DL_INODE_DROP_COUNT;
311
312 spin_lock(&dentry_list_lock);
313 while (osb->dentry_lock_list && drop_count--) {
314 dl = osb->dentry_lock_list;
315 osb->dentry_lock_list = dl->dl_next;
316 spin_unlock(&dentry_list_lock);
317 iput(dl->dl_inode);
318 kfree(dl);
319 spin_lock(&dentry_list_lock);
320 }
321 if (osb->dentry_lock_list)
322 queue_work(ocfs2_wq, &osb->dentry_lock_work);
323 spin_unlock(&dentry_list_lock);
324}
325
297/* 326/*
298 * ocfs2_dentry_iput() and friends. 327 * ocfs2_dentry_iput() and friends.
299 * 328 *
@@ -318,16 +347,23 @@ out_attach:
318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 347static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
319 struct ocfs2_dentry_lock *dl) 348 struct ocfs2_dentry_lock *dl)
320{ 349{
321 iput(dl->dl_inode);
322 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 350 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
323 ocfs2_lock_res_free(&dl->dl_lockres); 351 ocfs2_lock_res_free(&dl->dl_lockres);
324 kfree(dl); 352
353 /* We leave dropping of inode reference to ocfs2_wq as that can
354 * possibly lead to inode deletion which gets tricky */
355 spin_lock(&dentry_list_lock);
356 if (!osb->dentry_lock_list)
357 queue_work(ocfs2_wq, &osb->dentry_lock_work);
358 dl->dl_next = osb->dentry_lock_list;
359 osb->dentry_lock_list = dl;
360 spin_unlock(&dentry_list_lock);
325} 361}
326 362
327void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 363void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
328 struct ocfs2_dentry_lock *dl) 364 struct ocfs2_dentry_lock *dl)
329{ 365{
330 int unlock = 0; 366 int unlock;
331 367
332 BUG_ON(dl->dl_count == 0); 368 BUG_ON(dl->dl_count == 0);
333 369
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d9883..d06e16c06640 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
29extern struct dentry_operations ocfs2_dentry_ops; 29extern struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */
32 unsigned int dl_count; 33 unsigned int dl_count;
33 u64 dl_parent_blkno; 34 union {
35 /* Linked list of dentry locks to release */
36 struct ocfs2_dentry_lock *dl_next;
37 u64 dl_parent_blkno;
38 };
34 39
35 /* 40 /*
36 * The ocfs2_dentry_lock keeps an inode reference until 41 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
47void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 52void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
48 struct ocfs2_dentry_lock *dl); 53 struct ocfs2_dentry_lock *dl);
49 54
55void ocfs2_drop_dl_inodes(struct work_struct *work);
56
50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, 57struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
51 int skip_unhashed); 58 int skip_unhashed);
52 59
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b0c4cadd4c45..206a2370876a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2860,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
2860 case OCFS2_UNLOCK_CANCEL_CONVERT: 2860 case OCFS2_UNLOCK_CANCEL_CONVERT:
2861 mlog(0, "Cancel convert success for %s\n", lockres->l_name); 2861 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2862 lockres->l_action = OCFS2_AST_INVALID; 2862 lockres->l_action = OCFS2_AST_INVALID;
2863 /* Downconvert thread may have requeued this lock, we
2864 * need to wake it. */
2865 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2866 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
2863 break; 2867 break;
2864 case OCFS2_UNLOCK_DROP_LOCK: 2868 case OCFS2_UNLOCK_DROP_LOCK:
2865 lockres->l_level = DLM_LOCK_IV; 2869 lockres->l_level = DLM_LOCK_IV;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ad5c24a29edd..077384135f4e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -210,6 +210,7 @@ struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery; 212struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock;
213struct ocfs2_super 214struct ocfs2_super
214{ 215{
215 struct task_struct *commit_task; 216 struct task_struct *commit_task;
@@ -325,6 +326,11 @@ struct ocfs2_super
325 struct list_head blocked_lock_list; 326 struct list_head blocked_lock_list;
326 unsigned long blocked_lock_count; 327 unsigned long blocked_lock_count;
327 328
329 /* List of dentry locks to release. Anyone can add locks to
330 * the list, ocfs2_wq processes the list */
331 struct ocfs2_dentry_lock *dentry_lock_list;
332 struct work_struct dentry_lock_work;
333
328 wait_queue_head_t osb_mount_event; 334 wait_queue_head_t osb_mount_event;
329 335
330 /* Truncate log info */ 336 /* Truncate log info */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f4efa89baee5..1ed0f7c86869 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
754 if (dquot->dq_flags & mask) 754 if (dquot->dq_flags & mask)
755 sync = 1; 755 sync = 1;
756 spin_unlock(&dq_data_lock); 756 spin_unlock(&dq_data_lock);
757 if (!sync) { 757 /* This is a slight hack but we can't afford getting global quota
758 * lock if we already have a transaction started. */
759 if (!sync || journal_current_handle()) {
758 status = ocfs2_write_dquot(dquot); 760 status = ocfs2_write_dquot(dquot);
759 goto out; 761 goto out;
760 } 762 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 43ed11345b59..b1cb38fbe807 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1887,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1887 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 1887 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
1888 journal->j_state = OCFS2_JOURNAL_FREE; 1888 journal->j_state = OCFS2_JOURNAL_FREE;
1889 1889
1890 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
1891 osb->dentry_lock_list = NULL;
1892
1890 /* get some pseudo constants for clustersize bits */ 1893 /* get some pseudo constants for clustersize bits */
1891 osb->s_clustersize_bits = 1894 osb->s_clustersize_bits =
1892 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 1895 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e1d638af6ac3..915039fffe6e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4729,13 +4729,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4729 vb.vb_xv = (struct ocfs2_xattr_value_root *) 4729 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4730 (vb.vb_bh->b_data + offset % blocksize); 4730 (vb.vb_bh->b_data + offset % blocksize);
4731 4731
4732 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4733 OCFS2_JOURNAL_ACCESS_WRITE);
4734 if (ret) {
4735 mlog_errno(ret);
4736 goto out;
4737 }
4738
4739 /* 4732 /*
4740 * From here on out we have to dirty the bucket. The generic 4733 * From here on out we have to dirty the bucket. The generic
4741 * value calls only modify one of the bucket's bhs, but we need 4734 * value calls only modify one of the bucket's bhs, but we need
@@ -4748,12 +4741,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4748 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); 4741 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4749 if (ret) { 4742 if (ret) {
4750 mlog_errno(ret); 4743 mlog_errno(ret);
4751 goto out_dirty; 4744 goto out;
4745 }
4746
4747 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4748 OCFS2_JOURNAL_ACCESS_WRITE);
4749 if (ret) {
4750 mlog_errno(ret);
4751 goto out;
4752 } 4752 }
4753 4753
4754 xe->xe_value_size = cpu_to_le64(len); 4754 xe->xe_value_size = cpu_to_le64(len);
4755 4755
4756out_dirty:
4757 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); 4756 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4758 4757
4759out: 4758out:
diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1c4dc8..5267098532bf 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -54,6 +54,64 @@ int seq_open(struct file *file, const struct seq_operations *op)
54} 54}
55EXPORT_SYMBOL(seq_open); 55EXPORT_SYMBOL(seq_open);
56 56
57static int traverse(struct seq_file *m, loff_t offset)
58{
59 loff_t pos = 0, index;
60 int error = 0;
61 void *p;
62
63 m->version = 0;
64 index = 0;
65 m->count = m->from = 0;
66 if (!offset) {
67 m->index = index;
68 return 0;
69 }
70 if (!m->buf) {
71 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
72 if (!m->buf)
73 return -ENOMEM;
74 }
75 p = m->op->start(m, &index);
76 while (p) {
77 error = PTR_ERR(p);
78 if (IS_ERR(p))
79 break;
80 error = m->op->show(m, p);
81 if (error < 0)
82 break;
83 if (unlikely(error)) {
84 error = 0;
85 m->count = 0;
86 }
87 if (m->count == m->size)
88 goto Eoverflow;
89 if (pos + m->count > offset) {
90 m->from = offset - pos;
91 m->count -= m->from;
92 m->index = index;
93 break;
94 }
95 pos += m->count;
96 m->count = 0;
97 if (pos == offset) {
98 index++;
99 m->index = index;
100 break;
101 }
102 p = m->op->next(m, p, &index);
103 }
104 m->op->stop(m, p);
105 m->index = index;
106 return error;
107
108Eoverflow:
109 m->op->stop(m, p);
110 kfree(m->buf);
111 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
112 return !m->buf ? -ENOMEM : -EAGAIN;
113}
114
57/** 115/**
58 * seq_read - ->read() method for sequential files. 116 * seq_read - ->read() method for sequential files.
59 * @file: the file to read from 117 * @file: the file to read from
@@ -186,63 +244,6 @@ Efault:
186} 244}
187EXPORT_SYMBOL(seq_read); 245EXPORT_SYMBOL(seq_read);
188 246
189static int traverse(struct seq_file *m, loff_t offset)
190{
191 loff_t pos = 0, index;
192 int error = 0;
193 void *p;
194
195 m->version = 0;
196 index = 0;
197 m->count = m->from = 0;
198 if (!offset) {
199 m->index = index;
200 return 0;
201 }
202 if (!m->buf) {
203 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
204 if (!m->buf)
205 return -ENOMEM;
206 }
207 p = m->op->start(m, &index);
208 while (p) {
209 error = PTR_ERR(p);
210 if (IS_ERR(p))
211 break;
212 error = m->op->show(m, p);
213 if (error < 0)
214 break;
215 if (unlikely(error)) {
216 error = 0;
217 m->count = 0;
218 }
219 if (m->count == m->size)
220 goto Eoverflow;
221 if (pos + m->count > offset) {
222 m->from = offset - pos;
223 m->count -= m->from;
224 m->index = index;
225 break;
226 }
227 pos += m->count;
228 m->count = 0;
229 if (pos == offset) {
230 index++;
231 m->index = index;
232 break;
233 }
234 p = m->op->next(m, p, &index);
235 }
236 m->op->stop(m, p);
237 return error;
238
239Eoverflow:
240 m->op->stop(m, p);
241 kfree(m->buf);
242 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
243 return !m->buf ? -ENOMEM : -EAGAIN;
244}
245
246/** 247/**
247 * seq_lseek - ->llseek() method for sequential files. 248 * seq_lseek - ->llseek() method for sequential files.
248 * @file: the file in question 249 * @file: the file in question
diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a0..61dce001dd57 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb)
301 /* 301 /*
302 * wait for asynchronous fs operations to finish before going further 302 * wait for asynchronous fs operations to finish before going further
303 */ 303 */
304 async_synchronize_full_special(&sb->s_async_list); 304 async_synchronize_full_domain(&sb->s_async_list);
305 305
306 /* bad name - it should be evict_inodes() */ 306 /* bad name - it should be evict_inodes() */
307 invalidate_inodes(sb); 307 invalidate_inodes(sb);
@@ -470,7 +470,7 @@ restart:
470 sb->s_count++; 470 sb->s_count++;
471 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
472 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list); 473 async_synchronize_full_domain(&sb->s_async_list);
474 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
475 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
476 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 175f9c590b77..f393620890ee 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
689} 689}
690 690
691/** 691/**
692 * ubifs_get_free_space - return amount of free space. 692 * ubifs_get_free_space_nolock - return amount of free space.
693 * @c: UBIFS file-system description object 693 * @c: UBIFS file-system description object
694 * 694 *
695 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
@@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
704 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 705 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 706 */
707long long ubifs_get_free_space(struct ubifs_info *c) 707long long ubifs_get_free_space_nolock(struct ubifs_info *c)
708{ 708{
709 int min_idx_lebs, rsvd_idx_lebs, lebs; 709 int rsvd_idx_lebs, lebs;
710 long long available, outstanding, free; 710 long long available, outstanding, free;
711 711
712 spin_lock(&c->space_lock); 712 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
713 min_idx_lebs = c->min_idx_lebs;
714 ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
715 outstanding = c->budg_data_growth + c->budg_dd_growth; 713 outstanding = c->budg_data_growth + c->budg_dd_growth;
716 available = ubifs_calc_available(c, min_idx_lebs); 714 available = ubifs_calc_available(c, c->min_idx_lebs);
717 715
718 /* 716 /*
719 * When reporting free space to user-space, UBIFS guarantees that it is 717 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
726 * Note, the calculations below are similar to what we have in 724 * Note, the calculations below are similar to what we have in
727 * 'do_budget_space()', so refer there for comments. 725 * 'do_budget_space()', so refer there for comments.
728 */ 726 */
729 if (min_idx_lebs > c->lst.idx_lebs) 727 if (c->min_idx_lebs > c->lst.idx_lebs)
730 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 728 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
731 else 729 else
732 rsvd_idx_lebs = 0; 730 rsvd_idx_lebs = 0;
733 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 731 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
734 c->lst.taken_empty_lebs; 732 c->lst.taken_empty_lebs;
735 lebs -= rsvd_idx_lebs; 733 lebs -= rsvd_idx_lebs;
736 available += lebs * (c->dark_wm - c->leb_overhead); 734 available += lebs * (c->dark_wm - c->leb_overhead);
737 spin_unlock(&c->space_lock);
738 735
739 if (available > outstanding) 736 if (available > outstanding)
740 free = ubifs_reported_space(c, available - outstanding); 737 free = ubifs_reported_space(c, available - outstanding);
@@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
742 free = 0; 739 free = 0;
743 return free; 740 return free;
744} 741}
742
743/**
744 * ubifs_get_free_space - return amount of free space.
745 * @c: UBIFS file-system description object
746 *
747 * This function calculates and retuns amount of free space to report to
748 * user-space.
749 */
750long long ubifs_get_free_space(struct ubifs_info *c)
751{
752 long long free;
753
754 spin_lock(&c->space_lock);
755 free = ubifs_get_free_space_nolock(c);
756 spin_unlock(&c->space_lock);
757
758 return free;
759}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 792c5a16c182..e975bd82f38b 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
620 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 620 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
622 c->gc_lnum, c->ihead_lnum); 622 c->gc_lnum, c->ihead_lnum);
623 for (i = 0; i < c->jhead_cnt; i++) 623 /* If we are in R/O mode, journal heads do not exist */
624 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", 624 if (c->jheads)
625 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); 625 for (i = 0; i < c->jhead_cnt; i++)
626 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
627 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
626 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 628 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
627 bud = rb_entry(rb, struct ubifs_bud, rb); 629 bud = rb_entry(rb, struct ubifs_bud, rb);
628 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 630 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c)
637 /* Print budgeting predictions */ 639 /* Print budgeting predictions */
638 available = ubifs_calc_available(c, c->min_idx_lebs); 640 available = ubifs_calc_available(c, c->min_idx_lebs);
639 outstanding = c->budg_data_growth + c->budg_dd_growth; 641 outstanding = c->budg_data_growth + c->budg_dd_growth;
640 if (available > outstanding) 642 free = ubifs_get_free_space_nolock(c);
641 free = ubifs_reported_space(c, available - outstanding);
642 else
643 free = 0;
644 printk(KERN_DEBUG "Budgeting predictions:\n"); 643 printk(KERN_DEBUG "Budgeting predictions:\n");
645 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 644 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
646 available, outstanding, free); 645 available, outstanding, free);
@@ -861,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c)
861} 860}
862 861
863/** 862/**
863 * dbg_save_space_info - save information about flash space.
864 * @c: UBIFS file-system description object
865 *
866 * This function saves information about UBIFS free space, dirty space, etc, in
867 * order to check it later.
868 */
869void dbg_save_space_info(struct ubifs_info *c)
870{
871 struct ubifs_debug_info *d = c->dbg;
872
873 ubifs_get_lp_stats(c, &d->saved_lst);
874
875 spin_lock(&c->space_lock);
876 d->saved_free = ubifs_get_free_space_nolock(c);
877 spin_unlock(&c->space_lock);
878}
879
880/**
881 * dbg_check_space_info - check flash space information.
882 * @c: UBIFS file-system description object
883 *
884 * This function compares current flash space information with the information
885 * which was saved when the 'dbg_save_space_info()' function was called.
886 * Returns zero if the information has not changed, and %-EINVAL it it has
887 * changed.
888 */
889int dbg_check_space_info(struct ubifs_info *c)
890{
891 struct ubifs_debug_info *d = c->dbg;
892 struct ubifs_lp_stats lst;
893 long long avail, free;
894
895 spin_lock(&c->space_lock);
896 avail = ubifs_calc_available(c, c->min_idx_lebs);
897 spin_unlock(&c->space_lock);
898 free = ubifs_get_free_space(c);
899
900 if (free != d->saved_free) {
901 ubifs_err("free space changed from %lld to %lld",
902 d->saved_free, free);
903 goto out;
904 }
905
906 return 0;
907
908out:
909 ubifs_msg("saved lprops statistics dump");
910 dbg_dump_lstats(&d->saved_lst);
911 ubifs_get_lp_stats(c, &lst);
912 ubifs_msg("current lprops statistics dump");
913 dbg_dump_lstats(&d->saved_lst);
914 spin_lock(&c->space_lock);
915 dbg_dump_budg(c);
916 spin_unlock(&c->space_lock);
917 dump_stack();
918 return -EINVAL;
919}
920
921/**
864 * dbg_check_synced_i_size - check synchronized inode size. 922 * dbg_check_synced_i_size - check synchronized inode size.
865 * @inode: inode to check 923 * @inode: inode to check
866 * 924 *
@@ -1349,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
1349 * @c: UBIFS file-system description object 1407 * @c: UBIFS file-system description object
1350 * @leaf_cb: called for each leaf node 1408 * @leaf_cb: called for each leaf node
1351 * @znode_cb: called for each indexing node 1409 * @znode_cb: called for each indexing node
1352 * @priv: private date which is passed to callbacks 1410 * @priv: private data which is passed to callbacks
1353 * 1411 *
1354 * This function walks the UBIFS index and calls the @leaf_cb for each leaf 1412 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
1355 * node and @znode_cb for each indexing node. Returns zero in case of success 1413 * node and @znode_cb for each indexing node. Returns zero in case of success
@@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c)
2409 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which 2467 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
2410 * contain the stuff specific to particular file-system mounts. 2468 * contain the stuff specific to particular file-system mounts.
2411 */ 2469 */
2412static struct dentry *debugfs_rootdir; 2470static struct dentry *dfs_rootdir;
2413 2471
2414/** 2472/**
2415 * dbg_debugfs_init - initialize debugfs file-system. 2473 * dbg_debugfs_init - initialize debugfs file-system.
@@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir;
2421 */ 2479 */
2422int dbg_debugfs_init(void) 2480int dbg_debugfs_init(void)
2423{ 2481{
2424 debugfs_rootdir = debugfs_create_dir("ubifs", NULL); 2482 dfs_rootdir = debugfs_create_dir("ubifs", NULL);
2425 if (IS_ERR(debugfs_rootdir)) { 2483 if (IS_ERR(dfs_rootdir)) {
2426 int err = PTR_ERR(debugfs_rootdir); 2484 int err = PTR_ERR(dfs_rootdir);
2427 ubifs_err("cannot create \"ubifs\" debugfs directory, " 2485 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2428 "error %d\n", err); 2486 "error %d\n", err);
2429 return err; 2487 return err;
@@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void)
2437 */ 2495 */
2438void dbg_debugfs_exit(void) 2496void dbg_debugfs_exit(void)
2439{ 2497{
2440 debugfs_remove(debugfs_rootdir); 2498 debugfs_remove(dfs_rootdir);
2441} 2499}
2442 2500
2443static int open_debugfs_file(struct inode *inode, struct file *file) 2501static int open_debugfs_file(struct inode *inode, struct file *file)
@@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2452 struct ubifs_info *c = file->private_data; 2510 struct ubifs_info *c = file->private_data;
2453 struct ubifs_debug_info *d = c->dbg; 2511 struct ubifs_debug_info *d = c->dbg;
2454 2512
2455 if (file->f_path.dentry == d->dump_lprops) 2513 if (file->f_path.dentry == d->dfs_dump_lprops)
2456 dbg_dump_lprops(c); 2514 dbg_dump_lprops(c);
2457 else if (file->f_path.dentry == d->dump_budg) { 2515 else if (file->f_path.dentry == d->dfs_dump_budg) {
2458 spin_lock(&c->space_lock); 2516 spin_lock(&c->space_lock);
2459 dbg_dump_budg(c); 2517 dbg_dump_budg(c);
2460 spin_unlock(&c->space_lock); 2518 spin_unlock(&c->space_lock);
2461 } else if (file->f_path.dentry == d->dump_tnc) { 2519 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2462 mutex_lock(&c->tnc_mutex); 2520 mutex_lock(&c->tnc_mutex);
2463 dbg_dump_tnc(c); 2521 dbg_dump_tnc(c);
2464 mutex_unlock(&c->tnc_mutex); 2522 mutex_unlock(&c->tnc_mutex);
@@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2469 return count; 2527 return count;
2470} 2528}
2471 2529
2472static const struct file_operations debugfs_fops = { 2530static const struct file_operations dfs_fops = {
2473 .open = open_debugfs_file, 2531 .open = open_debugfs_file,
2474 .write = write_debugfs_file, 2532 .write = write_debugfs_file,
2475 .owner = THIS_MODULE, 2533 .owner = THIS_MODULE,
@@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2494 struct dentry *dent; 2552 struct dentry *dent;
2495 struct ubifs_debug_info *d = c->dbg; 2553 struct ubifs_debug_info *d = c->dbg;
2496 2554
2497 sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); 2555 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2498 d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name, 2556 d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
2499 debugfs_rootdir); 2557 if (IS_ERR(d->dfs_dir)) {
2500 if (IS_ERR(d->debugfs_dir)) { 2558 err = PTR_ERR(d->dfs_dir);
2501 err = PTR_ERR(d->debugfs_dir);
2502 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", 2559 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2503 d->debugfs_dir_name, err); 2560 d->dfs_dir_name, err);
2504 goto out; 2561 goto out;
2505 } 2562 }
2506 2563
2507 fname = "dump_lprops"; 2564 fname = "dump_lprops";
2508 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2565 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2509 &debugfs_fops);
2510 if (IS_ERR(dent)) 2566 if (IS_ERR(dent))
2511 goto out_remove; 2567 goto out_remove;
2512 d->dump_lprops = dent; 2568 d->dfs_dump_lprops = dent;
2513 2569
2514 fname = "dump_budg"; 2570 fname = "dump_budg";
2515 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2571 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2516 &debugfs_fops);
2517 if (IS_ERR(dent)) 2572 if (IS_ERR(dent))
2518 goto out_remove; 2573 goto out_remove;
2519 d->dump_budg = dent; 2574 d->dfs_dump_budg = dent;
2520 2575
2521 fname = "dump_tnc"; 2576 fname = "dump_tnc";
2522 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, 2577 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2523 &debugfs_fops);
2524 if (IS_ERR(dent)) 2578 if (IS_ERR(dent))
2525 goto out_remove; 2579 goto out_remove;
2526 d->dump_tnc = dent; 2580 d->dfs_dump_tnc = dent;
2527 2581
2528 return 0; 2582 return 0;
2529 2583
@@ -2531,7 +2585,7 @@ out_remove:
2531 err = PTR_ERR(dent); 2585 err = PTR_ERR(dent);
2532 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", 2586 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2533 fname, err); 2587 fname, err);
2534 debugfs_remove_recursive(d->debugfs_dir); 2588 debugfs_remove_recursive(d->dfs_dir);
2535out: 2589out:
2536 return err; 2590 return err;
2537} 2591}
@@ -2542,7 +2596,7 @@ out:
2542 */ 2596 */
2543void dbg_debugfs_exit_fs(struct ubifs_info *c) 2597void dbg_debugfs_exit_fs(struct ubifs_info *c)
2544{ 2598{
2545 debugfs_remove_recursive(c->dbg->debugfs_dir); 2599 debugfs_remove_recursive(c->dbg->dfs_dir);
2546} 2600}
2547 2601
2548#endif /* CONFIG_UBIFS_FS_DEBUG */ 2602#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9820d6999f7e..c1cd73b2e06e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -41,15 +41,17 @@
41 * @chk_lpt_wastage: used by LPT tree size checker 41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker 42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker 43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check ihead_lnum 44 * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
45 * @new_ihead_offs: used by debugging to check ihead_offs 45 * @new_ihead_offs: used by debugging to check @c->ihead_offs
46 * 46 *
47 * debugfs_dir_name: name of debugfs directory containing this file-system's 47 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
48 * files 48 * @saved_free: saved free space (used by 'dbg_save_space_info()')
49 * debugfs_dir: direntry object of the file-system debugfs directory 49 *
50 * dump_lprops: "dump lprops" debugfs knob 50 * dfs_dir_name: name of debugfs directory containing this file-system's files
51 * dump_budg: "dump budgeting information" debugfs knob 51 * dfs_dir: direntry object of the file-system debugfs directory
52 * dump_tnc: "dump TNC" debugfs knob 52 * dfs_dump_lprops: "dump lprops" debugfs knob
53 * dfs_dump_budg: "dump budgeting information" debugfs knob
54 * dfs_dump_tnc: "dump TNC" debugfs knob
53 */ 55 */
54struct ubifs_debug_info { 56struct ubifs_debug_info {
55 void *buf; 57 void *buf;
@@ -69,11 +71,14 @@ struct ubifs_debug_info {
69 int new_ihead_lnum; 71 int new_ihead_lnum;
70 int new_ihead_offs; 72 int new_ihead_offs;
71 73
72 char debugfs_dir_name[100]; 74 struct ubifs_lp_stats saved_lst;
73 struct dentry *debugfs_dir; 75 long long saved_free;
74 struct dentry *dump_lprops; 76
75 struct dentry *dump_budg; 77 char dfs_dir_name[100];
76 struct dentry *dump_tnc; 78 struct dentry *dfs_dir;
79 struct dentry *dfs_dump_lprops;
80 struct dentry *dfs_dump_budg;
81 struct dentry *dfs_dump_tnc;
77}; 82};
78 83
79#define ubifs_assert(expr) do { \ 84#define ubifs_assert(expr) do { \
@@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
297 dbg_znode_callback znode_cb, void *priv); 302 dbg_znode_callback znode_cb, void *priv);
298 303
299/* Checking functions */ 304/* Checking functions */
300 305void dbg_save_space_info(struct ubifs_info *c);
306int dbg_check_space_info(struct ubifs_info *c);
301int dbg_check_lprops(struct ubifs_info *c); 307int dbg_check_lprops(struct ubifs_info *c);
302int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); 308int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
303int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); 309int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
439 445
440#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 446#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
441#define dbg_old_index_check_init(c, zroot) 0 447#define dbg_old_index_check_init(c, zroot) 0
448#define dbg_save_space_info(c) ({})
449#define dbg_check_space_info(c) 0
442#define dbg_check_old_index(c, zroot) 0 450#define dbg_check_old_index(c, zroot) 0
443#define dbg_check_cats(c) 0 451#define dbg_check_cats(c) 0
444#define dbg_check_ltab(c) 0 452#define dbg_check_ltab(c) 0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c38..f55d523c52bb 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
482} 482}
483 483
484/** 484/**
485 * lock_2_inodes - lock two UBIFS inodes. 485 * lock_2_inodes - a wrapper for locking two UBIFS inodes.
486 * @inode1: first inode 486 * @inode1: first inode
487 * @inode2: second inode 487 * @inode2: second inode
488 *
489 * We do not implement any tricks to guarantee strict lock ordering, because
490 * VFS has already done it for us on the @i_mutex. So this is just a simple
491 * wrapper function.
488 */ 492 */
489static void lock_2_inodes(struct inode *inode1, struct inode *inode2) 493static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
490{ 494{
491 if (inode1->i_ino < inode2->i_ino) { 495 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
492 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); 496 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
493 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
494 } else {
495 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
496 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
497 }
498} 497}
499 498
500/** 499/**
501 * unlock_2_inodes - unlock two UBIFS inodes inodes. 500 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
502 * @inode1: first inode 501 * @inode1: first inode
503 * @inode2: second inode 502 * @inode2: second inode
504 */ 503 */
505static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) 504static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
506{ 505{
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
508 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 506 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
509} 508}
510 509
511static int ubifs_link(struct dentry *old_dentry, struct inode *dir, 510static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
527 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", 526 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
528 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 527 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
529 inode->i_nlink, dir->i_ino); 528 inode->i_nlink, dir->i_ino);
529 ubifs_assert(mutex_is_locked(&dir->i_mutex));
530 ubifs_assert(mutex_is_locked(&inode->i_mutex));
530 err = dbg_check_synced_i_size(inode); 531 err = dbg_check_synced_i_size(inode);
531 if (err) 532 if (err)
532 return err; 533 return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
580 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", 581 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
581 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 582 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
582 inode->i_nlink, dir->i_ino); 583 inode->i_nlink, dir->i_ino);
584 ubifs_assert(mutex_is_locked(&dir->i_mutex));
585 ubifs_assert(mutex_is_locked(&inode->i_mutex));
583 err = dbg_check_synced_i_size(inode); 586 err = dbg_check_synced_i_size(inode);
584 if (err) 587 if (err)
585 return err; 588 return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
667 670
668 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, 671 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
669 dentry->d_name.name, inode->i_ino, dir->i_ino); 672 dentry->d_name.name, inode->i_ino, dir->i_ino);
670 673 ubifs_assert(mutex_is_locked(&dir->i_mutex));
674 ubifs_assert(mutex_is_locked(&inode->i_mutex));
671 err = check_dir_empty(c, dentry->d_inode); 675 err = check_dir_empty(c, dentry->d_inode);
672 if (err) 676 if (err)
673 return err; 677 return err;
@@ -922,59 +926,30 @@ out_budg:
922} 926}
923 927
924/** 928/**
925 * lock_3_inodes - lock three UBIFS inodes for rename. 929 * lock_3_inodes - a wrapper for locking three UBIFS inodes.
926 * @inode1: first inode 930 * @inode1: first inode
927 * @inode2: second inode 931 * @inode2: second inode
928 * @inode3: third inode 932 * @inode3: third inode
929 * 933 *
930 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may 934 * This function is used for 'ubifs_rename()' and @inode1 may be the same as
931 * be null. 935 * @inode2 whereas @inode3 may be %NULL.
936 *
937 * We do not implement any tricks to guarantee strict lock ordering, because
938 * VFS has already done it for us on the @i_mutex. So this is just a simple
939 * wrapper function.
932 */ 940 */
933static void lock_3_inodes(struct inode *inode1, struct inode *inode2, 941static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
934 struct inode *inode3) 942 struct inode *inode3)
935{ 943{
936 struct inode *i1, *i2, *i3; 944 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
937 945 if (inode2 != inode1)
938 if (!inode3) { 946 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
939 if (inode1 != inode2) { 947 if (inode3)
940 lock_2_inodes(inode1, inode2); 948 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
941 return;
942 }
943 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
944 return;
945 }
946
947 if (inode1 == inode2) {
948 lock_2_inodes(inode1, inode3);
949 return;
950 }
951
952 /* 3 different inodes */
953 if (inode1 < inode2) {
954 i3 = inode2;
955 if (inode1 < inode3) {
956 i1 = inode1;
957 i2 = inode3;
958 } else {
959 i1 = inode3;
960 i2 = inode1;
961 }
962 } else {
963 i3 = inode1;
964 if (inode2 < inode3) {
965 i1 = inode2;
966 i2 = inode3;
967 } else {
968 i1 = inode3;
969 i2 = inode2;
970 }
971 }
972 mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
973 lock_2_inodes(i2, i3);
974} 949}
975 950
976/** 951/**
977 * unlock_3_inodes - unlock three UBIFS inodes for rename. 952 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
978 * @inode1: first inode 953 * @inode1: first inode
979 * @inode2: second inode 954 * @inode2: second inode
980 * @inode3: third inode 955 * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
982static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, 957static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
983 struct inode *inode3) 958 struct inode *inode3)
984{ 959{
985 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
986 if (inode1 != inode2)
987 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
988 if (inode3) 960 if (inode3)
989 mutex_unlock(&ubifs_inode(inode3)->ui_mutex); 961 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
962 if (inode1 != inode2)
963 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
964 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
990} 965}
991 966
992static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, 967static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1020 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, 995 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
1021 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 996 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
1022 new_dentry->d_name.name, new_dir->i_ino); 997 new_dentry->d_name.name, new_dir->i_ino);
998 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
999 ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
1000 if (unlink)
1001 ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
1002
1023 1003
1024 if (unlink && is_dir) { 1004 if (unlink && is_dir) {
1025 err = check_dir_empty(c, new_inode); 1005 err = check_dir_empty(c, new_inode);
@@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1199 return 0; 1179 return 0;
1200} 1180}
1201 1181
1202struct inode_operations ubifs_dir_inode_operations = { 1182const struct inode_operations ubifs_dir_inode_operations = {
1203 .lookup = ubifs_lookup, 1183 .lookup = ubifs_lookup,
1204 .create = ubifs_create, 1184 .create = ubifs_create,
1205 .link = ubifs_link, 1185 .link = ubifs_link,
@@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = {
1219#endif 1199#endif
1220}; 1200};
1221 1201
1222struct file_operations ubifs_dir_operations = { 1202const struct file_operations ubifs_dir_operations = {
1223 .llseek = ubifs_dir_llseek, 1203 .llseek = ubifs_dir_llseek,
1224 .release = ubifs_dir_release, 1204 .release = ubifs_dir_release,
1225 .read = generic_read_dir, 1205 .read = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf37374567fa..93b6de51f261 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 struct page *page; 433 struct page *page;
434 434
435
436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
437 436
438 if (unlikely(c->ro_media)) 437 if (unlikely(c->ro_media))
@@ -1541,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1541 return 0; 1540 return 0;
1542} 1541}
1543 1542
1544struct address_space_operations ubifs_file_address_operations = { 1543const struct address_space_operations ubifs_file_address_operations = {
1545 .readpage = ubifs_readpage, 1544 .readpage = ubifs_readpage,
1546 .writepage = ubifs_writepage, 1545 .writepage = ubifs_writepage,
1547 .write_begin = ubifs_write_begin, 1546 .write_begin = ubifs_write_begin,
@@ -1551,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = {
1551 .releasepage = ubifs_releasepage, 1550 .releasepage = ubifs_releasepage,
1552}; 1551};
1553 1552
1554struct inode_operations ubifs_file_inode_operations = { 1553const struct inode_operations ubifs_file_inode_operations = {
1555 .setattr = ubifs_setattr, 1554 .setattr = ubifs_setattr,
1556 .getattr = ubifs_getattr, 1555 .getattr = ubifs_getattr,
1557#ifdef CONFIG_UBIFS_FS_XATTR 1556#ifdef CONFIG_UBIFS_FS_XATTR
@@ -1562,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = {
1562#endif 1561#endif
1563}; 1562};
1564 1563
1565struct inode_operations ubifs_symlink_inode_operations = { 1564const struct inode_operations ubifs_symlink_inode_operations = {
1566 .readlink = generic_readlink, 1565 .readlink = generic_readlink,
1567 .follow_link = ubifs_follow_link, 1566 .follow_link = ubifs_follow_link,
1568 .setattr = ubifs_setattr, 1567 .setattr = ubifs_setattr,
1569 .getattr = ubifs_getattr, 1568 .getattr = ubifs_getattr,
1570}; 1569};
1571 1570
1572struct file_operations ubifs_file_operations = { 1571const struct file_operations ubifs_file_operations = {
1573 .llseek = generic_file_llseek, 1572 .llseek = generic_file_llseek,
1574 .read = do_sync_read, 1573 .read = do_sync_read,
1575 .write = do_sync_write, 1574 .write = do_sync_write,
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9832f9abe28e..a711d33b3d3e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
31 * to be reused. Garbage collection will cause the number of dirty index nodes 31 * to be reused. Garbage collection will cause the number of dirty index nodes
32 * to grow, however sufficient space is reserved for the index to ensure the 32 * to grow, however sufficient space is reserved for the index to ensure the
33 * commit will never run out of space. 33 * commit will never run out of space.
34 *
35 * Notes about dead watermark. At current UBIFS implementation we assume that
36 * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
37 * and not worth garbage-collecting. The dead watermark is one min. I/O unit
38 * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
39 * Garbage Collector has to synchronize the GC head's write buffer before
40 * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
41 * actually reclaim even very small pieces of dirty space by garbage collecting
42 * enough dirty LEBs, but we do not bother doing this at this implementation.
43 *
44 * Notes about dark watermark. The results of GC work depends on how big are
45 * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
46 * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
53 * good, and GC takes extra care when moving them.
34 */ 54 */
35 55
36#include <linux/pagemap.h> 56#include <linux/pagemap.h>
@@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
381 401
382 /* 402 /*
383 * Don't release the LEB until after the next commit, because 403 * Don't release the LEB until after the next commit, because
384 * it may contain date which is needed for recovery. So 404 * it may contain data which is needed for recovery. So
385 * although we freed this LEB, it will become usable only after 405 * although we freed this LEB, it will become usable only after
386 * the commit. 406 * the commit.
387 */ 407 */
@@ -810,8 +830,9 @@ out:
810 * ubifs_destroy_idx_gc - destroy idx_gc list. 830 * ubifs_destroy_idx_gc - destroy idx_gc list.
811 * @c: UBIFS file-system description object 831 * @c: UBIFS file-system description object
812 * 832 *
813 * This function destroys the idx_gc list. It is called when unmounting or 833 * This function destroys the @c->idx_gc list. It is called when unmounting
814 * remounting read-only so locks are not needed. 834 * so locks are not needed. Returns zero in case of success and a negative
835 * error code in case of failure.
815 */ 836 */
816void ubifs_destroy_idx_gc(struct ubifs_info *c) 837void ubifs_destroy_idx_gc(struct ubifs_info *c)
817{ 838{
@@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c)
824 list_del(&idx_gc->list); 845 list_del(&idx_gc->list);
825 kfree(idx_gc); 846 kfree(idx_gc);
826 } 847 }
827
828} 848}
829 849
830/** 850/**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af69..e8e632a1dcdf 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
29 * would have been wasted for padding to the nearest minimal I/O unit boundary. 29 * would have been wasted for padding to the nearest minimal I/O unit boundary.
30 * Instead, data first goes to the write-buffer and is flushed when the 30 * Instead, data first goes to the write-buffer and is flushed when the
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similarto the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 35 * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
75 * @lnum: logical eraseblock number 75 * @lnum: logical eraseblock number
76 * @offs: offset within the logical eraseblock 76 * @offs: offset within the logical eraseblock
77 * @quiet: print no messages 77 * @quiet: print no messages
78 * @chk_crc: indicates whether to always check the CRC 78 * @must_chk_crc: indicates whether to always check the CRC
79 * 79 *
80 * This function checks node magic number and CRC checksum. This function also 80 * This function checks node magic number and CRC checksum. This function also
81 * validates node length to prevent UBIFS from becoming crazy when an attacker 81 * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
83 * node length in the common header could cause UBIFS to read memory outside of 83 * node length in the common header could cause UBIFS to read memory outside of
84 * allocated buffer when checking the CRC checksum. 84 * allocated buffer when checking the CRC checksum.
85 * 85 *
86 * This function returns zero in case of success %-EUCLEAN in case of bad CRC 86 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
87 * or magic. 87 * true, which is controlled by corresponding UBIFS mount option. However, if
88 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
89 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
90 * ignored and CRC is checked.
91 *
92 * This function returns zero in case of success and %-EUCLEAN in case of bad
93 * CRC or magic.
88 */ 94 */
89int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 95int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
90 int offs, int quiet, int chk_crc) 96 int offs, int quiet, int must_chk_crc)
91{ 97{
92 int err = -EINVAL, type, node_len; 98 int err = -EINVAL, type, node_len;
93 uint32_t crc, node_crc, magic; 99 uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
123 node_len > c->ranges[type].max_len) 129 node_len > c->ranges[type].max_len)
124 goto out_len; 130 goto out_len;
125 131
126 if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) 132 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
127 if (c->no_chk_data_crc) 133 c->no_chk_data_crc)
128 return 0; 134 return 0;
129 135
130 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 136 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
131 node_crc = le32_to_cpu(ch->crc); 137 node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 9b7c54e0cd2a..a11ca0958a23 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -208,7 +208,7 @@ again:
208 offs = 0; 208 offs = 0;
209 209
210out: 210out:
211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); 211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
212 if (err) 212 if (err)
213 goto out_unlock; 213 goto out_unlock;
214 214
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index dfd2bcece27a..4cdd284dea56 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
635 * @c: UBIFS file-system description object 635 * @c: UBIFS file-system description object
636 * @st: return statistics 636 * @st: return statistics
637 */ 637 */
638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) 638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
639{ 639{
640 spin_lock(&c->space_lock); 640 spin_lock(&c->space_lock);
641 memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); 641 memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
642 spin_unlock(&c->space_lock); 642 spin_unlock(&c->space_lock);
643} 643}
644 644
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
678 678
679out: 679out:
680 ubifs_release_lprops(c); 680 ubifs_release_lprops(c);
681 if (err)
682 ubifs_err("cannot change properties of LEB %d, error %d",
683 lnum, err);
681 return err; 684 return err;
682} 685}
683 686
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
714 717
715out: 718out:
716 ubifs_release_lprops(c); 719 ubifs_release_lprops(c);
720 if (err)
721 ubifs_err("cannot update properties of LEB %d, error %d",
722 lnum, err);
717 return err; 723 return err;
718} 724}
719 725
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
737 lpp = ubifs_lpt_lookup(c, lnum); 743 lpp = ubifs_lpt_lookup(c, lnum);
738 if (IS_ERR(lpp)) { 744 if (IS_ERR(lpp)) {
739 err = PTR_ERR(lpp); 745 err = PTR_ERR(lpp);
746 ubifs_err("cannot read properties of LEB %d, error %d",
747 lnum, err);
740 goto out; 748 goto out;
741 } 749 }
742 750
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 96ca95707175..3216a1f277f8 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -556,23 +556,23 @@ no_space:
556} 556}
557 557
558/** 558/**
559 * next_pnode - find next pnode. 559 * next_pnode_to_dirty - find next pnode to dirty.
560 * @c: UBIFS file-system description object 560 * @c: UBIFS file-system description object
561 * @pnode: pnode 561 * @pnode: pnode
562 * 562 *
563 * This function returns the next pnode or %NULL if there are no more pnodes. 563 * This function returns the next pnode to dirty or %NULL if there are no more
564 * pnodes. Note that pnodes that have never been written (lnum == 0) are
565 * skipped.
564 */ 566 */
565static struct ubifs_pnode *next_pnode(struct ubifs_info *c, 567static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
566 struct ubifs_pnode *pnode) 568 struct ubifs_pnode *pnode)
567{ 569{
568 struct ubifs_nnode *nnode; 570 struct ubifs_nnode *nnode;
569 int iip; 571 int iip;
570 572
571 /* Try to go right */ 573 /* Try to go right */
572 nnode = pnode->parent; 574 nnode = pnode->parent;
573 iip = pnode->iip + 1; 575 for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
574 if (iip < UBIFS_LPT_FANOUT) {
575 /* We assume here that LEB zero is never an LPT LEB */
576 if (nnode->nbranch[iip].lnum) 576 if (nnode->nbranch[iip].lnum)
577 return ubifs_get_pnode(c, nnode, iip); 577 return ubifs_get_pnode(c, nnode, iip);
578 } 578 }
@@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
583 nnode = nnode->parent; 583 nnode = nnode->parent;
584 if (!nnode) 584 if (!nnode)
585 return NULL; 585 return NULL;
586 /* We assume here that LEB zero is never an LPT LEB */ 586 for (; iip < UBIFS_LPT_FANOUT; iip++) {
587 } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); 587 if (nnode->nbranch[iip].lnum)
588 break;
589 }
590 } while (iip >= UBIFS_LPT_FANOUT);
588 591
589 /* Go right */ 592 /* Go right */
590 nnode = ubifs_get_nnode(c, nnode, iip); 593 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
593 596
594 /* Go down to level 1 */ 597 /* Go down to level 1 */
595 while (nnode->level > 1) { 598 while (nnode->level > 1) {
596 nnode = ubifs_get_nnode(c, nnode, 0); 599 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
600 if (nnode->nbranch[iip].lnum)
601 break;
602 }
603 if (iip >= UBIFS_LPT_FANOUT) {
604 /*
605 * Should not happen, but we need to keep going
606 * if it does.
607 */
608 iip = 0;
609 }
610 nnode = ubifs_get_nnode(c, nnode, iip);
597 if (IS_ERR(nnode)) 611 if (IS_ERR(nnode))
598 return (void *)nnode; 612 return (void *)nnode;
599 } 613 }
600 614
601 return ubifs_get_pnode(c, nnode, 0); 615 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
616 if (nnode->nbranch[iip].lnum)
617 break;
618 if (iip >= UBIFS_LPT_FANOUT)
619 /* Should not happen, but we need to keep going if it does */
620 iip = 0;
621 return ubifs_get_pnode(c, nnode, iip);
602} 622}
603 623
604/** 624/**
@@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
688 pnode = pnode_lookup(c, 0); 708 pnode = pnode_lookup(c, 0);
689 while (pnode) { 709 while (pnode) {
690 do_make_pnode_dirty(c, pnode); 710 do_make_pnode_dirty(c, pnode);
691 pnode = next_pnode(c, pnode); 711 pnode = next_pnode_to_dirty(c, pnode);
692 if (IS_ERR(pnode)) 712 if (IS_ERR(pnode))
693 return PTR_ERR(pnode); 713 return PTR_ERR(pnode);
694 } 714 }
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf565..a88f33801b98 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
354 int err, lnum, offs, len; 354 int err, lnum, offs, len;
355 355
356 if (c->ro_media) 356 if (c->ro_media)
357 return -EINVAL; 357 return -EROFS;
358 358
359 lnum = UBIFS_MST_LNUM; 359 lnum = UBIFS_MST_LNUM;
360 offs = c->mst_offs + c->mst_node_alsz; 360 offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9e6f403f170e..152a7b34a141 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to 46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
47 * zero, the inode number is added to the rb-tree. It is removed from the tree 47 * zero, the inode number is added to the rb-tree. It is removed from the tree
48 * when the inode is deleted. Any new orphans that are in the orphan tree when 48 * when the inode is deleted. Any new orphans that are in the orphan tree when
49 * the commit is run, are written to the orphan area in 1 or more orph nodes. 49 * the commit is run, are written to the orphan area in 1 or more orphan nodes.
50 * If the orphan area is full, it is consolidated to make space. There is 50 * If the orphan area is full, it is consolidated to make space. There is
51 * always enough space because validation prevents the user from creating more 51 * always enough space because validation prevents the user from creating more
52 * than the maximum number of orphans allowed. 52 * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
231} 231}
232 232
233/** 233/**
234 * do_write_orph_node - write a node 234 * do_write_orph_node - write a node to the orphan head.
235 * @c: UBIFS file-system description object 235 * @c: UBIFS file-system description object
236 * @len: length of node 236 * @len: length of node
237 * @atomic: write atomically 237 * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
264} 264}
265 265
266/** 266/**
267 * write_orph_node - write an orph node 267 * write_orph_node - write an orphan node.
268 * @c: UBIFS file-system description object 268 * @c: UBIFS file-system description object
269 * @atomic: write atomically 269 * @atomic: write atomically
270 * 270 *
271 * This function builds an orph node from the cnext list and writes it to the 271 * This function builds an orphan node from the cnext list and writes it to the
272 * orphan head. On success, %0 is returned, otherwise a negative error code 272 * orphan head. On success, %0 is returned, otherwise a negative error code
273 * is returned. 273 * is returned.
274 */ 274 */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
326} 326}
327 327
328/** 328/**
329 * write_orph_nodes - write orph nodes until there are no more to commit 329 * write_orph_nodes - write orphan nodes until there are no more to commit.
330 * @c: UBIFS file-system description object 330 * @c: UBIFS file-system description object
331 * @atomic: write atomically 331 * @atomic: write atomically
332 * 332 *
333 * This function writes orph nodes for all the orphans to commit. On success, 333 * This function writes orphan nodes for all the orphans to commit. On success,
334 * %0 is returned, otherwise a negative error code is returned. 334 * %0 is returned, otherwise a negative error code is returned.
335 */ 335 */
336static int write_orph_nodes(struct ubifs_info *c, int atomic) 336static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
478} 478}
479 479
480/** 480/**
481 * clear_orphans - erase all LEBs used for orphans. 481 * ubifs_clear_orphans - erase all LEBs used for orphans.
482 * @c: UBIFS file-system description object 482 * @c: UBIFS file-system description object
483 * 483 *
484 * If recovery is not required, then the orphans from the previous session 484 * If recovery is not required, then the orphans from the previous session
485 * are not needed. This function locates the LEBs used to record 485 * are not needed. This function locates the LEBs used to record
486 * orphans, and un-maps them. 486 * orphans, and un-maps them.
487 */ 487 */
488static int clear_orphans(struct ubifs_info *c) 488int ubifs_clear_orphans(struct ubifs_info *c)
489{ 489{
490 int lnum, err; 490 int lnum, err;
491 491
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
547 * do_kill_orphans - remove orphan inodes from the index. 547 * do_kill_orphans - remove orphan inodes from the index.
548 * @c: UBIFS file-system description object 548 * @c: UBIFS file-system description object
549 * @sleb: scanned LEB 549 * @sleb: scanned LEB
550 * @last_cmt_no: cmt_no of last orph node read is passed and returned here 550 * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
551 * @outofdate: whether the LEB is out of date is returned here 551 * @outofdate: whether the LEB is out of date is returned here
552 * @last_flagged: whether the end orph node is encountered 552 * @last_flagged: whether the end orphan node is encountered
553 * 553 *
554 * This function is a helper to the 'kill_orphans()' function. It goes through 554 * This function is a helper to the 'kill_orphans()' function. It goes through
555 * every orphan node in a LEB and for every inode number recorded, removes 555 * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
580 /* 580 /*
581 * The commit number on the master node may be less, because 581 * The commit number on the master node may be less, because
582 * of a failed commit. If there are several failed commits in a 582 * of a failed commit. If there are several failed commits in a
583 * row, the commit number written on orph nodes will continue to 583 * row, the commit number written on orphan nodes will continue
584 * increase (because the commit number is adjusted here) even 584 * to increase (because the commit number is adjusted here) even
585 * though the commit number on the master node stays the same 585 * though the commit number on the master node stays the same
586 * because the master node has not been re-written. 586 * because the master node has not been re-written.
587 */ 587 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
589 c->cmt_no = cmt_no; 589 c->cmt_no = cmt_no;
590 if (cmt_no < *last_cmt_no && *last_flagged) { 590 if (cmt_no < *last_cmt_no && *last_flagged) {
591 /* 591 /*
592 * The last orph node had a higher commit number and was 592 * The last orphan node had a higher commit number and
593 * flagged as the last written for that commit number. 593 * was flagged as the last written for that commit
594 * That makes this orph node, out of date. 594 * number. That makes this orphan node, out of date.
595 */ 595 */
596 if (!first) { 596 if (!first) {
597 ubifs_err("out of order commit number %llu in " 597 ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
658 /* 658 /*
659 * Orph nodes always start at c->orph_first and are written to each 659 * Orph nodes always start at c->orph_first and are written to each
660 * successive LEB in turn. Generally unused LEBs will have been unmapped 660 * successive LEB in turn. Generally unused LEBs will have been unmapped
661 * but may contain out of date orph nodes if the unmap didn't go 661 * but may contain out of date orphan nodes if the unmap didn't go
662 * through. In addition, the last orph node written for each commit is 662 * through. In addition, the last orphan node written for each commit is
663 * marked (top bit of orph->cmt_no is set to 1). It is possible that 663 * marked (top bit of orph->cmt_no is set to 1). It is possible that
664 * there are orph nodes from the next commit (i.e. the commit did not 664 * there are orphan nodes from the next commit (i.e. the commit did not
665 * complete successfully). In that case, no orphans will have been lost 665 * complete successfully). In that case, no orphans will have been lost
666 * due to the way that orphans are written, and any orphans added will 666 * due to the way that orphans are written, and any orphans added will
667 * be valid orphans anyway and so can be deleted. 667 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
718 if (unclean) 718 if (unclean)
719 err = kill_orphans(c); 719 err = kill_orphans(c);
720 else if (!read_only) 720 else if (!read_only)
721 err = clear_orphans(c); 721 err = ubifs_clear_orphans(c);
722 722
723 return err; 723 return err;
724} 724}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 89556ee72518..1182b66a5491 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
397 buf->f_namelen = UBIFS_MAX_NLEN; 397 buf->f_namelen = UBIFS_MAX_NLEN;
398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); 398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); 399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
400 ubifs_assert(buf->f_bfree <= c->block_cnt);
400 return 0; 401 return 0;
401} 402}
402 403
@@ -432,33 +433,24 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
432 int i, err; 433 int i, err;
433 struct ubifs_info *c = sb->s_fs_info; 434 struct ubifs_info *c = sb->s_fs_info;
434 struct writeback_control wbc = { 435 struct writeback_control wbc = {
435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 436 .sync_mode = WB_SYNC_ALL,
436 .range_start = 0, 437 .range_start = 0,
437 .range_end = LLONG_MAX, 438 .range_end = LLONG_MAX,
438 .nr_to_write = LONG_MAX, 439 .nr_to_write = LONG_MAX,
439 }; 440 };
440 441
441 /* 442 /*
442 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an 443 * Zero @wait is just an advisory thing to help the file system shove
443 * advisory thing to help the file system shove lots of data into the 444 * lots of data into the queues, and there will be the second
444 * queues. If some gets missed then it'll be picked up on the second
445 * '->sync_fs()' call, with non-zero @wait. 445 * '->sync_fs()' call, with non-zero @wait.
446 */ 446 */
447 if (!wait)
448 return 0;
447 449
448 if (sb->s_flags & MS_RDONLY) 450 if (sb->s_flags & MS_RDONLY)
449 return 0; 451 return 0;
450 452
451 /* 453 /*
452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
453 * do this if it waits for an already running commit.
454 */
455 for (i = 0; i < c->jhead_cnt; i++) {
456 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
457 if (err)
458 return err;
459 }
460
461 /*
462 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and 454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
463 * pages, so synchronize them first, then commit the journal. Strictly 455 * pages, so synchronize them first, then commit the journal. Strictly
464 * speaking, it is not necessary to commit the journal here, 456 * speaking, it is not necessary to commit the journal here,
@@ -469,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
469 */ 461 */
470 generic_sync_sb_inodes(sb, &wbc); 462 generic_sync_sb_inodes(sb, &wbc);
471 463
464 /*
465 * Synchronize write buffers, because 'ubifs_run_commit()' does not
466 * do this if it waits for an already running commit.
467 */
468 for (i = 0; i < c->jhead_cnt; i++) {
469 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
470 if (err)
471 return err;
472 }
473
472 err = ubifs_run_commit(c); 474 err = ubifs_run_commit(c);
473 if (err) 475 if (err)
474 return err; 476 return err;
@@ -572,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c)
572 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; 574 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
573 575
574 /* 576 /*
575 * Initialize dead and dark LEB space watermarks. 577 * Initialize dead and dark LEB space watermarks. See gc.c for comments
576 * 578 * about these values.
577 * Dead space is the space which cannot be used. Its watermark is
578 * equivalent to min. I/O unit or minimum node size if it is greater
579 * then min. I/O unit.
580 *
581 * Dark space is the space which might be used, or might not, depending
582 * on which node should be written to the LEB. Its watermark is
583 * equivalent to maximum UBIFS node size.
584 */ 579 */
585 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 580 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
586 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 581 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
@@ -741,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c)
741 * take_gc_lnum - reserve GC LEB. 736 * take_gc_lnum - reserve GC LEB.
742 * @c: UBIFS file-system description object 737 * @c: UBIFS file-system description object
743 * 738 *
744 * This function ensures that the LEB reserved for garbage collection is 739 * This function ensures that the LEB reserved for garbage collection is marked
745 * unmapped and is marked as "taken" in lprops. We also have to set free space 740 * as "taken" in lprops. We also have to set free space to LEB size and dirty
746 * to LEB size and dirty space to zero, because lprops may contain out-of-date 741 * space to zero, because lprops may contain out-of-date information if the
747 * information if the file-system was un-mounted before it has been committed. 742 * file-system was un-mounted before it has been committed. This function
748 * This function returns zero in case of success and a negative error code in 743 * returns zero in case of success and a negative error code in case of
749 * case of failure. 744 * failure.
750 */ 745 */
751static int take_gc_lnum(struct ubifs_info *c) 746static int take_gc_lnum(struct ubifs_info *c)
752{ 747{
@@ -757,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
757 return -EINVAL; 752 return -EINVAL;
758 } 753 }
759 754
760 err = ubifs_leb_unmap(c, c->gc_lnum);
761 if (err)
762 return err;
763
764 /* And we have to tell lprops that this LEB is taken */ 755 /* And we have to tell lprops that this LEB is taken */
765 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, 756 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
766 LPROPS_TAKEN, 0, 0); 757 LPROPS_TAKEN, 0, 0);
@@ -966,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
966 957
967 token = match_token(p, tokens, args); 958 token = match_token(p, tokens, args);
968 switch (token) { 959 switch (token) {
960 /*
961 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
962 * We accepte them in order to be backware-compatible. But this
963 * should be removed at some point.
964 */
969 case Opt_fast_unmount: 965 case Opt_fast_unmount:
970 c->mount_opts.unmount_mode = 2; 966 c->mount_opts.unmount_mode = 2;
971 c->fast_unmount = 1;
972 break; 967 break;
973 case Opt_norm_unmount: 968 case Opt_norm_unmount:
974 c->mount_opts.unmount_mode = 1; 969 c->mount_opts.unmount_mode = 1;
975 c->fast_unmount = 0;
976 break; 970 break;
977 case Opt_bulk_read: 971 case Opt_bulk_read:
978 c->mount_opts.bulk_read = 2; 972 c->mount_opts.bulk_read = 2;
@@ -1094,12 +1088,7 @@ static int check_free_space(struct ubifs_info *c)
1094 ubifs_err("insufficient free space to mount in read/write mode"); 1088 ubifs_err("insufficient free space to mount in read/write mode");
1095 dbg_dump_budg(c); 1089 dbg_dump_budg(c);
1096 dbg_dump_lprops(c); 1090 dbg_dump_lprops(c);
1097 /* 1091 return -ENOSPC;
1098 * We return %-EINVAL instead of %-ENOSPC because it seems to
1099 * be the closest error code mentioned in the mount function
1100 * documentation.
1101 */
1102 return -EINVAL;
1103 } 1092 }
1104 return 0; 1093 return 0;
1105} 1094}
@@ -1286,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c)
1286 if (err) 1275 if (err)
1287 goto out_orphans; 1276 goto out_orphans;
1288 err = ubifs_rcvry_gc_commit(c); 1277 err = ubifs_rcvry_gc_commit(c);
1289 } else 1278 } else {
1290 err = take_gc_lnum(c); 1279 err = take_gc_lnum(c);
1291 if (err) 1280 if (err)
1292 goto out_orphans; 1281 goto out_orphans;
1282
1283 /*
1284 * GC LEB may contain garbage if there was an unclean
1285 * reboot, and it should be un-mapped.
1286 */
1287 err = ubifs_leb_unmap(c, c->gc_lnum);
1288 if (err)
1289 return err;
1290 }
1293 1291
1294 err = dbg_check_lprops(c); 1292 err = dbg_check_lprops(c);
1295 if (err) 1293 if (err)
@@ -1298,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c)
1298 err = ubifs_recover_size(c); 1296 err = ubifs_recover_size(c);
1299 if (err) 1297 if (err)
1300 goto out_orphans; 1298 goto out_orphans;
1299 } else {
1300 /*
1301 * Even if we mount read-only, we have to set space in GC LEB
1302 * to proper value because this affects UBIFS free space
1303 * reporting. We do not want to have a situation when
1304 * re-mounting from R/O to R/W changes amount of free space.
1305 */
1306 err = take_gc_lnum(c);
1307 if (err)
1308 goto out_orphans;
1301 } 1309 }
1302 1310
1303 spin_lock(&ubifs_infos_lock); 1311 spin_lock(&ubifs_infos_lock);
@@ -1310,14 +1318,17 @@ static int mount_ubifs(struct ubifs_info *c)
1310 else { 1318 else {
1311 c->need_recovery = 0; 1319 c->need_recovery = 0;
1312 ubifs_msg("recovery completed"); 1320 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */
1322 ubifs_assert(c->lst.taken_empty_lebs == 1);
1313 } 1323 }
1314 } 1324 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1);
1315 1326
1316 err = dbg_debugfs_init_fs(c); 1327 err = dbg_check_filesystem(c);
1317 if (err) 1328 if (err)
1318 goto out_infos; 1329 goto out_infos;
1319 1330
1320 err = dbg_check_filesystem(c); 1331 err = dbg_debugfs_init_fs(c);
1321 if (err) 1332 if (err)
1322 goto out_infos; 1333 goto out_infos;
1323 1334
@@ -1351,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
1351 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], 1362 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1352 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], 1363 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1353 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); 1364 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1354 dbg_msg("fast unmount: %d", c->fast_unmount);
1355 dbg_msg("big_lpt %d", c->big_lpt); 1365 dbg_msg("big_lpt %d", c->big_lpt);
1356 dbg_msg("log LEBs: %d (%d - %d)", 1366 dbg_msg("log LEBs: %d (%d - %d)",
1357 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1367 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1475,10 +1485,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1475{ 1485{
1476 int err, lnum; 1486 int err, lnum;
1477 1487
1478 if (c->ro_media)
1479 return -EINVAL;
1480
1481 mutex_lock(&c->umount_mutex); 1488 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c);
1482 c->remounting_rw = 1; 1490 c->remounting_rw = 1;
1483 c->always_chk_crc = 1; 1491 c->always_chk_crc = 1;
1484 1492
@@ -1514,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1514 err = ubifs_recover_inl_heads(c, c->sbuf); 1522 err = ubifs_recover_inl_heads(c, c->sbuf);
1515 if (err) 1523 if (err)
1516 goto out; 1524 goto out;
1525 } else {
1526 /* A readonly mount is not allowed to have orphans */
1527 ubifs_assert(c->tot_orphans == 0);
1528 err = ubifs_clear_orphans(c);
1529 if (err)
1530 goto out;
1517 } 1531 }
1518 1532
1519 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { 1533 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
@@ -1569,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1569 if (c->need_recovery) 1583 if (c->need_recovery)
1570 err = ubifs_rcvry_gc_commit(c); 1584 err = ubifs_rcvry_gc_commit(c);
1571 else 1585 else
1572 err = take_gc_lnum(c); 1586 err = ubifs_leb_unmap(c, c->gc_lnum);
1573 if (err) 1587 if (err)
1574 goto out; 1588 goto out;
1575 1589
@@ -1582,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1582 c->vfs_sb->s_flags &= ~MS_RDONLY; 1596 c->vfs_sb->s_flags &= ~MS_RDONLY;
1583 c->remounting_rw = 0; 1597 c->remounting_rw = 0;
1584 c->always_chk_crc = 0; 1598 c->always_chk_crc = 0;
1599 err = dbg_check_space_info(c);
1585 mutex_unlock(&c->umount_mutex); 1600 mutex_unlock(&c->umount_mutex);
1586 return 0; 1601 return err;
1587 1602
1588out: 1603out:
1589 vfree(c->orph_buf); 1604 vfree(c->orph_buf);
@@ -1603,43 +1618,18 @@ out:
1603} 1618}
1604 1619
1605/** 1620/**
1606 * commit_on_unmount - commit the journal when un-mounting.
1607 * @c: UBIFS file-system description object
1608 *
1609 * This function is called during un-mounting and re-mounting, and it commits
1610 * the journal unless the "fast unmount" mode is enabled.
1611 */
1612static void commit_on_unmount(struct ubifs_info *c)
1613{
1614 struct super_block *sb = c->vfs_sb;
1615 long long bud_bytes;
1616
1617 /*
1618 * This function is called before the background thread is stopped, so
1619 * we may race with ongoing commit, which means we have to take
1620 * @c->bud_lock to access @c->bud_bytes.
1621 */
1622 spin_lock(&c->buds_lock);
1623 bud_bytes = c->bud_bytes;
1624 spin_unlock(&c->buds_lock);
1625
1626 if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
1627 ubifs_run_commit(c);
1628}
1629
1630/**
1631 * ubifs_remount_ro - re-mount in read-only mode. 1621 * ubifs_remount_ro - re-mount in read-only mode.
1632 * @c: UBIFS file-system description object 1622 * @c: UBIFS file-system description object
1633 * 1623 *
1634 * We rely on VFS to have stopped writing. Possibly the background thread could 1624 * We assume VFS has stopped writing. Possibly the background thread could be
1635 * be running a commit, however kthread_stop will wait in that case. 1625 * running a commit, however kthread_stop will wait in that case.
1636 */ 1626 */
1637static void ubifs_remount_ro(struct ubifs_info *c) 1627static void ubifs_remount_ro(struct ubifs_info *c)
1638{ 1628{
1639 int i, err; 1629 int i, err;
1640 1630
1641 ubifs_assert(!c->need_recovery); 1631 ubifs_assert(!c->need_recovery);
1642 commit_on_unmount(c); 1632 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
1643 1633
1644 mutex_lock(&c->umount_mutex); 1634 mutex_lock(&c->umount_mutex);
1645 if (c->bgt) { 1635 if (c->bgt) {
@@ -1647,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1647 c->bgt = NULL; 1637 c->bgt = NULL;
1648 } 1638 }
1649 1639
1640 dbg_save_space_info(c);
1641
1650 for (i = 0; i < c->jhead_cnt; i++) { 1642 for (i = 0; i < c->jhead_cnt; i++) {
1651 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1643 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1652 del_timer_sync(&c->jheads[i].wbuf.timer); 1644 del_timer_sync(&c->jheads[i].wbuf.timer);
1653 } 1645 }
1654 1646
1655 if (!c->ro_media) { 1647 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1656 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1648 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1657 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1649 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1658 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1650 err = ubifs_write_master(c);
1659 err = ubifs_write_master(c); 1651 if (err)
1660 if (err) 1652 ubifs_ro_mode(c, err);
1661 ubifs_ro_mode(c, err);
1662 }
1663 1653
1664 ubifs_destroy_idx_gc(c);
1665 free_wbufs(c); 1654 free_wbufs(c);
1666 vfree(c->orph_buf); 1655 vfree(c->orph_buf);
1667 c->orph_buf = NULL; 1656 c->orph_buf = NULL;
1668 vfree(c->ileb_buf); 1657 vfree(c->ileb_buf);
1669 c->ileb_buf = NULL; 1658 c->ileb_buf = NULL;
1670 ubifs_lpt_free(c, 1); 1659 ubifs_lpt_free(c, 1);
1660 err = dbg_check_space_info(c);
1661 if (err)
1662 ubifs_ro_mode(c, err);
1671 mutex_unlock(&c->umount_mutex); 1663 mutex_unlock(&c->umount_mutex);
1672} 1664}
1673 1665
@@ -1760,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1760 } 1752 }
1761 1753
1762 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1754 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1755 if (c->ro_media) {
1756 ubifs_msg("cannot re-mount due to prior errors");
1757 return -EROFS;
1758 }
1763 err = ubifs_remount_rw(c); 1759 err = ubifs_remount_rw(c);
1764 if (err) 1760 if (err)
1765 return err; 1761 return err;
1766 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) 1762 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1763 if (c->ro_media) {
1764 ubifs_msg("cannot re-mount due to prior errors");
1765 return -EROFS;
1766 }
1767 ubifs_remount_ro(c); 1767 ubifs_remount_ro(c);
1768 }
1768 1769
1769 if (c->bulk_read == 1) 1770 if (c->bulk_read == 1)
1770 bu_init(c); 1771 bu_init(c);
@@ -1774,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1774 c->bu.buf = NULL; 1775 c->bu.buf = NULL;
1775 } 1776 }
1776 1777
1778 ubifs_assert(c->lst.taken_empty_lebs == 1);
1777 return 0; 1779 return 0;
1778} 1780}
1779 1781
1780struct super_operations ubifs_super_operations = { 1782const struct super_operations ubifs_super_operations = {
1781 .alloc_inode = ubifs_alloc_inode, 1783 .alloc_inode = ubifs_alloc_inode,
1782 .destroy_inode = ubifs_destroy_inode, 1784 .destroy_inode = ubifs_destroy_inode,
1783 .put_super = ubifs_put_super, 1785 .put_super = ubifs_put_super,
@@ -2044,15 +2046,6 @@ out_close:
2044 2046
2045static void ubifs_kill_sb(struct super_block *sb) 2047static void ubifs_kill_sb(struct super_block *sb)
2046{ 2048{
2047 struct ubifs_info *c = sb->s_fs_info;
2048
2049 /*
2050 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
2051 * in order to be outside BKL.
2052 */
2053 if (sb->s_root)
2054 commit_on_unmount(c);
2055 /* The un-mount routine is actually done in put_super() */
2056 generic_shutdown_super(sb); 2049 generic_shutdown_super(sb);
2057} 2050}
2058 2051
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f7e36f545527..fa28a84c6a1b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
443 * This function performs that same function as ubifs_read_node except that 443 * This function performs that same function as ubifs_read_node except that
444 * it does not require that there is actually a node present and instead 444 * it does not require that there is actually a node present and instead
445 * the return code indicates if a node was read. 445 * the return code indicates if a node was read.
446 *
447 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
448 * is true (it is controlled by corresponding mount option). However, if
449 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
450 * checked.
446 */ 451 */
447static int try_read_node(const struct ubifs_info *c, void *buf, int type, 452static int try_read_node(const struct ubifs_info *c, void *buf, int type,
448 int len, int lnum, int offs) 453 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
470 if (node_len != len) 475 if (node_len != len)
471 return 0; 476 return 0;
472 477
473 if (type == UBIFS_DATA_NODE && !c->always_chk_crc) 478 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
474 if (c->no_chk_data_crc) 479 return 1;
475 return 0;
476 480
477 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 481 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
478 node_crc = le32_to_cpu(ch->crc); 482 node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
1506 * 1510 *
1507 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function 1511 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
1508 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares 1512 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
1509 * maxumum possible amount of nodes for bulk-read. 1513 * maximum possible amount of nodes for bulk-read.
1510 */ 1514 */
1511int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) 1515int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
1512{ 1516{
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2a4cc66d03..039a68bee29a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -426,9 +426,9 @@ struct ubifs_unclean_leb {
426 * LEB properties flags. 426 * LEB properties flags.
427 * 427 *
428 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
429 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
431 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
433 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken 434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
@@ -961,7 +961,6 @@ struct ubifs_debug_info;
961 * @cs_lock: commit state lock 961 * @cs_lock: commit state lock
962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
963 * 963 *
964 * @fast_unmount: do not run journal commit before un-mounting
965 * @big_lpt: flag that LPT is too big to write whole during commit 964 * @big_lpt: flag that LPT is too big to write whole during commit
966 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 965 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
967 * recovery) 966 * recovery)
@@ -1202,7 +1201,6 @@ struct ubifs_info {
1202 spinlock_t cs_lock; 1201 spinlock_t cs_lock;
1203 wait_queue_head_t cmt_wq; 1202 wait_queue_head_t cmt_wq;
1204 1203
1205 unsigned int fast_unmount:1;
1206 unsigned int big_lpt:1; 1204 unsigned int big_lpt:1;
1207 unsigned int no_chk_data_crc:1; 1205 unsigned int no_chk_data_crc:1;
1208 unsigned int bulk_read:1; 1206 unsigned int bulk_read:1;
@@ -1405,13 +1403,13 @@ extern struct list_head ubifs_infos;
1405extern spinlock_t ubifs_infos_lock; 1403extern spinlock_t ubifs_infos_lock;
1406extern atomic_long_t ubifs_clean_zn_cnt; 1404extern atomic_long_t ubifs_clean_zn_cnt;
1407extern struct kmem_cache *ubifs_inode_slab; 1405extern struct kmem_cache *ubifs_inode_slab;
1408extern struct super_operations ubifs_super_operations; 1406extern const struct super_operations ubifs_super_operations;
1409extern struct address_space_operations ubifs_file_address_operations; 1407extern const struct address_space_operations ubifs_file_address_operations;
1410extern struct file_operations ubifs_file_operations; 1408extern const struct file_operations ubifs_file_operations;
1411extern struct inode_operations ubifs_file_inode_operations; 1409extern const struct inode_operations ubifs_file_inode_operations;
1412extern struct file_operations ubifs_dir_operations; 1410extern const struct file_operations ubifs_dir_operations;
1413extern struct inode_operations ubifs_dir_inode_operations; 1411extern const struct inode_operations ubifs_dir_inode_operations;
1414extern struct inode_operations ubifs_symlink_inode_operations; 1412extern const struct inode_operations ubifs_symlink_inode_operations;
1415extern struct backing_dev_info ubifs_backing_dev_info; 1413extern struct backing_dev_info ubifs_backing_dev_info;
1416extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; 1414extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1417 1415
@@ -1428,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
1428int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, 1426int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
1429 int offs, int dtype); 1427 int offs, int dtype);
1430int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 1428int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
1431 int offs, int quiet, int chk_crc); 1429 int offs, int quiet, int must_chk_crc);
1432void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); 1430void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
1433void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); 1431void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
1434int ubifs_io_init(struct ubifs_info *c); 1432int ubifs_io_init(struct ubifs_info *c);
@@ -1495,6 +1493,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1495void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1493void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1496 struct ubifs_budget_req *req); 1494 struct ubifs_budget_req *req);
1497long long ubifs_get_free_space(struct ubifs_info *c); 1495long long ubifs_get_free_space(struct ubifs_info *c);
1496long long ubifs_get_free_space_nolock(struct ubifs_info *c);
1498int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1497int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1499void ubifs_convert_page_budget(struct ubifs_info *c); 1498void ubifs_convert_page_budget(struct ubifs_info *c);
1500long long ubifs_reported_space(const struct ubifs_info *c, long long free); 1499long long ubifs_reported_space(const struct ubifs_info *c, long long free);
@@ -1603,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
1603int ubifs_orphan_start_commit(struct ubifs_info *c); 1602int ubifs_orphan_start_commit(struct ubifs_info *c);
1604int ubifs_orphan_end_commit(struct ubifs_info *c); 1603int ubifs_orphan_end_commit(struct ubifs_info *c);
1605int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); 1604int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
1605int ubifs_clear_orphans(struct ubifs_info *c);
1606 1606
1607/* lpt.c */ 1607/* lpt.c */
1608int ubifs_calc_lpt_geom(struct ubifs_info *c); 1608int ubifs_calc_lpt_geom(struct ubifs_info *c);
@@ -1646,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
1646 const struct ubifs_lprops *lp, 1646 const struct ubifs_lprops *lp,
1647 int free, int dirty, int flags, 1647 int free, int dirty, int flags,
1648 int idx_gc_cnt); 1648 int idx_gc_cnt);
1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); 1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, 1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
1651 int cat); 1651 int cat);
1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, 1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c26..a608e72fa405 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
371 /* flush inodes and push all remaining buffers out to disk */ 371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp); 372 xfs_quiesce_fs(mp);
373 373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); 374 /*
375 * Just warn here till VFS can correctly support
376 * read-only remount without racing.
377 */
378 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
375 379
376 /* Push the superblock and write an unmount record */ 380 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1); 381 error = xfs_log_sbcount(mp, 1);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee713492..f8278cfcc1d3 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
55 struct file *file, *target_file; 55 struct file *file, *target_file;
56 int error = 0; 56 int error = 0;
57 57
58 sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
59 if (!sxp) {
60 error = XFS_ERROR(ENOMEM);
61 goto out;
62 }
63
64 /* Pull information for the target fd */ 58 /* Pull information for the target fd */
65 file = fget((int)sxp->sx_fdtarget); 59 file = fget((int)sxp->sx_fdtarget);
66 if (!file) { 60 if (!file) {
67 error = XFS_ERROR(EINVAL); 61 error = XFS_ERROR(EINVAL);
68 goto out_free_sxp; 62 goto out;
69 } 63 }
70 64
71 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { 65 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
109 fput(target_file); 103 fput(target_file);
110 out_put_file: 104 out_put_file:
111 fput(file); 105 fput(file);
112 out_free_sxp:
113 kmem_free(sxp);
114 out: 106 out:
115 return error; 107 return error;
116} 108}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94c..b1047de2fffd 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void xlog_recover_check_summary(xlog_t *);
70xfs_buf_t * 70xfs_buf_t *
71xlog_get_bp( 71xlog_get_bp(
72 xlog_t *log, 72 xlog_t *log,
73 int num_bblks) 73 int nbblks)
74{ 74{
75 ASSERT(num_bblks > 0); 75 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
76 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
77 XFS_ERROR_REPORT("xlog_get_bp(1)",
78 XFS_ERRLEVEL_HIGH, log->l_mp);
79 return NULL;
80 }
76 81
77 if (log->l_sectbb_log) { 82 if (log->l_sectbb_log) {
78 if (num_bblks > 1) 83 if (nbblks > 1)
79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 84 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 85 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
81 } 86 }
82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 87 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
83} 88}
84 89
85void 90void
@@ -102,6 +107,13 @@ xlog_bread(
102{ 107{
103 int error; 108 int error;
104 109
110 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
111 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
112 XFS_ERROR_REPORT("xlog_bread(1)",
113 XFS_ERRLEVEL_HIGH, log->l_mp);
114 return EFSCORRUPTED;
115 }
116
105 if (log->l_sectbb_log) { 117 if (log->l_sectbb_log) {
106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 118 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 119 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
139{ 151{
140 int error; 152 int error;
141 153
154 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
155 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
156 XFS_ERROR_REPORT("xlog_bwrite(1)",
157 XFS_ERRLEVEL_HIGH, log->l_mp);
158 return EFSCORRUPTED;
159 }
160
142 if (log->l_sectbb_log) { 161 if (log->l_sectbb_log) {
143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 162 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 163 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);