aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bio.c7
-rw-r--r--fs/btrfs/async-thread.c117
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/ctree.h11
-rw-r--r--fs/btrfs/delayed-inode.c4
-rw-r--r--fs/btrfs/disk-io.c181
-rw-r--r--fs/btrfs/extent-tree.c290
-rw-r--r--fs/btrfs/extent_io.c60
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file.c8
-rw-r--r--fs/btrfs/free-space-cache.c65
-rw-r--r--fs/btrfs/inode.c188
-rw-r--r--fs/btrfs/ioctl.c23
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/scrub.c15
-rw-r--r--fs/btrfs/super.c76
-rw-r--r--fs/btrfs/transaction.c8
-rw-r--r--fs/btrfs/volumes.c10
-rw-r--r--fs/btrfs/volumes.h6
-rw-r--r--fs/ceph/addr.c8
-rw-r--r--fs/ceph/caps.c187
-rw-r--r--fs/ceph/dir.c26
-rw-r--r--fs/ceph/file.c23
-rw-r--r--fs/ceph/inode.c62
-rw-r--r--fs/ceph/ioctl.c4
-rw-r--r--fs/ceph/mds_client.c33
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/snap.c16
-rw-r--r--fs/ceph/super.c8
-rw-r--r--fs/ceph/super.h31
-rw-r--r--fs/ceph/xattr.c42
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/file.c26
-rw-r--r--fs/cifs/readdir.c10
-rw-r--r--fs/cifs/smbencrypt.c6
-rw-r--r--fs/configfs/inode.c2
-rw-r--r--fs/configfs/mount.c36
-rw-r--r--fs/dcache.c82
-rw-r--r--fs/ecryptfs/crypto.c26
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c23
-rw-r--r--fs/ecryptfs/inode.c52
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/extents.c3
-rw-r--r--fs/ext4/inode.c55
-rw-r--r--fs/ext4/page-io.c12
-rw-r--r--fs/ext4/super.c23
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fuse/dev.c3
-rw-r--r--fs/fuse/file.c6
-rw-r--r--fs/fuse/inode.c24
-rw-r--r--fs/minix/bitmap.c55
-rw-r--r--fs/minix/inode.c25
-rw-r--r--fs/minix/minix.h11
-rw-r--r--fs/namespace.c52
-rw-r--r--fs/ncpfs/inode.c8
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/file.c91
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/pnfs.c26
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c14
-rw-r--r--fs/nfs/super.c37
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c69
-rw-r--r--fs/ocfs2/aops.h14
-rw-r--r--fs/ocfs2/cluster/heartbeat.c194
-rw-r--r--fs/ocfs2/cluster/netdebug.c102
-rw-r--r--fs/ocfs2/cluster/tcp.c138
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/dir.c3
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h56
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmlock.c54
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c175
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c164
-rw-r--r--fs/ocfs2/dlm/dlmthread.c16
-rw-r--r--fs/ocfs2/dlmglue.c21
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h2
-rw-r--r--fs/ocfs2/file.c96
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/inode.h3
-rw-r--r--fs/ocfs2/ioctl.c11
-rw-r--r--fs/ocfs2/journal.c23
-rw-r--r--fs/ocfs2/journal.h5
-rw-r--r--fs/ocfs2/mmap.c53
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/ocfs2.h51
-rw-r--r--fs/ocfs2/quota_local.c23
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c71
-rw-r--r--fs/ocfs2/super.c25
-rw-r--r--fs/ocfs2/xattr.c10
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/pstore/platform.c13
-rw-r--r--fs/seq_file.c6
-rw-r--r--fs/ubifs/super.c18
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_attr_leaf.c64
-rw-r--r--fs/xfs/xfs_bmap.c20
-rw-r--r--fs/xfs/xfs_export.c8
-rw-r--r--fs/xfs/xfs_inode.c21
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_log.c348
-rw-r--r--fs/xfs/xfs_qm.c3
-rw-r--r--fs/xfs/xfs_sync.c11
-rw-r--r--fs/xfs/xfs_trace.h12
115 files changed, 2688 insertions, 1661 deletions
diff --git a/fs/bio.c b/fs/bio.c
index 41c93c722244..b1fe82cf88cf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -337,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
337 * RETURNS: 337 * RETURNS:
338 * Pointer to new bio on success, NULL on failure. 338 * Pointer to new bio on success, NULL on failure.
339 */ 339 */
340struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) 340struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
341{ 341{
342 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); 342 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
343 343
@@ -365,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
365 * %__GFP_WAIT, the allocation is guaranteed to succeed. 365 * %__GFP_WAIT, the allocation is guaranteed to succeed.
366 * 366 *
367 **/ 367 **/
368struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) 368struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
369{ 369{
370 struct bio *bio; 370 struct bio *bio;
371 371
@@ -696,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
696 kfree(bmd); 696 kfree(bmd);
697} 697}
698 698
699static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 699static struct bio_map_data *bio_alloc_map_data(int nr_segs,
700 unsigned int iov_count,
700 gfp_t gfp_mask) 701 gfp_t gfp_mask)
701{ 702{
702 struct bio_map_data *bmd; 703 struct bio_map_data *bmd;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..cb97174e2366 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
64 int idle; 64 int idle;
65}; 65};
66 66
67static int __btrfs_start_workers(struct btrfs_workers *workers);
68
67/* 69/*
68 * btrfs_start_workers uses kthread_run, which can block waiting for memory 70 * btrfs_start_workers uses kthread_run, which can block waiting for memory
69 * for a very long time. It will actually throttle on page writeback, 71 * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
88{ 90{
89 struct worker_start *start; 91 struct worker_start *start;
90 start = container_of(work, struct worker_start, work); 92 start = container_of(work, struct worker_start, work);
91 btrfs_start_workers(start->queue, 1); 93 __btrfs_start_workers(start->queue);
92 kfree(start); 94 kfree(start);
93} 95}
94 96
95static int start_new_worker(struct btrfs_workers *queue)
96{
97 struct worker_start *start;
98 int ret;
99
100 start = kzalloc(sizeof(*start), GFP_NOFS);
101 if (!start)
102 return -ENOMEM;
103
104 start->work.func = start_new_worker_func;
105 start->queue = queue;
106 ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
107 if (ret)
108 kfree(start);
109 return ret;
110}
111
112/* 97/*
113 * helper function to move a thread onto the idle list after it 98 * helper function to move a thread onto the idle list after it
114 * has finished some requests. 99 * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
153static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 138static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
154{ 139{
155 struct btrfs_workers *workers = worker->workers; 140 struct btrfs_workers *workers = worker->workers;
141 struct worker_start *start;
156 unsigned long flags; 142 unsigned long flags;
157 143
158 rmb(); 144 rmb();
159 if (!workers->atomic_start_pending) 145 if (!workers->atomic_start_pending)
160 return; 146 return;
161 147
148 start = kzalloc(sizeof(*start), GFP_NOFS);
149 if (!start)
150 return;
151
152 start->work.func = start_new_worker_func;
153 start->queue = workers;
154
162 spin_lock_irqsave(&workers->lock, flags); 155 spin_lock_irqsave(&workers->lock, flags);
163 if (!workers->atomic_start_pending) 156 if (!workers->atomic_start_pending)
164 goto out; 157 goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
170 163
171 workers->num_workers_starting += 1; 164 workers->num_workers_starting += 1;
172 spin_unlock_irqrestore(&workers->lock, flags); 165 spin_unlock_irqrestore(&workers->lock, flags);
173 start_new_worker(workers); 166 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
174 return; 167 return;
175 168
176out: 169out:
170 kfree(start);
177 spin_unlock_irqrestore(&workers->lock, flags); 171 spin_unlock_irqrestore(&workers->lock, flags);
178} 172}
179 173
@@ -331,7 +325,7 @@ again:
331 run_ordered_completions(worker->workers, work); 325 run_ordered_completions(worker->workers, work);
332 326
333 check_pending_worker_creates(worker); 327 check_pending_worker_creates(worker);
334 328 cond_resched();
335 } 329 }
336 330
337 spin_lock_irq(&worker->lock); 331 spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
462 * starts new worker threads. This does not enforce the max worker 456 * starts new worker threads. This does not enforce the max worker
463 * count in case you need to temporarily go past it. 457 * count in case you need to temporarily go past it.
464 */ 458 */
465static int __btrfs_start_workers(struct btrfs_workers *workers, 459static int __btrfs_start_workers(struct btrfs_workers *workers)
466 int num_workers)
467{ 460{
468 struct btrfs_worker_thread *worker; 461 struct btrfs_worker_thread *worker;
469 int ret = 0; 462 int ret = 0;
470 int i;
471 463
472 for (i = 0; i < num_workers; i++) { 464 worker = kzalloc(sizeof(*worker), GFP_NOFS);
473 worker = kzalloc(sizeof(*worker), GFP_NOFS); 465 if (!worker) {
474 if (!worker) { 466 ret = -ENOMEM;
475 ret = -ENOMEM; 467 goto fail;
476 goto fail; 468 }
477 }
478 469
479 INIT_LIST_HEAD(&worker->pending); 470 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending); 471 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list); 472 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock); 473 spin_lock_init(&worker->lock);
483 474
484 atomic_set(&worker->num_pending, 0); 475 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1); 476 atomic_set(&worker->refs, 1);
486 worker->workers = workers; 477 worker->workers = workers;
487 worker->task = kthread_run(worker_loop, worker, 478 worker->task = kthread_run(worker_loop, worker,
488 "btrfs-%s-%d", workers->name, 479 "btrfs-%s-%d", workers->name,
489 workers->num_workers + i); 480 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) { 481 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task); 482 ret = PTR_ERR(worker->task);
492 kfree(worker); 483 kfree(worker);
493 goto fail; 484 goto fail;
494 }
495 spin_lock_irq(&workers->lock);
496 list_add_tail(&worker->worker_list, &workers->idle_list);
497 worker->idle = 1;
498 workers->num_workers++;
499 workers->num_workers_starting--;
500 WARN_ON(workers->num_workers_starting < 0);
501 spin_unlock_irq(&workers->lock);
502 } 485 }
486 spin_lock_irq(&workers->lock);
487 list_add_tail(&worker->worker_list, &workers->idle_list);
488 worker->idle = 1;
489 workers->num_workers++;
490 workers->num_workers_starting--;
491 WARN_ON(workers->num_workers_starting < 0);
492 spin_unlock_irq(&workers->lock);
493
503 return 0; 494 return 0;
504fail: 495fail:
505 btrfs_stop_workers(workers); 496 spin_lock_irq(&workers->lock);
497 workers->num_workers_starting--;
498 spin_unlock_irq(&workers->lock);
506 return ret; 499 return ret;
507} 500}
508 501
509int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) 502int btrfs_start_workers(struct btrfs_workers *workers)
510{ 503{
511 spin_lock_irq(&workers->lock); 504 spin_lock_irq(&workers->lock);
512 workers->num_workers_starting += num_workers; 505 workers->num_workers_starting++;
513 spin_unlock_irq(&workers->lock); 506 spin_unlock_irq(&workers->lock);
514 return __btrfs_start_workers(workers, num_workers); 507 return __btrfs_start_workers(workers);
515} 508}
516 509
517/* 510/*
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
568 struct btrfs_worker_thread *worker; 561 struct btrfs_worker_thread *worker;
569 unsigned long flags; 562 unsigned long flags;
570 struct list_head *fallback; 563 struct list_head *fallback;
564 int ret;
571 565
572again: 566again:
573 spin_lock_irqsave(&workers->lock, flags); 567 spin_lock_irqsave(&workers->lock, flags);
@@ -584,7 +578,9 @@ again:
584 workers->num_workers_starting++; 578 workers->num_workers_starting++;
585 spin_unlock_irqrestore(&workers->lock, flags); 579 spin_unlock_irqrestore(&workers->lock, flags);
586 /* we're below the limit, start another worker */ 580 /* we're below the limit, start another worker */
587 __btrfs_start_workers(workers, 1); 581 ret = __btrfs_start_workers(workers);
582 if (ret)
583 goto fallback;
588 goto again; 584 goto again;
589 } 585 }
590 } 586 }
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
665/* 661/*
666 * places a struct btrfs_work into the pending queue of one of the kthreads 662 * places a struct btrfs_work into the pending queue of one of the kthreads
667 */ 663 */
668int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) 664void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
669{ 665{
670 struct btrfs_worker_thread *worker; 666 struct btrfs_worker_thread *worker;
671 unsigned long flags; 667 unsigned long flags;
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
673 669
674 /* don't requeue something already on a list */ 670 /* don't requeue something already on a list */
675 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 671 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
676 goto out; 672 return;
677 673
678 worker = find_worker(workers); 674 worker = find_worker(workers);
679 if (workers->ordered) { 675 if (workers->ordered) {
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
712 if (wake) 708 if (wake)
713 wake_up_process(worker->task); 709 wake_up_process(worker->task);
714 spin_unlock_irqrestore(&worker->lock, flags); 710 spin_unlock_irqrestore(&worker->lock, flags);
715
716out:
717 return 0;
718} 711}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
109 char *name; 109 char *name;
110}; 110};
111 111
112int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 112void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
113int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); 113int btrfs_start_workers(struct btrfs_workers *workers);
114int btrfs_stop_workers(struct btrfs_workers *workers); 114int btrfs_stop_workers(struct btrfs_workers *workers);
115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
116 struct btrfs_workers *async_starter); 116 struct btrfs_workers *async_starter);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929c..22c64fff1bd5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
683 return PTR_ERR(fspath); 683 return PTR_ERR(fspath);
684 684
685 if (fspath > fspath_min) { 685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)fspath; 686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt; 687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min; 688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else { 689 } else {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
514 struct btrfs_root *root, 514 struct btrfs_root *root,
515 struct extent_buffer *buf) 515 struct extent_buffer *buf)
516{ 516{
517 /* ensure we can see the force_cow */
518 smp_rmb();
519
520 /*
521 * We do not need to cow a block if
522 * 1) this block is not created or changed in this transaction;
523 * 2) this block does not belong to TREE_RELOC tree;
524 * 3) the root is not forced COW.
525 *
526 * What is forced COW:
527 * when we create snapshot during commiting the transaction,
528 * after we've finished coping src root, we must COW the shared
529 * block to ensure the metadata consistency.
530 */
517 if (btrfs_header_generation(buf) == trans->transid && 531 if (btrfs_header_generation(buf) == trans->transid &&
518 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && 532 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
519 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 533 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
520 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) 534 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
535 !root->force_cow)
521 return 0; 536 return 0;
522 return 1; 537 return 1;
523} 538}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
848enum btrfs_caching_type { 848enum btrfs_caching_type {
849 BTRFS_CACHE_NO = 0, 849 BTRFS_CACHE_NO = 0,
850 BTRFS_CACHE_STARTED = 1, 850 BTRFS_CACHE_STARTED = 1,
851 BTRFS_CACHE_FINISHED = 2, 851 BTRFS_CACHE_FAST = 2,
852 BTRFS_CACHE_FINISHED = 3,
852}; 853};
853 854
854enum btrfs_disk_cache_state { 855enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
1271 * for stat. It may be used for more later 1272 * for stat. It may be used for more later
1272 */ 1273 */
1273 dev_t anon_dev; 1274 dev_t anon_dev;
1275
1276 int force_cow;
1274}; 1277};
1275 1278
1276struct btrfs_ioctl_defrag_range_args { 1279struct btrfs_ioctl_defrag_range_args {
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
2366int btrfs_block_rsv_refill(struct btrfs_root *root, 2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2367 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2368 u64 min_reserved); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2369int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2370 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2371 u64 num_bytes); 2377 u64 num_bytes);
@@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2686int btrfs_readpage(struct file *file, struct page *page); 2692int btrfs_readpage(struct file *file, struct page *page);
2687void btrfs_evict_inode(struct inode *inode); 2693void btrfs_evict_inode(struct inode *inode);
2688int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2694int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2689void btrfs_dirty_inode(struct inode *inode, int flags); 2695int btrfs_dirty_inode(struct inode *inode);
2696int btrfs_update_time(struct file *file);
2690struct inode *btrfs_alloc_inode(struct super_block *sb); 2697struct inode *btrfs_alloc_inode(struct super_block *sb);
2691void btrfs_destroy_inode(struct inode *inode); 2698void btrfs_destroy_inode(struct inode *inode);
2692int btrfs_drop_inode(struct inode *inode); 2699int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5b163572e0ca..9c1eccc2c503 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -640,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for. 641 * we're accounted for.
642 */ 642 */
643 if (!trans->bytes_reserved && 643 if (!src_rsv || (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) { 644 src_rsv != &root->fs_info->delalloc_block_rsv)) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /* 646 /*
647 * Since we're under a transaction reserve_metadata_bytes could 647 * Since we're under a transaction reserve_metadata_bytes could
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62afe5c5694e..f44b3928dc2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
620 620
621static int btree_io_failed_hook(struct bio *failed_bio, 621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end, 622 struct page *page, u64 start, u64 end,
623 u64 mirror_num, struct extent_state *state) 623 int mirror_num, struct extent_state *state)
624{ 624{
625 struct extent_io_tree *tree; 625 struct extent_io_tree *tree;
626 unsigned long len; 626 unsigned long len;
@@ -2194,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2194 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2; 2195 fs_info->readahead_workers.idle_thresh = 2;
2196 2196
2197 btrfs_start_workers(&fs_info->workers, 1); 2197 /*
2198 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 * btrfs_start_workers can really only fail because of ENOMEM so just
2199 btrfs_start_workers(&fs_info->submit_workers, 1); 2199 * return -ENOMEM if any of these fail.
2200 btrfs_start_workers(&fs_info->delalloc_workers, 1); 2200 */
2201 btrfs_start_workers(&fs_info->fixup_workers, 1); 2201 ret = btrfs_start_workers(&fs_info->workers);
2202 btrfs_start_workers(&fs_info->endio_workers, 1); 2202 ret |= btrfs_start_workers(&fs_info->generic_worker);
2203 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 2203 ret |= btrfs_start_workers(&fs_info->submit_workers);
2204 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 2204 ret |= btrfs_start_workers(&fs_info->delalloc_workers);
2205 btrfs_start_workers(&fs_info->endio_write_workers, 1); 2205 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 ret |= btrfs_start_workers(&fs_info->endio_workers);
2207 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2208 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1); 2209 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2210 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
2211 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2212 ret |= btrfs_start_workers(&fs_info->caching_workers);
2213 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2214 if (ret) {
2215 ret = -ENOMEM;
2216 goto fail_sb_buffer;
2217 }
2210 2218
2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2219 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2220 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2573,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
2573 int errors = 0; 2581 int errors = 0;
2574 u32 crc; 2582 u32 crc;
2575 u64 bytenr; 2583 u64 bytenr;
2576 int last_barrier = 0;
2577 2584
2578 if (max_mirrors == 0) 2585 if (max_mirrors == 0)
2579 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2586 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2580 2587
2581 /* make sure only the last submit_bh does a barrier */
2582 if (do_barriers) {
2583 for (i = 0; i < max_mirrors; i++) {
2584 bytenr = btrfs_sb_offset(i);
2585 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2586 device->total_bytes)
2587 break;
2588 last_barrier = i;
2589 }
2590 }
2591
2592 for (i = 0; i < max_mirrors; i++) { 2588 for (i = 0; i < max_mirrors; i++) {
2593 bytenr = btrfs_sb_offset(i); 2589 bytenr = btrfs_sb_offset(i);
2594 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2590 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2634,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
2634 bh->b_end_io = btrfs_end_buffer_write_sync; 2630 bh->b_end_io = btrfs_end_buffer_write_sync;
2635 } 2631 }
2636 2632
2637 if (i == last_barrier && do_barriers) 2633 /*
2638 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2634 * we fua the first super. The others we allow
2639 else 2635 * to go down lazy.
2640 ret = submit_bh(WRITE_SYNC, bh); 2636 */
2641 2637 ret = submit_bh(WRITE_FUA, bh);
2642 if (ret) 2638 if (ret)
2643 errors++; 2639 errors++;
2644 } 2640 }
2645 return errors < i ? 0 : -1; 2641 return errors < i ? 0 : -1;
2646} 2642}
2647 2643
2644/*
2645 * endio for the write_dev_flush, this will wake anyone waiting
2646 * for the barrier when it is done
2647 */
2648static void btrfs_end_empty_barrier(struct bio *bio, int err)
2649{
2650 if (err) {
2651 if (err == -EOPNOTSUPP)
2652 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2653 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2654 }
2655 if (bio->bi_private)
2656 complete(bio->bi_private);
2657 bio_put(bio);
2658}
2659
2660/*
2661 * trigger flushes for one the devices. If you pass wait == 0, the flushes are
2662 * sent down. With wait == 1, it waits for the previous flush.
2663 *
2664 * any device where the flush fails with eopnotsupp are flagged as not-barrier
2665 * capable
2666 */
2667static int write_dev_flush(struct btrfs_device *device, int wait)
2668{
2669 struct bio *bio;
2670 int ret = 0;
2671
2672 if (device->nobarriers)
2673 return 0;
2674
2675 if (wait) {
2676 bio = device->flush_bio;
2677 if (!bio)
2678 return 0;
2679
2680 wait_for_completion(&device->flush_wait);
2681
2682 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2683 printk("btrfs: disabling barriers on dev %s\n",
2684 device->name);
2685 device->nobarriers = 1;
2686 }
2687 if (!bio_flagged(bio, BIO_UPTODATE)) {
2688 ret = -EIO;
2689 }
2690
2691 /* drop the reference from the wait == 0 run */
2692 bio_put(bio);
2693 device->flush_bio = NULL;
2694
2695 return ret;
2696 }
2697
2698 /*
2699 * one reference for us, and we leave it for the
2700 * caller
2701 */
2702 device->flush_bio = NULL;;
2703 bio = bio_alloc(GFP_NOFS, 0);
2704 if (!bio)
2705 return -ENOMEM;
2706
2707 bio->bi_end_io = btrfs_end_empty_barrier;
2708 bio->bi_bdev = device->bdev;
2709 init_completion(&device->flush_wait);
2710 bio->bi_private = &device->flush_wait;
2711 device->flush_bio = bio;
2712
2713 bio_get(bio);
2714 submit_bio(WRITE_FLUSH, bio);
2715
2716 return 0;
2717}
2718
2719/*
2720 * send an empty flush down to each device in parallel,
2721 * then wait for them
2722 */
2723static int barrier_all_devices(struct btrfs_fs_info *info)
2724{
2725 struct list_head *head;
2726 struct btrfs_device *dev;
2727 int errors = 0;
2728 int ret;
2729
2730 /* send down all the barriers */
2731 head = &info->fs_devices->devices;
2732 list_for_each_entry_rcu(dev, head, dev_list) {
2733 if (!dev->bdev) {
2734 errors++;
2735 continue;
2736 }
2737 if (!dev->in_fs_metadata || !dev->writeable)
2738 continue;
2739
2740 ret = write_dev_flush(dev, 0);
2741 if (ret)
2742 errors++;
2743 }
2744
2745 /* wait for all the barriers */
2746 list_for_each_entry_rcu(dev, head, dev_list) {
2747 if (!dev->bdev) {
2748 errors++;
2749 continue;
2750 }
2751 if (!dev->in_fs_metadata || !dev->writeable)
2752 continue;
2753
2754 ret = write_dev_flush(dev, 1);
2755 if (ret)
2756 errors++;
2757 }
2758 if (errors)
2759 return -EIO;
2760 return 0;
2761}
2762
2648int write_all_supers(struct btrfs_root *root, int max_mirrors) 2763int write_all_supers(struct btrfs_root *root, int max_mirrors)
2649{ 2764{
2650 struct list_head *head; 2765 struct list_head *head;
@@ -2666,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2666 2781
2667 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2782 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2668 head = &root->fs_info->fs_devices->devices; 2783 head = &root->fs_info->fs_devices->devices;
2784
2785 if (do_barriers)
2786 barrier_all_devices(root->fs_info);
2787
2669 list_for_each_entry_rcu(dev, head, dev_list) { 2788 list_for_each_entry_rcu(dev, head, dev_list) {
2670 if (!dev->bdev) { 2789 if (!dev->bdev) {
2671 total_errors++; 2790 total_errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b232150b5b6b..f5fbe576d2ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 struct btrfs_root *root, 467 struct btrfs_root *root,
468 int load_cache_only) 468 int load_cache_only)
469{ 469{
470 DEFINE_WAIT(wait);
470 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
471 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
472 int ret = 0; 473 int ret = 0;
473 474
474 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
475 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
496 * from a block group who's cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
476 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
477 523
478 /* 524 /*
479 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
484 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
485 (root && root != root->fs_info->tree_root) && 531 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) { 532 btrfs_test_opt(root, SPACE_CACHE)) {
487 spin_lock(&cache->lock);
488 if (cache->cached != BTRFS_CACHE_NO) {
489 spin_unlock(&cache->lock);
490 return 0;
491 }
492 cache->cached = BTRFS_CACHE_STARTED;
493 spin_unlock(&cache->lock);
494
495 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
496 534
497 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
498 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
499 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
500 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
501 } else { 540 } else {
502 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
503 } 547 }
504 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
505 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
506 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
507 return 0; 553 return 0;
508 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wakeup any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
509 } 569 }
510 570
511 if (load_cache_only) 571 if (load_cache_only) {
512 return 0; 572 put_caching_control(caching_ctl);
513
514 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
515 BUG_ON(!caching_ctl);
516
517 INIT_LIST_HEAD(&caching_ctl->list);
518 mutex_init(&caching_ctl->mutex);
519 init_waitqueue_head(&caching_ctl->wait);
520 caching_ctl->block_group = cache;
521 caching_ctl->progress = cache->key.objectid;
522 /* one for caching kthread, one for caching block group list */
523 atomic_set(&caching_ctl->count, 2);
524 caching_ctl->work.func = caching_thread;
525
526 spin_lock(&cache->lock);
527 if (cache->cached != BTRFS_CACHE_NO) {
528 spin_unlock(&cache->lock);
529 kfree(caching_ctl);
530 return 0; 573 return 0;
531 } 574 }
532 cache->caching_ctl = caching_ctl;
533 cache->cached = BTRFS_CACHE_STARTED;
534 spin_unlock(&cache->lock);
535 575
536 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
537 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
538 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
539 580
@@ -2781,7 +2822,7 @@ out_free:
2781 btrfs_release_path(path); 2822 btrfs_release_path(path);
2782out: 2823out:
2783 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2784 if (!ret) 2825 if (!ret && dcs == BTRFS_DC_SETUP)
2785 block_group->cache_generation = trans->transid; 2826 block_group->cache_generation = trans->transid;
2786 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2787 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
@@ -3847,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
3847 return ret; 3888 return ret;
3848} 3889}
3849 3890
3850int btrfs_block_rsv_refill(struct btrfs_root *root, 3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3851 struct btrfs_block_rsv *block_rsv, 3892 struct btrfs_block_rsv *block_rsv,
3852 u64 min_reserved) 3893 u64 min_reserved, int flush)
3853{ 3894{
3854 u64 num_bytes = 0; 3895 u64 num_bytes = 0;
3855 int ret = -ENOSPC; 3896 int ret = -ENOSPC;
@@ -3868,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3868 if (!ret) 3909 if (!ret)
3869 return 0; 3910 return 0;
3870 3911
3871 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); 3912 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3872 if (!ret) { 3913 if (!ret) {
3873 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3914 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3874 return 0; 3915 return 0;
@@ -3877,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3877 return ret; 3918 return ret;
3878} 3919}
3879 3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922 struct btrfs_block_rsv *block_rsv,
3923 u64 min_reserved)
3924{
3925 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929 struct btrfs_block_rsv *block_rsv,
3930 u64 min_reserved)
3931{
3932 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3933}
3934
3880int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3881 struct btrfs_block_rsv *dst_rsv, 3936 struct btrfs_block_rsv *dst_rsv,
3882 u64 num_bytes) 3937 u64 num_bytes)
@@ -4149,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4149 struct btrfs_root *root = BTRFS_I(inode)->root; 4204 struct btrfs_root *root = BTRFS_I(inode)->root;
4150 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4205 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4151 u64 to_reserve = 0; 4206 u64 to_reserve = 0;
4207 u64 csum_bytes;
4152 unsigned nr_extents = 0; 4208 unsigned nr_extents = 0;
4209 int extra_reserve = 0;
4153 int flush = 1; 4210 int flush = 1;
4154 int ret; 4211 int ret;
4155 4212
4213 /* Need to be holding the i_mutex here if we aren't free space cache */
4156 if (btrfs_is_free_space_inode(root, inode)) 4214 if (btrfs_is_free_space_inode(root, inode))
4157 flush = 0; 4215 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4158 4218
4159 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4219 if (flush && btrfs_transaction_in_commit(root->fs_info))
4160 schedule_timeout(1); 4220 schedule_timeout(1);
@@ -4165,11 +4225,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4165 BTRFS_I(inode)->outstanding_extents++; 4225 BTRFS_I(inode)->outstanding_extents++;
4166 4226
4167 if (BTRFS_I(inode)->outstanding_extents > 4227 if (BTRFS_I(inode)->outstanding_extents >
4168 BTRFS_I(inode)->reserved_extents) { 4228 BTRFS_I(inode)->reserved_extents)
4169 nr_extents = BTRFS_I(inode)->outstanding_extents - 4229 nr_extents = BTRFS_I(inode)->outstanding_extents -
4170 BTRFS_I(inode)->reserved_extents; 4230 BTRFS_I(inode)->reserved_extents;
4171 BTRFS_I(inode)->reserved_extents += nr_extents;
4172 }
4173 4231
4174 /* 4232 /*
4175 * Add an item to reserve for updating the inode when we complete the 4233 * Add an item to reserve for updating the inode when we complete the
@@ -4177,11 +4235,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4177 */ 4235 */
4178 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4236 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4179 nr_extents++; 4237 nr_extents++;
4180 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4238 extra_reserve = 1;
4181 } 4239 }
4182 4240
4183 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4241 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4184 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4242 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4243 csum_bytes = BTRFS_I(inode)->csum_bytes;
4185 spin_unlock(&BTRFS_I(inode)->lock); 4244 spin_unlock(&BTRFS_I(inode)->lock);
4186 4245
4187 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4246 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4191,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4191 4250
4192 spin_lock(&BTRFS_I(inode)->lock); 4251 spin_lock(&BTRFS_I(inode)->lock);
4193 dropped = drop_outstanding_extent(inode); 4252 dropped = drop_outstanding_extent(inode);
4194 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4195 spin_unlock(&BTRFS_I(inode)->lock);
4196 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4197
4198 /* 4253 /*
4199 * Somebody could have come in and twiddled with the 4254 * If the inodes csum_bytes is the same as the original
4200 * reservation, so if we have to free more than we would have 4255 * csum_bytes then we know we haven't raced with any free()ers
4201 * reserved from this reservation go ahead and release those 4256 * so we can just reduce our inodes csum bytes and carry on.
4202 * bytes. 4257 * Otherwise we have to do the normal free thing to account for
4258 * the case that the free side didn't free up its reserve
4259 * because of this outstanding reservation.
4203 */ 4260 */
4204 to_free -= to_reserve; 4261 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4262 calc_csum_metadata_size(inode, num_bytes, 0);
4263 else
4264 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4265 spin_unlock(&BTRFS_I(inode)->lock);
4266 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268
4205 if (to_free) 4269 if (to_free)
4206 btrfs_block_rsv_release(root, block_rsv, to_free); 4270 btrfs_block_rsv_release(root, block_rsv, to_free);
4207 return ret; 4271 return ret;
4208 } 4272 }
4209 4273
4274 spin_lock(&BTRFS_I(inode)->lock);
4275 if (extra_reserve) {
4276 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4277 nr_extents--;
4278 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock);
4281
4210 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4282 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4211 4283
4212 return 0; 4284 return 0;
@@ -5052,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5052 struct btrfs_root *root = orig_root->fs_info->extent_root; 5124 struct btrfs_root *root = orig_root->fs_info->extent_root;
5053 struct btrfs_free_cluster *last_ptr = NULL; 5125 struct btrfs_free_cluster *last_ptr = NULL;
5054 struct btrfs_block_group_cache *block_group = NULL; 5126 struct btrfs_block_group_cache *block_group = NULL;
5127 struct btrfs_block_group_cache *used_block_group;
5055 int empty_cluster = 2 * 1024 * 1024; 5128 int empty_cluster = 2 * 1024 * 1024;
5056 int allowed_chunk_alloc = 0; 5129 int allowed_chunk_alloc = 0;
5057 int done_chunk_alloc = 0; 5130 int done_chunk_alloc = 0;
5058 struct btrfs_space_info *space_info; 5131 struct btrfs_space_info *space_info;
5059 int last_ptr_loop = 0;
5060 int loop = 0; 5132 int loop = 0;
5061 int index = 0; 5133 int index = 0;
5062 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 5134 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5118,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5118ideal_cache: 5190ideal_cache:
5119 block_group = btrfs_lookup_block_group(root->fs_info, 5191 block_group = btrfs_lookup_block_group(root->fs_info,
5120 search_start); 5192 search_start);
5193 used_block_group = block_group;
5121 /* 5194 /*
5122 * we don't want to use the block group if it doesn't match our 5195 * we don't want to use the block group if it doesn't match our
5123 * allocation bits, or if its not cached. 5196 * allocation bits, or if its not cached.
@@ -5155,6 +5228,7 @@ search:
5155 u64 offset; 5228 u64 offset;
5156 int cached; 5229 int cached;
5157 5230
5231 used_block_group = block_group;
5158 btrfs_get_block_group(block_group); 5232 btrfs_get_block_group(block_group);
5159 search_start = block_group->key.objectid; 5233 search_start = block_group->key.objectid;
5160 5234
@@ -5178,13 +5252,15 @@ search:
5178 } 5252 }
5179 5253
5180have_block_group: 5254have_block_group:
5181 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5255 cached = block_group_cache_done(block_group);
5256 if (unlikely(!cached)) {
5182 u64 free_percent; 5257 u64 free_percent;
5183 5258
5259 found_uncached_bg = true;
5184 ret = cache_block_group(block_group, trans, 5260 ret = cache_block_group(block_group, trans,
5185 orig_root, 1); 5261 orig_root, 1);
5186 if (block_group->cached == BTRFS_CACHE_FINISHED) 5262 if (block_group->cached == BTRFS_CACHE_FINISHED)
5187 goto have_block_group; 5263 goto alloc;
5188 5264
5189 free_percent = btrfs_block_group_used(&block_group->item); 5265 free_percent = btrfs_block_group_used(&block_group->item);
5190 free_percent *= 100; 5266 free_percent *= 100;
@@ -5206,7 +5282,6 @@ have_block_group:
5206 orig_root, 0); 5282 orig_root, 0);
5207 BUG_ON(ret); 5283 BUG_ON(ret);
5208 } 5284 }
5209 found_uncached_bg = true;
5210 5285
5211 /* 5286 /*
5212 * If loop is set for cached only, try the next block 5287 * If loop is set for cached only, try the next block
@@ -5216,94 +5291,80 @@ have_block_group:
5216 goto loop; 5291 goto loop;
5217 } 5292 }
5218 5293
5219 cached = block_group_cache_done(block_group); 5294alloc:
5220 if (unlikely(!cached))
5221 found_uncached_bg = true;
5222
5223 if (unlikely(block_group->ro)) 5295 if (unlikely(block_group->ro))
5224 goto loop; 5296 goto loop;
5225 5297
5226 spin_lock(&block_group->free_space_ctl->tree_lock); 5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5227 if (cached && 5299 if (cached &&
5228 block_group->free_space_ctl->free_space < 5300 block_group->free_space_ctl->free_space <
5229 num_bytes + empty_size) { 5301 num_bytes + empty_cluster + empty_size) {
5230 spin_unlock(&block_group->free_space_ctl->tree_lock); 5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5231 goto loop; 5303 goto loop;
5232 } 5304 }
5233 spin_unlock(&block_group->free_space_ctl->tree_lock); 5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5234 5306
5235 /* 5307 /*
5236 * Ok we want to try and use the cluster allocator, so lets look 5308 * Ok we want to try and use the cluster allocator, so
5237 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5309 * lets look there
5238 * have tried the cluster allocator plenty of times at this
5239 * point and not have found anything, so we are likely way too
5240 * fragmented for the clustering stuff to find anything, so lets
5241 * just skip it and let the allocator find whatever block it can
5242 * find
5243 */ 5310 */
5244 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5311 if (last_ptr) {
5245 /* 5312 /*
5246 * the refill lock keeps out other 5313 * the refill lock keeps out other
5247 * people trying to start a new cluster 5314 * people trying to start a new cluster
5248 */ 5315 */
5249 spin_lock(&last_ptr->refill_lock); 5316 spin_lock(&last_ptr->refill_lock);
5250 if (last_ptr->block_group && 5317 used_block_group = last_ptr->block_group;
5251 (last_ptr->block_group->ro || 5318 if (used_block_group != block_group &&
5252 !block_group_bits(last_ptr->block_group, data))) { 5319 (!used_block_group ||
5253 offset = 0; 5320 used_block_group->ro ||
5321 !block_group_bits(used_block_group, data))) {
5322 used_block_group = block_group;
5254 goto refill_cluster; 5323 goto refill_cluster;
5255 } 5324 }
5256 5325
5257 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5326 if (used_block_group != block_group)
5258 num_bytes, search_start); 5327 btrfs_get_block_group(used_block_group);
5328
5329 offset = btrfs_alloc_from_cluster(used_block_group,
5330 last_ptr, num_bytes, used_block_group->key.objectid);
5259 if (offset) { 5331 if (offset) {
5260 /* we have a block, we're done */ 5332 /* we have a block, we're done */
5261 spin_unlock(&last_ptr->refill_lock); 5333 spin_unlock(&last_ptr->refill_lock);
5262 goto checks; 5334 goto checks;
5263 } 5335 }
5264 5336
5265 spin_lock(&last_ptr->lock); 5337 WARN_ON(last_ptr->block_group != used_block_group);
5266 /* 5338 if (used_block_group != block_group) {
5267 * whoops, this cluster doesn't actually point to 5339 btrfs_put_block_group(used_block_group);
5268 * this block group. Get a ref on the block 5340 used_block_group = block_group;
5269 * group is does point to and try again
5270 */
5271 if (!last_ptr_loop && last_ptr->block_group &&
5272 last_ptr->block_group != block_group &&
5273 index <=
5274 get_block_group_index(last_ptr->block_group)) {
5275
5276 btrfs_put_block_group(block_group);
5277 block_group = last_ptr->block_group;
5278 btrfs_get_block_group(block_group);
5279 spin_unlock(&last_ptr->lock);
5280 spin_unlock(&last_ptr->refill_lock);
5281
5282 last_ptr_loop = 1;
5283 search_start = block_group->key.objectid;
5284 /*
5285 * we know this block group is properly
5286 * in the list because
5287 * btrfs_remove_block_group, drops the
5288 * cluster before it removes the block
5289 * group from the list
5290 */
5291 goto have_block_group;
5292 } 5341 }
5293 spin_unlock(&last_ptr->lock);
5294refill_cluster: 5342refill_cluster:
5343 BUG_ON(used_block_group != block_group);
5344 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5345 * set up a new clusters, so lets just skip it
5346 * and let the allocator find whatever block
5347 * it can find. If we reach this point, we
5348 * will have tried the cluster allocator
5349 * plenty of times and not have found
5350 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find
5352 * anything. */
5353 if (loop >= LOOP_NO_EMPTY_SIZE) {
5354 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc;
5356 }
5357
5295 /* 5358 /*
5296 * this cluster didn't work out, free it and 5359 * this cluster didn't work out, free it and
5297 * start over 5360 * start over
5298 */ 5361 */
5299 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5362 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5300 5363
5301 last_ptr_loop = 0;
5302
5303 /* allocate a cluster in this block group */ 5364 /* allocate a cluster in this block group */
5304 ret = btrfs_find_space_cluster(trans, root, 5365 ret = btrfs_find_space_cluster(trans, root,
5305 block_group, last_ptr, 5366 block_group, last_ptr,
5306 offset, num_bytes, 5367 search_start, num_bytes,
5307 empty_cluster + empty_size); 5368 empty_cluster + empty_size);
5308 if (ret == 0) { 5369 if (ret == 0) {
5309 /* 5370 /*
@@ -5339,6 +5400,7 @@ refill_cluster:
5339 goto loop; 5400 goto loop;
5340 } 5401 }
5341 5402
5403unclustered_alloc:
5342 offset = btrfs_find_space_for_alloc(block_group, search_start, 5404 offset = btrfs_find_space_for_alloc(block_group, search_start,
5343 num_bytes, empty_size); 5405 num_bytes, empty_size);
5344 /* 5406 /*
@@ -5365,14 +5427,14 @@ checks:
5365 search_start = stripe_align(root, offset); 5427 search_start = stripe_align(root, offset);
5366 /* move on to the next group */ 5428 /* move on to the next group */
5367 if (search_start + num_bytes >= search_end) { 5429 if (search_start + num_bytes >= search_end) {
5368 btrfs_add_free_space(block_group, offset, num_bytes); 5430 btrfs_add_free_space(used_block_group, offset, num_bytes);
5369 goto loop; 5431 goto loop;
5370 } 5432 }
5371 5433
5372 /* move on to the next group */ 5434 /* move on to the next group */
5373 if (search_start + num_bytes > 5435 if (search_start + num_bytes >
5374 block_group->key.objectid + block_group->key.offset) { 5436 used_block_group->key.objectid + used_block_group->key.offset) {
5375 btrfs_add_free_space(block_group, offset, num_bytes); 5437 btrfs_add_free_space(used_block_group, offset, num_bytes);
5376 goto loop; 5438 goto loop;
5377 } 5439 }
5378 5440
@@ -5380,14 +5442,14 @@ checks:
5380 ins->offset = num_bytes; 5442 ins->offset = num_bytes;
5381 5443
5382 if (offset < search_start) 5444 if (offset < search_start)
5383 btrfs_add_free_space(block_group, offset, 5445 btrfs_add_free_space(used_block_group, offset,
5384 search_start - offset); 5446 search_start - offset);
5385 BUG_ON(offset > search_start); 5447 BUG_ON(offset > search_start);
5386 5448
5387 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 5449 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5388 alloc_type); 5450 alloc_type);
5389 if (ret == -EAGAIN) { 5451 if (ret == -EAGAIN) {
5390 btrfs_add_free_space(block_group, offset, num_bytes); 5452 btrfs_add_free_space(used_block_group, offset, num_bytes);
5391 goto loop; 5453 goto loop;
5392 } 5454 }
5393 5455
@@ -5396,15 +5458,19 @@ checks:
5396 ins->offset = num_bytes; 5458 ins->offset = num_bytes;
5397 5459
5398 if (offset < search_start) 5460 if (offset < search_start)
5399 btrfs_add_free_space(block_group, offset, 5461 btrfs_add_free_space(used_block_group, offset,
5400 search_start - offset); 5462 search_start - offset);
5401 BUG_ON(offset > search_start); 5463 BUG_ON(offset > search_start);
5464 if (used_block_group != block_group)
5465 btrfs_put_block_group(used_block_group);
5402 btrfs_put_block_group(block_group); 5466 btrfs_put_block_group(block_group);
5403 break; 5467 break;
5404loop: 5468loop:
5405 failed_cluster_refill = false; 5469 failed_cluster_refill = false;
5406 failed_alloc = false; 5470 failed_alloc = false;
5407 BUG_ON(index != get_block_group_index(block_group)); 5471 BUG_ON(index != get_block_group_index(block_group));
5472 if (used_block_group != block_group)
5473 btrfs_put_block_group(used_block_group);
5408 btrfs_put_block_group(block_group); 5474 btrfs_put_block_group(block_group);
5409 } 5475 }
5410 up_read(&space_info->groups_sem); 5476 up_read(&space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f87c4d0e7a0..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
935 node = tree_search(tree, start); 935 node = tree_search(tree, start);
936 if (!node) { 936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc); 937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) 938 if (!prealloc) {
939 return -ENOMEM; 939 err = -ENOMEM;
940 goto out;
941 }
940 err = insert_state(tree, prealloc, start, end, &bits); 942 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL; 943 prealloc = NULL;
942 BUG_ON(err == -EEXIST); 944 BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
992 */ 994 */
993 if (state->start < start) { 995 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc); 996 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc) 997 if (!prealloc) {
996 return -ENOMEM; 998 err = -ENOMEM;
999 goto out;
1000 }
997 err = split_state(tree, state, prealloc, start); 1001 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST); 1002 BUG_ON(err == -EEXIST);
999 prealloc = NULL; 1003 prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
1024 this_end = last_start - 1; 1028 this_end = last_start - 1;
1025 1029
1026 prealloc = alloc_extent_state_atomic(prealloc); 1030 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc) 1031 if (!prealloc) {
1028 return -ENOMEM; 1032 err = -ENOMEM;
1033 goto out;
1034 }
1029 1035
1030 /* 1036 /*
1031 * Avoid to free 'prealloc' if it can be merged with 1037 * Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
1051 */ 1057 */
1052 if (state->start <= end && state->end > end) { 1058 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc); 1059 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc) 1060 if (!prealloc) {
1055 return -ENOMEM; 1061 err = -ENOMEM;
1062 goto out;
1063 }
1056 1064
1057 err = split_state(tree, state, prealloc, end + 1); 1065 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST); 1066 BUG_ON(err == -EEXIST);
@@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2285 clean_io_failure(start, page); 2293 clean_io_failure(start, page);
2286 } 2294 }
2287 if (!uptodate) { 2295 if (!uptodate) {
2288 u64 failed_mirror; 2296 int failed_mirror;
2289 failed_mirror = (u64)bio->bi_bdev; 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
2290 if (tree->ops && tree->ops->readpage_io_failed_hook) 2298 /*
2291 ret = tree->ops->readpage_io_failed_hook( 2299 * The generic bio_readpage_error handles errors the
2292 bio, page, start, end, 2300 * following way: If possible, new read requests are
2293 failed_mirror, state); 2301 * created and submitted and will end up in
2294 else 2302 * end_bio_extent_readpage as well (if we're lucky, not
2295 ret = bio_readpage_error(bio, page, start, end, 2303 * in the !uptodate case). In that case it returns 0 and
2296 failed_mirror, NULL); 2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
2297 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
2298 uptodate = 2312 uptodate =
2299 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
2300 if (err) 2314 if (err)
@@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2302 uncache_state(&cached); 2316 uncache_state(&cached);
2303 continue; 2317 continue;
2304 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
2305 } 2326 }
2306 2327
2307 if (uptodate) { 2328 if (uptodate) {
@@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3366 return -ENOMEM; 3387 return -ENOMEM;
3367 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
3368 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
3369 /* 3393 /*
3370 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
3371 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3413 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3414 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
3415 3439
3416 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
3417 get_extent); 3441 get_extent);
3418 if (!em) 3442 if (!em)
3419 goto out; 3443 goto out;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23bc..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
70 unsigned long bio_flags); 70 unsigned long bio_flags);
71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, u64 failed_mirror, 73 u64 start, u64 end, int failed_mirror,
74 struct extent_state *state); 74 struct extent_state *state);
75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
76 u64 start, u64 end, 76 u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dafdfa059bf6..97fbe939c050 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1169 (sizeof(struct page *))); 1169 (sizeof(struct page *)));
1170 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1171 nrptrs = max(nrptrs, 8);
1170 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1172 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1171 if (!pages) 1173 if (!pages)
1172 return -ENOMEM; 1174 return -ENOMEM;
@@ -1387,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1387 goto out; 1389 goto out;
1388 } 1390 }
1389 1391
1390 file_update_time(file); 1392 err = btrfs_update_time(file);
1393 if (err) {
1394 mutex_unlock(&inode->i_mutex);
1395 goto out;
1396 }
1391 BTRFS_I(inode)->sequence++; 1397 BTRFS_I(inode)->sequence++;
1392 1398
1393 start_pos = round_down(pos, root->sectorsize); 1399 start_pos = round_down(pos, root->sectorsize);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 181760f9d2ab..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
351 } 351 }
352 } 352 }
353 353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
354 return 0; 359 return 0;
355} 360}
356 361
@@ -1465,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1465{ 1470{
1466 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1467 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1468 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1469 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1470 1476
@@ -1844,7 +1850,13 @@ again:
1844 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1845 1, 0); 1851 1, 0);
1846 if (!info) { 1852 if (!info) {
1847 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1848 goto out_lock; 1860 goto out_lock;
1849 } 1861 }
1850 } 1862 }
@@ -2308,6 +2320,7 @@ again:
2308 2320
2309 if (!found) { 2321 if (!found) {
2310 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2311 found = true; 2324 found = true;
2312 } 2325 }
2313 2326
@@ -2451,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2451{ 2464{
2452 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2453 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2454 struct rb_node *node;
2455 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2456 2469
2457 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2458 return -ENOSPC; 2471 return -ENOSPC;
2459 2472
2460 /* 2473 /*
2461 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2462 * here that will work. 2475 * is just its start offset.
2463 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2464 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2465 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2466 continue; 2486 continue;
@@ -2471,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2471 } 2491 }
2472 2492
2473 /* 2493 /*
2474 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2475 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2476 * this list and start the search from there.
2477 */ 2496 */
2478 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2479 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2480 list);
2481 node = rb_next(&entry->offset_index);
2482 if (!node)
2483 return -ENOSPC;
2484 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2485 goto search;
2486 }
2487
2488 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2489 if (!entry)
2490 return -ENOSPC;
2491
2492search:
2493 node = &entry->offset_index;
2494 do {
2495 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2496 node = rb_next(&entry->offset_index);
2497 if (!entry->bitmap)
2498 continue;
2499 if (entry->bytes < min_bytes)
2500 continue;
2501 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2502 bytes, min_bytes);
2503 } while (ret && node);
2504
2505 return ret;
2506} 2498}
2507 2499
2508/* 2500/*
@@ -2520,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2520 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2521{ 2513{
2522 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2523 struct list_head bitmaps;
2524 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2525 u64 min_bytes; 2517 u64 min_bytes;
2526 int ret; 2518 int ret;
2527 2519
@@ -2560,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2560 goto out; 2552 goto out;
2561 } 2553 }
2562 2554
2563 INIT_LIST_HEAD(&bitmaps);
2564 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2565 bytes, min_bytes); 2556 bytes, min_bytes);
2566 if (ret) 2557 if (ret)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 116ab67a06df..0a6b928813a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -2031,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2031 /* insert an orphan item to track this unlinked/truncated file */ 2032 /* insert an orphan item to track this unlinked/truncated file */
2032 if (insert >= 1) { 2033 if (insert >= 1) {
2033 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2034 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2034 BUG_ON(ret); 2035 BUG_ON(ret && ret != -EEXIST);
2035 } 2036 }
2036 2037
2037 /* insert an orphan item to track subvolume contains orphan files */ 2038 /* insert an orphan item to track subvolume contains orphan files */
@@ -2158,6 +2159,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2158 if (ret && ret != -ESTALE) 2159 if (ret && ret != -ESTALE)
2159 goto out; 2160 goto out;
2160 2161
2162 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2163 struct btrfs_root *dead_root;
2164 struct btrfs_fs_info *fs_info = root->fs_info;
2165 int is_dead_root = 0;
2166
2167 /*
2168 * this is an orphan in the tree root. Currently these
2169 * could come from 2 sources:
2170 * a) a snapshot deletion in progress
2171 * b) a free space cache inode
2172 * We need to distinguish those two, as the snapshot
2173 * orphan must not get deleted.
2174 * find_dead_roots already ran before us, so if this
2175 * is a snapshot deletion, we should find the root
2176 * in the dead_roots list
2177 */
2178 spin_lock(&fs_info->trans_lock);
2179 list_for_each_entry(dead_root, &fs_info->dead_roots,
2180 root_list) {
2181 if (dead_root->root_key.objectid ==
2182 found_key.objectid) {
2183 is_dead_root = 1;
2184 break;
2185 }
2186 }
2187 spin_unlock(&fs_info->trans_lock);
2188 if (is_dead_root) {
2189 /* prevent this orphan from being found again */
2190 key.offset = found_key.objectid - 1;
2191 continue;
2192 }
2193 }
2161 /* 2194 /*
2162 * Inode is already gone but the orphan item is still there, 2195 * Inode is already gone but the orphan item is still there,
2163 * kill the orphan item. 2196 * kill the orphan item.
@@ -2191,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2191 continue; 2224 continue;
2192 } 2225 }
2193 nr_truncate++; 2226 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2194 ret = btrfs_truncate(inode); 2233 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2195 } else { 2235 } else {
2196 nr_unlink++; 2236 nr_unlink++;
2197 } 2237 }
@@ -3327,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3327 u64 hint_byte = 0; 3367 u64 hint_byte = 0;
3328 hole_size = last_byte - cur_offset; 3368 hole_size = last_byte - cur_offset;
3329 3369
3330 trans = btrfs_start_transaction(root, 2); 3370 trans = btrfs_start_transaction(root, 3);
3331 if (IS_ERR(trans)) { 3371 if (IS_ERR(trans)) {
3332 err = PTR_ERR(trans); 3372 err = PTR_ERR(trans);
3333 break; 3373 break;
@@ -3337,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3337 cur_offset + hole_size, 3377 cur_offset + hole_size,
3338 &hint_byte, 1); 3378 &hint_byte, 1);
3339 if (err) { 3379 if (err) {
3380 btrfs_update_inode(trans, root, inode);
3340 btrfs_end_transaction(trans, root); 3381 btrfs_end_transaction(trans, root);
3341 break; 3382 break;
3342 } 3383 }
@@ -3346,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3346 0, hole_size, 0, hole_size, 3387 0, hole_size, 0, hole_size,
3347 0, 0, 0); 3388 0, 0, 0);
3348 if (err) { 3389 if (err) {
3390 btrfs_update_inode(trans, root, inode);
3349 btrfs_end_transaction(trans, root); 3391 btrfs_end_transaction(trans, root);
3350 break; 3392 break;
3351 } 3393 }
@@ -3353,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3353 btrfs_drop_extent_cache(inode, hole_start, 3395 btrfs_drop_extent_cache(inode, hole_start,
3354 last_byte - 1, 0); 3396 last_byte - 1, 0);
3355 3397
3398 btrfs_update_inode(trans, root, inode);
3356 btrfs_end_transaction(trans, root); 3399 btrfs_end_transaction(trans, root);
3357 } 3400 }
3358 free_extent_map(em); 3401 free_extent_map(em);
@@ -3370,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3370 3413
3371static int btrfs_setsize(struct inode *inode, loff_t newsize) 3414static int btrfs_setsize(struct inode *inode, loff_t newsize)
3372{ 3415{
3416 struct btrfs_root *root = BTRFS_I(inode)->root;
3417 struct btrfs_trans_handle *trans;
3373 loff_t oldsize = i_size_read(inode); 3418 loff_t oldsize = i_size_read(inode);
3374 int ret; 3419 int ret;
3375 3420
@@ -3377,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3377 return 0; 3422 return 0;
3378 3423
3379 if (newsize > oldsize) { 3424 if (newsize > oldsize) {
3380 i_size_write(inode, newsize);
3381 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3382 truncate_pagecache(inode, oldsize, newsize); 3425 truncate_pagecache(inode, oldsize, newsize);
3383 ret = btrfs_cont_expand(inode, oldsize, newsize); 3426 ret = btrfs_cont_expand(inode, oldsize, newsize);
3384 if (ret) { 3427 if (ret)
3385 btrfs_setsize(inode, oldsize);
3386 return ret; 3428 return ret;
3387 }
3388 3429
3389 mark_inode_dirty(inode); 3430 trans = btrfs_start_transaction(root, 1);
3431 if (IS_ERR(trans))
3432 return PTR_ERR(trans);
3433
3434 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root);
3390 } else { 3438 } else {
3391 3439
3392 /* 3440 /*
@@ -3426,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3426 3474
3427 if (attr->ia_valid) { 3475 if (attr->ia_valid) {
3428 setattr_copy(inode, attr); 3476 setattr_copy(inode, attr);
3429 mark_inode_dirty(inode); 3477 err = btrfs_dirty_inode(inode);
3430 3478
3431 if (attr->ia_valid & ATTR_MODE) 3479 if (!err && attr->ia_valid & ATTR_MODE)
3432 err = btrfs_acl_chmod(inode); 3480 err = btrfs_acl_chmod(inode);
3433 } 3481 }
3434 3482
@@ -3490,7 +3538,7 @@ void btrfs_evict_inode(struct inode *inode)
3490 * doing the truncate. 3538 * doing the truncate.
3491 */ 3539 */
3492 while (1) { 3540 while (1) {
3493 ret = btrfs_block_rsv_refill(root, rsv, min_size); 3541 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3494 3542
3495 /* 3543 /*
3496 * Try and steal from the global reserve since we will 3544 * Try and steal from the global reserve since we will
@@ -4204,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4204 * FIXME, needs more benchmarking...there are no reasons other than performance 4252 * FIXME, needs more benchmarking...there are no reasons other than performance
4205 * to keep or drop this code. 4253 * to keep or drop this code.
4206 */ 4254 */
4207void btrfs_dirty_inode(struct inode *inode, int flags) 4255int btrfs_dirty_inode(struct inode *inode)
4208{ 4256{
4209 struct btrfs_root *root = BTRFS_I(inode)->root; 4257 struct btrfs_root *root = BTRFS_I(inode)->root;
4210 struct btrfs_trans_handle *trans; 4258 struct btrfs_trans_handle *trans;
4211 int ret; 4259 int ret;
4212 4260
4213 if (BTRFS_I(inode)->dummy_inode) 4261 if (BTRFS_I(inode)->dummy_inode)
4214 return; 4262 return 0;
4215 4263
4216 trans = btrfs_join_transaction(root); 4264 trans = btrfs_join_transaction(root);
4217 BUG_ON(IS_ERR(trans)); 4265 if (IS_ERR(trans))
4266 return PTR_ERR(trans);
4218 4267
4219 ret = btrfs_update_inode(trans, root, inode); 4268 ret = btrfs_update_inode(trans, root, inode);
4220 if (ret && ret == -ENOSPC) { 4269 if (ret && ret == -ENOSPC) {
4221 /* whoops, lets try again with the full transaction */ 4270 /* whoops, lets try again with the full transaction */
4222 btrfs_end_transaction(trans, root); 4271 btrfs_end_transaction(trans, root);
4223 trans = btrfs_start_transaction(root, 1); 4272 trans = btrfs_start_transaction(root, 1);
4224 if (IS_ERR(trans)) { 4273 if (IS_ERR(trans))
4225 printk_ratelimited(KERN_ERR "btrfs: fail to " 4274 return PTR_ERR(trans);
4226 "dirty inode %llu error %ld\n",
4227 (unsigned long long)btrfs_ino(inode),
4228 PTR_ERR(trans));
4229 return;
4230 }
4231 4275
4232 ret = btrfs_update_inode(trans, root, inode); 4276 ret = btrfs_update_inode(trans, root, inode);
4233 if (ret) {
4234 printk_ratelimited(KERN_ERR "btrfs: fail to "
4235 "dirty inode %llu error %d\n",
4236 (unsigned long long)btrfs_ino(inode),
4237 ret);
4238 }
4239 } 4277 }
4240 btrfs_end_transaction(trans, root); 4278 btrfs_end_transaction(trans, root);
4241 if (BTRFS_I(inode)->delayed_node) 4279 if (BTRFS_I(inode)->delayed_node)
4242 btrfs_balance_delayed_items(root); 4280 btrfs_balance_delayed_items(root);
4281
4282 return ret;
4283}
4284
4285/*
4286 * This is a copy of file_update_time. We need this so we can return error on
4287 * ENOSPC for updating the inode in the case of file write and mmap writes.
4288 */
4289int btrfs_update_time(struct file *file)
4290{
4291 struct inode *inode = file->f_path.dentry->d_inode;
4292 struct timespec now;
4293 int ret;
4294 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4295
4296 /* First try to exhaust all avenues to not sync */
4297 if (IS_NOCMTIME(inode))
4298 return 0;
4299
4300 now = current_fs_time(inode->i_sb);
4301 if (!timespec_equal(&inode->i_mtime, &now))
4302 sync_it = S_MTIME;
4303
4304 if (!timespec_equal(&inode->i_ctime, &now))
4305 sync_it |= S_CTIME;
4306
4307 if (IS_I_VERSION(inode))
4308 sync_it |= S_VERSION;
4309
4310 if (!sync_it)
4311 return 0;
4312
4313 /* Finally allowed to write? Takes lock. */
4314 if (mnt_want_write_file(file))
4315 return 0;
4316
4317 /* Only change inode inside the lock region */
4318 if (sync_it & S_VERSION)
4319 inode_inc_iversion(inode);
4320 if (sync_it & S_CTIME)
4321 inode->i_ctime = now;
4322 if (sync_it & S_MTIME)
4323 inode->i_mtime = now;
4324 ret = btrfs_dirty_inode(inode);
4325 if (!ret)
4326 mark_inode_dirty_sync(inode);
4327 mnt_drop_write(file->f_path.mnt);
4328 return ret;
4243} 4329}
4244 4330
4245/* 4331/*
@@ -4555,11 +4641,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4555 goto out_unlock; 4641 goto out_unlock;
4556 } 4642 }
4557 4643
4644 /*
4645 * If the active LSM wants to access the inode during
4646 * d_instantiate it needs these. Smack checks to see
4647 * if the filesystem supports xattrs by looking at the
4648 * ops vector.
4649 */
4650
4651 inode->i_op = &btrfs_special_inode_operations;
4558 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4652 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4559 if (err) 4653 if (err)
4560 drop_inode = 1; 4654 drop_inode = 1;
4561 else { 4655 else {
4562 inode->i_op = &btrfs_special_inode_operations;
4563 init_special_inode(inode, inode->i_mode, rdev); 4656 init_special_inode(inode, inode->i_mode, rdev);
4564 btrfs_update_inode(trans, root, inode); 4657 btrfs_update_inode(trans, root, inode);
4565 } 4658 }
@@ -4613,14 +4706,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4613 goto out_unlock; 4706 goto out_unlock;
4614 } 4707 }
4615 4708
4709 /*
4710 * If the active LSM wants to access the inode during
4711 * d_instantiate it needs these. Smack checks to see
4712 * if the filesystem supports xattrs by looking at the
4713 * ops vector.
4714 */
4715 inode->i_fop = &btrfs_file_operations;
4716 inode->i_op = &btrfs_file_inode_operations;
4717
4616 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4718 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4617 if (err) 4719 if (err)
4618 drop_inode = 1; 4720 drop_inode = 1;
4619 else { 4721 else {
4620 inode->i_mapping->a_ops = &btrfs_aops; 4722 inode->i_mapping->a_ops = &btrfs_aops;
4621 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4723 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4622 inode->i_fop = &btrfs_file_operations;
4623 inode->i_op = &btrfs_file_inode_operations;
4624 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4724 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4625 } 4725 }
4626out_unlock: 4726out_unlock:
@@ -6303,7 +6403,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6303 u64 page_start; 6403 u64 page_start;
6304 u64 page_end; 6404 u64 page_end;
6305 6405
6406 /* Need this to keep space reservations serialized */
6407 mutex_lock(&inode->i_mutex);
6306 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6408 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6409 mutex_unlock(&inode->i_mutex);
6410 if (!ret)
6411 ret = btrfs_update_time(vma->vm_file);
6307 if (ret) { 6412 if (ret) {
6308 if (ret == -ENOMEM) 6413 if (ret == -ENOMEM)
6309 ret = VM_FAULT_OOM; 6414 ret = VM_FAULT_OOM;
@@ -6515,8 +6620,9 @@ static int btrfs_truncate(struct inode *inode)
6515 /* Just need the 1 for updating the inode */ 6620 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1); 6621 trans = btrfs_start_transaction(root, 1);
6517 if (IS_ERR(trans)) { 6622 if (IS_ERR(trans)) {
6518 err = PTR_ERR(trans); 6623 ret = err = PTR_ERR(trans);
6519 goto out; 6624 trans = NULL;
6625 break;
6520 } 6626 }
6521 } 6627 }
6522 6628
@@ -6794,11 +6900,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6794 struct dentry *dentry, struct kstat *stat) 6900 struct dentry *dentry, struct kstat *stat)
6795{ 6901{
6796 struct inode *inode = dentry->d_inode; 6902 struct inode *inode = dentry->d_inode;
6903 u32 blocksize = inode->i_sb->s_blocksize;
6904
6797 generic_fillattr(inode, stat); 6905 generic_fillattr(inode, stat);
6798 stat->dev = BTRFS_I(inode)->root->anon_dev; 6906 stat->dev = BTRFS_I(inode)->root->anon_dev;
6799 stat->blksize = PAGE_CACHE_SIZE; 6907 stat->blksize = PAGE_CACHE_SIZE;
6800 stat->blocks = (inode_get_bytes(inode) + 6908 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6801 BTRFS_I(inode)->delalloc_bytes) >> 9; 6909 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6802 return 0; 6910 return 0;
6803} 6911}
6804 6912
@@ -7074,14 +7182,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7074 goto out_unlock; 7182 goto out_unlock;
7075 } 7183 }
7076 7184
7185 /*
7186 * If the active LSM wants to access the inode during
7187 * d_instantiate it needs these. Smack checks to see
7188 * if the filesystem supports xattrs by looking at the
7189 * ops vector.
7190 */
7191 inode->i_fop = &btrfs_file_operations;
7192 inode->i_op = &btrfs_file_inode_operations;
7193
7077 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7194 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7078 if (err) 7195 if (err)
7079 drop_inode = 1; 7196 drop_inode = 1;
7080 else { 7197 else {
7081 inode->i_mapping->a_ops = &btrfs_aops; 7198 inode->i_mapping->a_ops = &btrfs_aops;
7082 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7199 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7083 inode->i_fop = &btrfs_file_operations;
7084 inode->i_op = &btrfs_file_inode_operations;
7085 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7200 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7086 } 7201 }
7087 if (drop_inode) 7202 if (drop_inode)
@@ -7351,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7351 .follow_link = page_follow_link_light, 7466 .follow_link = page_follow_link_light,
7352 .put_link = page_put_link, 7467 .put_link = page_put_link,
7353 .getattr = btrfs_getattr, 7468 .getattr = btrfs_getattr,
7469 .setattr = btrfs_setattr,
7354 .permission = btrfs_permission, 7470 .permission = btrfs_permission,
7355 .setxattr = btrfs_setxattr, 7471 .setxattr = btrfs_setxattr,
7356 .getxattr = btrfs_getxattr, 7472 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4a34c472f126..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
252 trans = btrfs_join_transaction(root); 252 trans = btrfs_join_transaction(root);
253 BUG_ON(IS_ERR(trans)); 253 BUG_ON(IS_ERR(trans));
254 254
255 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME;
255 ret = btrfs_update_inode(trans, root, inode); 257 ret = btrfs_update_inode(trans, root, inode);
256 BUG_ON(ret); 258 BUG_ON(ret);
257 259
258 btrfs_update_iflags(inode);
259 inode->i_ctime = CURRENT_TIME;
260 btrfs_end_transaction(trans, root); 260 btrfs_end_transaction(trans, root);
261 261
262 mnt_drop_write(file->f_path.mnt); 262 mnt_drop_write(file->f_path.mnt);
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 858 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 860
861 mutex_lock(&inode->i_mutex);
861 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
862 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
863 if (ret) 865 if (ret)
864 return ret; 866 return ret;
865again: 867again:
@@ -1216,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1216 *devstr = '\0'; 1218 *devstr = '\0';
1217 devstr = vol_args->name; 1219 devstr = vol_args->name;
1218 devid = simple_strtoull(devstr, &end, 10); 1220 devid = simple_strtoull(devstr, &end, 10);
1219 printk(KERN_INFO "resizing devid %llu\n", 1221 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1220 (unsigned long long)devid); 1222 (unsigned long long)devid);
1221 } 1223 }
1222 device = btrfs_find_device(root, devid, NULL, NULL); 1224 device = btrfs_find_device(root, devid, NULL, NULL);
1223 if (!device) { 1225 if (!device) {
1224 printk(KERN_INFO "resizer unable to find device %llu\n", 1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1225 (unsigned long long)devid); 1227 (unsigned long long)devid);
1226 ret = -EINVAL; 1228 ret = -EINVAL;
1227 goto out_unlock; 1229 goto out_unlock;
@@ -1267,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1267 do_div(new_size, root->sectorsize); 1269 do_div(new_size, root->sectorsize);
1268 new_size *= root->sectorsize; 1270 new_size *= root->sectorsize;
1269 1271
1270 printk(KERN_INFO "new size for %s is %llu\n", 1272 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1271 device->name, (unsigned long long)new_size); 1273 device->name, (unsigned long long)new_size);
1272 1274
1273 if (new_size > old_size) { 1275 if (new_size > old_size) {
@@ -1278,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1278 } 1280 }
1279 ret = btrfs_grow_device(trans, device, new_size); 1281 ret = btrfs_grow_device(trans, device, new_size);
1280 btrfs_commit_transaction(trans, root); 1282 btrfs_commit_transaction(trans, root);
1281 } else { 1283 } else if (new_size < old_size) {
1282 ret = btrfs_shrink_device(device, new_size); 1284 ret = btrfs_shrink_device(device, new_size);
1283 } 1285 }
1284 1286
@@ -2930,11 +2932,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2930 goto out; 2932 goto out;
2931 2933
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 2934 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; 2935 rel_ptr = ipath->fspath->val[i] -
2936 (u64)(unsigned long)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr; 2937 ipath->fspath->val[i] = rel_ptr;
2935 } 2938 }
2936 2939
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); 2940 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2941 (void *)(unsigned long)ipath->fspath, size);
2938 if (ret) { 2942 if (ret) {
2939 ret = -EFAULT; 2943 ret = -EFAULT;
2940 goto out; 2944 goto out;
@@ -3017,7 +3021,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3017 if (ret < 0) 3021 if (ret < 0)
3018 goto out; 3022 goto out;
3019 3023
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); 3024 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3025 (void *)(unsigned long)inodes, size);
3021 if (ret) 3026 if (ret)
3022 ret = -EFAULT; 3027 ret = -EFAULT;
3023 3028
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dff29d5e151a..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2950 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2951 if (ret) 2953 if (ret)
2952 goto out; 2954 goto out;
2953 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f4190f22edfb..ddf2c90d3fc0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
256 btrfs_release_path(swarn->path); 256 btrfs_release_path(swarn->path);
257 257
258 ipath = init_ipath(4096, local_root, swarn->path); 258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
259 ret = paths_from_inode(inum, ipath); 264 ret = paths_from_inode(inum, ipath);
260 265
261 if (ret < 0) 266 if (ret < 0)
@@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
272 swarn->logical, swarn->dev->name, 277 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset, 278 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink, 279 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]); 280 (char *)(unsigned long)ipath->fspath->val[i]);
276 281
277 free_ipath(ipath); 282 free_ipath(ipath);
278 return 0; 283 return 0;
@@ -1530,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1530static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 1535static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1531{ 1536{
1532 struct btrfs_fs_info *fs_info = root->fs_info; 1537 struct btrfs_fs_info *fs_info = root->fs_info;
1538 int ret = 0;
1533 1539
1534 mutex_lock(&fs_info->scrub_lock); 1540 mutex_lock(&fs_info->scrub_lock);
1535 if (fs_info->scrub_workers_refcnt == 0) { 1541 if (fs_info->scrub_workers_refcnt == 0) {
1536 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1537 fs_info->thread_pool_size, &fs_info->generic_worker); 1543 fs_info->thread_pool_size, &fs_info->generic_worker);
1538 fs_info->scrub_workers.idle_thresh = 4; 1544 fs_info->scrub_workers.idle_thresh = 4;
1539 btrfs_start_workers(&fs_info->scrub_workers, 1); 1545 ret = btrfs_start_workers(&fs_info->scrub_workers);
1546 if (ret)
1547 goto out;
1540 } 1548 }
1541 ++fs_info->scrub_workers_refcnt; 1549 ++fs_info->scrub_workers_refcnt;
1550out:
1542 mutex_unlock(&fs_info->scrub_lock); 1551 mutex_unlock(&fs_info->scrub_lock);
1543 1552
1544 return 0; 1553 return ret;
1545} 1554}
1546 1555
1547static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 1556static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8bd9d6d0e07a..200f63bc6675 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h> 43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
@@ -825,13 +826,9 @@ static char *setup_root_args(char *args)
825static struct dentry *mount_subvol(const char *subvol_name, int flags, 826static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data) 827 const char *device_name, char *data)
827{ 828{
828 struct super_block *s;
829 struct dentry *root; 829 struct dentry *root;
830 struct vfsmount *mnt; 830 struct vfsmount *mnt;
831 struct mnt_namespace *ns_private;
832 char *newargs; 831 char *newargs;
833 struct path path;
834 int error;
835 832
836 newargs = setup_root_args(data); 833 newargs = setup_root_args(data);
837 if (!newargs) 834 if (!newargs)
@@ -842,39 +839,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
842 if (IS_ERR(mnt)) 839 if (IS_ERR(mnt))
843 return ERR_CAST(mnt); 840 return ERR_CAST(mnt);
844 841
845 ns_private = create_mnt_ns(mnt); 842 root = mount_subtree(mnt, subvol_name);
846 if (IS_ERR(ns_private)) {
847 mntput(mnt);
848 return ERR_CAST(ns_private);
849 }
850 843
851 /* 844 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
852 * This will trigger the automount of the subvol so we can just 845 struct super_block *s = root->d_sb;
853 * drop the mnt we have here and return the dentry that we 846 dput(root);
854 * found. 847 root = ERR_PTR(-EINVAL);
855 */ 848 deactivate_locked_super(s);
856 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
857 LOOKUP_FOLLOW, &path);
858 put_mnt_ns(ns_private);
859 if (error)
860 return ERR_PTR(error);
861
862 if (!is_subvolume_inode(path.dentry->d_inode)) {
863 path_put(&path);
864 mntput(mnt);
865 error = -EINVAL;
866 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", 849 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
867 subvol_name); 850 subvol_name);
868 return ERR_PTR(-EINVAL);
869 } 851 }
870 852
871 /* Get a ref to the sb and the dentry we found and return it */
872 s = path.mnt->mnt_sb;
873 atomic_inc(&s->s_active);
874 root = dget(path.dentry);
875 path_put(&path);
876 down_write(&s->s_umount);
877
878 return root; 853 return root;
879} 854}
880 855
@@ -1079,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1079 u64 avail_space; 1054 u64 avail_space;
1080 u64 used_space; 1055 u64 used_space;
1081 u64 min_stripe_size; 1056 u64 min_stripe_size;
1082 int min_stripes = 1; 1057 int min_stripes = 1, num_stripes = 1;
1083 int i = 0, nr_devices; 1058 int i = 0, nr_devices;
1084 int ret; 1059 int ret;
1085 1060
1086 nr_devices = fs_info->fs_devices->rw_devices; 1061 nr_devices = fs_info->fs_devices->open_devices;
1087 BUG_ON(!nr_devices); 1062 BUG_ON(!nr_devices);
1088 1063
1089 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1064 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1093,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1093 1068
1094 /* calc min stripe number for data space alloction */ 1069 /* calc min stripe number for data space alloction */
1095 type = btrfs_get_alloc_profile(root, 1); 1070 type = btrfs_get_alloc_profile(root, 1);
1096 if (type & BTRFS_BLOCK_GROUP_RAID0) 1071 if (type & BTRFS_BLOCK_GROUP_RAID0) {
1097 min_stripes = 2; 1072 min_stripes = 2;
1098 else if (type & BTRFS_BLOCK_GROUP_RAID1) 1073 num_stripes = nr_devices;
1074 } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
1099 min_stripes = 2; 1075 min_stripes = 2;
1100 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1076 num_stripes = 2;
1077 } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
1101 min_stripes = 4; 1078 min_stripes = 4;
1079 num_stripes = 4;
1080 }
1102 1081
1103 if (type & BTRFS_BLOCK_GROUP_DUP) 1082 if (type & BTRFS_BLOCK_GROUP_DUP)
1104 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1083 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1105 else 1084 else
1106 min_stripe_size = BTRFS_STRIPE_LEN; 1085 min_stripe_size = BTRFS_STRIPE_LEN;
1107 1086
1108 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1087 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1109 if (!device->in_fs_metadata) 1088 if (!device->in_fs_metadata || !device->bdev)
1110 continue; 1089 continue;
1111 1090
1112 avail_space = device->total_bytes - device->bytes_used; 1091 avail_space = device->total_bytes - device->bytes_used;
@@ -1167,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1167 i = nr_devices - 1; 1146 i = nr_devices - 1;
1168 avail_space = 0; 1147 avail_space = 0;
1169 while (nr_devices >= min_stripes) { 1148 while (nr_devices >= min_stripes) {
1149 if (num_stripes > nr_devices)
1150 num_stripes = nr_devices;
1151
1170 if (devices_info[i].max_avail >= min_stripe_size) { 1152 if (devices_info[i].max_avail >= min_stripe_size) {
1171 int j; 1153 int j;
1172 u64 alloc_size; 1154 u64 alloc_size;
1173 1155
1174 avail_space += devices_info[i].max_avail * min_stripes; 1156 avail_space += devices_info[i].max_avail * num_stripes;
1175 alloc_size = devices_info[i].max_avail; 1157 alloc_size = devices_info[i].max_avail;
1176 for (j = i + 1 - min_stripes; j <= i; j++) 1158 for (j = i + 1 - num_stripes; j <= i; j++)
1177 devices_info[j].max_avail -= alloc_size; 1159 devices_info[j].max_avail -= alloc_size;
1178 } 1160 }
1179 i--; 1161 i--;
@@ -1290,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb)
1290 return 0; 1272 return 0;
1291} 1273}
1292 1274
1275static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1276{
1277 int ret;
1278
1279 ret = btrfs_dirty_inode(inode);
1280 if (ret)
1281 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1282 "error %d\n", btrfs_ino(inode), ret);
1283}
1284
1293static const struct super_operations btrfs_super_ops = { 1285static const struct super_operations btrfs_super_ops = {
1294 .drop_inode = btrfs_drop_inode, 1286 .drop_inode = btrfs_drop_inode,
1295 .evict_inode = btrfs_evict_inode, 1287 .evict_inode = btrfs_evict_inode,
@@ -1297,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = {
1297 .sync_fs = btrfs_sync_fs, 1289 .sync_fs = btrfs_sync_fs,
1298 .show_options = btrfs_show_options, 1290 .show_options = btrfs_show_options,
1299 .write_inode = btrfs_write_inode, 1291 .write_inode = btrfs_write_inode,
1300 .dirty_inode = btrfs_dirty_inode, 1292 .dirty_inode = btrfs_fs_dirty_inode,
1301 .alloc_inode = btrfs_alloc_inode, 1293 .alloc_inode = btrfs_alloc_inode,
1302 .destroy_inode = btrfs_destroy_inode, 1294 .destroy_inode = btrfs_destroy_inode,
1303 .statfs = btrfs_statfs, 1295 .statfs = btrfs_statfs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6a0574e923bc..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
785 785
786 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
787 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
788 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
789 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
790 switch_commit_root(root); 794 switch_commit_root(root);
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
947 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
948 free_extent_buffer(old); 952 free_extent_buffer(old);
949 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
950 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
951 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
952 key.offset = trans->transid; 960 key.offset = trans->transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c37433d3cd82..f4b839fd3c9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
295 btrfs_requeue_work(&device->work); 295 btrfs_requeue_work(&device->work);
296 goto done; 296 goto done;
297 } 297 }
298 /* unplug every 64 requests just for good measure */
299 if (batch_run % 64 == 0) {
300 blk_finish_plug(&plug);
301 blk_start_plug(&plug);
302 sync_pending = 0;
303 }
298 } 304 }
299 305
300 cond_resched(); 306 cond_resched();
@@ -1611,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1611 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1617 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1612 return -EINVAL; 1618 return -EINVAL;
1613 1619
1614 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1620 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1615 root->fs_info->bdev_holder); 1621 root->fs_info->bdev_holder);
1616 if (IS_ERR(bdev)) 1622 if (IS_ERR(bdev))
1617 return PTR_ERR(bdev); 1623 return PTR_ERR(bdev);
@@ -3258,7 +3264,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
3258 */ 3264 */
3259 if (atomic_read(&bbio->error) > bbio->max_errors) { 3265 if (atomic_read(&bbio->error) > bbio->max_errors) {
3260 err = -EIO; 3266 err = -EIO;
3261 } else if (err) { 3267 } else {
3262 /* 3268 /*
3263 * this bio is actually up to date, we didn't 3269 * this bio is actually up to date, we didn't
3264 * go over the max number of errors 3270 * go over the max number of errors
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f352..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
100 struct reada_zone *reada_curr_zone; 100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones; 101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents; 102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
103}; 109};
104 110
105struct btrfs_fs_devices { 111struct btrfs_fs_devices {
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4144caf2f9d3..173b1d22e59b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
87 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); 87 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
88 88
89 /* dirty the head */ 89 /* dirty the head */
90 spin_lock(&inode->i_lock); 90 spin_lock(&ci->i_ceph_lock);
91 if (ci->i_head_snapc == NULL) 91 if (ci->i_head_snapc == NULL)
92 ci->i_head_snapc = ceph_get_snap_context(snapc); 92 ci->i_head_snapc = ceph_get_snap_context(snapc);
93 ++ci->i_wrbuffer_ref_head; 93 ++ci->i_wrbuffer_ref_head;
@@ -100,7 +100,7 @@ static int ceph_set_page_dirty(struct page *page)
100 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, 100 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
101 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, 101 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
102 snapc, snapc->seq, snapc->num_snaps); 102 snapc, snapc->seq, snapc->num_snaps);
103 spin_unlock(&inode->i_lock); 103 spin_unlock(&ci->i_ceph_lock);
104 104
105 /* now adjust page */ 105 /* now adjust page */
106 spin_lock_irq(&mapping->tree_lock); 106 spin_lock_irq(&mapping->tree_lock);
@@ -391,7 +391,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
391 struct ceph_snap_context *snapc = NULL; 391 struct ceph_snap_context *snapc = NULL;
392 struct ceph_cap_snap *capsnap = NULL; 392 struct ceph_cap_snap *capsnap = NULL;
393 393
394 spin_lock(&inode->i_lock); 394 spin_lock(&ci->i_ceph_lock);
395 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 395 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
396 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 396 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
397 capsnap->context, capsnap->dirty_pages); 397 capsnap->context, capsnap->dirty_pages);
@@ -407,7 +407,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
407 dout(" head snapc %p has %d dirty pages\n", 407 dout(" head snapc %p has %d dirty pages\n",
408 snapc, ci->i_wrbuffer_ref_head); 408 snapc, ci->i_wrbuffer_ref_head);
409 } 409 }
410 spin_unlock(&inode->i_lock); 410 spin_unlock(&ci->i_ceph_lock);
411 return snapc; 411 return snapc;
412} 412}
413 413
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0f327c6c9679..8b53193e4f7c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -309,7 +309,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
309/* 309/*
310 * Find ceph_cap for given mds, if any. 310 * Find ceph_cap for given mds, if any.
311 * 311 *
312 * Called with i_lock held. 312 * Called with i_ceph_lock held.
313 */ 313 */
314static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) 314static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
315{ 315{
@@ -332,9 +332,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
332{ 332{
333 struct ceph_cap *cap; 333 struct ceph_cap *cap;
334 334
335 spin_lock(&ci->vfs_inode.i_lock); 335 spin_lock(&ci->i_ceph_lock);
336 cap = __get_cap_for_mds(ci, mds); 336 cap = __get_cap_for_mds(ci, mds);
337 spin_unlock(&ci->vfs_inode.i_lock); 337 spin_unlock(&ci->i_ceph_lock);
338 return cap; 338 return cap;
339} 339}
340 340
@@ -361,15 +361,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
361 361
362int ceph_get_cap_mds(struct inode *inode) 362int ceph_get_cap_mds(struct inode *inode)
363{ 363{
364 struct ceph_inode_info *ci = ceph_inode(inode);
364 int mds; 365 int mds;
365 spin_lock(&inode->i_lock); 366 spin_lock(&ci->i_ceph_lock);
366 mds = __ceph_get_cap_mds(ceph_inode(inode)); 367 mds = __ceph_get_cap_mds(ceph_inode(inode));
367 spin_unlock(&inode->i_lock); 368 spin_unlock(&ci->i_ceph_lock);
368 return mds; 369 return mds;
369} 370}
370 371
371/* 372/*
372 * Called under i_lock. 373 * Called under i_ceph_lock.
373 */ 374 */
374static void __insert_cap_node(struct ceph_inode_info *ci, 375static void __insert_cap_node(struct ceph_inode_info *ci,
375 struct ceph_cap *new) 376 struct ceph_cap *new)
@@ -415,7 +416,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
415 * 416 *
416 * If I_FLUSH is set, leave the inode at the front of the list. 417 * If I_FLUSH is set, leave the inode at the front of the list.
417 * 418 *
418 * Caller holds i_lock 419 * Caller holds i_ceph_lock
419 * -> we take mdsc->cap_delay_lock 420 * -> we take mdsc->cap_delay_lock
420 */ 421 */
421static void __cap_delay_requeue(struct ceph_mds_client *mdsc, 422static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
@@ -457,7 +458,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
457/* 458/*
458 * Cancel delayed work on cap. 459 * Cancel delayed work on cap.
459 * 460 *
460 * Caller must hold i_lock. 461 * Caller must hold i_ceph_lock.
461 */ 462 */
462static void __cap_delay_cancel(struct ceph_mds_client *mdsc, 463static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
463 struct ceph_inode_info *ci) 464 struct ceph_inode_info *ci)
@@ -532,14 +533,14 @@ int ceph_add_cap(struct inode *inode,
532 wanted |= ceph_caps_for_mode(fmode); 533 wanted |= ceph_caps_for_mode(fmode);
533 534
534retry: 535retry:
535 spin_lock(&inode->i_lock); 536 spin_lock(&ci->i_ceph_lock);
536 cap = __get_cap_for_mds(ci, mds); 537 cap = __get_cap_for_mds(ci, mds);
537 if (!cap) { 538 if (!cap) {
538 if (new_cap) { 539 if (new_cap) {
539 cap = new_cap; 540 cap = new_cap;
540 new_cap = NULL; 541 new_cap = NULL;
541 } else { 542 } else {
542 spin_unlock(&inode->i_lock); 543 spin_unlock(&ci->i_ceph_lock);
543 new_cap = get_cap(mdsc, caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
544 if (new_cap == NULL) 545 if (new_cap == NULL)
545 return -ENOMEM; 546 return -ENOMEM;
@@ -625,7 +626,7 @@ retry:
625 626
626 if (fmode >= 0) 627 if (fmode >= 0)
627 __ceph_get_fmode(ci, fmode); 628 __ceph_get_fmode(ci, fmode);
628 spin_unlock(&inode->i_lock); 629 spin_unlock(&ci->i_ceph_lock);
629 wake_up_all(&ci->i_cap_wq); 630 wake_up_all(&ci->i_cap_wq);
630 return 0; 631 return 0;
631} 632}
@@ -792,7 +793,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
792 struct rb_node *p; 793 struct rb_node *p;
793 int ret = 0; 794 int ret = 0;
794 795
795 spin_lock(&inode->i_lock); 796 spin_lock(&ci->i_ceph_lock);
796 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 797 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
797 cap = rb_entry(p, struct ceph_cap, ci_node); 798 cap = rb_entry(p, struct ceph_cap, ci_node);
798 if (__cap_is_valid(cap) && 799 if (__cap_is_valid(cap) &&
@@ -801,7 +802,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
801 break; 802 break;
802 } 803 }
803 } 804 }
804 spin_unlock(&inode->i_lock); 805 spin_unlock(&ci->i_ceph_lock);
805 dout("ceph_caps_revoking %p %s = %d\n", inode, 806 dout("ceph_caps_revoking %p %s = %d\n", inode,
806 ceph_cap_string(mask), ret); 807 ceph_cap_string(mask), ret);
807 return ret; 808 return ret;
@@ -855,7 +856,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
855} 856}
856 857
857/* 858/*
858 * called under i_lock 859 * called under i_ceph_lock
859 */ 860 */
860static int __ceph_is_any_caps(struct ceph_inode_info *ci) 861static int __ceph_is_any_caps(struct ceph_inode_info *ci)
861{ 862{
@@ -865,7 +866,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
865/* 866/*
866 * Remove a cap. Take steps to deal with a racing iterate_session_caps. 867 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
867 * 868 *
868 * caller should hold i_lock. 869 * caller should hold i_ceph_lock.
869 * caller will not hold session s_mutex if called from destroy_inode. 870 * caller will not hold session s_mutex if called from destroy_inode.
870 */ 871 */
871void __ceph_remove_cap(struct ceph_cap *cap) 872void __ceph_remove_cap(struct ceph_cap *cap)
@@ -1028,7 +1029,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
1028 1029
1029/* 1030/*
1030 * Queue cap releases when an inode is dropped from our cache. Since 1031 * Queue cap releases when an inode is dropped from our cache. Since
1031 * inode is about to be destroyed, there is no need for i_lock. 1032 * inode is about to be destroyed, there is no need for i_ceph_lock.
1032 */ 1033 */
1033void ceph_queue_caps_release(struct inode *inode) 1034void ceph_queue_caps_release(struct inode *inode)
1034{ 1035{
@@ -1049,7 +1050,7 @@ void ceph_queue_caps_release(struct inode *inode)
1049 1050
1050/* 1051/*
1051 * Send a cap msg on the given inode. Update our caps state, then 1052 * Send a cap msg on the given inode. Update our caps state, then
1052 * drop i_lock and send the message. 1053 * drop i_ceph_lock and send the message.
1053 * 1054 *
1054 * Make note of max_size reported/requested from mds, revoked caps 1055 * Make note of max_size reported/requested from mds, revoked caps
1055 * that have now been implemented. 1056 * that have now been implemented.
@@ -1061,13 +1062,13 @@ void ceph_queue_caps_release(struct inode *inode)
1061 * Return non-zero if delayed release, or we experienced an error 1062 * Return non-zero if delayed release, or we experienced an error
1062 * such that the caller should requeue + retry later. 1063 * such that the caller should requeue + retry later.
1063 * 1064 *
1064 * called with i_lock, then drops it. 1065 * called with i_ceph_lock, then drops it.
1065 * caller should hold snap_rwsem (read), s_mutex. 1066 * caller should hold snap_rwsem (read), s_mutex.
1066 */ 1067 */
1067static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, 1068static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1068 int op, int used, int want, int retain, int flushing, 1069 int op, int used, int want, int retain, int flushing,
1069 unsigned *pflush_tid) 1070 unsigned *pflush_tid)
1070 __releases(cap->ci->vfs_inode->i_lock) 1071 __releases(cap->ci->i_ceph_lock)
1071{ 1072{
1072 struct ceph_inode_info *ci = cap->ci; 1073 struct ceph_inode_info *ci = cap->ci;
1073 struct inode *inode = &ci->vfs_inode; 1074 struct inode *inode = &ci->vfs_inode;
@@ -1170,7 +1171,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1170 xattr_version = ci->i_xattrs.version; 1171 xattr_version = ci->i_xattrs.version;
1171 } 1172 }
1172 1173
1173 spin_unlock(&inode->i_lock); 1174 spin_unlock(&ci->i_ceph_lock);
1174 1175
1175 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1176 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1176 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1177 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
@@ -1198,13 +1199,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1198 * Unless @again is true, skip cap_snaps that were already sent to 1199 * Unless @again is true, skip cap_snaps that were already sent to
1199 * the MDS (i.e., during this session). 1200 * the MDS (i.e., during this session).
1200 * 1201 *
1201 * Called under i_lock. Takes s_mutex as needed. 1202 * Called under i_ceph_lock. Takes s_mutex as needed.
1202 */ 1203 */
1203void __ceph_flush_snaps(struct ceph_inode_info *ci, 1204void __ceph_flush_snaps(struct ceph_inode_info *ci,
1204 struct ceph_mds_session **psession, 1205 struct ceph_mds_session **psession,
1205 int again) 1206 int again)
1206 __releases(ci->vfs_inode->i_lock) 1207 __releases(ci->i_ceph_lock)
1207 __acquires(ci->vfs_inode->i_lock) 1208 __acquires(ci->i_ceph_lock)
1208{ 1209{
1209 struct inode *inode = &ci->vfs_inode; 1210 struct inode *inode = &ci->vfs_inode;
1210 int mds; 1211 int mds;
@@ -1261,7 +1262,7 @@ retry:
1261 session = NULL; 1262 session = NULL;
1262 } 1263 }
1263 if (!session) { 1264 if (!session) {
1264 spin_unlock(&inode->i_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1265 mutex_lock(&mdsc->mutex); 1266 mutex_lock(&mdsc->mutex);
1266 session = __ceph_lookup_mds_session(mdsc, mds); 1267 session = __ceph_lookup_mds_session(mdsc, mds);
1267 mutex_unlock(&mdsc->mutex); 1268 mutex_unlock(&mdsc->mutex);
@@ -1275,7 +1276,7 @@ retry:
1275 * deletion or migration. retry, and we'll 1276 * deletion or migration. retry, and we'll
1276 * get a better @mds value next time. 1277 * get a better @mds value next time.
1277 */ 1278 */
1278 spin_lock(&inode->i_lock); 1279 spin_lock(&ci->i_ceph_lock);
1279 goto retry; 1280 goto retry;
1280 } 1281 }
1281 1282
@@ -1285,7 +1286,7 @@ retry:
1285 list_del_init(&capsnap->flushing_item); 1286 list_del_init(&capsnap->flushing_item);
1286 list_add_tail(&capsnap->flushing_item, 1287 list_add_tail(&capsnap->flushing_item,
1287 &session->s_cap_snaps_flushing); 1288 &session->s_cap_snaps_flushing);
1288 spin_unlock(&inode->i_lock); 1289 spin_unlock(&ci->i_ceph_lock);
1289 1290
1290 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", 1291 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1291 inode, capsnap, capsnap->follows, capsnap->flush_tid); 1292 inode, capsnap, capsnap->follows, capsnap->flush_tid);
@@ -1302,7 +1303,7 @@ retry:
1302 next_follows = capsnap->follows + 1; 1303 next_follows = capsnap->follows + 1;
1303 ceph_put_cap_snap(capsnap); 1304 ceph_put_cap_snap(capsnap);
1304 1305
1305 spin_lock(&inode->i_lock); 1306 spin_lock(&ci->i_ceph_lock);
1306 goto retry; 1307 goto retry;
1307 } 1308 }
1308 1309
@@ -1322,11 +1323,9 @@ out:
1322 1323
1323static void ceph_flush_snaps(struct ceph_inode_info *ci) 1324static void ceph_flush_snaps(struct ceph_inode_info *ci)
1324{ 1325{
1325 struct inode *inode = &ci->vfs_inode; 1326 spin_lock(&ci->i_ceph_lock);
1326
1327 spin_lock(&inode->i_lock);
1328 __ceph_flush_snaps(ci, NULL, 0); 1327 __ceph_flush_snaps(ci, NULL, 0);
1329 spin_unlock(&inode->i_lock); 1328 spin_unlock(&ci->i_ceph_lock);
1330} 1329}
1331 1330
1332/* 1331/*
@@ -1373,7 +1372,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1373 * Add dirty inode to the flushing list. Assigned a seq number so we 1372 * Add dirty inode to the flushing list. Assigned a seq number so we
1374 * can wait for caps to flush without starving. 1373 * can wait for caps to flush without starving.
1375 * 1374 *
1376 * Called under i_lock. 1375 * Called under i_ceph_lock.
1377 */ 1376 */
1378static int __mark_caps_flushing(struct inode *inode, 1377static int __mark_caps_flushing(struct inode *inode,
1379 struct ceph_mds_session *session) 1378 struct ceph_mds_session *session)
@@ -1421,9 +1420,9 @@ static int try_nonblocking_invalidate(struct inode *inode)
1421 struct ceph_inode_info *ci = ceph_inode(inode); 1420 struct ceph_inode_info *ci = ceph_inode(inode);
1422 u32 invalidating_gen = ci->i_rdcache_gen; 1421 u32 invalidating_gen = ci->i_rdcache_gen;
1423 1422
1424 spin_unlock(&inode->i_lock); 1423 spin_unlock(&ci->i_ceph_lock);
1425 invalidate_mapping_pages(&inode->i_data, 0, -1); 1424 invalidate_mapping_pages(&inode->i_data, 0, -1);
1426 spin_lock(&inode->i_lock); 1425 spin_lock(&ci->i_ceph_lock);
1427 1426
1428 if (inode->i_data.nrpages == 0 && 1427 if (inode->i_data.nrpages == 0 &&
1429 invalidating_gen == ci->i_rdcache_gen) { 1428 invalidating_gen == ci->i_rdcache_gen) {
@@ -1470,7 +1469,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1470 if (mdsc->stopping) 1469 if (mdsc->stopping)
1471 is_delayed = 1; 1470 is_delayed = 1;
1472 1471
1473 spin_lock(&inode->i_lock); 1472 spin_lock(&ci->i_ceph_lock);
1474 1473
1475 if (ci->i_ceph_flags & CEPH_I_FLUSH) 1474 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1476 flags |= CHECK_CAPS_FLUSH; 1475 flags |= CHECK_CAPS_FLUSH;
@@ -1480,7 +1479,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1480 __ceph_flush_snaps(ci, &session, 0); 1479 __ceph_flush_snaps(ci, &session, 0);
1481 goto retry_locked; 1480 goto retry_locked;
1482retry: 1481retry:
1483 spin_lock(&inode->i_lock); 1482 spin_lock(&ci->i_ceph_lock);
1484retry_locked: 1483retry_locked:
1485 file_wanted = __ceph_caps_file_wanted(ci); 1484 file_wanted = __ceph_caps_file_wanted(ci);
1486 used = __ceph_caps_used(ci); 1485 used = __ceph_caps_used(ci);
@@ -1634,7 +1633,7 @@ ack:
1634 if (mutex_trylock(&session->s_mutex) == 0) { 1633 if (mutex_trylock(&session->s_mutex) == 0) {
1635 dout("inverting session/ino locks on %p\n", 1634 dout("inverting session/ino locks on %p\n",
1636 session); 1635 session);
1637 spin_unlock(&inode->i_lock); 1636 spin_unlock(&ci->i_ceph_lock);
1638 if (took_snap_rwsem) { 1637 if (took_snap_rwsem) {
1639 up_read(&mdsc->snap_rwsem); 1638 up_read(&mdsc->snap_rwsem);
1640 took_snap_rwsem = 0; 1639 took_snap_rwsem = 0;
@@ -1648,7 +1647,7 @@ ack:
1648 if (down_read_trylock(&mdsc->snap_rwsem) == 0) { 1647 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1649 dout("inverting snap/in locks on %p\n", 1648 dout("inverting snap/in locks on %p\n",
1650 inode); 1649 inode);
1651 spin_unlock(&inode->i_lock); 1650 spin_unlock(&ci->i_ceph_lock);
1652 down_read(&mdsc->snap_rwsem); 1651 down_read(&mdsc->snap_rwsem);
1653 took_snap_rwsem = 1; 1652 took_snap_rwsem = 1;
1654 goto retry; 1653 goto retry;
@@ -1664,10 +1663,10 @@ ack:
1664 mds = cap->mds; /* remember mds, so we don't repeat */ 1663 mds = cap->mds; /* remember mds, so we don't repeat */
1665 sent++; 1664 sent++;
1666 1665
1667 /* __send_cap drops i_lock */ 1666 /* __send_cap drops i_ceph_lock */
1668 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, 1667 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1669 retain, flushing, NULL); 1668 retain, flushing, NULL);
1670 goto retry; /* retake i_lock and restart our cap scan. */ 1669 goto retry; /* retake i_ceph_lock and restart our cap scan. */
1671 } 1670 }
1672 1671
1673 /* 1672 /*
@@ -1681,7 +1680,7 @@ ack:
1681 else if (!is_delayed || force_requeue) 1680 else if (!is_delayed || force_requeue)
1682 __cap_delay_requeue(mdsc, ci); 1681 __cap_delay_requeue(mdsc, ci);
1683 1682
1684 spin_unlock(&inode->i_lock); 1683 spin_unlock(&ci->i_ceph_lock);
1685 1684
1686 if (queue_invalidate) 1685 if (queue_invalidate)
1687 ceph_queue_invalidate(inode); 1686 ceph_queue_invalidate(inode);
@@ -1704,7 +1703,7 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1704 int flushing = 0; 1703 int flushing = 0;
1705 1704
1706retry: 1705retry:
1707 spin_lock(&inode->i_lock); 1706 spin_lock(&ci->i_ceph_lock);
1708 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 1707 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1709 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); 1708 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1710 goto out; 1709 goto out;
@@ -1716,7 +1715,7 @@ retry:
1716 int delayed; 1715 int delayed;
1717 1716
1718 if (!session) { 1717 if (!session) {
1719 spin_unlock(&inode->i_lock); 1718 spin_unlock(&ci->i_ceph_lock);
1720 session = cap->session; 1719 session = cap->session;
1721 mutex_lock(&session->s_mutex); 1720 mutex_lock(&session->s_mutex);
1722 goto retry; 1721 goto retry;
@@ -1727,18 +1726,18 @@ retry:
1727 1726
1728 flushing = __mark_caps_flushing(inode, session); 1727 flushing = __mark_caps_flushing(inode, session);
1729 1728
1730 /* __send_cap drops i_lock */ 1729 /* __send_cap drops i_ceph_lock */
1731 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, 1730 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1732 cap->issued | cap->implemented, flushing, 1731 cap->issued | cap->implemented, flushing,
1733 flush_tid); 1732 flush_tid);
1734 if (!delayed) 1733 if (!delayed)
1735 goto out_unlocked; 1734 goto out_unlocked;
1736 1735
1737 spin_lock(&inode->i_lock); 1736 spin_lock(&ci->i_ceph_lock);
1738 __cap_delay_requeue(mdsc, ci); 1737 __cap_delay_requeue(mdsc, ci);
1739 } 1738 }
1740out: 1739out:
1741 spin_unlock(&inode->i_lock); 1740 spin_unlock(&ci->i_ceph_lock);
1742out_unlocked: 1741out_unlocked:
1743 if (session && unlock_session) 1742 if (session && unlock_session)
1744 mutex_unlock(&session->s_mutex); 1743 mutex_unlock(&session->s_mutex);
@@ -1753,7 +1752,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)
1753 struct ceph_inode_info *ci = ceph_inode(inode); 1752 struct ceph_inode_info *ci = ceph_inode(inode);
1754 int i, ret = 1; 1753 int i, ret = 1;
1755 1754
1756 spin_lock(&inode->i_lock); 1755 spin_lock(&ci->i_ceph_lock);
1757 for (i = 0; i < CEPH_CAP_BITS; i++) 1756 for (i = 0; i < CEPH_CAP_BITS; i++)
1758 if ((ci->i_flushing_caps & (1 << i)) && 1757 if ((ci->i_flushing_caps & (1 << i)) &&
1759 ci->i_cap_flush_tid[i] <= tid) { 1758 ci->i_cap_flush_tid[i] <= tid) {
@@ -1761,7 +1760,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)
1761 ret = 0; 1760 ret = 0;
1762 break; 1761 break;
1763 } 1762 }
1764 spin_unlock(&inode->i_lock); 1763 spin_unlock(&ci->i_ceph_lock);
1765 return ret; 1764 return ret;
1766} 1765}
1767 1766
@@ -1868,10 +1867,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1868 struct ceph_mds_client *mdsc = 1867 struct ceph_mds_client *mdsc =
1869 ceph_sb_to_client(inode->i_sb)->mdsc; 1868 ceph_sb_to_client(inode->i_sb)->mdsc;
1870 1869
1871 spin_lock(&inode->i_lock); 1870 spin_lock(&ci->i_ceph_lock);
1872 if (__ceph_caps_dirty(ci)) 1871 if (__ceph_caps_dirty(ci))
1873 __cap_delay_requeue_front(mdsc, ci); 1872 __cap_delay_requeue_front(mdsc, ci);
1874 spin_unlock(&inode->i_lock); 1873 spin_unlock(&ci->i_ceph_lock);
1875 } 1874 }
1876 return err; 1875 return err;
1877} 1876}
@@ -1894,7 +1893,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1894 struct inode *inode = &ci->vfs_inode; 1893 struct inode *inode = &ci->vfs_inode;
1895 struct ceph_cap *cap; 1894 struct ceph_cap *cap;
1896 1895
1897 spin_lock(&inode->i_lock); 1896 spin_lock(&ci->i_ceph_lock);
1898 cap = ci->i_auth_cap; 1897 cap = ci->i_auth_cap;
1899 if (cap && cap->session == session) { 1898 if (cap && cap->session == session) {
1900 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, 1899 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
@@ -1904,7 +1903,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1904 pr_err("%p auth cap %p not mds%d ???\n", inode, 1903 pr_err("%p auth cap %p not mds%d ???\n", inode,
1905 cap, session->s_mds); 1904 cap, session->s_mds);
1906 } 1905 }
1907 spin_unlock(&inode->i_lock); 1906 spin_unlock(&ci->i_ceph_lock);
1908 } 1907 }
1909} 1908}
1910 1909
@@ -1921,7 +1920,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1921 struct ceph_cap *cap; 1920 struct ceph_cap *cap;
1922 int delayed = 0; 1921 int delayed = 0;
1923 1922
1924 spin_lock(&inode->i_lock); 1923 spin_lock(&ci->i_ceph_lock);
1925 cap = ci->i_auth_cap; 1924 cap = ci->i_auth_cap;
1926 if (cap && cap->session == session) { 1925 if (cap && cap->session == session) {
1927 dout("kick_flushing_caps %p cap %p %s\n", inode, 1926 dout("kick_flushing_caps %p cap %p %s\n", inode,
@@ -1932,14 +1931,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1932 cap->issued | cap->implemented, 1931 cap->issued | cap->implemented,
1933 ci->i_flushing_caps, NULL); 1932 ci->i_flushing_caps, NULL);
1934 if (delayed) { 1933 if (delayed) {
1935 spin_lock(&inode->i_lock); 1934 spin_lock(&ci->i_ceph_lock);
1936 __cap_delay_requeue(mdsc, ci); 1935 __cap_delay_requeue(mdsc, ci);
1937 spin_unlock(&inode->i_lock); 1936 spin_unlock(&ci->i_ceph_lock);
1938 } 1937 }
1939 } else { 1938 } else {
1940 pr_err("%p auth cap %p not mds%d ???\n", inode, 1939 pr_err("%p auth cap %p not mds%d ???\n", inode,
1941 cap, session->s_mds); 1940 cap, session->s_mds);
1942 spin_unlock(&inode->i_lock); 1941 spin_unlock(&ci->i_ceph_lock);
1943 } 1942 }
1944 } 1943 }
1945} 1944}
@@ -1952,7 +1951,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1952 struct ceph_cap *cap; 1951 struct ceph_cap *cap;
1953 int delayed = 0; 1952 int delayed = 0;
1954 1953
1955 spin_lock(&inode->i_lock); 1954 spin_lock(&ci->i_ceph_lock);
1956 cap = ci->i_auth_cap; 1955 cap = ci->i_auth_cap;
1957 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1958 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
@@ -1964,12 +1963,12 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1964 cap->issued | cap->implemented, 1963 cap->issued | cap->implemented,
1965 ci->i_flushing_caps, NULL); 1964 ci->i_flushing_caps, NULL);
1966 if (delayed) { 1965 if (delayed) {
1967 spin_lock(&inode->i_lock); 1966 spin_lock(&ci->i_ceph_lock);
1968 __cap_delay_requeue(mdsc, ci); 1967 __cap_delay_requeue(mdsc, ci);
1969 spin_unlock(&inode->i_lock); 1968 spin_unlock(&ci->i_ceph_lock);
1970 } 1969 }
1971 } else { 1970 } else {
1972 spin_unlock(&inode->i_lock); 1971 spin_unlock(&ci->i_ceph_lock);
1973 } 1972 }
1974} 1973}
1975 1974
@@ -1978,7 +1977,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1978 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
1979 * them to the MDS prematurely. 1978 * them to the MDS prematurely.
1980 * 1979 *
1981 * Protected by i_lock. 1980 * Protected by i_ceph_lock.
1982 */ 1981 */
1983static void __take_cap_refs(struct ceph_inode_info *ci, int got) 1982static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1984{ 1983{
@@ -2016,7 +2015,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2016 2015
2017 dout("get_cap_refs %p need %s want %s\n", inode, 2016 dout("get_cap_refs %p need %s want %s\n", inode,
2018 ceph_cap_string(need), ceph_cap_string(want)); 2017 ceph_cap_string(need), ceph_cap_string(want));
2019 spin_lock(&inode->i_lock); 2018 spin_lock(&ci->i_ceph_lock);
2020 2019
2021 /* make sure file is actually open */ 2020 /* make sure file is actually open */
2022 file_wanted = __ceph_caps_file_wanted(ci); 2021 file_wanted = __ceph_caps_file_wanted(ci);
@@ -2077,7 +2076,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2077 ceph_cap_string(have), ceph_cap_string(need)); 2076 ceph_cap_string(have), ceph_cap_string(need));
2078 } 2077 }
2079out: 2078out:
2080 spin_unlock(&inode->i_lock); 2079 spin_unlock(&ci->i_ceph_lock);
2081 dout("get_cap_refs %p ret %d got %s\n", inode, 2080 dout("get_cap_refs %p ret %d got %s\n", inode,
2082 ret, ceph_cap_string(*got)); 2081 ret, ceph_cap_string(*got));
2083 return ret; 2082 return ret;
@@ -2094,7 +2093,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2094 int check = 0; 2093 int check = 0;
2095 2094
2096 /* do we need to explicitly request a larger max_size? */ 2095 /* do we need to explicitly request a larger max_size? */
2097 spin_lock(&inode->i_lock); 2096 spin_lock(&ci->i_ceph_lock);
2098 if ((endoff >= ci->i_max_size || 2097 if ((endoff >= ci->i_max_size ||
2099 endoff > (inode->i_size << 1)) && 2098 endoff > (inode->i_size << 1)) &&
2100 endoff > ci->i_wanted_max_size) { 2099 endoff > ci->i_wanted_max_size) {
@@ -2103,7 +2102,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2103 ci->i_wanted_max_size = endoff; 2102 ci->i_wanted_max_size = endoff;
2104 check = 1; 2103 check = 1;
2105 } 2104 }
2106 spin_unlock(&inode->i_lock); 2105 spin_unlock(&ci->i_ceph_lock);
2107 if (check) 2106 if (check)
2108 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2107 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2109} 2108}
@@ -2140,9 +2139,9 @@ retry:
2140 */ 2139 */
2141void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) 2140void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2142{ 2141{
2143 spin_lock(&ci->vfs_inode.i_lock); 2142 spin_lock(&ci->i_ceph_lock);
2144 __take_cap_refs(ci, caps); 2143 __take_cap_refs(ci, caps);
2145 spin_unlock(&ci->vfs_inode.i_lock); 2144 spin_unlock(&ci->i_ceph_lock);
2146} 2145}
2147 2146
2148/* 2147/*
@@ -2160,7 +2159,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2160 int last = 0, put = 0, flushsnaps = 0, wake = 0; 2159 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2161 struct ceph_cap_snap *capsnap; 2160 struct ceph_cap_snap *capsnap;
2162 2161
2163 spin_lock(&inode->i_lock); 2162 spin_lock(&ci->i_ceph_lock);
2164 if (had & CEPH_CAP_PIN) 2163 if (had & CEPH_CAP_PIN)
2165 --ci->i_pin_ref; 2164 --ci->i_pin_ref;
2166 if (had & CEPH_CAP_FILE_RD) 2165 if (had & CEPH_CAP_FILE_RD)
@@ -2193,7 +2192,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2193 } 2192 }
2194 } 2193 }
2195 } 2194 }
2196 spin_unlock(&inode->i_lock); 2195 spin_unlock(&ci->i_ceph_lock);
2197 2196
2198 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), 2197 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2199 last ? " last" : "", put ? " put" : ""); 2198 last ? " last" : "", put ? " put" : "");
@@ -2225,7 +2224,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2225 int found = 0; 2224 int found = 0;
2226 struct ceph_cap_snap *capsnap = NULL; 2225 struct ceph_cap_snap *capsnap = NULL;
2227 2226
2228 spin_lock(&inode->i_lock); 2227 spin_lock(&ci->i_ceph_lock);
2229 ci->i_wrbuffer_ref -= nr; 2228 ci->i_wrbuffer_ref -= nr;
2230 last = !ci->i_wrbuffer_ref; 2229 last = !ci->i_wrbuffer_ref;
2231 2230
@@ -2274,7 +2273,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2274 } 2273 }
2275 } 2274 }
2276 2275
2277 spin_unlock(&inode->i_lock); 2276 spin_unlock(&ci->i_ceph_lock);
2278 2277
2279 if (last) { 2278 if (last) {
2280 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2279 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -2291,7 +2290,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2291 * Handle a cap GRANT message from the MDS. (Note that a GRANT may 2290 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2292 * actually be a revocation if it specifies a smaller cap set.) 2291 * actually be a revocation if it specifies a smaller cap set.)
2293 * 2292 *
2294 * caller holds s_mutex and i_lock, we drop both. 2293 * caller holds s_mutex and i_ceph_lock, we drop both.
2295 * 2294 *
2296 * return value: 2295 * return value:
2297 * 0 - ok 2296 * 0 - ok
@@ -2302,7 +2301,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2302 struct ceph_mds_session *session, 2301 struct ceph_mds_session *session,
2303 struct ceph_cap *cap, 2302 struct ceph_cap *cap,
2304 struct ceph_buffer *xattr_buf) 2303 struct ceph_buffer *xattr_buf)
2305 __releases(inode->i_lock) 2304 __releases(ci->i_ceph_lock)
2306{ 2305{
2307 struct ceph_inode_info *ci = ceph_inode(inode); 2306 struct ceph_inode_info *ci = ceph_inode(inode);
2308 int mds = session->s_mds; 2307 int mds = session->s_mds;
@@ -2453,7 +2452,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2453 } 2452 }
2454 BUG_ON(cap->issued & ~cap->implemented); 2453 BUG_ON(cap->issued & ~cap->implemented);
2455 2454
2456 spin_unlock(&inode->i_lock); 2455 spin_unlock(&ci->i_ceph_lock);
2457 if (writeback) 2456 if (writeback)
2458 /* 2457 /*
2459 * queue inode for writeback: we can't actually call 2458 * queue inode for writeback: we can't actually call
@@ -2483,7 +2482,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2483 struct ceph_mds_caps *m, 2482 struct ceph_mds_caps *m,
2484 struct ceph_mds_session *session, 2483 struct ceph_mds_session *session,
2485 struct ceph_cap *cap) 2484 struct ceph_cap *cap)
2486 __releases(inode->i_lock) 2485 __releases(ci->i_ceph_lock)
2487{ 2486{
2488 struct ceph_inode_info *ci = ceph_inode(inode); 2487 struct ceph_inode_info *ci = ceph_inode(inode);
2489 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -2539,7 +2538,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2539 wake_up_all(&ci->i_cap_wq); 2538 wake_up_all(&ci->i_cap_wq);
2540 2539
2541out: 2540out:
2542 spin_unlock(&inode->i_lock); 2541 spin_unlock(&ci->i_ceph_lock);
2543 if (drop) 2542 if (drop)
2544 iput(inode); 2543 iput(inode);
2545} 2544}
@@ -2562,7 +2561,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2562 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", 2561 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2563 inode, ci, session->s_mds, follows); 2562 inode, ci, session->s_mds, follows);
2564 2563
2565 spin_lock(&inode->i_lock); 2564 spin_lock(&ci->i_ceph_lock);
2566 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2565 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2567 if (capsnap->follows == follows) { 2566 if (capsnap->follows == follows) {
2568 if (capsnap->flush_tid != flush_tid) { 2567 if (capsnap->flush_tid != flush_tid) {
@@ -2585,7 +2584,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2585 capsnap, capsnap->follows); 2584 capsnap, capsnap->follows);
2586 } 2585 }
2587 } 2586 }
2588 spin_unlock(&inode->i_lock); 2587 spin_unlock(&ci->i_ceph_lock);
2589 if (drop) 2588 if (drop)
2590 iput(inode); 2589 iput(inode);
2591} 2590}
@@ -2598,7 +2597,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2598static void handle_cap_trunc(struct inode *inode, 2597static void handle_cap_trunc(struct inode *inode,
2599 struct ceph_mds_caps *trunc, 2598 struct ceph_mds_caps *trunc,
2600 struct ceph_mds_session *session) 2599 struct ceph_mds_session *session)
2601 __releases(inode->i_lock) 2600 __releases(ci->i_ceph_lock)
2602{ 2601{
2603 struct ceph_inode_info *ci = ceph_inode(inode); 2602 struct ceph_inode_info *ci = ceph_inode(inode);
2604 int mds = session->s_mds; 2603 int mds = session->s_mds;
@@ -2617,7 +2616,7 @@ static void handle_cap_trunc(struct inode *inode,
2617 inode, mds, seq, truncate_size, truncate_seq); 2616 inode, mds, seq, truncate_size, truncate_seq);
2618 queue_trunc = ceph_fill_file_size(inode, issued, 2617 queue_trunc = ceph_fill_file_size(inode, issued,
2619 truncate_seq, truncate_size, size); 2618 truncate_seq, truncate_size, size);
2620 spin_unlock(&inode->i_lock); 2619 spin_unlock(&ci->i_ceph_lock);
2621 2620
2622 if (queue_trunc) 2621 if (queue_trunc)
2623 ceph_queue_vmtruncate(inode); 2622 ceph_queue_vmtruncate(inode);
@@ -2646,7 +2645,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2646 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2645 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2647 inode, ci, mds, mseq); 2646 inode, ci, mds, mseq);
2648 2647
2649 spin_lock(&inode->i_lock); 2648 spin_lock(&ci->i_ceph_lock);
2650 2649
2651 /* make sure we haven't seen a higher mseq */ 2650 /* make sure we haven't seen a higher mseq */
2652 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2651 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
@@ -2690,7 +2689,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2690 } 2689 }
2691 /* else, we already released it */ 2690 /* else, we already released it */
2692 2691
2693 spin_unlock(&inode->i_lock); 2692 spin_unlock(&ci->i_ceph_lock);
2694} 2693}
2695 2694
2696/* 2695/*
@@ -2745,9 +2744,9 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2745 up_read(&mdsc->snap_rwsem); 2744 up_read(&mdsc->snap_rwsem);
2746 2745
2747 /* make sure we re-request max_size, if necessary */ 2746 /* make sure we re-request max_size, if necessary */
2748 spin_lock(&inode->i_lock); 2747 spin_lock(&ci->i_ceph_lock);
2749 ci->i_requested_max_size = 0; 2748 ci->i_requested_max_size = 0;
2750 spin_unlock(&inode->i_lock); 2749 spin_unlock(&ci->i_ceph_lock);
2751} 2750}
2752 2751
2753/* 2752/*
@@ -2762,6 +2761,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2762 struct ceph_mds_client *mdsc = session->s_mdsc; 2761 struct ceph_mds_client *mdsc = session->s_mdsc;
2763 struct super_block *sb = mdsc->fsc->sb; 2762 struct super_block *sb = mdsc->fsc->sb;
2764 struct inode *inode; 2763 struct inode *inode;
2764 struct ceph_inode_info *ci;
2765 struct ceph_cap *cap; 2765 struct ceph_cap *cap;
2766 struct ceph_mds_caps *h; 2766 struct ceph_mds_caps *h;
2767 int mds = session->s_mds; 2767 int mds = session->s_mds;
@@ -2815,6 +2815,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2815 2815
2816 /* lookup ino */ 2816 /* lookup ino */
2817 inode = ceph_find_inode(sb, vino); 2817 inode = ceph_find_inode(sb, vino);
2818 ci = ceph_inode(inode);
2818 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, 2819 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2819 vino.snap, inode); 2820 vino.snap, inode);
2820 if (!inode) { 2821 if (!inode) {
@@ -2844,16 +2845,16 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2844 } 2845 }
2845 2846
2846 /* the rest require a cap */ 2847 /* the rest require a cap */
2847 spin_lock(&inode->i_lock); 2848 spin_lock(&ci->i_ceph_lock);
2848 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2849 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2849 if (!cap) { 2850 if (!cap) {
2850 dout(" no cap on %p ino %llx.%llx from mds%d\n", 2851 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2851 inode, ceph_ino(inode), ceph_snap(inode), mds); 2852 inode, ceph_ino(inode), ceph_snap(inode), mds);
2852 spin_unlock(&inode->i_lock); 2853 spin_unlock(&ci->i_ceph_lock);
2853 goto flush_cap_releases; 2854 goto flush_cap_releases;
2854 } 2855 }
2855 2856
2856 /* note that each of these drops i_lock for us */ 2857 /* note that each of these drops i_ceph_lock for us */
2857 switch (op) { 2858 switch (op) {
2858 case CEPH_CAP_OP_REVOKE: 2859 case CEPH_CAP_OP_REVOKE:
2859 case CEPH_CAP_OP_GRANT: 2860 case CEPH_CAP_OP_GRANT:
@@ -2869,7 +2870,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2869 break; 2870 break;
2870 2871
2871 default: 2872 default:
2872 spin_unlock(&inode->i_lock); 2873 spin_unlock(&ci->i_ceph_lock);
2873 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, 2874 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2874 ceph_cap_op_name(op)); 2875 ceph_cap_op_name(op));
2875 } 2876 }
@@ -2962,13 +2963,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2962 struct inode *inode = &ci->vfs_inode; 2963 struct inode *inode = &ci->vfs_inode;
2963 int last = 0; 2964 int last = 0;
2964 2965
2965 spin_lock(&inode->i_lock); 2966 spin_lock(&ci->i_ceph_lock);
2966 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, 2967 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2967 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); 2968 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2968 BUG_ON(ci->i_nr_by_mode[fmode] == 0); 2969 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2969 if (--ci->i_nr_by_mode[fmode] == 0) 2970 if (--ci->i_nr_by_mode[fmode] == 0)
2970 last++; 2971 last++;
2971 spin_unlock(&inode->i_lock); 2972 spin_unlock(&ci->i_ceph_lock);
2972 2973
2973 if (last && ci->i_vino.snap == CEPH_NOSNAP) 2974 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2974 ceph_check_caps(ci, 0, NULL); 2975 ceph_check_caps(ci, 0, NULL);
@@ -2991,7 +2992,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
2991 int used, dirty; 2992 int used, dirty;
2992 int ret = 0; 2993 int ret = 0;
2993 2994
2994 spin_lock(&inode->i_lock); 2995 spin_lock(&ci->i_ceph_lock);
2995 used = __ceph_caps_used(ci); 2996 used = __ceph_caps_used(ci);
2996 dirty = __ceph_caps_dirty(ci); 2997 dirty = __ceph_caps_dirty(ci);
2997 2998
@@ -3046,7 +3047,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
3046 inode, cap, ceph_cap_string(cap->issued)); 3047 inode, cap, ceph_cap_string(cap->issued));
3047 } 3048 }
3048 } 3049 }
3049 spin_unlock(&inode->i_lock); 3050 spin_unlock(&ci->i_ceph_lock);
3050 return ret; 3051 return ret;
3051} 3052}
3052 3053
@@ -3061,7 +3062,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
3061 3062
3062 /* 3063 /*
3063 * force an record for the directory caps if we have a dentry lease. 3064 * force an record for the directory caps if we have a dentry lease.
3064 * this is racy (can't take i_lock and d_lock together), but it 3065 * this is racy (can't take i_ceph_lock and d_lock together), but it
3065 * doesn't have to be perfect; the mds will revoke anything we don't 3066 * doesn't have to be perfect; the mds will revoke anything we don't
3066 * release. 3067 * release.
3067 */ 3068 */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2abd0dfad7f8..3eeb97661262 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -281,18 +281,18 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
281 } 281 }
282 282
283 /* can we use the dcache? */ 283 /* can we use the dcache? */
284 spin_lock(&inode->i_lock); 284 spin_lock(&ci->i_ceph_lock);
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 ceph_dir_test_complete(inode) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&inode->i_lock); 290 spin_unlock(&ci->i_ceph_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
292 if (err != -EAGAIN) 292 if (err != -EAGAIN)
293 return err; 293 return err;
294 } else { 294 } else {
295 spin_unlock(&inode->i_lock); 295 spin_unlock(&ci->i_ceph_lock);
296 } 296 }
297 if (fi->dentry) { 297 if (fi->dentry) {
298 err = note_last_dentry(fi, fi->dentry->d_name.name, 298 err = note_last_dentry(fi, fi->dentry->d_name.name,
@@ -428,12 +428,12 @@ more:
428 * were released during the whole readdir, and we should have 428 * were released during the whole readdir, and we should have
429 * the complete dir contents in our cache. 429 * the complete dir contents in our cache.
430 */ 430 */
431 spin_lock(&inode->i_lock); 431 spin_lock(&ci->i_ceph_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 ceph_dir_set_complete(inode); 433 ceph_dir_set_complete(inode);
434 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
435 } 435 }
436 spin_unlock(&inode->i_lock); 436 spin_unlock(&ci->i_ceph_lock);
437 437
438 dout("readdir %p filp %p done.\n", inode, filp); 438 dout("readdir %p filp %p done.\n", inode, filp);
439 return 0; 439 return 0;
@@ -607,7 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
607 struct ceph_inode_info *ci = ceph_inode(dir); 607 struct ceph_inode_info *ci = ceph_inode(dir);
608 struct ceph_dentry_info *di = ceph_dentry(dentry); 608 struct ceph_dentry_info *di = ceph_dentry(dentry);
609 609
610 spin_lock(&dir->i_lock); 610 spin_lock(&ci->i_ceph_lock);
611 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 611 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
612 if (strncmp(dentry->d_name.name, 612 if (strncmp(dentry->d_name.name,
613 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
@@ -615,13 +615,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
615 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
616 ceph_dir_test_complete(dir) && 616 ceph_dir_test_complete(dir) &&
617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
618 spin_unlock(&dir->i_lock); 618 spin_unlock(&ci->i_ceph_lock);
619 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
620 d_add(dentry, NULL); 620 d_add(dentry, NULL);
621 di->lease_shared_gen = ci->i_shared_gen; 621 di->lease_shared_gen = ci->i_shared_gen;
622 return NULL; 622 return NULL;
623 } 623 }
624 spin_unlock(&dir->i_lock); 624 spin_unlock(&ci->i_ceph_lock);
625 } 625 }
626 626
627 op = ceph_snap(dir) == CEPH_SNAPDIR ? 627 op = ceph_snap(dir) == CEPH_SNAPDIR ?
@@ -841,12 +841,12 @@ static int drop_caps_for_unlink(struct inode *inode)
841 struct ceph_inode_info *ci = ceph_inode(inode); 841 struct ceph_inode_info *ci = ceph_inode(inode);
842 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 842 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
843 843
844 spin_lock(&inode->i_lock); 844 spin_lock(&ci->i_ceph_lock);
845 if (inode->i_nlink == 1) { 845 if (inode->i_nlink == 1) {
846 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 846 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
847 ci->i_ceph_flags |= CEPH_I_NODELAY; 847 ci->i_ceph_flags |= CEPH_I_NODELAY;
848 } 848 }
849 spin_unlock(&inode->i_lock); 849 spin_unlock(&ci->i_ceph_lock);
850 return drop; 850 return drop;
851} 851}
852 852
@@ -1015,10 +1015,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
1015 struct ceph_dentry_info *di = ceph_dentry(dentry); 1015 struct ceph_dentry_info *di = ceph_dentry(dentry);
1016 int valid = 0; 1016 int valid = 0;
1017 1017
1018 spin_lock(&dir->i_lock); 1018 spin_lock(&ci->i_ceph_lock);
1019 if (ci->i_shared_gen == di->lease_shared_gen) 1019 if (ci->i_shared_gen == di->lease_shared_gen)
1020 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 1020 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
1021 spin_unlock(&dir->i_lock); 1021 spin_unlock(&ci->i_ceph_lock);
1022 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 1022 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
1023 dir, (unsigned)ci->i_shared_gen, dentry, 1023 dir, (unsigned)ci->i_shared_gen, dentry,
1024 (unsigned)di->lease_shared_gen, valid); 1024 (unsigned)di->lease_shared_gen, valid);
@@ -1143,7 +1143,7 @@ static void ceph_d_prune(struct dentry *dentry)
1143{ 1143{
1144 struct ceph_dentry_info *di; 1144 struct ceph_dentry_info *di;
1145 1145
1146 dout("d_release %p\n", dentry); 1146 dout("ceph_d_prune %p\n", dentry);
1147 1147
1148 /* do we have a valid parent? */ 1148 /* do we have a valid parent? */
1149 if (!dentry->d_parent || IS_ROOT(dentry)) 1149 if (!dentry->d_parent || IS_ROOT(dentry))
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce549d31eeb7..ed72428d9c75 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -147,9 +147,9 @@ int ceph_open(struct inode *inode, struct file *file)
147 147
148 /* trivially open snapdir */ 148 /* trivially open snapdir */
149 if (ceph_snap(inode) == CEPH_SNAPDIR) { 149 if (ceph_snap(inode) == CEPH_SNAPDIR) {
150 spin_lock(&inode->i_lock); 150 spin_lock(&ci->i_ceph_lock);
151 __ceph_get_fmode(ci, fmode); 151 __ceph_get_fmode(ci, fmode);
152 spin_unlock(&inode->i_lock); 152 spin_unlock(&ci->i_ceph_lock);
153 return ceph_init_file(inode, file, fmode); 153 return ceph_init_file(inode, file, fmode);
154 } 154 }
155 155
@@ -158,7 +158,7 @@ int ceph_open(struct inode *inode, struct file *file)
158 * write) or any MDS (for read). Update wanted set 158 * write) or any MDS (for read). Update wanted set
159 * asynchronously. 159 * asynchronously.
160 */ 160 */
161 spin_lock(&inode->i_lock); 161 spin_lock(&ci->i_ceph_lock);
162 if (__ceph_is_any_real_caps(ci) && 162 if (__ceph_is_any_real_caps(ci) &&
163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { 163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
164 int mds_wanted = __ceph_caps_mds_wanted(ci); 164 int mds_wanted = __ceph_caps_mds_wanted(ci);
@@ -168,7 +168,7 @@ int ceph_open(struct inode *inode, struct file *file)
168 inode, fmode, ceph_cap_string(wanted), 168 inode, fmode, ceph_cap_string(wanted),
169 ceph_cap_string(issued)); 169 ceph_cap_string(issued));
170 __ceph_get_fmode(ci, fmode); 170 __ceph_get_fmode(ci, fmode);
171 spin_unlock(&inode->i_lock); 171 spin_unlock(&ci->i_ceph_lock);
172 172
173 /* adjust wanted? */ 173 /* adjust wanted? */
174 if ((issued & wanted) != wanted && 174 if ((issued & wanted) != wanted &&
@@ -180,10 +180,10 @@ int ceph_open(struct inode *inode, struct file *file)
180 } else if (ceph_snap(inode) != CEPH_NOSNAP && 180 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
181 (ci->i_snap_caps & wanted) == wanted) { 181 (ci->i_snap_caps & wanted) == wanted) {
182 __ceph_get_fmode(ci, fmode); 182 __ceph_get_fmode(ci, fmode);
183 spin_unlock(&inode->i_lock); 183 spin_unlock(&ci->i_ceph_lock);
184 return ceph_init_file(inode, file, fmode); 184 return ceph_init_file(inode, file, fmode);
185 } 185 }
186 spin_unlock(&inode->i_lock); 186 spin_unlock(&ci->i_ceph_lock);
187 187
188 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 188 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
189 req = prepare_open_request(inode->i_sb, flags, 0); 189 req = prepare_open_request(inode->i_sb, flags, 0);
@@ -743,9 +743,9 @@ retry_snap:
743 */ 743 */
744 int dirty; 744 int dirty;
745 745
746 spin_lock(&inode->i_lock); 746 spin_lock(&ci->i_ceph_lock);
747 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 747 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
748 spin_unlock(&inode->i_lock); 748 spin_unlock(&ci->i_ceph_lock);
749 ceph_put_cap_refs(ci, got); 749 ceph_put_cap_refs(ci, got);
750 750
751 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 751 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
@@ -764,9 +764,9 @@ retry_snap:
764 764
765 if (ret >= 0) { 765 if (ret >= 0) {
766 int dirty; 766 int dirty;
767 spin_lock(&inode->i_lock); 767 spin_lock(&ci->i_ceph_lock);
768 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 768 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
769 spin_unlock(&inode->i_lock); 769 spin_unlock(&ci->i_ceph_lock);
770 if (dirty) 770 if (dirty)
771 __mark_inode_dirty(inode, dirty); 771 __mark_inode_dirty(inode, dirty);
772 } 772 }
@@ -797,7 +797,8 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
797 797
798 mutex_lock(&inode->i_mutex); 798 mutex_lock(&inode->i_mutex);
799 __ceph_do_pending_vmtruncate(inode); 799 __ceph_do_pending_vmtruncate(inode);
800 if (origin != SEEK_CUR || origin != SEEK_SET) { 800
801 if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) {
801 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 802 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
802 if (ret < 0) { 803 if (ret < 0) {
803 offset = ret; 804 offset = ret;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e392bfce84a3..87fb132fb330 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
297 297
298 dout("alloc_inode %p\n", &ci->vfs_inode); 298 dout("alloc_inode %p\n", &ci->vfs_inode);
299 299
300 spin_lock_init(&ci->i_ceph_lock);
301
300 ci->i_version = 0; 302 ci->i_version = 0;
301 ci->i_time_warp_seq = 0; 303 ci->i_time_warp_seq = 0;
302 ci->i_ceph_flags = 0; 304 ci->i_ceph_flags = 0;
@@ -583,7 +585,7 @@ static int fill_inode(struct inode *inode,
583 iinfo->xattr_len); 585 iinfo->xattr_len);
584 } 586 }
585 587
586 spin_lock(&inode->i_lock); 588 spin_lock(&ci->i_ceph_lock);
587 589
588 /* 590 /*
589 * provided version will be odd if inode value is projected, 591 * provided version will be odd if inode value is projected,
@@ -680,7 +682,7 @@ static int fill_inode(struct inode *inode,
680 char *sym; 682 char *sym;
681 683
682 BUG_ON(symlen != inode->i_size); 684 BUG_ON(symlen != inode->i_size);
683 spin_unlock(&inode->i_lock); 685 spin_unlock(&ci->i_ceph_lock);
684 686
685 err = -ENOMEM; 687 err = -ENOMEM;
686 sym = kmalloc(symlen+1, GFP_NOFS); 688 sym = kmalloc(symlen+1, GFP_NOFS);
@@ -689,7 +691,7 @@ static int fill_inode(struct inode *inode,
689 memcpy(sym, iinfo->symlink, symlen); 691 memcpy(sym, iinfo->symlink, symlen);
690 sym[symlen] = 0; 692 sym[symlen] = 0;
691 693
692 spin_lock(&inode->i_lock); 694 spin_lock(&ci->i_ceph_lock);
693 if (!ci->i_symlink) 695 if (!ci->i_symlink)
694 ci->i_symlink = sym; 696 ci->i_symlink = sym;
695 else 697 else
@@ -715,7 +717,7 @@ static int fill_inode(struct inode *inode,
715 } 717 }
716 718
717no_change: 719no_change:
718 spin_unlock(&inode->i_lock); 720 spin_unlock(&ci->i_ceph_lock);
719 721
720 /* queue truncate if we saw i_size decrease */ 722 /* queue truncate if we saw i_size decrease */
721 if (queue_trunc) 723 if (queue_trunc)
@@ -750,13 +752,13 @@ no_change:
750 info->cap.flags, 752 info->cap.flags,
751 caps_reservation); 753 caps_reservation);
752 } else { 754 } else {
753 spin_lock(&inode->i_lock); 755 spin_lock(&ci->i_ceph_lock);
754 dout(" %p got snap_caps %s\n", inode, 756 dout(" %p got snap_caps %s\n", inode,
755 ceph_cap_string(le32_to_cpu(info->cap.caps))); 757 ceph_cap_string(le32_to_cpu(info->cap.caps)));
756 ci->i_snap_caps |= le32_to_cpu(info->cap.caps); 758 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
757 if (cap_fmode >= 0) 759 if (cap_fmode >= 0)
758 __ceph_get_fmode(ci, cap_fmode); 760 __ceph_get_fmode(ci, cap_fmode);
759 spin_unlock(&inode->i_lock); 761 spin_unlock(&ci->i_ceph_lock);
760 } 762 }
761 } else if (cap_fmode >= 0) { 763 } else if (cap_fmode >= 0) {
762 pr_warning("mds issued no caps on %llx.%llx\n", 764 pr_warning("mds issued no caps on %llx.%llx\n",
@@ -849,19 +851,20 @@ static void ceph_set_dentry_offset(struct dentry *dn)
849{ 851{
850 struct dentry *dir = dn->d_parent; 852 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dir->d_inode; 853 struct inode *inode = dir->d_inode;
854 struct ceph_inode_info *ci = ceph_inode(inode);
852 struct ceph_dentry_info *di; 855 struct ceph_dentry_info *di;
853 856
854 BUG_ON(!inode); 857 BUG_ON(!inode);
855 858
856 di = ceph_dentry(dn); 859 di = ceph_dentry(dn);
857 860
858 spin_lock(&inode->i_lock); 861 spin_lock(&ci->i_ceph_lock);
859 if (!ceph_dir_test_complete(inode)) { 862 if (!ceph_dir_test_complete(inode)) {
860 spin_unlock(&inode->i_lock); 863 spin_unlock(&ci->i_ceph_lock);
861 return; 864 return;
862 } 865 }
863 di->offset = ceph_inode(inode)->i_max_offset++; 866 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock); 867 spin_unlock(&ci->i_ceph_lock);
865 868
866 spin_lock(&dir->d_lock); 869 spin_lock(&dir->d_lock);
867 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); 870 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
@@ -1308,7 +1311,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1308 struct ceph_inode_info *ci = ceph_inode(inode); 1311 struct ceph_inode_info *ci = ceph_inode(inode);
1309 int ret = 0; 1312 int ret = 0;
1310 1313
1311 spin_lock(&inode->i_lock); 1314 spin_lock(&ci->i_ceph_lock);
1312 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1315 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1313 inode->i_size = size; 1316 inode->i_size = size;
1314 inode->i_blocks = (size + (1 << 9) - 1) >> 9; 1317 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
@@ -1318,7 +1321,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1318 (ci->i_reported_size << 1) < ci->i_max_size) 1321 (ci->i_reported_size << 1) < ci->i_max_size)
1319 ret = 1; 1322 ret = 1;
1320 1323
1321 spin_unlock(&inode->i_lock); 1324 spin_unlock(&ci->i_ceph_lock);
1322 return ret; 1325 return ret;
1323} 1326}
1324 1327
@@ -1328,12 +1331,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1328 */ 1331 */
1329void ceph_queue_writeback(struct inode *inode) 1332void ceph_queue_writeback(struct inode *inode)
1330{ 1333{
1334 ihold(inode);
1331 if (queue_work(ceph_inode_to_client(inode)->wb_wq, 1335 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1332 &ceph_inode(inode)->i_wb_work)) { 1336 &ceph_inode(inode)->i_wb_work)) {
1333 dout("ceph_queue_writeback %p\n", inode); 1337 dout("ceph_queue_writeback %p\n", inode);
1334 ihold(inode);
1335 } else { 1338 } else {
1336 dout("ceph_queue_writeback %p failed\n", inode); 1339 dout("ceph_queue_writeback %p failed\n", inode);
1340 iput(inode);
1337 } 1341 }
1338} 1342}
1339 1343
@@ -1353,12 +1357,13 @@ static void ceph_writeback_work(struct work_struct *work)
1353 */ 1357 */
1354void ceph_queue_invalidate(struct inode *inode) 1358void ceph_queue_invalidate(struct inode *inode)
1355{ 1359{
1360 ihold(inode);
1356 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, 1361 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1357 &ceph_inode(inode)->i_pg_inv_work)) { 1362 &ceph_inode(inode)->i_pg_inv_work)) {
1358 dout("ceph_queue_invalidate %p\n", inode); 1363 dout("ceph_queue_invalidate %p\n", inode);
1359 ihold(inode);
1360 } else { 1364 } else {
1361 dout("ceph_queue_invalidate %p failed\n", inode); 1365 dout("ceph_queue_invalidate %p failed\n", inode);
1366 iput(inode);
1362 } 1367 }
1363} 1368}
1364 1369
@@ -1374,20 +1379,20 @@ static void ceph_invalidate_work(struct work_struct *work)
1374 u32 orig_gen; 1379 u32 orig_gen;
1375 int check = 0; 1380 int check = 0;
1376 1381
1377 spin_lock(&inode->i_lock); 1382 spin_lock(&ci->i_ceph_lock);
1378 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1383 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1379 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1384 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1380 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1385 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1381 /* nevermind! */ 1386 /* nevermind! */
1382 spin_unlock(&inode->i_lock); 1387 spin_unlock(&ci->i_ceph_lock);
1383 goto out; 1388 goto out;
1384 } 1389 }
1385 orig_gen = ci->i_rdcache_gen; 1390 orig_gen = ci->i_rdcache_gen;
1386 spin_unlock(&inode->i_lock); 1391 spin_unlock(&ci->i_ceph_lock);
1387 1392
1388 truncate_inode_pages(&inode->i_data, 0); 1393 truncate_inode_pages(&inode->i_data, 0);
1389 1394
1390 spin_lock(&inode->i_lock); 1395 spin_lock(&ci->i_ceph_lock);
1391 if (orig_gen == ci->i_rdcache_gen && 1396 if (orig_gen == ci->i_rdcache_gen &&
1392 orig_gen == ci->i_rdcache_revoking) { 1397 orig_gen == ci->i_rdcache_revoking) {
1393 dout("invalidate_pages %p gen %d successful\n", inode, 1398 dout("invalidate_pages %p gen %d successful\n", inode,
@@ -1399,7 +1404,7 @@ static void ceph_invalidate_work(struct work_struct *work)
1399 inode, orig_gen, ci->i_rdcache_gen, 1404 inode, orig_gen, ci->i_rdcache_gen,
1400 ci->i_rdcache_revoking); 1405 ci->i_rdcache_revoking);
1401 } 1406 }
1402 spin_unlock(&inode->i_lock); 1407 spin_unlock(&ci->i_ceph_lock);
1403 1408
1404 if (check) 1409 if (check)
1405 ceph_check_caps(ci, 0, NULL); 1410 ceph_check_caps(ci, 0, NULL);
@@ -1434,13 +1439,14 @@ void ceph_queue_vmtruncate(struct inode *inode)
1434{ 1439{
1435 struct ceph_inode_info *ci = ceph_inode(inode); 1440 struct ceph_inode_info *ci = ceph_inode(inode);
1436 1441
1442 ihold(inode);
1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, 1443 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1438 &ci->i_vmtruncate_work)) { 1444 &ci->i_vmtruncate_work)) {
1439 dout("ceph_queue_vmtruncate %p\n", inode); 1445 dout("ceph_queue_vmtruncate %p\n", inode);
1440 ihold(inode);
1441 } else { 1446 } else {
1442 dout("ceph_queue_vmtruncate %p failed, pending=%d\n", 1447 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1443 inode, ci->i_truncate_pending); 1448 inode, ci->i_truncate_pending);
1449 iput(inode);
1444 } 1450 }
1445} 1451}
1446 1452
@@ -1457,10 +1463,10 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
1457 int wrbuffer_refs, wake = 0; 1463 int wrbuffer_refs, wake = 0;
1458 1464
1459retry: 1465retry:
1460 spin_lock(&inode->i_lock); 1466 spin_lock(&ci->i_ceph_lock);
1461 if (ci->i_truncate_pending == 0) { 1467 if (ci->i_truncate_pending == 0) {
1462 dout("__do_pending_vmtruncate %p none pending\n", inode); 1468 dout("__do_pending_vmtruncate %p none pending\n", inode);
1463 spin_unlock(&inode->i_lock); 1469 spin_unlock(&ci->i_ceph_lock);
1464 return; 1470 return;
1465 } 1471 }
1466 1472
@@ -1471,7 +1477,7 @@ retry:
1471 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 1477 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1472 dout("__do_pending_vmtruncate %p flushing snaps first\n", 1478 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1473 inode); 1479 inode);
1474 spin_unlock(&inode->i_lock); 1480 spin_unlock(&ci->i_ceph_lock);
1475 filemap_write_and_wait_range(&inode->i_data, 0, 1481 filemap_write_and_wait_range(&inode->i_data, 0,
1476 inode->i_sb->s_maxbytes); 1482 inode->i_sb->s_maxbytes);
1477 goto retry; 1483 goto retry;
@@ -1481,15 +1487,15 @@ retry:
1481 wrbuffer_refs = ci->i_wrbuffer_ref; 1487 wrbuffer_refs = ci->i_wrbuffer_ref;
1482 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, 1488 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1483 ci->i_truncate_pending, to); 1489 ci->i_truncate_pending, to);
1484 spin_unlock(&inode->i_lock); 1490 spin_unlock(&ci->i_ceph_lock);
1485 1491
1486 truncate_inode_pages(inode->i_mapping, to); 1492 truncate_inode_pages(inode->i_mapping, to);
1487 1493
1488 spin_lock(&inode->i_lock); 1494 spin_lock(&ci->i_ceph_lock);
1489 ci->i_truncate_pending--; 1495 ci->i_truncate_pending--;
1490 if (ci->i_truncate_pending == 0) 1496 if (ci->i_truncate_pending == 0)
1491 wake = 1; 1497 wake = 1;
1492 spin_unlock(&inode->i_lock); 1498 spin_unlock(&ci->i_ceph_lock);
1493 1499
1494 if (wrbuffer_refs == 0) 1500 if (wrbuffer_refs == 0)
1495 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1501 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -1544,7 +1550,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1544 if (IS_ERR(req)) 1550 if (IS_ERR(req))
1545 return PTR_ERR(req); 1551 return PTR_ERR(req);
1546 1552
1547 spin_lock(&inode->i_lock); 1553 spin_lock(&ci->i_ceph_lock);
1548 issued = __ceph_caps_issued(ci, NULL); 1554 issued = __ceph_caps_issued(ci, NULL);
1549 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); 1555 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1550 1556
@@ -1692,7 +1698,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1692 } 1698 }
1693 1699
1694 release &= issued; 1700 release &= issued;
1695 spin_unlock(&inode->i_lock); 1701 spin_unlock(&ci->i_ceph_lock);
1696 1702
1697 if (inode_dirty_flags) 1703 if (inode_dirty_flags)
1698 __mark_inode_dirty(inode, inode_dirty_flags); 1704 __mark_inode_dirty(inode, inode_dirty_flags);
@@ -1714,7 +1720,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1714 __ceph_do_pending_vmtruncate(inode); 1720 __ceph_do_pending_vmtruncate(inode);
1715 return err; 1721 return err;
1716out: 1722out:
1717 spin_unlock(&inode->i_lock); 1723 spin_unlock(&ci->i_ceph_lock);
1718 ceph_mdsc_put_request(req); 1724 ceph_mdsc_put_request(req);
1719 return err; 1725 return err;
1720} 1726}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 5a14c29cbba6..790914a598dd 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -241,11 +241,11 @@ static long ceph_ioctl_lazyio(struct file *file)
241 struct ceph_inode_info *ci = ceph_inode(inode); 241 struct ceph_inode_info *ci = ceph_inode(inode);
242 242
243 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { 243 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
244 spin_lock(&inode->i_lock); 244 spin_lock(&ci->i_ceph_lock);
245 ci->i_nr_by_mode[fi->fmode]--; 245 ci->i_nr_by_mode[fi->fmode]--;
246 fi->fmode |= CEPH_FILE_MODE_LAZY; 246 fi->fmode |= CEPH_FILE_MODE_LAZY;
247 ci->i_nr_by_mode[fi->fmode]++; 247 ci->i_nr_by_mode[fi->fmode]++;
248 spin_unlock(&inode->i_lock); 248 spin_unlock(&ci->i_ceph_lock);
249 dout("ioctl_layzio: file %p marked lazy\n", file); 249 dout("ioctl_layzio: file %p marked lazy\n", file);
250 250
251 ceph_check_caps(ci, 0, NULL); 251 ceph_check_caps(ci, 0, NULL);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 264ab701154f..6203d805eb45 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -732,21 +732,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
732 } 732 }
733 } 733 }
734 734
735 spin_lock(&inode->i_lock); 735 spin_lock(&ci->i_ceph_lock);
736 cap = NULL; 736 cap = NULL;
737 if (mode == USE_AUTH_MDS) 737 if (mode == USE_AUTH_MDS)
738 cap = ci->i_auth_cap; 738 cap = ci->i_auth_cap;
739 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) 739 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
740 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); 740 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
741 if (!cap) { 741 if (!cap) {
742 spin_unlock(&inode->i_lock); 742 spin_unlock(&ci->i_ceph_lock);
743 goto random; 743 goto random;
744 } 744 }
745 mds = cap->session->s_mds; 745 mds = cap->session->s_mds;
746 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", 746 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
747 inode, ceph_vinop(inode), mds, 747 inode, ceph_vinop(inode), mds,
748 cap == ci->i_auth_cap ? "auth " : "", cap); 748 cap == ci->i_auth_cap ? "auth " : "", cap);
749 spin_unlock(&inode->i_lock); 749 spin_unlock(&ci->i_ceph_lock);
750 return mds; 750 return mds;
751 751
752random: 752random:
@@ -951,7 +951,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
951 951
952 dout("removing cap %p, ci is %p, inode is %p\n", 952 dout("removing cap %p, ci is %p, inode is %p\n",
953 cap, ci, &ci->vfs_inode); 953 cap, ci, &ci->vfs_inode);
954 spin_lock(&inode->i_lock); 954 spin_lock(&ci->i_ceph_lock);
955 __ceph_remove_cap(cap); 955 __ceph_remove_cap(cap);
956 if (!__ceph_is_any_real_caps(ci)) { 956 if (!__ceph_is_any_real_caps(ci)) {
957 struct ceph_mds_client *mdsc = 957 struct ceph_mds_client *mdsc =
@@ -984,7 +984,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
984 } 984 }
985 spin_unlock(&mdsc->cap_dirty_lock); 985 spin_unlock(&mdsc->cap_dirty_lock);
986 } 986 }
987 spin_unlock(&inode->i_lock); 987 spin_unlock(&ci->i_ceph_lock);
988 while (drop--) 988 while (drop--)
989 iput(inode); 989 iput(inode);
990 return 0; 990 return 0;
@@ -1015,10 +1015,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1015 1015
1016 wake_up_all(&ci->i_cap_wq); 1016 wake_up_all(&ci->i_cap_wq);
1017 if (arg) { 1017 if (arg) {
1018 spin_lock(&inode->i_lock); 1018 spin_lock(&ci->i_ceph_lock);
1019 ci->i_wanted_max_size = 0; 1019 ci->i_wanted_max_size = 0;
1020 ci->i_requested_max_size = 0; 1020 ci->i_requested_max_size = 0;
1021 spin_unlock(&inode->i_lock); 1021 spin_unlock(&ci->i_ceph_lock);
1022 } 1022 }
1023 return 0; 1023 return 0;
1024} 1024}
@@ -1151,7 +1151,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1151 if (session->s_trim_caps <= 0) 1151 if (session->s_trim_caps <= 0)
1152 return -1; 1152 return -1;
1153 1153
1154 spin_lock(&inode->i_lock); 1154 spin_lock(&ci->i_ceph_lock);
1155 mine = cap->issued | cap->implemented; 1155 mine = cap->issued | cap->implemented;
1156 used = __ceph_caps_used(ci); 1156 used = __ceph_caps_used(ci);
1157 oissued = __ceph_caps_issued_other(ci, cap); 1157 oissued = __ceph_caps_issued_other(ci, cap);
@@ -1170,7 +1170,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1170 __ceph_remove_cap(cap); 1170 __ceph_remove_cap(cap);
1171 } else { 1171 } else {
1172 /* try to drop referring dentries */ 1172 /* try to drop referring dentries */
1173 spin_unlock(&inode->i_lock); 1173 spin_unlock(&ci->i_ceph_lock);
1174 d_prune_aliases(inode); 1174 d_prune_aliases(inode);
1175 dout("trim_caps_cb %p cap %p pruned, count now %d\n", 1175 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
1176 inode, cap, atomic_read(&inode->i_count)); 1176 inode, cap, atomic_read(&inode->i_count));
@@ -1178,7 +1178,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1178 } 1178 }
1179 1179
1180out: 1180out:
1181 spin_unlock(&inode->i_lock); 1181 spin_unlock(&ci->i_ceph_lock);
1182 return 0; 1182 return 0;
1183} 1183}
1184 1184
@@ -1296,7 +1296,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1296 i_flushing_item); 1296 i_flushing_item);
1297 struct inode *inode = &ci->vfs_inode; 1297 struct inode *inode = &ci->vfs_inode;
1298 1298
1299 spin_lock(&inode->i_lock); 1299 spin_lock(&ci->i_ceph_lock);
1300 if (ci->i_cap_flush_seq <= want_flush_seq) { 1300 if (ci->i_cap_flush_seq <= want_flush_seq) {
1301 dout("check_cap_flush still flushing %p " 1301 dout("check_cap_flush still flushing %p "
1302 "seq %lld <= %lld to mds%d\n", inode, 1302 "seq %lld <= %lld to mds%d\n", inode,
@@ -1304,7 +1304,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1304 session->s_mds); 1304 session->s_mds);
1305 ret = 0; 1305 ret = 0;
1306 } 1306 }
1307 spin_unlock(&inode->i_lock); 1307 spin_unlock(&ci->i_ceph_lock);
1308 } 1308 }
1309 mutex_unlock(&session->s_mutex); 1309 mutex_unlock(&session->s_mutex);
1310 ceph_put_mds_session(session); 1310 ceph_put_mds_session(session);
@@ -1495,6 +1495,7 @@ retry:
1495 pos, temp); 1495 pos, temp);
1496 } else if (stop_on_nosnap && inode && 1496 } else if (stop_on_nosnap && inode &&
1497 ceph_snap(inode) == CEPH_NOSNAP) { 1497 ceph_snap(inode) == CEPH_NOSNAP) {
1498 spin_unlock(&temp->d_lock);
1498 break; 1499 break;
1499 } else { 1500 } else {
1500 pos -= temp->d_name.len; 1501 pos -= temp->d_name.len;
@@ -2011,10 +2012,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2011 struct ceph_inode_info *ci = ceph_inode(inode); 2012 struct ceph_inode_info *ci = ceph_inode(inode);
2012 2013
2013 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); 2014 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
2014 spin_lock(&inode->i_lock); 2015 spin_lock(&ci->i_ceph_lock);
2015 ceph_dir_clear_complete(inode); 2016 ceph_dir_clear_complete(inode);
2016 ci->i_release_count++; 2017 ci->i_release_count++;
2017 spin_unlock(&inode->i_lock); 2018 spin_unlock(&ci->i_ceph_lock);
2018 2019
2019 if (req->r_dentry) 2020 if (req->r_dentry)
2020 ceph_invalidate_dentry_lease(req->r_dentry); 2021 ceph_invalidate_dentry_lease(req->r_dentry);
@@ -2422,7 +2423,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2422 if (err) 2423 if (err)
2423 goto out_free; 2424 goto out_free;
2424 2425
2425 spin_lock(&inode->i_lock); 2426 spin_lock(&ci->i_ceph_lock);
2426 cap->seq = 0; /* reset cap seq */ 2427 cap->seq = 0; /* reset cap seq */
2427 cap->issue_seq = 0; /* and issue_seq */ 2428 cap->issue_seq = 0; /* and issue_seq */
2428 2429
@@ -2445,7 +2446,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2445 rec.v1.pathbase = cpu_to_le64(pathbase); 2446 rec.v1.pathbase = cpu_to_le64(pathbase);
2446 reclen = sizeof(rec.v1); 2447 reclen = sizeof(rec.v1);
2447 } 2448 }
2448 spin_unlock(&inode->i_lock); 2449 spin_unlock(&ci->i_ceph_lock);
2449 2450
2450 if (recon_state->flock) { 2451 if (recon_state->flock) {
2451 int num_fcntl_locks, num_flock_locks; 2452 int num_fcntl_locks, num_flock_locks;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4bb239921dbd..a50ca0e39475 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -20,7 +20,7 @@
20 * 20 *
21 * mdsc->snap_rwsem 21 * mdsc->snap_rwsem
22 * 22 *
23 * inode->i_lock 23 * ci->i_ceph_lock
24 * mdsc->snap_flush_lock 24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock 25 * mdsc->cap_delay_lock
26 * 26 *
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e26437191333..a559c80f127a 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -446,7 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
446 return; 446 return;
447 } 447 }
448 448
449 spin_lock(&inode->i_lock); 449 spin_lock(&ci->i_ceph_lock);
450 used = __ceph_caps_used(ci); 450 used = __ceph_caps_used(ci);
451 dirty = __ceph_caps_dirty(ci); 451 dirty = __ceph_caps_dirty(ci);
452 452
@@ -528,7 +528,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
528 kfree(capsnap); 528 kfree(capsnap);
529 } 529 }
530 530
531 spin_unlock(&inode->i_lock); 531 spin_unlock(&ci->i_ceph_lock);
532} 532}
533 533
534/* 534/*
@@ -537,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
537 * 537 *
538 * If capsnap can now be flushed, add to snap_flush list, and return 1. 538 * If capsnap can now be flushed, add to snap_flush list, and return 1.
539 * 539 *
540 * Caller must hold i_lock. 540 * Caller must hold i_ceph_lock.
541 */ 541 */
542int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 542int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
543 struct ceph_cap_snap *capsnap) 543 struct ceph_cap_snap *capsnap)
@@ -739,9 +739,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
739 inode = &ci->vfs_inode; 739 inode = &ci->vfs_inode;
740 ihold(inode); 740 ihold(inode);
741 spin_unlock(&mdsc->snap_flush_lock); 741 spin_unlock(&mdsc->snap_flush_lock);
742 spin_lock(&inode->i_lock); 742 spin_lock(&ci->i_ceph_lock);
743 __ceph_flush_snaps(ci, &session, 0); 743 __ceph_flush_snaps(ci, &session, 0);
744 spin_unlock(&inode->i_lock); 744 spin_unlock(&ci->i_ceph_lock);
745 iput(inode); 745 iput(inode);
746 spin_lock(&mdsc->snap_flush_lock); 746 spin_lock(&mdsc->snap_flush_lock);
747 } 747 }
@@ -847,7 +847,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
847 continue; 847 continue;
848 ci = ceph_inode(inode); 848 ci = ceph_inode(inode);
849 849
850 spin_lock(&inode->i_lock); 850 spin_lock(&ci->i_ceph_lock);
851 if (!ci->i_snap_realm) 851 if (!ci->i_snap_realm)
852 goto skip_inode; 852 goto skip_inode;
853 /* 853 /*
@@ -876,7 +876,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
876 oldrealm = ci->i_snap_realm; 876 oldrealm = ci->i_snap_realm;
877 ci->i_snap_realm = realm; 877 ci->i_snap_realm = realm;
878 spin_unlock(&realm->inodes_with_caps_lock); 878 spin_unlock(&realm->inodes_with_caps_lock);
879 spin_unlock(&inode->i_lock); 879 spin_unlock(&ci->i_ceph_lock);
880 880
881 ceph_get_snap_realm(mdsc, realm); 881 ceph_get_snap_realm(mdsc, realm);
882 ceph_put_snap_realm(mdsc, oldrealm); 882 ceph_put_snap_realm(mdsc, oldrealm);
@@ -885,7 +885,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
885 continue; 885 continue;
886 886
887skip_inode: 887skip_inode:
888 spin_unlock(&inode->i_lock); 888 spin_unlock(&ci->i_ceph_lock);
889 iput(inode); 889 iput(inode);
890 } 890 }
891 891
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a90846fac759..b48f15f101a0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -383,7 +383,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
383 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 383 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
384 seq_printf(m, ",rsize=%d", fsopt->rsize); 384 seq_printf(m, ",rsize=%d", fsopt->rsize);
385 if (fsopt->rasize != CEPH_RASIZE_DEFAULT) 385 if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
386 seq_printf(m, ",rasize=%d", fsopt->rsize); 386 seq_printf(m, ",rasize=%d", fsopt->rasize);
387 if (fsopt->congestion_kb != default_congestion_kb()) 387 if (fsopt->congestion_kb != default_congestion_kb())
388 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 388 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
389 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) 389 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
@@ -638,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
638 if (err == 0) { 638 if (err == 0) {
639 dout("open_root_inode success\n"); 639 dout("open_root_inode success\n");
640 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 640 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
641 fsc->sb->s_root == NULL) 641 fsc->sb->s_root == NULL) {
642 root = d_alloc_root(req->r_target_inode); 642 root = d_alloc_root(req->r_target_inode);
643 else 643 ceph_init_dentry(root);
644 } else {
644 root = d_obtain_alias(req->r_target_inode); 645 root = d_obtain_alias(req->r_target_inode);
646 }
645 req->r_target_inode = NULL; 647 req->r_target_inode = NULL;
646 dout("open_root_inode success, root dentry is %p\n", root); 648 dout("open_root_inode success, root dentry is %p\n", root);
647 } else { 649 } else {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 01bf189e08a9..edcbf3774a56 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -220,7 +220,7 @@ struct ceph_dentry_info {
220 * The locking for D_COMPLETE is a bit odd: 220 * The locking for D_COMPLETE is a bit odd:
221 * - we can clear it at almost any time (see ceph_d_prune) 221 * - we can clear it at almost any time (see ceph_d_prune)
222 * - it is only meaningful if: 222 * - it is only meaningful if:
223 * - we hold dir inode i_lock 223 * - we hold dir inode i_ceph_lock
224 * - we hold dir FILE_SHARED caps 224 * - we hold dir FILE_SHARED caps
225 * - the dentry D_COMPLETE is set 225 * - the dentry D_COMPLETE is set
226 */ 226 */
@@ -250,6 +250,8 @@ struct ceph_inode_xattrs_info {
250struct ceph_inode_info { 250struct ceph_inode_info {
251 struct ceph_vino i_vino; /* ceph ino + snap */ 251 struct ceph_vino i_vino; /* ceph ino + snap */
252 252
253 spinlock_t i_ceph_lock;
254
253 u64 i_version; 255 u64 i_version;
254 u32 i_time_warp_seq; 256 u32 i_time_warp_seq;
255 257
@@ -271,7 +273,7 @@ struct ceph_inode_info {
271 273
272 struct ceph_inode_xattrs_info i_xattrs; 274 struct ceph_inode_xattrs_info i_xattrs;
273 275
274 /* capabilities. protected _both_ by i_lock and cap->session's 276 /* capabilities. protected _both_ by i_ceph_lock and cap->session's
275 * s_mutex. */ 277 * s_mutex. */
276 struct rb_root i_caps; /* cap list */ 278 struct rb_root i_caps; /* cap list */
277 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ 279 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
@@ -437,18 +439,18 @@ static inline void ceph_i_clear(struct inode *inode, unsigned mask)
437{ 439{
438 struct ceph_inode_info *ci = ceph_inode(inode); 440 struct ceph_inode_info *ci = ceph_inode(inode);
439 441
440 spin_lock(&inode->i_lock); 442 spin_lock(&ci->i_ceph_lock);
441 ci->i_ceph_flags &= ~mask; 443 ci->i_ceph_flags &= ~mask;
442 spin_unlock(&inode->i_lock); 444 spin_unlock(&ci->i_ceph_lock);
443} 445}
444 446
445static inline void ceph_i_set(struct inode *inode, unsigned mask) 447static inline void ceph_i_set(struct inode *inode, unsigned mask)
446{ 448{
447 struct ceph_inode_info *ci = ceph_inode(inode); 449 struct ceph_inode_info *ci = ceph_inode(inode);
448 450
449 spin_lock(&inode->i_lock); 451 spin_lock(&ci->i_ceph_lock);
450 ci->i_ceph_flags |= mask; 452 ci->i_ceph_flags |= mask;
451 spin_unlock(&inode->i_lock); 453 spin_unlock(&ci->i_ceph_lock);
452} 454}
453 455
454static inline bool ceph_i_test(struct inode *inode, unsigned mask) 456static inline bool ceph_i_test(struct inode *inode, unsigned mask)
@@ -456,9 +458,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
456 struct ceph_inode_info *ci = ceph_inode(inode); 458 struct ceph_inode_info *ci = ceph_inode(inode);
457 bool r; 459 bool r;
458 460
459 spin_lock(&inode->i_lock); 461 spin_lock(&ci->i_ceph_lock);
460 r = (ci->i_ceph_flags & mask) == mask; 462 r = (ci->i_ceph_flags & mask) == mask;
461 spin_unlock(&inode->i_lock); 463 spin_unlock(&ci->i_ceph_lock);
462 return r; 464 return r;
463} 465}
464 466
@@ -508,9 +510,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
508static inline int ceph_caps_issued(struct ceph_inode_info *ci) 510static inline int ceph_caps_issued(struct ceph_inode_info *ci)
509{ 511{
510 int issued; 512 int issued;
511 spin_lock(&ci->vfs_inode.i_lock); 513 spin_lock(&ci->i_ceph_lock);
512 issued = __ceph_caps_issued(ci, NULL); 514 issued = __ceph_caps_issued(ci, NULL);
513 spin_unlock(&ci->vfs_inode.i_lock); 515 spin_unlock(&ci->i_ceph_lock);
514 return issued; 516 return issued;
515} 517}
516 518
@@ -518,9 +520,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
518 int touch) 520 int touch)
519{ 521{
520 int r; 522 int r;
521 spin_lock(&ci->vfs_inode.i_lock); 523 spin_lock(&ci->i_ceph_lock);
522 r = __ceph_caps_issued_mask(ci, mask, touch); 524 r = __ceph_caps_issued_mask(ci, mask, touch);
523 spin_unlock(&ci->vfs_inode.i_lock); 525 spin_unlock(&ci->i_ceph_lock);
524 return r; 526 return r;
525} 527}
526 528
@@ -743,10 +745,9 @@ extern int ceph_add_cap(struct inode *inode,
743extern void __ceph_remove_cap(struct ceph_cap *cap); 745extern void __ceph_remove_cap(struct ceph_cap *cap);
744static inline void ceph_remove_cap(struct ceph_cap *cap) 746static inline void ceph_remove_cap(struct ceph_cap *cap)
745{ 747{
746 struct inode *inode = &cap->ci->vfs_inode; 748 spin_lock(&cap->ci->i_ceph_lock);
747 spin_lock(&inode->i_lock);
748 __ceph_remove_cap(cap); 749 __ceph_remove_cap(cap);
749 spin_unlock(&inode->i_lock); 750 spin_unlock(&cap->ci->i_ceph_lock);
750} 751}
751extern void ceph_put_cap(struct ceph_mds_client *mdsc, 752extern void ceph_put_cap(struct ceph_mds_client *mdsc,
752 struct ceph_cap *cap); 753 struct ceph_cap *cap);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 96c6739a0280..a5e36e4488a7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -343,8 +343,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
343} 343}
344 344
345static int __build_xattrs(struct inode *inode) 345static int __build_xattrs(struct inode *inode)
346 __releases(inode->i_lock) 346 __releases(ci->i_ceph_lock)
347 __acquires(inode->i_lock) 347 __acquires(ci->i_ceph_lock)
348{ 348{
349 u32 namelen; 349 u32 namelen;
350 u32 numattr = 0; 350 u32 numattr = 0;
@@ -372,7 +372,7 @@ start:
372 end = p + ci->i_xattrs.blob->vec.iov_len; 372 end = p + ci->i_xattrs.blob->vec.iov_len;
373 ceph_decode_32_safe(&p, end, numattr, bad); 373 ceph_decode_32_safe(&p, end, numattr, bad);
374 xattr_version = ci->i_xattrs.version; 374 xattr_version = ci->i_xattrs.version;
375 spin_unlock(&inode->i_lock); 375 spin_unlock(&ci->i_ceph_lock);
376 376
377 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 377 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
378 GFP_NOFS); 378 GFP_NOFS);
@@ -387,7 +387,7 @@ start:
387 goto bad_lock; 387 goto bad_lock;
388 } 388 }
389 389
390 spin_lock(&inode->i_lock); 390 spin_lock(&ci->i_ceph_lock);
391 if (ci->i_xattrs.version != xattr_version) { 391 if (ci->i_xattrs.version != xattr_version) {
392 /* lost a race, retry */ 392 /* lost a race, retry */
393 for (i = 0; i < numattr; i++) 393 for (i = 0; i < numattr; i++)
@@ -418,7 +418,7 @@ start:
418 418
419 return err; 419 return err;
420bad_lock: 420bad_lock:
421 spin_lock(&inode->i_lock); 421 spin_lock(&ci->i_ceph_lock);
422bad: 422bad:
423 if (xattrs) { 423 if (xattrs) {
424 for (i = 0; i < numattr; i++) 424 for (i = 0; i < numattr; i++)
@@ -512,7 +512,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
512 if (vxattrs) 512 if (vxattrs)
513 vxattr = ceph_match_vxattr(vxattrs, name); 513 vxattr = ceph_match_vxattr(vxattrs, name);
514 514
515 spin_lock(&inode->i_lock); 515 spin_lock(&ci->i_ceph_lock);
516 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 516 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
517 ci->i_xattrs.version, ci->i_xattrs.index_version); 517 ci->i_xattrs.version, ci->i_xattrs.index_version);
518 518
@@ -520,14 +520,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
520 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 520 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
521 goto get_xattr; 521 goto get_xattr;
522 } else { 522 } else {
523 spin_unlock(&inode->i_lock); 523 spin_unlock(&ci->i_ceph_lock);
524 /* get xattrs from mds (if we don't already have them) */ 524 /* get xattrs from mds (if we don't already have them) */
525 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 525 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
526 if (err) 526 if (err)
527 return err; 527 return err;
528 } 528 }
529 529
530 spin_lock(&inode->i_lock); 530 spin_lock(&ci->i_ceph_lock);
531 531
532 if (vxattr && vxattr->readonly) { 532 if (vxattr && vxattr->readonly) {
533 err = vxattr->getxattr_cb(ci, value, size); 533 err = vxattr->getxattr_cb(ci, value, size);
@@ -558,7 +558,7 @@ get_xattr:
558 memcpy(value, xattr->val, xattr->val_len); 558 memcpy(value, xattr->val, xattr->val_len);
559 559
560out: 560out:
561 spin_unlock(&inode->i_lock); 561 spin_unlock(&ci->i_ceph_lock);
562 return err; 562 return err;
563} 563}
564 564
@@ -573,7 +573,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
573 u32 len; 573 u32 len;
574 int i; 574 int i;
575 575
576 spin_lock(&inode->i_lock); 576 spin_lock(&ci->i_ceph_lock);
577 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, 577 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
578 ci->i_xattrs.version, ci->i_xattrs.index_version); 578 ci->i_xattrs.version, ci->i_xattrs.index_version);
579 579
@@ -581,13 +581,13 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
581 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 581 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
582 goto list_xattr; 582 goto list_xattr;
583 } else { 583 } else {
584 spin_unlock(&inode->i_lock); 584 spin_unlock(&ci->i_ceph_lock);
585 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 585 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
586 if (err) 586 if (err)
587 return err; 587 return err;
588 } 588 }
589 589
590 spin_lock(&inode->i_lock); 590 spin_lock(&ci->i_ceph_lock);
591 591
592 err = __build_xattrs(inode); 592 err = __build_xattrs(inode);
593 if (err < 0) 593 if (err < 0)
@@ -619,7 +619,7 @@ list_xattr:
619 } 619 }
620 620
621out: 621out:
622 spin_unlock(&inode->i_lock); 622 spin_unlock(&ci->i_ceph_lock);
623 return err; 623 return err;
624} 624}
625 625
@@ -739,7 +739,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
739 if (!xattr) 739 if (!xattr)
740 goto out; 740 goto out;
741 741
742 spin_lock(&inode->i_lock); 742 spin_lock(&ci->i_ceph_lock);
743retry: 743retry:
744 issued = __ceph_caps_issued(ci, NULL); 744 issued = __ceph_caps_issued(ci, NULL);
745 if (!(issued & CEPH_CAP_XATTR_EXCL)) 745 if (!(issued & CEPH_CAP_XATTR_EXCL))
@@ -752,12 +752,12 @@ retry:
752 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { 752 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
753 struct ceph_buffer *blob = NULL; 753 struct ceph_buffer *blob = NULL;
754 754
755 spin_unlock(&inode->i_lock); 755 spin_unlock(&ci->i_ceph_lock);
756 dout(" preaallocating new blob size=%d\n", required_blob_size); 756 dout(" preaallocating new blob size=%d\n", required_blob_size);
757 blob = ceph_buffer_new(required_blob_size, GFP_NOFS); 757 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
758 if (!blob) 758 if (!blob)
759 goto out; 759 goto out;
760 spin_lock(&inode->i_lock); 760 spin_lock(&ci->i_ceph_lock);
761 if (ci->i_xattrs.prealloc_blob) 761 if (ci->i_xattrs.prealloc_blob)
762 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 762 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
763 ci->i_xattrs.prealloc_blob = blob; 763 ci->i_xattrs.prealloc_blob = blob;
@@ -770,13 +770,13 @@ retry:
770 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); 770 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
771 ci->i_xattrs.dirty = true; 771 ci->i_xattrs.dirty = true;
772 inode->i_ctime = CURRENT_TIME; 772 inode->i_ctime = CURRENT_TIME;
773 spin_unlock(&inode->i_lock); 773 spin_unlock(&ci->i_ceph_lock);
774 if (dirty) 774 if (dirty)
775 __mark_inode_dirty(inode, dirty); 775 __mark_inode_dirty(inode, dirty);
776 return err; 776 return err;
777 777
778do_sync: 778do_sync:
779 spin_unlock(&inode->i_lock); 779 spin_unlock(&ci->i_ceph_lock);
780 err = ceph_sync_setxattr(dentry, name, value, size, flags); 780 err = ceph_sync_setxattr(dentry, name, value, size, flags);
781out: 781out:
782 kfree(newname); 782 kfree(newname);
@@ -833,7 +833,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
833 return -EOPNOTSUPP; 833 return -EOPNOTSUPP;
834 } 834 }
835 835
836 spin_lock(&inode->i_lock); 836 spin_lock(&ci->i_ceph_lock);
837 __build_xattrs(inode); 837 __build_xattrs(inode);
838 issued = __ceph_caps_issued(ci, NULL); 838 issued = __ceph_caps_issued(ci, NULL);
839 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); 839 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
@@ -846,12 +846,12 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
846 ci->i_xattrs.dirty = true; 846 ci->i_xattrs.dirty = true;
847 inode->i_ctime = CURRENT_TIME; 847 inode->i_ctime = CURRENT_TIME;
848 848
849 spin_unlock(&inode->i_lock); 849 spin_unlock(&ci->i_ceph_lock);
850 if (dirty) 850 if (dirty)
851 __mark_inode_dirty(inode, dirty); 851 __mark_inode_dirty(inode, dirty);
852 return err; 852 return err;
853do_sync: 853do_sync:
854 spin_unlock(&inode->i_lock); 854 spin_unlock(&ci->i_ceph_lock);
855 err = ceph_send_removexattr(dentry, name); 855 err = ceph_send_removexattr(dentry, name);
856 return err; 856 return err;
857} 857}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d6a972df0338..8cd4b52d4217 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -441,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
441 smb_msg.msg_controllen = 0; 441 smb_msg.msg_controllen = 0;
442 442
443 for (total_read = 0; to_read; total_read += length, to_read -= length) { 443 for (total_read = 0; to_read; total_read += length, to_read -= length) {
444 try_to_freeze();
445
444 if (server_unresponsive(server)) { 446 if (server_unresponsive(server)) {
445 total_read = -EAGAIN; 447 total_read = -EAGAIN;
446 break; 448 break;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cf0b1539b321..4dd9283885e7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -702,6 +702,13 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
702 lock->type, lock->netfid, conf_lock); 702 lock->type, lock->netfid, conf_lock);
703} 703}
704 704
705/*
706 * Check if there is another lock that prevents us to set the lock (mandatory
707 * style). If such a lock exists, update the flock structure with its
708 * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
709 * or leave it the same if we can't. Returns 0 if we don't need to request to
710 * the server or 1 otherwise.
711 */
705static int 712static int
706cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 713cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
707 __u8 type, __u16 netfid, struct file_lock *flock) 714 __u8 type, __u16 netfid, struct file_lock *flock)
@@ -739,6 +746,12 @@ cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
739 mutex_unlock(&cinode->lock_mutex); 746 mutex_unlock(&cinode->lock_mutex);
740} 747}
741 748
749/*
750 * Set the byte-range lock (mandatory style). Returns:
751 * 1) 0, if we set the lock and don't need to request to the server;
752 * 2) 1, if no locks prevent us but we need to request to the server;
753 * 3) -EACCESS, if there is a lock that prevents us and wait is false.
754 */
742static int 755static int
743cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, 756cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
744 bool wait) 757 bool wait)
@@ -778,6 +791,13 @@ try_again:
778 return rc; 791 return rc;
779} 792}
780 793
794/*
795 * Check if there is another lock that prevents us to set the lock (posix
796 * style). If such a lock exists, update the flock structure with its
797 * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
798 * or leave it the same if we can't. Returns 0 if we don't need to request to
799 * the server or 1 otherwise.
800 */
781static int 801static int
782cifs_posix_lock_test(struct file *file, struct file_lock *flock) 802cifs_posix_lock_test(struct file *file, struct file_lock *flock)
783{ 803{
@@ -800,6 +820,12 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
800 return rc; 820 return rc;
801} 821}
802 822
823/*
824 * Set the byte-range lock (posix style). Returns:
825 * 1) 0, if we set the lock and don't need to request to the server;
826 * 2) 1, if we need to request to the server;
827 * 3) <0, if the error occurs while setting the lock.
828 */
803static int 829static int
804cifs_posix_lock_set(struct file *file, struct file_lock *flock) 830cifs_posix_lock_set(struct file *file, struct file_lock *flock)
805{ 831{
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5de03ec20144..a090bbe6ee29 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
554 rc); 554 rc);
555 return rc; 555 return rc;
556 } 556 }
557 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 557 /* FindFirst/Next set last_entry to NULL on malformed reply */
558 if (cifsFile->srch_inf.last_entry)
559 cifs_save_resume_key(cifsFile->srch_inf.last_entry,
560 cifsFile);
558 } 561 }
559 562
560 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 563 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
562 cFYI(1, "calling findnext2"); 565 cFYI(1, "calling findnext2");
563 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 566 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
564 &cifsFile->srch_inf); 567 &cifsFile->srch_inf);
565 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 568 /* FindFirst/Next set last_entry to NULL on malformed reply */
569 if (cifsFile->srch_inf.last_entry)
570 cifs_save_resume_key(cifsFile->srch_inf.last_entry,
571 cifsFile);
566 if (rc) 572 if (rc)
567 return -ENOENT; 573 return -ENOENT;
568 } 574 }
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 7cacba12b8f1..80d850881938 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -209,7 +209,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
209{ 209{
210 int rc; 210 int rc;
211 int len; 211 int len;
212 __u16 wpwd[129]; 212 __le16 wpwd[129];
213 213
214 /* Password cannot be longer than 128 characters */ 214 /* Password cannot be longer than 128 characters */
215 if (passwd) /* Password must be converted to NT unicode */ 215 if (passwd) /* Password must be converted to NT unicode */
@@ -219,8 +219,8 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
219 *wpwd = 0; /* Ensure string is null terminated */ 219 *wpwd = 0; /* Ensure string is null terminated */
220 } 220 }
221 221
222 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16)); 222 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
223 memset(wpwd, 0, 129 * sizeof(__u16)); 223 memset(wpwd, 0, 129 * sizeof(__le16));
224 224
225 return rc; 225 return rc;
226} 226}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index ca418aaf6352..9d8715c45f25 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -292,7 +292,7 @@ int __init configfs_inode_init(void)
292 return bdi_init(&configfs_backing_dev_info); 292 return bdi_init(&configfs_backing_dev_info);
293} 293}
294 294
295void __exit configfs_inode_exit(void) 295void configfs_inode_exit(void)
296{ 296{
297 bdi_destroy(&configfs_backing_dev_info); 297 bdi_destroy(&configfs_backing_dev_info);
298} 298}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index ecc62178beda..276e15cafd58 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -143,28 +143,26 @@ static int __init configfs_init(void)
143 goto out; 143 goto out;
144 144
145 config_kobj = kobject_create_and_add("config", kernel_kobj); 145 config_kobj = kobject_create_and_add("config", kernel_kobj);
146 if (!config_kobj) { 146 if (!config_kobj)
147 kmem_cache_destroy(configfs_dir_cachep); 147 goto out2;
148 configfs_dir_cachep = NULL; 148
149 goto out; 149 err = configfs_inode_init();
150 } 150 if (err)
151 goto out3;
151 152
152 err = register_filesystem(&configfs_fs_type); 153 err = register_filesystem(&configfs_fs_type);
153 if (err) { 154 if (err)
154 printk(KERN_ERR "configfs: Unable to register filesystem!\n"); 155 goto out4;
155 kobject_put(config_kobj);
156 kmem_cache_destroy(configfs_dir_cachep);
157 configfs_dir_cachep = NULL;
158 goto out;
159 }
160 156
161 err = configfs_inode_init(); 157 return 0;
162 if (err) { 158out4:
163 unregister_filesystem(&configfs_fs_type); 159 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
164 kobject_put(config_kobj); 160 configfs_inode_exit();
165 kmem_cache_destroy(configfs_dir_cachep); 161out3:
166 configfs_dir_cachep = NULL; 162 kobject_put(config_kobj);
167 } 163out2:
164 kmem_cache_destroy(configfs_dir_cachep);
165 configfs_dir_cachep = NULL;
168out: 166out:
169 return err; 167 return err;
170} 168}
diff --git a/fs/dcache.c b/fs/dcache.c
index a901c6901bce..89509b5a090e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h> 37#include <linux/rculist_bl.h>
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39#include <linux/ratelimit.h>
39#include "internal.h" 40#include "internal.h"
40 41
41/* 42/*
@@ -2383,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2383 actual = __d_unalias(inode, dentry, alias); 2384 actual = __d_unalias(inode, dentry, alias);
2384 } 2385 }
2385 write_sequnlock(&rename_lock); 2386 write_sequnlock(&rename_lock);
2386 if (IS_ERR(actual)) 2387 if (IS_ERR(actual)) {
2388 if (PTR_ERR(actual) == -ELOOP)
2389 pr_warn_ratelimited(
2390 "VFS: Lookup of '%s' in %s %s"
2391 " would have caused loop\n",
2392 dentry->d_name.name,
2393 inode->i_sb->s_type->name,
2394 inode->i_sb->s_id);
2387 dput(alias); 2395 dput(alias);
2396 }
2388 goto out_nolock; 2397 goto out_nolock;
2389 } 2398 }
2390 } 2399 }
@@ -2430,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2430/** 2439/**
2431 * prepend_path - Prepend path string to a buffer 2440 * prepend_path - Prepend path string to a buffer
2432 * @path: the dentry/vfsmount to report 2441 * @path: the dentry/vfsmount to report
2433 * @root: root vfsmnt/dentry (may be modified by this function) 2442 * @root: root vfsmnt/dentry
2434 * @buffer: pointer to the end of the buffer 2443 * @buffer: pointer to the end of the buffer
2435 * @buflen: pointer to buffer length 2444 * @buflen: pointer to buffer length
2436 * 2445 *
2437 * Caller holds the rename_lock. 2446 * Caller holds the rename_lock.
2438 *
2439 * If path is not reachable from the supplied root, then the value of
2440 * root is changed (without modifying refcounts).
2441 */ 2447 */
2442static int prepend_path(const struct path *path, struct path *root, 2448static int prepend_path(const struct path *path,
2449 const struct path *root,
2443 char **buffer, int *buflen) 2450 char **buffer, int *buflen)
2444{ 2451{
2445 struct dentry *dentry = path->dentry; 2452 struct dentry *dentry = path->dentry;
@@ -2474,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root,
2474 dentry = parent; 2481 dentry = parent;
2475 } 2482 }
2476 2483
2477out:
2478 if (!error && !slash) 2484 if (!error && !slash)
2479 error = prepend(buffer, buflen, "/", 1); 2485 error = prepend(buffer, buflen, "/", 1);
2480 2486
2487out:
2481 br_read_unlock(vfsmount_lock); 2488 br_read_unlock(vfsmount_lock);
2482 return error; 2489 return error;
2483 2490
@@ -2491,15 +2498,17 @@ global_root:
2491 WARN(1, "Root dentry has weird name <%.*s>\n", 2498 WARN(1, "Root dentry has weird name <%.*s>\n",
2492 (int) dentry->d_name.len, dentry->d_name.name); 2499 (int) dentry->d_name.len, dentry->d_name.name);
2493 } 2500 }
2494 root->mnt = vfsmnt; 2501 if (!slash)
2495 root->dentry = dentry; 2502 error = prepend(buffer, buflen, "/", 1);
2503 if (!error)
2504 error = vfsmnt->mnt_ns ? 1 : 2;
2496 goto out; 2505 goto out;
2497} 2506}
2498 2507
2499/** 2508/**
2500 * __d_path - return the path of a dentry 2509 * __d_path - return the path of a dentry
2501 * @path: the dentry/vfsmount to report 2510 * @path: the dentry/vfsmount to report
2502 * @root: root vfsmnt/dentry (may be modified by this function) 2511 * @root: root vfsmnt/dentry
2503 * @buf: buffer to return value in 2512 * @buf: buffer to return value in
2504 * @buflen: buffer length 2513 * @buflen: buffer length
2505 * 2514 *
@@ -2510,10 +2519,10 @@ global_root:
2510 * 2519 *
2511 * "buflen" should be positive. 2520 * "buflen" should be positive.
2512 * 2521 *
2513 * If path is not reachable from the supplied root, then the value of 2522 * If the path is not reachable from the supplied root, return %NULL.
2514 * root is changed (without modifying refcounts).
2515 */ 2523 */
2516char *__d_path(const struct path *path, struct path *root, 2524char *__d_path(const struct path *path,
2525 const struct path *root,
2517 char *buf, int buflen) 2526 char *buf, int buflen)
2518{ 2527{
2519 char *res = buf + buflen; 2528 char *res = buf + buflen;
@@ -2524,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root,
2524 error = prepend_path(path, root, &res, &buflen); 2533 error = prepend_path(path, root, &res, &buflen);
2525 write_sequnlock(&rename_lock); 2534 write_sequnlock(&rename_lock);
2526 2535
2527 if (error) 2536 if (error < 0)
2537 return ERR_PTR(error);
2538 if (error > 0)
2539 return NULL;
2540 return res;
2541}
2542
2543char *d_absolute_path(const struct path *path,
2544 char *buf, int buflen)
2545{
2546 struct path root = {};
2547 char *res = buf + buflen;
2548 int error;
2549
2550 prepend(&res, &buflen, "\0", 1);
2551 write_seqlock(&rename_lock);
2552 error = prepend_path(path, &root, &res, &buflen);
2553 write_sequnlock(&rename_lock);
2554
2555 if (error > 1)
2556 error = -EINVAL;
2557 if (error < 0)
2528 return ERR_PTR(error); 2558 return ERR_PTR(error);
2529 return res; 2559 return res;
2530} 2560}
@@ -2532,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root,
2532/* 2562/*
2533 * same as __d_path but appends "(deleted)" for unlinked files. 2563 * same as __d_path but appends "(deleted)" for unlinked files.
2534 */ 2564 */
2535static int path_with_deleted(const struct path *path, struct path *root, 2565static int path_with_deleted(const struct path *path,
2536 char **buf, int *buflen) 2566 const struct path *root,
2567 char **buf, int *buflen)
2537{ 2568{
2538 prepend(buf, buflen, "\0", 1); 2569 prepend(buf, buflen, "\0", 1);
2539 if (d_unlinked(path->dentry)) { 2570 if (d_unlinked(path->dentry)) {
@@ -2570,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen)
2570{ 2601{
2571 char *res = buf + buflen; 2602 char *res = buf + buflen;
2572 struct path root; 2603 struct path root;
2573 struct path tmp;
2574 int error; 2604 int error;
2575 2605
2576 /* 2606 /*
@@ -2585,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen)
2585 2615
2586 get_fs_root(current->fs, &root); 2616 get_fs_root(current->fs, &root);
2587 write_seqlock(&rename_lock); 2617 write_seqlock(&rename_lock);
2588 tmp = root; 2618 error = path_with_deleted(path, &root, &res, &buflen);
2589 error = path_with_deleted(path, &tmp, &res, &buflen); 2619 if (error < 0)
2590 if (error)
2591 res = ERR_PTR(error); 2620 res = ERR_PTR(error);
2592 write_sequnlock(&rename_lock); 2621 write_sequnlock(&rename_lock);
2593 path_put(&root); 2622 path_put(&root);
@@ -2608,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2608{ 2637{
2609 char *res = buf + buflen; 2638 char *res = buf + buflen;
2610 struct path root; 2639 struct path root;
2611 struct path tmp;
2612 int error; 2640 int error;
2613 2641
2614 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2642 if (path->dentry->d_op && path->dentry->d_op->d_dname)
@@ -2616,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2616 2644
2617 get_fs_root(current->fs, &root); 2645 get_fs_root(current->fs, &root);
2618 write_seqlock(&rename_lock); 2646 write_seqlock(&rename_lock);
2619 tmp = root; 2647 error = path_with_deleted(path, &root, &res, &buflen);
2620 error = path_with_deleted(path, &tmp, &res, &buflen); 2648 if (error > 0)
2621 if (!error && !path_equal(&tmp, &root))
2622 error = prepend_unreachable(&res, &buflen); 2649 error = prepend_unreachable(&res, &buflen);
2623 write_sequnlock(&rename_lock); 2650 write_sequnlock(&rename_lock);
2624 path_put(&root); 2651 path_put(&root);
@@ -2749,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2749 write_seqlock(&rename_lock); 2776 write_seqlock(&rename_lock);
2750 if (!d_unlinked(pwd.dentry)) { 2777 if (!d_unlinked(pwd.dentry)) {
2751 unsigned long len; 2778 unsigned long len;
2752 struct path tmp = root;
2753 char *cwd = page + PAGE_SIZE; 2779 char *cwd = page + PAGE_SIZE;
2754 int buflen = PAGE_SIZE; 2780 int buflen = PAGE_SIZE;
2755 2781
2756 prepend(&cwd, &buflen, "\0", 1); 2782 prepend(&cwd, &buflen, "\0", 1);
2757 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2783 error = prepend_path(&pwd, &root, &cwd, &buflen);
2758 write_sequnlock(&rename_lock); 2784 write_sequnlock(&rename_lock);
2759 2785
2760 if (error) 2786 if (error < 0)
2761 goto out; 2787 goto out;
2762 2788
2763 /* Unreachable from current root */ 2789 /* Unreachable from current root */
2764 if (!path_equal(&tmp, &root)) { 2790 if (error > 0) {
2765 error = prepend_unreachable(&cwd, &buflen); 2791 error = prepend_unreachable(&cwd, &buflen);
2766 if (error) 2792 if (error)
2767 goto out; 2793 goto out;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 58609bde3b9f..2a834255c75d 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals(
967 967
968/** 968/**
969 * ecryptfs_new_file_context 969 * ecryptfs_new_file_context
970 * @ecryptfs_dentry: The eCryptfs dentry 970 * @ecryptfs_inode: The eCryptfs inode
971 * 971 *
972 * If the crypto context for the file has not yet been established, 972 * If the crypto context for the file has not yet been established,
973 * this is where we do that. Establishing a new crypto context 973 * this is where we do that. Establishing a new crypto context
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals(
984 * 984 *
985 * Returns zero on success; non-zero otherwise 985 * Returns zero on success; non-zero otherwise
986 */ 986 */
987int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) 987int ecryptfs_new_file_context(struct inode *ecryptfs_inode)
988{ 988{
989 struct ecryptfs_crypt_stat *crypt_stat = 989 struct ecryptfs_crypt_stat *crypt_stat =
990 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 990 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
991 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 991 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
992 &ecryptfs_superblock_to_private( 992 &ecryptfs_superblock_to_private(
993 ecryptfs_dentry->d_sb)->mount_crypt_stat; 993 ecryptfs_inode->i_sb)->mount_crypt_stat;
994 int cipher_name_len; 994 int cipher_name_len;
995 int rc = 0; 995 int rc = 0;
996 996
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1299} 1299}
1300 1300
1301static int 1301static int
1302ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, 1302ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
1303 char *virt, size_t virt_len) 1303 char *virt, size_t virt_len)
1304{ 1304{
1305 int rc; 1305 int rc;
1306 1306
1307 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1307 rc = ecryptfs_write_lower(ecryptfs_inode, virt,
1308 0, virt_len); 1308 0, virt_len);
1309 if (rc < 0) 1309 if (rc < 0)
1310 printk(KERN_ERR "%s: Error attempting to write header " 1310 printk(KERN_ERR "%s: Error attempting to write header "
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1338 1338
1339/** 1339/**
1340 * ecryptfs_write_metadata 1340 * ecryptfs_write_metadata
1341 * @ecryptfs_dentry: The eCryptfs dentry 1341 * @ecryptfs_dentry: The eCryptfs dentry, which should be negative
1342 * @ecryptfs_inode: The newly created eCryptfs inode
1342 * 1343 *
1343 * Write the file headers out. This will likely involve a userspace 1344 * Write the file headers out. This will likely involve a userspace
1344 * callout, in which the session key is encrypted with one or more 1345 * callout, in which the session key is encrypted with one or more
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1348 * 1349 *
1349 * Returns zero on success; non-zero on error 1350 * Returns zero on success; non-zero on error
1350 */ 1351 */
1351int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) 1352int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
1353 struct inode *ecryptfs_inode)
1352{ 1354{
1353 struct ecryptfs_crypt_stat *crypt_stat = 1355 struct ecryptfs_crypt_stat *crypt_stat =
1354 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 1356 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
1355 unsigned int order; 1357 unsigned int order;
1356 char *virt; 1358 char *virt;
1357 size_t virt_len; 1359 size_t virt_len;
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1391 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, 1393 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
1392 size); 1394 size);
1393 else 1395 else
1394 rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, 1396 rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
1395 virt_len); 1397 virt_len);
1396 if (rc) { 1398 if (rc) {
1397 printk(KERN_ERR "%s: Error writing metadata out to lower file; " 1399 printk(KERN_ERR "%s: Error writing metadata out to lower file; "
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
1943 1945
1944/* We could either offset on every reverse map or just pad some 0x00's 1946/* We could either offset on every reverse map or just pad some 0x00's
1945 * at the front here */ 1947 * at the front here */
1946static const unsigned char filename_rev_map[] = { 1948static const unsigned char filename_rev_map[256] = {
1947 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 1949 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
1948 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 1950 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
1949 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ 1951 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = {
1959 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 1961 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
1960 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 1962 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
1961 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ 1963 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
1962 0x3D, 0x3E, 0x3F 1964 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
1963}; 1965};
1964 1966
1965/** 1967/**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 54481a3b2c79..a9f29b12fbf2 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
584int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); 584int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
585int ecryptfs_encrypt_page(struct page *page); 585int ecryptfs_encrypt_page(struct page *page);
586int ecryptfs_decrypt_page(struct page *page); 586int ecryptfs_decrypt_page(struct page *page);
587int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 587int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
588 struct inode *ecryptfs_inode);
588int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 589int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
589int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 590int ecryptfs_new_file_context(struct inode *ecryptfs_inode);
590void ecryptfs_write_crypt_stat_flags(char *page_virt, 591void ecryptfs_write_crypt_stat_flags(char *page_virt,
591 struct ecryptfs_crypt_stat *crypt_stat, 592 struct ecryptfs_crypt_stat *crypt_stat,
592 size_t *written); 593 size_t *written);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c6ac98cf9baa..d3f95f941c47 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -139,6 +139,27 @@ out:
139 return rc; 139 return rc;
140} 140}
141 141
142static void ecryptfs_vma_close(struct vm_area_struct *vma)
143{
144 filemap_write_and_wait(vma->vm_file->f_mapping);
145}
146
147static const struct vm_operations_struct ecryptfs_file_vm_ops = {
148 .close = ecryptfs_vma_close,
149 .fault = filemap_fault,
150};
151
152static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma)
153{
154 int rc;
155
156 rc = generic_file_mmap(file, vma);
157 if (!rc)
158 vma->vm_ops = &ecryptfs_file_vm_ops;
159
160 return rc;
161}
162
142struct kmem_cache *ecryptfs_file_info_cache; 163struct kmem_cache *ecryptfs_file_info_cache;
143 164
144/** 165/**
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = {
349#ifdef CONFIG_COMPAT 370#ifdef CONFIG_COMPAT
350 .compat_ioctl = ecryptfs_compat_ioctl, 371 .compat_ioctl = ecryptfs_compat_ioctl,
351#endif 372#endif
352 .mmap = generic_file_mmap, 373 .mmap = ecryptfs_file_mmap,
353 .open = ecryptfs_open, 374 .open = ecryptfs_open,
354 .flush = ecryptfs_flush, 375 .flush = ecryptfs_flush,
355 .release = ecryptfs_release, 376 .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a36d327f1521..32f90a3ae63e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
172 * it. It will also update the eCryptfs directory inode to mimic the 172 * it. It will also update the eCryptfs directory inode to mimic the
173 * stat of the lower directory inode. 173 * stat of the lower directory inode.
174 * 174 *
175 * Returns zero on success; non-zero on error condition 175 * Returns the new eCryptfs inode on success; an ERR_PTR on error condition
176 */ 176 */
177static int 177static struct inode *
178ecryptfs_do_create(struct inode *directory_inode, 178ecryptfs_do_create(struct inode *directory_inode,
179 struct dentry *ecryptfs_dentry, int mode) 179 struct dentry *ecryptfs_dentry, int mode)
180{ 180{
181 int rc; 181 int rc;
182 struct dentry *lower_dentry; 182 struct dentry *lower_dentry;
183 struct dentry *lower_dir_dentry; 183 struct dentry *lower_dir_dentry;
184 struct inode *inode;
184 185
185 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 186 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
186 lower_dir_dentry = lock_parent(lower_dentry); 187 lower_dir_dentry = lock_parent(lower_dentry);
187 if (IS_ERR(lower_dir_dentry)) { 188 if (IS_ERR(lower_dir_dentry)) {
188 ecryptfs_printk(KERN_ERR, "Error locking directory of " 189 ecryptfs_printk(KERN_ERR, "Error locking directory of "
189 "dentry\n"); 190 "dentry\n");
190 rc = PTR_ERR(lower_dir_dentry); 191 inode = ERR_CAST(lower_dir_dentry);
191 goto out; 192 goto out;
192 } 193 }
193 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, 194 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode,
195 if (rc) { 196 if (rc) {
196 printk(KERN_ERR "%s: Failure to create dentry in lower fs; " 197 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
197 "rc = [%d]\n", __func__, rc); 198 "rc = [%d]\n", __func__, rc);
199 inode = ERR_PTR(rc);
198 goto out_lock; 200 goto out_lock;
199 } 201 }
200 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 202 inode = __ecryptfs_get_inode(lower_dentry->d_inode,
201 directory_inode->i_sb); 203 directory_inode->i_sb);
202 if (rc) { 204 if (IS_ERR(inode))
203 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
204 goto out_lock; 205 goto out_lock;
205 }
206 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); 206 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
208out_lock: 208out_lock:
209 unlock_dir(lower_dir_dentry); 209 unlock_dir(lower_dir_dentry);
210out: 210out:
211 return rc; 211 return inode;
212} 212}
213 213
214/** 214/**
@@ -219,26 +219,26 @@ out:
219 * 219 *
220 * Returns zero on success 220 * Returns zero on success
221 */ 221 */
222static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) 222static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
223 struct inode *ecryptfs_inode)
223{ 224{
224 struct ecryptfs_crypt_stat *crypt_stat = 225 struct ecryptfs_crypt_stat *crypt_stat =
225 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 226 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
226 int rc = 0; 227 int rc = 0;
227 228
228 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 229 if (S_ISDIR(ecryptfs_inode->i_mode)) {
229 ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); 230 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
230 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 231 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
231 goto out; 232 goto out;
232 } 233 }
233 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); 234 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
234 rc = ecryptfs_new_file_context(ecryptfs_dentry); 235 rc = ecryptfs_new_file_context(ecryptfs_inode);
235 if (rc) { 236 if (rc) {
236 ecryptfs_printk(KERN_ERR, "Error creating new file " 237 ecryptfs_printk(KERN_ERR, "Error creating new file "
237 "context; rc = [%d]\n", rc); 238 "context; rc = [%d]\n", rc);
238 goto out; 239 goto out;
239 } 240 }
240 rc = ecryptfs_get_lower_file(ecryptfs_dentry, 241 rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode);
241 ecryptfs_dentry->d_inode);
242 if (rc) { 242 if (rc) {
243 printk(KERN_ERR "%s: Error attempting to initialize " 243 printk(KERN_ERR "%s: Error attempting to initialize "
244 "the lower file for the dentry with name " 244 "the lower file for the dentry with name "
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
246 ecryptfs_dentry->d_name.name, rc); 246 ecryptfs_dentry->d_name.name, rc);
247 goto out; 247 goto out;
248 } 248 }
249 rc = ecryptfs_write_metadata(ecryptfs_dentry); 249 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
250 if (rc) 250 if (rc)
251 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); 251 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
252 ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); 252 ecryptfs_put_lower_file(ecryptfs_inode);
253out: 253out:
254 return rc; 254 return rc;
255} 255}
@@ -269,18 +269,28 @@ static int
269ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, 269ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
270 int mode, struct nameidata *nd) 270 int mode, struct nameidata *nd)
271{ 271{
272 struct inode *ecryptfs_inode;
272 int rc; 273 int rc;
273 274
274 /* ecryptfs_do_create() calls ecryptfs_interpose() */ 275 ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
275 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); 276 mode);
276 if (unlikely(rc)) { 277 if (unlikely(IS_ERR(ecryptfs_inode))) {
277 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 278 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
278 "lower filesystem\n"); 279 "lower filesystem\n");
280 rc = PTR_ERR(ecryptfs_inode);
279 goto out; 281 goto out;
280 } 282 }
281 /* At this point, a file exists on "disk"; we need to make sure 283 /* At this point, a file exists on "disk"; we need to make sure
282 * that this on disk file is prepared to be an ecryptfs file */ 284 * that this on disk file is prepared to be an ecryptfs file */
283 rc = ecryptfs_initialize_file(ecryptfs_dentry); 285 rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
286 if (rc) {
287 drop_nlink(ecryptfs_inode);
288 unlock_new_inode(ecryptfs_inode);
289 iput(ecryptfs_inode);
290 goto out;
291 }
292 d_instantiate(ecryptfs_dentry, ecryptfs_inode);
293 unlock_new_inode(ecryptfs_inode);
284out: 294out:
285 return rc; 295 return rc;
286} 296}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f6dba4505f1c..12ccacda44e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
565 brelse(bitmap_bh); 565 brelse(bitmap_bh);
566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" 566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
567 ", computed = %llu, %llu\n", 567 ", computed = %llu, %llu\n",
568 EXT4_B2C(sbi, ext4_free_blocks_count(es)), 568 EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
569 desc_count, bitmap_count); 569 desc_count, bitmap_count);
570 return bitmap_count; 570 return bitmap_count;
571#else 571#else
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 61fa9e1614af..607b1557d292 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1095,7 +1095,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1095 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1095 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1096 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1096 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1097 1097
1098 neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); 1098 neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
1099 ext4_mark_inode_dirty(handle, inode); 1099 ext4_mark_inode_dirty(handle, inode);
1100out: 1100out:
1101 brelse(bh); 1101 brelse(bh);
@@ -2955,7 +2955,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2955 /* Pre-conditions */ 2955 /* Pre-conditions */
2956 BUG_ON(!ext4_ext_is_uninitialized(ex)); 2956 BUG_ON(!ext4_ext_is_uninitialized(ex));
2957 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); 2957 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2958 BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2959 2958
2960 /* 2959 /*
2961 * Attempt to transfer newly initialized blocks from the currently 2960 * Attempt to transfer newly initialized blocks from the currently
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 240f6e2dc7ee..92655fd89657 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1339,8 +1339,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1339 clear_buffer_unwritten(bh); 1339 clear_buffer_unwritten(bh);
1340 } 1340 }
1341 1341
1342 /* skip page if block allocation undone */ 1342 /*
1343 if (buffer_delay(bh) || buffer_unwritten(bh)) 1343 * skip page if block allocation undone and
1344 * block is dirty
1345 */
1346 if (ext4_bh_delay_or_unwritten(NULL, bh))
1344 skip_page = 1; 1347 skip_page = 1;
1345 bh = bh->b_this_page; 1348 bh = bh->b_this_page;
1346 block_start += bh->b_size; 1349 block_start += bh->b_size;
@@ -2270,6 +2273,7 @@ retry:
2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2273 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2271 "%ld pages, ino %lu; err %d", __func__, 2274 "%ld pages, ino %lu; err %d", __func__,
2272 wbc->nr_to_write, inode->i_ino, ret); 2275 wbc->nr_to_write, inode->i_ino, ret);
2276 blk_finish_plug(&plug);
2273 goto out_writepages; 2277 goto out_writepages;
2274 } 2278 }
2275 2279
@@ -2386,7 +2390,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2386 pgoff_t index; 2390 pgoff_t index;
2387 struct inode *inode = mapping->host; 2391 struct inode *inode = mapping->host;
2388 handle_t *handle; 2392 handle_t *handle;
2389 loff_t page_len;
2390 2393
2391 index = pos >> PAGE_CACHE_SHIFT; 2394 index = pos >> PAGE_CACHE_SHIFT;
2392 2395
@@ -2433,13 +2436,6 @@ retry:
2433 */ 2436 */
2434 if (pos + len > inode->i_size) 2437 if (pos + len > inode->i_size)
2435 ext4_truncate_failed_write(inode); 2438 ext4_truncate_failed_write(inode);
2436 } else {
2437 page_len = pos & (PAGE_CACHE_SIZE - 1);
2438 if (page_len > 0) {
2439 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2440 inode, page, pos - page_len, page_len,
2441 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2442 }
2443 } 2439 }
2444 2440
2445 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2441 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2482,7 +2478,6 @@ static int ext4_da_write_end(struct file *file,
2482 loff_t new_i_size; 2478 loff_t new_i_size;
2483 unsigned long start, end; 2479 unsigned long start, end;
2484 int write_mode = (int)(unsigned long)fsdata; 2480 int write_mode = (int)(unsigned long)fsdata;
2485 loff_t page_len;
2486 2481
2487 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2482 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2488 if (ext4_should_order_data(inode)) { 2483 if (ext4_should_order_data(inode)) {
@@ -2507,7 +2502,7 @@ static int ext4_da_write_end(struct file *file,
2507 */ 2502 */
2508 2503
2509 new_i_size = pos + copied; 2504 new_i_size = pos + copied;
2510 if (new_i_size > EXT4_I(inode)->i_disksize) { 2505 if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
2511 if (ext4_da_should_update_i_disksize(page, end)) { 2506 if (ext4_da_should_update_i_disksize(page, end)) {
2512 down_write(&EXT4_I(inode)->i_data_sem); 2507 down_write(&EXT4_I(inode)->i_data_sem);
2513 if (new_i_size > EXT4_I(inode)->i_disksize) { 2508 if (new_i_size > EXT4_I(inode)->i_disksize) {
@@ -2531,16 +2526,6 @@ static int ext4_da_write_end(struct file *file,
2531 } 2526 }
2532 ret2 = generic_write_end(file, mapping, pos, len, copied, 2527 ret2 = generic_write_end(file, mapping, pos, len, copied,
2533 page, fsdata); 2528 page, fsdata);
2534
2535 page_len = PAGE_CACHE_SIZE -
2536 ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));
2537
2538 if (page_len > 0) {
2539 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2540 inode, page, pos + copied - 1, page_len,
2541 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2542 }
2543
2544 copied = ret2; 2529 copied = ret2;
2545 if (ret2 < 0) 2530 if (ret2 < 0)
2546 ret = ret2; 2531 ret = ret2;
@@ -2780,10 +2765,11 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2780 iocb->private, io_end->inode->i_ino, iocb, offset, 2765 iocb->private, io_end->inode->i_ino, iocb, offset,
2781 size); 2766 size);
2782 2767
2768 iocb->private = NULL;
2769
2783 /* if not aio dio with unwritten extents, just free io and return */ 2770 /* if not aio dio with unwritten extents, just free io and return */
2784 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 2771 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2785 ext4_free_io_end(io_end); 2772 ext4_free_io_end(io_end);
2786 iocb->private = NULL;
2787out: 2773out:
2788 if (is_async) 2774 if (is_async)
2789 aio_complete(iocb, ret, 0); 2775 aio_complete(iocb, ret, 0);
@@ -2807,7 +2793,6 @@ out:
2807 2793
2808 /* queue the work to convert unwritten extents to written */ 2794 /* queue the work to convert unwritten extents to written */
2809 queue_work(wq, &io_end->work); 2795 queue_work(wq, &io_end->work);
2810 iocb->private = NULL;
2811 2796
2812 /* XXX: probably should move into the real I/O completion handler */ 2797 /* XXX: probably should move into the real I/O completion handler */
2813 inode_dio_done(inode); 2798 inode_dio_done(inode);
@@ -3202,26 +3187,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3202 3187
3203 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3188 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3204 3189
3205 if (!page_has_buffers(page)) { 3190 if (!page_has_buffers(page))
3206 /* 3191 create_empty_buffers(page, blocksize, 0);
3207 * If the range to be discarded covers a partial block
3208 * we need to get the page buffers. This is because
3209 * partial blocks cannot be released and the page needs
3210 * to be updated with the contents of the block before
3211 * we write the zeros on top of it.
3212 */
3213 if ((from & (blocksize - 1)) ||
3214 ((from + length) & (blocksize - 1))) {
3215 create_empty_buffers(page, blocksize, 0);
3216 } else {
3217 /*
3218 * If there are no partial blocks,
3219 * there is nothing to update,
3220 * so we can return now
3221 */
3222 return 0;
3223 }
3224 }
3225 3192
3226 /* Find the buffer that contains "offset" */ 3193 /* Find the buffer that contains "offset" */
3227 bh = page_buffers(page); 3194 bh = page_buffers(page);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7ce1d0b19c94..7e106c810c62 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -385,6 +385,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
385 385
386 block_end = block_start + blocksize; 386 block_end = block_start + blocksize;
387 if (block_start >= len) { 387 if (block_start >= len) {
388 /*
389 * Comments copied from block_write_full_page_endio:
390 *
391 * The page straddles i_size. It must be zeroed out on
392 * each and every writepage invocation because it may
393 * be mmapped. "A file is mapped in multiples of the
394 * page size. For a file that is not a multiple of
395 * the page size, the remaining memory is zeroed when
396 * mapped, and writes to that region are not written
397 * out to the file."
398 */
399 zero_user_segment(page, block_start, block_end);
388 clear_buffer_dirty(bh); 400 clear_buffer_dirty(bh);
389 set_buffer_uptodate(bh); 401 set_buffer_uptodate(bh);
390 continue; 402 continue;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80145ad..3e1329e2f826 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1155,9 +1155,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1155 seq_puts(seq, ",block_validity"); 1155 seq_puts(seq, ",block_validity");
1156 1156
1157 if (!test_opt(sb, INIT_INODE_TABLE)) 1157 if (!test_opt(sb, INIT_INODE_TABLE))
1158 seq_puts(seq, ",noinit_inode_table"); 1158 seq_puts(seq, ",noinit_itable");
1159 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) 1159 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1160 seq_printf(seq, ",init_inode_table=%u", 1160 seq_printf(seq, ",init_itable=%u",
1161 (unsigned) sbi->s_li_wait_mult); 1161 (unsigned) sbi->s_li_wait_mult);
1162 1162
1163 ext4_show_quota_options(seq, sb); 1163 ext4_show_quota_options(seq, sb);
@@ -1333,8 +1333,7 @@ enum {
1333 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1333 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1334 Opt_inode_readahead_blks, Opt_journal_ioprio, 1334 Opt_inode_readahead_blks, Opt_journal_ioprio,
1335 Opt_dioread_nolock, Opt_dioread_lock, 1335 Opt_dioread_nolock, Opt_dioread_lock,
1336 Opt_discard, Opt_nodiscard, 1336 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1337 Opt_init_inode_table, Opt_noinit_inode_table,
1338}; 1337};
1339 1338
1340static const match_table_t tokens = { 1339static const match_table_t tokens = {
@@ -1407,9 +1406,9 @@ static const match_table_t tokens = {
1407 {Opt_dioread_lock, "dioread_lock"}, 1406 {Opt_dioread_lock, "dioread_lock"},
1408 {Opt_discard, "discard"}, 1407 {Opt_discard, "discard"},
1409 {Opt_nodiscard, "nodiscard"}, 1408 {Opt_nodiscard, "nodiscard"},
1410 {Opt_init_inode_table, "init_itable=%u"}, 1409 {Opt_init_itable, "init_itable=%u"},
1411 {Opt_init_inode_table, "init_itable"}, 1410 {Opt_init_itable, "init_itable"},
1412 {Opt_noinit_inode_table, "noinit_itable"}, 1411 {Opt_noinit_itable, "noinit_itable"},
1413 {Opt_err, NULL}, 1412 {Opt_err, NULL},
1414}; 1413};
1415 1414
@@ -1683,7 +1682,9 @@ static int parse_options(char *options, struct super_block *sb,
1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1682 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1684 datacheck: 1683 datacheck:
1685 if (is_remount) { 1684 if (is_remount) {
1686 if (test_opt(sb, DATA_FLAGS) != data_opt) { 1685 if (!sbi->s_journal)
1686 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1687 else if (test_opt(sb, DATA_FLAGS) != data_opt) {
1687 ext4_msg(sb, KERN_ERR, 1688 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount"); 1689 "Cannot change data mode on remount");
1689 return 0; 1690 return 0;
@@ -1890,7 +1891,7 @@ set_qf_format:
1890 case Opt_dioread_lock: 1891 case Opt_dioread_lock:
1891 clear_opt(sb, DIOREAD_NOLOCK); 1892 clear_opt(sb, DIOREAD_NOLOCK);
1892 break; 1893 break;
1893 case Opt_init_inode_table: 1894 case Opt_init_itable:
1894 set_opt(sb, INIT_INODE_TABLE); 1895 set_opt(sb, INIT_INODE_TABLE);
1895 if (args[0].from) { 1896 if (args[0].from) {
1896 if (match_int(&args[0], &option)) 1897 if (match_int(&args[0], &option))
@@ -1901,7 +1902,7 @@ set_qf_format:
1901 return 0; 1902 return 0;
1902 sbi->s_li_wait_mult = option; 1903 sbi->s_li_wait_mult = option;
1903 break; 1904 break;
1904 case Opt_noinit_inode_table: 1905 case Opt_noinit_itable:
1905 clear_opt(sb, INIT_INODE_TABLE); 1906 clear_opt(sb, INIT_INODE_TABLE);
1906 break; 1907 break;
1907 default: 1908 default:
@@ -3099,8 +3100,6 @@ static void ext4_destroy_lazyinit_thread(void)
3099} 3100}
3100 3101
3101static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3102static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3102 __releases(kernel_lock)
3103 __acquires(kernel_lock)
3104{ 3103{
3105 char *orig_data = kstrdup(data, GFP_KERNEL); 3104 char *orig_data = kstrdup(data, GFP_KERNEL);
3106 struct buffer_head *bh; 3105 struct buffer_head *bh;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 73c3992b2bb4..ac86f8b3e3cb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -156,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
156 * bdi_start_writeback - start writeback 156 * bdi_start_writeback - start writeback
157 * @bdi: the backing device to write from 157 * @bdi: the backing device to write from
158 * @nr_pages: the number of pages to write 158 * @nr_pages: the number of pages to write
159 * @reason: reason why some writeback work was initiated
159 * 160 *
160 * Description: 161 * Description:
161 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 162 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
@@ -1223,6 +1224,7 @@ static void wait_sb_inodes(struct super_block *sb)
1223 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 1224 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1224 * @sb: the superblock 1225 * @sb: the superblock
1225 * @nr: the number of pages to write 1226 * @nr: the number of pages to write
1227 * @reason: reason why some writeback work initiated
1226 * 1228 *
1227 * Start writeback on some inodes on this super_block. No guarantees are made 1229 * Start writeback on some inodes on this super_block. No guarantees are made
1228 * on how many (if any) will be written, and this function does not wait 1230 * on how many (if any) will be written, and this function does not wait
@@ -1251,6 +1253,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1251/** 1253/**
1252 * writeback_inodes_sb - writeback dirty inodes from given super_block 1254 * writeback_inodes_sb - writeback dirty inodes from given super_block
1253 * @sb: the superblock 1255 * @sb: the superblock
1256 * @reason: reason why some writeback work was initiated
1254 * 1257 *
1255 * Start writeback on some inodes on this super_block. No guarantees are made 1258 * Start writeback on some inodes on this super_block. No guarantees are made
1256 * on how many (if any) will be written, and this function does not wait 1259 * on how many (if any) will be written, and this function does not wait
@@ -1265,6 +1268,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1265/** 1268/**
1266 * writeback_inodes_sb_if_idle - start writeback if none underway 1269 * writeback_inodes_sb_if_idle - start writeback if none underway
1267 * @sb: the superblock 1270 * @sb: the superblock
1271 * @reason: reason why some writeback work was initiated
1268 * 1272 *
1269 * Invoke writeback_inodes_sb if no writeback is currently underway. 1273 * Invoke writeback_inodes_sb if no writeback is currently underway.
1270 * Returns 1 if writeback was started, 0 if not. 1274 * Returns 1 if writeback was started, 0 if not.
@@ -1285,6 +1289,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1285 * writeback_inodes_sb_if_idle - start writeback if none underway 1289 * writeback_inodes_sb_if_idle - start writeback if none underway
1286 * @sb: the superblock 1290 * @sb: the superblock
1287 * @nr: the number of pages to write 1291 * @nr: the number of pages to write
1292 * @reason: reason why some writeback work was initiated
1288 * 1293 *
1289 * Invoke writeback_inodes_sb if no writeback is currently underway. 1294 * Invoke writeback_inodes_sb if no writeback is currently underway.
1290 * Returns 1 if writeback was started, 0 if not. 1295 * Returns 1 if writeback was started, 0 if not.
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 5cb8614508c3..2aaf3eaaf13d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1512,7 +1512,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1512 else if (outarg->offset + num > file_size) 1512 else if (outarg->offset + num > file_size)
1513 num = file_size - outarg->offset; 1513 num = file_size - outarg->offset;
1514 1514
1515 while (num) { 1515 while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
1516 struct page *page; 1516 struct page *page;
1517 unsigned int this_num; 1517 unsigned int this_num;
1518 1518
@@ -1526,6 +1526,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1526 1526
1527 num -= this_num; 1527 num -= this_num;
1528 total_len += this_num; 1528 total_len += this_num;
1529 index++;
1529 } 1530 }
1530 req->misc.retrieve_in.offset = outarg->offset; 1531 req->misc.retrieve_in.offset = outarg->offset;
1531 req->misc.retrieve_in.size = total_len; 1532 req->misc.retrieve_in.size = total_len;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 594f07a81c28..0c84100acd44 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1556,7 +1556,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1556 struct inode *inode = file->f_path.dentry->d_inode; 1556 struct inode *inode = file->f_path.dentry->d_inode;
1557 1557
1558 mutex_lock(&inode->i_mutex); 1558 mutex_lock(&inode->i_mutex);
1559 if (origin != SEEK_CUR || origin != SEEK_SET) { 1559 if (origin != SEEK_CUR && origin != SEEK_SET) {
1560 retval = fuse_update_attributes(inode, NULL, file, NULL); 1560 retval = fuse_update_attributes(inode, NULL, file, NULL);
1561 if (retval) 1561 if (retval)
1562 goto exit; 1562 goto exit;
@@ -1567,6 +1567,10 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1567 offset += i_size_read(inode); 1567 offset += i_size_read(inode);
1568 break; 1568 break;
1569 case SEEK_CUR: 1569 case SEEK_CUR:
1570 if (offset == 0) {
1571 retval = file->f_pos;
1572 goto exit;
1573 }
1570 offset += file->f_pos; 1574 offset += file->f_pos;
1571 break; 1575 break;
1572 case SEEK_DATA: 1576 case SEEK_DATA:
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3e6d72756479..aa83109b9431 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1138,28 +1138,28 @@ static int __init fuse_fs_init(void)
1138{ 1138{
1139 int err; 1139 int err;
1140 1140
1141 err = register_filesystem(&fuse_fs_type);
1142 if (err)
1143 goto out;
1144
1145 err = register_fuseblk();
1146 if (err)
1147 goto out_unreg;
1148
1149 fuse_inode_cachep = kmem_cache_create("fuse_inode", 1141 fuse_inode_cachep = kmem_cache_create("fuse_inode",
1150 sizeof(struct fuse_inode), 1142 sizeof(struct fuse_inode),
1151 0, SLAB_HWCACHE_ALIGN, 1143 0, SLAB_HWCACHE_ALIGN,
1152 fuse_inode_init_once); 1144 fuse_inode_init_once);
1153 err = -ENOMEM; 1145 err = -ENOMEM;
1154 if (!fuse_inode_cachep) 1146 if (!fuse_inode_cachep)
1155 goto out_unreg2; 1147 goto out;
1148
1149 err = register_fuseblk();
1150 if (err)
1151 goto out2;
1152
1153 err = register_filesystem(&fuse_fs_type);
1154 if (err)
1155 goto out3;
1156 1156
1157 return 0; 1157 return 0;
1158 1158
1159 out_unreg2: 1159 out3:
1160 unregister_fuseblk(); 1160 unregister_fuseblk();
1161 out_unreg: 1161 out2:
1162 unregister_filesystem(&fuse_fs_type); 1162 kmem_cache_destroy(fuse_inode_cachep);
1163 out: 1163 out:
1164 return err; 1164 return err;
1165} 1165}
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9bd..ef175cb8cfd8 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
16#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
20
21static DEFINE_SPINLOCK(bitmap_lock); 19static DEFINE_SPINLOCK(bitmap_lock);
22 20
23static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) 21/*
22 * bitmap consists of blocks filled with 16bit words
23 * bit set == busy, bit clear == free
24 * endianness is a mess, but for counting zero bits it really doesn't matter...
25 */
26static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
24{ 27{
25 unsigned i, j, sum = 0; 28 __u32 sum = 0;
26 struct buffer_head *bh; 29 unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
27
28 for (i=0; i<numblocks-1; i++) {
29 if (!(bh=map[i]))
30 return(0);
31 for (j=0; j<bh->b_size; j++)
32 sum += nibblemap[bh->b_data[j] & 0xf]
33 + nibblemap[(bh->b_data[j]>>4) & 0xf];
34 }
35 30
36 if (numblocks==0 || !(bh=map[numblocks-1])) 31 while (blocks--) {
37 return(0); 32 unsigned words = blocksize / 2;
38 i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; 33 __u16 *p = (__u16 *)(*map++)->b_data;
39 for (j=0; j<i; j++) { 34 while (words--)
40 sum += nibblemap[bh->b_data[j] & 0xf] 35 sum += 16 - hweight16(*p++);
41 + nibblemap[(bh->b_data[j]>>4) & 0xf];
42 } 36 }
43 37
44 i = numbits%16; 38 return sum;
45 if (i!=0) {
46 i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
47 sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
48 sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
49 }
50 return(sum);
51} 39}
52 40
53void minix_free_block(struct inode *inode, unsigned long block) 41void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
105 return 0; 93 return 0;
106} 94}
107 95
108unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) 96unsigned long minix_count_free_blocks(struct super_block *sb)
109{ 97{
110 return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, 98 struct minix_sb_info *sbi = minix_sb(sb);
111 sbi->s_nzones - sbi->s_firstdatazone + 1) 99 u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
100
101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
112 << sbi->s_log_zone_size); 102 << sbi->s_log_zone_size);
113} 103}
114 104
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
273 return inode; 263 return inode;
274} 264}
275 265
276unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) 266unsigned long minix_count_free_inodes(struct super_block *sb)
277{ 267{
278 return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); 268 struct minix_sb_info *sbi = minix_sb(sb);
269 u32 bits = sbi->s_ninodes + 1;
270
271 return count_free(sbi->s_imap, sb->s_blocksize, bits);
279} 272}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 64cdcd662ffc..1d9e33966db0 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
279 else if (sbi->s_mount_state & MINIX_ERROR_FS) 279 else if (sbi->s_mount_state & MINIX_ERROR_FS)
280 printk("MINIX-fs: mounting file system with errors, " 280 printk("MINIX-fs: mounting file system with errors, "
281 "running fsck is recommended\n"); 281 "running fsck is recommended\n");
282
283 /* Apparently minix can create filesystems that allocate more blocks for
284 * the bitmaps than needed. We simply ignore that, but verify it didn't
285 * create one with not enough blocks and bail out if so.
286 */
287 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
288 if (sbi->s_imap_blocks < block) {
289 printk("MINIX-fs: file system does not have enough "
290 "imap blocks allocated. Refusing to mount\n");
291 goto out_iput;
292 }
293
294 block = minix_blocks_needed(
295 (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
296 s->s_blocksize);
297 if (sbi->s_zmap_blocks < block) {
298 printk("MINIX-fs: file system does not have enough "
299 "zmap blocks allocated. Refusing to mount.\n");
300 goto out_iput;
301 }
302
282 return 0; 303 return 0;
283 304
284out_iput: 305out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
339 buf->f_type = sb->s_magic; 360 buf->f_type = sb->s_magic;
340 buf->f_bsize = sb->s_blocksize; 361 buf->f_bsize = sb->s_blocksize;
341 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 362 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
342 buf->f_bfree = minix_count_free_blocks(sbi); 363 buf->f_bfree = minix_count_free_blocks(sb);
343 buf->f_bavail = buf->f_bfree; 364 buf->f_bavail = buf->f_bfree;
344 buf->f_files = sbi->s_ninodes; 365 buf->f_files = sbi->s_ninodes;
345 buf->f_ffree = minix_count_free_inodes(sbi); 366 buf->f_ffree = minix_count_free_inodes(sb);
346 buf->f_namelen = sbi->s_namelen; 367 buf->f_namelen = sbi->s_namelen;
347 buf->f_fsid.val[0] = (u32)id; 368 buf->f_fsid.val[0] = (u32)id;
348 buf->f_fsid.val[1] = (u32)(id >> 32); 369 buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879a..26bbd55e82ea 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode *, int, int *); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct super_block *sb);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
53extern void minix_free_block(struct inode *inode, unsigned long block); 53extern void minix_free_block(struct inode *inode, unsigned long block);
54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); 54extern unsigned long minix_count_free_blocks(struct super_block *sb);
55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); 55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); 56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
57 57
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
92{
93 return DIV_ROUND_UP(bits, blocksize * 8);
94}
95
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ 96#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) 97 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93 98
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
125 if (!size) 130 if (!size)
126 return 0; 131 return 0;
127 132
128 size = (size >> 4) + ((size & 15) > 0); 133 size >>= 4;
129 while (*p++ == 0xffff) { 134 while (*p++ == 0xffff) {
130 if (--size == 0) 135 if (--size == 0)
131 return (p - addr) << 4; 136 return (p - addr) << 4;
diff --git a/fs/namespace.c b/fs/namespace.c
index e5e1c7d1839b..cfc6d4448aa5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1048 if (err) 1048 if (err)
1049 goto out; 1049 goto out;
1050 seq_putc(m, ' '); 1050 seq_putc(m, ' ');
1051 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1051
1052 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1052 /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
1053 /* 1053 err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
1054 * Mountpoint is outside root, discard that one. Ugly, 1054 if (err)
1055 * but less so than trying to do that in iterator in a 1055 goto out;
1056 * race-free way (due to renames). 1056
1057 */
1058 return SEQ_SKIP;
1059 }
1060 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 1057 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
1061 show_mnt_opts(m, mnt); 1058 show_mnt_opts(m, mnt);
1062 1059
@@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2483 __mnt_make_longterm(mnt); 2480 __mnt_make_longterm(mnt);
2484 new_ns->root = mnt; 2481 new_ns->root = mnt;
2485 list_add(&new_ns->list, &new_ns->root->mnt_list); 2482 list_add(&new_ns->list, &new_ns->root->mnt_list);
2483 } else {
2484 mntput(mnt);
2486 } 2485 }
2487 return new_ns; 2486 return new_ns;
2488} 2487}
2489EXPORT_SYMBOL(create_mnt_ns); 2488EXPORT_SYMBOL(create_mnt_ns);
2490 2489
2490struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2491{
2492 struct mnt_namespace *ns;
2493 struct super_block *s;
2494 struct path path;
2495 int err;
2496
2497 ns = create_mnt_ns(mnt);
2498 if (IS_ERR(ns))
2499 return ERR_CAST(ns);
2500
2501 err = vfs_path_lookup(mnt->mnt_root, mnt,
2502 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2503
2504 put_mnt_ns(ns);
2505
2506 if (err)
2507 return ERR_PTR(err);
2508
2509 /* trade a vfsmount reference for active sb one */
2510 s = path.mnt->mnt_sb;
2511 atomic_inc(&s->s_active);
2512 mntput(path.mnt);
2513 /* lock the sucker */
2514 down_write(&s->s_umount);
2515 /* ... and return the root of (sub)tree on it */
2516 return path.dentry;
2517}
2518EXPORT_SYMBOL(mount_subtree);
2519
2491SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2520SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2492 char __user *, type, unsigned long, flags, void __user *, data) 2521 char __user *, type, unsigned long, flags, void __user *, data)
2493{ 2522{
@@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt)
2744 } 2773 }
2745} 2774}
2746EXPORT_SYMBOL(kern_unmount); 2775EXPORT_SYMBOL(kern_unmount);
2776
2777bool our_mnt(struct vfsmount *mnt)
2778{
2779 return check_mnt(mnt);
2780}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 5b5fa33b6b9d..cbd1a61c110a 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -548,7 +548,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
548 548
549 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); 549 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
550 if (error) 550 if (error)
551 goto out_bdi; 551 goto out_fput;
552 552
553 server->ncp_filp = ncp_filp; 553 server->ncp_filp = ncp_filp;
554 server->ncp_sock = sock; 554 server->ncp_sock = sock;
@@ -559,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
559 error = -EBADF; 559 error = -EBADF;
560 server->info_filp = fget(data.info_fd); 560 server->info_filp = fget(data.info_fd);
561 if (!server->info_filp) 561 if (!server->info_filp)
562 goto out_fput; 562 goto out_bdi;
563 error = -ENOTSOCK; 563 error = -ENOTSOCK;
564 sock_inode = server->info_filp->f_path.dentry->d_inode; 564 sock_inode = server->info_filp->f_path.dentry->d_inode;
565 if (!S_ISSOCK(sock_inode->i_mode)) 565 if (!S_ISSOCK(sock_inode->i_mode))
@@ -746,9 +746,9 @@ out_nls:
746out_fput2: 746out_fput2:
747 if (server->info_filp) 747 if (server->info_filp)
748 fput(server->info_filp); 748 fput(server->info_filp);
749out_fput:
750 bdi_destroy(&server->bdi);
751out_bdi: 749out_bdi:
750 bdi_destroy(&server->bdi);
751out_fput:
752 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 752 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
753 * 753 *
754 * The previously used put_filp(ncp_filp); was bogus, since 754 * The previously used put_filp(ncp_filp); was bogus, since
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b238d95ac48c..ac2899098147 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1468 res = NULL; 1468 res = NULL;
1469 goto out; 1469 goto out;
1470 /* This turned out not to be a regular file */ 1470 /* This turned out not to be a regular file */
1471 case -EISDIR:
1471 case -ENOTDIR: 1472 case -ENOTDIR:
1472 goto no_open; 1473 goto no_open;
1473 case -ELOOP: 1474 case -ELOOP:
1474 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1475 if (!(nd->intent.open.flags & O_NOFOLLOW))
1475 goto no_open; 1476 goto no_open;
1476 /* case -EISDIR: */
1477 /* case -EINVAL: */ 1477 /* case -EINVAL: */
1478 default: 1478 default:
1479 res = ERR_CAST(inode); 1479 res = ERR_CAST(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0a1f8312b4dc..eca56d4b39c0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -40,48 +40,8 @@
40 40
41#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
42 42
43static int nfs_file_open(struct inode *, struct file *);
44static int nfs_file_release(struct inode *, struct file *);
45static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
46static int nfs_file_mmap(struct file *, struct vm_area_struct *);
47static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
48 struct pipe_inode_info *pipe,
49 size_t count, unsigned int flags);
50static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
51 unsigned long nr_segs, loff_t pos);
52static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53 struct file *filp, loff_t *ppos,
54 size_t count, unsigned int flags);
55static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
56 unsigned long nr_segs, loff_t pos);
57static int nfs_file_flush(struct file *, fl_owner_t id);
58static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
59static int nfs_check_flags(int flags);
60static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
61static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
62static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
63
64static const struct vm_operations_struct nfs_file_vm_ops; 43static const struct vm_operations_struct nfs_file_vm_ops;
65 44
66const struct file_operations nfs_file_operations = {
67 .llseek = nfs_file_llseek,
68 .read = do_sync_read,
69 .write = do_sync_write,
70 .aio_read = nfs_file_read,
71 .aio_write = nfs_file_write,
72 .mmap = nfs_file_mmap,
73 .open = nfs_file_open,
74 .flush = nfs_file_flush,
75 .release = nfs_file_release,
76 .fsync = nfs_file_fsync,
77 .lock = nfs_lock,
78 .flock = nfs_flock,
79 .splice_read = nfs_file_splice_read,
80 .splice_write = nfs_file_splice_write,
81 .check_flags = nfs_check_flags,
82 .setlease = nfs_setlease,
83};
84
85const struct inode_operations nfs_file_inode_operations = { 45const struct inode_operations nfs_file_inode_operations = {
86 .permission = nfs_permission, 46 .permission = nfs_permission,
87 .getattr = nfs_getattr, 47 .getattr = nfs_getattr,
@@ -886,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
886 file->f_path.dentry->d_name.name, arg); 846 file->f_path.dentry->d_name.name, arg);
887 return -EINVAL; 847 return -EINVAL;
888} 848}
849
850const struct file_operations nfs_file_operations = {
851 .llseek = nfs_file_llseek,
852 .read = do_sync_read,
853 .write = do_sync_write,
854 .aio_read = nfs_file_read,
855 .aio_write = nfs_file_write,
856 .mmap = nfs_file_mmap,
857 .open = nfs_file_open,
858 .flush = nfs_file_flush,
859 .release = nfs_file_release,
860 .fsync = nfs_file_fsync,
861 .lock = nfs_lock,
862 .flock = nfs_flock,
863 .splice_read = nfs_file_splice_read,
864 .splice_write = nfs_file_splice_write,
865 .check_flags = nfs_check_flags,
866 .setlease = nfs_setlease,
867};
868
869#ifdef CONFIG_NFS_V4
870static int
871nfs4_file_open(struct inode *inode, struct file *filp)
872{
873 /*
874 * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
875 * this point, then something is very wrong
876 */
877 dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
878 return -ENOTDIR;
879}
880
881const struct file_operations nfs4_file_operations = {
882 .llseek = nfs_file_llseek,
883 .read = do_sync_read,
884 .write = do_sync_write,
885 .aio_read = nfs_file_read,
886 .aio_write = nfs_file_write,
887 .mmap = nfs_file_mmap,
888 .open = nfs4_file_open,
889 .flush = nfs_file_flush,
890 .release = nfs_file_release,
891 .fsync = nfs_file_fsync,
892 .lock = nfs_lock,
893 .flock = nfs_flock,
894 .splice_read = nfs_file_splice_read,
895 .splice_write = nfs_file_splice_write,
896 .check_flags = nfs_check_flags,
897 .setlease = nfs_setlease,
898};
899#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c07a55aec838..50a15fa8cf98 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 */ 291 */
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; 292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
293 if (S_ISREG(inode->i_mode)) { 293 if (S_ISREG(inode->i_mode)) {
294 inode->i_fop = &nfs_file_operations; 294 inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
295 inode->i_data.a_ops = &nfs_file_aops; 295 inode->i_data.a_ops = &nfs_file_aops;
296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; 296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
297 } else if (S_ISDIR(inode->i_mode)) { 297 } else if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c1a1bd8ddf1c..3f4d95751d52 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, 299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
300 struct list_head *head); 300 struct list_head *head);
301 301
302extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
303 struct inode *inode);
302extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); 304extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
303extern void nfs_readdata_release(struct nfs_read_data *rdata); 305extern void nfs_readdata_release(struct nfs_read_data *rdata);
304 306
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690ca08c..d4bc9ed91748 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
853 .dentry_ops = &nfs_dentry_operations, 853 .dentry_ops = &nfs_dentry_operations,
854 .dir_inode_ops = &nfs3_dir_inode_operations, 854 .dir_inode_ops = &nfs3_dir_inode_operations,
855 .file_inode_ops = &nfs3_file_inode_operations, 855 .file_inode_ops = &nfs3_file_inode_operations,
856 .file_ops = &nfs_file_operations,
856 .getroot = nfs3_proc_get_root, 857 .getroot = nfs3_proc_get_root,
857 .getattr = nfs3_proc_getattr, 858 .getattr = nfs3_proc_getattr,
858 .setattr = nfs3_proc_setattr, 859 .setattr = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b60fddf606f7..be2bbac13817 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
2464 case -NFS4ERR_BADNAME: 2464 case -NFS4ERR_BADNAME:
2465 return -ENOENT; 2465 return -ENOENT;
2466 case -NFS4ERR_MOVED: 2466 case -NFS4ERR_MOVED:
2467 err = nfs4_get_referral(dir, name, fattr, fhandle); 2467 return nfs4_get_referral(dir, name, fattr, fhandle);
2468 break;
2469 case -NFS4ERR_WRONGSEC: 2468 case -NFS4ERR_WRONGSEC:
2470 nfs_fixup_secinfo_attributes(fattr, fhandle); 2469 nfs_fixup_secinfo_attributes(fattr, fhandle);
2471 } 2470 }
@@ -6253,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
6253 .dentry_ops = &nfs4_dentry_operations, 6252 .dentry_ops = &nfs4_dentry_operations,
6254 .dir_inode_ops = &nfs4_dir_inode_operations, 6253 .dir_inode_ops = &nfs4_dir_inode_operations,
6255 .file_inode_ops = &nfs4_file_inode_operations, 6254 .file_inode_ops = &nfs4_file_inode_operations,
6255 .file_ops = &nfs4_file_operations,
6256 .getroot = nfs4_proc_get_root, 6256 .getroot = nfs4_proc_get_root,
6257 .getattr = nfs4_proc_getattr, 6257 .getattr = nfs4_proc_getattr,
6258 .setattr = nfs4_proc_setattr, 6258 .setattr = nfs4_proc_setattr,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index baf73536bc04..8e672a2b2d69 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1260,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1260} 1260}
1261EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1261EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1262 1262
1263static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1264{
1265 struct nfs_pageio_descriptor pgio;
1266
1267 put_lseg(data->lseg);
1268 data->lseg = NULL;
1269 dprintk("pnfs write error = %d\n", data->pnfs_error);
1270
1271 nfs_pageio_init_read_mds(&pgio, data->inode);
1272
1273 while (!list_empty(&data->pages)) {
1274 struct nfs_page *req = nfs_list_entry(data->pages.next);
1275
1276 nfs_list_remove_request(req);
1277 nfs_pageio_add_request(&pgio, req);
1278 }
1279 nfs_pageio_complete(&pgio);
1280}
1281
1263/* 1282/*
1264 * Called by non rpc-based layout drivers 1283 * Called by non rpc-based layout drivers
1265 */ 1284 */
@@ -1268,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
1268 if (likely(!data->pnfs_error)) { 1287 if (likely(!data->pnfs_error)) {
1269 __nfs4_read_done_cb(data); 1288 __nfs4_read_done_cb(data);
1270 data->mds_ops->rpc_call_done(&data->task, data); 1289 data->mds_ops->rpc_call_done(&data->task, data);
1271 } else { 1290 } else
1272 put_lseg(data->lseg); 1291 pnfs_ld_handle_read_error(data);
1273 data->lseg = NULL;
1274 dprintk("pnfs write error = %d\n", data->pnfs_error);
1275 }
1276 data->mds_ops->rpc_release(data); 1292 data->mds_ops->rpc_release(data);
1277} 1293}
1278EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1294EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7e..f48125da198a 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
710 .dentry_ops = &nfs_dentry_operations, 710 .dentry_ops = &nfs_dentry_operations,
711 .dir_inode_ops = &nfs_dir_inode_operations, 711 .dir_inode_ops = &nfs_dir_inode_operations,
712 .file_inode_ops = &nfs_file_inode_operations, 712 .file_inode_ops = &nfs_file_inode_operations,
713 .file_ops = &nfs_file_operations,
713 .getroot = nfs_proc_get_root, 714 .getroot = nfs_proc_get_root,
714 .getattr = nfs_proc_getattr, 715 .getattr = nfs_proc_getattr,
715 .setattr = nfs_proc_setattr, 716 .setattr = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8b48ec63f722..cfa175c223dc 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
109 } 109 }
110} 110}
111 111
112static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, 112void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
113 struct inode *inode) 113 struct inode *inode)
114{ 114{
115 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, 115 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
534static void nfs_readpage_release_full(void *calldata) 534static void nfs_readpage_release_full(void *calldata)
535{ 535{
536 struct nfs_read_data *data = calldata; 536 struct nfs_read_data *data = calldata;
537 struct nfs_pageio_descriptor pgio;
538 537
539 if (data->pnfs_error) {
540 nfs_pageio_init_read_mds(&pgio, data->inode);
541 pgio.pg_recoalesce = 1;
542 }
543 while (!list_empty(&data->pages)) { 538 while (!list_empty(&data->pages)) {
544 struct nfs_page *req = nfs_list_entry(data->pages.next); 539 struct nfs_page *req = nfs_list_entry(data->pages.next);
545 540
546 nfs_list_remove_request(req); 541 nfs_list_remove_request(req);
547 if (!data->pnfs_error) 542 nfs_readpage_release(req);
548 nfs_readpage_release(req);
549 else
550 nfs_pageio_add_request(&pgio, req);
551 } 543 }
552 if (data->pnfs_error)
553 nfs_pageio_complete(&pgio);
554 nfs_readdata_release(calldata); 544 nfs_readdata_release(calldata);
555} 545}
556 546
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 480b3b6bf71e..134777406ee3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void)
2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, 2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2788 const char *export_path) 2788 const char *export_path)
2789{ 2789{
2790 struct mnt_namespace *ns_private;
2791 struct super_block *s;
2792 struct dentry *dentry; 2790 struct dentry *dentry;
2793 struct path path; 2791 int ret = nfs_referral_loop_protect();
2794 int ret;
2795
2796 ns_private = create_mnt_ns(root_mnt);
2797 ret = PTR_ERR(ns_private);
2798 if (IS_ERR(ns_private))
2799 goto out_mntput;
2800
2801 ret = nfs_referral_loop_protect();
2802 if (ret != 0)
2803 goto out_put_mnt_ns;
2804 2792
2805 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2793 if (ret) {
2806 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2794 mntput(root_mnt);
2795 return ERR_PTR(ret);
2796 }
2807 2797
2798 dentry = mount_subtree(root_mnt, export_path);
2808 nfs_referral_loop_unprotect(); 2799 nfs_referral_loop_unprotect();
2809 put_mnt_ns(ns_private);
2810
2811 if (ret != 0)
2812 goto out_err;
2813
2814 s = path.mnt->mnt_sb;
2815 atomic_inc(&s->s_active);
2816 dentry = dget(path.dentry);
2817 2800
2818 path_put(&path);
2819 down_write(&s->s_umount);
2820 return dentry; 2801 return dentry;
2821out_put_mnt_ns:
2822 put_mnt_ns(ns_private);
2823out_mntput:
2824 mntput(root_mnt);
2825out_err:
2826 return ERR_PTR(ret);
2827} 2802}
2828 2803
2829static struct dentry *nfs4_try_mount(int flags, const char *dev_name, 2804static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ed553c60de82..3165aebb43c8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 OCFS2_JOURNAL_ACCESS_WRITE); 5699 OCFS2_JOURNAL_ACCESS_WRITE);
5700 if (ret) { 5700 if (ret) {
5701 mlog_errno(ret); 5701 mlog_errno(ret);
5702 goto out; 5702 goto out_commit;
5703 } 5703 }
5704 5704
5705 dquot_free_space_nodirty(inode, 5705 dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c1efe939c774..78b68af3b0e3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
290 } 290 }
291 291
292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
293 /*
294 * Unlock the page and cycle ip_alloc_sem so that we don't
295 * busyloop waiting for ip_alloc_sem to unlock
296 */
293 ret = AOP_TRUNCATED_PAGE; 297 ret = AOP_TRUNCATED_PAGE;
298 unlock_page(page);
299 unlock = 0;
300 down_read(&oi->ip_alloc_sem);
301 up_read(&oi->ip_alloc_sem);
294 goto out_inode_unlock; 302 goto out_inode_unlock;
295 } 303 }
296 304
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
563{ 571{
564 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 572 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
565 int level; 573 int level;
574 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
566 575
567 /* this io's submitter should not have unlocked this before we could */ 576 /* this io's submitter should not have unlocked this before we could */
568 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 577 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
570 if (ocfs2_iocb_is_sem_locked(iocb)) 579 if (ocfs2_iocb_is_sem_locked(iocb))
571 ocfs2_iocb_clear_sem_locked(iocb); 580 ocfs2_iocb_clear_sem_locked(iocb);
572 581
582 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
583 ocfs2_iocb_clear_unaligned_aio(iocb);
584
585 if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
586 waitqueue_active(wq)) {
587 wake_up_all(wq);
588 }
589 }
590
573 ocfs2_iocb_clear_rw_locked(iocb); 591 ocfs2_iocb_clear_rw_locked(iocb);
574 592
575 level = ocfs2_iocb_rw_locked_level(iocb); 593 level = ocfs2_iocb_rw_locked_level(iocb);
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt {
863 struct page *w_target_page; 881 struct page *w_target_page;
864 882
865 /* 883 /*
884 * w_target_locked is used for page_mkwrite path indicating no unlocking
885 * against w_target_page in ocfs2_write_end_nolock.
886 */
887 unsigned int w_target_locked:1;
888
889 /*
866 * ocfs2_write_end() uses this to know what the real range to 890 * ocfs2_write_end() uses this to know what the real range to
867 * write in the target should be. 891 * write in the target should be.
868 */ 892 */
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
895 919
896static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 920static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
897{ 921{
922 int i;
923
924 /*
925 * w_target_locked is only set to true in the page_mkwrite() case.
926 * The intent is to allow us to lock the target page from write_begin()
927 * to write_end(). The caller must hold a ref on w_target_page.
928 */
929 if (wc->w_target_locked) {
930 BUG_ON(!wc->w_target_page);
931 for (i = 0; i < wc->w_num_pages; i++) {
932 if (wc->w_target_page == wc->w_pages[i]) {
933 wc->w_pages[i] = NULL;
934 break;
935 }
936 }
937 mark_page_accessed(wc->w_target_page);
938 page_cache_release(wc->w_target_page);
939 }
898 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 940 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
899 941
900 brelse(wc->w_di_bh); 942 brelse(wc->w_di_bh);
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1132 */ 1174 */
1133 lock_page(mmap_page); 1175 lock_page(mmap_page);
1134 1176
1177 /* Exit and let the caller retry */
1135 if (mmap_page->mapping != mapping) { 1178 if (mmap_page->mapping != mapping) {
1179 WARN_ON(mmap_page->mapping);
1136 unlock_page(mmap_page); 1180 unlock_page(mmap_page);
1137 /* 1181 ret = -EAGAIN;
1138 * Sanity check - the locking in
1139 * ocfs2_pagemkwrite() should ensure
1140 * that this code doesn't trigger.
1141 */
1142 ret = -EINVAL;
1143 mlog_errno(ret);
1144 goto out; 1182 goto out;
1145 } 1183 }
1146 1184
1147 page_cache_get(mmap_page); 1185 page_cache_get(mmap_page);
1148 wc->w_pages[i] = mmap_page; 1186 wc->w_pages[i] = mmap_page;
1187 wc->w_target_locked = true;
1149 } else { 1188 } else {
1150 wc->w_pages[i] = find_or_create_page(mapping, index, 1189 wc->w_pages[i] = find_or_create_page(mapping, index,
1151 GFP_NOFS); 1190 GFP_NOFS);
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1160 wc->w_target_page = wc->w_pages[i]; 1199 wc->w_target_page = wc->w_pages[i];
1161 } 1200 }
1162out: 1201out:
1202 if (ret)
1203 wc->w_target_locked = false;
1163 return ret; 1204 return ret;
1164} 1205}
1165 1206
@@ -1817,11 +1858,23 @@ try_again:
1817 */ 1858 */
1818 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, 1859 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1819 cluster_of_pages, mmap_page); 1860 cluster_of_pages, mmap_page);
1820 if (ret) { 1861 if (ret && ret != -EAGAIN) {
1821 mlog_errno(ret); 1862 mlog_errno(ret);
1822 goto out_quota; 1863 goto out_quota;
1823 } 1864 }
1824 1865
1866 /*
1867 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
1868 * the target page. In this case, we exit with no error and no target
1869 * page. This will trigger the caller, page_mkwrite(), to re-try
1870 * the operation.
1871 */
1872 if (ret == -EAGAIN) {
1873 BUG_ON(wc->w_target_page);
1874 ret = 0;
1875 goto out_quota;
1876 }
1877
1825 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1878 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1826 len); 1879 len);
1827 if (ret) { 1880 if (ret) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad987a6..ffb2da370a99 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0, 78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL, 79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM, 80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_UNALIGNED_IO,
81 OCFS2_IOCB_NUM_LOCKS 82 OCFS2_IOCB_NUM_LOCKS
82}; 83};
83 84
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 92 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \ 93#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 94 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
95
96#define ocfs2_iocb_set_unaligned_aio(iocb) \
97 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
98#define ocfs2_iocb_clear_unaligned_aio(iocb) \
99 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
100#define ocfs2_iocb_is_unaligned_aio(iocb) \
101 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
102
103#define OCFS2_IOEND_WQ_HASH_SZ 37
104#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
105 OCFS2_IOEND_WQ_HASH_SZ])
106extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
107
94#endif /* OCFS2_FILE_H */ 108#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27b..a4e855e3690e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
216 216
217 struct list_head hr_all_item; 217 struct list_head hr_all_item;
218 unsigned hr_unclean_stop:1, 218 unsigned hr_unclean_stop:1,
219 hr_aborted_start:1,
219 hr_item_pinned:1, 220 hr_item_pinned:1,
220 hr_item_dropped:1; 221 hr_item_dropped:1;
221 222
@@ -254,6 +255,10 @@ struct o2hb_region {
254 * a more complete api that doesn't lead to this sort of fragility. */ 255 * a more complete api that doesn't lead to this sort of fragility. */
255 atomic_t hr_steady_iterations; 256 atomic_t hr_steady_iterations;
256 257
258 /* terminate o2hb thread if it does not reach steady state
259 * (hr_steady_iterations == 0) within hr_unsteady_iterations */
260 atomic_t hr_unsteady_iterations;
261
257 char hr_dev_name[BDEVNAME_SIZE]; 262 char hr_dev_name[BDEVNAME_SIZE];
258 263
259 unsigned int hr_timeout_ms; 264 unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
324 329
325static void o2hb_arm_write_timeout(struct o2hb_region *reg) 330static void o2hb_arm_write_timeout(struct o2hb_region *reg)
326{ 331{
332 /* Arm writeout only after thread reaches steady state */
333 if (atomic_read(&reg->hr_steady_iterations) != 0)
334 return;
335
327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 336 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
328 O2HB_MAX_WRITE_TIMEOUT_MS); 337 O2HB_MAX_WRITE_TIMEOUT_MS);
329 338
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
537 return read == computed; 546 return read == computed;
538} 547}
539 548
540/* We want to make sure that nobody is heartbeating on top of us -- 549/*
541 * this will help detect an invalid configuration. */ 550 * Compare the slot data with what we wrote in the last iteration.
542static void o2hb_check_last_timestamp(struct o2hb_region *reg) 551 * If the match fails, print an appropriate error message. This is to
552 * detect errors like... another node hearting on the same slot,
553 * flaky device that is losing writes, etc.
554 * Returns 1 if check succeeds, 0 otherwise.
555 */
556static int o2hb_check_own_slot(struct o2hb_region *reg)
543{ 557{
544 struct o2hb_disk_slot *slot; 558 struct o2hb_disk_slot *slot;
545 struct o2hb_disk_heartbeat_block *hb_block; 559 struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
548 slot = &reg->hr_slots[o2nm_this_node()]; 562 slot = &reg->hr_slots[o2nm_this_node()];
549 /* Don't check on our 1st timestamp */ 563 /* Don't check on our 1st timestamp */
550 if (!slot->ds_last_time) 564 if (!slot->ds_last_time)
551 return; 565 return 0;
552 566
553 hb_block = slot->ds_raw_block; 567 hb_block = slot->ds_raw_block;
554 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && 568 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
555 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && 569 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
556 hb_block->hb_node == slot->ds_node_num) 570 hb_block->hb_node == slot->ds_node_num)
557 return; 571 return 1;
558 572
559#define ERRSTR1 "Another node is heartbeating on device" 573#define ERRSTR1 "Another node is heartbeating on device"
560#define ERRSTR2 "Heartbeat generation mismatch on device" 574#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
574 (unsigned long long)slot->ds_last_time, hb_block->hb_node, 588 (unsigned long long)slot->ds_last_time, hb_block->hb_node,
575 (unsigned long long)le64_to_cpu(hb_block->hb_generation), 589 (unsigned long long)le64_to_cpu(hb_block->hb_generation),
576 (unsigned long long)le64_to_cpu(hb_block->hb_seq)); 590 (unsigned long long)le64_to_cpu(hb_block->hb_seq));
591
592 return 0;
577} 593}
578 594
579static inline void o2hb_prepare_block(struct o2hb_region *reg, 595static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
719 o2nm_node_put(node); 735 o2nm_node_put(node);
720} 736}
721 737
722static void o2hb_set_quorum_device(struct o2hb_region *reg, 738static void o2hb_set_quorum_device(struct o2hb_region *reg)
723 struct o2hb_disk_slot *slot)
724{ 739{
725 assert_spin_locked(&o2hb_live_lock);
726
727 if (!o2hb_global_heartbeat_active()) 740 if (!o2hb_global_heartbeat_active())
728 return; 741 return;
729 742
730 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 743 /* Prevent race with o2hb_heartbeat_group_drop_item() */
744 if (kthread_should_stop())
745 return;
746
747 /* Tag region as quorum only after thread reaches steady state */
748 if (atomic_read(&reg->hr_steady_iterations) != 0)
731 return; 749 return;
732 750
751 spin_lock(&o2hb_live_lock);
752
753 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
754 goto unlock;
755
733 /* 756 /*
734 * A region can be added to the quorum only when it sees all 757 * A region can be added to the quorum only when it sees all
735 * live nodes heartbeat on it. In other words, the region has been 758 * live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
737 */ 760 */
738 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, 761 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
739 sizeof(o2hb_live_node_bitmap))) 762 sizeof(o2hb_live_node_bitmap)))
740 return; 763 goto unlock;
741
742 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
743 return;
744 764
745 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", 765 printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
746 config_item_name(&reg->hr_item)); 766 config_item_name(&reg->hr_item), reg->hr_dev_name);
747 767
748 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 768 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
749 769
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
754 if (o2hb_pop_count(&o2hb_quorum_region_bitmap, 774 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
755 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) 775 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
756 o2hb_region_unpin(NULL); 776 o2hb_region_unpin(NULL);
777unlock:
778 spin_unlock(&o2hb_live_lock);
757} 779}
758 780
759static int o2hb_check_slot(struct o2hb_region *reg, 781static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
925 slot->ds_equal_samples = 0; 947 slot->ds_equal_samples = 0;
926 } 948 }
927out: 949out:
928 o2hb_set_quorum_device(reg, slot);
929
930 spin_unlock(&o2hb_live_lock); 950 spin_unlock(&o2hb_live_lock);
931 951
932 o2hb_run_event_list(&event); 952 o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
957 977
958static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) 978static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
959{ 979{
960 int i, ret, highest_node, change = 0; 980 int i, ret, highest_node;
981 int membership_change = 0, own_slot_ok = 0;
961 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 982 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
962 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 983 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
963 struct o2hb_bio_wait_ctxt write_wc; 984 struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
966 sizeof(configured_nodes)); 987 sizeof(configured_nodes));
967 if (ret) { 988 if (ret) {
968 mlog_errno(ret); 989 mlog_errno(ret);
969 return ret; 990 goto bail;
970 } 991 }
971 992
972 /* 993 /*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
982 1003
983 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 1004 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
984 if (highest_node >= O2NM_MAX_NODES) { 1005 if (highest_node >= O2NM_MAX_NODES) {
985 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 1006 mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
986 return -EINVAL; 1007 ret = -EINVAL;
1008 goto bail;
987 } 1009 }
988 1010
989 /* No sense in reading the slots of nodes that don't exist 1011 /* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
993 ret = o2hb_read_slots(reg, highest_node + 1); 1015 ret = o2hb_read_slots(reg, highest_node + 1);
994 if (ret < 0) { 1016 if (ret < 0) {
995 mlog_errno(ret); 1017 mlog_errno(ret);
996 return ret; 1018 goto bail;
997 } 1019 }
998 1020
999 /* With an up to date view of the slots, we can check that no 1021 /* With an up to date view of the slots, we can check that no
1000 * other node has been improperly configured to heartbeat in 1022 * other node has been improperly configured to heartbeat in
1001 * our slot. */ 1023 * our slot. */
1002 o2hb_check_last_timestamp(reg); 1024 own_slot_ok = o2hb_check_own_slot(reg);
1003 1025
1004 /* fill in the proper info for our next heartbeat */ 1026 /* fill in the proper info for our next heartbeat */
1005 o2hb_prepare_block(reg, reg->hr_generation); 1027 o2hb_prepare_block(reg, reg->hr_generation);
1006 1028
1007 /* And fire off the write. Note that we don't wait on this I/O
1008 * until later. */
1009 ret = o2hb_issue_node_write(reg, &write_wc); 1029 ret = o2hb_issue_node_write(reg, &write_wc);
1010 if (ret < 0) { 1030 if (ret < 0) {
1011 mlog_errno(ret); 1031 mlog_errno(ret);
1012 return ret; 1032 goto bail;
1013 } 1033 }
1014 1034
1015 i = -1; 1035 i = -1;
1016 while((i = find_next_bit(configured_nodes, 1036 while((i = find_next_bit(configured_nodes,
1017 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { 1037 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
1018 change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1038 membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
1019 } 1039 }
1020 1040
1021 /* 1041 /*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1030 * disk */ 1050 * disk */
1031 mlog(ML_ERROR, "Write error %d on device \"%s\"\n", 1051 mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
1032 write_wc.wc_error, reg->hr_dev_name); 1052 write_wc.wc_error, reg->hr_dev_name);
1033 return write_wc.wc_error; 1053 ret = write_wc.wc_error;
1054 goto bail;
1034 } 1055 }
1035 1056
1036 o2hb_arm_write_timeout(reg); 1057 /* Skip disarming the timeout if own slot has stale/bad data */
1058 if (own_slot_ok) {
1059 o2hb_set_quorum_device(reg);
1060 o2hb_arm_write_timeout(reg);
1061 }
1037 1062
1063bail:
1038 /* let the person who launched us know when things are steady */ 1064 /* let the person who launched us know when things are steady */
1039 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { 1065 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1040 if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1066 if (!ret && own_slot_ok && !membership_change) {
1067 if (atomic_dec_and_test(&reg->hr_steady_iterations))
1068 wake_up(&o2hb_steady_queue);
1069 }
1070 }
1071
1072 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1073 if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
1074 printk(KERN_NOTICE "o2hb: Unable to stabilize "
1075 "heartbeart on region %s (%s)\n",
1076 config_item_name(&reg->hr_item),
1077 reg->hr_dev_name);
1078 atomic_set(&reg->hr_steady_iterations, 0);
1079 reg->hr_aborted_start = 1;
1041 wake_up(&o2hb_steady_queue); 1080 wake_up(&o2hb_steady_queue);
1081 ret = -EIO;
1082 }
1042 } 1083 }
1043 1084
1044 return 0; 1085 return ret;
1045} 1086}
1046 1087
1047/* Subtract b from a, storing the result in a. a *must* have a larger 1088/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
1095 /* Pin node */ 1136 /* Pin node */
1096 o2nm_depend_this_node(); 1137 o2nm_depend_this_node();
1097 1138
1098 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1139 while (!kthread_should_stop() &&
1140 !reg->hr_unclean_stop && !reg->hr_aborted_start) {
1099 /* We track the time spent inside 1141 /* We track the time spent inside
1100 * o2hb_do_disk_heartbeat so that we avoid more than 1142 * o2hb_do_disk_heartbeat so that we avoid more than
1101 * hr_timeout_ms between disk writes. On busy systems 1143 * hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
1103 * likely to time itself out. */ 1145 * likely to time itself out. */
1104 do_gettimeofday(&before_hb); 1146 do_gettimeofday(&before_hb);
1105 1147
1106 i = 0; 1148 ret = o2hb_do_disk_heartbeat(reg);
1107 do {
1108 ret = o2hb_do_disk_heartbeat(reg);
1109 } while (ret && ++i < 2);
1110 1149
1111 do_gettimeofday(&after_hb); 1150 do_gettimeofday(&after_hb);
1112 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1151 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
1117 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1156 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1118 elapsed_msec); 1157 elapsed_msec);
1119 1158
1120 if (elapsed_msec < reg->hr_timeout_ms) { 1159 if (!kthread_should_stop() &&
1160 elapsed_msec < reg->hr_timeout_ms) {
1121 /* the kthread api has blocked signals for us so no 1161 /* the kthread api has blocked signals for us so no
1122 * need to record the return value. */ 1162 * need to record the return value. */
1123 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); 1163 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
1134 * to timeout on this region when we could just as easily 1174 * to timeout on this region when we could just as easily
1135 * write a clear generation - thus indicating to them that 1175 * write a clear generation - thus indicating to them that
1136 * this node has left this region. 1176 * this node has left this region.
1137 * 1177 */
1138 * XXX: Should we skip this on unclean_stop? */ 1178 if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
1139 o2hb_prepare_block(reg, 0); 1179 o2hb_prepare_block(reg, 0);
1140 ret = o2hb_issue_node_write(reg, &write_wc); 1180 ret = o2hb_issue_node_write(reg, &write_wc);
1141 if (ret == 0) { 1181 if (ret == 0)
1142 o2hb_wait_on_io(reg, &write_wc); 1182 o2hb_wait_on_io(reg, &write_wc);
1143 } else { 1183 else
1144 mlog_errno(ret); 1184 mlog_errno(ret);
1145 } 1185 }
1146 1186
1147 /* Unpin node */ 1187 /* Unpin node */
1148 o2nm_undepend_this_node(); 1188 o2nm_undepend_this_node();
1149 1189
1150 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1190 mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
1151 1191
1152 return 0; 1192 return 0;
1153} 1193}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1158 struct o2hb_debug_buf *db = inode->i_private; 1198 struct o2hb_debug_buf *db = inode->i_private;
1159 struct o2hb_region *reg; 1199 struct o2hb_region *reg;
1160 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1200 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1201 unsigned long lts;
1161 char *buf = NULL; 1202 char *buf = NULL;
1162 int i = -1; 1203 int i = -1;
1163 int out = 0; 1204 int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1194 1235
1195 case O2HB_DB_TYPE_REGION_ELAPSED_TIME: 1236 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1196 reg = (struct o2hb_region *)db->db_data; 1237 reg = (struct o2hb_region *)db->db_data;
1197 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", 1238 lts = reg->hr_last_timeout_start;
1198 jiffies_to_msecs(jiffies - 1239 /* If 0, it has never been set before */
1199 reg->hr_last_timeout_start)); 1240 if (lts)
1241 lts = jiffies_to_msecs(jiffies - lts);
1242 out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
1200 goto done; 1243 goto done;
1201 1244
1202 case O2HB_DB_TYPE_REGION_PINNED: 1245 case O2HB_DB_TYPE_REGION_PINNED:
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
1426 struct page *page; 1469 struct page *page;
1427 struct o2hb_region *reg = to_o2hb_region(item); 1470 struct o2hb_region *reg = to_o2hb_region(item);
1428 1471
1472 mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
1473
1429 if (reg->hr_tmp_block) 1474 if (reg->hr_tmp_block)
1430 kfree(reg->hr_tmp_block); 1475 kfree(reg->hr_tmp_block);
1431 1476
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1792 live_threshold <<= 1; 1837 live_threshold <<= 1;
1793 spin_unlock(&o2hb_live_lock); 1838 spin_unlock(&o2hb_live_lock);
1794 } 1839 }
1795 atomic_set(&reg->hr_steady_iterations, live_threshold + 1); 1840 ++live_threshold;
1841 atomic_set(&reg->hr_steady_iterations, live_threshold);
1842 /* unsteady_iterations is double the steady_iterations */
1843 atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
1796 1844
1797 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1845 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1798 reg->hr_item.ci_name); 1846 reg->hr_item.ci_name);
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1809 ret = wait_event_interruptible(o2hb_steady_queue, 1857 ret = wait_event_interruptible(o2hb_steady_queue,
1810 atomic_read(&reg->hr_steady_iterations) == 0); 1858 atomic_read(&reg->hr_steady_iterations) == 0);
1811 if (ret) { 1859 if (ret) {
1812 /* We got interrupted (hello ptrace!). Clean up */ 1860 atomic_set(&reg->hr_steady_iterations, 0);
1813 spin_lock(&o2hb_live_lock); 1861 reg->hr_aborted_start = 1;
1814 hb_task = reg->hr_task; 1862 }
1815 reg->hr_task = NULL;
1816 spin_unlock(&o2hb_live_lock);
1817 1863
1818 if (hb_task) 1864 if (reg->hr_aborted_start) {
1819 kthread_stop(hb_task); 1865 ret = -EIO;
1820 goto out; 1866 goto out;
1821 } 1867 }
1822 1868
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1833 ret = -EIO; 1879 ret = -EIO;
1834 1880
1835 if (hb_task && o2hb_global_heartbeat_active()) 1881 if (hb_task && o2hb_global_heartbeat_active())
1836 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", 1882 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
1837 config_item_name(&reg->hr_item)); 1883 config_item_name(&reg->hr_item), reg->hr_dev_name);
1838 1884
1839out: 1885out:
1840 if (filp) 1886 if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2092 2138
2093 /* stop the thread when the user removes the region dir */ 2139 /* stop the thread when the user removes the region dir */
2094 spin_lock(&o2hb_live_lock); 2140 spin_lock(&o2hb_live_lock);
2095 if (o2hb_global_heartbeat_active()) {
2096 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2097 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2098 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2099 quorum_region = 1;
2100 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2101 }
2102 hb_task = reg->hr_task; 2141 hb_task = reg->hr_task;
2103 reg->hr_task = NULL; 2142 reg->hr_task = NULL;
2104 reg->hr_item_dropped = 1; 2143 reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2107 if (hb_task) 2146 if (hb_task)
2108 kthread_stop(hb_task); 2147 kthread_stop(hb_task);
2109 2148
2149 if (o2hb_global_heartbeat_active()) {
2150 spin_lock(&o2hb_live_lock);
2151 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2152 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2153 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2154 quorum_region = 1;
2155 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2156 spin_unlock(&o2hb_live_lock);
2157 printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
2158 ((atomic_read(&reg->hr_steady_iterations) == 0) ?
2159 "stopped" : "start aborted"), config_item_name(item),
2160 reg->hr_dev_name);
2161 }
2162
2110 /* 2163 /*
2111 * If we're racing a dev_write(), we need to wake them. They will 2164 * If we're racing a dev_write(), we need to wake them. They will
2112 * check reg->hr_task 2165 * check reg->hr_task
2113 */ 2166 */
2114 if (atomic_read(&reg->hr_steady_iterations) != 0) { 2167 if (atomic_read(&reg->hr_steady_iterations) != 0) {
2168 reg->hr_aborted_start = 1;
2115 atomic_set(&reg->hr_steady_iterations, 0); 2169 atomic_set(&reg->hr_steady_iterations, 0);
2116 wake_up(&o2hb_steady_queue); 2170 wake_up(&o2hb_steady_queue);
2117 } 2171 }
2118 2172
2119 if (o2hb_global_heartbeat_active())
2120 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2121 config_item_name(&reg->hr_item));
2122
2123 config_item_put(item); 2173 config_item_put(item);
2124 2174
2125 if (!o2hb_global_heartbeat_active() || !quorum_region) 2175 if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a5835904b3d..dc45deb19e68 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats" 49#define STATS_DEBUG_NAME "stats"
50#define NODES_DEBUG_NAME "connected_nodes"
50 51
51#define SHOW_SOCK_CONTAINERS 0 52#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1 53#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
55static struct dentry *sc_dentry; 56static struct dentry *sc_dentry;
56static struct dentry *nst_dentry; 57static struct dentry *nst_dentry;
57static struct dentry *stats_dentry; 58static struct dentry *stats_dentry;
59static struct dentry *nodes_dentry;
58 60
59static DEFINE_SPINLOCK(o2net_debug_lock); 61static DEFINE_SPINLOCK(o2net_debug_lock);
60 62
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
491 .release = sc_fop_release, 493 .release = sc_fop_release,
492}; 494};
493 495
494int o2net_debugfs_init(void) 496static int o2net_fill_bitmap(char *buf, int len)
495{ 497{
496 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); 498 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
497 if (!o2net_dentry) { 499 int i = -1, out = 0;
498 mlog_errno(-ENOMEM);
499 goto bail;
500 }
501 500
502 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, 501 o2net_fill_node_map(map, sizeof(map));
503 o2net_dentry, NULL,
504 &nst_seq_fops);
505 if (!nst_dentry) {
506 mlog_errno(-ENOMEM);
507 goto bail;
508 }
509 502
510 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, 503 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
511 o2net_dentry, NULL, 504 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
512 &sc_seq_fops); 505 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
513 if (!sc_dentry) {
514 mlog_errno(-ENOMEM);
515 goto bail;
516 }
517 506
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, 507 return out;
519 o2net_dentry, NULL, 508}
520 &stats_seq_fops); 509
521 if (!stats_dentry) { 510static int nodes_fop_open(struct inode *inode, struct file *file)
522 mlog_errno(-ENOMEM); 511{
523 goto bail; 512 char *buf;
524 } 513
514 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
515 if (!buf)
516 return -ENOMEM;
517
518 i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
519
520 file->private_data = buf;
525 521
526 return 0; 522 return 0;
527bail:
528 debugfs_remove(stats_dentry);
529 debugfs_remove(sc_dentry);
530 debugfs_remove(nst_dentry);
531 debugfs_remove(o2net_dentry);
532 return -ENOMEM;
533} 523}
534 524
525static int o2net_debug_release(struct inode *inode, struct file *file)
526{
527 kfree(file->private_data);
528 return 0;
529}
530
531static ssize_t o2net_debug_read(struct file *file, char __user *buf,
532 size_t nbytes, loff_t *ppos)
533{
534 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
535 i_size_read(file->f_mapping->host));
536}
537
538static const struct file_operations nodes_fops = {
539 .open = nodes_fop_open,
540 .release = o2net_debug_release,
541 .read = o2net_debug_read,
542 .llseek = generic_file_llseek,
543};
544
535void o2net_debugfs_exit(void) 545void o2net_debugfs_exit(void)
536{ 546{
547 debugfs_remove(nodes_dentry);
537 debugfs_remove(stats_dentry); 548 debugfs_remove(stats_dentry);
538 debugfs_remove(sc_dentry); 549 debugfs_remove(sc_dentry);
539 debugfs_remove(nst_dentry); 550 debugfs_remove(nst_dentry);
540 debugfs_remove(o2net_dentry); 551 debugfs_remove(o2net_dentry);
541} 552}
542 553
554int o2net_debugfs_init(void)
555{
556 mode_t mode = S_IFREG|S_IRUSR;
557
558 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
559 if (o2net_dentry)
560 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
561 o2net_dentry, NULL, &nst_seq_fops);
562 if (nst_dentry)
563 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
564 o2net_dentry, NULL, &sc_seq_fops);
565 if (sc_dentry)
566 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
567 o2net_dentry, NULL, &stats_seq_fops);
568 if (stats_dentry)
569 nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
570 o2net_dentry, NULL, &nodes_fops);
571 if (nodes_dentry)
572 return 0;
573
574 o2net_debugfs_exit();
575 mlog_errno(-ENOMEM);
576 return -ENOMEM;
577}
578
543#endif /* CONFIG_DEBUG_FS */ 579#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ad7d0c155de4..044e7b58d31c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -546,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
546 } 546 }
547 547
548 if (was_valid && !valid) { 548 if (was_valid && !valid) {
549 printk(KERN_NOTICE "o2net: no longer connected to " 549 printk(KERN_NOTICE "o2net: No longer connected to "
550 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 550 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
551 o2net_complete_nodes_nsw(nn); 551 o2net_complete_nodes_nsw(nn);
552 } 552 }
@@ -556,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
556 cancel_delayed_work(&nn->nn_connect_expired); 556 cancel_delayed_work(&nn->nn_connect_expired);
557 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", 557 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
558 o2nm_this_node() > sc->sc_node->nd_num ? 558 o2nm_this_node() > sc->sc_node->nd_num ?
559 "connected to" : "accepted connection from", 559 "Connected to" : "Accepted connection from",
560 SC_NODEF_ARGS(sc)); 560 SC_NODEF_ARGS(sc));
561 } 561 }
562 562
@@ -644,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
644 o2net_sc_queue_work(sc, &sc->sc_connect_work); 644 o2net_sc_queue_work(sc, &sc->sc_connect_work);
645 break; 645 break;
646 default: 646 default:
647 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT 647 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
648 " shutdown, state %d\n", 648 " shutdown, state %d\n",
649 SC_NODEF_ARGS(sc), sk->sk_state); 649 SC_NODEF_ARGS(sc), sk->sk_state);
650 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 650 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1035,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
1035 return ret; 1035 return ret;
1036} 1036}
1037 1037
1038/* Get a map of all nodes to which this node is currently connected to */
1039void o2net_fill_node_map(unsigned long *map, unsigned bytes)
1040{
1041 struct o2net_sock_container *sc;
1042 int node, ret;
1043
1044 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
1045
1046 memset(map, 0, bytes);
1047 for (node = 0; node < O2NM_MAX_NODES; ++node) {
1048 o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
1049 if (!ret) {
1050 set_bit(node, map);
1051 sc_put(sc);
1052 }
1053 }
1054}
1055EXPORT_SYMBOL_GPL(o2net_fill_node_map);
1056
1038int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 1057int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1039 size_t caller_veclen, u8 target_node, int *status) 1058 size_t caller_veclen, u8 target_node, int *status)
1040{ 1059{
@@ -1285,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1285 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1304 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1286 1305
1287 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { 1306 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1288 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " 1307 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
1289 "version %llu but %llu is required, disconnecting\n", 1308 "protocol version %llu but %llu is required. "
1290 SC_NODEF_ARGS(sc), 1309 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1291 (unsigned long long)be64_to_cpu(hand->protocol_version), 1310 (unsigned long long)be64_to_cpu(hand->protocol_version),
1292 O2NET_PROTOCOL_VERSION); 1311 O2NET_PROTOCOL_VERSION);
1293 1312
1294 /* don't bother reconnecting if its the wrong version. */ 1313 /* don't bother reconnecting if its the wrong version. */
1295 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1314 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1303,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1303 */ 1322 */
1304 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1323 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1305 o2net_idle_timeout()) { 1324 o2net_idle_timeout()) {
1306 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1325 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
1307 "%u ms, but we use %u ms locally. disconnecting\n", 1326 "idle timeout of %u ms, but we use %u ms locally. "
1308 SC_NODEF_ARGS(sc), 1327 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1309 be32_to_cpu(hand->o2net_idle_timeout_ms), 1328 be32_to_cpu(hand->o2net_idle_timeout_ms),
1310 o2net_idle_timeout()); 1329 o2net_idle_timeout());
1311 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1330 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1312 return -1; 1331 return -1;
1313 } 1332 }
1314 1333
1315 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1334 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1316 o2net_keepalive_delay()) { 1335 o2net_keepalive_delay()) {
1317 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1336 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
1318 "%u ms, but we use %u ms locally. disconnecting\n", 1337 "delay of %u ms, but we use %u ms locally. "
1319 SC_NODEF_ARGS(sc), 1338 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1320 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1339 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1321 o2net_keepalive_delay()); 1340 o2net_keepalive_delay());
1322 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1341 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1323 return -1; 1342 return -1;
1324 } 1343 }
1325 1344
1326 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1345 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
1327 O2HB_MAX_WRITE_TIMEOUT_MS) { 1346 O2HB_MAX_WRITE_TIMEOUT_MS) {
1328 mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1347 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
1329 "%u ms, but we use %u ms locally. disconnecting\n", 1348 "timeout of %u ms, but we use %u ms locally. "
1330 SC_NODEF_ARGS(sc), 1349 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1331 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1350 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
1332 O2HB_MAX_WRITE_TIMEOUT_MS); 1351 O2HB_MAX_WRITE_TIMEOUT_MS);
1333 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1352 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1334 return -1; 1353 return -1;
1335 } 1354 }
@@ -1540,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
1540{ 1559{
1541 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1560 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1542 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1561 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1543
1544#ifdef CONFIG_DEBUG_FS 1562#ifdef CONFIG_DEBUG_FS
1545 ktime_t now = ktime_get(); 1563 unsigned long msecs = ktime_to_ms(ktime_get()) -
1564 ktime_to_ms(sc->sc_tv_timer);
1565#else
1566 unsigned long msecs = o2net_idle_timeout();
1546#endif 1567#endif
1547 1568
1548 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1569 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
1549 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1570 "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
1550 o2net_idle_timeout() / 1000, 1571 msecs / 1000, msecs % 1000);
1551 o2net_idle_timeout() % 1000);
1552
1553#ifdef CONFIG_DEBUG_FS
1554 mlog(ML_NOTICE, "Here are some times that might help debug the "
1555 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1556 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1557 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1558 (long long)ktime_to_us(sc->sc_tv_data_ready),
1559 (long long)ktime_to_us(sc->sc_tv_advance_start),
1560 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1561 sc->sc_msg_key, sc->sc_msg_type,
1562 (long long)ktime_to_us(sc->sc_tv_func_start),
1563 (long long)ktime_to_us(sc->sc_tv_func_stop));
1564#endif
1565 1572
1566 /* 1573 /*
1567 * Initialize the nn_timeout so that the next connection attempt 1574 * Initialize the nn_timeout so that the next connection attempt
@@ -1694,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
1694 1701
1695out: 1702out:
1696 if (ret) { 1703 if (ret) {
1697 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " 1704 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
1698 "with errno %d\n", SC_NODEF_ARGS(sc), ret); 1705 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
1699 /* 0 err so that another will be queued and attempted 1706 /* 0 err so that another will be queued and attempted
1700 * from set_nn_state */ 1707 * from set_nn_state */
1701 if (sc) 1708 if (sc)
@@ -1718,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
1718 1725
1719 spin_lock(&nn->nn_lock); 1726 spin_lock(&nn->nn_lock);
1720 if (!nn->nn_sc_valid) { 1727 if (!nn->nn_sc_valid) {
1721 mlog(ML_ERROR, "no connection established with node %u after " 1728 printk(KERN_NOTICE "o2net: No connection established with "
1722 "%u.%u seconds, giving up and returning errors.\n", 1729 "node %u after %u.%u seconds, giving up.\n",
1723 o2net_num_from_nn(nn), 1730 o2net_num_from_nn(nn),
1724 o2net_idle_timeout() / 1000, 1731 o2net_idle_timeout() / 1000,
1725 o2net_idle_timeout() % 1000); 1732 o2net_idle_timeout() % 1000);
@@ -1862,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
1862 1869
1863 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1870 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
1864 if (node == NULL) { 1871 if (node == NULL) {
1865 mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", 1872 printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
1866 &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1873 "node at %pI4:%d\n", &sin.sin_addr.s_addr,
1874 ntohs(sin.sin_port));
1867 ret = -EINVAL; 1875 ret = -EINVAL;
1868 goto out; 1876 goto out;
1869 } 1877 }
1870 1878
1871 if (o2nm_this_node() >= node->nd_num) { 1879 if (o2nm_this_node() >= node->nd_num) {
1872 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1880 local_node = o2nm_get_node_by_num(o2nm_this_node());
1873 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" 1881 printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
1874 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", 1882 "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
1875 local_node->nd_name, local_node->nd_num, 1883 "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
1876 &(local_node->nd_ipv4_address), 1884 &(local_node->nd_ipv4_address),
1877 ntohs(local_node->nd_ipv4_port), 1885 ntohs(local_node->nd_ipv4_port), node->nd_name,
1878 node->nd_name, node->nd_num, &sin.sin_addr.s_addr, 1886 node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
1879 ntohs(sin.sin_port));
1880 ret = -EINVAL; 1887 ret = -EINVAL;
1881 goto out; 1888 goto out;
1882 } 1889 }
@@ -1901,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
1901 ret = 0; 1908 ret = 0;
1902 spin_unlock(&nn->nn_lock); 1909 spin_unlock(&nn->nn_lock);
1903 if (ret) { 1910 if (ret) {
1904 mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1911 printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
1905 "%pI4:%d but it already has an open connection\n", 1912 "at %pI4:%d but it already has an open connection\n",
1906 node->nd_name, &sin.sin_addr.s_addr, 1913 node->nd_name, &sin.sin_addr.s_addr,
1907 ntohs(sin.sin_port)); 1914 ntohs(sin.sin_port));
1908 goto out; 1915 goto out;
1909 } 1916 }
1910 1917
@@ -1984,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1984 1991
1985 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1992 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1986 if (ret < 0) { 1993 if (ret < 0) {
1987 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); 1994 printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
1988 goto out; 1995 goto out;
1989 } 1996 }
1990 1997
@@ -2001,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
2001 sock->sk->sk_reuse = 1; 2008 sock->sk->sk_reuse = 1;
2002 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 2009 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
2003 if (ret < 0) { 2010 if (ret < 0) {
2004 mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " 2011 printk(KERN_ERR "o2net: Error %d while binding socket at "
2005 "ret=%d\n", &addr, ntohs(port), ret); 2012 "%pI4:%u\n", ret, &addr, ntohs(port));
2006 goto out; 2013 goto out;
2007 } 2014 }
2008 2015
2009 ret = sock->ops->listen(sock, 64); 2016 ret = sock->ops->listen(sock, 64);
2010 if (ret < 0) { 2017 if (ret < 0)
2011 mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", 2018 printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
2012 &addr, ntohs(port), ret); 2019 ret, &addr, ntohs(port));
2013 }
2014 2020
2015out: 2021out:
2016 if (ret) { 2022 if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d4..5bada2a69b50 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
106 struct list_head *unreg_list); 106 struct list_head *unreg_list);
107void o2net_unregister_handler_list(struct list_head *list); 107void o2net_unregister_handler_list(struct list_head *list);
108 108
109void o2net_fill_node_map(unsigned long *map, unsigned bytes);
110
109struct o2nm_node; 111struct o2nm_node;
110int o2net_register_hb_callbacks(void); 112int o2net_register_hb_callbacks(void);
111void o2net_unregister_hb_callbacks(void); 113void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e2878b5895fb..8fe4e2892ab9 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1184 if (pde) 1184 if (pde)
1185 le16_add_cpu(&pde->rec_len, 1185 le16_add_cpu(&pde->rec_len,
1186 le16_to_cpu(de->rec_len)); 1186 le16_to_cpu(de->rec_len));
1187 else 1187 de->inode = 0;
1188 de->inode = 0;
1189 dir->i_version++; 1188 dir->i_version++;
1190 ocfs2_journal_dirty(handle, bh); 1189 ocfs2_journal_dirty(handle, bh);
1191 goto bail; 1190 goto bail;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb51b61..a5952ceecba5 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
859void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 859void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
862int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 862void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
863int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); 863void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
864 864
865void dlm_put(struct dlm_ctxt *dlm); 865void dlm_put(struct dlm_ctxt *dlm);
866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
877 kref_get(&res->refs); 877 kref_get(&res->refs);
878} 878}
879void dlm_lockres_put(struct dlm_lock_resource *res); 879void dlm_lockres_put(struct dlm_lock_resource *res);
880void __dlm_unhash_lockres(struct dlm_lock_resource *res); 880void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
881void __dlm_insert_lockres(struct dlm_ctxt *dlm, 881void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
882 struct dlm_lock_resource *res);
883struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 882struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
884 const char *name, 883 const char *name,
885 unsigned int len, 884 unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
902 const char *name, 901 const char *name,
903 unsigned int namelen); 902 unsigned int namelen);
904 903
905#define dlm_lockres_set_refmap_bit(bit,res) \ 904void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
906 __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) 905 struct dlm_lock_resource *res, int bit);
907#define dlm_lockres_clear_refmap_bit(bit,res) \ 906void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
908 __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) 907 struct dlm_lock_resource *res, int bit);
909 908
910static inline void __dlm_lockres_set_refmap_bit(int bit, 909void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
911 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res);
912 const char *file, 911void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
913 int line) 912 struct dlm_lock_resource *res);
914{
915 //printk("%s:%d:%.*s: setting bit %d\n", file, line,
916 // res->lockname.len, res->lockname.name, bit);
917 set_bit(bit, res->refmap);
918}
919
920static inline void __dlm_lockres_clear_refmap_bit(int bit,
921 struct dlm_lock_resource *res,
922 const char *file,
923 int line)
924{
925 //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
926 // res->lockname.len, res->lockname.name, bit);
927 clear_bit(bit, res->refmap);
928}
929
930void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
931 struct dlm_lock_resource *res,
932 const char *file,
933 int line);
934void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
935 struct dlm_lock_resource *res,
936 int new_lockres,
937 const char *file,
938 int line);
939#define dlm_lockres_drop_inflight_ref(d,r) \
940 __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
941#define dlm_lockres_grab_inflight_ref(d,r) \
942 __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
943#define dlm_lockres_grab_inflight_ref_new(d,r) \
944 __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
945 913
946void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 914void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
947void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 915void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95dcf93..92f2ead0fab6 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
157 157
158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
159 159
160void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) 160void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
161{ 161{
162 if (!hlist_unhashed(&lockres->hash_node)) { 162 if (hlist_unhashed(&res->hash_node))
163 hlist_del_init(&lockres->hash_node); 163 return;
164 dlm_lockres_put(lockres); 164
165 } 165 mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
166 res->lockname.name);
167 hlist_del_init(&res->hash_node);
168 dlm_lockres_put(res);
166} 169}
167 170
168void __dlm_insert_lockres(struct dlm_ctxt *dlm, 171void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
169 struct dlm_lock_resource *res)
170{ 172{
171 struct hlist_head *bucket; 173 struct hlist_head *bucket;
172 struct qstr *q; 174 struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
180 dlm_lockres_get(res); 182 dlm_lockres_get(res);
181 183
182 hlist_add_head(&res->hash_node, bucket); 184 hlist_add_head(&res->hash_node, bucket);
185
186 mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
187 res->lockname.name);
183} 188}
184 189
185struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 190struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
539 544
540static void __dlm_print_nodes(struct dlm_ctxt *dlm) 545static void __dlm_print_nodes(struct dlm_ctxt *dlm)
541{ 546{
542 int node = -1; 547 int node = -1, num = 0;
543 548
544 assert_spin_locked(&dlm->spinlock); 549 assert_spin_locked(&dlm->spinlock);
545 550
546 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); 551 printk("( ");
547
548 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 552 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
549 node + 1)) < O2NM_MAX_NODES) { 553 node + 1)) < O2NM_MAX_NODES) {
550 printk("%d ", node); 554 printk("%d ", node);
555 ++num;
551 } 556 }
552 printk("\n"); 557 printk(") %u nodes\n", num);
553} 558}
554 559
555static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 560static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
566 571
567 node = exit_msg->node_idx; 572 node = exit_msg->node_idx;
568 573
569 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
570
571 spin_lock(&dlm->spinlock); 574 spin_lock(&dlm->spinlock);
572 clear_bit(node, dlm->domain_map); 575 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map); 576 clear_bit(node, dlm->exit_domain_map);
577 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
574 __dlm_print_nodes(dlm); 578 __dlm_print_nodes(dlm);
575 579
576 /* notify anything attached to the heartbeat events */ 580 /* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
755 759
756 dlm_mark_domain_leaving(dlm); 760 dlm_mark_domain_leaving(dlm);
757 dlm_leave_domain(dlm); 761 dlm_leave_domain(dlm);
762 printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
758 dlm_force_free_mles(dlm); 763 dlm_force_free_mles(dlm);
759 dlm_complete_dlm_shutdown(dlm); 764 dlm_complete_dlm_shutdown(dlm);
760 } 765 }
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
970 clear_bit(assert->node_idx, dlm->exit_domain_map); 975 clear_bit(assert->node_idx, dlm->exit_domain_map);
971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 976 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
972 977
973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 978 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
974 assert->node_idx, dlm->name); 979 assert->node_idx, dlm->name);
975 __dlm_print_nodes(dlm); 980 __dlm_print_nodes(dlm);
976 981
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1701bail: 1706bail:
1702 spin_lock(&dlm->spinlock); 1707 spin_lock(&dlm->spinlock);
1703 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 1708 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1704 if (!status) 1709 if (!status) {
1710 printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
1705 __dlm_print_nodes(dlm); 1711 __dlm_print_nodes(dlm);
1712 }
1706 spin_unlock(&dlm->spinlock); 1713 spin_unlock(&dlm->spinlock);
1707 1714
1708 if (ctxt) { 1715 if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
2131 goto leave; 2138 goto leave;
2132 } 2139 }
2133 2140
2134 if (!o2hb_check_local_node_heartbeating()) {
2135 mlog(ML_ERROR, "the local node has not been configured, or is "
2136 "not heartbeating\n");
2137 ret = -EPROTO;
2138 goto leave;
2139 }
2140
2141 mlog(0, "register called for domain \"%s\"\n", domain); 2141 mlog(0, "register called for domain \"%s\"\n", domain);
2142 2142
2143retry: 2143retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0fd66f7..975810b98492 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
183 kick_thread = 1; 183 kick_thread = 1;
184 } 184 }
185 } 185 }
186 /* reduce the inflight count, this may result in the lockres
187 * being purged below during calc_usage */
188 if (lock->ml.node == dlm->node_num)
189 dlm_lockres_drop_inflight_ref(dlm, res);
190 186
191 spin_unlock(&res->spinlock); 187 spin_unlock(&res->spinlock);
192 wake_up(&res->wq); 188 wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
231 lock->ml.type, res->lockname.len, 227 lock->ml.type, res->lockname.len,
232 res->lockname.name, flags); 228 res->lockname.name, flags);
233 229
230 /*
231 * Wait if resource is getting recovered, remastered, etc.
232 * If the resource was remastered and new owner is self, then exit.
233 */
234 spin_lock(&res->spinlock); 234 spin_lock(&res->spinlock);
235
236 /* will exit this call with spinlock held */
237 __dlm_wait_on_lockres(res); 235 __dlm_wait_on_lockres(res);
236 if (res->owner == dlm->node_num) {
237 spin_unlock(&res->spinlock);
238 return DLM_RECOVERING;
239 }
238 res->state |= DLM_LOCK_RES_IN_PROGRESS; 240 res->state |= DLM_LOCK_RES_IN_PROGRESS;
239 241
240 /* add lock to local (secondary) queue */ 242 /* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
319 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, 321 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
320 sizeof(create), res->owner, &status); 322 sizeof(create), res->owner, &status);
321 if (tmpret >= 0) { 323 if (tmpret >= 0) {
322 // successfully sent and received 324 ret = status;
323 ret = status; // this is already a dlm_status
324 if (ret == DLM_REJECTED) { 325 if (ret == DLM_REJECTED) {
325 mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " 326 mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
326 "no longer owned by %u. that node is coming back " 327 "owned by node %u. That node is coming back up "
327 "up currently.\n", dlm->name, create.namelen, 328 "currently.\n", dlm->name, create.namelen,
328 create.name, res->owner); 329 create.name, res->owner);
329 dlm_print_one_lock_resource(res); 330 dlm_print_one_lock_resource(res);
330 BUG(); 331 BUG();
331 } 332 }
332 } else { 333 } else {
333 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 334 mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
334 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, 335 "node %u\n", dlm->name, create.namelen, create.name,
335 res->owner); 336 tmpret, res->owner);
336 if (dlm_is_host_down(tmpret)) { 337 if (dlm_is_host_down(tmpret))
337 ret = DLM_RECOVERING; 338 ret = DLM_RECOVERING;
338 mlog(0, "node %u died so returning DLM_RECOVERING " 339 else
339 "from lock message!\n", res->owner);
340 } else {
341 ret = dlm_err_to_dlm_status(tmpret); 340 ret = dlm_err_to_dlm_status(tmpret);
342 }
343 } 341 }
344 342
345 return ret; 343 return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
440 /* zero memory only if kernel-allocated */ 438 /* zero memory only if kernel-allocated */
441 lksb = kzalloc(sizeof(*lksb), GFP_NOFS); 439 lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
442 if (!lksb) { 440 if (!lksb) {
443 kfree(lock); 441 kmem_cache_free(dlm_lock_cache, lock);
444 return NULL; 442 return NULL;
445 } 443 }
446 kernel_allocated = 1; 444 kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
718 716
719 if (status == DLM_RECOVERING || status == DLM_MIGRATING || 717 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
720 status == DLM_FORWARD) { 718 status == DLM_FORWARD) {
721 mlog(0, "retrying lock with migration/"
722 "recovery/in progress\n");
723 msleep(100); 719 msleep(100);
724 /* no waiting for dlm_reco_thread */
725 if (recovery) { 720 if (recovery) {
726 if (status != DLM_RECOVERING) 721 if (status != DLM_RECOVERING)
727 goto retry_lock; 722 goto retry_lock;
728
729 mlog(0, "%s: got RECOVERING "
730 "for $RECOVERY lock, master "
731 "was %u\n", dlm->name,
732 res->owner);
733 /* wait to see the node go down, then 723 /* wait to see the node go down, then
734 * drop down and allow the lockres to 724 * drop down and allow the lockres to
735 * get cleaned up. need to remaster. */ 725 * get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
741 } 731 }
742 } 732 }
743 733
734 /* Inflight taken in dlm_get_lock_resource() is dropped here */
735 spin_lock(&res->spinlock);
736 dlm_lockres_drop_inflight_ref(dlm, res);
737 spin_unlock(&res->spinlock);
738
739 dlm_lockres_calc_usage(dlm, res);
740 dlm_kick_thread(dlm, res);
741
744 if (status != DLM_NORMAL) { 742 if (status != DLM_NORMAL) {
745 lock->lksb->flags &= ~DLM_LKSB_GET_LVB; 743 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
746 if (status != DLM_NOTQUEUED) 744 if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8c12e9..005261c333b0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
631 return NULL; 631 return NULL;
632} 632}
633 633
634void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 634void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
635 struct dlm_lock_resource *res, 635 struct dlm_lock_resource *res, int bit)
636 int new_lockres,
637 const char *file,
638 int line)
639{ 636{
640 if (!new_lockres) 637 assert_spin_locked(&res->spinlock);
641 assert_spin_locked(&res->spinlock); 638
639 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
640 res->lockname.name, bit, __builtin_return_address(0));
641
642 set_bit(bit, res->refmap);
643}
644
645void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
646 struct dlm_lock_resource *res, int bit)
647{
648 assert_spin_locked(&res->spinlock);
649
650 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
651 res->lockname.name, bit, __builtin_return_address(0));
652
653 clear_bit(bit, res->refmap);
654}
655
656
657void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
658 struct dlm_lock_resource *res)
659{
660 assert_spin_locked(&res->spinlock);
642 661
643 if (!test_bit(dlm->node_num, res->refmap)) {
644 BUG_ON(res->inflight_locks != 0);
645 dlm_lockres_set_refmap_bit(dlm->node_num, res);
646 }
647 res->inflight_locks++; 662 res->inflight_locks++;
648 mlog(0, "%s:%.*s: inflight++: now %u\n", 663
649 dlm->name, res->lockname.len, res->lockname.name, 664 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
650 res->inflight_locks); 665 res->lockname.len, res->lockname.name, res->inflight_locks,
666 __builtin_return_address(0));
651} 667}
652 668
653void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 669void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
654 struct dlm_lock_resource *res, 670 struct dlm_lock_resource *res)
655 const char *file,
656 int line)
657{ 671{
658 assert_spin_locked(&res->spinlock); 672 assert_spin_locked(&res->spinlock);
659 673
660 BUG_ON(res->inflight_locks == 0); 674 BUG_ON(res->inflight_locks == 0);
675
661 res->inflight_locks--; 676 res->inflight_locks--;
662 mlog(0, "%s:%.*s: inflight--: now %u\n", 677
663 dlm->name, res->lockname.len, res->lockname.name, 678 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
664 res->inflight_locks); 679 res->lockname.len, res->lockname.name, res->inflight_locks,
665 if (res->inflight_locks == 0) 680 __builtin_return_address(0));
666 dlm_lockres_clear_refmap_bit(dlm->node_num, res); 681
667 wake_up(&res->wq); 682 wake_up(&res->wq);
668} 683}
669 684
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
697 unsigned int hash; 712 unsigned int hash;
698 int tries = 0; 713 int tries = 0;
699 int bit, wait_on_recovery = 0; 714 int bit, wait_on_recovery = 0;
700 int drop_inflight_if_nonlocal = 0;
701 715
702 BUG_ON(!lockid); 716 BUG_ON(!lockid);
703 717
@@ -709,36 +723,33 @@ lookup:
709 spin_lock(&dlm->spinlock); 723 spin_lock(&dlm->spinlock);
710 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 724 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
711 if (tmpres) { 725 if (tmpres) {
712 int dropping_ref = 0;
713
714 spin_unlock(&dlm->spinlock); 726 spin_unlock(&dlm->spinlock);
715
716 spin_lock(&tmpres->spinlock); 727 spin_lock(&tmpres->spinlock);
717 /* We wait for the other thread that is mastering the resource */ 728 /* Wait on the thread that is mastering the resource */
718 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 729 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
719 __dlm_wait_on_lockres(tmpres); 730 __dlm_wait_on_lockres(tmpres);
720 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 731 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
732 spin_unlock(&tmpres->spinlock);
733 dlm_lockres_put(tmpres);
734 tmpres = NULL;
735 goto lookup;
721 } 736 }
722 737
723 if (tmpres->owner == dlm->node_num) { 738 /* Wait on the resource purge to complete before continuing */
724 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 739 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
725 dlm_lockres_grab_inflight_ref(dlm, tmpres); 740 BUG_ON(tmpres->owner == dlm->node_num);
726 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 741 __dlm_wait_on_lockres_flags(tmpres,
727 dropping_ref = 1; 742 DLM_LOCK_RES_DROPPING_REF);
728 spin_unlock(&tmpres->spinlock);
729
730 /* wait until done messaging the master, drop our ref to allow
731 * the lockres to be purged, start over. */
732 if (dropping_ref) {
733 spin_lock(&tmpres->spinlock);
734 __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
735 spin_unlock(&tmpres->spinlock); 743 spin_unlock(&tmpres->spinlock);
736 dlm_lockres_put(tmpres); 744 dlm_lockres_put(tmpres);
737 tmpres = NULL; 745 tmpres = NULL;
738 goto lookup; 746 goto lookup;
739 } 747 }
740 748
741 mlog(0, "found in hash!\n"); 749 /* Grab inflight ref to pin the resource */
750 dlm_lockres_grab_inflight_ref(dlm, tmpres);
751
752 spin_unlock(&tmpres->spinlock);
742 if (res) 753 if (res)
743 dlm_lockres_put(res); 754 dlm_lockres_put(res);
744 res = tmpres; 755 res = tmpres;
@@ -829,8 +840,8 @@ lookup:
829 * but they might own this lockres. wait on them. */ 840 * but they might own this lockres. wait on them. */
830 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 841 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
831 if (bit < O2NM_MAX_NODES) { 842 if (bit < O2NM_MAX_NODES) {
832 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 843 mlog(0, "%s: res %.*s, At least one node (%d) "
833 "recover before lock mastery can begin\n", 844 "to recover before lock mastery can begin\n",
834 dlm->name, namelen, (char *)lockid, bit); 845 dlm->name, namelen, (char *)lockid, bit);
835 wait_on_recovery = 1; 846 wait_on_recovery = 1;
836 } 847 }
@@ -843,12 +854,11 @@ lookup:
843 854
844 /* finally add the lockres to its hash bucket */ 855 /* finally add the lockres to its hash bucket */
845 __dlm_insert_lockres(dlm, res); 856 __dlm_insert_lockres(dlm, res);
846 /* since this lockres is new it doesn't not require the spinlock */
847 dlm_lockres_grab_inflight_ref_new(dlm, res);
848 857
849 /* if this node does not become the master make sure to drop 858 /* Grab inflight ref to pin the resource */
850 * this inflight reference below */ 859 spin_lock(&res->spinlock);
851 drop_inflight_if_nonlocal = 1; 860 dlm_lockres_grab_inflight_ref(dlm, res);
861 spin_unlock(&res->spinlock);
852 862
853 /* get an extra ref on the mle in case this is a BLOCK 863 /* get an extra ref on the mle in case this is a BLOCK
854 * if so, the creator of the BLOCK may try to put the last 864 * if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
864 * dlm spinlock would be detectable be a change on the mle, 874 * dlm spinlock would be detectable be a change on the mle,
865 * so we only need to clear out the recovery map once. */ 875 * so we only need to clear out the recovery map once. */
866 if (dlm_is_recovery_lock(lockid, namelen)) { 876 if (dlm_is_recovery_lock(lockid, namelen)) {
867 mlog(ML_NOTICE, "%s: recovery map is not empty, but " 877 mlog(0, "%s: Recovery map is not empty, but must "
868 "must master $RECOVERY lock now\n", dlm->name); 878 "master $RECOVERY lock now\n", dlm->name);
869 if (!dlm_pre_master_reco_lockres(dlm, res)) 879 if (!dlm_pre_master_reco_lockres(dlm, res))
870 wait_on_recovery = 0; 880 wait_on_recovery = 0;
871 else { 881 else {
@@ -883,8 +893,8 @@ redo_request:
883 spin_lock(&dlm->spinlock); 893 spin_lock(&dlm->spinlock);
884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 894 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
885 if (bit < O2NM_MAX_NODES) { 895 if (bit < O2NM_MAX_NODES) {
886 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 896 mlog(0, "%s: res %.*s, At least one node (%d) "
887 "recover before lock mastery can begin\n", 897 "to recover before lock mastery can begin\n",
888 dlm->name, namelen, (char *)lockid, bit); 898 dlm->name, namelen, (char *)lockid, bit);
889 wait_on_recovery = 1; 899 wait_on_recovery = 1;
890 } else 900 } else
@@ -913,8 +923,8 @@ redo_request:
913 * yet, keep going until it does. this is how the 923 * yet, keep going until it does. this is how the
914 * master will know that asserts are needed back to 924 * master will know that asserts are needed back to
915 * the lower nodes. */ 925 * the lower nodes. */
916 mlog(0, "%s:%.*s: requests only up to %u but master " 926 mlog(0, "%s: res %.*s, Requests only up to %u but "
917 "is %u, keep going\n", dlm->name, namelen, 927 "master is %u, keep going\n", dlm->name, namelen,
918 lockid, nodenum, mle->master); 928 lockid, nodenum, mle->master);
919 } 929 }
920 } 930 }
@@ -924,13 +934,12 @@ wait:
924 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 934 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
925 if (ret < 0) { 935 if (ret < 0) {
926 wait_on_recovery = 1; 936 wait_on_recovery = 1;
927 mlog(0, "%s:%.*s: node map changed, redo the " 937 mlog(0, "%s: res %.*s, Node map changed, redo the master "
928 "master request now, blocked=%d\n", 938 "request now, blocked=%d\n", dlm->name, res->lockname.len,
929 dlm->name, res->lockname.len,
930 res->lockname.name, blocked); 939 res->lockname.name, blocked);
931 if (++tries > 20) { 940 if (++tries > 20) {
932 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
933 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked = %d\n",
934 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
935 res->lockname.name, blocked); 944 res->lockname.name, blocked);
936 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
940 goto redo_request; 949 goto redo_request;
941 } 950 }
942 951
943 mlog(0, "lockres mastered by %u\n", res->owner); 952 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
953 res->lockname.name, res->owner);
944 /* make sure we never continue without this */ 954 /* make sure we never continue without this */
945 BUG_ON(res->owner == O2NM_MAX_NODES); 955 BUG_ON(res->owner == O2NM_MAX_NODES);
946 956
@@ -952,8 +962,6 @@ wait:
952 962
953wake_waiters: 963wake_waiters:
954 spin_lock(&res->spinlock); 964 spin_lock(&res->spinlock);
955 if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
956 dlm_lockres_drop_inflight_ref(dlm, res);
957 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 965 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
958 spin_unlock(&res->spinlock); 966 spin_unlock(&res->spinlock);
959 wake_up(&res->wq); 967 wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
1426 } 1434 }
1427 1435
1428 if (res->owner == dlm->node_num) { 1436 if (res->owner == dlm->node_num) {
1429 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1437 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1430 dlm->name, namelen, name, request->node_idx);
1431 dlm_lockres_set_refmap_bit(request->node_idx, res);
1432 spin_unlock(&res->spinlock); 1438 spin_unlock(&res->spinlock);
1433 response = DLM_MASTER_RESP_YES; 1439 response = DLM_MASTER_RESP_YES;
1434 if (mle) 1440 if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
1493 * go back and clean the mles on any 1499 * go back and clean the mles on any
1494 * other nodes */ 1500 * other nodes */
1495 dispatch_assert = 1; 1501 dispatch_assert = 1;
1496 dlm_lockres_set_refmap_bit(request->node_idx, res); 1502 dlm_lockres_set_refmap_bit(dlm, res,
1497 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1503 request->node_idx);
1498 dlm->name, namelen, name,
1499 request->node_idx);
1500 } else 1504 } else
1501 response = DLM_MASTER_RESP_NO; 1505 response = DLM_MASTER_RESP_NO;
1502 } else { 1506 } else {
@@ -1702,7 +1706,7 @@ again:
1702 "lockres, set the bit in the refmap\n", 1706 "lockres, set the bit in the refmap\n",
1703 namelen, lockname, to); 1707 namelen, lockname, to);
1704 spin_lock(&res->spinlock); 1708 spin_lock(&res->spinlock);
1705 dlm_lockres_set_refmap_bit(to, res); 1709 dlm_lockres_set_refmap_bit(dlm, res, to);
1706 spin_unlock(&res->spinlock); 1710 spin_unlock(&res->spinlock);
1707 } 1711 }
1708 } 1712 }
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2187 namelen = res->lockname.len; 2191 namelen = res->lockname.len;
2188 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2192 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2189 2193
2190 mlog(0, "%s:%.*s: sending deref to %d\n",
2191 dlm->name, namelen, lockname, res->owner);
2192 memset(&deref, 0, sizeof(deref)); 2194 memset(&deref, 0, sizeof(deref));
2193 deref.node_idx = dlm->node_num; 2195 deref.node_idx = dlm->node_num;
2194 deref.namelen = namelen; 2196 deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2197 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2199 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2198 &deref, sizeof(deref), res->owner, &r); 2200 &deref, sizeof(deref), res->owner, &r);
2199 if (ret < 0) 2201 if (ret < 0)
2200 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 2202 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2201 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 dlm->name, namelen, lockname, ret, res->owner);
2202 res->owner);
2203 else if (r < 0) { 2204 else if (r < 0) {
2204 /* BAD. other node says I did not have a ref. */ 2205 /* BAD. other node says I did not have a ref. */
2205 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2206 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2206 "(master=%u) got %d.\n", dlm->name, namelen, 2207 dlm->name, namelen, lockname, res->owner, r);
2207 lockname, res->owner, r);
2208 dlm_print_one_lock_resource(res); 2208 dlm_print_one_lock_resource(res);
2209 BUG(); 2209 BUG();
2210 } 2210 }
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2260 else { 2260 else {
2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2262 if (test_bit(node, res->refmap)) { 2262 if (test_bit(node, res->refmap)) {
2263 dlm_lockres_clear_refmap_bit(node, res); 2263 dlm_lockres_clear_refmap_bit(dlm, res, node);
2264 cleared = 1; 2264 cleared = 1;
2265 } 2265 }
2266 } 2266 }
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2321 if (test_bit(node, res->refmap)) { 2321 if (test_bit(node, res->refmap)) {
2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2323 dlm_lockres_clear_refmap_bit(node, res); 2323 dlm_lockres_clear_refmap_bit(dlm, res, node);
2324 cleared = 1; 2324 cleared = 1;
2325 } 2325 }
2326 spin_unlock(&res->spinlock); 2326 spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2802 BUG_ON(!list_empty(&lock->bast_list)); 2802 BUG_ON(!list_empty(&lock->bast_list));
2803 BUG_ON(lock->ast_pending); 2803 BUG_ON(lock->ast_pending);
2804 BUG_ON(lock->bast_pending); 2804 BUG_ON(lock->bast_pending);
2805 dlm_lockres_clear_refmap_bit(lock->ml.node, res); 2805 dlm_lockres_clear_refmap_bit(dlm, res,
2806 lock->ml.node);
2806 list_del_init(&lock->list); 2807 list_del_init(&lock->list);
2807 dlm_lock_put(lock); 2808 dlm_lock_put(lock);
2808 /* In a normal unlock, we would have added a 2809 /* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2823 mlog(0, "%s:%.*s: node %u had a ref to this " 2824 mlog(0, "%s:%.*s: node %u had a ref to this "
2824 "migrating lockres, clearing\n", dlm->name, 2825 "migrating lockres, clearing\n", dlm->name,
2825 res->lockname.len, res->lockname.name, bit); 2826 res->lockname.len, res->lockname.name, bit);
2826 dlm_lockres_clear_refmap_bit(bit, res); 2827 dlm_lockres_clear_refmap_bit(dlm, res, bit);
2827 } 2828 }
2828 bit++; 2829 bit++;
2829 } 2830 }
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2916 &migrate, sizeof(migrate), nodenum, 2917 &migrate, sizeof(migrate), nodenum,
2917 &status); 2918 &status);
2918 if (ret < 0) { 2919 if (ret < 0) {
2919 mlog(ML_ERROR, "Error %d when sending message %u (key " 2920 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
2920 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, 2921 "MIGRATE_REQUEST to node %u\n", dlm->name,
2921 dlm->key, nodenum); 2922 migrate.namelen, migrate.name, ret, nodenum);
2922 if (!dlm_is_host_down(ret)) { 2923 if (!dlm_is_host_down(ret)) {
2923 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2924 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2924 BUG(); 2925 BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2937 dlm->name, res->lockname.len, res->lockname.name, 2938 dlm->name, res->lockname.len, res->lockname.name,
2938 nodenum); 2939 nodenum);
2939 spin_lock(&res->spinlock); 2940 spin_lock(&res->spinlock);
2940 dlm_lockres_set_refmap_bit(nodenum, res); 2941 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
2941 spin_unlock(&res->spinlock); 2942 spin_unlock(&res->spinlock);
2942 } 2943 }
2943 } 2944 }
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3271 * mastery reference here since old_master will briefly have 3272 * mastery reference here since old_master will briefly have
3272 * a reference after the migration completes */ 3273 * a reference after the migration completes */
3273 spin_lock(&res->spinlock); 3274 spin_lock(&res->spinlock);
3274 dlm_lockres_set_refmap_bit(old_master, res); 3275 dlm_lockres_set_refmap_bit(dlm, res, old_master);
3275 spin_unlock(&res->spinlock); 3276 spin_unlock(&res->spinlock);
3276 3277
3277 mlog(0, "now time to do a migrate request to other nodes\n"); 3278 mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a21..01ebfd0bdad7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
362} 362}
363 363
364 364
365int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 365void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
366{ 366{
367 if (timeout) { 367 if (dlm_is_node_dead(dlm, node))
368 mlog(ML_NOTICE, "%s: waiting %dms for notification of " 368 return;
369 "death of node %u\n", dlm->name, timeout, node); 369
370 printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
371 "domain %s\n", node, dlm->name);
372
373 if (timeout)
370 wait_event_timeout(dlm->dlm_reco_thread_wq, 374 wait_event_timeout(dlm->dlm_reco_thread_wq,
371 dlm_is_node_dead(dlm, node), 375 dlm_is_node_dead(dlm, node),
372 msecs_to_jiffies(timeout)); 376 msecs_to_jiffies(timeout));
373 } else { 377 else
374 mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
375 "of death of node %u\n", dlm->name, node);
376 wait_event(dlm->dlm_reco_thread_wq, 378 wait_event(dlm->dlm_reco_thread_wq,
377 dlm_is_node_dead(dlm, node)); 379 dlm_is_node_dead(dlm, node));
378 }
379 /* for now, return 0 */
380 return 0;
381} 380}
382 381
383int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) 382void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
384{ 383{
385 if (timeout) { 384 if (dlm_is_node_recovered(dlm, node))
386 mlog(0, "%s: waiting %dms for notification of " 385 return;
387 "recovery of node %u\n", dlm->name, timeout, node); 386
387 printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
388 "domain %s\n", node, dlm->name);
389
390 if (timeout)
388 wait_event_timeout(dlm->dlm_reco_thread_wq, 391 wait_event_timeout(dlm->dlm_reco_thread_wq,
389 dlm_is_node_recovered(dlm, node), 392 dlm_is_node_recovered(dlm, node),
390 msecs_to_jiffies(timeout)); 393 msecs_to_jiffies(timeout));
391 } else { 394 else
392 mlog(0, "%s: waiting indefinitely for notification "
393 "of recovery of node %u\n", dlm->name, node);
394 wait_event(dlm->dlm_reco_thread_wq, 395 wait_event(dlm->dlm_reco_thread_wq,
395 dlm_is_node_recovered(dlm, node)); 396 dlm_is_node_recovered(dlm, node));
396 }
397 /* for now, return 0 */
398 return 0;
399} 397}
400 398
401/* callers of the top-level api calls (dlmlock/dlmunlock) should 399/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
430{ 428{
431 spin_lock(&dlm->spinlock); 429 spin_lock(&dlm->spinlock);
432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 430 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
431 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
432 dlm->name, dlm->reco.dead_node);
433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
434 spin_unlock(&dlm->spinlock); 434 spin_unlock(&dlm->spinlock);
435} 435}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); 440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; 441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
442 spin_unlock(&dlm->spinlock); 442 spin_unlock(&dlm->spinlock);
443 printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
443 wake_up(&dlm->reco.event); 444 wake_up(&dlm->reco.event);
444} 445}
445 446
447static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
448{
449 printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
450 "dead node %u in domain %s\n", dlm->reco.new_master,
451 (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
452 dlm->reco.dead_node, dlm->name);
453}
454
446static int dlm_do_recovery(struct dlm_ctxt *dlm) 455static int dlm_do_recovery(struct dlm_ctxt *dlm)
447{ 456{
448 int status = 0; 457 int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
505 } 514 }
506 mlog(0, "another node will master this recovery session.\n"); 515 mlog(0, "another node will master this recovery session.\n");
507 } 516 }
508 mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", 517
509 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, 518 dlm_print_recovery_master(dlm);
510 dlm->node_num, dlm->reco.dead_node);
511 519
512 /* it is safe to start everything back up here 520 /* it is safe to start everything back up here
513 * because all of the dead node's lock resources 521 * because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
518 return 0; 526 return 0;
519 527
520master_here: 528master_here:
521 mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " 529 dlm_print_recovery_master(dlm);
522 "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
523 dlm->node_num, dlm->reco.dead_node, dlm->name);
524 530
525 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 531 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
526 if (status < 0) { 532 if (status < 0) {
527 /* we should never hit this anymore */ 533 /* we should never hit this anymore */
528 mlog(ML_ERROR, "error %d remastering locks for node %u, " 534 mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
529 "retrying.\n", status, dlm->reco.dead_node); 535 "retrying.\n", dlm->name, status, dlm->reco.dead_node);
530 /* yield a bit to allow any final network messages 536 /* yield a bit to allow any final network messages
531 * to get handled on remaining nodes */ 537 * to get handled on remaining nodes */
532 msleep(100); 538 msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 573 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
568 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 574 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
569 575
570 mlog(0, "requesting lock info from node %u\n", 576 mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
571 ndata->node_num); 577 ndata->node_num);
572 578
573 if (ndata->node_num == dlm->node_num) { 579 if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
640 spin_unlock(&dlm_reco_state_lock); 646 spin_unlock(&dlm_reco_state_lock);
641 } 647 }
642 648
643 mlog(0, "done requesting all lock info\n"); 649 mlog(0, "%s: Done requesting all lock info\n", dlm->name);
644 650
645 /* nodes should be sending reco data now 651 /* nodes should be sending reco data now
646 * just need to wait */ 652 * just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 808
803 /* negative status is handled by caller */ 809 /* negative status is handled by caller */
804 if (ret < 0) 810 if (ret < 0)
805 mlog(ML_ERROR, "Error %d when sending message %u (key " 811 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
806 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, 812 "to recover dead node %u\n", dlm->name, ret,
807 dlm->key, request_from); 813 request_from, dead_node);
808
809 // return from here, then 814 // return from here, then
810 // sleep until all received or error 815 // sleep until all received or error
811 return ret; 816 return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
956 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 961 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
957 sizeof(done_msg), send_to, &tmpret); 962 sizeof(done_msg), send_to, &tmpret);
958 if (ret < 0) { 963 if (ret < 0) {
959 mlog(ML_ERROR, "Error %d when sending message %u (key " 964 mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
960 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, 965 "to recover dead node %u\n", dlm->name, ret, send_to,
961 dlm->key, send_to); 966 dead_node);
962 if (!dlm_is_host_down(ret)) { 967 if (!dlm_is_host_down(ret)) {
963 BUG(); 968 BUG();
964 } 969 }
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1127 if (ret < 0) { 1132 if (ret < 0) {
1128 /* XXX: negative status is not handled. 1133 /* XXX: negative status is not handled.
1129 * this will end up killing this node. */ 1134 * this will end up killing this node. */
1130 mlog(ML_ERROR, "Error %d when sending message %u (key " 1135 mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
1131 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, 1136 "node %u (%s)\n", dlm->name, mres->lockname_len,
1132 dlm->key, send_to); 1137 mres->lockname, ret, send_to,
1138 (orig_flags & DLM_MRES_MIGRATION ?
1139 "migration" : "recovery"));
1133 } else { 1140 } else {
1134 /* might get an -ENOMEM back here */ 1141 /* might get an -ENOMEM back here */
1135 ret = status; 1142 ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1767 dlm->name, mres->lockname_len, mres->lockname, 1774 dlm->name, mres->lockname_len, mres->lockname,
1768 from); 1775 from);
1769 spin_lock(&res->spinlock); 1776 spin_lock(&res->spinlock);
1770 dlm_lockres_set_refmap_bit(from, res); 1777 dlm_lockres_set_refmap_bit(dlm, res, from);
1771 spin_unlock(&res->spinlock); 1778 spin_unlock(&res->spinlock);
1772 added++; 1779 added++;
1773 break; 1780 break;
@@ -1965,7 +1972,7 @@ skip_lvb:
1965 mlog(0, "%s:%.*s: added lock for node %u, " 1972 mlog(0, "%s:%.*s: added lock for node %u, "
1966 "setting refmap bit\n", dlm->name, 1973 "setting refmap bit\n", dlm->name,
1967 res->lockname.len, res->lockname.name, ml->node); 1974 res->lockname.len, res->lockname.name, ml->node);
1968 dlm_lockres_set_refmap_bit(ml->node, res); 1975 dlm_lockres_set_refmap_bit(dlm, res, ml->node);
1969 added++; 1976 added++;
1970 } 1977 }
1971 spin_unlock(&res->spinlock); 1978 spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2084 2091
2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2092 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2086 if (res->owner == dead_node) { 2093 if (res->owner == dead_node) {
2094 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2095 dlm->name, res->lockname.len, res->lockname.name,
2096 res->owner, new_master);
2087 list_del_init(&res->recovering); 2097 list_del_init(&res->recovering);
2088 spin_lock(&res->spinlock); 2098 spin_lock(&res->spinlock);
2089 /* new_master has our reference from 2099 /* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2105 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2115 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2106 bucket = dlm_lockres_hash(dlm, i); 2116 bucket = dlm_lockres_hash(dlm, i);
2107 hlist_for_each_entry(res, hash_iter, bucket, hash_node) { 2117 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
2108 if (res->state & DLM_LOCK_RES_RECOVERING) { 2118 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2109 if (res->owner == dead_node) { 2119 continue;
2110 mlog(0, "(this=%u) res %.*s owner=%u "
2111 "was not on recovering list, but "
2112 "clearing state anyway\n",
2113 dlm->node_num, res->lockname.len,
2114 res->lockname.name, new_master);
2115 } else if (res->owner == dlm->node_num) {
2116 mlog(0, "(this=%u) res %.*s owner=%u "
2117 "was not on recovering list, "
2118 "owner is THIS node, clearing\n",
2119 dlm->node_num, res->lockname.len,
2120 res->lockname.name, new_master);
2121 } else
2122 continue;
2123 2120
2124 if (!list_empty(&res->recovering)) { 2121 if (res->owner != dead_node &&
2125 mlog(0, "%s:%.*s: lockres was " 2122 res->owner != dlm->node_num)
2126 "marked RECOVERING, owner=%u\n", 2123 continue;
2127 dlm->name, res->lockname.len, 2124
2128 res->lockname.name, res->owner); 2125 if (!list_empty(&res->recovering)) {
2129 list_del_init(&res->recovering); 2126 list_del_init(&res->recovering);
2130 dlm_lockres_put(res); 2127 dlm_lockres_put(res);
2131 }
2132 spin_lock(&res->spinlock);
2133 /* new_master has our reference from
2134 * the lock state sent during recovery */
2135 dlm_change_lockres_owner(dlm, res, new_master);
2136 res->state &= ~DLM_LOCK_RES_RECOVERING;
2137 if (__dlm_lockres_has_locks(res))
2138 __dlm_dirty_lockres(dlm, res);
2139 spin_unlock(&res->spinlock);
2140 wake_up(&res->wq);
2141 } 2128 }
2129
2130 /* new_master has our reference from
2131 * the lock state sent during recovery */
2132 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2133 dlm->name, res->lockname.len, res->lockname.name,
2134 res->owner, new_master);
2135 spin_lock(&res->spinlock);
2136 dlm_change_lockres_owner(dlm, res, new_master);
2137 res->state &= ~DLM_LOCK_RES_RECOVERING;
2138 if (__dlm_lockres_has_locks(res))
2139 __dlm_dirty_lockres(dlm, res);
2140 spin_unlock(&res->spinlock);
2141 wake_up(&res->wq);
2142 } 2142 }
2143 } 2143 }
2144} 2144}
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2252 res->lockname.len, res->lockname.name, freed, dead_node); 2252 res->lockname.len, res->lockname.name, freed, dead_node);
2253 __dlm_print_one_lock_resource(res); 2253 __dlm_print_one_lock_resource(res);
2254 } 2254 }
2255 dlm_lockres_clear_refmap_bit(dead_node, res); 2255 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2256 } else if (test_bit(dead_node, res->refmap)) { 2256 } else if (test_bit(dead_node, res->refmap)) {
2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2258 "no locks and had not purged before dying\n", dlm->name, 2258 "no locks and had not purged before dying\n", dlm->name,
2259 res->lockname.len, res->lockname.name, dead_node); 2259 res->lockname.len, res->lockname.name, dead_node);
2260 dlm_lockres_clear_refmap_bit(dead_node, res); 2260 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2261 } 2261 }
2262 2262
2263 /* do not kick thread yet */ 2263 /* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2324 dlm_revalidate_lvb(dlm, res, dead_node); 2324 dlm_revalidate_lvb(dlm, res, dead_node);
2325 if (res->owner == dead_node) { 2325 if (res->owner == dead_node) {
2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2327 mlog(ML_NOTICE, "Ignore %.*s for " 2327 mlog(ML_NOTICE, "%s: res %.*s, Skip "
2328 "recovery as it is being freed\n", 2328 "recovery as it is being freed\n",
2329 res->lockname.len, 2329 dlm->name, res->lockname.len,
2330 res->lockname.name); 2330 res->lockname.name);
2331 } else 2331 } else
2332 dlm_move_lockres_to_recovery_list(dlm, 2332 dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d22c471..e73c833fc2a1 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 int bit; 95 int bit;
96 96
97 assert_spin_locked(&res->spinlock);
98
97 if (__dlm_lockres_has_locks(res)) 99 if (__dlm_lockres_has_locks(res))
98 return 0; 100 return 0;
99 101
102 /* Locks are in the process of being created */
103 if (res->inflight_locks)
104 return 0;
105
100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 return 0; 107 return 0;
102 108
103 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 0; 110 return 0;
105 111
112 /* Another node has this resource with this node as the master */
106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 113 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 if (bit < O2NM_MAX_NODES) 114 if (bit < O2NM_MAX_NODES)
108 return 0; 115 return 0;
109 116
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1; 117 return 1;
116} 118}
117 119
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
185 /* clear our bit from the master's refmap, ignore errors */ 187 /* clear our bit from the master's refmap, ignore errors */
186 ret = dlm_drop_lockres_ref(dlm, res); 188 ret = dlm_drop_lockres_ref(dlm, res);
187 if (ret < 0) { 189 if (ret < 0) {
188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
190 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
191 BUG(); 191 BUG();
192 } 192 }
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
209 BUG(); 209 BUG();
210 } 210 }
211 211
212 __dlm_unhash_lockres(res); 212 __dlm_unhash_lockres(dlm, res);
213 213
214 /* lockres is not in the hash now. drop the flag and wake up 214 /* lockres is not in the hash now. drop the flag and wake up
215 * any processes waiting in dlm_get_lock_resource. */ 215 * any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e1ed5e502ff2..81a4cd22f80b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
1692 mlog(0, "inode %llu take PRMODE open lock\n", 1692 mlog(0, "inode %llu take PRMODE open lock\n",
1693 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1693 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1694 1694
1695 if (ocfs2_mount_local(osb)) 1695 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
1696 goto out; 1696 goto out;
1697 1697
1698 lockres = &OCFS2_I(inode)->ip_open_lockres; 1698 lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1718 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1718 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1719 write ? "EXMODE" : "PRMODE"); 1719 write ? "EXMODE" : "PRMODE");
1720 1720
1721 if (ocfs2_is_hard_readonly(osb)) {
1722 if (write)
1723 status = -EROFS;
1724 goto out;
1725 }
1726
1721 if (ocfs2_mount_local(osb)) 1727 if (ocfs2_mount_local(osb))
1722 goto out; 1728 goto out;
1723 1729
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
2298 if (ocfs2_is_hard_readonly(osb)) { 2304 if (ocfs2_is_hard_readonly(osb)) {
2299 if (ex) 2305 if (ex)
2300 status = -EROFS; 2306 status = -EROFS;
2301 goto bail; 2307 goto getbh;
2302 } 2308 }
2303 2309
2304 if (ocfs2_mount_local(osb)) 2310 if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
2356 mlog_errno(status); 2362 mlog_errno(status);
2357 goto bail; 2363 goto bail;
2358 } 2364 }
2359 2365getbh:
2360 if (ret_bh) { 2366 if (ret_bh) {
2361 status = ocfs2_assign_bh(inode, ret_bh, local_bh); 2367 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
2362 if (status < 0) { 2368 if (status < 0) {
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2628 2634
2629 BUG_ON(!dl); 2635 BUG_ON(!dl);
2630 2636
2631 if (ocfs2_is_hard_readonly(osb)) 2637 if (ocfs2_is_hard_readonly(osb)) {
2632 return -EROFS; 2638 if (ex)
2639 return -EROFS;
2640 return 0;
2641 }
2633 2642
2634 if (ocfs2_mount_local(osb)) 2643 if (ocfs2_mount_local(osb))
2635 return 0; 2644 return 0;
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2647 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2656 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2648 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2657 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2649 2658
2650 if (!ocfs2_mount_local(osb)) 2659 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
2651 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); 2660 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
2652} 2661}
2653 2662
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 23457b491e8c..2f5b92ef0e53 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
836{
837 struct inode *inode = file->f_mapping->host;
838 int ret;
839 unsigned int is_last = 0, is_data = 0;
840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
841 u32 cpos, cend, clen, hole_size;
842 u64 extoff, extlen;
843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec;
845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) {
850 mlog_errno(ret);
851 goto out;
852 }
853
854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855
856 if (*offset >= inode->i_size) {
857 ret = -ENXIO;
858 goto out_unlock;
859 }
860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE)
863 *offset = inode->i_size;
864 goto out_unlock;
865 }
866
867 clen = 0;
868 cpos = *offset >> cs_bits;
869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
870
871 while (cpos < cend && !is_last) {
872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
873 &rec, &is_last);
874 if (ret) {
875 mlog_errno(ret);
876 goto out_unlock;
877 }
878
879 extoff = cpos;
880 extoff <<= cs_bits;
881
882 if (rec.e_blkno == 0ULL) {
883 clen = hole_size;
884 is_data = 0;
885 } else {
886 clen = le16_to_cpu(rec.e_leaf_clusters) -
887 (cpos - le32_to_cpu(rec.e_cpos));
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 }
890
891 if ((!is_data && origin == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) {
893 if (extoff > *offset)
894 *offset = extoff;
895 goto out_unlock;
896 }
897
898 if (!is_last)
899 cpos += clen;
900 }
901
902 if (origin == SEEK_HOLE) {
903 extoff = cpos;
904 extoff <<= cs_bits;
905 extlen = clen;
906 extlen <<= cs_bits;
907
908 if ((extoff + extlen) > inode->i_size)
909 extlen = inode->i_size - extoff;
910 extoff += extlen;
911 if (extoff > *offset)
912 *offset = extoff;
913 goto out_unlock;
914 }
915
916 ret = -ENXIO;
917
918out_unlock:
919
920 brelse(di_bh);
921
922 up_read(&OCFS2_I(inode)->ip_alloc_sem);
923
924 ocfs2_inode_unlock(inode, 0);
925out:
926 if (ret && ret != -ENXIO)
927 ret = -ENXIO;
928 return ret;
929}
930
835int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 931int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
836 struct buffer_head *bhs[], int flags, 932 struct buffer_head *bhs[], int flags,
837 int (*validate)(struct super_block *sb, 933 int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c909..67ea57d2fd59 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len); 54 u64 map_start, u64 map_len);
55 55
56int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
57
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 58int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 59 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el, 60 struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de4ea1af041b..6e396683c3d4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1950 if (ret < 0) 1950 if (ret < 0)
1951 mlog_errno(ret); 1951 mlog_errno(ret);
1952 1952
1953 if (file->f_flags & O_SYNC)
1954 handle->h_sync = 1;
1955
1953 ocfs2_commit_trans(osb, handle); 1956 ocfs2_commit_trans(osb, handle);
1954 1957
1955out_inode_unlock: 1958out_inode_unlock:
@@ -2052,6 +2055,23 @@ out:
2052 return ret; 2055 return ret;
2053} 2056}
2054 2057
2058static void ocfs2_aiodio_wait(struct inode *inode)
2059{
2060 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2061
2062 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2063}
2064
2065static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2066{
2067 int blockmask = inode->i_sb->s_blocksize - 1;
2068 loff_t final_size = pos + count;
2069
2070 if ((pos & blockmask) || (final_size & blockmask))
2071 return 1;
2072 return 0;
2073}
2074
2055static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2075static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2056 struct file *file, 2076 struct file *file,
2057 loff_t pos, size_t count, 2077 loff_t pos, size_t count,
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2230 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2231 int full_coherency = !(osb->s_mount_opt & 2251 int full_coherency = !(osb->s_mount_opt &
2232 OCFS2_MOUNT_COHERENCY_BUFFERED); 2252 OCFS2_MOUNT_COHERENCY_BUFFERED);
2253 int unaligned_dio = 0;
2233 2254
2234 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2255 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2235 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2256 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2297,6 +2318,10 @@ relock:
2297 goto out; 2318 goto out;
2298 } 2319 }
2299 2320
2321 if (direct_io && !is_sync_kiocb(iocb))
2322 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2323 *ppos);
2324
2300 /* 2325 /*
2301 * We can't complete the direct I/O as requested, fall back to 2326 * We can't complete the direct I/O as requested, fall back to
2302 * buffered I/O. 2327 * buffered I/O.
@@ -2311,6 +2336,18 @@ relock:
2311 goto relock; 2336 goto relock;
2312 } 2337 }
2313 2338
2339 if (unaligned_dio) {
2340 /*
2341 * Wait on previous unaligned aio to complete before
2342 * proceeding.
2343 */
2344 ocfs2_aiodio_wait(inode);
2345
2346 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2347 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2348 ocfs2_iocb_set_unaligned_aio(iocb);
2349 }
2350
2314 /* 2351 /*
2315 * To later detect whether a journal commit for sync writes is 2352 * To later detect whether a journal commit for sync writes is
2316 * necessary, we sample i_size, and cluster count here. 2353 * necessary, we sample i_size, and cluster count here.
@@ -2382,8 +2419,12 @@ out_dio:
2382 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2419 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2383 rw_level = -1; 2420 rw_level = -1;
2384 have_alloc_sem = 0; 2421 have_alloc_sem = 0;
2422 unaligned_dio = 0;
2385 } 2423 }
2386 2424
2425 if (unaligned_dio)
2426 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2427
2387out: 2428out:
2388 if (rw_level != -1) 2429 if (rw_level != -1)
2389 ocfs2_rw_unlock(inode, rw_level); 2430 ocfs2_rw_unlock(inode, rw_level);
@@ -2591,6 +2632,57 @@ bail:
2591 return ret; 2632 return ret;
2592} 2633}
2593 2634
2635/* Refer generic_file_llseek_unlocked() */
2636static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2637{
2638 struct inode *inode = file->f_mapping->host;
2639 int ret = 0;
2640
2641 mutex_lock(&inode->i_mutex);
2642
2643 switch (origin) {
2644 case SEEK_SET:
2645 break;
2646 case SEEK_END:
2647 offset += inode->i_size;
2648 break;
2649 case SEEK_CUR:
2650 if (offset == 0) {
2651 offset = file->f_pos;
2652 goto out;
2653 }
2654 offset += file->f_pos;
2655 break;
2656 case SEEK_DATA:
2657 case SEEK_HOLE:
2658 ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
2659 if (ret)
2660 goto out;
2661 break;
2662 default:
2663 ret = -EINVAL;
2664 goto out;
2665 }
2666
2667 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2668 ret = -EINVAL;
2669 if (!ret && offset > inode->i_sb->s_maxbytes)
2670 ret = -EINVAL;
2671 if (ret)
2672 goto out;
2673
2674 if (offset != file->f_pos) {
2675 file->f_pos = offset;
2676 file->f_version = 0;
2677 }
2678
2679out:
2680 mutex_unlock(&inode->i_mutex);
2681 if (ret)
2682 return ret;
2683 return offset;
2684}
2685
2594const struct inode_operations ocfs2_file_iops = { 2686const struct inode_operations ocfs2_file_iops = {
2595 .setattr = ocfs2_setattr, 2687 .setattr = ocfs2_setattr,
2596 .getattr = ocfs2_getattr, 2688 .getattr = ocfs2_getattr,
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2615 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2707 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2616 */ 2708 */
2617const struct file_operations ocfs2_fops = { 2709const struct file_operations ocfs2_fops = {
2618 .llseek = generic_file_llseek, 2710 .llseek = ocfs2_file_llseek,
2619 .read = do_sync_read, 2711 .read = do_sync_read,
2620 .write = do_sync_write, 2712 .write = do_sync_write,
2621 .mmap = ocfs2_mmap, 2713 .mmap = ocfs2_mmap,
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = {
2663 * the cluster. 2755 * the cluster.
2664 */ 2756 */
2665const struct file_operations ocfs2_fops_no_plocks = { 2757const struct file_operations ocfs2_fops_no_plocks = {
2666 .llseek = generic_file_llseek, 2758 .llseek = ocfs2_file_llseek,
2667 .read = do_sync_read, 2759 .read = do_sync_read,
2668 .write = do_sync_write, 2760 .write = do_sync_write,
2669 .mmap = ocfs2_mmap, 2761 .mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index a22d2c098890..17454a904d7b 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
951 trace_ocfs2_cleanup_delete_inode( 951 trace_ocfs2_cleanup_delete_inode(
952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
953 if (sync_data) 953 if (sync_data)
954 write_inode_now(inode, 1); 954 filemap_write_and_wait(inode->i_mapping);
955 truncate_inode_pages(&inode->i_data, 0); 955 truncate_inode_pages(&inode->i_data, 0);
956} 956}
957 957
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b149b3a..88924a3133fa 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 atomic_t ip_unaligned_aio;
48
46 /* These fields are protected by ip_lock */ 49 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 50 spinlock_t ip_lock;
48 u32 ip_open_count; 51 u32 ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bc91072b7219..726ff265b296 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & 122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { 123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
124 if (!capable(CAP_LINUX_IMMUTABLE)) 124 if (!capable(CAP_LINUX_IMMUTABLE))
125 goto bail_unlock; 125 goto bail_commit;
126 } 126 }
127 127
128 ocfs2_inode->ip_attr = flags; 128 ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
132 if (status < 0) 132 if (status < 0)
133 mlog_errno(status); 133 mlog_errno(status);
134 134
135bail_commit:
135 ocfs2_commit_trans(osb, handle); 136 ocfs2_commit_trans(osb, handle);
136bail_unlock: 137bail_unlock:
137 ocfs2_inode_unlock(inode, 1); 138 ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
381 if (!oifi) { 382 if (!oifi) {
382 status = -ENOMEM; 383 status = -ENOMEM;
383 mlog_errno(status); 384 mlog_errno(status);
384 goto bail; 385 goto out_err;
385 } 386 }
386 387
387 if (o2info_from_user(*oifi, req)) 388 if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
431 o2info_set_request_error(&oifi->ifi_req, req); 432 o2info_set_request_error(&oifi->ifi_req, req);
432 433
433 kfree(oifi); 434 kfree(oifi);
434 435out_err:
435 return status; 436 return status;
436} 437}
437 438
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
666 if (!oiff) { 667 if (!oiff) {
667 status = -ENOMEM; 668 status = -ENOMEM;
668 mlog_errno(status); 669 mlog_errno(status);
669 goto bail; 670 goto out_err;
670 } 671 }
671 672
672 if (o2info_from_user(*oiff, req)) 673 if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
716 o2info_set_request_error(&oiff->iff_req, req); 717 o2info_set_request_error(&oiff->iff_req, req);
717 718
718 kfree(oiff); 719 kfree(oiff);
719 720out_err:
720 return status; 721 return status;
721} 722}
722 723
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d56454e8b..0a42ae96dca7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1544 /* we need to run complete recovery for offline orphan slots */ 1544 /* we need to run complete recovery for offline orphan slots */
1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1546 1546
1547 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1547 printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
1548 node_num, slot_num, 1548 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1549 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1549 MINOR(osb->sb->s_dev));
1550 1550
1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1552 1552
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1601 1601
1602 jbd2_journal_destroy(journal); 1602 jbd2_journal_destroy(journal);
1603 1603
1604 printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
1605 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1606 MINOR(osb->sb->s_dev));
1604done: 1607done:
1605 /* drop the lock on this nodes journal */ 1608 /* drop the lock on this nodes journal */
1606 if (got_lock) 1609 if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
1808 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This 1811 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
1809 * is done to catch any orphans that are left over in orphan directories. 1812 * is done to catch any orphans that are left over in orphan directories.
1810 * 1813 *
1814 * It scans all slots, even ones that are in use. It does so to handle the
1815 * case described below:
1816 *
1817 * Node 1 has an inode it was using. The dentry went away due to memory
1818 * pressure. Node 1 closes the inode, but it's on the free list. The node
1819 * has the open lock.
1820 * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
1821 * but node 1 has no dentry and doesn't get the message. It trylocks the
1822 * open lock, sees that another node has a PR, and does nothing.
1823 * Later node 2 runs its orphan dir. It igets the inode, trylocks the
1824 * open lock, sees the PR still, and does nothing.
1825 * Basically, we have to trigger an orphan iput on node 1. The only way
1826 * for this to happen is if node 1 runs node 2's orphan dir.
1827 *
1811 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT 1828 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
1812 * seconds. It gets an EX lock on os_lockres and checks sequence number 1829 * seconds. It gets an EX lock on os_lockres and checks sequence number
1813 * stored in LVB. If the sequence number has changed, it means some other 1830 * stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 68cf2f6d3c6a..a3385b63ff5e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
441#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 441#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
442 442
443/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 443/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
444 * update on dir + index leaf + dx root update for free list */ 444 * update on dir + index leaf + dx root update for free list +
445 * previous dirblock update in the free list */
445static inline int ocfs2_link_credits(struct super_block *sb) 446static inline int ocfs2_link_credits(struct super_block *sb)
446{ 447{
447 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + 448 return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
448 ocfs2_quota_trans_credits(sb); 449 ocfs2_quota_trans_credits(sb);
449} 450}
450 451
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393ca39eb..9cd41083e991 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, 61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
62 struct page *page) 62 struct page *page)
63{ 63{
64 int ret; 64 int ret = VM_FAULT_NOPAGE;
65 struct inode *inode = file->f_path.dentry->d_inode; 65 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 66 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 67 loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
71 void *fsdata; 71 void *fsdata;
72 loff_t size = i_size_read(inode); 72 loff_t size = i_size_read(inode);
73 73
74 /*
75 * Another node might have truncated while we were waiting on
76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
79 */
80 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 74 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
81 if (unlikely(!size || page->index > last_index)) {
82 ret = -EINVAL;
83 goto out;
84 }
85 75
86 /* 76 /*
87 * The i_size check above doesn't catch the case where nodes 77 * There are cases that lead to the page no longer bebongs to the
88 * truncated and then re-extended the file. We'll re-check the 78 * mapping.
89 * page mapping after taking the page lock inside of 79 * 1) pagecache truncates locally due to memory pressure.
90 * ocfs2_write_begin_nolock(). 80 * 2) pagecache truncates when another is taking EX lock against
81 * inode lock. see ocfs2_data_convert_worker.
82 *
83 * The i_size check doesn't catch the case where nodes truncated and
84 * then re-extended the file. We'll re-check the page mapping after
85 * taking the page lock inside of ocfs2_write_begin_nolock().
86 *
87 * Let VM retry with these cases.
91 */ 88 */
92 if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 89 if ((page->mapping != inode->i_mapping) ||
93 /* 90 (!PageUptodate(page)) ||
94 * the page has been umapped in ocfs2_data_downconvert_worker. 91 (page_offset(page) >= size))
95 * So return 0 here and let VFS retry.
96 */
97 ret = 0;
98 goto out; 92 goto out;
99 }
100 93
101 /* 94 /*
102 * Call ocfs2_write_begin() and ocfs2_write_end() to take 95 * Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
116 if (ret) { 109 if (ret) {
117 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
118 mlog_errno(ret); 111 mlog_errno(ret);
112 if (ret == -ENOMEM)
113 ret = VM_FAULT_OOM;
114 else
115 ret = VM_FAULT_SIGBUS;
119 goto out; 116 goto out;
120 } 117 }
121 118
122 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 119 if (!locked_page) {
123 fsdata); 120 ret = VM_FAULT_NOPAGE;
124 if (ret < 0) {
125 mlog_errno(ret);
126 goto out; 121 goto out;
127 } 122 }
123 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
124 fsdata);
128 BUG_ON(ret != len); 125 BUG_ON(ret != len);
129 ret = 0; 126 ret = VM_FAULT_LOCKED;
130out: 127out:
131 return ret; 128 return ret;
132} 129}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
168 165
169out: 166out:
170 ocfs2_unblock_signals(&oldset); 167 ocfs2_unblock_signals(&oldset);
171 if (ret)
172 ret = VM_FAULT_SIGBUS;
173 return ret; 168 return ret;
174} 169}
175 170
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index d53cb706f14c..184c76b8c293 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
745 */ 745 */
746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
747 new_phys_cpos); 747 new_phys_cpos);
748 if (!new_phys_cpos) { 748 if (!*new_phys_cpos) {
749 ret = -ENOSPC; 749 ret = -ENOSPC;
750 goto out_commit; 750 goto out_commit;
751 } 751 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 409285854f64..d355e6e36b36 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
836 836
837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
838{ 838{
839 __test_and_set_bit_le(bit, bitmap); 839 __set_bit_le(bit, bitmap);
840} 840}
841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
842 842
843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
844{ 844{
845 __test_and_clear_bit_le(bit, bitmap); 845 __clear_bit_le(bit, bitmap);
846} 846}
847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
848 848
849#define ocfs2_test_bit test_bit_le 849#define ocfs2_test_bit test_bit_le
850#define ocfs2_find_next_zero_bit find_next_zero_bit_le 850#define ocfs2_find_next_zero_bit find_next_zero_bit_le
851#define ocfs2_find_next_bit find_next_bit_le 851#define ocfs2_find_next_bit find_next_bit_le
852
853static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
854{
855#if BITS_PER_LONG == 64
856 *bit += ((unsigned long) addr & 7UL) << 3;
857 addr = (void *) ((unsigned long) addr & ~7UL);
858#elif BITS_PER_LONG == 32
859 *bit += ((unsigned long) addr & 3UL) << 3;
860 addr = (void *) ((unsigned long) addr & ~3UL);
861#else
862#error "how many bits you are?!"
863#endif
864 return addr;
865}
866
867static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
868{
869 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
870 ocfs2_set_bit(bit, bitmap);
871}
872
873static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
874{
875 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
876 ocfs2_clear_bit(bit, bitmap);
877}
878
879static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
880{
881 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
882 return ocfs2_test_bit(bit, bitmap);
883}
884
885static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
886 int start)
887{
888 int fix = 0, ret, tmpmax;
889 bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
890 tmpmax = max + fix;
891 start += fix;
892
893 ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
894 if (ret > max)
895 return max;
896 return ret;
897}
898
852#endif /* OCFS2_H */ 899#endif /* OCFS2_H */
853 900
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007fc9247..f100bf70a906 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
404 int status = 0; 404 int status = 0;
405 struct ocfs2_quota_recovery *rec; 405 struct ocfs2_quota_recovery *rec;
406 406
407 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); 407 printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
408 "slot %u\n", osb->dev_str, slot_num);
409
408 rec = ocfs2_alloc_quota_recovery(); 410 rec = ocfs2_alloc_quota_recovery();
409 if (!rec) 411 if (!rec)
410 return ERR_PTR(-ENOMEM); 412 return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
549 goto out_commit; 551 goto out_commit;
550 } 552 }
551 lock_buffer(qbh); 553 lock_buffer(qbh);
552 WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); 554 WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 555 ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
554 le32_add_cpu(&dchunk->dqc_free, 1); 556 le32_add_cpu(&dchunk->dqc_free, 1);
555 unlock_buffer(qbh); 557 unlock_buffer(qbh);
556 ocfs2_journal_dirty(handle, qbh); 558 ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
596 struct inode *lqinode; 598 struct inode *lqinode;
597 unsigned int flags; 599 unsigned int flags;
598 600
599 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); 601 printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
602 "slot %u\n", osb->dev_str, slot_num);
603
600 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 604 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
601 for (type = 0; type < MAXQUOTAS; type++) { 605 for (type = 0; type < MAXQUOTAS; type++) {
602 if (list_empty(&(rec->r_list[type]))) 606 if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
612 /* Someone else is holding the lock? Then he must be 616 /* Someone else is holding the lock? Then he must be
613 * doing the recovery. Just skip the file... */ 617 * doing the recovery. Just skip the file... */
614 if (status == -EAGAIN) { 618 if (status == -EAGAIN) {
615 mlog(ML_NOTICE, "skipping quota recovery for slot %d " 619 printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
616 "because quota file is locked.\n", slot_num); 620 "device (%s) for slot %d because quota file is "
621 "locked.\n", osb->dev_str, slot_num);
617 status = 0; 622 status = 0;
618 goto out_put; 623 goto out_put;
619 } else if (status < 0) { 624 } else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
944 * ol_quota_entries_per_block(sb); 949 * ol_quota_entries_per_block(sb);
945 } 950 }
946 951
947 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); 952 found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
948 /* We failed? */ 953 /* We failed? */
949 if (found == len) { 954 if (found == len) {
950 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" 955 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1208 struct ocfs2_local_disk_chunk *dchunk; 1213 struct ocfs2_local_disk_chunk *dchunk;
1209 1214
1210 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 1215 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1211 ocfs2_set_bit(*offset, dchunk->dqc_bitmap); 1216 ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
1212 le32_add_cpu(&dchunk->dqc_free, -1); 1217 le32_add_cpu(&dchunk->dqc_free, -1);
1213} 1218}
1214 1219
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1289 (od->dq_chunk->qc_headerbh->b_data); 1294 (od->dq_chunk->qc_headerbh->b_data);
1290 /* Mark structure as freed */ 1295 /* Mark structure as freed */
1291 lock_buffer(od->dq_chunk->qc_headerbh); 1296 lock_buffer(od->dq_chunk->qc_headerbh);
1292 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1297 ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
1293 le32_add_cpu(&dchunk->dqc_free, 1); 1298 le32_add_cpu(&dchunk->dqc_free, 1);
1294 unlock_buffer(od->dq_chunk->qc_headerbh); 1299 unlock_buffer(od->dq_chunk->qc_headerbh);
1295 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1300 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc0014d509..1424c151cccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
493 goto bail; 493 goto bail;
494 } 494 }
495 } else 495 } else
496 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 496 printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
497 slot); 497 "allocated to this node!\n", slot, osb->dev_str);
498 498
499 ocfs2_set_slot(si, slot, osb->node_num); 499 ocfs2_set_slot(si, slot, osb->node_num);
500 osb->slot_num = slot; 500 osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b00c43c..94368017edb3 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
28#include "cluster/masklog.h" 28#include "cluster/masklog.h"
29#include "cluster/nodemanager.h" 29#include "cluster/nodemanager.h"
30#include "cluster/heartbeat.h" 30#include "cluster/heartbeat.h"
31#include "cluster/tcp.h"
31 32
32#include "stackglue.h" 33#include "stackglue.h"
33 34
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256} 257}
257 258
258/* 259/*
260 * Check if this node is heartbeating and is connected to all other
261 * heartbeating nodes.
262 */
263static int o2cb_cluster_check(void)
264{
265 u8 node_num;
266 int i;
267 unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
268 unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
269
270 node_num = o2nm_this_node();
271 if (node_num == O2NM_MAX_NODES) {
272 printk(KERN_ERR "o2cb: This node has not been configured.\n");
273 return -EINVAL;
274 }
275
276 /*
277 * o2dlm expects o2net sockets to be created. If not, then
278 * dlm_join_domain() fails with a stack of errors which are both cryptic
279 * and incomplete. The idea here is to detect upfront whether we have
280 * managed to connect to all nodes or not. If not, then list the nodes
281 * to allow the user to check the configuration (incorrect IP, firewall,
282 * etc.) Yes, this is racy. But its not the end of the world.
283 */
284#define O2CB_MAP_STABILIZE_COUNT 60
285 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
286 o2hb_fill_node_map(hbmap, sizeof(hbmap));
287 if (!test_bit(node_num, hbmap)) {
288 printk(KERN_ERR "o2cb: %s heartbeat has not been "
289 "started.\n", (o2hb_global_heartbeat_active() ?
290 "Global" : "Local"));
291 return -EINVAL;
292 }
293 o2net_fill_node_map(netmap, sizeof(netmap));
294 /* Force set the current node to allow easy compare */
295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT)
299 msleep(1000);
300 }
301
302 printk(KERN_ERR "o2cb: This node could not connect to nodes:");
303 i = -1;
304 while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
305 i + 1)) < O2NM_MAX_NODES) {
306 if (!test_bit(i, netmap))
307 printk(" %u", i);
308 }
309 printk(".\n");
310
311 return -ENOTCONN;
312}
313
314/*
259 * Called from the dlm when it's about to evict a node. This is how the 315 * Called from the dlm when it's about to evict a node. This is how the
260 * classic stack signals node death. 316 * classic stack signals node death.
261 */ 317 */
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
263{ 319{
264 struct ocfs2_cluster_connection *conn = data; 320 struct ocfs2_cluster_connection *conn = data;
265 321
266 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", 322 printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
267 node_num, conn->cc_namelen, conn->cc_name); 323 node_num, conn->cc_namelen, conn->cc_name);
268 324
269 conn->cc_recovery_handler(node_num, conn->cc_recovery_data); 325 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
270} 326}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 BUG_ON(conn == NULL); 336 BUG_ON(conn == NULL);
281 BUG_ON(conn->cc_proto == NULL); 337 BUG_ON(conn->cc_proto == NULL);
282 338
283 /* for now we only have one cluster/node, make sure we see it 339 /* Ensure cluster stack is up and all nodes are connected */
284 * in the heartbeat universe */ 340 rc = o2cb_cluster_check();
285 if (!o2hb_check_local_node_heartbeating()) { 341 if (rc) {
286 if (o2hb_global_heartbeat_active()) 342 printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
287 mlog(ML_ERROR, "Global heartbeat not started\n"); 343 "before retrying.\n");
288 rc = -EINVAL;
289 goto out; 344 goto out;
290 } 345 }
291 346
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f61027236b..4994f8b0e604 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
54#include "ocfs1_fs_compat.h" 54#include "ocfs1_fs_compat.h"
55 55
56#include "alloc.h" 56#include "alloc.h"
57#include "aops.h"
57#include "blockcheck.h" 58#include "blockcheck.h"
58#include "dlmglue.h" 59#include "dlmglue.h"
59#include "export.h" 60#include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1107 1108
1108 ocfs2_set_ro_flag(osb, 1); 1109 ocfs2_set_ro_flag(osb, 1);
1109 1110
1110 printk(KERN_NOTICE "Readonly device detected. No cluster " 1111 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
1111 "services will be utilized for this mount. Recovery " 1112 "Cluster services will not be used for this mount. "
1112 "will be skipped.\n"); 1113 "Recovery will be skipped.\n", osb->dev_str);
1113 } 1114 }
1114 1115
1115 if (!ocfs2_is_hard_readonly(osb)) { 1116 if (!ocfs2_is_hard_readonly(osb)) {
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1616 return 0; 1617 return 0;
1617} 1618}
1618 1619
1620wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1621
1619static int __init ocfs2_init(void) 1622static int __init ocfs2_init(void)
1620{ 1623{
1621 int status; 1624 int status, i;
1622 1625
1623 ocfs2_print_version(); 1626 ocfs2_print_version();
1624 1627
1628 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1629 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1630
1625 status = init_ocfs2_uptodate_cache(); 1631 status = init_ocfs2_uptodate_cache();
1626 if (status < 0) { 1632 if (status < 0) {
1627 mlog_errno(status); 1633 mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
1760 ocfs2_extent_map_init(&oi->vfs_inode); 1766 ocfs2_extent_map_init(&oi->vfs_inode);
1761 INIT_LIST_HEAD(&oi->ip_io_markers); 1767 INIT_LIST_HEAD(&oi->ip_io_markers);
1762 oi->ip_dir_start_lookup = 0; 1768 oi->ip_dir_start_lookup = 0;
1763 1769 atomic_set(&oi->ip_unaligned_aio, 0);
1764 init_rwsem(&oi->ip_alloc_sem); 1770 init_rwsem(&oi->ip_alloc_sem);
1765 init_rwsem(&oi->ip_xattr_sem); 1771 init_rwsem(&oi->ip_xattr_sem);
1766 mutex_init(&oi->ip_io_mutex); 1772 mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1974 * If we failed before we got a uuid_str yet, we can't stop 1980 * If we failed before we got a uuid_str yet, we can't stop
1975 * heartbeat. Otherwise, do it. 1981 * heartbeat. Otherwise, do it.
1976 */ 1982 */
1977 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) 1983 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
1984 !ocfs2_is_hard_readonly(osb))
1978 hangup_needed = 1; 1985 hangup_needed = 1;
1979 1986
1980 if (osb->cconn) 1987 if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2353 mlog_errno(status); 2360 mlog_errno(status);
2354 goto bail; 2361 goto bail;
2355 } 2362 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb); 2363 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
2357 2364
2358bail: 2365bail:
2359 return status; 2366 return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2462 goto finally; 2469 goto finally;
2463 } 2470 }
2464 } else { 2471 } else {
2465 mlog(ML_NOTICE, "File system was not unmounted cleanly, " 2472 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
2466 "recovering volume.\n"); 2473 "unmounted cleanly, recovering it.\n", osb->dev_str);
2467 } 2474 }
2468 2475
2469 local = ocfs2_mount_local(osb); 2476 local = ocfs2_mount_local(osb);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 194fb22ef79d..aa9e8777b09a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
2376 } 2376 }
2377 2377
2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
2379 if (ret < 0) {
2380 mlog_errno(ret);
2381 break;
2382 }
2383 2379
2384 ocfs2_commit_trans(osb, ctxt.handle); 2380 ocfs2_commit_trans(osb, ctxt.handle);
2385 if (ctxt.meta_ac) { 2381 if (ctxt.meta_ac) {
2386 ocfs2_free_alloc_context(ctxt.meta_ac); 2382 ocfs2_free_alloc_context(ctxt.meta_ac);
2387 ctxt.meta_ac = NULL; 2383 ctxt.meta_ac = NULL;
2388 } 2384 }
2385
2386 if (ret < 0) {
2387 mlog_errno(ret);
2388 break;
2389 }
2390
2389 } 2391 }
2390 2392
2391 if (ctxt.meta_ac) 2393 if (ctxt.meta_ac)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 586174168e2a..80e4645f7990 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
131 K(i.freeswap), 131 K(i.freeswap),
132 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
133 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE 134#ifdef CONFIG_TRANSPARENT_HUGEPAGE
135 K(global_page_state(NR_ANON_PAGES)
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * 136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR 137 HPAGE_PMD_NR),
138#else
139 K(global_page_state(NR_ANON_PAGES)),
138#endif 140#endif
139 ),
140 K(global_page_state(NR_FILE_MAPPED)), 141 K(global_page_state(NR_FILE_MAPPED)),
141 K(global_page_state(NR_SHMEM)), 142 K(global_page_state(NR_SHMEM)),
142 K(global_page_state(NR_SLAB_RECLAIMABLE) + 143 K(global_page_state(NR_SLAB_RECLAIMABLE) +
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9a8a2b77b874..03102d978180 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -91,20 +91,18 @@ static struct file_system_type proc_fs_type = {
91 91
92void __init proc_root_init(void) 92void __init proc_root_init(void)
93{ 93{
94 struct vfsmount *mnt;
95 int err; 94 int err;
96 95
97 proc_init_inodecache(); 96 proc_init_inodecache();
98 err = register_filesystem(&proc_fs_type); 97 err = register_filesystem(&proc_fs_type);
99 if (err) 98 if (err)
100 return; 99 return;
101 mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 100 err = pid_ns_prepare_proc(&init_pid_ns);
102 if (IS_ERR(mnt)) { 101 if (err) {
103 unregister_filesystem(&proc_fs_type); 102 unregister_filesystem(&proc_fs_type);
104 return; 103 return;
105 } 104 }
106 105
107 init_pid_ns.proc_mnt = mnt;
108 proc_symlink("mounts", NULL, "self/mounts"); 106 proc_symlink("mounts", NULL, "self/mounts");
109 107
110 proc_net_init(); 108 proc_net_init();
@@ -209,5 +207,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
209 207
210void pid_ns_release_proc(struct pid_namespace *ns) 208void pid_ns_release_proc(struct pid_namespace *ns)
211{ 209{
212 mntput(ns->proc_mnt); 210 kern_unmount(ns->proc_mnt);
213} 211}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 42b274da92c3..2a30d67dd6b8 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu)
32 idle = kstat_cpu(cpu).cpustat.idle; 32 idle = kstat_cpu(cpu).cpustat.idle;
33 idle = cputime64_add(idle, arch_idle_time(cpu)); 33 idle = cputime64_add(idle, arch_idle_time(cpu));
34 } else 34 } else
35 idle = usecs_to_cputime(idle_time); 35 idle = nsecs_to_jiffies64(1000 * idle_time);
36 36
37 return idle; 37 return idle;
38} 38}
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu)
46 /* !NO_HZ so we can rely on cpustat.iowait */ 46 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait; 47 iowait = kstat_cpu(cpu).cpustat.iowait;
48 else 48 else
49 iowait = usecs_to_cputime(iowait_time); 49 iowait = nsecs_to_jiffies64(1000 * iowait_time);
50 50
51 return iowait; 51 return iowait;
52} 52}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2bd620f0d796..57bbf9078ac8 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi)
167 } 167 }
168 168
169 psinfo = psi; 169 psinfo = psi;
170 mutex_init(&psinfo->read_mutex);
170 spin_unlock(&pstore_lock); 171 spin_unlock(&pstore_lock);
171 172
172 if (owner && !try_module_get(owner)) { 173 if (owner && !try_module_get(owner)) {
@@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register);
195void pstore_get_records(int quiet) 196void pstore_get_records(int quiet)
196{ 197{
197 struct pstore_info *psi = psinfo; 198 struct pstore_info *psi = psinfo;
199 char *buf = NULL;
198 ssize_t size; 200 ssize_t size;
199 u64 id; 201 u64 id;
200 enum pstore_type_id type; 202 enum pstore_type_id type;
201 struct timespec time; 203 struct timespec time;
202 int failed = 0, rc; 204 int failed = 0, rc;
203 unsigned long flags;
204 205
205 if (!psi) 206 if (!psi)
206 return; 207 return;
207 208
208 spin_lock_irqsave(&psinfo->buf_lock, flags); 209 mutex_lock(&psi->read_mutex);
209 rc = psi->open(psi); 210 rc = psi->open(psi);
210 if (rc) 211 if (rc)
211 goto out; 212 goto out;
212 213
213 while ((size = psi->read(&id, &type, &time, psi)) > 0) { 214 while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
214 rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 215 rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
215 time, psi); 216 time, psi);
217 kfree(buf);
218 buf = NULL;
216 if (rc && (rc != -EEXIST || !quiet)) 219 if (rc && (rc != -EEXIST || !quiet))
217 failed++; 220 failed++;
218 } 221 }
219 psi->close(psi); 222 psi->close(psi);
220out: 223out:
221 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 224 mutex_unlock(&psi->read_mutex);
222 225
223 if (failed) 226 if (failed)
224 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", 227 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 05d6b0e78c95..dba43c3ea3af 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path);
449 449
450/* 450/*
451 * Same as seq_path, but relative to supplied root. 451 * Same as seq_path, but relative to supplied root.
452 *
453 * root may be changed, see __d_path().
454 */ 452 */
455int seq_path_root(struct seq_file *m, struct path *path, struct path *root, 453int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
456 char *esc) 454 char *esc)
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
463 char *p; 461 char *p;
464 462
465 p = __d_path(path, root, buf, size); 463 p = __d_path(path, root, buf, size);
464 if (!p)
465 return SEQ_SKIP;
466 res = PTR_ERR(p); 466 res = PTR_ERR(p);
467 if (!IS_ERR(p)) { 467 if (!IS_ERR(p)) {
468 char *end = mangle_path(buf, p, esc); 468 char *end = mangle_path(buf, p, esc);
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
474 } 474 }
475 seq_commit(m, res); 475 seq_commit(m, res);
476 476
477 return res < 0 ? res : 0; 477 return res < 0 && res != -ENAMETOOLONG ? res : 0;
478} 478}
479 479
480/* 480/*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 20403dc5d437..ae0e76bb6ebf 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2264,19 +2264,12 @@ static int __init ubifs_init(void)
2264 return -EINVAL; 2264 return -EINVAL;
2265 } 2265 }
2266 2266
2267 err = register_filesystem(&ubifs_fs_type);
2268 if (err) {
2269 ubifs_err("cannot register file system, error %d", err);
2270 return err;
2271 }
2272
2273 err = -ENOMEM;
2274 ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", 2267 ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
2275 sizeof(struct ubifs_inode), 0, 2268 sizeof(struct ubifs_inode), 0,
2276 SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, 2269 SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
2277 &inode_slab_ctor); 2270 &inode_slab_ctor);
2278 if (!ubifs_inode_slab) 2271 if (!ubifs_inode_slab)
2279 goto out_reg; 2272 return -ENOMEM;
2280 2273
2281 register_shrinker(&ubifs_shrinker_info); 2274 register_shrinker(&ubifs_shrinker_info);
2282 2275
@@ -2288,15 +2281,20 @@ static int __init ubifs_init(void)
2288 if (err) 2281 if (err)
2289 goto out_compr; 2282 goto out_compr;
2290 2283
2284 err = register_filesystem(&ubifs_fs_type);
2285 if (err) {
2286 ubifs_err("cannot register file system, error %d", err);
2287 goto out_dbg;
2288 }
2291 return 0; 2289 return 0;
2292 2290
2291out_dbg:
2292 dbg_debugfs_exit();
2293out_compr: 2293out_compr:
2294 ubifs_compressors_exit(); 2294 ubifs_compressors_exit();
2295out_shrinker: 2295out_shrinker:
2296 unregister_shrinker(&ubifs_shrinker_info); 2296 unregister_shrinker(&ubifs_shrinker_info);
2297 kmem_cache_destroy(ubifs_inode_slab); 2297 kmem_cache_destroy(ubifs_inode_slab);
2298out_reg:
2299 unregister_filesystem(&ubifs_fs_type);
2300 return err; 2298 return err;
2301} 2299}
2302/* late_initcall to let compressors initialize first */ 2300/* late_initcall to let compressors initialize first */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4a..76e4266d2e7e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
42 int count, i; 42 int count, i;
43 43
44 count = be32_to_cpu(aclp->acl_cnt); 44 count = be32_to_cpu(aclp->acl_cnt);
45 if (count > XFS_ACL_MAX_ENTRIES)
46 return ERR_PTR(-EFSCORRUPTED);
45 47
46 acl = posix_acl_alloc(count, GFP_KERNEL); 48 acl = posix_acl_alloc(count, GFP_KERNEL);
47 if (!acl) 49 if (!acl)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d4906e7c9787..c1b55e596551 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
110/* 110/*
111 * Query whether the requested number of additional bytes of extended 111 * Query whether the requested number of additional bytes of extended
112 * attribute space will be able to fit inline. 112 * attribute space will be able to fit inline.
113 *
113 * Returns zero if not, else the di_forkoff fork offset to be used in the 114 * Returns zero if not, else the di_forkoff fork offset to be used in the
114 * literal area for attribute data once the new bytes have been added. 115 * literal area for attribute data once the new bytes have been added.
115 * 116 *
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
122 int offset; 123 int offset;
123 int minforkoff; /* lower limit on valid forkoff locations */ 124 int minforkoff; /* lower limit on valid forkoff locations */
124 int maxforkoff; /* upper limit on valid forkoff locations */ 125 int maxforkoff; /* upper limit on valid forkoff locations */
125 int dsize; 126 int dsize;
126 xfs_mount_t *mp = dp->i_mount; 127 xfs_mount_t *mp = dp->i_mount;
127 128
128 offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ 129 offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
136 return (offset >= minforkoff) ? minforkoff : 0; 137 return (offset >= minforkoff) ? minforkoff : 0;
137 } 138 }
138 139
139 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { 140 /*
140 if (bytes <= XFS_IFORK_ASIZE(dp)) 141 * If the requested numbers of bytes is smaller or equal to the
141 return dp->i_d.di_forkoff; 142 * current attribute fork size we can always proceed.
143 *
144 * Note that if_bytes in the data fork might actually be larger than
145 * the current data fork size is due to delalloc extents. In that
146 * case either the extent count will go down when they are converted
147 * to real extents, or the delalloc conversion will take care of the
148 * literal area rebalancing.
149 */
150 if (bytes <= XFS_IFORK_ASIZE(dp))
151 return dp->i_d.di_forkoff;
152
153 /*
154 * For attr2 we can try to move the forkoff if there is space in the
155 * literal area, but for the old format we are done if there is no
156 * space in the fixed attribute fork.
157 */
158 if (!(mp->m_flags & XFS_MOUNT_ATTR2))
142 return 0; 159 return 0;
143 }
144 160
145 dsize = dp->i_df.if_bytes; 161 dsize = dp->i_df.if_bytes;
146 162
147 switch (dp->i_d.di_format) { 163 switch (dp->i_d.di_format) {
148 case XFS_DINODE_FMT_EXTENTS: 164 case XFS_DINODE_FMT_EXTENTS:
149 /* 165 /*
150 * If there is no attr fork and the data fork is extents, 166 * If there is no attr fork and the data fork is extents,
151 * determine if creating the default attr fork will result 167 * determine if creating the default attr fork will result
152 * in the extents form migrating to btree. If so, the 168 * in the extents form migrating to btree. If so, the
153 * minimum offset only needs to be the space required for 169 * minimum offset only needs to be the space required for
154 * the btree root. 170 * the btree root.
155 */ 171 */
156 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > 172 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
157 xfs_default_attroffset(dp)) 173 xfs_default_attroffset(dp))
158 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 174 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
159 break; 175 break;
160
161 case XFS_DINODE_FMT_BTREE: 176 case XFS_DINODE_FMT_BTREE:
162 /* 177 /*
163 * If have data btree then keep forkoff if we have one, 178 * If we have a data btree then keep forkoff if we have one,
164 * otherwise we are adding a new attr, so then we set 179 * otherwise we are adding a new attr, so then we set
165 * minforkoff to where the btree root can finish so we have 180 * minforkoff to where the btree root can finish so we have
166 * plenty of room for attrs 181 * plenty of room for attrs
167 */ 182 */
168 if (dp->i_d.di_forkoff) { 183 if (dp->i_d.di_forkoff) {
169 if (offset < dp->i_d.di_forkoff) 184 if (offset < dp->i_d.di_forkoff)
170 return 0; 185 return 0;
171 else 186 return dp->i_d.di_forkoff;
172 return dp->i_d.di_forkoff; 187 }
173 } else 188 dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
174 dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
175 break; 189 break;
176 } 190 }
177 191
178 /* 192 /*
179 * A data fork btree root must have space for at least 193 * A data fork btree root must have space for at least
180 * MINDBTPTRS key/ptr pairs if the data fork is small or empty. 194 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
181 */ 195 */
182 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); 196 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
186 maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); 200 maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
187 maxforkoff = maxforkoff >> 3; /* rounded down */ 201 maxforkoff = maxforkoff >> 3; /* rounded down */
188 202
189 if (offset >= minforkoff && offset < maxforkoff)
190 return offset;
191 if (offset >= maxforkoff) 203 if (offset >= maxforkoff)
192 return maxforkoff; 204 return maxforkoff;
205 if (offset >= minforkoff)
206 return offset;
193 return 0; 207 return 0;
194} 208}
195 209
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c68baeb0974a..d0ab78837057 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2383,6 +2383,8 @@ xfs_bmap_btalloc(
2383 int tryagain; 2383 int tryagain;
2384 int error; 2384 int error;
2385 2385
2386 ASSERT(ap->length);
2387
2386 mp = ap->ip->i_mount; 2388 mp = ap->ip->i_mount;
2387 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 2389 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
2388 if (unlikely(align)) { 2390 if (unlikely(align)) {
@@ -4629,6 +4631,8 @@ xfs_bmapi_allocate(
4629 int error; 4631 int error;
4630 int rt; 4632 int rt;
4631 4633
4634 ASSERT(bma->length > 0);
4635
4632 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); 4636 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4633 4637
4634 /* 4638 /*
@@ -4849,6 +4853,7 @@ xfs_bmapi_write(
4849 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4853 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4850 ASSERT(!(flags & XFS_BMAPI_IGSTATE)); 4854 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4851 ASSERT(tp != NULL); 4855 ASSERT(tp != NULL);
4856 ASSERT(len > 0);
4852 4857
4853 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4858 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4854 XFS_ATTR_FORK : XFS_DATA_FORK; 4859 XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -4918,9 +4923,22 @@ xfs_bmapi_write(
4918 bma.eof = eof; 4923 bma.eof = eof;
4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4924 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4920 bma.wasdel = wasdelay; 4925 bma.wasdel = wasdelay;
4921 bma.length = len;
4922 bma.offset = bno; 4926 bma.offset = bno;
4923 4927
4928 /*
4929 * There's a 32/64 bit type mismatch between the
4930 * allocation length request (which can be 64 bits in
4931 * length) and the bma length request, which is
4932 * xfs_extlen_t and therefore 32 bits. Hence we have to
4933 * check for 32-bit overflows and handle them here.
4934 */
4935 if (len > (xfs_filblks_t)MAXEXTLEN)
4936 bma.length = MAXEXTLEN;
4937 else
4938 bma.length = len;
4939
4940 ASSERT(len > 0);
4941 ASSERT(bma.length > 0);
4924 error = xfs_bmapi_allocate(&bma, flags); 4942 error = xfs_bmapi_allocate(&bma, flags);
4925 if (error) 4943 if (error)
4926 goto error0; 4944 goto error0;
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index da108977b21f..558910f5e3c0 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -98,22 +98,22 @@ xfs_fs_encode_fh(
98 switch (fileid_type) { 98 switch (fileid_type) {
99 case FILEID_INO32_GEN_PARENT: 99 case FILEID_INO32_GEN_PARENT:
100 spin_lock(&dentry->d_lock); 100 spin_lock(&dentry->d_lock);
101 fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; 101 fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
102 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; 102 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
103 spin_unlock(&dentry->d_lock); 103 spin_unlock(&dentry->d_lock);
104 /*FALLTHRU*/ 104 /*FALLTHRU*/
105 case FILEID_INO32_GEN: 105 case FILEID_INO32_GEN:
106 fid->i32.ino = inode->i_ino; 106 fid->i32.ino = XFS_I(inode)->i_ino;
107 fid->i32.gen = inode->i_generation; 107 fid->i32.gen = inode->i_generation;
108 break; 108 break;
109 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: 109 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
110 spin_lock(&dentry->d_lock); 110 spin_lock(&dentry->d_lock);
111 fid64->parent_ino = dentry->d_parent->d_inode->i_ino; 111 fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
112 fid64->parent_gen = dentry->d_parent->d_inode->i_generation; 112 fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
113 spin_unlock(&dentry->d_lock); 113 spin_unlock(&dentry->d_lock);
114 /*FALLTHRU*/ 114 /*FALLTHRU*/
115 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: 115 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
116 fid64->ino = inode->i_ino; 116 fid64->ino = XFS_I(inode)->i_ino;
117 fid64->gen = inode->i_generation; 117 fid64->gen = inode->i_generation;
118 break; 118 break;
119 } 119 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c0237c602f11..755ee8164880 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,6 +2835,27 @@ corrupt_out:
2835 return XFS_ERROR(EFSCORRUPTED); 2835 return XFS_ERROR(EFSCORRUPTED);
2836} 2836}
2837 2837
2838void
2839xfs_promote_inode(
2840 struct xfs_inode *ip)
2841{
2842 struct xfs_buf *bp;
2843
2844 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2845
2846 bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
2847 ip->i_imap.im_len, XBF_TRYLOCK);
2848 if (!bp)
2849 return;
2850
2851 if (XFS_BUF_ISDELAYWRITE(bp)) {
2852 xfs_buf_delwri_promote(bp);
2853 wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
2854 }
2855
2856 xfs_buf_relse(bp);
2857}
2858
2838/* 2859/*
2839 * Return a pointer to the extent record at file index idx. 2860 * Return a pointer to the extent record at file index idx.
2840 */ 2861 */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 760140d1dd66..b4cd4739f98e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
498void xfs_iext_realloc(xfs_inode_t *, int, int); 498void xfs_iext_realloc(xfs_inode_t *, int, int);
499void xfs_iunpin_wait(xfs_inode_t *); 499void xfs_iunpin_wait(xfs_inode_t *);
500int xfs_iflush(xfs_inode_t *, uint); 500int xfs_iflush(xfs_inode_t *, uint);
501void xfs_promote_inode(struct xfs_inode *);
501void xfs_lock_inodes(xfs_inode_t **, int, uint); 502void xfs_lock_inodes(xfs_inode_t **, int, uint);
502void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 503void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
503 504
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a14cd89fe465..34817adf4b9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -150,6 +150,117 @@ xlog_grant_add_space(
150 } while (head_val != old); 150 } while (head_val != old);
151} 151}
152 152
153STATIC bool
154xlog_reserveq_wake(
155 struct log *log,
156 int *free_bytes)
157{
158 struct xlog_ticket *tic;
159 int need_bytes;
160
161 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
162 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
163 need_bytes = tic->t_unit_res * tic->t_cnt;
164 else
165 need_bytes = tic->t_unit_res;
166
167 if (*free_bytes < need_bytes)
168 return false;
169 *free_bytes -= need_bytes;
170
171 trace_xfs_log_grant_wake_up(log, tic);
172 wake_up(&tic->t_wait);
173 }
174
175 return true;
176}
177
178STATIC bool
179xlog_writeq_wake(
180 struct log *log,
181 int *free_bytes)
182{
183 struct xlog_ticket *tic;
184 int need_bytes;
185
186 list_for_each_entry(tic, &log->l_writeq, t_queue) {
187 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
188
189 need_bytes = tic->t_unit_res;
190
191 if (*free_bytes < need_bytes)
192 return false;
193 *free_bytes -= need_bytes;
194
195 trace_xfs_log_regrant_write_wake_up(log, tic);
196 wake_up(&tic->t_wait);
197 }
198
199 return true;
200}
201
202STATIC int
203xlog_reserveq_wait(
204 struct log *log,
205 struct xlog_ticket *tic,
206 int need_bytes)
207{
208 list_add_tail(&tic->t_queue, &log->l_reserveq);
209
210 do {
211 if (XLOG_FORCED_SHUTDOWN(log))
212 goto shutdown;
213 xlog_grant_push_ail(log, need_bytes);
214
215 XFS_STATS_INC(xs_sleep_logspace);
216 trace_xfs_log_grant_sleep(log, tic);
217
218 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
219 trace_xfs_log_grant_wake(log, tic);
220
221 spin_lock(&log->l_grant_reserve_lock);
222 if (XLOG_FORCED_SHUTDOWN(log))
223 goto shutdown;
224 } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
225
226 list_del_init(&tic->t_queue);
227 return 0;
228shutdown:
229 list_del_init(&tic->t_queue);
230 return XFS_ERROR(EIO);
231}
232
233STATIC int
234xlog_writeq_wait(
235 struct log *log,
236 struct xlog_ticket *tic,
237 int need_bytes)
238{
239 list_add_tail(&tic->t_queue, &log->l_writeq);
240
241 do {
242 if (XLOG_FORCED_SHUTDOWN(log))
243 goto shutdown;
244 xlog_grant_push_ail(log, need_bytes);
245
246 XFS_STATS_INC(xs_sleep_logspace);
247 trace_xfs_log_regrant_write_sleep(log, tic);
248
249 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
250 trace_xfs_log_regrant_write_wake(log, tic);
251
252 spin_lock(&log->l_grant_write_lock);
253 if (XLOG_FORCED_SHUTDOWN(log))
254 goto shutdown;
255 } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
256
257 list_del_init(&tic->t_queue);
258 return 0;
259shutdown:
260 list_del_init(&tic->t_queue);
261 return XFS_ERROR(EIO);
262}
263
153static void 264static void
154xlog_tic_reset_res(xlog_ticket_t *tic) 265xlog_tic_reset_res(xlog_ticket_t *tic)
155{ 266{
@@ -350,8 +461,19 @@ xfs_log_reserve(
350 retval = xlog_grant_log_space(log, internal_ticket); 461 retval = xlog_grant_log_space(log, internal_ticket);
351 } 462 }
352 463
464 if (unlikely(retval)) {
465 /*
466 * If we are failing, make sure the ticket doesn't have any
467 * current reservations. We don't want to add this back
468 * when the ticket/ transaction gets cancelled.
469 */
470 internal_ticket->t_curr_res = 0;
471 /* ungrant will give back unit_res * t_cnt. */
472 internal_ticket->t_cnt = 0;
473 }
474
353 return retval; 475 return retval;
354} /* xfs_log_reserve */ 476}
355 477
356 478
357/* 479/*
@@ -2481,8 +2603,8 @@ restart:
2481/* 2603/*
2482 * Atomically get the log space required for a log ticket. 2604 * Atomically get the log space required for a log ticket.
2483 * 2605 *
2484 * Once a ticket gets put onto the reserveq, it will only return after 2606 * Once a ticket gets put onto the reserveq, it will only return after the
2485 * the needed reservation is satisfied. 2607 * needed reservation is satisfied.
2486 * 2608 *
2487 * This function is structured so that it has a lock free fast path. This is 2609 * This function is structured so that it has a lock free fast path. This is
2488 * necessary because every new transaction reservation will come through this 2610 * necessary because every new transaction reservation will come through this
@@ -2490,113 +2612,53 @@ restart:
2490 * every pass. 2612 * every pass.
2491 * 2613 *
2492 * As tickets are only ever moved on and off the reserveq under the 2614 * As tickets are only ever moved on and off the reserveq under the
2493 * l_grant_reserve_lock, we only need to take that lock if we are going 2615 * l_grant_reserve_lock, we only need to take that lock if we are going to add
2494 * to add the ticket to the queue and sleep. We can avoid taking the lock if the 2616 * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
2495 * ticket was never added to the reserveq because the t_queue list head will be 2617 * was never added to the reserveq because the t_queue list head will be empty
2496 * empty and we hold the only reference to it so it can safely be checked 2618 * and we hold the only reference to it so it can safely be checked unlocked.
2497 * unlocked.
2498 */ 2619 */
2499STATIC int 2620STATIC int
2500xlog_grant_log_space(xlog_t *log, 2621xlog_grant_log_space(
2501 xlog_ticket_t *tic) 2622 struct log *log,
2623 struct xlog_ticket *tic)
2502{ 2624{
2503 int free_bytes; 2625 int free_bytes, need_bytes;
2504 int need_bytes; 2626 int error = 0;
2505 2627
2506#ifdef DEBUG 2628 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2507 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2508 panic("grant Recovery problem");
2509#endif
2510 2629
2511 trace_xfs_log_grant_enter(log, tic); 2630 trace_xfs_log_grant_enter(log, tic);
2512 2631
2632 /*
2633 * If there are other waiters on the queue then give them a chance at
2634 * logspace before us. Wake up the first waiters, if we do not wake
2635 * up all the waiters then go to sleep waiting for more free space,
2636 * otherwise try to get some space for this transaction.
2637 */
2513 need_bytes = tic->t_unit_res; 2638 need_bytes = tic->t_unit_res;
2514 if (tic->t_flags & XFS_LOG_PERM_RESERV) 2639 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2515 need_bytes *= tic->t_ocnt; 2640 need_bytes *= tic->t_ocnt;
2516
2517 /* something is already sleeping; insert new transaction at end */
2518 if (!list_empty_careful(&log->l_reserveq)) {
2519 spin_lock(&log->l_grant_reserve_lock);
2520 /* recheck the queue now we are locked */
2521 if (list_empty(&log->l_reserveq)) {
2522 spin_unlock(&log->l_grant_reserve_lock);
2523 goto redo;
2524 }
2525 list_add_tail(&tic->t_queue, &log->l_reserveq);
2526
2527 trace_xfs_log_grant_sleep1(log, tic);
2528
2529 /*
2530 * Gotta check this before going to sleep, while we're
2531 * holding the grant lock.
2532 */
2533 if (XLOG_FORCED_SHUTDOWN(log))
2534 goto error_return;
2535
2536 XFS_STATS_INC(xs_sleep_logspace);
2537 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2538
2539 /*
2540 * If we got an error, and the filesystem is shutting down,
2541 * we'll catch it down below. So just continue...
2542 */
2543 trace_xfs_log_grant_wake1(log, tic);
2544 }
2545
2546redo:
2547 if (XLOG_FORCED_SHUTDOWN(log))
2548 goto error_return_unlocked;
2549
2550 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 2641 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2551 if (free_bytes < need_bytes) { 2642 if (!list_empty_careful(&log->l_reserveq)) {
2552 spin_lock(&log->l_grant_reserve_lock); 2643 spin_lock(&log->l_grant_reserve_lock);
2553 if (list_empty(&tic->t_queue)) 2644 if (!xlog_reserveq_wake(log, &free_bytes) ||
2554 list_add_tail(&tic->t_queue, &log->l_reserveq); 2645 free_bytes < need_bytes)
2555 2646 error = xlog_reserveq_wait(log, tic, need_bytes);
2556 trace_xfs_log_grant_sleep2(log, tic); 2647 spin_unlock(&log->l_grant_reserve_lock);
2557 2648 } else if (free_bytes < need_bytes) {
2558 if (XLOG_FORCED_SHUTDOWN(log))
2559 goto error_return;
2560
2561 xlog_grant_push_ail(log, need_bytes);
2562
2563 XFS_STATS_INC(xs_sleep_logspace);
2564 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2565
2566 trace_xfs_log_grant_wake2(log, tic);
2567 goto redo;
2568 }
2569
2570 if (!list_empty(&tic->t_queue)) {
2571 spin_lock(&log->l_grant_reserve_lock); 2649 spin_lock(&log->l_grant_reserve_lock);
2572 list_del_init(&tic->t_queue); 2650 error = xlog_reserveq_wait(log, tic, need_bytes);
2573 spin_unlock(&log->l_grant_reserve_lock); 2651 spin_unlock(&log->l_grant_reserve_lock);
2574 } 2652 }
2653 if (error)
2654 return error;
2575 2655
2576 /* we've got enough space */
2577 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); 2656 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2578 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); 2657 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2579 trace_xfs_log_grant_exit(log, tic); 2658 trace_xfs_log_grant_exit(log, tic);
2580 xlog_verify_grant_tail(log); 2659 xlog_verify_grant_tail(log);
2581 return 0; 2660 return 0;
2582 2661}
2583error_return_unlocked:
2584 spin_lock(&log->l_grant_reserve_lock);
2585error_return:
2586 list_del_init(&tic->t_queue);
2587 spin_unlock(&log->l_grant_reserve_lock);
2588 trace_xfs_log_grant_error(log, tic);
2589
2590 /*
2591 * If we are failing, make sure the ticket doesn't have any
2592 * current reservations. We don't want to add this back when
2593 * the ticket/transaction gets cancelled.
2594 */
2595 tic->t_curr_res = 0;
2596 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2597 return XFS_ERROR(EIO);
2598} /* xlog_grant_log_space */
2599
2600 2662
2601/* 2663/*
2602 * Replenish the byte reservation required by moving the grant write head. 2664 * Replenish the byte reservation required by moving the grant write head.
@@ -2605,10 +2667,12 @@ error_return:
2605 * free fast path. 2667 * free fast path.
2606 */ 2668 */
2607STATIC int 2669STATIC int
2608xlog_regrant_write_log_space(xlog_t *log, 2670xlog_regrant_write_log_space(
2609 xlog_ticket_t *tic) 2671 struct log *log,
2672 struct xlog_ticket *tic)
2610{ 2673{
2611 int free_bytes, need_bytes; 2674 int free_bytes, need_bytes;
2675 int error = 0;
2612 2676
2613 tic->t_curr_res = tic->t_unit_res; 2677 tic->t_curr_res = tic->t_unit_res;
2614 xlog_tic_reset_res(tic); 2678 xlog_tic_reset_res(tic);
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log,
2616 if (tic->t_cnt > 0) 2680 if (tic->t_cnt > 0)
2617 return 0; 2681 return 0;
2618 2682
2619#ifdef DEBUG 2683 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2620 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2621 panic("regrant Recovery problem");
2622#endif
2623 2684
2624 trace_xfs_log_regrant_write_enter(log, tic); 2685 trace_xfs_log_regrant_write_enter(log, tic);
2625 if (XLOG_FORCED_SHUTDOWN(log))
2626 goto error_return_unlocked;
2627 2686
2628 /* If there are other waiters on the queue then give them a 2687 /*
2629 * chance at logspace before us. Wake up the first waiters, 2688 * If there are other waiters on the queue then give them a chance at
2630 * if we do not wake up all the waiters then go to sleep waiting 2689 * logspace before us. Wake up the first waiters, if we do not wake
2631 * for more free space, otherwise try to get some space for 2690 * up all the waiters then go to sleep waiting for more free space,
2632 * this transaction. 2691 * otherwise try to get some space for this transaction.
2633 */ 2692 */
2634 need_bytes = tic->t_unit_res; 2693 need_bytes = tic->t_unit_res;
2635 if (!list_empty_careful(&log->l_writeq)) {
2636 struct xlog_ticket *ntic;
2637
2638 spin_lock(&log->l_grant_write_lock);
2639 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2640 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2641 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2642
2643 if (free_bytes < ntic->t_unit_res)
2644 break;
2645 free_bytes -= ntic->t_unit_res;
2646 wake_up(&ntic->t_wait);
2647 }
2648
2649 if (ntic != list_first_entry(&log->l_writeq,
2650 struct xlog_ticket, t_queue)) {
2651 if (list_empty(&tic->t_queue))
2652 list_add_tail(&tic->t_queue, &log->l_writeq);
2653 trace_xfs_log_regrant_write_sleep1(log, tic);
2654
2655 xlog_grant_push_ail(log, need_bytes);
2656
2657 XFS_STATS_INC(xs_sleep_logspace);
2658 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2659 trace_xfs_log_regrant_write_wake1(log, tic);
2660 } else
2661 spin_unlock(&log->l_grant_write_lock);
2662 }
2663
2664redo:
2665 if (XLOG_FORCED_SHUTDOWN(log))
2666 goto error_return_unlocked;
2667
2668 free_bytes = xlog_space_left(log, &log->l_grant_write_head); 2694 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2669 if (free_bytes < need_bytes) { 2695 if (!list_empty_careful(&log->l_writeq)) {
2670 spin_lock(&log->l_grant_write_lock); 2696 spin_lock(&log->l_grant_write_lock);
2671 if (list_empty(&tic->t_queue)) 2697 if (!xlog_writeq_wake(log, &free_bytes) ||
2672 list_add_tail(&tic->t_queue, &log->l_writeq); 2698 free_bytes < need_bytes)
2673 2699 error = xlog_writeq_wait(log, tic, need_bytes);
2674 if (XLOG_FORCED_SHUTDOWN(log)) 2700 spin_unlock(&log->l_grant_write_lock);
2675 goto error_return; 2701 } else if (free_bytes < need_bytes) {
2676
2677 xlog_grant_push_ail(log, need_bytes);
2678
2679 XFS_STATS_INC(xs_sleep_logspace);
2680 trace_xfs_log_regrant_write_sleep2(log, tic);
2681 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2682
2683 trace_xfs_log_regrant_write_wake2(log, tic);
2684 goto redo;
2685 }
2686
2687 if (!list_empty(&tic->t_queue)) {
2688 spin_lock(&log->l_grant_write_lock); 2702 spin_lock(&log->l_grant_write_lock);
2689 list_del_init(&tic->t_queue); 2703 error = xlog_writeq_wait(log, tic, need_bytes);
2690 spin_unlock(&log->l_grant_write_lock); 2704 spin_unlock(&log->l_grant_write_lock);
2691 } 2705 }
2692 2706
2693 /* we've got enough space */ 2707 if (error)
2708 return error;
2709
2694 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); 2710 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2695 trace_xfs_log_regrant_write_exit(log, tic); 2711 trace_xfs_log_regrant_write_exit(log, tic);
2696 xlog_verify_grant_tail(log); 2712 xlog_verify_grant_tail(log);
2697 return 0; 2713 return 0;
2698 2714}
2699
2700 error_return_unlocked:
2701 spin_lock(&log->l_grant_write_lock);
2702 error_return:
2703 list_del_init(&tic->t_queue);
2704 spin_unlock(&log->l_grant_write_lock);
2705 trace_xfs_log_regrant_write_error(log, tic);
2706
2707 /*
2708 * If we are failing, make sure the ticket doesn't have any
2709 * current reservations. We don't want to add this back when
2710 * the ticket/transaction gets cancelled.
2711 */
2712 tic->t_curr_res = 0;
2713 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2714 return XFS_ERROR(EIO);
2715} /* xlog_regrant_write_log_space */
2716
2717 2715
2718/* The first cnt-1 times through here we don't need to 2716/* The first cnt-1 times through here we don't need to
2719 * move the grant write head because the permanent 2717 * move the grant write head because the permanent
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5cff443f6cdb..0bbb1a41998b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one(
674 * disk and we didn't ask it to allocate; 674 * disk and we didn't ask it to allocate;
675 * ESRCH if quotas got turned off suddenly. 675 * ESRCH if quotas got turned off suddenly.
676 */ 676 */
677 error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); 677 error = xfs_qm_dqget(ip->i_mount, ip, id, type,
678 doalloc | XFS_QMOPT_DOWARN, &dqp);
678 if (error) 679 if (error)
679 return error; 680 return error;
680 681
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index aa3dc1a4d53d..be5c51d8f757 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -770,6 +770,17 @@ restart:
770 if (!xfs_iflock_nowait(ip)) { 770 if (!xfs_iflock_nowait(ip)) {
771 if (!(sync_mode & SYNC_WAIT)) 771 if (!(sync_mode & SYNC_WAIT))
772 goto out; 772 goto out;
773
774 /*
775 * If we only have a single dirty inode in a cluster there is
776 * a fair chance that the AIL push may have pushed it into
777 * the buffer, but xfsbufd won't touch it until 30 seconds
778 * from now, and thus we will lock up here.
779 *
780 * Promote the inode buffer to the front of the delwri list
781 * and wake up xfsbufd now.
782 */
783 xfs_promote_inode(ip);
773 xfs_iflock(ip); 784 xfs_iflock(ip);
774 } 785 }
775 786
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f1d2802b2f07..494035798873 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
834DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); 834DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); 835DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); 836DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
838DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
839DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
841DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);