diff options
Diffstat (limited to 'fs')
115 files changed, 2688 insertions, 1661 deletions
@@ -337,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio) | |||
337 | * RETURNS: | 337 | * RETURNS: |
338 | * Pointer to new bio on success, NULL on failure. | 338 | * Pointer to new bio on success, NULL on failure. |
339 | */ | 339 | */ |
340 | struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) | 340 | struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) |
341 | { | 341 | { |
342 | struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); | 342 | struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); |
343 | 343 | ||
@@ -365,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio) | |||
365 | * %__GFP_WAIT, the allocation is guaranteed to succeed. | 365 | * %__GFP_WAIT, the allocation is guaranteed to succeed. |
366 | * | 366 | * |
367 | **/ | 367 | **/ |
368 | struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) | 368 | struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) |
369 | { | 369 | { |
370 | struct bio *bio; | 370 | struct bio *bio; |
371 | 371 | ||
@@ -696,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd) | |||
696 | kfree(bmd); | 696 | kfree(bmd); |
697 | } | 697 | } |
698 | 698 | ||
699 | static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, | 699 | static struct bio_map_data *bio_alloc_map_data(int nr_segs, |
700 | unsigned int iov_count, | ||
700 | gfp_t gfp_mask) | 701 | gfp_t gfp_mask) |
701 | { | 702 | { |
702 | struct bio_map_data *bmd; | 703 | struct bio_map_data *bmd; |
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec14097fef1..cb97174e2366 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c | |||
@@ -64,6 +64,8 @@ struct btrfs_worker_thread { | |||
64 | int idle; | 64 | int idle; |
65 | }; | 65 | }; |
66 | 66 | ||
67 | static int __btrfs_start_workers(struct btrfs_workers *workers); | ||
68 | |||
67 | /* | 69 | /* |
68 | * btrfs_start_workers uses kthread_run, which can block waiting for memory | 70 | * btrfs_start_workers uses kthread_run, which can block waiting for memory |
69 | * for a very long time. It will actually throttle on page writeback, | 71 | * for a very long time. It will actually throttle on page writeback, |
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work) | |||
88 | { | 90 | { |
89 | struct worker_start *start; | 91 | struct worker_start *start; |
90 | start = container_of(work, struct worker_start, work); | 92 | start = container_of(work, struct worker_start, work); |
91 | btrfs_start_workers(start->queue, 1); | 93 | __btrfs_start_workers(start->queue); |
92 | kfree(start); | 94 | kfree(start); |
93 | } | 95 | } |
94 | 96 | ||
95 | static int start_new_worker(struct btrfs_workers *queue) | ||
96 | { | ||
97 | struct worker_start *start; | ||
98 | int ret; | ||
99 | |||
100 | start = kzalloc(sizeof(*start), GFP_NOFS); | ||
101 | if (!start) | ||
102 | return -ENOMEM; | ||
103 | |||
104 | start->work.func = start_new_worker_func; | ||
105 | start->queue = queue; | ||
106 | ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); | ||
107 | if (ret) | ||
108 | kfree(start); | ||
109 | return ret; | ||
110 | } | ||
111 | |||
112 | /* | 97 | /* |
113 | * helper function to move a thread onto the idle list after it | 98 | * helper function to move a thread onto the idle list after it |
114 | * has finished some requests. | 99 | * has finished some requests. |
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) | |||
153 | static void check_pending_worker_creates(struct btrfs_worker_thread *worker) | 138 | static void check_pending_worker_creates(struct btrfs_worker_thread *worker) |
154 | { | 139 | { |
155 | struct btrfs_workers *workers = worker->workers; | 140 | struct btrfs_workers *workers = worker->workers; |
141 | struct worker_start *start; | ||
156 | unsigned long flags; | 142 | unsigned long flags; |
157 | 143 | ||
158 | rmb(); | 144 | rmb(); |
159 | if (!workers->atomic_start_pending) | 145 | if (!workers->atomic_start_pending) |
160 | return; | 146 | return; |
161 | 147 | ||
148 | start = kzalloc(sizeof(*start), GFP_NOFS); | ||
149 | if (!start) | ||
150 | return; | ||
151 | |||
152 | start->work.func = start_new_worker_func; | ||
153 | start->queue = workers; | ||
154 | |||
162 | spin_lock_irqsave(&workers->lock, flags); | 155 | spin_lock_irqsave(&workers->lock, flags); |
163 | if (!workers->atomic_start_pending) | 156 | if (!workers->atomic_start_pending) |
164 | goto out; | 157 | goto out; |
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) | |||
170 | 163 | ||
171 | workers->num_workers_starting += 1; | 164 | workers->num_workers_starting += 1; |
172 | spin_unlock_irqrestore(&workers->lock, flags); | 165 | spin_unlock_irqrestore(&workers->lock, flags); |
173 | start_new_worker(workers); | 166 | btrfs_queue_worker(workers->atomic_worker_start, &start->work); |
174 | return; | 167 | return; |
175 | 168 | ||
176 | out: | 169 | out: |
170 | kfree(start); | ||
177 | spin_unlock_irqrestore(&workers->lock, flags); | 171 | spin_unlock_irqrestore(&workers->lock, flags); |
178 | } | 172 | } |
179 | 173 | ||
@@ -331,7 +325,7 @@ again: | |||
331 | run_ordered_completions(worker->workers, work); | 325 | run_ordered_completions(worker->workers, work); |
332 | 326 | ||
333 | check_pending_worker_creates(worker); | 327 | check_pending_worker_creates(worker); |
334 | 328 | cond_resched(); | |
335 | } | 329 | } |
336 | 330 | ||
337 | spin_lock_irq(&worker->lock); | 331 | spin_lock_irq(&worker->lock); |
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, | |||
462 | * starts new worker threads. This does not enforce the max worker | 456 | * starts new worker threads. This does not enforce the max worker |
463 | * count in case you need to temporarily go past it. | 457 | * count in case you need to temporarily go past it. |
464 | */ | 458 | */ |
465 | static int __btrfs_start_workers(struct btrfs_workers *workers, | 459 | static int __btrfs_start_workers(struct btrfs_workers *workers) |
466 | int num_workers) | ||
467 | { | 460 | { |
468 | struct btrfs_worker_thread *worker; | 461 | struct btrfs_worker_thread *worker; |
469 | int ret = 0; | 462 | int ret = 0; |
470 | int i; | ||
471 | 463 | ||
472 | for (i = 0; i < num_workers; i++) { | 464 | worker = kzalloc(sizeof(*worker), GFP_NOFS); |
473 | worker = kzalloc(sizeof(*worker), GFP_NOFS); | 465 | if (!worker) { |
474 | if (!worker) { | 466 | ret = -ENOMEM; |
475 | ret = -ENOMEM; | 467 | goto fail; |
476 | goto fail; | 468 | } |
477 | } | ||
478 | 469 | ||
479 | INIT_LIST_HEAD(&worker->pending); | 470 | INIT_LIST_HEAD(&worker->pending); |
480 | INIT_LIST_HEAD(&worker->prio_pending); | 471 | INIT_LIST_HEAD(&worker->prio_pending); |
481 | INIT_LIST_HEAD(&worker->worker_list); | 472 | INIT_LIST_HEAD(&worker->worker_list); |
482 | spin_lock_init(&worker->lock); | 473 | spin_lock_init(&worker->lock); |
483 | 474 | ||
484 | atomic_set(&worker->num_pending, 0); | 475 | atomic_set(&worker->num_pending, 0); |
485 | atomic_set(&worker->refs, 1); | 476 | atomic_set(&worker->refs, 1); |
486 | worker->workers = workers; | 477 | worker->workers = workers; |
487 | worker->task = kthread_run(worker_loop, worker, | 478 | worker->task = kthread_run(worker_loop, worker, |
488 | "btrfs-%s-%d", workers->name, | 479 | "btrfs-%s-%d", workers->name, |
489 | workers->num_workers + i); | 480 | workers->num_workers + 1); |
490 | if (IS_ERR(worker->task)) { | 481 | if (IS_ERR(worker->task)) { |
491 | ret = PTR_ERR(worker->task); | 482 | ret = PTR_ERR(worker->task); |
492 | kfree(worker); | 483 | kfree(worker); |
493 | goto fail; | 484 | goto fail; |
494 | } | ||
495 | spin_lock_irq(&workers->lock); | ||
496 | list_add_tail(&worker->worker_list, &workers->idle_list); | ||
497 | worker->idle = 1; | ||
498 | workers->num_workers++; | ||
499 | workers->num_workers_starting--; | ||
500 | WARN_ON(workers->num_workers_starting < 0); | ||
501 | spin_unlock_irq(&workers->lock); | ||
502 | } | 485 | } |
486 | spin_lock_irq(&workers->lock); | ||
487 | list_add_tail(&worker->worker_list, &workers->idle_list); | ||
488 | worker->idle = 1; | ||
489 | workers->num_workers++; | ||
490 | workers->num_workers_starting--; | ||
491 | WARN_ON(workers->num_workers_starting < 0); | ||
492 | spin_unlock_irq(&workers->lock); | ||
493 | |||
503 | return 0; | 494 | return 0; |
504 | fail: | 495 | fail: |
505 | btrfs_stop_workers(workers); | 496 | spin_lock_irq(&workers->lock); |
497 | workers->num_workers_starting--; | ||
498 | spin_unlock_irq(&workers->lock); | ||
506 | return ret; | 499 | return ret; |
507 | } | 500 | } |
508 | 501 | ||
509 | int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) | 502 | int btrfs_start_workers(struct btrfs_workers *workers) |
510 | { | 503 | { |
511 | spin_lock_irq(&workers->lock); | 504 | spin_lock_irq(&workers->lock); |
512 | workers->num_workers_starting += num_workers; | 505 | workers->num_workers_starting++; |
513 | spin_unlock_irq(&workers->lock); | 506 | spin_unlock_irq(&workers->lock); |
514 | return __btrfs_start_workers(workers, num_workers); | 507 | return __btrfs_start_workers(workers); |
515 | } | 508 | } |
516 | 509 | ||
517 | /* | 510 | /* |
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) | |||
568 | struct btrfs_worker_thread *worker; | 561 | struct btrfs_worker_thread *worker; |
569 | unsigned long flags; | 562 | unsigned long flags; |
570 | struct list_head *fallback; | 563 | struct list_head *fallback; |
564 | int ret; | ||
571 | 565 | ||
572 | again: | 566 | again: |
573 | spin_lock_irqsave(&workers->lock, flags); | 567 | spin_lock_irqsave(&workers->lock, flags); |
@@ -584,7 +578,9 @@ again: | |||
584 | workers->num_workers_starting++; | 578 | workers->num_workers_starting++; |
585 | spin_unlock_irqrestore(&workers->lock, flags); | 579 | spin_unlock_irqrestore(&workers->lock, flags); |
586 | /* we're below the limit, start another worker */ | 580 | /* we're below the limit, start another worker */ |
587 | __btrfs_start_workers(workers, 1); | 581 | ret = __btrfs_start_workers(workers); |
582 | if (ret) | ||
583 | goto fallback; | ||
588 | goto again; | 584 | goto again; |
589 | } | 585 | } |
590 | } | 586 | } |
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work) | |||
665 | /* | 661 | /* |
666 | * places a struct btrfs_work into the pending queue of one of the kthreads | 662 | * places a struct btrfs_work into the pending queue of one of the kthreads |
667 | */ | 663 | */ |
668 | int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) | 664 | void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) |
669 | { | 665 | { |
670 | struct btrfs_worker_thread *worker; | 666 | struct btrfs_worker_thread *worker; |
671 | unsigned long flags; | 667 | unsigned long flags; |
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) | |||
673 | 669 | ||
674 | /* don't requeue something already on a list */ | 670 | /* don't requeue something already on a list */ |
675 | if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) | 671 | if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) |
676 | goto out; | 672 | return; |
677 | 673 | ||
678 | worker = find_worker(workers); | 674 | worker = find_worker(workers); |
679 | if (workers->ordered) { | 675 | if (workers->ordered) { |
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) | |||
712 | if (wake) | 708 | if (wake) |
713 | wake_up_process(worker->task); | 709 | wake_up_process(worker->task); |
714 | spin_unlock_irqrestore(&worker->lock, flags); | 710 | spin_unlock_irqrestore(&worker->lock, flags); |
715 | |||
716 | out: | ||
717 | return 0; | ||
718 | } | 711 | } |
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 5077746cf85e..f34cc31fa3c9 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h | |||
@@ -109,8 +109,8 @@ struct btrfs_workers { | |||
109 | char *name; | 109 | char *name; |
110 | }; | 110 | }; |
111 | 111 | ||
112 | int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); | 112 | void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); |
113 | int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); | 113 | int btrfs_start_workers(struct btrfs_workers *workers); |
114 | int btrfs_stop_workers(struct btrfs_workers *workers); | 114 | int btrfs_stop_workers(struct btrfs_workers *workers); |
115 | void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, | 115 | void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, |
116 | struct btrfs_workers *async_starter); | 116 | struct btrfs_workers *async_starter); |
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad3929c..22c64fff1bd5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c | |||
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, | |||
683 | return PTR_ERR(fspath); | 683 | return PTR_ERR(fspath); |
684 | 684 | ||
685 | if (fspath > fspath_min) { | 685 | if (fspath > fspath_min) { |
686 | ipath->fspath->val[i] = (u64)fspath; | 686 | ipath->fspath->val[i] = (u64)(unsigned long)fspath; |
687 | ++ipath->fspath->elem_cnt; | 687 | ++ipath->fspath->elem_cnt; |
688 | ipath->fspath->bytes_left = fspath - fspath_min; | 688 | ipath->fspath->bytes_left = fspath - fspath_min; |
689 | } else { | 689 | } else { |
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0fe615e4ea38..dede441bdeee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, | |||
514 | struct btrfs_root *root, | 514 | struct btrfs_root *root, |
515 | struct extent_buffer *buf) | 515 | struct extent_buffer *buf) |
516 | { | 516 | { |
517 | /* ensure we can see the force_cow */ | ||
518 | smp_rmb(); | ||
519 | |||
520 | /* | ||
521 | * We do not need to cow a block if | ||
522 | * 1) this block is not created or changed in this transaction; | ||
523 | * 2) this block does not belong to TREE_RELOC tree; | ||
524 | * 3) the root is not forced COW. | ||
525 | * | ||
526 | * What is forced COW: | ||
527 | * when we create snapshot during committing the transaction, | ||
528 | * after we've finished copying src root, we must COW the shared | ||
529 | * block to ensure the metadata consistency. | ||
530 | */ | ||
517 | if (btrfs_header_generation(buf) == trans->transid && | 531 | if (btrfs_header_generation(buf) == trans->transid && |
518 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && | 532 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && |
519 | !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && | 533 | !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && |
520 | btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) | 534 | btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && |
535 | !root->force_cow) | ||
521 | return 0; | 536 | return 0; |
522 | return 1; | 537 | return 1; |
523 | } | 538 | } |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9ba59ff9292..67385033323d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -848,7 +848,8 @@ struct btrfs_free_cluster { | |||
848 | enum btrfs_caching_type { | 848 | enum btrfs_caching_type { |
849 | BTRFS_CACHE_NO = 0, | 849 | BTRFS_CACHE_NO = 0, |
850 | BTRFS_CACHE_STARTED = 1, | 850 | BTRFS_CACHE_STARTED = 1, |
851 | BTRFS_CACHE_FINISHED = 2, | 851 | BTRFS_CACHE_FAST = 2, |
852 | BTRFS_CACHE_FINISHED = 3, | ||
852 | }; | 853 | }; |
853 | 854 | ||
854 | enum btrfs_disk_cache_state { | 855 | enum btrfs_disk_cache_state { |
@@ -1271,6 +1272,8 @@ struct btrfs_root { | |||
1271 | * for stat. It may be used for more later | 1272 | * for stat. It may be used for more later |
1272 | */ | 1273 | */ |
1273 | dev_t anon_dev; | 1274 | dev_t anon_dev; |
1275 | |||
1276 | int force_cow; | ||
1274 | }; | 1277 | }; |
1275 | 1278 | ||
1276 | struct btrfs_ioctl_defrag_range_args { | 1279 | struct btrfs_ioctl_defrag_range_args { |
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, | |||
2366 | int btrfs_block_rsv_refill(struct btrfs_root *root, | 2369 | int btrfs_block_rsv_refill(struct btrfs_root *root, |
2367 | struct btrfs_block_rsv *block_rsv, | 2370 | struct btrfs_block_rsv *block_rsv, |
2368 | u64 min_reserved); | 2371 | u64 min_reserved); |
2372 | int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, | ||
2373 | struct btrfs_block_rsv *block_rsv, | ||
2374 | u64 min_reserved); | ||
2369 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | 2375 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, |
2370 | struct btrfs_block_rsv *dst_rsv, | 2376 | struct btrfs_block_rsv *dst_rsv, |
2371 | u64 num_bytes); | 2377 | u64 num_bytes); |
@@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | |||
2686 | int btrfs_readpage(struct file *file, struct page *page); | 2692 | int btrfs_readpage(struct file *file, struct page *page); |
2687 | void btrfs_evict_inode(struct inode *inode); | 2693 | void btrfs_evict_inode(struct inode *inode); |
2688 | int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); | 2694 | int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); |
2689 | void btrfs_dirty_inode(struct inode *inode, int flags); | 2695 | int btrfs_dirty_inode(struct inode *inode); |
2696 | int btrfs_update_time(struct file *file); | ||
2690 | struct inode *btrfs_alloc_inode(struct super_block *sb); | 2697 | struct inode *btrfs_alloc_inode(struct super_block *sb); |
2691 | void btrfs_destroy_inode(struct inode *inode); | 2698 | void btrfs_destroy_inode(struct inode *inode); |
2692 | int btrfs_drop_inode(struct inode *inode); | 2699 | int btrfs_drop_inode(struct inode *inode); |
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5b163572e0ca..9c1eccc2c503 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
@@ -640,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata( | |||
640 | * Now if src_rsv == delalloc_block_rsv we'll let it just steal since | 640 | * Now if src_rsv == delalloc_block_rsv we'll let it just steal since |
641 | * we're accounted for. | 641 | * we're accounted for. |
642 | */ | 642 | */ |
643 | if (!trans->bytes_reserved && | 643 | if (!src_rsv || (!trans->bytes_reserved && |
644 | src_rsv != &root->fs_info->delalloc_block_rsv) { | 644 | src_rsv != &root->fs_info->delalloc_block_rsv)) { |
645 | ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); | 645 | ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); |
646 | /* | 646 | /* |
647 | * Since we're under a transaction reserve_metadata_bytes could | 647 | * Since we're under a transaction reserve_metadata_bytes could |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62afe5c5694e..f44b3928dc2d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -620,7 +620,7 @@ out: | |||
620 | 620 | ||
621 | static int btree_io_failed_hook(struct bio *failed_bio, | 621 | static int btree_io_failed_hook(struct bio *failed_bio, |
622 | struct page *page, u64 start, u64 end, | 622 | struct page *page, u64 start, u64 end, |
623 | u64 mirror_num, struct extent_state *state) | 623 | int mirror_num, struct extent_state *state) |
624 | { | 624 | { |
625 | struct extent_io_tree *tree; | 625 | struct extent_io_tree *tree; |
626 | unsigned long len; | 626 | unsigned long len; |
@@ -2194,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
2194 | fs_info->endio_meta_write_workers.idle_thresh = 2; | 2194 | fs_info->endio_meta_write_workers.idle_thresh = 2; |
2195 | fs_info->readahead_workers.idle_thresh = 2; | 2195 | fs_info->readahead_workers.idle_thresh = 2; |
2196 | 2196 | ||
2197 | btrfs_start_workers(&fs_info->workers, 1); | 2197 | /* |
2198 | btrfs_start_workers(&fs_info->generic_worker, 1); | 2198 | * btrfs_start_workers can really only fail because of ENOMEM so just |
2199 | btrfs_start_workers(&fs_info->submit_workers, 1); | 2199 | * return -ENOMEM if any of these fail. |
2200 | btrfs_start_workers(&fs_info->delalloc_workers, 1); | 2200 | */ |
2201 | btrfs_start_workers(&fs_info->fixup_workers, 1); | 2201 | ret = btrfs_start_workers(&fs_info->workers); |
2202 | btrfs_start_workers(&fs_info->endio_workers, 1); | 2202 | ret |= btrfs_start_workers(&fs_info->generic_worker); |
2203 | btrfs_start_workers(&fs_info->endio_meta_workers, 1); | 2203 | ret |= btrfs_start_workers(&fs_info->submit_workers); |
2204 | btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); | 2204 | ret |= btrfs_start_workers(&fs_info->delalloc_workers); |
2205 | btrfs_start_workers(&fs_info->endio_write_workers, 1); | 2205 | ret |= btrfs_start_workers(&fs_info->fixup_workers); |
2206 | btrfs_start_workers(&fs_info->endio_freespace_worker, 1); | 2206 | ret |= btrfs_start_workers(&fs_info->endio_workers); |
2207 | btrfs_start_workers(&fs_info->delayed_workers, 1); | 2207 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); |
2208 | btrfs_start_workers(&fs_info->caching_workers, 1); | 2208 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); |
2209 | btrfs_start_workers(&fs_info->readahead_workers, 1); | 2209 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); |
2210 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); | ||
2211 | ret |= btrfs_start_workers(&fs_info->delayed_workers); | ||
2212 | ret |= btrfs_start_workers(&fs_info->caching_workers); | ||
2213 | ret |= btrfs_start_workers(&fs_info->readahead_workers); | ||
2214 | if (ret) { | ||
2215 | ret = -ENOMEM; | ||
2216 | goto fail_sb_buffer; | ||
2217 | } | ||
2210 | 2218 | ||
2211 | fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); | 2219 | fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); |
2212 | fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, | 2220 | fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, |
@@ -2573,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2573 | int errors = 0; | 2581 | int errors = 0; |
2574 | u32 crc; | 2582 | u32 crc; |
2575 | u64 bytenr; | 2583 | u64 bytenr; |
2576 | int last_barrier = 0; | ||
2577 | 2584 | ||
2578 | if (max_mirrors == 0) | 2585 | if (max_mirrors == 0) |
2579 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; | 2586 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; |
2580 | 2587 | ||
2581 | /* make sure only the last submit_bh does a barrier */ | ||
2582 | if (do_barriers) { | ||
2583 | for (i = 0; i < max_mirrors; i++) { | ||
2584 | bytenr = btrfs_sb_offset(i); | ||
2585 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= | ||
2586 | device->total_bytes) | ||
2587 | break; | ||
2588 | last_barrier = i; | ||
2589 | } | ||
2590 | } | ||
2591 | |||
2592 | for (i = 0; i < max_mirrors; i++) { | 2588 | for (i = 0; i < max_mirrors; i++) { |
2593 | bytenr = btrfs_sb_offset(i); | 2589 | bytenr = btrfs_sb_offset(i); |
2594 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) | 2590 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) |
@@ -2634,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2634 | bh->b_end_io = btrfs_end_buffer_write_sync; | 2630 | bh->b_end_io = btrfs_end_buffer_write_sync; |
2635 | } | 2631 | } |
2636 | 2632 | ||
2637 | if (i == last_barrier && do_barriers) | 2633 | /* |
2638 | ret = submit_bh(WRITE_FLUSH_FUA, bh); | 2634 | * we fua the first super. The others we allow |
2639 | else | 2635 | * to go down lazy. |
2640 | ret = submit_bh(WRITE_SYNC, bh); | 2636 | */ |
2641 | 2637 | ret = submit_bh(WRITE_FUA, bh); | |
2642 | if (ret) | 2638 | if (ret) |
2643 | errors++; | 2639 | errors++; |
2644 | } | 2640 | } |
2645 | return errors < i ? 0 : -1; | 2641 | return errors < i ? 0 : -1; |
2646 | } | 2642 | } |
2647 | 2643 | ||
2644 | /* | ||
2645 | * endio for the write_dev_flush, this will wake anyone waiting | ||
2646 | * for the barrier when it is done | ||
2647 | */ | ||
2648 | static void btrfs_end_empty_barrier(struct bio *bio, int err) | ||
2649 | { | ||
2650 | if (err) { | ||
2651 | if (err == -EOPNOTSUPP) | ||
2652 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
2653 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
2654 | } | ||
2655 | if (bio->bi_private) | ||
2656 | complete(bio->bi_private); | ||
2657 | bio_put(bio); | ||
2658 | } | ||
2659 | |||
2660 | /* | ||
2661 | * trigger flushes for one of the devices. If you pass wait == 0, the flushes are | ||
2662 | * sent down. With wait == 1, it waits for the previous flush. | ||
2663 | * | ||
2664 | * any device where the flush fails with eopnotsupp is flagged as not-barrier | ||
2665 | * capable | ||
2666 | */ | ||
2667 | static int write_dev_flush(struct btrfs_device *device, int wait) | ||
2668 | { | ||
2669 | struct bio *bio; | ||
2670 | int ret = 0; | ||
2671 | |||
2672 | if (device->nobarriers) | ||
2673 | return 0; | ||
2674 | |||
2675 | if (wait) { | ||
2676 | bio = device->flush_bio; | ||
2677 | if (!bio) | ||
2678 | return 0; | ||
2679 | |||
2680 | wait_for_completion(&device->flush_wait); | ||
2681 | |||
2682 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) { | ||
2683 | printk("btrfs: disabling barriers on dev %s\n", | ||
2684 | device->name); | ||
2685 | device->nobarriers = 1; | ||
2686 | } | ||
2687 | if (!bio_flagged(bio, BIO_UPTODATE)) { | ||
2688 | ret = -EIO; | ||
2689 | } | ||
2690 | |||
2691 | /* drop the reference from the wait == 0 run */ | ||
2692 | bio_put(bio); | ||
2693 | device->flush_bio = NULL; | ||
2694 | |||
2695 | return ret; | ||
2696 | } | ||
2697 | |||
2698 | /* | ||
2699 | * one reference for us, and we leave it for the | ||
2700 | * caller | ||
2701 | */ | ||
2702 | device->flush_bio = NULL;; | ||
2703 | bio = bio_alloc(GFP_NOFS, 0); | ||
2704 | if (!bio) | ||
2705 | return -ENOMEM; | ||
2706 | |||
2707 | bio->bi_end_io = btrfs_end_empty_barrier; | ||
2708 | bio->bi_bdev = device->bdev; | ||
2709 | init_completion(&device->flush_wait); | ||
2710 | bio->bi_private = &device->flush_wait; | ||
2711 | device->flush_bio = bio; | ||
2712 | |||
2713 | bio_get(bio); | ||
2714 | submit_bio(WRITE_FLUSH, bio); | ||
2715 | |||
2716 | return 0; | ||
2717 | } | ||
2718 | |||
2719 | /* | ||
2720 | * send an empty flush down to each device in parallel, | ||
2721 | * then wait for them | ||
2722 | */ | ||
2723 | static int barrier_all_devices(struct btrfs_fs_info *info) | ||
2724 | { | ||
2725 | struct list_head *head; | ||
2726 | struct btrfs_device *dev; | ||
2727 | int errors = 0; | ||
2728 | int ret; | ||
2729 | |||
2730 | /* send down all the barriers */ | ||
2731 | head = &info->fs_devices->devices; | ||
2732 | list_for_each_entry_rcu(dev, head, dev_list) { | ||
2733 | if (!dev->bdev) { | ||
2734 | errors++; | ||
2735 | continue; | ||
2736 | } | ||
2737 | if (!dev->in_fs_metadata || !dev->writeable) | ||
2738 | continue; | ||
2739 | |||
2740 | ret = write_dev_flush(dev, 0); | ||
2741 | if (ret) | ||
2742 | errors++; | ||
2743 | } | ||
2744 | |||
2745 | /* wait for all the barriers */ | ||
2746 | list_for_each_entry_rcu(dev, head, dev_list) { | ||
2747 | if (!dev->bdev) { | ||
2748 | errors++; | ||
2749 | continue; | ||
2750 | } | ||
2751 | if (!dev->in_fs_metadata || !dev->writeable) | ||
2752 | continue; | ||
2753 | |||
2754 | ret = write_dev_flush(dev, 1); | ||
2755 | if (ret) | ||
2756 | errors++; | ||
2757 | } | ||
2758 | if (errors) | ||
2759 | return -EIO; | ||
2760 | return 0; | ||
2761 | } | ||
2762 | |||
2648 | int write_all_supers(struct btrfs_root *root, int max_mirrors) | 2763 | int write_all_supers(struct btrfs_root *root, int max_mirrors) |
2649 | { | 2764 | { |
2650 | struct list_head *head; | 2765 | struct list_head *head; |
@@ -2666,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
2666 | 2781 | ||
2667 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 2782 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); |
2668 | head = &root->fs_info->fs_devices->devices; | 2783 | head = &root->fs_info->fs_devices->devices; |
2784 | |||
2785 | if (do_barriers) | ||
2786 | barrier_all_devices(root->fs_info); | ||
2787 | |||
2669 | list_for_each_entry_rcu(dev, head, dev_list) { | 2788 | list_for_each_entry_rcu(dev, head, dev_list) { |
2670 | if (!dev->bdev) { | 2789 | if (!dev->bdev) { |
2671 | total_errors++; | 2790 | total_errors++; |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b232150b5b6b..f5fbe576d2ba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
467 | struct btrfs_root *root, | 467 | struct btrfs_root *root, |
468 | int load_cache_only) | 468 | int load_cache_only) |
469 | { | 469 | { |
470 | DEFINE_WAIT(wait); | ||
470 | struct btrfs_fs_info *fs_info = cache->fs_info; | 471 | struct btrfs_fs_info *fs_info = cache->fs_info; |
471 | struct btrfs_caching_control *caching_ctl; | 472 | struct btrfs_caching_control *caching_ctl; |
472 | int ret = 0; | 473 | int ret = 0; |
473 | 474 | ||
474 | smp_mb(); | 475 | caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); |
475 | if (cache->cached != BTRFS_CACHE_NO) | 476 | BUG_ON(!caching_ctl); |
477 | |||
478 | INIT_LIST_HEAD(&caching_ctl->list); | ||
479 | mutex_init(&caching_ctl->mutex); | ||
480 | init_waitqueue_head(&caching_ctl->wait); | ||
481 | caching_ctl->block_group = cache; | ||
482 | caching_ctl->progress = cache->key.objectid; | ||
483 | atomic_set(&caching_ctl->count, 1); | ||
484 | caching_ctl->work.func = caching_thread; | ||
485 | |||
486 | spin_lock(&cache->lock); | ||
487 | /* | ||
488 | * This should be a rare occasion, but this could happen I think in the | ||
489 | * case where one thread starts to load the space cache info, and then | ||
490 | * some other thread starts a transaction commit which tries to do an | ||
491 | * allocation while the other thread is still loading the space cache | ||
492 | * info. The previous loop should have kept us from choosing this block | ||
493 | * group, but if we've moved to the state where we will wait on caching | ||
494 | * block groups we need to first check if we're doing a fast load here, | ||
495 | * so we can wait for it to finish, otherwise we could end up allocating | ||
496 | * from a block group whose cache gets evicted for one reason or | ||
497 | * another. | ||
498 | */ | ||
499 | while (cache->cached == BTRFS_CACHE_FAST) { | ||
500 | struct btrfs_caching_control *ctl; | ||
501 | |||
502 | ctl = cache->caching_ctl; | ||
503 | atomic_inc(&ctl->count); | ||
504 | prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); | ||
505 | spin_unlock(&cache->lock); | ||
506 | |||
507 | schedule(); | ||
508 | |||
509 | finish_wait(&ctl->wait, &wait); | ||
510 | put_caching_control(ctl); | ||
511 | spin_lock(&cache->lock); | ||
512 | } | ||
513 | |||
514 | if (cache->cached != BTRFS_CACHE_NO) { | ||
515 | spin_unlock(&cache->lock); | ||
516 | kfree(caching_ctl); | ||
476 | return 0; | 517 | return 0; |
518 | } | ||
519 | WARN_ON(cache->caching_ctl); | ||
520 | cache->caching_ctl = caching_ctl; | ||
521 | cache->cached = BTRFS_CACHE_FAST; | ||
522 | spin_unlock(&cache->lock); | ||
477 | 523 | ||
478 | /* | 524 | /* |
479 | * We can't do the read from on-disk cache during a commit since we need | 525 | * We can't do the read from on-disk cache during a commit since we need |
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
484 | if (trans && (!trans->transaction->in_commit) && | 530 | if (trans && (!trans->transaction->in_commit) && |
485 | (root && root != root->fs_info->tree_root) && | 531 | (root && root != root->fs_info->tree_root) && |
486 | btrfs_test_opt(root, SPACE_CACHE)) { | 532 | btrfs_test_opt(root, SPACE_CACHE)) { |
487 | spin_lock(&cache->lock); | ||
488 | if (cache->cached != BTRFS_CACHE_NO) { | ||
489 | spin_unlock(&cache->lock); | ||
490 | return 0; | ||
491 | } | ||
492 | cache->cached = BTRFS_CACHE_STARTED; | ||
493 | spin_unlock(&cache->lock); | ||
494 | |||
495 | ret = load_free_space_cache(fs_info, cache); | 533 | ret = load_free_space_cache(fs_info, cache); |
496 | 534 | ||
497 | spin_lock(&cache->lock); | 535 | spin_lock(&cache->lock); |
498 | if (ret == 1) { | 536 | if (ret == 1) { |
537 | cache->caching_ctl = NULL; | ||
499 | cache->cached = BTRFS_CACHE_FINISHED; | 538 | cache->cached = BTRFS_CACHE_FINISHED; |
500 | cache->last_byte_to_unpin = (u64)-1; | 539 | cache->last_byte_to_unpin = (u64)-1; |
501 | } else { | 540 | } else { |
502 | cache->cached = BTRFS_CACHE_NO; | 541 | if (load_cache_only) { |
542 | cache->caching_ctl = NULL; | ||
543 | cache->cached = BTRFS_CACHE_NO; | ||
544 | } else { | ||
545 | cache->cached = BTRFS_CACHE_STARTED; | ||
546 | } | ||
503 | } | 547 | } |
504 | spin_unlock(&cache->lock); | 548 | spin_unlock(&cache->lock); |
549 | wake_up(&caching_ctl->wait); | ||
505 | if (ret == 1) { | 550 | if (ret == 1) { |
551 | put_caching_control(caching_ctl); | ||
506 | free_excluded_extents(fs_info->extent_root, cache); | 552 | free_excluded_extents(fs_info->extent_root, cache); |
507 | return 0; | 553 | return 0; |
508 | } | 554 | } |
555 | } else { | ||
556 | /* | ||
557 | * We are not going to do the fast caching, set cached to the | ||
558 | * appropriate value and wakeup any waiters. | ||
559 | */ | ||
560 | spin_lock(&cache->lock); | ||
561 | if (load_cache_only) { | ||
562 | cache->caching_ctl = NULL; | ||
563 | cache->cached = BTRFS_CACHE_NO; | ||
564 | } else { | ||
565 | cache->cached = BTRFS_CACHE_STARTED; | ||
566 | } | ||
567 | spin_unlock(&cache->lock); | ||
568 | wake_up(&caching_ctl->wait); | ||
509 | } | 569 | } |
510 | 570 | ||
511 | if (load_cache_only) | 571 | if (load_cache_only) { |
512 | return 0; | 572 | put_caching_control(caching_ctl); |
513 | |||
514 | caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); | ||
515 | BUG_ON(!caching_ctl); | ||
516 | |||
517 | INIT_LIST_HEAD(&caching_ctl->list); | ||
518 | mutex_init(&caching_ctl->mutex); | ||
519 | init_waitqueue_head(&caching_ctl->wait); | ||
520 | caching_ctl->block_group = cache; | ||
521 | caching_ctl->progress = cache->key.objectid; | ||
522 | /* one for caching kthread, one for caching block group list */ | ||
523 | atomic_set(&caching_ctl->count, 2); | ||
524 | caching_ctl->work.func = caching_thread; | ||
525 | |||
526 | spin_lock(&cache->lock); | ||
527 | if (cache->cached != BTRFS_CACHE_NO) { | ||
528 | spin_unlock(&cache->lock); | ||
529 | kfree(caching_ctl); | ||
530 | return 0; | 573 | return 0; |
531 | } | 574 | } |
532 | cache->caching_ctl = caching_ctl; | ||
533 | cache->cached = BTRFS_CACHE_STARTED; | ||
534 | spin_unlock(&cache->lock); | ||
535 | 575 | ||
536 | down_write(&fs_info->extent_commit_sem); | 576 | down_write(&fs_info->extent_commit_sem); |
577 | atomic_inc(&caching_ctl->count); | ||
537 | list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); | 578 | list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); |
538 | up_write(&fs_info->extent_commit_sem); | 579 | up_write(&fs_info->extent_commit_sem); |
539 | 580 | ||
@@ -2781,7 +2822,7 @@ out_free: | |||
2781 | btrfs_release_path(path); | 2822 | btrfs_release_path(path); |
2782 | out: | 2823 | out: |
2783 | spin_lock(&block_group->lock); | 2824 | spin_lock(&block_group->lock); |
2784 | if (!ret) | 2825 | if (!ret && dcs == BTRFS_DC_SETUP) |
2785 | block_group->cache_generation = trans->transid; | 2826 | block_group->cache_generation = trans->transid; |
2786 | block_group->disk_cache_state = dcs; | 2827 | block_group->disk_cache_state = dcs; |
2787 | spin_unlock(&block_group->lock); | 2828 | spin_unlock(&block_group->lock); |
@@ -3847,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, | |||
3847 | return ret; | 3888 | return ret; |
3848 | } | 3889 | } |
3849 | 3890 | ||
3850 | int btrfs_block_rsv_refill(struct btrfs_root *root, | 3891 | static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, |
3851 | struct btrfs_block_rsv *block_rsv, | 3892 | struct btrfs_block_rsv *block_rsv, |
3852 | u64 min_reserved) | 3893 | u64 min_reserved, int flush) |
3853 | { | 3894 | { |
3854 | u64 num_bytes = 0; | 3895 | u64 num_bytes = 0; |
3855 | int ret = -ENOSPC; | 3896 | int ret = -ENOSPC; |
@@ -3868,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, | |||
3868 | if (!ret) | 3909 | if (!ret) |
3869 | return 0; | 3910 | return 0; |
3870 | 3911 | ||
3871 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); | 3912 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); |
3872 | if (!ret) { | 3913 | if (!ret) { |
3873 | block_rsv_add_bytes(block_rsv, num_bytes, 0); | 3914 | block_rsv_add_bytes(block_rsv, num_bytes, 0); |
3874 | return 0; | 3915 | return 0; |
@@ -3877,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, | |||
3877 | return ret; | 3918 | return ret; |
3878 | } | 3919 | } |
3879 | 3920 | ||
3921 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
3922 | struct btrfs_block_rsv *block_rsv, | ||
3923 | u64 min_reserved) | ||
3924 | { | ||
3925 | return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); | ||
3926 | } | ||
3927 | |||
3928 | int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, | ||
3929 | struct btrfs_block_rsv *block_rsv, | ||
3930 | u64 min_reserved) | ||
3931 | { | ||
3932 | return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); | ||
3933 | } | ||
3934 | |||
3880 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | 3935 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, |
3881 | struct btrfs_block_rsv *dst_rsv, | 3936 | struct btrfs_block_rsv *dst_rsv, |
3882 | u64 num_bytes) | 3937 | u64 num_bytes) |
@@ -4149,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4149 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4204 | struct btrfs_root *root = BTRFS_I(inode)->root; |
4150 | struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; | 4205 | struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; |
4151 | u64 to_reserve = 0; | 4206 | u64 to_reserve = 0; |
4207 | u64 csum_bytes; | ||
4152 | unsigned nr_extents = 0; | 4208 | unsigned nr_extents = 0; |
4209 | int extra_reserve = 0; | ||
4153 | int flush = 1; | 4210 | int flush = 1; |
4154 | int ret; | 4211 | int ret; |
4155 | 4212 | ||
4213 | /* Need to be holding the i_mutex here if we aren't free space cache */ | ||
4156 | if (btrfs_is_free_space_inode(root, inode)) | 4214 | if (btrfs_is_free_space_inode(root, inode)) |
4157 | flush = 0; | 4215 | flush = 0; |
4216 | else | ||
4217 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); | ||
4158 | 4218 | ||
4159 | if (flush && btrfs_transaction_in_commit(root->fs_info)) | 4219 | if (flush && btrfs_transaction_in_commit(root->fs_info)) |
4160 | schedule_timeout(1); | 4220 | schedule_timeout(1); |
@@ -4165,11 +4225,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4165 | BTRFS_I(inode)->outstanding_extents++; | 4225 | BTRFS_I(inode)->outstanding_extents++; |
4166 | 4226 | ||
4167 | if (BTRFS_I(inode)->outstanding_extents > | 4227 | if (BTRFS_I(inode)->outstanding_extents > |
4168 | BTRFS_I(inode)->reserved_extents) { | 4228 | BTRFS_I(inode)->reserved_extents) |
4169 | nr_extents = BTRFS_I(inode)->outstanding_extents - | 4229 | nr_extents = BTRFS_I(inode)->outstanding_extents - |
4170 | BTRFS_I(inode)->reserved_extents; | 4230 | BTRFS_I(inode)->reserved_extents; |
4171 | BTRFS_I(inode)->reserved_extents += nr_extents; | ||
4172 | } | ||
4173 | 4231 | ||
4174 | /* | 4232 | /* |
4175 | * Add an item to reserve for updating the inode when we complete the | 4233 | * Add an item to reserve for updating the inode when we complete the |
@@ -4177,11 +4235,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4177 | */ | 4235 | */ |
4178 | if (!BTRFS_I(inode)->delalloc_meta_reserved) { | 4236 | if (!BTRFS_I(inode)->delalloc_meta_reserved) { |
4179 | nr_extents++; | 4237 | nr_extents++; |
4180 | BTRFS_I(inode)->delalloc_meta_reserved = 1; | 4238 | extra_reserve = 1; |
4181 | } | 4239 | } |
4182 | 4240 | ||
4183 | to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); | 4241 | to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); |
4184 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); | 4242 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); |
4243 | csum_bytes = BTRFS_I(inode)->csum_bytes; | ||
4185 | spin_unlock(&BTRFS_I(inode)->lock); | 4244 | spin_unlock(&BTRFS_I(inode)->lock); |
4186 | 4245 | ||
4187 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); | 4246 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); |
@@ -4191,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4191 | 4250 | ||
4192 | spin_lock(&BTRFS_I(inode)->lock); | 4251 | spin_lock(&BTRFS_I(inode)->lock); |
4193 | dropped = drop_outstanding_extent(inode); | 4252 | dropped = drop_outstanding_extent(inode); |
4194 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4195 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4196 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4197 | |||
4198 | /* | 4253 | /* |
4199 | * Somebody could have come in and twiddled with the | 4254 | * If the inodes csum_bytes is the same as the original |
4200 | * reservation, so if we have to free more than we would have | 4255 | * csum_bytes then we know we haven't raced with any free()ers |
4201 | * reserved from this reservation go ahead and release those | 4256 | * so we can just reduce our inodes csum bytes and carry on. |
4202 | * bytes. | 4257 | * Otherwise we have to do the normal free thing to account for |
4258 | * the case that the free side didn't free up its reserve | ||
4259 | * because of this outstanding reservation. | ||
4203 | */ | 4260 | */ |
4204 | to_free -= to_reserve; | 4261 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) |
4262 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
4263 | else | ||
4264 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4265 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4266 | if (dropped) | ||
4267 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4268 | |||
4205 | if (to_free) | 4269 | if (to_free) |
4206 | btrfs_block_rsv_release(root, block_rsv, to_free); | 4270 | btrfs_block_rsv_release(root, block_rsv, to_free); |
4207 | return ret; | 4271 | return ret; |
4208 | } | 4272 | } |
4209 | 4273 | ||
4274 | spin_lock(&BTRFS_I(inode)->lock); | ||
4275 | if (extra_reserve) { | ||
4276 | BTRFS_I(inode)->delalloc_meta_reserved = 1; | ||
4277 | nr_extents--; | ||
4278 | } | ||
4279 | BTRFS_I(inode)->reserved_extents += nr_extents; | ||
4280 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4281 | |||
4210 | block_rsv_add_bytes(block_rsv, to_reserve, 1); | 4282 | block_rsv_add_bytes(block_rsv, to_reserve, 1); |
4211 | 4283 | ||
4212 | return 0; | 4284 | return 0; |
@@ -5052,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
5052 | struct btrfs_root *root = orig_root->fs_info->extent_root; | 5124 | struct btrfs_root *root = orig_root->fs_info->extent_root; |
5053 | struct btrfs_free_cluster *last_ptr = NULL; | 5125 | struct btrfs_free_cluster *last_ptr = NULL; |
5054 | struct btrfs_block_group_cache *block_group = NULL; | 5126 | struct btrfs_block_group_cache *block_group = NULL; |
5127 | struct btrfs_block_group_cache *used_block_group; | ||
5055 | int empty_cluster = 2 * 1024 * 1024; | 5128 | int empty_cluster = 2 * 1024 * 1024; |
5056 | int allowed_chunk_alloc = 0; | 5129 | int allowed_chunk_alloc = 0; |
5057 | int done_chunk_alloc = 0; | 5130 | int done_chunk_alloc = 0; |
5058 | struct btrfs_space_info *space_info; | 5131 | struct btrfs_space_info *space_info; |
5059 | int last_ptr_loop = 0; | ||
5060 | int loop = 0; | 5132 | int loop = 0; |
5061 | int index = 0; | 5133 | int index = 0; |
5062 | int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? | 5134 | int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? |
@@ -5118,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
5118 | ideal_cache: | 5190 | ideal_cache: |
5119 | block_group = btrfs_lookup_block_group(root->fs_info, | 5191 | block_group = btrfs_lookup_block_group(root->fs_info, |
5120 | search_start); | 5192 | search_start); |
5193 | used_block_group = block_group; | ||
5121 | /* | 5194 | /* |
5122 | * we don't want to use the block group if it doesn't match our | 5195 | * we don't want to use the block group if it doesn't match our |
5123 | * allocation bits, or if its not cached. | 5196 | * allocation bits, or if its not cached. |
@@ -5155,6 +5228,7 @@ search: | |||
5155 | u64 offset; | 5228 | u64 offset; |
5156 | int cached; | 5229 | int cached; |
5157 | 5230 | ||
5231 | used_block_group = block_group; | ||
5158 | btrfs_get_block_group(block_group); | 5232 | btrfs_get_block_group(block_group); |
5159 | search_start = block_group->key.objectid; | 5233 | search_start = block_group->key.objectid; |
5160 | 5234 | ||
@@ -5178,13 +5252,15 @@ search: | |||
5178 | } | 5252 | } |
5179 | 5253 | ||
5180 | have_block_group: | 5254 | have_block_group: |
5181 | if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { | 5255 | cached = block_group_cache_done(block_group); |
5256 | if (unlikely(!cached)) { | ||
5182 | u64 free_percent; | 5257 | u64 free_percent; |
5183 | 5258 | ||
5259 | found_uncached_bg = true; | ||
5184 | ret = cache_block_group(block_group, trans, | 5260 | ret = cache_block_group(block_group, trans, |
5185 | orig_root, 1); | 5261 | orig_root, 1); |
5186 | if (block_group->cached == BTRFS_CACHE_FINISHED) | 5262 | if (block_group->cached == BTRFS_CACHE_FINISHED) |
5187 | goto have_block_group; | 5263 | goto alloc; |
5188 | 5264 | ||
5189 | free_percent = btrfs_block_group_used(&block_group->item); | 5265 | free_percent = btrfs_block_group_used(&block_group->item); |
5190 | free_percent *= 100; | 5266 | free_percent *= 100; |
@@ -5206,7 +5282,6 @@ have_block_group: | |||
5206 | orig_root, 0); | 5282 | orig_root, 0); |
5207 | BUG_ON(ret); | 5283 | BUG_ON(ret); |
5208 | } | 5284 | } |
5209 | found_uncached_bg = true; | ||
5210 | 5285 | ||
5211 | /* | 5286 | /* |
5212 | * If loop is set for cached only, try the next block | 5287 | * If loop is set for cached only, try the next block |
@@ -5216,94 +5291,80 @@ have_block_group: | |||
5216 | goto loop; | 5291 | goto loop; |
5217 | } | 5292 | } |
5218 | 5293 | ||
5219 | cached = block_group_cache_done(block_group); | 5294 | alloc: |
5220 | if (unlikely(!cached)) | ||
5221 | found_uncached_bg = true; | ||
5222 | |||
5223 | if (unlikely(block_group->ro)) | 5295 | if (unlikely(block_group->ro)) |
5224 | goto loop; | 5296 | goto loop; |
5225 | 5297 | ||
5226 | spin_lock(&block_group->free_space_ctl->tree_lock); | 5298 | spin_lock(&block_group->free_space_ctl->tree_lock); |
5227 | if (cached && | 5299 | if (cached && |
5228 | block_group->free_space_ctl->free_space < | 5300 | block_group->free_space_ctl->free_space < |
5229 | num_bytes + empty_size) { | 5301 | num_bytes + empty_cluster + empty_size) { |
5230 | spin_unlock(&block_group->free_space_ctl->tree_lock); | 5302 | spin_unlock(&block_group->free_space_ctl->tree_lock); |
5231 | goto loop; | 5303 | goto loop; |
5232 | } | 5304 | } |
5233 | spin_unlock(&block_group->free_space_ctl->tree_lock); | 5305 | spin_unlock(&block_group->free_space_ctl->tree_lock); |
5234 | 5306 | ||
5235 | /* | 5307 | /* |
5236 | * Ok we want to try and use the cluster allocator, so lets look | 5308 | * Ok we want to try and use the cluster allocator, so |
5237 | * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will | 5309 | * lets look there |
5238 | * have tried the cluster allocator plenty of times at this | ||
5239 | * point and not have found anything, so we are likely way too | ||
5240 | * fragmented for the clustering stuff to find anything, so lets | ||
5241 | * just skip it and let the allocator find whatever block it can | ||
5242 | * find | ||
5243 | */ | 5310 | */ |
5244 | if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { | 5311 | if (last_ptr) { |
5245 | /* | 5312 | /* |
5246 | * the refill lock keeps out other | 5313 | * the refill lock keeps out other |
5247 | * people trying to start a new cluster | 5314 | * people trying to start a new cluster |
5248 | */ | 5315 | */ |
5249 | spin_lock(&last_ptr->refill_lock); | 5316 | spin_lock(&last_ptr->refill_lock); |
5250 | if (last_ptr->block_group && | 5317 | used_block_group = last_ptr->block_group; |
5251 | (last_ptr->block_group->ro || | 5318 | if (used_block_group != block_group && |
5252 | !block_group_bits(last_ptr->block_group, data))) { | 5319 | (!used_block_group || |
5253 | offset = 0; | 5320 | used_block_group->ro || |
5321 | !block_group_bits(used_block_group, data))) { | ||
5322 | used_block_group = block_group; | ||
5254 | goto refill_cluster; | 5323 | goto refill_cluster; |
5255 | } | 5324 | } |
5256 | 5325 | ||
5257 | offset = btrfs_alloc_from_cluster(block_group, last_ptr, | 5326 | if (used_block_group != block_group) |
5258 | num_bytes, search_start); | 5327 | btrfs_get_block_group(used_block_group); |
5328 | |||
5329 | offset = btrfs_alloc_from_cluster(used_block_group, | ||
5330 | last_ptr, num_bytes, used_block_group->key.objectid); | ||
5259 | if (offset) { | 5331 | if (offset) { |
5260 | /* we have a block, we're done */ | 5332 | /* we have a block, we're done */ |
5261 | spin_unlock(&last_ptr->refill_lock); | 5333 | spin_unlock(&last_ptr->refill_lock); |
5262 | goto checks; | 5334 | goto checks; |
5263 | } | 5335 | } |
5264 | 5336 | ||
5265 | spin_lock(&last_ptr->lock); | 5337 | WARN_ON(last_ptr->block_group != used_block_group); |
5266 | /* | 5338 | if (used_block_group != block_group) { |
5267 | * whoops, this cluster doesn't actually point to | 5339 | btrfs_put_block_group(used_block_group); |
5268 | * this block group. Get a ref on the block | 5340 | used_block_group = block_group; |
5269 | * group is does point to and try again | ||
5270 | */ | ||
5271 | if (!last_ptr_loop && last_ptr->block_group && | ||
5272 | last_ptr->block_group != block_group && | ||
5273 | index <= | ||
5274 | get_block_group_index(last_ptr->block_group)) { | ||
5275 | |||
5276 | btrfs_put_block_group(block_group); | ||
5277 | block_group = last_ptr->block_group; | ||
5278 | btrfs_get_block_group(block_group); | ||
5279 | spin_unlock(&last_ptr->lock); | ||
5280 | spin_unlock(&last_ptr->refill_lock); | ||
5281 | |||
5282 | last_ptr_loop = 1; | ||
5283 | search_start = block_group->key.objectid; | ||
5284 | /* | ||
5285 | * we know this block group is properly | ||
5286 | * in the list because | ||
5287 | * btrfs_remove_block_group, drops the | ||
5288 | * cluster before it removes the block | ||
5289 | * group from the list | ||
5290 | */ | ||
5291 | goto have_block_group; | ||
5292 | } | 5341 | } |
5293 | spin_unlock(&last_ptr->lock); | ||
5294 | refill_cluster: | 5342 | refill_cluster: |
5343 | BUG_ON(used_block_group != block_group); | ||
5344 | /* If we are on LOOP_NO_EMPTY_SIZE, we can't | ||
5345 | * set up a new clusters, so lets just skip it | ||
5346 | * and let the allocator find whatever block | ||
5347 | * it can find. If we reach this point, we | ||
5348 | * will have tried the cluster allocator | ||
5349 | * plenty of times and not have found | ||
5350 | * anything, so we are likely way too | ||
5351 | * fragmented for the clustering stuff to find | ||
5352 | * anything. */ | ||
5353 | if (loop >= LOOP_NO_EMPTY_SIZE) { | ||
5354 | spin_unlock(&last_ptr->refill_lock); | ||
5355 | goto unclustered_alloc; | ||
5356 | } | ||
5357 | |||
5295 | /* | 5358 | /* |
5296 | * this cluster didn't work out, free it and | 5359 | * this cluster didn't work out, free it and |
5297 | * start over | 5360 | * start over |
5298 | */ | 5361 | */ |
5299 | btrfs_return_cluster_to_free_space(NULL, last_ptr); | 5362 | btrfs_return_cluster_to_free_space(NULL, last_ptr); |
5300 | 5363 | ||
5301 | last_ptr_loop = 0; | ||
5302 | |||
5303 | /* allocate a cluster in this block group */ | 5364 | /* allocate a cluster in this block group */ |
5304 | ret = btrfs_find_space_cluster(trans, root, | 5365 | ret = btrfs_find_space_cluster(trans, root, |
5305 | block_group, last_ptr, | 5366 | block_group, last_ptr, |
5306 | offset, num_bytes, | 5367 | search_start, num_bytes, |
5307 | empty_cluster + empty_size); | 5368 | empty_cluster + empty_size); |
5308 | if (ret == 0) { | 5369 | if (ret == 0) { |
5309 | /* | 5370 | /* |
@@ -5339,6 +5400,7 @@ refill_cluster: | |||
5339 | goto loop; | 5400 | goto loop; |
5340 | } | 5401 | } |
5341 | 5402 | ||
5403 | unclustered_alloc: | ||
5342 | offset = btrfs_find_space_for_alloc(block_group, search_start, | 5404 | offset = btrfs_find_space_for_alloc(block_group, search_start, |
5343 | num_bytes, empty_size); | 5405 | num_bytes, empty_size); |
5344 | /* | 5406 | /* |
@@ -5365,14 +5427,14 @@ checks: | |||
5365 | search_start = stripe_align(root, offset); | 5427 | search_start = stripe_align(root, offset); |
5366 | /* move on to the next group */ | 5428 | /* move on to the next group */ |
5367 | if (search_start + num_bytes >= search_end) { | 5429 | if (search_start + num_bytes >= search_end) { |
5368 | btrfs_add_free_space(block_group, offset, num_bytes); | 5430 | btrfs_add_free_space(used_block_group, offset, num_bytes); |
5369 | goto loop; | 5431 | goto loop; |
5370 | } | 5432 | } |
5371 | 5433 | ||
5372 | /* move on to the next group */ | 5434 | /* move on to the next group */ |
5373 | if (search_start + num_bytes > | 5435 | if (search_start + num_bytes > |
5374 | block_group->key.objectid + block_group->key.offset) { | 5436 | used_block_group->key.objectid + used_block_group->key.offset) { |
5375 | btrfs_add_free_space(block_group, offset, num_bytes); | 5437 | btrfs_add_free_space(used_block_group, offset, num_bytes); |
5376 | goto loop; | 5438 | goto loop; |
5377 | } | 5439 | } |
5378 | 5440 | ||
@@ -5380,14 +5442,14 @@ checks: | |||
5380 | ins->offset = num_bytes; | 5442 | ins->offset = num_bytes; |
5381 | 5443 | ||
5382 | if (offset < search_start) | 5444 | if (offset < search_start) |
5383 | btrfs_add_free_space(block_group, offset, | 5445 | btrfs_add_free_space(used_block_group, offset, |
5384 | search_start - offset); | 5446 | search_start - offset); |
5385 | BUG_ON(offset > search_start); | 5447 | BUG_ON(offset > search_start); |
5386 | 5448 | ||
5387 | ret = btrfs_update_reserved_bytes(block_group, num_bytes, | 5449 | ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, |
5388 | alloc_type); | 5450 | alloc_type); |
5389 | if (ret == -EAGAIN) { | 5451 | if (ret == -EAGAIN) { |
5390 | btrfs_add_free_space(block_group, offset, num_bytes); | 5452 | btrfs_add_free_space(used_block_group, offset, num_bytes); |
5391 | goto loop; | 5453 | goto loop; |
5392 | } | 5454 | } |
5393 | 5455 | ||
@@ -5396,15 +5458,19 @@ checks: | |||
5396 | ins->offset = num_bytes; | 5458 | ins->offset = num_bytes; |
5397 | 5459 | ||
5398 | if (offset < search_start) | 5460 | if (offset < search_start) |
5399 | btrfs_add_free_space(block_group, offset, | 5461 | btrfs_add_free_space(used_block_group, offset, |
5400 | search_start - offset); | 5462 | search_start - offset); |
5401 | BUG_ON(offset > search_start); | 5463 | BUG_ON(offset > search_start); |
5464 | if (used_block_group != block_group) | ||
5465 | btrfs_put_block_group(used_block_group); | ||
5402 | btrfs_put_block_group(block_group); | 5466 | btrfs_put_block_group(block_group); |
5403 | break; | 5467 | break; |
5404 | loop: | 5468 | loop: |
5405 | failed_cluster_refill = false; | 5469 | failed_cluster_refill = false; |
5406 | failed_alloc = false; | 5470 | failed_alloc = false; |
5407 | BUG_ON(index != get_block_group_index(block_group)); | 5471 | BUG_ON(index != get_block_group_index(block_group)); |
5472 | if (used_block_group != block_group) | ||
5473 | btrfs_put_block_group(used_block_group); | ||
5408 | btrfs_put_block_group(block_group); | 5474 | btrfs_put_block_group(block_group); |
5409 | } | 5475 | } |
5410 | up_read(&space_info->groups_sem); | 5476 | up_read(&space_info->groups_sem); |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d0e7a0..49f3c9dc09f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -935,8 +935,10 @@ again: | |||
935 | node = tree_search(tree, start); | 935 | node = tree_search(tree, start); |
936 | if (!node) { | 936 | if (!node) { |
937 | prealloc = alloc_extent_state_atomic(prealloc); | 937 | prealloc = alloc_extent_state_atomic(prealloc); |
938 | if (!prealloc) | 938 | if (!prealloc) { |
939 | return -ENOMEM; | 939 | err = -ENOMEM; |
940 | goto out; | ||
941 | } | ||
940 | err = insert_state(tree, prealloc, start, end, &bits); | 942 | err = insert_state(tree, prealloc, start, end, &bits); |
941 | prealloc = NULL; | 943 | prealloc = NULL; |
942 | BUG_ON(err == -EEXIST); | 944 | BUG_ON(err == -EEXIST); |
@@ -992,8 +994,10 @@ hit_next: | |||
992 | */ | 994 | */ |
993 | if (state->start < start) { | 995 | if (state->start < start) { |
994 | prealloc = alloc_extent_state_atomic(prealloc); | 996 | prealloc = alloc_extent_state_atomic(prealloc); |
995 | if (!prealloc) | 997 | if (!prealloc) { |
996 | return -ENOMEM; | 998 | err = -ENOMEM; |
999 | goto out; | ||
1000 | } | ||
997 | err = split_state(tree, state, prealloc, start); | 1001 | err = split_state(tree, state, prealloc, start); |
998 | BUG_ON(err == -EEXIST); | 1002 | BUG_ON(err == -EEXIST); |
999 | prealloc = NULL; | 1003 | prealloc = NULL; |
@@ -1024,8 +1028,10 @@ hit_next: | |||
1024 | this_end = last_start - 1; | 1028 | this_end = last_start - 1; |
1025 | 1029 | ||
1026 | prealloc = alloc_extent_state_atomic(prealloc); | 1030 | prealloc = alloc_extent_state_atomic(prealloc); |
1027 | if (!prealloc) | 1031 | if (!prealloc) { |
1028 | return -ENOMEM; | 1032 | err = -ENOMEM; |
1033 | goto out; | ||
1034 | } | ||
1029 | 1035 | ||
1030 | /* | 1036 | /* |
1031 | * Avoid to free 'prealloc' if it can be merged with | 1037 | * Avoid to free 'prealloc' if it can be merged with |
@@ -1051,8 +1057,10 @@ hit_next: | |||
1051 | */ | 1057 | */ |
1052 | if (state->start <= end && state->end > end) { | 1058 | if (state->start <= end && state->end > end) { |
1053 | prealloc = alloc_extent_state_atomic(prealloc); | 1059 | prealloc = alloc_extent_state_atomic(prealloc); |
1054 | if (!prealloc) | 1060 | if (!prealloc) { |
1055 | return -ENOMEM; | 1061 | err = -ENOMEM; |
1062 | goto out; | ||
1063 | } | ||
1056 | 1064 | ||
1057 | err = split_state(tree, state, prealloc, end + 1); | 1065 | err = split_state(tree, state, prealloc, end + 1); |
1058 | BUG_ON(err == -EEXIST); | 1066 | BUG_ON(err == -EEXIST); |
@@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
2285 | clean_io_failure(start, page); | 2293 | clean_io_failure(start, page); |
2286 | } | 2294 | } |
2287 | if (!uptodate) { | 2295 | if (!uptodate) { |
2288 | u64 failed_mirror; | 2296 | int failed_mirror; |
2289 | failed_mirror = (u64)bio->bi_bdev; | 2297 | failed_mirror = (int)(unsigned long)bio->bi_bdev; |
2290 | if (tree->ops && tree->ops->readpage_io_failed_hook) | 2298 | /* |
2291 | ret = tree->ops->readpage_io_failed_hook( | 2299 | * The generic bio_readpage_error handles errors the |
2292 | bio, page, start, end, | 2300 | * following way: If possible, new read requests are |
2293 | failed_mirror, state); | 2301 | * created and submitted and will end up in |
2294 | else | 2302 | * end_bio_extent_readpage as well (if we're lucky, not |
2295 | ret = bio_readpage_error(bio, page, start, end, | 2303 | * in the !uptodate case). In that case it returns 0 and |
2296 | failed_mirror, NULL); | 2304 | * we just go on with the next page in our bio. If it |
2305 | * can't handle the error it will return -EIO and we | ||
2306 | * remain responsible for that page. | ||
2307 | */ | ||
2308 | ret = bio_readpage_error(bio, page, start, end, | ||
2309 | failed_mirror, NULL); | ||
2297 | if (ret == 0) { | 2310 | if (ret == 0) { |
2311 | error_handled: | ||
2298 | uptodate = | 2312 | uptodate = |
2299 | test_bit(BIO_UPTODATE, &bio->bi_flags); | 2313 | test_bit(BIO_UPTODATE, &bio->bi_flags); |
2300 | if (err) | 2314 | if (err) |
@@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
2302 | uncache_state(&cached); | 2316 | uncache_state(&cached); |
2303 | continue; | 2317 | continue; |
2304 | } | 2318 | } |
2319 | if (tree->ops && tree->ops->readpage_io_failed_hook) { | ||
2320 | ret = tree->ops->readpage_io_failed_hook( | ||
2321 | bio, page, start, end, | ||
2322 | failed_mirror, state); | ||
2323 | if (ret == 0) | ||
2324 | goto error_handled; | ||
2325 | } | ||
2305 | } | 2326 | } |
2306 | 2327 | ||
2307 | if (uptodate) { | 2328 | if (uptodate) { |
@@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
3366 | return -ENOMEM; | 3387 | return -ENOMEM; |
3367 | path->leave_spinning = 1; | 3388 | path->leave_spinning = 1; |
3368 | 3389 | ||
3390 | start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); | ||
3391 | len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); | ||
3392 | |||
3369 | /* | 3393 | /* |
3370 | * lookup the last file extent. We're not using i_size here | 3394 | * lookup the last file extent. We're not using i_size here |
3371 | * because there might be preallocation past i_size | 3395 | * because there might be preallocation past i_size |
@@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
3413 | lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, | 3437 | lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, |
3414 | &cached_state, GFP_NOFS); | 3438 | &cached_state, GFP_NOFS); |
3415 | 3439 | ||
3416 | em = get_extent_skip_holes(inode, off, last_for_get_extent, | 3440 | em = get_extent_skip_holes(inode, start, last_for_get_extent, |
3417 | get_extent); | 3441 | get_extent); |
3418 | if (!em) | 3442 | if (!em) |
3419 | goto out; | 3443 | goto out; |
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0e23bc..7604c3001322 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -70,7 +70,7 @@ struct extent_io_ops { | |||
70 | unsigned long bio_flags); | 70 | unsigned long bio_flags); |
71 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); | 71 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); |
72 | int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, | 72 | int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, |
73 | u64 start, u64 end, u64 failed_mirror, | 73 | u64 start, u64 end, int failed_mirror, |
74 | struct extent_state *state); | 74 | struct extent_state *state); |
75 | int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, | 75 | int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, |
76 | u64 start, u64 end, | 76 | u64 start, u64 end, |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dafdfa059bf6..97fbe939c050 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1167 | nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / | 1167 | nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / |
1168 | PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / | 1168 | PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / |
1169 | (sizeof(struct page *))); | 1169 | (sizeof(struct page *))); |
1170 | nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); | ||
1171 | nrptrs = max(nrptrs, 8); | ||
1170 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); | 1172 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); |
1171 | if (!pages) | 1173 | if (!pages) |
1172 | return -ENOMEM; | 1174 | return -ENOMEM; |
@@ -1387,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1387 | goto out; | 1389 | goto out; |
1388 | } | 1390 | } |
1389 | 1391 | ||
1390 | file_update_time(file); | 1392 | err = btrfs_update_time(file); |
1393 | if (err) { | ||
1394 | mutex_unlock(&inode->i_mutex); | ||
1395 | goto out; | ||
1396 | } | ||
1391 | BTRFS_I(inode)->sequence++; | 1397 | BTRFS_I(inode)->sequence++; |
1392 | 1398 | ||
1393 | start_pos = round_down(pos, root->sectorsize); | 1399 | start_pos = round_down(pos, root->sectorsize); |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 181760f9d2ab..ec23d43d0c35 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, | |||
351 | } | 351 | } |
352 | } | 352 | } |
353 | 353 | ||
354 | for (i = 0; i < io_ctl->num_pages; i++) { | ||
355 | clear_page_dirty_for_io(io_ctl->pages[i]); | ||
356 | set_page_extent_mapped(io_ctl->pages[i]); | ||
357 | } | ||
358 | |||
354 | return 0; | 359 | return 0; |
355 | } | 360 | } |
356 | 361 | ||
@@ -1465,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, | |||
1465 | { | 1470 | { |
1466 | info->offset = offset_to_bitmap(ctl, offset); | 1471 | info->offset = offset_to_bitmap(ctl, offset); |
1467 | info->bytes = 0; | 1472 | info->bytes = 0; |
1473 | INIT_LIST_HEAD(&info->list); | ||
1468 | link_free_space(ctl, info); | 1474 | link_free_space(ctl, info); |
1469 | ctl->total_bitmaps++; | 1475 | ctl->total_bitmaps++; |
1470 | 1476 | ||
@@ -1844,7 +1850,13 @@ again: | |||
1844 | info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), | 1850 | info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), |
1845 | 1, 0); | 1851 | 1, 0); |
1846 | if (!info) { | 1852 | if (!info) { |
1847 | WARN_ON(1); | 1853 | /* the tree logging code might be calling us before we |
1854 | * have fully loaded the free space rbtree for this | ||
1855 | * block group. So it is possible the entry won't | ||
1856 | * be in the rbtree yet at all. The caching code | ||
1857 | * will make sure not to put it in the rbtree if | ||
1858 | * the logging code has pinned it. | ||
1859 | */ | ||
1848 | goto out_lock; | 1860 | goto out_lock; |
1849 | } | 1861 | } |
1850 | } | 1862 | } |
@@ -2308,6 +2320,7 @@ again: | |||
2308 | 2320 | ||
2309 | if (!found) { | 2321 | if (!found) { |
2310 | start = i; | 2322 | start = i; |
2323 | cluster->max_size = 0; | ||
2311 | found = true; | 2324 | found = true; |
2312 | } | 2325 | } |
2313 | 2326 | ||
@@ -2451,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | |||
2451 | { | 2464 | { |
2452 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2465 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2453 | struct btrfs_free_space *entry; | 2466 | struct btrfs_free_space *entry; |
2454 | struct rb_node *node; | ||
2455 | int ret = -ENOSPC; | 2467 | int ret = -ENOSPC; |
2468 | u64 bitmap_offset = offset_to_bitmap(ctl, offset); | ||
2456 | 2469 | ||
2457 | if (ctl->total_bitmaps == 0) | 2470 | if (ctl->total_bitmaps == 0) |
2458 | return -ENOSPC; | 2471 | return -ENOSPC; |
2459 | 2472 | ||
2460 | /* | 2473 | /* |
2461 | * First check our cached list of bitmaps and see if there is an entry | 2474 | * The bitmap that covers offset won't be in the list unless offset |
2462 | * here that will work. | 2475 | * is just its start offset. |
2463 | */ | 2476 | */ |
2477 | entry = list_first_entry(bitmaps, struct btrfs_free_space, list); | ||
2478 | if (entry->offset != bitmap_offset) { | ||
2479 | entry = tree_search_offset(ctl, bitmap_offset, 1, 0); | ||
2480 | if (entry && list_empty(&entry->list)) | ||
2481 | list_add(&entry->list, bitmaps); | ||
2482 | } | ||
2483 | |||
2464 | list_for_each_entry(entry, bitmaps, list) { | 2484 | list_for_each_entry(entry, bitmaps, list) { |
2465 | if (entry->bytes < min_bytes) | 2485 | if (entry->bytes < min_bytes) |
2466 | continue; | 2486 | continue; |
@@ -2471,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | |||
2471 | } | 2491 | } |
2472 | 2492 | ||
2473 | /* | 2493 | /* |
2474 | * If we do have entries on our list and we are here then we didn't find | 2494 | * The bitmaps list has all the bitmaps that record free space |
2475 | * anything, so go ahead and get the next entry after the last entry in | 2495 | * starting after offset, so no more search is required. |
2476 | * this list and start the search from there. | ||
2477 | */ | 2496 | */ |
2478 | if (!list_empty(bitmaps)) { | 2497 | return -ENOSPC; |
2479 | entry = list_entry(bitmaps->prev, struct btrfs_free_space, | ||
2480 | list); | ||
2481 | node = rb_next(&entry->offset_index); | ||
2482 | if (!node) | ||
2483 | return -ENOSPC; | ||
2484 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | ||
2485 | goto search; | ||
2486 | } | ||
2487 | |||
2488 | entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); | ||
2489 | if (!entry) | ||
2490 | return -ENOSPC; | ||
2491 | |||
2492 | search: | ||
2493 | node = &entry->offset_index; | ||
2494 | do { | ||
2495 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | ||
2496 | node = rb_next(&entry->offset_index); | ||
2497 | if (!entry->bitmap) | ||
2498 | continue; | ||
2499 | if (entry->bytes < min_bytes) | ||
2500 | continue; | ||
2501 | ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, | ||
2502 | bytes, min_bytes); | ||
2503 | } while (ret && node); | ||
2504 | |||
2505 | return ret; | ||
2506 | } | 2498 | } |
2507 | 2499 | ||
2508 | /* | 2500 | /* |
@@ -2520,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2520 | u64 offset, u64 bytes, u64 empty_size) | 2512 | u64 offset, u64 bytes, u64 empty_size) |
2521 | { | 2513 | { |
2522 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2514 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2523 | struct list_head bitmaps; | ||
2524 | struct btrfs_free_space *entry, *tmp; | 2515 | struct btrfs_free_space *entry, *tmp; |
2516 | LIST_HEAD(bitmaps); | ||
2525 | u64 min_bytes; | 2517 | u64 min_bytes; |
2526 | int ret; | 2518 | int ret; |
2527 | 2519 | ||
@@ -2560,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2560 | goto out; | 2552 | goto out; |
2561 | } | 2553 | } |
2562 | 2554 | ||
2563 | INIT_LIST_HEAD(&bitmaps); | ||
2564 | ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, | 2555 | ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, |
2565 | bytes, min_bytes); | 2556 | bytes, min_bytes); |
2566 | if (ret) | 2557 | if (ret) |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 116ab67a06df..0a6b928813a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/falloc.h> | 38 | #include <linux/falloc.h> |
39 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
40 | #include <linux/ratelimit.h> | 40 | #include <linux/ratelimit.h> |
41 | #include <linux/mount.h> | ||
41 | #include "compat.h" | 42 | #include "compat.h" |
42 | #include "ctree.h" | 43 | #include "ctree.h" |
43 | #include "disk-io.h" | 44 | #include "disk-io.h" |
@@ -2031,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) | |||
2031 | /* insert an orphan item to track this unlinked/truncated file */ | 2032 | /* insert an orphan item to track this unlinked/truncated file */ |
2032 | if (insert >= 1) { | 2033 | if (insert >= 1) { |
2033 | ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); | 2034 | ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); |
2034 | BUG_ON(ret); | 2035 | BUG_ON(ret && ret != -EEXIST); |
2035 | } | 2036 | } |
2036 | 2037 | ||
2037 | /* insert an orphan item to track subvolume contains orphan files */ | 2038 | /* insert an orphan item to track subvolume contains orphan files */ |
@@ -2158,6 +2159,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2158 | if (ret && ret != -ESTALE) | 2159 | if (ret && ret != -ESTALE) |
2159 | goto out; | 2160 | goto out; |
2160 | 2161 | ||
2162 | if (ret == -ESTALE && root == root->fs_info->tree_root) { | ||
2163 | struct btrfs_root *dead_root; | ||
2164 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2165 | int is_dead_root = 0; | ||
2166 | |||
2167 | /* | ||
2168 | * this is an orphan in the tree root. Currently these | ||
2169 | * could come from 2 sources: | ||
2170 | * a) a snapshot deletion in progress | ||
2171 | * b) a free space cache inode | ||
2172 | * We need to distinguish those two, as the snapshot | ||
2173 | * orphan must not get deleted. | ||
2174 | * find_dead_roots already ran before us, so if this | ||
2175 | * is a snapshot deletion, we should find the root | ||
2176 | * in the dead_roots list | ||
2177 | */ | ||
2178 | spin_lock(&fs_info->trans_lock); | ||
2179 | list_for_each_entry(dead_root, &fs_info->dead_roots, | ||
2180 | root_list) { | ||
2181 | if (dead_root->root_key.objectid == | ||
2182 | found_key.objectid) { | ||
2183 | is_dead_root = 1; | ||
2184 | break; | ||
2185 | } | ||
2186 | } | ||
2187 | spin_unlock(&fs_info->trans_lock); | ||
2188 | if (is_dead_root) { | ||
2189 | /* prevent this orphan from being found again */ | ||
2190 | key.offset = found_key.objectid - 1; | ||
2191 | continue; | ||
2192 | } | ||
2193 | } | ||
2161 | /* | 2194 | /* |
2162 | * Inode is already gone but the orphan item is still there, | 2195 | * Inode is already gone but the orphan item is still there, |
2163 | * kill the orphan item. | 2196 | * kill the orphan item. |
@@ -2191,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2191 | continue; | 2224 | continue; |
2192 | } | 2225 | } |
2193 | nr_truncate++; | 2226 | nr_truncate++; |
2227 | /* | ||
2228 | * Need to hold the imutex for reservation purposes, not | ||
2229 | * a huge deal here but I have a WARN_ON in | ||
2230 | * btrfs_delalloc_reserve_space to catch offenders. | ||
2231 | */ | ||
2232 | mutex_lock(&inode->i_mutex); | ||
2194 | ret = btrfs_truncate(inode); | 2233 | ret = btrfs_truncate(inode); |
2234 | mutex_unlock(&inode->i_mutex); | ||
2195 | } else { | 2235 | } else { |
2196 | nr_unlink++; | 2236 | nr_unlink++; |
2197 | } | 2237 | } |
@@ -3327,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3327 | u64 hint_byte = 0; | 3367 | u64 hint_byte = 0; |
3328 | hole_size = last_byte - cur_offset; | 3368 | hole_size = last_byte - cur_offset; |
3329 | 3369 | ||
3330 | trans = btrfs_start_transaction(root, 2); | 3370 | trans = btrfs_start_transaction(root, 3); |
3331 | if (IS_ERR(trans)) { | 3371 | if (IS_ERR(trans)) { |
3332 | err = PTR_ERR(trans); | 3372 | err = PTR_ERR(trans); |
3333 | break; | 3373 | break; |
@@ -3337,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3337 | cur_offset + hole_size, | 3377 | cur_offset + hole_size, |
3338 | &hint_byte, 1); | 3378 | &hint_byte, 1); |
3339 | if (err) { | 3379 | if (err) { |
3380 | btrfs_update_inode(trans, root, inode); | ||
3340 | btrfs_end_transaction(trans, root); | 3381 | btrfs_end_transaction(trans, root); |
3341 | break; | 3382 | break; |
3342 | } | 3383 | } |
@@ -3346,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3346 | 0, hole_size, 0, hole_size, | 3387 | 0, hole_size, 0, hole_size, |
3347 | 0, 0, 0); | 3388 | 0, 0, 0); |
3348 | if (err) { | 3389 | if (err) { |
3390 | btrfs_update_inode(trans, root, inode); | ||
3349 | btrfs_end_transaction(trans, root); | 3391 | btrfs_end_transaction(trans, root); |
3350 | break; | 3392 | break; |
3351 | } | 3393 | } |
@@ -3353,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3353 | btrfs_drop_extent_cache(inode, hole_start, | 3395 | btrfs_drop_extent_cache(inode, hole_start, |
3354 | last_byte - 1, 0); | 3396 | last_byte - 1, 0); |
3355 | 3397 | ||
3398 | btrfs_update_inode(trans, root, inode); | ||
3356 | btrfs_end_transaction(trans, root); | 3399 | btrfs_end_transaction(trans, root); |
3357 | } | 3400 | } |
3358 | free_extent_map(em); | 3401 | free_extent_map(em); |
@@ -3370,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3370 | 3413 | ||
3371 | static int btrfs_setsize(struct inode *inode, loff_t newsize) | 3414 | static int btrfs_setsize(struct inode *inode, loff_t newsize) |
3372 | { | 3415 | { |
3416 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3417 | struct btrfs_trans_handle *trans; | ||
3373 | loff_t oldsize = i_size_read(inode); | 3418 | loff_t oldsize = i_size_read(inode); |
3374 | int ret; | 3419 | int ret; |
3375 | 3420 | ||
@@ -3377,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) | |||
3377 | return 0; | 3422 | return 0; |
3378 | 3423 | ||
3379 | if (newsize > oldsize) { | 3424 | if (newsize > oldsize) { |
3380 | i_size_write(inode, newsize); | ||
3381 | btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); | ||
3382 | truncate_pagecache(inode, oldsize, newsize); | 3425 | truncate_pagecache(inode, oldsize, newsize); |
3383 | ret = btrfs_cont_expand(inode, oldsize, newsize); | 3426 | ret = btrfs_cont_expand(inode, oldsize, newsize); |
3384 | if (ret) { | 3427 | if (ret) |
3385 | btrfs_setsize(inode, oldsize); | ||
3386 | return ret; | 3428 | return ret; |
3387 | } | ||
3388 | 3429 | ||
3389 | mark_inode_dirty(inode); | 3430 | trans = btrfs_start_transaction(root, 1); |
3431 | if (IS_ERR(trans)) | ||
3432 | return PTR_ERR(trans); | ||
3433 | |||
3434 | i_size_write(inode, newsize); | ||
3435 | btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); | ||
3436 | ret = btrfs_update_inode(trans, root, inode); | ||
3437 | btrfs_end_transaction_throttle(trans, root); | ||
3390 | } else { | 3438 | } else { |
3391 | 3439 | ||
3392 | /* | 3440 | /* |
@@ -3426,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
3426 | 3474 | ||
3427 | if (attr->ia_valid) { | 3475 | if (attr->ia_valid) { |
3428 | setattr_copy(inode, attr); | 3476 | setattr_copy(inode, attr); |
3429 | mark_inode_dirty(inode); | 3477 | err = btrfs_dirty_inode(inode); |
3430 | 3478 | ||
3431 | if (attr->ia_valid & ATTR_MODE) | 3479 | if (!err && attr->ia_valid & ATTR_MODE) |
3432 | err = btrfs_acl_chmod(inode); | 3480 | err = btrfs_acl_chmod(inode); |
3433 | } | 3481 | } |
3434 | 3482 | ||
@@ -3490,7 +3538,7 @@ void btrfs_evict_inode(struct inode *inode) | |||
3490 | * doing the truncate. | 3538 | * doing the truncate. |
3491 | */ | 3539 | */ |
3492 | while (1) { | 3540 | while (1) { |
3493 | ret = btrfs_block_rsv_refill(root, rsv, min_size); | 3541 | ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); |
3494 | 3542 | ||
3495 | /* | 3543 | /* |
3496 | * Try and steal from the global reserve since we will | 3544 | * Try and steal from the global reserve since we will |
@@ -4204,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
4204 | * FIXME, needs more benchmarking...there are no reasons other than performance | 4252 | * FIXME, needs more benchmarking...there are no reasons other than performance |
4205 | * to keep or drop this code. | 4253 | * to keep or drop this code. |
4206 | */ | 4254 | */ |
4207 | void btrfs_dirty_inode(struct inode *inode, int flags) | 4255 | int btrfs_dirty_inode(struct inode *inode) |
4208 | { | 4256 | { |
4209 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4257 | struct btrfs_root *root = BTRFS_I(inode)->root; |
4210 | struct btrfs_trans_handle *trans; | 4258 | struct btrfs_trans_handle *trans; |
4211 | int ret; | 4259 | int ret; |
4212 | 4260 | ||
4213 | if (BTRFS_I(inode)->dummy_inode) | 4261 | if (BTRFS_I(inode)->dummy_inode) |
4214 | return; | 4262 | return 0; |
4215 | 4263 | ||
4216 | trans = btrfs_join_transaction(root); | 4264 | trans = btrfs_join_transaction(root); |
4217 | BUG_ON(IS_ERR(trans)); | 4265 | if (IS_ERR(trans)) |
4266 | return PTR_ERR(trans); | ||
4218 | 4267 | ||
4219 | ret = btrfs_update_inode(trans, root, inode); | 4268 | ret = btrfs_update_inode(trans, root, inode); |
4220 | if (ret && ret == -ENOSPC) { | 4269 | if (ret && ret == -ENOSPC) { |
4221 | /* whoops, lets try again with the full transaction */ | 4270 | /* whoops, lets try again with the full transaction */ |
4222 | btrfs_end_transaction(trans, root); | 4271 | btrfs_end_transaction(trans, root); |
4223 | trans = btrfs_start_transaction(root, 1); | 4272 | trans = btrfs_start_transaction(root, 1); |
4224 | if (IS_ERR(trans)) { | 4273 | if (IS_ERR(trans)) |
4225 | printk_ratelimited(KERN_ERR "btrfs: fail to " | 4274 | return PTR_ERR(trans); |
4226 | "dirty inode %llu error %ld\n", | ||
4227 | (unsigned long long)btrfs_ino(inode), | ||
4228 | PTR_ERR(trans)); | ||
4229 | return; | ||
4230 | } | ||
4231 | 4275 | ||
4232 | ret = btrfs_update_inode(trans, root, inode); | 4276 | ret = btrfs_update_inode(trans, root, inode); |
4233 | if (ret) { | ||
4234 | printk_ratelimited(KERN_ERR "btrfs: fail to " | ||
4235 | "dirty inode %llu error %d\n", | ||
4236 | (unsigned long long)btrfs_ino(inode), | ||
4237 | ret); | ||
4238 | } | ||
4239 | } | 4277 | } |
4240 | btrfs_end_transaction(trans, root); | 4278 | btrfs_end_transaction(trans, root); |
4241 | if (BTRFS_I(inode)->delayed_node) | 4279 | if (BTRFS_I(inode)->delayed_node) |
4242 | btrfs_balance_delayed_items(root); | 4280 | btrfs_balance_delayed_items(root); |
4281 | |||
4282 | return ret; | ||
4283 | } | ||
4284 | |||
4285 | /* | ||
4286 | * This is a copy of file_update_time. We need this so we can return error on | ||
4287 | * ENOSPC for updating the inode in the case of file write and mmap writes. | ||
4288 | */ | ||
4289 | int btrfs_update_time(struct file *file) | ||
4290 | { | ||
4291 | struct inode *inode = file->f_path.dentry->d_inode; | ||
4292 | struct timespec now; | ||
4293 | int ret; | ||
4294 | enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; | ||
4295 | |||
4296 | /* First try to exhaust all avenues to not sync */ | ||
4297 | if (IS_NOCMTIME(inode)) | ||
4298 | return 0; | ||
4299 | |||
4300 | now = current_fs_time(inode->i_sb); | ||
4301 | if (!timespec_equal(&inode->i_mtime, &now)) | ||
4302 | sync_it = S_MTIME; | ||
4303 | |||
4304 | if (!timespec_equal(&inode->i_ctime, &now)) | ||
4305 | sync_it |= S_CTIME; | ||
4306 | |||
4307 | if (IS_I_VERSION(inode)) | ||
4308 | sync_it |= S_VERSION; | ||
4309 | |||
4310 | if (!sync_it) | ||
4311 | return 0; | ||
4312 | |||
4313 | /* Finally allowed to write? Takes lock. */ | ||
4314 | if (mnt_want_write_file(file)) | ||
4315 | return 0; | ||
4316 | |||
4317 | /* Only change inode inside the lock region */ | ||
4318 | if (sync_it & S_VERSION) | ||
4319 | inode_inc_iversion(inode); | ||
4320 | if (sync_it & S_CTIME) | ||
4321 | inode->i_ctime = now; | ||
4322 | if (sync_it & S_MTIME) | ||
4323 | inode->i_mtime = now; | ||
4324 | ret = btrfs_dirty_inode(inode); | ||
4325 | if (!ret) | ||
4326 | mark_inode_dirty_sync(inode); | ||
4327 | mnt_drop_write(file->f_path.mnt); | ||
4328 | return ret; | ||
4243 | } | 4329 | } |
4244 | 4330 | ||
4245 | /* | 4331 | /* |
@@ -4555,11 +4641,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
4555 | goto out_unlock; | 4641 | goto out_unlock; |
4556 | } | 4642 | } |
4557 | 4643 | ||
4644 | /* | ||
4645 | * If the active LSM wants to access the inode during | ||
4646 | * d_instantiate it needs these. Smack checks to see | ||
4647 | * if the filesystem supports xattrs by looking at the | ||
4648 | * ops vector. | ||
4649 | */ | ||
4650 | |||
4651 | inode->i_op = &btrfs_special_inode_operations; | ||
4558 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 4652 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
4559 | if (err) | 4653 | if (err) |
4560 | drop_inode = 1; | 4654 | drop_inode = 1; |
4561 | else { | 4655 | else { |
4562 | inode->i_op = &btrfs_special_inode_operations; | ||
4563 | init_special_inode(inode, inode->i_mode, rdev); | 4656 | init_special_inode(inode, inode->i_mode, rdev); |
4564 | btrfs_update_inode(trans, root, inode); | 4657 | btrfs_update_inode(trans, root, inode); |
4565 | } | 4658 | } |
@@ -4613,14 +4706,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
4613 | goto out_unlock; | 4706 | goto out_unlock; |
4614 | } | 4707 | } |
4615 | 4708 | ||
4709 | /* | ||
4710 | * If the active LSM wants to access the inode during | ||
4711 | * d_instantiate it needs these. Smack checks to see | ||
4712 | * if the filesystem supports xattrs by looking at the | ||
4713 | * ops vector. | ||
4714 | */ | ||
4715 | inode->i_fop = &btrfs_file_operations; | ||
4716 | inode->i_op = &btrfs_file_inode_operations; | ||
4717 | |||
4616 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 4718 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
4617 | if (err) | 4719 | if (err) |
4618 | drop_inode = 1; | 4720 | drop_inode = 1; |
4619 | else { | 4721 | else { |
4620 | inode->i_mapping->a_ops = &btrfs_aops; | 4722 | inode->i_mapping->a_ops = &btrfs_aops; |
4621 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | 4723 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; |
4622 | inode->i_fop = &btrfs_file_operations; | ||
4623 | inode->i_op = &btrfs_file_inode_operations; | ||
4624 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 4724 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
4625 | } | 4725 | } |
4626 | out_unlock: | 4726 | out_unlock: |
@@ -6303,7 +6403,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
6303 | u64 page_start; | 6403 | u64 page_start; |
6304 | u64 page_end; | 6404 | u64 page_end; |
6305 | 6405 | ||
6406 | /* Need this to keep space reservations serialized */ | ||
6407 | mutex_lock(&inode->i_mutex); | ||
6306 | ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); | 6408 | ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); |
6409 | mutex_unlock(&inode->i_mutex); | ||
6410 | if (!ret) | ||
6411 | ret = btrfs_update_time(vma->vm_file); | ||
6307 | if (ret) { | 6412 | if (ret) { |
6308 | if (ret == -ENOMEM) | 6413 | if (ret == -ENOMEM) |
6309 | ret = VM_FAULT_OOM; | 6414 | ret = VM_FAULT_OOM; |
@@ -6515,8 +6620,9 @@ static int btrfs_truncate(struct inode *inode) | |||
6515 | /* Just need the 1 for updating the inode */ | 6620 | /* Just need the 1 for updating the inode */ |
6516 | trans = btrfs_start_transaction(root, 1); | 6621 | trans = btrfs_start_transaction(root, 1); |
6517 | if (IS_ERR(trans)) { | 6622 | if (IS_ERR(trans)) { |
6518 | err = PTR_ERR(trans); | 6623 | ret = err = PTR_ERR(trans); |
6519 | goto out; | 6624 | trans = NULL; |
6625 | break; | ||
6520 | } | 6626 | } |
6521 | } | 6627 | } |
6522 | 6628 | ||
@@ -6794,11 +6900,13 @@ static int btrfs_getattr(struct vfsmount *mnt, | |||
6794 | struct dentry *dentry, struct kstat *stat) | 6900 | struct dentry *dentry, struct kstat *stat) |
6795 | { | 6901 | { |
6796 | struct inode *inode = dentry->d_inode; | 6902 | struct inode *inode = dentry->d_inode; |
6903 | u32 blocksize = inode->i_sb->s_blocksize; | ||
6904 | |||
6797 | generic_fillattr(inode, stat); | 6905 | generic_fillattr(inode, stat); |
6798 | stat->dev = BTRFS_I(inode)->root->anon_dev; | 6906 | stat->dev = BTRFS_I(inode)->root->anon_dev; |
6799 | stat->blksize = PAGE_CACHE_SIZE; | 6907 | stat->blksize = PAGE_CACHE_SIZE; |
6800 | stat->blocks = (inode_get_bytes(inode) + | 6908 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + |
6801 | BTRFS_I(inode)->delalloc_bytes) >> 9; | 6909 | ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; |
6802 | return 0; | 6910 | return 0; |
6803 | } | 6911 | } |
6804 | 6912 | ||
@@ -7074,14 +7182,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
7074 | goto out_unlock; | 7182 | goto out_unlock; |
7075 | } | 7183 | } |
7076 | 7184 | ||
7185 | /* | ||
7186 | * If the active LSM wants to access the inode during | ||
7187 | * d_instantiate it needs these. Smack checks to see | ||
7188 | * if the filesystem supports xattrs by looking at the | ||
7189 | * ops vector. | ||
7190 | */ | ||
7191 | inode->i_fop = &btrfs_file_operations; | ||
7192 | inode->i_op = &btrfs_file_inode_operations; | ||
7193 | |||
7077 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 7194 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
7078 | if (err) | 7195 | if (err) |
7079 | drop_inode = 1; | 7196 | drop_inode = 1; |
7080 | else { | 7197 | else { |
7081 | inode->i_mapping->a_ops = &btrfs_aops; | 7198 | inode->i_mapping->a_ops = &btrfs_aops; |
7082 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | 7199 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; |
7083 | inode->i_fop = &btrfs_file_operations; | ||
7084 | inode->i_op = &btrfs_file_inode_operations; | ||
7085 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 7200 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
7086 | } | 7201 | } |
7087 | if (drop_inode) | 7202 | if (drop_inode) |
@@ -7351,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { | |||
7351 | .follow_link = page_follow_link_light, | 7466 | .follow_link = page_follow_link_light, |
7352 | .put_link = page_put_link, | 7467 | .put_link = page_put_link, |
7353 | .getattr = btrfs_getattr, | 7468 | .getattr = btrfs_getattr, |
7469 | .setattr = btrfs_setattr, | ||
7354 | .permission = btrfs_permission, | 7470 | .permission = btrfs_permission, |
7355 | .setxattr = btrfs_setxattr, | 7471 | .setxattr = btrfs_setxattr, |
7356 | .getxattr = btrfs_getxattr, | 7472 | .getxattr = btrfs_getxattr, |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c472f126..c04f02c7d5bb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) | |||
252 | trans = btrfs_join_transaction(root); | 252 | trans = btrfs_join_transaction(root); |
253 | BUG_ON(IS_ERR(trans)); | 253 | BUG_ON(IS_ERR(trans)); |
254 | 254 | ||
255 | btrfs_update_iflags(inode); | ||
256 | inode->i_ctime = CURRENT_TIME; | ||
255 | ret = btrfs_update_inode(trans, root, inode); | 257 | ret = btrfs_update_inode(trans, root, inode); |
256 | BUG_ON(ret); | 258 | BUG_ON(ret); |
257 | 259 | ||
258 | btrfs_update_iflags(inode); | ||
259 | inode->i_ctime = CURRENT_TIME; | ||
260 | btrfs_end_transaction(trans, root); | 260 | btrfs_end_transaction(trans, root); |
261 | 261 | ||
262 | mnt_drop_write(file->f_path.mnt); | 262 | mnt_drop_write(file->f_path.mnt); |
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode, | |||
858 | return 0; | 858 | return 0; |
859 | file_end = (isize - 1) >> PAGE_CACHE_SHIFT; | 859 | file_end = (isize - 1) >> PAGE_CACHE_SHIFT; |
860 | 860 | ||
861 | mutex_lock(&inode->i_mutex); | ||
861 | ret = btrfs_delalloc_reserve_space(inode, | 862 | ret = btrfs_delalloc_reserve_space(inode, |
862 | num_pages << PAGE_CACHE_SHIFT); | 863 | num_pages << PAGE_CACHE_SHIFT); |
864 | mutex_unlock(&inode->i_mutex); | ||
863 | if (ret) | 865 | if (ret) |
864 | return ret; | 866 | return ret; |
865 | again: | 867 | again: |
@@ -1216,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1216 | *devstr = '\0'; | 1218 | *devstr = '\0'; |
1217 | devstr = vol_args->name; | 1219 | devstr = vol_args->name; |
1218 | devid = simple_strtoull(devstr, &end, 10); | 1220 | devid = simple_strtoull(devstr, &end, 10); |
1219 | printk(KERN_INFO "resizing devid %llu\n", | 1221 | printk(KERN_INFO "btrfs: resizing devid %llu\n", |
1220 | (unsigned long long)devid); | 1222 | (unsigned long long)devid); |
1221 | } | 1223 | } |
1222 | device = btrfs_find_device(root, devid, NULL, NULL); | 1224 | device = btrfs_find_device(root, devid, NULL, NULL); |
1223 | if (!device) { | 1225 | if (!device) { |
1224 | printk(KERN_INFO "resizer unable to find device %llu\n", | 1226 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
1225 | (unsigned long long)devid); | 1227 | (unsigned long long)devid); |
1226 | ret = -EINVAL; | 1228 | ret = -EINVAL; |
1227 | goto out_unlock; | 1229 | goto out_unlock; |
@@ -1267,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1267 | do_div(new_size, root->sectorsize); | 1269 | do_div(new_size, root->sectorsize); |
1268 | new_size *= root->sectorsize; | 1270 | new_size *= root->sectorsize; |
1269 | 1271 | ||
1270 | printk(KERN_INFO "new size for %s is %llu\n", | 1272 | printk(KERN_INFO "btrfs: new size for %s is %llu\n", |
1271 | device->name, (unsigned long long)new_size); | 1273 | device->name, (unsigned long long)new_size); |
1272 | 1274 | ||
1273 | if (new_size > old_size) { | 1275 | if (new_size > old_size) { |
@@ -1278,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1278 | } | 1280 | } |
1279 | ret = btrfs_grow_device(trans, device, new_size); | 1281 | ret = btrfs_grow_device(trans, device, new_size); |
1280 | btrfs_commit_transaction(trans, root); | 1282 | btrfs_commit_transaction(trans, root); |
1281 | } else { | 1283 | } else if (new_size < old_size) { |
1282 | ret = btrfs_shrink_device(device, new_size); | 1284 | ret = btrfs_shrink_device(device, new_size); |
1283 | } | 1285 | } |
1284 | 1286 | ||
@@ -2930,11 +2932,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) | |||
2930 | goto out; | 2932 | goto out; |
2931 | 2933 | ||
2932 | for (i = 0; i < ipath->fspath->elem_cnt; ++i) { | 2934 | for (i = 0; i < ipath->fspath->elem_cnt; ++i) { |
2933 | rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; | 2935 | rel_ptr = ipath->fspath->val[i] - |
2936 | (u64)(unsigned long)ipath->fspath->val; | ||
2934 | ipath->fspath->val[i] = rel_ptr; | 2937 | ipath->fspath->val[i] = rel_ptr; |
2935 | } | 2938 | } |
2936 | 2939 | ||
2937 | ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); | 2940 | ret = copy_to_user((void *)(unsigned long)ipa->fspath, |
2941 | (void *)(unsigned long)ipath->fspath, size); | ||
2938 | if (ret) { | 2942 | if (ret) { |
2939 | ret = -EFAULT; | 2943 | ret = -EFAULT; |
2940 | goto out; | 2944 | goto out; |
@@ -3017,7 +3021,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, | |||
3017 | if (ret < 0) | 3021 | if (ret < 0) |
3018 | goto out; | 3022 | goto out; |
3019 | 3023 | ||
3020 | ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); | 3024 | ret = copy_to_user((void *)(unsigned long)loi->inodes, |
3025 | (void *)(unsigned long)inodes, size); | ||
3021 | if (ret) | 3026 | if (ret) |
3022 | ret = -EFAULT; | 3027 | ret = -EFAULT; |
3023 | 3028 | ||
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dff29d5e151a..cfb55434a469 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; | 2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; |
2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; | 2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; |
2949 | while (index <= last_index) { | 2949 | while (index <= last_index) { |
2950 | mutex_lock(&inode->i_mutex); | ||
2950 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); | 2951 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); |
2952 | mutex_unlock(&inode->i_mutex); | ||
2951 | if (ret) | 2953 | if (ret) |
2952 | goto out; | 2954 | goto out; |
2953 | 2955 | ||
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4190f22edfb..ddf2c90d3fc0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) | |||
256 | btrfs_release_path(swarn->path); | 256 | btrfs_release_path(swarn->path); |
257 | 257 | ||
258 | ipath = init_ipath(4096, local_root, swarn->path); | 258 | ipath = init_ipath(4096, local_root, swarn->path); |
259 | if (IS_ERR(ipath)) { | ||
260 | ret = PTR_ERR(ipath); | ||
261 | ipath = NULL; | ||
262 | goto err; | ||
263 | } | ||
259 | ret = paths_from_inode(inum, ipath); | 264 | ret = paths_from_inode(inum, ipath); |
260 | 265 | ||
261 | if (ret < 0) | 266 | if (ret < 0) |
@@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) | |||
272 | swarn->logical, swarn->dev->name, | 277 | swarn->logical, swarn->dev->name, |
273 | (unsigned long long)swarn->sector, root, inum, offset, | 278 | (unsigned long long)swarn->sector, root, inum, offset, |
274 | min(isize - offset, (u64)PAGE_SIZE), nlink, | 279 | min(isize - offset, (u64)PAGE_SIZE), nlink, |
275 | (char *)ipath->fspath->val[i]); | 280 | (char *)(unsigned long)ipath->fspath->val[i]); |
276 | 281 | ||
277 | free_ipath(ipath); | 282 | free_ipath(ipath); |
278 | return 0; | 283 | return 0; |
@@ -1530,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) | |||
1530 | static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) | 1535 | static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) |
1531 | { | 1536 | { |
1532 | struct btrfs_fs_info *fs_info = root->fs_info; | 1537 | struct btrfs_fs_info *fs_info = root->fs_info; |
1538 | int ret = 0; | ||
1533 | 1539 | ||
1534 | mutex_lock(&fs_info->scrub_lock); | 1540 | mutex_lock(&fs_info->scrub_lock); |
1535 | if (fs_info->scrub_workers_refcnt == 0) { | 1541 | if (fs_info->scrub_workers_refcnt == 0) { |
1536 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", | 1542 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", |
1537 | fs_info->thread_pool_size, &fs_info->generic_worker); | 1543 | fs_info->thread_pool_size, &fs_info->generic_worker); |
1538 | fs_info->scrub_workers.idle_thresh = 4; | 1544 | fs_info->scrub_workers.idle_thresh = 4; |
1539 | btrfs_start_workers(&fs_info->scrub_workers, 1); | 1545 | ret = btrfs_start_workers(&fs_info->scrub_workers); |
1546 | if (ret) | ||
1547 | goto out; | ||
1540 | } | 1548 | } |
1541 | ++fs_info->scrub_workers_refcnt; | 1549 | ++fs_info->scrub_workers_refcnt; |
1550 | out: | ||
1542 | mutex_unlock(&fs_info->scrub_lock); | 1551 | mutex_unlock(&fs_info->scrub_lock); |
1543 | 1552 | ||
1544 | return 0; | 1553 | return ret; |
1545 | } | 1554 | } |
1546 | 1555 | ||
1547 | static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) | 1556 | static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8bd9d6d0e07a..200f63bc6675 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/cleancache.h> | 42 | #include <linux/cleancache.h> |
43 | #include <linux/mnt_namespace.h> | 43 | #include <linux/mnt_namespace.h> |
44 | #include <linux/ratelimit.h> | ||
44 | #include "compat.h" | 45 | #include "compat.h" |
45 | #include "delayed-inode.h" | 46 | #include "delayed-inode.h" |
46 | #include "ctree.h" | 47 | #include "ctree.h" |
@@ -825,13 +826,9 @@ static char *setup_root_args(char *args) | |||
825 | static struct dentry *mount_subvol(const char *subvol_name, int flags, | 826 | static struct dentry *mount_subvol(const char *subvol_name, int flags, |
826 | const char *device_name, char *data) | 827 | const char *device_name, char *data) |
827 | { | 828 | { |
828 | struct super_block *s; | ||
829 | struct dentry *root; | 829 | struct dentry *root; |
830 | struct vfsmount *mnt; | 830 | struct vfsmount *mnt; |
831 | struct mnt_namespace *ns_private; | ||
832 | char *newargs; | 831 | char *newargs; |
833 | struct path path; | ||
834 | int error; | ||
835 | 832 | ||
836 | newargs = setup_root_args(data); | 833 | newargs = setup_root_args(data); |
837 | if (!newargs) | 834 | if (!newargs) |
@@ -842,39 +839,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, | |||
842 | if (IS_ERR(mnt)) | 839 | if (IS_ERR(mnt)) |
843 | return ERR_CAST(mnt); | 840 | return ERR_CAST(mnt); |
844 | 841 | ||
845 | ns_private = create_mnt_ns(mnt); | 842 | root = mount_subtree(mnt, subvol_name); |
846 | if (IS_ERR(ns_private)) { | ||
847 | mntput(mnt); | ||
848 | return ERR_CAST(ns_private); | ||
849 | } | ||
850 | 843 | ||
851 | /* | 844 | if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { |
852 | * This will trigger the automount of the subvol so we can just | 845 | struct super_block *s = root->d_sb; |
853 | * drop the mnt we have here and return the dentry that we | 846 | dput(root); |
854 | * found. | 847 | root = ERR_PTR(-EINVAL); |
855 | */ | 848 | deactivate_locked_super(s); |
856 | error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, | ||
857 | LOOKUP_FOLLOW, &path); | ||
858 | put_mnt_ns(ns_private); | ||
859 | if (error) | ||
860 | return ERR_PTR(error); | ||
861 | |||
862 | if (!is_subvolume_inode(path.dentry->d_inode)) { | ||
863 | path_put(&path); | ||
864 | mntput(mnt); | ||
865 | error = -EINVAL; | ||
866 | printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", | 849 | printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", |
867 | subvol_name); | 850 | subvol_name); |
868 | return ERR_PTR(-EINVAL); | ||
869 | } | 851 | } |
870 | 852 | ||
871 | /* Get a ref to the sb and the dentry we found and return it */ | ||
872 | s = path.mnt->mnt_sb; | ||
873 | atomic_inc(&s->s_active); | ||
874 | root = dget(path.dentry); | ||
875 | path_put(&path); | ||
876 | down_write(&s->s_umount); | ||
877 | |||
878 | return root; | 853 | return root; |
879 | } | 854 | } |
880 | 855 | ||
@@ -1079,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1079 | u64 avail_space; | 1054 | u64 avail_space; |
1080 | u64 used_space; | 1055 | u64 used_space; |
1081 | u64 min_stripe_size; | 1056 | u64 min_stripe_size; |
1082 | int min_stripes = 1; | 1057 | int min_stripes = 1, num_stripes = 1; |
1083 | int i = 0, nr_devices; | 1058 | int i = 0, nr_devices; |
1084 | int ret; | 1059 | int ret; |
1085 | 1060 | ||
1086 | nr_devices = fs_info->fs_devices->rw_devices; | 1061 | nr_devices = fs_info->fs_devices->open_devices; |
1087 | BUG_ON(!nr_devices); | 1062 | BUG_ON(!nr_devices); |
1088 | 1063 | ||
1089 | devices_info = kmalloc(sizeof(*devices_info) * nr_devices, | 1064 | devices_info = kmalloc(sizeof(*devices_info) * nr_devices, |
@@ -1093,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1093 | 1068 | ||
1094 | /* calc min stripe number for data space alloction */ | 1069 | /* calc min stripe number for data space alloction */ |
1095 | type = btrfs_get_alloc_profile(root, 1); | 1070 | type = btrfs_get_alloc_profile(root, 1); |
1096 | if (type & BTRFS_BLOCK_GROUP_RAID0) | 1071 | if (type & BTRFS_BLOCK_GROUP_RAID0) { |
1097 | min_stripes = 2; | 1072 | min_stripes = 2; |
1098 | else if (type & BTRFS_BLOCK_GROUP_RAID1) | 1073 | num_stripes = nr_devices; |
1074 | } else if (type & BTRFS_BLOCK_GROUP_RAID1) { | ||
1099 | min_stripes = 2; | 1075 | min_stripes = 2; |
1100 | else if (type & BTRFS_BLOCK_GROUP_RAID10) | 1076 | num_stripes = 2; |
1077 | } else if (type & BTRFS_BLOCK_GROUP_RAID10) { | ||
1101 | min_stripes = 4; | 1078 | min_stripes = 4; |
1079 | num_stripes = 4; | ||
1080 | } | ||
1102 | 1081 | ||
1103 | if (type & BTRFS_BLOCK_GROUP_DUP) | 1082 | if (type & BTRFS_BLOCK_GROUP_DUP) |
1104 | min_stripe_size = 2 * BTRFS_STRIPE_LEN; | 1083 | min_stripe_size = 2 * BTRFS_STRIPE_LEN; |
1105 | else | 1084 | else |
1106 | min_stripe_size = BTRFS_STRIPE_LEN; | 1085 | min_stripe_size = BTRFS_STRIPE_LEN; |
1107 | 1086 | ||
1108 | list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { | 1087 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
1109 | if (!device->in_fs_metadata) | 1088 | if (!device->in_fs_metadata || !device->bdev) |
1110 | continue; | 1089 | continue; |
1111 | 1090 | ||
1112 | avail_space = device->total_bytes - device->bytes_used; | 1091 | avail_space = device->total_bytes - device->bytes_used; |
@@ -1167,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1167 | i = nr_devices - 1; | 1146 | i = nr_devices - 1; |
1168 | avail_space = 0; | 1147 | avail_space = 0; |
1169 | while (nr_devices >= min_stripes) { | 1148 | while (nr_devices >= min_stripes) { |
1149 | if (num_stripes > nr_devices) | ||
1150 | num_stripes = nr_devices; | ||
1151 | |||
1170 | if (devices_info[i].max_avail >= min_stripe_size) { | 1152 | if (devices_info[i].max_avail >= min_stripe_size) { |
1171 | int j; | 1153 | int j; |
1172 | u64 alloc_size; | 1154 | u64 alloc_size; |
1173 | 1155 | ||
1174 | avail_space += devices_info[i].max_avail * min_stripes; | 1156 | avail_space += devices_info[i].max_avail * num_stripes; |
1175 | alloc_size = devices_info[i].max_avail; | 1157 | alloc_size = devices_info[i].max_avail; |
1176 | for (j = i + 1 - min_stripes; j <= i; j++) | 1158 | for (j = i + 1 - num_stripes; j <= i; j++) |
1177 | devices_info[j].max_avail -= alloc_size; | 1159 | devices_info[j].max_avail -= alloc_size; |
1178 | } | 1160 | } |
1179 | i--; | 1161 | i--; |
@@ -1290,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb) | |||
1290 | return 0; | 1272 | return 0; |
1291 | } | 1273 | } |
1292 | 1274 | ||
1275 | static void btrfs_fs_dirty_inode(struct inode *inode, int flags) | ||
1276 | { | ||
1277 | int ret; | ||
1278 | |||
1279 | ret = btrfs_dirty_inode(inode); | ||
1280 | if (ret) | ||
1281 | printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu " | ||
1282 | "error %d\n", btrfs_ino(inode), ret); | ||
1283 | } | ||
1284 | |||
1293 | static const struct super_operations btrfs_super_ops = { | 1285 | static const struct super_operations btrfs_super_ops = { |
1294 | .drop_inode = btrfs_drop_inode, | 1286 | .drop_inode = btrfs_drop_inode, |
1295 | .evict_inode = btrfs_evict_inode, | 1287 | .evict_inode = btrfs_evict_inode, |
@@ -1297,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = { | |||
1297 | .sync_fs = btrfs_sync_fs, | 1289 | .sync_fs = btrfs_sync_fs, |
1298 | .show_options = btrfs_show_options, | 1290 | .show_options = btrfs_show_options, |
1299 | .write_inode = btrfs_write_inode, | 1291 | .write_inode = btrfs_write_inode, |
1300 | .dirty_inode = btrfs_dirty_inode, | 1292 | .dirty_inode = btrfs_fs_dirty_inode, |
1301 | .alloc_inode = btrfs_alloc_inode, | 1293 | .alloc_inode = btrfs_alloc_inode, |
1302 | .destroy_inode = btrfs_destroy_inode, | 1294 | .destroy_inode = btrfs_destroy_inode, |
1303 | .statfs = btrfs_statfs, | 1295 | .statfs = btrfs_statfs, |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6a0574e923bc..81376d94cd3c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, | |||
785 | 785 | ||
786 | btrfs_save_ino_cache(root, trans); | 786 | btrfs_save_ino_cache(root, trans); |
787 | 787 | ||
788 | /* see comments in should_cow_block() */ | ||
789 | root->force_cow = 0; | ||
790 | smp_wmb(); | ||
791 | |||
788 | if (root->commit_root != root->node) { | 792 | if (root->commit_root != root->node) { |
789 | mutex_lock(&root->fs_commit_mutex); | 793 | mutex_lock(&root->fs_commit_mutex); |
790 | switch_commit_root(root); | 794 | switch_commit_root(root); |
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
947 | btrfs_tree_unlock(old); | 951 | btrfs_tree_unlock(old); |
948 | free_extent_buffer(old); | 952 | free_extent_buffer(old); |
949 | 953 | ||
954 | /* see comments in should_cow_block() */ | ||
955 | root->force_cow = 1; | ||
956 | smp_wmb(); | ||
957 | |||
950 | btrfs_set_root_node(new_root_item, tmp); | 958 | btrfs_set_root_node(new_root_item, tmp); |
951 | /* record when the snapshot was created in key.offset */ | 959 | /* record when the snapshot was created in key.offset */ |
952 | key.offset = trans->transid; | 960 | key.offset = trans->transid; |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c37433d3cd82..f4b839fd3c9d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -295,6 +295,12 @@ loop_lock: | |||
295 | btrfs_requeue_work(&device->work); | 295 | btrfs_requeue_work(&device->work); |
296 | goto done; | 296 | goto done; |
297 | } | 297 | } |
298 | /* unplug every 64 requests just for good measure */ | ||
299 | if (batch_run % 64 == 0) { | ||
300 | blk_finish_plug(&plug); | ||
301 | blk_start_plug(&plug); | ||
302 | sync_pending = 0; | ||
303 | } | ||
298 | } | 304 | } |
299 | 305 | ||
300 | cond_resched(); | 306 | cond_resched(); |
@@ -1611,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1611 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) | 1617 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) |
1612 | return -EINVAL; | 1618 | return -EINVAL; |
1613 | 1619 | ||
1614 | bdev = blkdev_get_by_path(device_path, FMODE_EXCL, | 1620 | bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, |
1615 | root->fs_info->bdev_holder); | 1621 | root->fs_info->bdev_holder); |
1616 | if (IS_ERR(bdev)) | 1622 | if (IS_ERR(bdev)) |
1617 | return PTR_ERR(bdev); | 1623 | return PTR_ERR(bdev); |
@@ -3258,7 +3264,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
3258 | */ | 3264 | */ |
3259 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 3265 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
3260 | err = -EIO; | 3266 | err = -EIO; |
3261 | } else if (err) { | 3267 | } else { |
3262 | /* | 3268 | /* |
3263 | * this bio is actually up to date, we didn't | 3269 | * this bio is actually up to date, we didn't |
3264 | * go over the max number of errors | 3270 | * go over the max number of errors |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c49f352..78f2d4d4f37f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -100,6 +100,12 @@ struct btrfs_device { | |||
100 | struct reada_zone *reada_curr_zone; | 100 | struct reada_zone *reada_curr_zone; |
101 | struct radix_tree_root reada_zones; | 101 | struct radix_tree_root reada_zones; |
102 | struct radix_tree_root reada_extents; | 102 | struct radix_tree_root reada_extents; |
103 | |||
104 | /* for sending down flush barriers */ | ||
105 | struct bio *flush_bio; | ||
106 | struct completion flush_wait; | ||
107 | int nobarriers; | ||
108 | |||
103 | }; | 109 | }; |
104 | 110 | ||
105 | struct btrfs_fs_devices { | 111 | struct btrfs_fs_devices { |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4144caf2f9d3..173b1d22e59b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) | |||
87 | snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); | 87 | snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); |
88 | 88 | ||
89 | /* dirty the head */ | 89 | /* dirty the head */ |
90 | spin_lock(&inode->i_lock); | 90 | spin_lock(&ci->i_ceph_lock); |
91 | if (ci->i_head_snapc == NULL) | 91 | if (ci->i_head_snapc == NULL) |
92 | ci->i_head_snapc = ceph_get_snap_context(snapc); | 92 | ci->i_head_snapc = ceph_get_snap_context(snapc); |
93 | ++ci->i_wrbuffer_ref_head; | 93 | ++ci->i_wrbuffer_ref_head; |
@@ -100,7 +100,7 @@ static int ceph_set_page_dirty(struct page *page) | |||
100 | ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, | 100 | ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, |
101 | ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, | 101 | ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, |
102 | snapc, snapc->seq, snapc->num_snaps); | 102 | snapc, snapc->seq, snapc->num_snaps); |
103 | spin_unlock(&inode->i_lock); | 103 | spin_unlock(&ci->i_ceph_lock); |
104 | 104 | ||
105 | /* now adjust page */ | 105 | /* now adjust page */ |
106 | spin_lock_irq(&mapping->tree_lock); | 106 | spin_lock_irq(&mapping->tree_lock); |
@@ -391,7 +391,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, | |||
391 | struct ceph_snap_context *snapc = NULL; | 391 | struct ceph_snap_context *snapc = NULL; |
392 | struct ceph_cap_snap *capsnap = NULL; | 392 | struct ceph_cap_snap *capsnap = NULL; |
393 | 393 | ||
394 | spin_lock(&inode->i_lock); | 394 | spin_lock(&ci->i_ceph_lock); |
395 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { | 395 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { |
396 | dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, | 396 | dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, |
397 | capsnap->context, capsnap->dirty_pages); | 397 | capsnap->context, capsnap->dirty_pages); |
@@ -407,7 +407,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, | |||
407 | dout(" head snapc %p has %d dirty pages\n", | 407 | dout(" head snapc %p has %d dirty pages\n", |
408 | snapc, ci->i_wrbuffer_ref_head); | 408 | snapc, ci->i_wrbuffer_ref_head); |
409 | } | 409 | } |
410 | spin_unlock(&inode->i_lock); | 410 | spin_unlock(&ci->i_ceph_lock); |
411 | return snapc; | 411 | return snapc; |
412 | } | 412 | } |
413 | 413 | ||
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0f327c6c9679..8b53193e4f7c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -309,7 +309,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, | |||
309 | /* | 309 | /* |
310 | * Find ceph_cap for given mds, if any. | 310 | * Find ceph_cap for given mds, if any. |
311 | * | 311 | * |
312 | * Called with i_lock held. | 312 | * Called with i_ceph_lock held. |
313 | */ | 313 | */ |
314 | static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) | 314 | static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) |
315 | { | 315 | { |
@@ -332,9 +332,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) | |||
332 | { | 332 | { |
333 | struct ceph_cap *cap; | 333 | struct ceph_cap *cap; |
334 | 334 | ||
335 | spin_lock(&ci->vfs_inode.i_lock); | 335 | spin_lock(&ci->i_ceph_lock); |
336 | cap = __get_cap_for_mds(ci, mds); | 336 | cap = __get_cap_for_mds(ci, mds); |
337 | spin_unlock(&ci->vfs_inode.i_lock); | 337 | spin_unlock(&ci->i_ceph_lock); |
338 | return cap; | 338 | return cap; |
339 | } | 339 | } |
340 | 340 | ||
@@ -361,15 +361,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci) | |||
361 | 361 | ||
362 | int ceph_get_cap_mds(struct inode *inode) | 362 | int ceph_get_cap_mds(struct inode *inode) |
363 | { | 363 | { |
364 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
364 | int mds; | 365 | int mds; |
365 | spin_lock(&inode->i_lock); | 366 | spin_lock(&ci->i_ceph_lock); |
366 | mds = __ceph_get_cap_mds(ceph_inode(inode)); | 367 | mds = __ceph_get_cap_mds(ceph_inode(inode)); |
367 | spin_unlock(&inode->i_lock); | 368 | spin_unlock(&ci->i_ceph_lock); |
368 | return mds; | 369 | return mds; |
369 | } | 370 | } |
370 | 371 | ||
371 | /* | 372 | /* |
372 | * Called under i_lock. | 373 | * Called under i_ceph_lock. |
373 | */ | 374 | */ |
374 | static void __insert_cap_node(struct ceph_inode_info *ci, | 375 | static void __insert_cap_node(struct ceph_inode_info *ci, |
375 | struct ceph_cap *new) | 376 | struct ceph_cap *new) |
@@ -415,7 +416,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, | |||
415 | * | 416 | * |
416 | * If I_FLUSH is set, leave the inode at the front of the list. | 417 | * If I_FLUSH is set, leave the inode at the front of the list. |
417 | * | 418 | * |
418 | * Caller holds i_lock | 419 | * Caller holds i_ceph_lock |
419 | * -> we take mdsc->cap_delay_lock | 420 | * -> we take mdsc->cap_delay_lock |
420 | */ | 421 | */ |
421 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, | 422 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, |
@@ -457,7 +458,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, | |||
457 | /* | 458 | /* |
458 | * Cancel delayed work on cap. | 459 | * Cancel delayed work on cap. |
459 | * | 460 | * |
460 | * Caller must hold i_lock. | 461 | * Caller must hold i_ceph_lock. |
461 | */ | 462 | */ |
462 | static void __cap_delay_cancel(struct ceph_mds_client *mdsc, | 463 | static void __cap_delay_cancel(struct ceph_mds_client *mdsc, |
463 | struct ceph_inode_info *ci) | 464 | struct ceph_inode_info *ci) |
@@ -532,14 +533,14 @@ int ceph_add_cap(struct inode *inode, | |||
532 | wanted |= ceph_caps_for_mode(fmode); | 533 | wanted |= ceph_caps_for_mode(fmode); |
533 | 534 | ||
534 | retry: | 535 | retry: |
535 | spin_lock(&inode->i_lock); | 536 | spin_lock(&ci->i_ceph_lock); |
536 | cap = __get_cap_for_mds(ci, mds); | 537 | cap = __get_cap_for_mds(ci, mds); |
537 | if (!cap) { | 538 | if (!cap) { |
538 | if (new_cap) { | 539 | if (new_cap) { |
539 | cap = new_cap; | 540 | cap = new_cap; |
540 | new_cap = NULL; | 541 | new_cap = NULL; |
541 | } else { | 542 | } else { |
542 | spin_unlock(&inode->i_lock); | 543 | spin_unlock(&ci->i_ceph_lock); |
543 | new_cap = get_cap(mdsc, caps_reservation); | 544 | new_cap = get_cap(mdsc, caps_reservation); |
544 | if (new_cap == NULL) | 545 | if (new_cap == NULL) |
545 | return -ENOMEM; | 546 | return -ENOMEM; |
@@ -625,7 +626,7 @@ retry: | |||
625 | 626 | ||
626 | if (fmode >= 0) | 627 | if (fmode >= 0) |
627 | __ceph_get_fmode(ci, fmode); | 628 | __ceph_get_fmode(ci, fmode); |
628 | spin_unlock(&inode->i_lock); | 629 | spin_unlock(&ci->i_ceph_lock); |
629 | wake_up_all(&ci->i_cap_wq); | 630 | wake_up_all(&ci->i_cap_wq); |
630 | return 0; | 631 | return 0; |
631 | } | 632 | } |
@@ -792,7 +793,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) | |||
792 | struct rb_node *p; | 793 | struct rb_node *p; |
793 | int ret = 0; | 794 | int ret = 0; |
794 | 795 | ||
795 | spin_lock(&inode->i_lock); | 796 | spin_lock(&ci->i_ceph_lock); |
796 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 797 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
797 | cap = rb_entry(p, struct ceph_cap, ci_node); | 798 | cap = rb_entry(p, struct ceph_cap, ci_node); |
798 | if (__cap_is_valid(cap) && | 799 | if (__cap_is_valid(cap) && |
@@ -801,7 +802,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) | |||
801 | break; | 802 | break; |
802 | } | 803 | } |
803 | } | 804 | } |
804 | spin_unlock(&inode->i_lock); | 805 | spin_unlock(&ci->i_ceph_lock); |
805 | dout("ceph_caps_revoking %p %s = %d\n", inode, | 806 | dout("ceph_caps_revoking %p %s = %d\n", inode, |
806 | ceph_cap_string(mask), ret); | 807 | ceph_cap_string(mask), ret); |
807 | return ret; | 808 | return ret; |
@@ -855,7 +856,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) | |||
855 | } | 856 | } |
856 | 857 | ||
857 | /* | 858 | /* |
858 | * called under i_lock | 859 | * called under i_ceph_lock |
859 | */ | 860 | */ |
860 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) | 861 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) |
861 | { | 862 | { |
@@ -865,7 +866,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) | |||
865 | /* | 866 | /* |
866 | * Remove a cap. Take steps to deal with a racing iterate_session_caps. | 867 | * Remove a cap. Take steps to deal with a racing iterate_session_caps. |
867 | * | 868 | * |
868 | * caller should hold i_lock. | 869 | * caller should hold i_ceph_lock. |
869 | * caller will not hold session s_mutex if called from destroy_inode. | 870 | * caller will not hold session s_mutex if called from destroy_inode. |
870 | */ | 871 | */ |
871 | void __ceph_remove_cap(struct ceph_cap *cap) | 872 | void __ceph_remove_cap(struct ceph_cap *cap) |
@@ -1028,7 +1029,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, | |||
1028 | 1029 | ||
1029 | /* | 1030 | /* |
1030 | * Queue cap releases when an inode is dropped from our cache. Since | 1031 | * Queue cap releases when an inode is dropped from our cache. Since |
1031 | * inode is about to be destroyed, there is no need for i_lock. | 1032 | * inode is about to be destroyed, there is no need for i_ceph_lock. |
1032 | */ | 1033 | */ |
1033 | void ceph_queue_caps_release(struct inode *inode) | 1034 | void ceph_queue_caps_release(struct inode *inode) |
1034 | { | 1035 | { |
@@ -1049,7 +1050,7 @@ void ceph_queue_caps_release(struct inode *inode) | |||
1049 | 1050 | ||
1050 | /* | 1051 | /* |
1051 | * Send a cap msg on the given inode. Update our caps state, then | 1052 | * Send a cap msg on the given inode. Update our caps state, then |
1052 | * drop i_lock and send the message. | 1053 | * drop i_ceph_lock and send the message. |
1053 | * | 1054 | * |
1054 | * Make note of max_size reported/requested from mds, revoked caps | 1055 | * Make note of max_size reported/requested from mds, revoked caps |
1055 | * that have now been implemented. | 1056 | * that have now been implemented. |
@@ -1061,13 +1062,13 @@ void ceph_queue_caps_release(struct inode *inode) | |||
1061 | * Return non-zero if delayed release, or we experienced an error | 1062 | * Return non-zero if delayed release, or we experienced an error |
1062 | * such that the caller should requeue + retry later. | 1063 | * such that the caller should requeue + retry later. |
1063 | * | 1064 | * |
1064 | * called with i_lock, then drops it. | 1065 | * called with i_ceph_lock, then drops it. |
1065 | * caller should hold snap_rwsem (read), s_mutex. | 1066 | * caller should hold snap_rwsem (read), s_mutex. |
1066 | */ | 1067 | */ |
1067 | static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | 1068 | static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, |
1068 | int op, int used, int want, int retain, int flushing, | 1069 | int op, int used, int want, int retain, int flushing, |
1069 | unsigned *pflush_tid) | 1070 | unsigned *pflush_tid) |
1070 | __releases(cap->ci->vfs_inode->i_lock) | 1071 | __releases(cap->ci->i_ceph_lock) |
1071 | { | 1072 | { |
1072 | struct ceph_inode_info *ci = cap->ci; | 1073 | struct ceph_inode_info *ci = cap->ci; |
1073 | struct inode *inode = &ci->vfs_inode; | 1074 | struct inode *inode = &ci->vfs_inode; |
@@ -1170,7 +1171,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
1170 | xattr_version = ci->i_xattrs.version; | 1171 | xattr_version = ci->i_xattrs.version; |
1171 | } | 1172 | } |
1172 | 1173 | ||
1173 | spin_unlock(&inode->i_lock); | 1174 | spin_unlock(&ci->i_ceph_lock); |
1174 | 1175 | ||
1175 | ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, | 1176 | ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, |
1176 | op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, | 1177 | op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, |
@@ -1198,13 +1199,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
1198 | * Unless @again is true, skip cap_snaps that were already sent to | 1199 | * Unless @again is true, skip cap_snaps that were already sent to |
1199 | * the MDS (i.e., during this session). | 1200 | * the MDS (i.e., during this session). |
1200 | * | 1201 | * |
1201 | * Called under i_lock. Takes s_mutex as needed. | 1202 | * Called under i_ceph_lock. Takes s_mutex as needed. |
1202 | */ | 1203 | */ |
1203 | void __ceph_flush_snaps(struct ceph_inode_info *ci, | 1204 | void __ceph_flush_snaps(struct ceph_inode_info *ci, |
1204 | struct ceph_mds_session **psession, | 1205 | struct ceph_mds_session **psession, |
1205 | int again) | 1206 | int again) |
1206 | __releases(ci->vfs_inode->i_lock) | 1207 | __releases(ci->i_ceph_lock) |
1207 | __acquires(ci->vfs_inode->i_lock) | 1208 | __acquires(ci->i_ceph_lock) |
1208 | { | 1209 | { |
1209 | struct inode *inode = &ci->vfs_inode; | 1210 | struct inode *inode = &ci->vfs_inode; |
1210 | int mds; | 1211 | int mds; |
@@ -1261,7 +1262,7 @@ retry: | |||
1261 | session = NULL; | 1262 | session = NULL; |
1262 | } | 1263 | } |
1263 | if (!session) { | 1264 | if (!session) { |
1264 | spin_unlock(&inode->i_lock); | 1265 | spin_unlock(&ci->i_ceph_lock); |
1265 | mutex_lock(&mdsc->mutex); | 1266 | mutex_lock(&mdsc->mutex); |
1266 | session = __ceph_lookup_mds_session(mdsc, mds); | 1267 | session = __ceph_lookup_mds_session(mdsc, mds); |
1267 | mutex_unlock(&mdsc->mutex); | 1268 | mutex_unlock(&mdsc->mutex); |
@@ -1275,7 +1276,7 @@ retry: | |||
1275 | * deletion or migration. retry, and we'll | 1276 | * deletion or migration. retry, and we'll |
1276 | * get a better @mds value next time. | 1277 | * get a better @mds value next time. |
1277 | */ | 1278 | */ |
1278 | spin_lock(&inode->i_lock); | 1279 | spin_lock(&ci->i_ceph_lock); |
1279 | goto retry; | 1280 | goto retry; |
1280 | } | 1281 | } |
1281 | 1282 | ||
@@ -1285,7 +1286,7 @@ retry: | |||
1285 | list_del_init(&capsnap->flushing_item); | 1286 | list_del_init(&capsnap->flushing_item); |
1286 | list_add_tail(&capsnap->flushing_item, | 1287 | list_add_tail(&capsnap->flushing_item, |
1287 | &session->s_cap_snaps_flushing); | 1288 | &session->s_cap_snaps_flushing); |
1288 | spin_unlock(&inode->i_lock); | 1289 | spin_unlock(&ci->i_ceph_lock); |
1289 | 1290 | ||
1290 | dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", | 1291 | dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", |
1291 | inode, capsnap, capsnap->follows, capsnap->flush_tid); | 1292 | inode, capsnap, capsnap->follows, capsnap->flush_tid); |
@@ -1302,7 +1303,7 @@ retry: | |||
1302 | next_follows = capsnap->follows + 1; | 1303 | next_follows = capsnap->follows + 1; |
1303 | ceph_put_cap_snap(capsnap); | 1304 | ceph_put_cap_snap(capsnap); |
1304 | 1305 | ||
1305 | spin_lock(&inode->i_lock); | 1306 | spin_lock(&ci->i_ceph_lock); |
1306 | goto retry; | 1307 | goto retry; |
1307 | } | 1308 | } |
1308 | 1309 | ||
@@ -1322,11 +1323,9 @@ out: | |||
1322 | 1323 | ||
1323 | static void ceph_flush_snaps(struct ceph_inode_info *ci) | 1324 | static void ceph_flush_snaps(struct ceph_inode_info *ci) |
1324 | { | 1325 | { |
1325 | struct inode *inode = &ci->vfs_inode; | 1326 | spin_lock(&ci->i_ceph_lock); |
1326 | |||
1327 | spin_lock(&inode->i_lock); | ||
1328 | __ceph_flush_snaps(ci, NULL, 0); | 1327 | __ceph_flush_snaps(ci, NULL, 0); |
1329 | spin_unlock(&inode->i_lock); | 1328 | spin_unlock(&ci->i_ceph_lock); |
1330 | } | 1329 | } |
1331 | 1330 | ||
1332 | /* | 1331 | /* |
@@ -1373,7 +1372,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
1373 | * Add dirty inode to the flushing list. Assigned a seq number so we | 1372 | * Add dirty inode to the flushing list. Assigned a seq number so we |
1374 | * can wait for caps to flush without starving. | 1373 | * can wait for caps to flush without starving. |
1375 | * | 1374 | * |
1376 | * Called under i_lock. | 1375 | * Called under i_ceph_lock. |
1377 | */ | 1376 | */ |
1378 | static int __mark_caps_flushing(struct inode *inode, | 1377 | static int __mark_caps_flushing(struct inode *inode, |
1379 | struct ceph_mds_session *session) | 1378 | struct ceph_mds_session *session) |
@@ -1421,9 +1420,9 @@ static int try_nonblocking_invalidate(struct inode *inode) | |||
1421 | struct ceph_inode_info *ci = ceph_inode(inode); | 1420 | struct ceph_inode_info *ci = ceph_inode(inode); |
1422 | u32 invalidating_gen = ci->i_rdcache_gen; | 1421 | u32 invalidating_gen = ci->i_rdcache_gen; |
1423 | 1422 | ||
1424 | spin_unlock(&inode->i_lock); | 1423 | spin_unlock(&ci->i_ceph_lock); |
1425 | invalidate_mapping_pages(&inode->i_data, 0, -1); | 1424 | invalidate_mapping_pages(&inode->i_data, 0, -1); |
1426 | spin_lock(&inode->i_lock); | 1425 | spin_lock(&ci->i_ceph_lock); |
1427 | 1426 | ||
1428 | if (inode->i_data.nrpages == 0 && | 1427 | if (inode->i_data.nrpages == 0 && |
1429 | invalidating_gen == ci->i_rdcache_gen) { | 1428 | invalidating_gen == ci->i_rdcache_gen) { |
@@ -1470,7 +1469,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, | |||
1470 | if (mdsc->stopping) | 1469 | if (mdsc->stopping) |
1471 | is_delayed = 1; | 1470 | is_delayed = 1; |
1472 | 1471 | ||
1473 | spin_lock(&inode->i_lock); | 1472 | spin_lock(&ci->i_ceph_lock); |
1474 | 1473 | ||
1475 | if (ci->i_ceph_flags & CEPH_I_FLUSH) | 1474 | if (ci->i_ceph_flags & CEPH_I_FLUSH) |
1476 | flags |= CHECK_CAPS_FLUSH; | 1475 | flags |= CHECK_CAPS_FLUSH; |
@@ -1480,7 +1479,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, | |||
1480 | __ceph_flush_snaps(ci, &session, 0); | 1479 | __ceph_flush_snaps(ci, &session, 0); |
1481 | goto retry_locked; | 1480 | goto retry_locked; |
1482 | retry: | 1481 | retry: |
1483 | spin_lock(&inode->i_lock); | 1482 | spin_lock(&ci->i_ceph_lock); |
1484 | retry_locked: | 1483 | retry_locked: |
1485 | file_wanted = __ceph_caps_file_wanted(ci); | 1484 | file_wanted = __ceph_caps_file_wanted(ci); |
1486 | used = __ceph_caps_used(ci); | 1485 | used = __ceph_caps_used(ci); |
@@ -1634,7 +1633,7 @@ ack: | |||
1634 | if (mutex_trylock(&session->s_mutex) == 0) { | 1633 | if (mutex_trylock(&session->s_mutex) == 0) { |
1635 | dout("inverting session/ino locks on %p\n", | 1634 | dout("inverting session/ino locks on %p\n", |
1636 | session); | 1635 | session); |
1637 | spin_unlock(&inode->i_lock); | 1636 | spin_unlock(&ci->i_ceph_lock); |
1638 | if (took_snap_rwsem) { | 1637 | if (took_snap_rwsem) { |
1639 | up_read(&mdsc->snap_rwsem); | 1638 | up_read(&mdsc->snap_rwsem); |
1640 | took_snap_rwsem = 0; | 1639 | took_snap_rwsem = 0; |
@@ -1648,7 +1647,7 @@ ack: | |||
1648 | if (down_read_trylock(&mdsc->snap_rwsem) == 0) { | 1647 | if (down_read_trylock(&mdsc->snap_rwsem) == 0) { |
1649 | dout("inverting snap/in locks on %p\n", | 1648 | dout("inverting snap/in locks on %p\n", |
1650 | inode); | 1649 | inode); |
1651 | spin_unlock(&inode->i_lock); | 1650 | spin_unlock(&ci->i_ceph_lock); |
1652 | down_read(&mdsc->snap_rwsem); | 1651 | down_read(&mdsc->snap_rwsem); |
1653 | took_snap_rwsem = 1; | 1652 | took_snap_rwsem = 1; |
1654 | goto retry; | 1653 | goto retry; |
@@ -1664,10 +1663,10 @@ ack: | |||
1664 | mds = cap->mds; /* remember mds, so we don't repeat */ | 1663 | mds = cap->mds; /* remember mds, so we don't repeat */ |
1665 | sent++; | 1664 | sent++; |
1666 | 1665 | ||
1667 | /* __send_cap drops i_lock */ | 1666 | /* __send_cap drops i_ceph_lock */ |
1668 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, | 1667 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, |
1669 | retain, flushing, NULL); | 1668 | retain, flushing, NULL); |
1670 | goto retry; /* retake i_lock and restart our cap scan. */ | 1669 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
1671 | } | 1670 | } |
1672 | 1671 | ||
1673 | /* | 1672 | /* |
@@ -1681,7 +1680,7 @@ ack: | |||
1681 | else if (!is_delayed || force_requeue) | 1680 | else if (!is_delayed || force_requeue) |
1682 | __cap_delay_requeue(mdsc, ci); | 1681 | __cap_delay_requeue(mdsc, ci); |
1683 | 1682 | ||
1684 | spin_unlock(&inode->i_lock); | 1683 | spin_unlock(&ci->i_ceph_lock); |
1685 | 1684 | ||
1686 | if (queue_invalidate) | 1685 | if (queue_invalidate) |
1687 | ceph_queue_invalidate(inode); | 1686 | ceph_queue_invalidate(inode); |
@@ -1704,7 +1703,7 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, | |||
1704 | int flushing = 0; | 1703 | int flushing = 0; |
1705 | 1704 | ||
1706 | retry: | 1705 | retry: |
1707 | spin_lock(&inode->i_lock); | 1706 | spin_lock(&ci->i_ceph_lock); |
1708 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | 1707 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
1709 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); | 1708 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); |
1710 | goto out; | 1709 | goto out; |
@@ -1716,7 +1715,7 @@ retry: | |||
1716 | int delayed; | 1715 | int delayed; |
1717 | 1716 | ||
1718 | if (!session) { | 1717 | if (!session) { |
1719 | spin_unlock(&inode->i_lock); | 1718 | spin_unlock(&ci->i_ceph_lock); |
1720 | session = cap->session; | 1719 | session = cap->session; |
1721 | mutex_lock(&session->s_mutex); | 1720 | mutex_lock(&session->s_mutex); |
1722 | goto retry; | 1721 | goto retry; |
@@ -1727,18 +1726,18 @@ retry: | |||
1727 | 1726 | ||
1728 | flushing = __mark_caps_flushing(inode, session); | 1727 | flushing = __mark_caps_flushing(inode, session); |
1729 | 1728 | ||
1730 | /* __send_cap drops i_lock */ | 1729 | /* __send_cap drops i_ceph_lock */ |
1731 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, | 1730 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, |
1732 | cap->issued | cap->implemented, flushing, | 1731 | cap->issued | cap->implemented, flushing, |
1733 | flush_tid); | 1732 | flush_tid); |
1734 | if (!delayed) | 1733 | if (!delayed) |
1735 | goto out_unlocked; | 1734 | goto out_unlocked; |
1736 | 1735 | ||
1737 | spin_lock(&inode->i_lock); | 1736 | spin_lock(&ci->i_ceph_lock); |
1738 | __cap_delay_requeue(mdsc, ci); | 1737 | __cap_delay_requeue(mdsc, ci); |
1739 | } | 1738 | } |
1740 | out: | 1739 | out: |
1741 | spin_unlock(&inode->i_lock); | 1740 | spin_unlock(&ci->i_ceph_lock); |
1742 | out_unlocked: | 1741 | out_unlocked: |
1743 | if (session && unlock_session) | 1742 | if (session && unlock_session) |
1744 | mutex_unlock(&session->s_mutex); | 1743 | mutex_unlock(&session->s_mutex); |
@@ -1753,7 +1752,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) | |||
1753 | struct ceph_inode_info *ci = ceph_inode(inode); | 1752 | struct ceph_inode_info *ci = ceph_inode(inode); |
1754 | int i, ret = 1; | 1753 | int i, ret = 1; |
1755 | 1754 | ||
1756 | spin_lock(&inode->i_lock); | 1755 | spin_lock(&ci->i_ceph_lock); |
1757 | for (i = 0; i < CEPH_CAP_BITS; i++) | 1756 | for (i = 0; i < CEPH_CAP_BITS; i++) |
1758 | if ((ci->i_flushing_caps & (1 << i)) && | 1757 | if ((ci->i_flushing_caps & (1 << i)) && |
1759 | ci->i_cap_flush_tid[i] <= tid) { | 1758 | ci->i_cap_flush_tid[i] <= tid) { |
@@ -1761,7 +1760,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) | |||
1761 | ret = 0; | 1760 | ret = 0; |
1762 | break; | 1761 | break; |
1763 | } | 1762 | } |
1764 | spin_unlock(&inode->i_lock); | 1763 | spin_unlock(&ci->i_ceph_lock); |
1765 | return ret; | 1764 | return ret; |
1766 | } | 1765 | } |
1767 | 1766 | ||
@@ -1868,10 +1867,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
1868 | struct ceph_mds_client *mdsc = | 1867 | struct ceph_mds_client *mdsc = |
1869 | ceph_sb_to_client(inode->i_sb)->mdsc; | 1868 | ceph_sb_to_client(inode->i_sb)->mdsc; |
1870 | 1869 | ||
1871 | spin_lock(&inode->i_lock); | 1870 | spin_lock(&ci->i_ceph_lock); |
1872 | if (__ceph_caps_dirty(ci)) | 1871 | if (__ceph_caps_dirty(ci)) |
1873 | __cap_delay_requeue_front(mdsc, ci); | 1872 | __cap_delay_requeue_front(mdsc, ci); |
1874 | spin_unlock(&inode->i_lock); | 1873 | spin_unlock(&ci->i_ceph_lock); |
1875 | } | 1874 | } |
1876 | return err; | 1875 | return err; |
1877 | } | 1876 | } |
@@ -1894,7 +1893,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, | |||
1894 | struct inode *inode = &ci->vfs_inode; | 1893 | struct inode *inode = &ci->vfs_inode; |
1895 | struct ceph_cap *cap; | 1894 | struct ceph_cap *cap; |
1896 | 1895 | ||
1897 | spin_lock(&inode->i_lock); | 1896 | spin_lock(&ci->i_ceph_lock); |
1898 | cap = ci->i_auth_cap; | 1897 | cap = ci->i_auth_cap; |
1899 | if (cap && cap->session == session) { | 1898 | if (cap && cap->session == session) { |
1900 | dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, | 1899 | dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, |
@@ -1904,7 +1903,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, | |||
1904 | pr_err("%p auth cap %p not mds%d ???\n", inode, | 1903 | pr_err("%p auth cap %p not mds%d ???\n", inode, |
1905 | cap, session->s_mds); | 1904 | cap, session->s_mds); |
1906 | } | 1905 | } |
1907 | spin_unlock(&inode->i_lock); | 1906 | spin_unlock(&ci->i_ceph_lock); |
1908 | } | 1907 | } |
1909 | } | 1908 | } |
1910 | 1909 | ||
@@ -1921,7 +1920,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | |||
1921 | struct ceph_cap *cap; | 1920 | struct ceph_cap *cap; |
1922 | int delayed = 0; | 1921 | int delayed = 0; |
1923 | 1922 | ||
1924 | spin_lock(&inode->i_lock); | 1923 | spin_lock(&ci->i_ceph_lock); |
1925 | cap = ci->i_auth_cap; | 1924 | cap = ci->i_auth_cap; |
1926 | if (cap && cap->session == session) { | 1925 | if (cap && cap->session == session) { |
1927 | dout("kick_flushing_caps %p cap %p %s\n", inode, | 1926 | dout("kick_flushing_caps %p cap %p %s\n", inode, |
@@ -1932,14 +1931,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | |||
1932 | cap->issued | cap->implemented, | 1931 | cap->issued | cap->implemented, |
1933 | ci->i_flushing_caps, NULL); | 1932 | ci->i_flushing_caps, NULL); |
1934 | if (delayed) { | 1933 | if (delayed) { |
1935 | spin_lock(&inode->i_lock); | 1934 | spin_lock(&ci->i_ceph_lock); |
1936 | __cap_delay_requeue(mdsc, ci); | 1935 | __cap_delay_requeue(mdsc, ci); |
1937 | spin_unlock(&inode->i_lock); | 1936 | spin_unlock(&ci->i_ceph_lock); |
1938 | } | 1937 | } |
1939 | } else { | 1938 | } else { |
1940 | pr_err("%p auth cap %p not mds%d ???\n", inode, | 1939 | pr_err("%p auth cap %p not mds%d ???\n", inode, |
1941 | cap, session->s_mds); | 1940 | cap, session->s_mds); |
1942 | spin_unlock(&inode->i_lock); | 1941 | spin_unlock(&ci->i_ceph_lock); |
1943 | } | 1942 | } |
1944 | } | 1943 | } |
1945 | } | 1944 | } |
@@ -1952,7 +1951,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
1952 | struct ceph_cap *cap; | 1951 | struct ceph_cap *cap; |
1953 | int delayed = 0; | 1952 | int delayed = 0; |
1954 | 1953 | ||
1955 | spin_lock(&inode->i_lock); | 1954 | spin_lock(&ci->i_ceph_lock); |
1956 | cap = ci->i_auth_cap; | 1955 | cap = ci->i_auth_cap; |
1957 | dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, | 1956 | dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, |
1958 | ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); | 1957 | ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); |
@@ -1964,12 +1963,12 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
1964 | cap->issued | cap->implemented, | 1963 | cap->issued | cap->implemented, |
1965 | ci->i_flushing_caps, NULL); | 1964 | ci->i_flushing_caps, NULL); |
1966 | if (delayed) { | 1965 | if (delayed) { |
1967 | spin_lock(&inode->i_lock); | 1966 | spin_lock(&ci->i_ceph_lock); |
1968 | __cap_delay_requeue(mdsc, ci); | 1967 | __cap_delay_requeue(mdsc, ci); |
1969 | spin_unlock(&inode->i_lock); | 1968 | spin_unlock(&ci->i_ceph_lock); |
1970 | } | 1969 | } |
1971 | } else { | 1970 | } else { |
1972 | spin_unlock(&inode->i_lock); | 1971 | spin_unlock(&ci->i_ceph_lock); |
1973 | } | 1972 | } |
1974 | } | 1973 | } |
1975 | 1974 | ||
@@ -1978,7 +1977,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
1978 | * Take references to capabilities we hold, so that we don't release | 1977 | * Take references to capabilities we hold, so that we don't release |
1979 | * them to the MDS prematurely. | 1978 | * them to the MDS prematurely. |
1980 | * | 1979 | * |
1981 | * Protected by i_lock. | 1980 | * Protected by i_ceph_lock. |
1982 | */ | 1981 | */ |
1983 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) | 1982 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) |
1984 | { | 1983 | { |
@@ -2016,7 +2015,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2016 | 2015 | ||
2017 | dout("get_cap_refs %p need %s want %s\n", inode, | 2016 | dout("get_cap_refs %p need %s want %s\n", inode, |
2018 | ceph_cap_string(need), ceph_cap_string(want)); | 2017 | ceph_cap_string(need), ceph_cap_string(want)); |
2019 | spin_lock(&inode->i_lock); | 2018 | spin_lock(&ci->i_ceph_lock); |
2020 | 2019 | ||
2021 | /* make sure file is actually open */ | 2020 | /* make sure file is actually open */ |
2022 | file_wanted = __ceph_caps_file_wanted(ci); | 2021 | file_wanted = __ceph_caps_file_wanted(ci); |
@@ -2077,7 +2076,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2077 | ceph_cap_string(have), ceph_cap_string(need)); | 2076 | ceph_cap_string(have), ceph_cap_string(need)); |
2078 | } | 2077 | } |
2079 | out: | 2078 | out: |
2080 | spin_unlock(&inode->i_lock); | 2079 | spin_unlock(&ci->i_ceph_lock); |
2081 | dout("get_cap_refs %p ret %d got %s\n", inode, | 2080 | dout("get_cap_refs %p ret %d got %s\n", inode, |
2082 | ret, ceph_cap_string(*got)); | 2081 | ret, ceph_cap_string(*got)); |
2083 | return ret; | 2082 | return ret; |
@@ -2094,7 +2093,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
2094 | int check = 0; | 2093 | int check = 0; |
2095 | 2094 | ||
2096 | /* do we need to explicitly request a larger max_size? */ | 2095 | /* do we need to explicitly request a larger max_size? */ |
2097 | spin_lock(&inode->i_lock); | 2096 | spin_lock(&ci->i_ceph_lock); |
2098 | if ((endoff >= ci->i_max_size || | 2097 | if ((endoff >= ci->i_max_size || |
2099 | endoff > (inode->i_size << 1)) && | 2098 | endoff > (inode->i_size << 1)) && |
2100 | endoff > ci->i_wanted_max_size) { | 2099 | endoff > ci->i_wanted_max_size) { |
@@ -2103,7 +2102,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
2103 | ci->i_wanted_max_size = endoff; | 2102 | ci->i_wanted_max_size = endoff; |
2104 | check = 1; | 2103 | check = 1; |
2105 | } | 2104 | } |
2106 | spin_unlock(&inode->i_lock); | 2105 | spin_unlock(&ci->i_ceph_lock); |
2107 | if (check) | 2106 | if (check) |
2108 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2107 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
2109 | } | 2108 | } |
@@ -2140,9 +2139,9 @@ retry: | |||
2140 | */ | 2139 | */ |
2141 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) | 2140 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) |
2142 | { | 2141 | { |
2143 | spin_lock(&ci->vfs_inode.i_lock); | 2142 | spin_lock(&ci->i_ceph_lock); |
2144 | __take_cap_refs(ci, caps); | 2143 | __take_cap_refs(ci, caps); |
2145 | spin_unlock(&ci->vfs_inode.i_lock); | 2144 | spin_unlock(&ci->i_ceph_lock); |
2146 | } | 2145 | } |
2147 | 2146 | ||
2148 | /* | 2147 | /* |
@@ -2160,7 +2159,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
2160 | int last = 0, put = 0, flushsnaps = 0, wake = 0; | 2159 | int last = 0, put = 0, flushsnaps = 0, wake = 0; |
2161 | struct ceph_cap_snap *capsnap; | 2160 | struct ceph_cap_snap *capsnap; |
2162 | 2161 | ||
2163 | spin_lock(&inode->i_lock); | 2162 | spin_lock(&ci->i_ceph_lock); |
2164 | if (had & CEPH_CAP_PIN) | 2163 | if (had & CEPH_CAP_PIN) |
2165 | --ci->i_pin_ref; | 2164 | --ci->i_pin_ref; |
2166 | if (had & CEPH_CAP_FILE_RD) | 2165 | if (had & CEPH_CAP_FILE_RD) |
@@ -2193,7 +2192,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
2193 | } | 2192 | } |
2194 | } | 2193 | } |
2195 | } | 2194 | } |
2196 | spin_unlock(&inode->i_lock); | 2195 | spin_unlock(&ci->i_ceph_lock); |
2197 | 2196 | ||
2198 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), | 2197 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), |
2199 | last ? " last" : "", put ? " put" : ""); | 2198 | last ? " last" : "", put ? " put" : ""); |
@@ -2225,7 +2224,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
2225 | int found = 0; | 2224 | int found = 0; |
2226 | struct ceph_cap_snap *capsnap = NULL; | 2225 | struct ceph_cap_snap *capsnap = NULL; |
2227 | 2226 | ||
2228 | spin_lock(&inode->i_lock); | 2227 | spin_lock(&ci->i_ceph_lock); |
2229 | ci->i_wrbuffer_ref -= nr; | 2228 | ci->i_wrbuffer_ref -= nr; |
2230 | last = !ci->i_wrbuffer_ref; | 2229 | last = !ci->i_wrbuffer_ref; |
2231 | 2230 | ||
@@ -2274,7 +2273,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
2274 | } | 2273 | } |
2275 | } | 2274 | } |
2276 | 2275 | ||
2277 | spin_unlock(&inode->i_lock); | 2276 | spin_unlock(&ci->i_ceph_lock); |
2278 | 2277 | ||
2279 | if (last) { | 2278 | if (last) { |
2280 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2279 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
@@ -2291,7 +2290,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
2291 | * Handle a cap GRANT message from the MDS. (Note that a GRANT may | 2290 | * Handle a cap GRANT message from the MDS. (Note that a GRANT may |
2292 | * actually be a revocation if it specifies a smaller cap set.) | 2291 | * actually be a revocation if it specifies a smaller cap set.) |
2293 | * | 2292 | * |
2294 | * caller holds s_mutex and i_lock, we drop both. | 2293 | * caller holds s_mutex and i_ceph_lock, we drop both. |
2295 | * | 2294 | * |
2296 | * return value: | 2295 | * return value: |
2297 | * 0 - ok | 2296 | * 0 - ok |
@@ -2302,7 +2301,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
2302 | struct ceph_mds_session *session, | 2301 | struct ceph_mds_session *session, |
2303 | struct ceph_cap *cap, | 2302 | struct ceph_cap *cap, |
2304 | struct ceph_buffer *xattr_buf) | 2303 | struct ceph_buffer *xattr_buf) |
2305 | __releases(inode->i_lock) | 2304 | __releases(ci->i_ceph_lock) |
2306 | { | 2305 | { |
2307 | struct ceph_inode_info *ci = ceph_inode(inode); | 2306 | struct ceph_inode_info *ci = ceph_inode(inode); |
2308 | int mds = session->s_mds; | 2307 | int mds = session->s_mds; |
@@ -2453,7 +2452,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
2453 | } | 2452 | } |
2454 | BUG_ON(cap->issued & ~cap->implemented); | 2453 | BUG_ON(cap->issued & ~cap->implemented); |
2455 | 2454 | ||
2456 | spin_unlock(&inode->i_lock); | 2455 | spin_unlock(&ci->i_ceph_lock); |
2457 | if (writeback) | 2456 | if (writeback) |
2458 | /* | 2457 | /* |
2459 | * queue inode for writeback: we can't actually call | 2458 | * queue inode for writeback: we can't actually call |
@@ -2483,7 +2482,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2483 | struct ceph_mds_caps *m, | 2482 | struct ceph_mds_caps *m, |
2484 | struct ceph_mds_session *session, | 2483 | struct ceph_mds_session *session, |
2485 | struct ceph_cap *cap) | 2484 | struct ceph_cap *cap) |
2486 | __releases(inode->i_lock) | 2485 | __releases(ci->i_ceph_lock) |
2487 | { | 2486 | { |
2488 | struct ceph_inode_info *ci = ceph_inode(inode); | 2487 | struct ceph_inode_info *ci = ceph_inode(inode); |
2489 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 2488 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
@@ -2539,7 +2538,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2539 | wake_up_all(&ci->i_cap_wq); | 2538 | wake_up_all(&ci->i_cap_wq); |
2540 | 2539 | ||
2541 | out: | 2540 | out: |
2542 | spin_unlock(&inode->i_lock); | 2541 | spin_unlock(&ci->i_ceph_lock); |
2543 | if (drop) | 2542 | if (drop) |
2544 | iput(inode); | 2543 | iput(inode); |
2545 | } | 2544 | } |
@@ -2562,7 +2561,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
2562 | dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", | 2561 | dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", |
2563 | inode, ci, session->s_mds, follows); | 2562 | inode, ci, session->s_mds, follows); |
2564 | 2563 | ||
2565 | spin_lock(&inode->i_lock); | 2564 | spin_lock(&ci->i_ceph_lock); |
2566 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { | 2565 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { |
2567 | if (capsnap->follows == follows) { | 2566 | if (capsnap->follows == follows) { |
2568 | if (capsnap->flush_tid != flush_tid) { | 2567 | if (capsnap->flush_tid != flush_tid) { |
@@ -2585,7 +2584,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
2585 | capsnap, capsnap->follows); | 2584 | capsnap, capsnap->follows); |
2586 | } | 2585 | } |
2587 | } | 2586 | } |
2588 | spin_unlock(&inode->i_lock); | 2587 | spin_unlock(&ci->i_ceph_lock); |
2589 | if (drop) | 2588 | if (drop) |
2590 | iput(inode); | 2589 | iput(inode); |
2591 | } | 2590 | } |
@@ -2598,7 +2597,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
2598 | static void handle_cap_trunc(struct inode *inode, | 2597 | static void handle_cap_trunc(struct inode *inode, |
2599 | struct ceph_mds_caps *trunc, | 2598 | struct ceph_mds_caps *trunc, |
2600 | struct ceph_mds_session *session) | 2599 | struct ceph_mds_session *session) |
2601 | __releases(inode->i_lock) | 2600 | __releases(ci->i_ceph_lock) |
2602 | { | 2601 | { |
2603 | struct ceph_inode_info *ci = ceph_inode(inode); | 2602 | struct ceph_inode_info *ci = ceph_inode(inode); |
2604 | int mds = session->s_mds; | 2603 | int mds = session->s_mds; |
@@ -2617,7 +2616,7 @@ static void handle_cap_trunc(struct inode *inode, | |||
2617 | inode, mds, seq, truncate_size, truncate_seq); | 2616 | inode, mds, seq, truncate_size, truncate_seq); |
2618 | queue_trunc = ceph_fill_file_size(inode, issued, | 2617 | queue_trunc = ceph_fill_file_size(inode, issued, |
2619 | truncate_seq, truncate_size, size); | 2618 | truncate_seq, truncate_size, size); |
2620 | spin_unlock(&inode->i_lock); | 2619 | spin_unlock(&ci->i_ceph_lock); |
2621 | 2620 | ||
2622 | if (queue_trunc) | 2621 | if (queue_trunc) |
2623 | ceph_queue_vmtruncate(inode); | 2622 | ceph_queue_vmtruncate(inode); |
@@ -2646,7 +2645,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, | |||
2646 | dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", | 2645 | dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", |
2647 | inode, ci, mds, mseq); | 2646 | inode, ci, mds, mseq); |
2648 | 2647 | ||
2649 | spin_lock(&inode->i_lock); | 2648 | spin_lock(&ci->i_ceph_lock); |
2650 | 2649 | ||
2651 | /* make sure we haven't seen a higher mseq */ | 2650 | /* make sure we haven't seen a higher mseq */ |
2652 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 2651 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
@@ -2690,7 +2689,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, | |||
2690 | } | 2689 | } |
2691 | /* else, we already released it */ | 2690 | /* else, we already released it */ |
2692 | 2691 | ||
2693 | spin_unlock(&inode->i_lock); | 2692 | spin_unlock(&ci->i_ceph_lock); |
2694 | } | 2693 | } |
2695 | 2694 | ||
2696 | /* | 2695 | /* |
@@ -2745,9 +2744,9 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, | |||
2745 | up_read(&mdsc->snap_rwsem); | 2744 | up_read(&mdsc->snap_rwsem); |
2746 | 2745 | ||
2747 | /* make sure we re-request max_size, if necessary */ | 2746 | /* make sure we re-request max_size, if necessary */ |
2748 | spin_lock(&inode->i_lock); | 2747 | spin_lock(&ci->i_ceph_lock); |
2749 | ci->i_requested_max_size = 0; | 2748 | ci->i_requested_max_size = 0; |
2750 | spin_unlock(&inode->i_lock); | 2749 | spin_unlock(&ci->i_ceph_lock); |
2751 | } | 2750 | } |
2752 | 2751 | ||
2753 | /* | 2752 | /* |
@@ -2762,6 +2761,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2762 | struct ceph_mds_client *mdsc = session->s_mdsc; | 2761 | struct ceph_mds_client *mdsc = session->s_mdsc; |
2763 | struct super_block *sb = mdsc->fsc->sb; | 2762 | struct super_block *sb = mdsc->fsc->sb; |
2764 | struct inode *inode; | 2763 | struct inode *inode; |
2764 | struct ceph_inode_info *ci; | ||
2765 | struct ceph_cap *cap; | 2765 | struct ceph_cap *cap; |
2766 | struct ceph_mds_caps *h; | 2766 | struct ceph_mds_caps *h; |
2767 | int mds = session->s_mds; | 2767 | int mds = session->s_mds; |
@@ -2815,6 +2815,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2815 | 2815 | ||
2816 | /* lookup ino */ | 2816 | /* lookup ino */ |
2817 | inode = ceph_find_inode(sb, vino); | 2817 | inode = ceph_find_inode(sb, vino); |
2818 | ci = ceph_inode(inode); | ||
2818 | dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, | 2819 | dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, |
2819 | vino.snap, inode); | 2820 | vino.snap, inode); |
2820 | if (!inode) { | 2821 | if (!inode) { |
@@ -2844,16 +2845,16 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2844 | } | 2845 | } |
2845 | 2846 | ||
2846 | /* the rest require a cap */ | 2847 | /* the rest require a cap */ |
2847 | spin_lock(&inode->i_lock); | 2848 | spin_lock(&ci->i_ceph_lock); |
2848 | cap = __get_cap_for_mds(ceph_inode(inode), mds); | 2849 | cap = __get_cap_for_mds(ceph_inode(inode), mds); |
2849 | if (!cap) { | 2850 | if (!cap) { |
2850 | dout(" no cap on %p ino %llx.%llx from mds%d\n", | 2851 | dout(" no cap on %p ino %llx.%llx from mds%d\n", |
2851 | inode, ceph_ino(inode), ceph_snap(inode), mds); | 2852 | inode, ceph_ino(inode), ceph_snap(inode), mds); |
2852 | spin_unlock(&inode->i_lock); | 2853 | spin_unlock(&ci->i_ceph_lock); |
2853 | goto flush_cap_releases; | 2854 | goto flush_cap_releases; |
2854 | } | 2855 | } |
2855 | 2856 | ||
2856 | /* note that each of these drops i_lock for us */ | 2857 | /* note that each of these drops i_ceph_lock for us */ |
2857 | switch (op) { | 2858 | switch (op) { |
2858 | case CEPH_CAP_OP_REVOKE: | 2859 | case CEPH_CAP_OP_REVOKE: |
2859 | case CEPH_CAP_OP_GRANT: | 2860 | case CEPH_CAP_OP_GRANT: |
@@ -2869,7 +2870,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2869 | break; | 2870 | break; |
2870 | 2871 | ||
2871 | default: | 2872 | default: |
2872 | spin_unlock(&inode->i_lock); | 2873 | spin_unlock(&ci->i_ceph_lock); |
2873 | pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, | 2874 | pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, |
2874 | ceph_cap_op_name(op)); | 2875 | ceph_cap_op_name(op)); |
2875 | } | 2876 | } |
@@ -2962,13 +2963,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) | |||
2962 | struct inode *inode = &ci->vfs_inode; | 2963 | struct inode *inode = &ci->vfs_inode; |
2963 | int last = 0; | 2964 | int last = 0; |
2964 | 2965 | ||
2965 | spin_lock(&inode->i_lock); | 2966 | spin_lock(&ci->i_ceph_lock); |
2966 | dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, | 2967 | dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, |
2967 | ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); | 2968 | ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); |
2968 | BUG_ON(ci->i_nr_by_mode[fmode] == 0); | 2969 | BUG_ON(ci->i_nr_by_mode[fmode] == 0); |
2969 | if (--ci->i_nr_by_mode[fmode] == 0) | 2970 | if (--ci->i_nr_by_mode[fmode] == 0) |
2970 | last++; | 2971 | last++; |
2971 | spin_unlock(&inode->i_lock); | 2972 | spin_unlock(&ci->i_ceph_lock); |
2972 | 2973 | ||
2973 | if (last && ci->i_vino.snap == CEPH_NOSNAP) | 2974 | if (last && ci->i_vino.snap == CEPH_NOSNAP) |
2974 | ceph_check_caps(ci, 0, NULL); | 2975 | ceph_check_caps(ci, 0, NULL); |
@@ -2991,7 +2992,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, | |||
2991 | int used, dirty; | 2992 | int used, dirty; |
2992 | int ret = 0; | 2993 | int ret = 0; |
2993 | 2994 | ||
2994 | spin_lock(&inode->i_lock); | 2995 | spin_lock(&ci->i_ceph_lock); |
2995 | used = __ceph_caps_used(ci); | 2996 | used = __ceph_caps_used(ci); |
2996 | dirty = __ceph_caps_dirty(ci); | 2997 | dirty = __ceph_caps_dirty(ci); |
2997 | 2998 | ||
@@ -3046,7 +3047,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, | |||
3046 | inode, cap, ceph_cap_string(cap->issued)); | 3047 | inode, cap, ceph_cap_string(cap->issued)); |
3047 | } | 3048 | } |
3048 | } | 3049 | } |
3049 | spin_unlock(&inode->i_lock); | 3050 | spin_unlock(&ci->i_ceph_lock); |
3050 | return ret; | 3051 | return ret; |
3051 | } | 3052 | } |
3052 | 3053 | ||
@@ -3061,7 +3062,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, | |||
3061 | 3062 | ||
3062 | /* | 3063 | /* |
3063 | * force an record for the directory caps if we have a dentry lease. | 3064 | * force an record for the directory caps if we have a dentry lease. |
3064 | * this is racy (can't take i_lock and d_lock together), but it | 3065 | * this is racy (can't take i_ceph_lock and d_lock together), but it |
3065 | * doesn't have to be perfect; the mds will revoke anything we don't | 3066 | * doesn't have to be perfect; the mds will revoke anything we don't |
3066 | * release. | 3067 | * release. |
3067 | */ | 3068 | */ |
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2abd0dfad7f8..3eeb97661262 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
@@ -281,18 +281,18 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
281 | } | 281 | } |
282 | 282 | ||
283 | /* can we use the dcache? */ | 283 | /* can we use the dcache? */ |
284 | spin_lock(&inode->i_lock); | 284 | spin_lock(&ci->i_ceph_lock); |
285 | if ((filp->f_pos == 2 || fi->dentry) && | 285 | if ((filp->f_pos == 2 || fi->dentry) && |
286 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && | 286 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && |
287 | ceph_snap(inode) != CEPH_SNAPDIR && | 287 | ceph_snap(inode) != CEPH_SNAPDIR && |
288 | ceph_dir_test_complete(inode) && | 288 | ceph_dir_test_complete(inode) && |
289 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { | 289 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { |
290 | spin_unlock(&inode->i_lock); | 290 | spin_unlock(&ci->i_ceph_lock); |
291 | err = __dcache_readdir(filp, dirent, filldir); | 291 | err = __dcache_readdir(filp, dirent, filldir); |
292 | if (err != -EAGAIN) | 292 | if (err != -EAGAIN) |
293 | return err; | 293 | return err; |
294 | } else { | 294 | } else { |
295 | spin_unlock(&inode->i_lock); | 295 | spin_unlock(&ci->i_ceph_lock); |
296 | } | 296 | } |
297 | if (fi->dentry) { | 297 | if (fi->dentry) { |
298 | err = note_last_dentry(fi, fi->dentry->d_name.name, | 298 | err = note_last_dentry(fi, fi->dentry->d_name.name, |
@@ -428,12 +428,12 @@ more: | |||
428 | * were released during the whole readdir, and we should have | 428 | * were released during the whole readdir, and we should have |
429 | * the complete dir contents in our cache. | 429 | * the complete dir contents in our cache. |
430 | */ | 430 | */ |
431 | spin_lock(&inode->i_lock); | 431 | spin_lock(&ci->i_ceph_lock); |
432 | if (ci->i_release_count == fi->dir_release_count) { | 432 | if (ci->i_release_count == fi->dir_release_count) { |
433 | ceph_dir_set_complete(inode); | 433 | ceph_dir_set_complete(inode); |
434 | ci->i_max_offset = filp->f_pos; | 434 | ci->i_max_offset = filp->f_pos; |
435 | } | 435 | } |
436 | spin_unlock(&inode->i_lock); | 436 | spin_unlock(&ci->i_ceph_lock); |
437 | 437 | ||
438 | dout("readdir %p filp %p done.\n", inode, filp); | 438 | dout("readdir %p filp %p done.\n", inode, filp); |
439 | return 0; | 439 | return 0; |
@@ -607,7 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | |||
607 | struct ceph_inode_info *ci = ceph_inode(dir); | 607 | struct ceph_inode_info *ci = ceph_inode(dir); |
608 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 608 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
609 | 609 | ||
610 | spin_lock(&dir->i_lock); | 610 | spin_lock(&ci->i_ceph_lock); |
611 | dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); | 611 | dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); |
612 | if (strncmp(dentry->d_name.name, | 612 | if (strncmp(dentry->d_name.name, |
613 | fsc->mount_options->snapdir_name, | 613 | fsc->mount_options->snapdir_name, |
@@ -615,13 +615,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | |||
615 | !is_root_ceph_dentry(dir, dentry) && | 615 | !is_root_ceph_dentry(dir, dentry) && |
616 | ceph_dir_test_complete(dir) && | 616 | ceph_dir_test_complete(dir) && |
617 | (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { | 617 | (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { |
618 | spin_unlock(&dir->i_lock); | 618 | spin_unlock(&ci->i_ceph_lock); |
619 | dout(" dir %p complete, -ENOENT\n", dir); | 619 | dout(" dir %p complete, -ENOENT\n", dir); |
620 | d_add(dentry, NULL); | 620 | d_add(dentry, NULL); |
621 | di->lease_shared_gen = ci->i_shared_gen; | 621 | di->lease_shared_gen = ci->i_shared_gen; |
622 | return NULL; | 622 | return NULL; |
623 | } | 623 | } |
624 | spin_unlock(&dir->i_lock); | 624 | spin_unlock(&ci->i_ceph_lock); |
625 | } | 625 | } |
626 | 626 | ||
627 | op = ceph_snap(dir) == CEPH_SNAPDIR ? | 627 | op = ceph_snap(dir) == CEPH_SNAPDIR ? |
@@ -841,12 +841,12 @@ static int drop_caps_for_unlink(struct inode *inode) | |||
841 | struct ceph_inode_info *ci = ceph_inode(inode); | 841 | struct ceph_inode_info *ci = ceph_inode(inode); |
842 | int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; | 842 | int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; |
843 | 843 | ||
844 | spin_lock(&inode->i_lock); | 844 | spin_lock(&ci->i_ceph_lock); |
845 | if (inode->i_nlink == 1) { | 845 | if (inode->i_nlink == 1) { |
846 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); | 846 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); |
847 | ci->i_ceph_flags |= CEPH_I_NODELAY; | 847 | ci->i_ceph_flags |= CEPH_I_NODELAY; |
848 | } | 848 | } |
849 | spin_unlock(&inode->i_lock); | 849 | spin_unlock(&ci->i_ceph_lock); |
850 | return drop; | 850 | return drop; |
851 | } | 851 | } |
852 | 852 | ||
@@ -1015,10 +1015,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) | |||
1015 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 1015 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
1016 | int valid = 0; | 1016 | int valid = 0; |
1017 | 1017 | ||
1018 | spin_lock(&dir->i_lock); | 1018 | spin_lock(&ci->i_ceph_lock); |
1019 | if (ci->i_shared_gen == di->lease_shared_gen) | 1019 | if (ci->i_shared_gen == di->lease_shared_gen) |
1020 | valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); | 1020 | valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); |
1021 | spin_unlock(&dir->i_lock); | 1021 | spin_unlock(&ci->i_ceph_lock); |
1022 | dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", | 1022 | dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", |
1023 | dir, (unsigned)ci->i_shared_gen, dentry, | 1023 | dir, (unsigned)ci->i_shared_gen, dentry, |
1024 | (unsigned)di->lease_shared_gen, valid); | 1024 | (unsigned)di->lease_shared_gen, valid); |
@@ -1143,7 +1143,7 @@ static void ceph_d_prune(struct dentry *dentry) | |||
1143 | { | 1143 | { |
1144 | struct ceph_dentry_info *di; | 1144 | struct ceph_dentry_info *di; |
1145 | 1145 | ||
1146 | dout("d_release %p\n", dentry); | 1146 | dout("ceph_d_prune %p\n", dentry); |
1147 | 1147 | ||
1148 | /* do we have a valid parent? */ | 1148 | /* do we have a valid parent? */ |
1149 | if (!dentry->d_parent || IS_ROOT(dentry)) | 1149 | if (!dentry->d_parent || IS_ROOT(dentry)) |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce549d31eeb7..ed72428d9c75 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -147,9 +147,9 @@ int ceph_open(struct inode *inode, struct file *file) | |||
147 | 147 | ||
148 | /* trivially open snapdir */ | 148 | /* trivially open snapdir */ |
149 | if (ceph_snap(inode) == CEPH_SNAPDIR) { | 149 | if (ceph_snap(inode) == CEPH_SNAPDIR) { |
150 | spin_lock(&inode->i_lock); | 150 | spin_lock(&ci->i_ceph_lock); |
151 | __ceph_get_fmode(ci, fmode); | 151 | __ceph_get_fmode(ci, fmode); |
152 | spin_unlock(&inode->i_lock); | 152 | spin_unlock(&ci->i_ceph_lock); |
153 | return ceph_init_file(inode, file, fmode); | 153 | return ceph_init_file(inode, file, fmode); |
154 | } | 154 | } |
155 | 155 | ||
@@ -158,7 +158,7 @@ int ceph_open(struct inode *inode, struct file *file) | |||
158 | * write) or any MDS (for read). Update wanted set | 158 | * write) or any MDS (for read). Update wanted set |
159 | * asynchronously. | 159 | * asynchronously. |
160 | */ | 160 | */ |
161 | spin_lock(&inode->i_lock); | 161 | spin_lock(&ci->i_ceph_lock); |
162 | if (__ceph_is_any_real_caps(ci) && | 162 | if (__ceph_is_any_real_caps(ci) && |
163 | (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { | 163 | (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { |
164 | int mds_wanted = __ceph_caps_mds_wanted(ci); | 164 | int mds_wanted = __ceph_caps_mds_wanted(ci); |
@@ -168,7 +168,7 @@ int ceph_open(struct inode *inode, struct file *file) | |||
168 | inode, fmode, ceph_cap_string(wanted), | 168 | inode, fmode, ceph_cap_string(wanted), |
169 | ceph_cap_string(issued)); | 169 | ceph_cap_string(issued)); |
170 | __ceph_get_fmode(ci, fmode); | 170 | __ceph_get_fmode(ci, fmode); |
171 | spin_unlock(&inode->i_lock); | 171 | spin_unlock(&ci->i_ceph_lock); |
172 | 172 | ||
173 | /* adjust wanted? */ | 173 | /* adjust wanted? */ |
174 | if ((issued & wanted) != wanted && | 174 | if ((issued & wanted) != wanted && |
@@ -180,10 +180,10 @@ int ceph_open(struct inode *inode, struct file *file) | |||
180 | } else if (ceph_snap(inode) != CEPH_NOSNAP && | 180 | } else if (ceph_snap(inode) != CEPH_NOSNAP && |
181 | (ci->i_snap_caps & wanted) == wanted) { | 181 | (ci->i_snap_caps & wanted) == wanted) { |
182 | __ceph_get_fmode(ci, fmode); | 182 | __ceph_get_fmode(ci, fmode); |
183 | spin_unlock(&inode->i_lock); | 183 | spin_unlock(&ci->i_ceph_lock); |
184 | return ceph_init_file(inode, file, fmode); | 184 | return ceph_init_file(inode, file, fmode); |
185 | } | 185 | } |
186 | spin_unlock(&inode->i_lock); | 186 | spin_unlock(&ci->i_ceph_lock); |
187 | 187 | ||
188 | dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); | 188 | dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); |
189 | req = prepare_open_request(inode->i_sb, flags, 0); | 189 | req = prepare_open_request(inode->i_sb, flags, 0); |
@@ -743,9 +743,9 @@ retry_snap: | |||
743 | */ | 743 | */ |
744 | int dirty; | 744 | int dirty; |
745 | 745 | ||
746 | spin_lock(&inode->i_lock); | 746 | spin_lock(&ci->i_ceph_lock); |
747 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 747 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); |
748 | spin_unlock(&inode->i_lock); | 748 | spin_unlock(&ci->i_ceph_lock); |
749 | ceph_put_cap_refs(ci, got); | 749 | ceph_put_cap_refs(ci, got); |
750 | 750 | ||
751 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 751 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); |
@@ -764,9 +764,9 @@ retry_snap: | |||
764 | 764 | ||
765 | if (ret >= 0) { | 765 | if (ret >= 0) { |
766 | int dirty; | 766 | int dirty; |
767 | spin_lock(&inode->i_lock); | 767 | spin_lock(&ci->i_ceph_lock); |
768 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 768 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); |
769 | spin_unlock(&inode->i_lock); | 769 | spin_unlock(&ci->i_ceph_lock); |
770 | if (dirty) | 770 | if (dirty) |
771 | __mark_inode_dirty(inode, dirty); | 771 | __mark_inode_dirty(inode, dirty); |
772 | } | 772 | } |
@@ -797,7 +797,8 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) | |||
797 | 797 | ||
798 | mutex_lock(&inode->i_mutex); | 798 | mutex_lock(&inode->i_mutex); |
799 | __ceph_do_pending_vmtruncate(inode); | 799 | __ceph_do_pending_vmtruncate(inode); |
800 | if (origin != SEEK_CUR || origin != SEEK_SET) { | 800 | |
801 | if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { | ||
801 | ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); | 802 | ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); |
802 | if (ret < 0) { | 803 | if (ret < 0) { |
803 | offset = ret; | 804 | offset = ret; |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e392bfce84a3..87fb132fb330 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
297 | 297 | ||
298 | dout("alloc_inode %p\n", &ci->vfs_inode); | 298 | dout("alloc_inode %p\n", &ci->vfs_inode); |
299 | 299 | ||
300 | spin_lock_init(&ci->i_ceph_lock); | ||
301 | |||
300 | ci->i_version = 0; | 302 | ci->i_version = 0; |
301 | ci->i_time_warp_seq = 0; | 303 | ci->i_time_warp_seq = 0; |
302 | ci->i_ceph_flags = 0; | 304 | ci->i_ceph_flags = 0; |
@@ -583,7 +585,7 @@ static int fill_inode(struct inode *inode, | |||
583 | iinfo->xattr_len); | 585 | iinfo->xattr_len); |
584 | } | 586 | } |
585 | 587 | ||
586 | spin_lock(&inode->i_lock); | 588 | spin_lock(&ci->i_ceph_lock); |
587 | 589 | ||
588 | /* | 590 | /* |
589 | * provided version will be odd if inode value is projected, | 591 | * provided version will be odd if inode value is projected, |
@@ -680,7 +682,7 @@ static int fill_inode(struct inode *inode, | |||
680 | char *sym; | 682 | char *sym; |
681 | 683 | ||
682 | BUG_ON(symlen != inode->i_size); | 684 | BUG_ON(symlen != inode->i_size); |
683 | spin_unlock(&inode->i_lock); | 685 | spin_unlock(&ci->i_ceph_lock); |
684 | 686 | ||
685 | err = -ENOMEM; | 687 | err = -ENOMEM; |
686 | sym = kmalloc(symlen+1, GFP_NOFS); | 688 | sym = kmalloc(symlen+1, GFP_NOFS); |
@@ -689,7 +691,7 @@ static int fill_inode(struct inode *inode, | |||
689 | memcpy(sym, iinfo->symlink, symlen); | 691 | memcpy(sym, iinfo->symlink, symlen); |
690 | sym[symlen] = 0; | 692 | sym[symlen] = 0; |
691 | 693 | ||
692 | spin_lock(&inode->i_lock); | 694 | spin_lock(&ci->i_ceph_lock); |
693 | if (!ci->i_symlink) | 695 | if (!ci->i_symlink) |
694 | ci->i_symlink = sym; | 696 | ci->i_symlink = sym; |
695 | else | 697 | else |
@@ -715,7 +717,7 @@ static int fill_inode(struct inode *inode, | |||
715 | } | 717 | } |
716 | 718 | ||
717 | no_change: | 719 | no_change: |
718 | spin_unlock(&inode->i_lock); | 720 | spin_unlock(&ci->i_ceph_lock); |
719 | 721 | ||
720 | /* queue truncate if we saw i_size decrease */ | 722 | /* queue truncate if we saw i_size decrease */ |
721 | if (queue_trunc) | 723 | if (queue_trunc) |
@@ -750,13 +752,13 @@ no_change: | |||
750 | info->cap.flags, | 752 | info->cap.flags, |
751 | caps_reservation); | 753 | caps_reservation); |
752 | } else { | 754 | } else { |
753 | spin_lock(&inode->i_lock); | 755 | spin_lock(&ci->i_ceph_lock); |
754 | dout(" %p got snap_caps %s\n", inode, | 756 | dout(" %p got snap_caps %s\n", inode, |
755 | ceph_cap_string(le32_to_cpu(info->cap.caps))); | 757 | ceph_cap_string(le32_to_cpu(info->cap.caps))); |
756 | ci->i_snap_caps |= le32_to_cpu(info->cap.caps); | 758 | ci->i_snap_caps |= le32_to_cpu(info->cap.caps); |
757 | if (cap_fmode >= 0) | 759 | if (cap_fmode >= 0) |
758 | __ceph_get_fmode(ci, cap_fmode); | 760 | __ceph_get_fmode(ci, cap_fmode); |
759 | spin_unlock(&inode->i_lock); | 761 | spin_unlock(&ci->i_ceph_lock); |
760 | } | 762 | } |
761 | } else if (cap_fmode >= 0) { | 763 | } else if (cap_fmode >= 0) { |
762 | pr_warning("mds issued no caps on %llx.%llx\n", | 764 | pr_warning("mds issued no caps on %llx.%llx\n", |
@@ -849,19 +851,20 @@ static void ceph_set_dentry_offset(struct dentry *dn) | |||
849 | { | 851 | { |
850 | struct dentry *dir = dn->d_parent; | 852 | struct dentry *dir = dn->d_parent; |
851 | struct inode *inode = dir->d_inode; | 853 | struct inode *inode = dir->d_inode; |
854 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
852 | struct ceph_dentry_info *di; | 855 | struct ceph_dentry_info *di; |
853 | 856 | ||
854 | BUG_ON(!inode); | 857 | BUG_ON(!inode); |
855 | 858 | ||
856 | di = ceph_dentry(dn); | 859 | di = ceph_dentry(dn); |
857 | 860 | ||
858 | spin_lock(&inode->i_lock); | 861 | spin_lock(&ci->i_ceph_lock); |
859 | if (!ceph_dir_test_complete(inode)) { | 862 | if (!ceph_dir_test_complete(inode)) { |
860 | spin_unlock(&inode->i_lock); | 863 | spin_unlock(&ci->i_ceph_lock); |
861 | return; | 864 | return; |
862 | } | 865 | } |
863 | di->offset = ceph_inode(inode)->i_max_offset++; | 866 | di->offset = ceph_inode(inode)->i_max_offset++; |
864 | spin_unlock(&inode->i_lock); | 867 | spin_unlock(&ci->i_ceph_lock); |
865 | 868 | ||
866 | spin_lock(&dir->d_lock); | 869 | spin_lock(&dir->d_lock); |
867 | spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); | 870 | spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); |
@@ -1308,7 +1311,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) | |||
1308 | struct ceph_inode_info *ci = ceph_inode(inode); | 1311 | struct ceph_inode_info *ci = ceph_inode(inode); |
1309 | int ret = 0; | 1312 | int ret = 0; |
1310 | 1313 | ||
1311 | spin_lock(&inode->i_lock); | 1314 | spin_lock(&ci->i_ceph_lock); |
1312 | dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); | 1315 | dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); |
1313 | inode->i_size = size; | 1316 | inode->i_size = size; |
1314 | inode->i_blocks = (size + (1 << 9) - 1) >> 9; | 1317 | inode->i_blocks = (size + (1 << 9) - 1) >> 9; |
@@ -1318,7 +1321,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) | |||
1318 | (ci->i_reported_size << 1) < ci->i_max_size) | 1321 | (ci->i_reported_size << 1) < ci->i_max_size) |
1319 | ret = 1; | 1322 | ret = 1; |
1320 | 1323 | ||
1321 | spin_unlock(&inode->i_lock); | 1324 | spin_unlock(&ci->i_ceph_lock); |
1322 | return ret; | 1325 | return ret; |
1323 | } | 1326 | } |
1324 | 1327 | ||
@@ -1328,12 +1331,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) | |||
1328 | */ | 1331 | */ |
1329 | void ceph_queue_writeback(struct inode *inode) | 1332 | void ceph_queue_writeback(struct inode *inode) |
1330 | { | 1333 | { |
1334 | ihold(inode); | ||
1331 | if (queue_work(ceph_inode_to_client(inode)->wb_wq, | 1335 | if (queue_work(ceph_inode_to_client(inode)->wb_wq, |
1332 | &ceph_inode(inode)->i_wb_work)) { | 1336 | &ceph_inode(inode)->i_wb_work)) { |
1333 | dout("ceph_queue_writeback %p\n", inode); | 1337 | dout("ceph_queue_writeback %p\n", inode); |
1334 | ihold(inode); | ||
1335 | } else { | 1338 | } else { |
1336 | dout("ceph_queue_writeback %p failed\n", inode); | 1339 | dout("ceph_queue_writeback %p failed\n", inode); |
1340 | iput(inode); | ||
1337 | } | 1341 | } |
1338 | } | 1342 | } |
1339 | 1343 | ||
@@ -1353,12 +1357,13 @@ static void ceph_writeback_work(struct work_struct *work) | |||
1353 | */ | 1357 | */ |
1354 | void ceph_queue_invalidate(struct inode *inode) | 1358 | void ceph_queue_invalidate(struct inode *inode) |
1355 | { | 1359 | { |
1360 | ihold(inode); | ||
1356 | if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, | 1361 | if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, |
1357 | &ceph_inode(inode)->i_pg_inv_work)) { | 1362 | &ceph_inode(inode)->i_pg_inv_work)) { |
1358 | dout("ceph_queue_invalidate %p\n", inode); | 1363 | dout("ceph_queue_invalidate %p\n", inode); |
1359 | ihold(inode); | ||
1360 | } else { | 1364 | } else { |
1361 | dout("ceph_queue_invalidate %p failed\n", inode); | 1365 | dout("ceph_queue_invalidate %p failed\n", inode); |
1366 | iput(inode); | ||
1362 | } | 1367 | } |
1363 | } | 1368 | } |
1364 | 1369 | ||
@@ -1374,20 +1379,20 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
1374 | u32 orig_gen; | 1379 | u32 orig_gen; |
1375 | int check = 0; | 1380 | int check = 0; |
1376 | 1381 | ||
1377 | spin_lock(&inode->i_lock); | 1382 | spin_lock(&ci->i_ceph_lock); |
1378 | dout("invalidate_pages %p gen %d revoking %d\n", inode, | 1383 | dout("invalidate_pages %p gen %d revoking %d\n", inode, |
1379 | ci->i_rdcache_gen, ci->i_rdcache_revoking); | 1384 | ci->i_rdcache_gen, ci->i_rdcache_revoking); |
1380 | if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { | 1385 | if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
1381 | /* nevermind! */ | 1386 | /* nevermind! */ |
1382 | spin_unlock(&inode->i_lock); | 1387 | spin_unlock(&ci->i_ceph_lock); |
1383 | goto out; | 1388 | goto out; |
1384 | } | 1389 | } |
1385 | orig_gen = ci->i_rdcache_gen; | 1390 | orig_gen = ci->i_rdcache_gen; |
1386 | spin_unlock(&inode->i_lock); | 1391 | spin_unlock(&ci->i_ceph_lock); |
1387 | 1392 | ||
1388 | truncate_inode_pages(&inode->i_data, 0); | 1393 | truncate_inode_pages(&inode->i_data, 0); |
1389 | 1394 | ||
1390 | spin_lock(&inode->i_lock); | 1395 | spin_lock(&ci->i_ceph_lock); |
1391 | if (orig_gen == ci->i_rdcache_gen && | 1396 | if (orig_gen == ci->i_rdcache_gen && |
1392 | orig_gen == ci->i_rdcache_revoking) { | 1397 | orig_gen == ci->i_rdcache_revoking) { |
1393 | dout("invalidate_pages %p gen %d successful\n", inode, | 1398 | dout("invalidate_pages %p gen %d successful\n", inode, |
@@ -1399,7 +1404,7 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
1399 | inode, orig_gen, ci->i_rdcache_gen, | 1404 | inode, orig_gen, ci->i_rdcache_gen, |
1400 | ci->i_rdcache_revoking); | 1405 | ci->i_rdcache_revoking); |
1401 | } | 1406 | } |
1402 | spin_unlock(&inode->i_lock); | 1407 | spin_unlock(&ci->i_ceph_lock); |
1403 | 1408 | ||
1404 | if (check) | 1409 | if (check) |
1405 | ceph_check_caps(ci, 0, NULL); | 1410 | ceph_check_caps(ci, 0, NULL); |
@@ -1434,13 +1439,14 @@ void ceph_queue_vmtruncate(struct inode *inode) | |||
1434 | { | 1439 | { |
1435 | struct ceph_inode_info *ci = ceph_inode(inode); | 1440 | struct ceph_inode_info *ci = ceph_inode(inode); |
1436 | 1441 | ||
1442 | ihold(inode); | ||
1437 | if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, | 1443 | if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, |
1438 | &ci->i_vmtruncate_work)) { | 1444 | &ci->i_vmtruncate_work)) { |
1439 | dout("ceph_queue_vmtruncate %p\n", inode); | 1445 | dout("ceph_queue_vmtruncate %p\n", inode); |
1440 | ihold(inode); | ||
1441 | } else { | 1446 | } else { |
1442 | dout("ceph_queue_vmtruncate %p failed, pending=%d\n", | 1447 | dout("ceph_queue_vmtruncate %p failed, pending=%d\n", |
1443 | inode, ci->i_truncate_pending); | 1448 | inode, ci->i_truncate_pending); |
1449 | iput(inode); | ||
1444 | } | 1450 | } |
1445 | } | 1451 | } |
1446 | 1452 | ||
@@ -1457,10 +1463,10 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) | |||
1457 | int wrbuffer_refs, wake = 0; | 1463 | int wrbuffer_refs, wake = 0; |
1458 | 1464 | ||
1459 | retry: | 1465 | retry: |
1460 | spin_lock(&inode->i_lock); | 1466 | spin_lock(&ci->i_ceph_lock); |
1461 | if (ci->i_truncate_pending == 0) { | 1467 | if (ci->i_truncate_pending == 0) { |
1462 | dout("__do_pending_vmtruncate %p none pending\n", inode); | 1468 | dout("__do_pending_vmtruncate %p none pending\n", inode); |
1463 | spin_unlock(&inode->i_lock); | 1469 | spin_unlock(&ci->i_ceph_lock); |
1464 | return; | 1470 | return; |
1465 | } | 1471 | } |
1466 | 1472 | ||
@@ -1471,7 +1477,7 @@ retry: | |||
1471 | if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { | 1477 | if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { |
1472 | dout("__do_pending_vmtruncate %p flushing snaps first\n", | 1478 | dout("__do_pending_vmtruncate %p flushing snaps first\n", |
1473 | inode); | 1479 | inode); |
1474 | spin_unlock(&inode->i_lock); | 1480 | spin_unlock(&ci->i_ceph_lock); |
1475 | filemap_write_and_wait_range(&inode->i_data, 0, | 1481 | filemap_write_and_wait_range(&inode->i_data, 0, |
1476 | inode->i_sb->s_maxbytes); | 1482 | inode->i_sb->s_maxbytes); |
1477 | goto retry; | 1483 | goto retry; |
@@ -1481,15 +1487,15 @@ retry: | |||
1481 | wrbuffer_refs = ci->i_wrbuffer_ref; | 1487 | wrbuffer_refs = ci->i_wrbuffer_ref; |
1482 | dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, | 1488 | dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, |
1483 | ci->i_truncate_pending, to); | 1489 | ci->i_truncate_pending, to); |
1484 | spin_unlock(&inode->i_lock); | 1490 | spin_unlock(&ci->i_ceph_lock); |
1485 | 1491 | ||
1486 | truncate_inode_pages(inode->i_mapping, to); | 1492 | truncate_inode_pages(inode->i_mapping, to); |
1487 | 1493 | ||
1488 | spin_lock(&inode->i_lock); | 1494 | spin_lock(&ci->i_ceph_lock); |
1489 | ci->i_truncate_pending--; | 1495 | ci->i_truncate_pending--; |
1490 | if (ci->i_truncate_pending == 0) | 1496 | if (ci->i_truncate_pending == 0) |
1491 | wake = 1; | 1497 | wake = 1; |
1492 | spin_unlock(&inode->i_lock); | 1498 | spin_unlock(&ci->i_ceph_lock); |
1493 | 1499 | ||
1494 | if (wrbuffer_refs == 0) | 1500 | if (wrbuffer_refs == 0) |
1495 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 1501 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
@@ -1544,7 +1550,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1544 | if (IS_ERR(req)) | 1550 | if (IS_ERR(req)) |
1545 | return PTR_ERR(req); | 1551 | return PTR_ERR(req); |
1546 | 1552 | ||
1547 | spin_lock(&inode->i_lock); | 1553 | spin_lock(&ci->i_ceph_lock); |
1548 | issued = __ceph_caps_issued(ci, NULL); | 1554 | issued = __ceph_caps_issued(ci, NULL); |
1549 | dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); | 1555 | dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); |
1550 | 1556 | ||
@@ -1692,7 +1698,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1692 | } | 1698 | } |
1693 | 1699 | ||
1694 | release &= issued; | 1700 | release &= issued; |
1695 | spin_unlock(&inode->i_lock); | 1701 | spin_unlock(&ci->i_ceph_lock); |
1696 | 1702 | ||
1697 | if (inode_dirty_flags) | 1703 | if (inode_dirty_flags) |
1698 | __mark_inode_dirty(inode, inode_dirty_flags); | 1704 | __mark_inode_dirty(inode, inode_dirty_flags); |
@@ -1714,7 +1720,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1714 | __ceph_do_pending_vmtruncate(inode); | 1720 | __ceph_do_pending_vmtruncate(inode); |
1715 | return err; | 1721 | return err; |
1716 | out: | 1722 | out: |
1717 | spin_unlock(&inode->i_lock); | 1723 | spin_unlock(&ci->i_ceph_lock); |
1718 | ceph_mdsc_put_request(req); | 1724 | ceph_mdsc_put_request(req); |
1719 | return err; | 1725 | return err; |
1720 | } | 1726 | } |
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 5a14c29cbba6..790914a598dd 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
@@ -241,11 +241,11 @@ static long ceph_ioctl_lazyio(struct file *file) | |||
241 | struct ceph_inode_info *ci = ceph_inode(inode); | 241 | struct ceph_inode_info *ci = ceph_inode(inode); |
242 | 242 | ||
243 | if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { | 243 | if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { |
244 | spin_lock(&inode->i_lock); | 244 | spin_lock(&ci->i_ceph_lock); |
245 | ci->i_nr_by_mode[fi->fmode]--; | 245 | ci->i_nr_by_mode[fi->fmode]--; |
246 | fi->fmode |= CEPH_FILE_MODE_LAZY; | 246 | fi->fmode |= CEPH_FILE_MODE_LAZY; |
247 | ci->i_nr_by_mode[fi->fmode]++; | 247 | ci->i_nr_by_mode[fi->fmode]++; |
248 | spin_unlock(&inode->i_lock); | 248 | spin_unlock(&ci->i_ceph_lock); |
249 | dout("ioctl_layzio: file %p marked lazy\n", file); | 249 | dout("ioctl_layzio: file %p marked lazy\n", file); |
250 | 250 | ||
251 | ceph_check_caps(ci, 0, NULL); | 251 | ceph_check_caps(ci, 0, NULL); |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 264ab701154f..6203d805eb45 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -732,21 +732,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
732 | } | 732 | } |
733 | } | 733 | } |
734 | 734 | ||
735 | spin_lock(&inode->i_lock); | 735 | spin_lock(&ci->i_ceph_lock); |
736 | cap = NULL; | 736 | cap = NULL; |
737 | if (mode == USE_AUTH_MDS) | 737 | if (mode == USE_AUTH_MDS) |
738 | cap = ci->i_auth_cap; | 738 | cap = ci->i_auth_cap; |
739 | if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) | 739 | if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) |
740 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); | 740 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); |
741 | if (!cap) { | 741 | if (!cap) { |
742 | spin_unlock(&inode->i_lock); | 742 | spin_unlock(&ci->i_ceph_lock); |
743 | goto random; | 743 | goto random; |
744 | } | 744 | } |
745 | mds = cap->session->s_mds; | 745 | mds = cap->session->s_mds; |
746 | dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", | 746 | dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", |
747 | inode, ceph_vinop(inode), mds, | 747 | inode, ceph_vinop(inode), mds, |
748 | cap == ci->i_auth_cap ? "auth " : "", cap); | 748 | cap == ci->i_auth_cap ? "auth " : "", cap); |
749 | spin_unlock(&inode->i_lock); | 749 | spin_unlock(&ci->i_ceph_lock); |
750 | return mds; | 750 | return mds; |
751 | 751 | ||
752 | random: | 752 | random: |
@@ -951,7 +951,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
951 | 951 | ||
952 | dout("removing cap %p, ci is %p, inode is %p\n", | 952 | dout("removing cap %p, ci is %p, inode is %p\n", |
953 | cap, ci, &ci->vfs_inode); | 953 | cap, ci, &ci->vfs_inode); |
954 | spin_lock(&inode->i_lock); | 954 | spin_lock(&ci->i_ceph_lock); |
955 | __ceph_remove_cap(cap); | 955 | __ceph_remove_cap(cap); |
956 | if (!__ceph_is_any_real_caps(ci)) { | 956 | if (!__ceph_is_any_real_caps(ci)) { |
957 | struct ceph_mds_client *mdsc = | 957 | struct ceph_mds_client *mdsc = |
@@ -984,7 +984,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
984 | } | 984 | } |
985 | spin_unlock(&mdsc->cap_dirty_lock); | 985 | spin_unlock(&mdsc->cap_dirty_lock); |
986 | } | 986 | } |
987 | spin_unlock(&inode->i_lock); | 987 | spin_unlock(&ci->i_ceph_lock); |
988 | while (drop--) | 988 | while (drop--) |
989 | iput(inode); | 989 | iput(inode); |
990 | return 0; | 990 | return 0; |
@@ -1015,10 +1015,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, | |||
1015 | 1015 | ||
1016 | wake_up_all(&ci->i_cap_wq); | 1016 | wake_up_all(&ci->i_cap_wq); |
1017 | if (arg) { | 1017 | if (arg) { |
1018 | spin_lock(&inode->i_lock); | 1018 | spin_lock(&ci->i_ceph_lock); |
1019 | ci->i_wanted_max_size = 0; | 1019 | ci->i_wanted_max_size = 0; |
1020 | ci->i_requested_max_size = 0; | 1020 | ci->i_requested_max_size = 0; |
1021 | spin_unlock(&inode->i_lock); | 1021 | spin_unlock(&ci->i_ceph_lock); |
1022 | } | 1022 | } |
1023 | return 0; | 1023 | return 0; |
1024 | } | 1024 | } |
@@ -1151,7 +1151,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
1151 | if (session->s_trim_caps <= 0) | 1151 | if (session->s_trim_caps <= 0) |
1152 | return -1; | 1152 | return -1; |
1153 | 1153 | ||
1154 | spin_lock(&inode->i_lock); | 1154 | spin_lock(&ci->i_ceph_lock); |
1155 | mine = cap->issued | cap->implemented; | 1155 | mine = cap->issued | cap->implemented; |
1156 | used = __ceph_caps_used(ci); | 1156 | used = __ceph_caps_used(ci); |
1157 | oissued = __ceph_caps_issued_other(ci, cap); | 1157 | oissued = __ceph_caps_issued_other(ci, cap); |
@@ -1170,7 +1170,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
1170 | __ceph_remove_cap(cap); | 1170 | __ceph_remove_cap(cap); |
1171 | } else { | 1171 | } else { |
1172 | /* try to drop referring dentries */ | 1172 | /* try to drop referring dentries */ |
1173 | spin_unlock(&inode->i_lock); | 1173 | spin_unlock(&ci->i_ceph_lock); |
1174 | d_prune_aliases(inode); | 1174 | d_prune_aliases(inode); |
1175 | dout("trim_caps_cb %p cap %p pruned, count now %d\n", | 1175 | dout("trim_caps_cb %p cap %p pruned, count now %d\n", |
1176 | inode, cap, atomic_read(&inode->i_count)); | 1176 | inode, cap, atomic_read(&inode->i_count)); |
@@ -1178,7 +1178,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
1178 | } | 1178 | } |
1179 | 1179 | ||
1180 | out: | 1180 | out: |
1181 | spin_unlock(&inode->i_lock); | 1181 | spin_unlock(&ci->i_ceph_lock); |
1182 | return 0; | 1182 | return 0; |
1183 | } | 1183 | } |
1184 | 1184 | ||
@@ -1296,7 +1296,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | |||
1296 | i_flushing_item); | 1296 | i_flushing_item); |
1297 | struct inode *inode = &ci->vfs_inode; | 1297 | struct inode *inode = &ci->vfs_inode; |
1298 | 1298 | ||
1299 | spin_lock(&inode->i_lock); | 1299 | spin_lock(&ci->i_ceph_lock); |
1300 | if (ci->i_cap_flush_seq <= want_flush_seq) { | 1300 | if (ci->i_cap_flush_seq <= want_flush_seq) { |
1301 | dout("check_cap_flush still flushing %p " | 1301 | dout("check_cap_flush still flushing %p " |
1302 | "seq %lld <= %lld to mds%d\n", inode, | 1302 | "seq %lld <= %lld to mds%d\n", inode, |
@@ -1304,7 +1304,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | |||
1304 | session->s_mds); | 1304 | session->s_mds); |
1305 | ret = 0; | 1305 | ret = 0; |
1306 | } | 1306 | } |
1307 | spin_unlock(&inode->i_lock); | 1307 | spin_unlock(&ci->i_ceph_lock); |
1308 | } | 1308 | } |
1309 | mutex_unlock(&session->s_mutex); | 1309 | mutex_unlock(&session->s_mutex); |
1310 | ceph_put_mds_session(session); | 1310 | ceph_put_mds_session(session); |
@@ -1495,6 +1495,7 @@ retry: | |||
1495 | pos, temp); | 1495 | pos, temp); |
1496 | } else if (stop_on_nosnap && inode && | 1496 | } else if (stop_on_nosnap && inode && |
1497 | ceph_snap(inode) == CEPH_NOSNAP) { | 1497 | ceph_snap(inode) == CEPH_NOSNAP) { |
1498 | spin_unlock(&temp->d_lock); | ||
1498 | break; | 1499 | break; |
1499 | } else { | 1500 | } else { |
1500 | pos -= temp->d_name.len; | 1501 | pos -= temp->d_name.len; |
@@ -2011,10 +2012,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) | |||
2011 | struct ceph_inode_info *ci = ceph_inode(inode); | 2012 | struct ceph_inode_info *ci = ceph_inode(inode); |
2012 | 2013 | ||
2013 | dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); | 2014 | dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); |
2014 | spin_lock(&inode->i_lock); | 2015 | spin_lock(&ci->i_ceph_lock); |
2015 | ceph_dir_clear_complete(inode); | 2016 | ceph_dir_clear_complete(inode); |
2016 | ci->i_release_count++; | 2017 | ci->i_release_count++; |
2017 | spin_unlock(&inode->i_lock); | 2018 | spin_unlock(&ci->i_ceph_lock); |
2018 | 2019 | ||
2019 | if (req->r_dentry) | 2020 | if (req->r_dentry) |
2020 | ceph_invalidate_dentry_lease(req->r_dentry); | 2021 | ceph_invalidate_dentry_lease(req->r_dentry); |
@@ -2422,7 +2423,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
2422 | if (err) | 2423 | if (err) |
2423 | goto out_free; | 2424 | goto out_free; |
2424 | 2425 | ||
2425 | spin_lock(&inode->i_lock); | 2426 | spin_lock(&ci->i_ceph_lock); |
2426 | cap->seq = 0; /* reset cap seq */ | 2427 | cap->seq = 0; /* reset cap seq */ |
2427 | cap->issue_seq = 0; /* and issue_seq */ | 2428 | cap->issue_seq = 0; /* and issue_seq */ |
2428 | 2429 | ||
@@ -2445,7 +2446,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
2445 | rec.v1.pathbase = cpu_to_le64(pathbase); | 2446 | rec.v1.pathbase = cpu_to_le64(pathbase); |
2446 | reclen = sizeof(rec.v1); | 2447 | reclen = sizeof(rec.v1); |
2447 | } | 2448 | } |
2448 | spin_unlock(&inode->i_lock); | 2449 | spin_unlock(&ci->i_ceph_lock); |
2449 | 2450 | ||
2450 | if (recon_state->flock) { | 2451 | if (recon_state->flock) { |
2451 | int num_fcntl_locks, num_flock_locks; | 2452 | int num_fcntl_locks, num_flock_locks; |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4bb239921dbd..a50ca0e39475 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
@@ -20,7 +20,7 @@ | |||
20 | * | 20 | * |
21 | * mdsc->snap_rwsem | 21 | * mdsc->snap_rwsem |
22 | * | 22 | * |
23 | * inode->i_lock | 23 | * ci->i_ceph_lock |
24 | * mdsc->snap_flush_lock | 24 | * mdsc->snap_flush_lock |
25 | * mdsc->cap_delay_lock | 25 | * mdsc->cap_delay_lock |
26 | * | 26 | * |
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e26437191333..a559c80f127a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c | |||
@@ -446,7 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
446 | return; | 446 | return; |
447 | } | 447 | } |
448 | 448 | ||
449 | spin_lock(&inode->i_lock); | 449 | spin_lock(&ci->i_ceph_lock); |
450 | used = __ceph_caps_used(ci); | 450 | used = __ceph_caps_used(ci); |
451 | dirty = __ceph_caps_dirty(ci); | 451 | dirty = __ceph_caps_dirty(ci); |
452 | 452 | ||
@@ -528,7 +528,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
528 | kfree(capsnap); | 528 | kfree(capsnap); |
529 | } | 529 | } |
530 | 530 | ||
531 | spin_unlock(&inode->i_lock); | 531 | spin_unlock(&ci->i_ceph_lock); |
532 | } | 532 | } |
533 | 533 | ||
534 | /* | 534 | /* |
@@ -537,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
537 | * | 537 | * |
538 | * If capsnap can now be flushed, add to snap_flush list, and return 1. | 538 | * If capsnap can now be flushed, add to snap_flush list, and return 1. |
539 | * | 539 | * |
540 | * Caller must hold i_lock. | 540 | * Caller must hold i_ceph_lock. |
541 | */ | 541 | */ |
542 | int __ceph_finish_cap_snap(struct ceph_inode_info *ci, | 542 | int __ceph_finish_cap_snap(struct ceph_inode_info *ci, |
543 | struct ceph_cap_snap *capsnap) | 543 | struct ceph_cap_snap *capsnap) |
@@ -739,9 +739,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc) | |||
739 | inode = &ci->vfs_inode; | 739 | inode = &ci->vfs_inode; |
740 | ihold(inode); | 740 | ihold(inode); |
741 | spin_unlock(&mdsc->snap_flush_lock); | 741 | spin_unlock(&mdsc->snap_flush_lock); |
742 | spin_lock(&inode->i_lock); | 742 | spin_lock(&ci->i_ceph_lock); |
743 | __ceph_flush_snaps(ci, &session, 0); | 743 | __ceph_flush_snaps(ci, &session, 0); |
744 | spin_unlock(&inode->i_lock); | 744 | spin_unlock(&ci->i_ceph_lock); |
745 | iput(inode); | 745 | iput(inode); |
746 | spin_lock(&mdsc->snap_flush_lock); | 746 | spin_lock(&mdsc->snap_flush_lock); |
747 | } | 747 | } |
@@ -847,7 +847,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
847 | continue; | 847 | continue; |
848 | ci = ceph_inode(inode); | 848 | ci = ceph_inode(inode); |
849 | 849 | ||
850 | spin_lock(&inode->i_lock); | 850 | spin_lock(&ci->i_ceph_lock); |
851 | if (!ci->i_snap_realm) | 851 | if (!ci->i_snap_realm) |
852 | goto skip_inode; | 852 | goto skip_inode; |
853 | /* | 853 | /* |
@@ -876,7 +876,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
876 | oldrealm = ci->i_snap_realm; | 876 | oldrealm = ci->i_snap_realm; |
877 | ci->i_snap_realm = realm; | 877 | ci->i_snap_realm = realm; |
878 | spin_unlock(&realm->inodes_with_caps_lock); | 878 | spin_unlock(&realm->inodes_with_caps_lock); |
879 | spin_unlock(&inode->i_lock); | 879 | spin_unlock(&ci->i_ceph_lock); |
880 | 880 | ||
881 | ceph_get_snap_realm(mdsc, realm); | 881 | ceph_get_snap_realm(mdsc, realm); |
882 | ceph_put_snap_realm(mdsc, oldrealm); | 882 | ceph_put_snap_realm(mdsc, oldrealm); |
@@ -885,7 +885,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
885 | continue; | 885 | continue; |
886 | 886 | ||
887 | skip_inode: | 887 | skip_inode: |
888 | spin_unlock(&inode->i_lock); | 888 | spin_unlock(&ci->i_ceph_lock); |
889 | iput(inode); | 889 | iput(inode); |
890 | } | 890 | } |
891 | 891 | ||
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a90846fac759..b48f15f101a0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -383,7 +383,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
383 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) | 383 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) |
384 | seq_printf(m, ",rsize=%d", fsopt->rsize); | 384 | seq_printf(m, ",rsize=%d", fsopt->rsize); |
385 | if (fsopt->rasize != CEPH_RASIZE_DEFAULT) | 385 | if (fsopt->rasize != CEPH_RASIZE_DEFAULT) |
386 | seq_printf(m, ",rasize=%d", fsopt->rsize); | 386 | seq_printf(m, ",rasize=%d", fsopt->rasize); |
387 | if (fsopt->congestion_kb != default_congestion_kb()) | 387 | if (fsopt->congestion_kb != default_congestion_kb()) |
388 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); | 388 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); |
389 | if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) | 389 | if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) |
@@ -638,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, | |||
638 | if (err == 0) { | 638 | if (err == 0) { |
639 | dout("open_root_inode success\n"); | 639 | dout("open_root_inode success\n"); |
640 | if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && | 640 | if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && |
641 | fsc->sb->s_root == NULL) | 641 | fsc->sb->s_root == NULL) { |
642 | root = d_alloc_root(req->r_target_inode); | 642 | root = d_alloc_root(req->r_target_inode); |
643 | else | 643 | ceph_init_dentry(root); |
644 | } else { | ||
644 | root = d_obtain_alias(req->r_target_inode); | 645 | root = d_obtain_alias(req->r_target_inode); |
646 | } | ||
645 | req->r_target_inode = NULL; | 647 | req->r_target_inode = NULL; |
646 | dout("open_root_inode success, root dentry is %p\n", root); | 648 | dout("open_root_inode success, root dentry is %p\n", root); |
647 | } else { | 649 | } else { |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 01bf189e08a9..edcbf3774a56 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -220,7 +220,7 @@ struct ceph_dentry_info { | |||
220 | * The locking for D_COMPLETE is a bit odd: | 220 | * The locking for D_COMPLETE is a bit odd: |
221 | * - we can clear it at almost any time (see ceph_d_prune) | 221 | * - we can clear it at almost any time (see ceph_d_prune) |
222 | * - it is only meaningful if: | 222 | * - it is only meaningful if: |
223 | * - we hold dir inode i_lock | 223 | * - we hold dir inode i_ceph_lock |
224 | * - we hold dir FILE_SHARED caps | 224 | * - we hold dir FILE_SHARED caps |
225 | * - the dentry D_COMPLETE is set | 225 | * - the dentry D_COMPLETE is set |
226 | */ | 226 | */ |
@@ -250,6 +250,8 @@ struct ceph_inode_xattrs_info { | |||
250 | struct ceph_inode_info { | 250 | struct ceph_inode_info { |
251 | struct ceph_vino i_vino; /* ceph ino + snap */ | 251 | struct ceph_vino i_vino; /* ceph ino + snap */ |
252 | 252 | ||
253 | spinlock_t i_ceph_lock; | ||
254 | |||
253 | u64 i_version; | 255 | u64 i_version; |
254 | u32 i_time_warp_seq; | 256 | u32 i_time_warp_seq; |
255 | 257 | ||
@@ -271,7 +273,7 @@ struct ceph_inode_info { | |||
271 | 273 | ||
272 | struct ceph_inode_xattrs_info i_xattrs; | 274 | struct ceph_inode_xattrs_info i_xattrs; |
273 | 275 | ||
274 | /* capabilities. protected _both_ by i_lock and cap->session's | 276 | /* capabilities. protected _both_ by i_ceph_lock and cap->session's |
275 | * s_mutex. */ | 277 | * s_mutex. */ |
276 | struct rb_root i_caps; /* cap list */ | 278 | struct rb_root i_caps; /* cap list */ |
277 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ | 279 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ |
@@ -437,18 +439,18 @@ static inline void ceph_i_clear(struct inode *inode, unsigned mask) | |||
437 | { | 439 | { |
438 | struct ceph_inode_info *ci = ceph_inode(inode); | 440 | struct ceph_inode_info *ci = ceph_inode(inode); |
439 | 441 | ||
440 | spin_lock(&inode->i_lock); | 442 | spin_lock(&ci->i_ceph_lock); |
441 | ci->i_ceph_flags &= ~mask; | 443 | ci->i_ceph_flags &= ~mask; |
442 | spin_unlock(&inode->i_lock); | 444 | spin_unlock(&ci->i_ceph_lock); |
443 | } | 445 | } |
444 | 446 | ||
445 | static inline void ceph_i_set(struct inode *inode, unsigned mask) | 447 | static inline void ceph_i_set(struct inode *inode, unsigned mask) |
446 | { | 448 | { |
447 | struct ceph_inode_info *ci = ceph_inode(inode); | 449 | struct ceph_inode_info *ci = ceph_inode(inode); |
448 | 450 | ||
449 | spin_lock(&inode->i_lock); | 451 | spin_lock(&ci->i_ceph_lock); |
450 | ci->i_ceph_flags |= mask; | 452 | ci->i_ceph_flags |= mask; |
451 | spin_unlock(&inode->i_lock); | 453 | spin_unlock(&ci->i_ceph_lock); |
452 | } | 454 | } |
453 | 455 | ||
454 | static inline bool ceph_i_test(struct inode *inode, unsigned mask) | 456 | static inline bool ceph_i_test(struct inode *inode, unsigned mask) |
@@ -456,9 +458,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) | |||
456 | struct ceph_inode_info *ci = ceph_inode(inode); | 458 | struct ceph_inode_info *ci = ceph_inode(inode); |
457 | bool r; | 459 | bool r; |
458 | 460 | ||
459 | spin_lock(&inode->i_lock); | 461 | spin_lock(&ci->i_ceph_lock); |
460 | r = (ci->i_ceph_flags & mask) == mask; | 462 | r = (ci->i_ceph_flags & mask) == mask; |
461 | spin_unlock(&inode->i_lock); | 463 | spin_unlock(&ci->i_ceph_lock); |
462 | return r; | 464 | return r; |
463 | } | 465 | } |
464 | 466 | ||
@@ -508,9 +510,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, | |||
508 | static inline int ceph_caps_issued(struct ceph_inode_info *ci) | 510 | static inline int ceph_caps_issued(struct ceph_inode_info *ci) |
509 | { | 511 | { |
510 | int issued; | 512 | int issued; |
511 | spin_lock(&ci->vfs_inode.i_lock); | 513 | spin_lock(&ci->i_ceph_lock); |
512 | issued = __ceph_caps_issued(ci, NULL); | 514 | issued = __ceph_caps_issued(ci, NULL); |
513 | spin_unlock(&ci->vfs_inode.i_lock); | 515 | spin_unlock(&ci->i_ceph_lock); |
514 | return issued; | 516 | return issued; |
515 | } | 517 | } |
516 | 518 | ||
@@ -518,9 +520,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, | |||
518 | int touch) | 520 | int touch) |
519 | { | 521 | { |
520 | int r; | 522 | int r; |
521 | spin_lock(&ci->vfs_inode.i_lock); | 523 | spin_lock(&ci->i_ceph_lock); |
522 | r = __ceph_caps_issued_mask(ci, mask, touch); | 524 | r = __ceph_caps_issued_mask(ci, mask, touch); |
523 | spin_unlock(&ci->vfs_inode.i_lock); | 525 | spin_unlock(&ci->i_ceph_lock); |
524 | return r; | 526 | return r; |
525 | } | 527 | } |
526 | 528 | ||
@@ -743,10 +745,9 @@ extern int ceph_add_cap(struct inode *inode, | |||
743 | extern void __ceph_remove_cap(struct ceph_cap *cap); | 745 | extern void __ceph_remove_cap(struct ceph_cap *cap); |
744 | static inline void ceph_remove_cap(struct ceph_cap *cap) | 746 | static inline void ceph_remove_cap(struct ceph_cap *cap) |
745 | { | 747 | { |
746 | struct inode *inode = &cap->ci->vfs_inode; | 748 | spin_lock(&cap->ci->i_ceph_lock); |
747 | spin_lock(&inode->i_lock); | ||
748 | __ceph_remove_cap(cap); | 749 | __ceph_remove_cap(cap); |
749 | spin_unlock(&inode->i_lock); | 750 | spin_unlock(&cap->ci->i_ceph_lock); |
750 | } | 751 | } |
751 | extern void ceph_put_cap(struct ceph_mds_client *mdsc, | 752 | extern void ceph_put_cap(struct ceph_mds_client *mdsc, |
752 | struct ceph_cap *cap); | 753 | struct ceph_cap *cap); |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 96c6739a0280..a5e36e4488a7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -343,8 +343,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) | |||
343 | } | 343 | } |
344 | 344 | ||
345 | static int __build_xattrs(struct inode *inode) | 345 | static int __build_xattrs(struct inode *inode) |
346 | __releases(inode->i_lock) | 346 | __releases(ci->i_ceph_lock) |
347 | __acquires(inode->i_lock) | 347 | __acquires(ci->i_ceph_lock) |
348 | { | 348 | { |
349 | u32 namelen; | 349 | u32 namelen; |
350 | u32 numattr = 0; | 350 | u32 numattr = 0; |
@@ -372,7 +372,7 @@ start: | |||
372 | end = p + ci->i_xattrs.blob->vec.iov_len; | 372 | end = p + ci->i_xattrs.blob->vec.iov_len; |
373 | ceph_decode_32_safe(&p, end, numattr, bad); | 373 | ceph_decode_32_safe(&p, end, numattr, bad); |
374 | xattr_version = ci->i_xattrs.version; | 374 | xattr_version = ci->i_xattrs.version; |
375 | spin_unlock(&inode->i_lock); | 375 | spin_unlock(&ci->i_ceph_lock); |
376 | 376 | ||
377 | xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), | 377 | xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), |
378 | GFP_NOFS); | 378 | GFP_NOFS); |
@@ -387,7 +387,7 @@ start: | |||
387 | goto bad_lock; | 387 | goto bad_lock; |
388 | } | 388 | } |
389 | 389 | ||
390 | spin_lock(&inode->i_lock); | 390 | spin_lock(&ci->i_ceph_lock); |
391 | if (ci->i_xattrs.version != xattr_version) { | 391 | if (ci->i_xattrs.version != xattr_version) { |
392 | /* lost a race, retry */ | 392 | /* lost a race, retry */ |
393 | for (i = 0; i < numattr; i++) | 393 | for (i = 0; i < numattr; i++) |
@@ -418,7 +418,7 @@ start: | |||
418 | 418 | ||
419 | return err; | 419 | return err; |
420 | bad_lock: | 420 | bad_lock: |
421 | spin_lock(&inode->i_lock); | 421 | spin_lock(&ci->i_ceph_lock); |
422 | bad: | 422 | bad: |
423 | if (xattrs) { | 423 | if (xattrs) { |
424 | for (i = 0; i < numattr; i++) | 424 | for (i = 0; i < numattr; i++) |
@@ -512,7 +512,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
512 | if (vxattrs) | 512 | if (vxattrs) |
513 | vxattr = ceph_match_vxattr(vxattrs, name); | 513 | vxattr = ceph_match_vxattr(vxattrs, name); |
514 | 514 | ||
515 | spin_lock(&inode->i_lock); | 515 | spin_lock(&ci->i_ceph_lock); |
516 | dout("getxattr %p ver=%lld index_ver=%lld\n", inode, | 516 | dout("getxattr %p ver=%lld index_ver=%lld\n", inode, |
517 | ci->i_xattrs.version, ci->i_xattrs.index_version); | 517 | ci->i_xattrs.version, ci->i_xattrs.index_version); |
518 | 518 | ||
@@ -520,14 +520,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
520 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { | 520 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { |
521 | goto get_xattr; | 521 | goto get_xattr; |
522 | } else { | 522 | } else { |
523 | spin_unlock(&inode->i_lock); | 523 | spin_unlock(&ci->i_ceph_lock); |
524 | /* get xattrs from mds (if we don't already have them) */ | 524 | /* get xattrs from mds (if we don't already have them) */ |
525 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); | 525 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); |
526 | if (err) | 526 | if (err) |
527 | return err; | 527 | return err; |
528 | } | 528 | } |
529 | 529 | ||
530 | spin_lock(&inode->i_lock); | 530 | spin_lock(&ci->i_ceph_lock); |
531 | 531 | ||
532 | if (vxattr && vxattr->readonly) { | 532 | if (vxattr && vxattr->readonly) { |
533 | err = vxattr->getxattr_cb(ci, value, size); | 533 | err = vxattr->getxattr_cb(ci, value, size); |
@@ -558,7 +558,7 @@ get_xattr: | |||
558 | memcpy(value, xattr->val, xattr->val_len); | 558 | memcpy(value, xattr->val, xattr->val_len); |
559 | 559 | ||
560 | out: | 560 | out: |
561 | spin_unlock(&inode->i_lock); | 561 | spin_unlock(&ci->i_ceph_lock); |
562 | return err; | 562 | return err; |
563 | } | 563 | } |
564 | 564 | ||
@@ -573,7 +573,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | |||
573 | u32 len; | 573 | u32 len; |
574 | int i; | 574 | int i; |
575 | 575 | ||
576 | spin_lock(&inode->i_lock); | 576 | spin_lock(&ci->i_ceph_lock); |
577 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, | 577 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, |
578 | ci->i_xattrs.version, ci->i_xattrs.index_version); | 578 | ci->i_xattrs.version, ci->i_xattrs.index_version); |
579 | 579 | ||
@@ -581,13 +581,13 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | |||
581 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { | 581 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { |
582 | goto list_xattr; | 582 | goto list_xattr; |
583 | } else { | 583 | } else { |
584 | spin_unlock(&inode->i_lock); | 584 | spin_unlock(&ci->i_ceph_lock); |
585 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); | 585 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); |
586 | if (err) | 586 | if (err) |
587 | return err; | 587 | return err; |
588 | } | 588 | } |
589 | 589 | ||
590 | spin_lock(&inode->i_lock); | 590 | spin_lock(&ci->i_ceph_lock); |
591 | 591 | ||
592 | err = __build_xattrs(inode); | 592 | err = __build_xattrs(inode); |
593 | if (err < 0) | 593 | if (err < 0) |
@@ -619,7 +619,7 @@ list_xattr: | |||
619 | } | 619 | } |
620 | 620 | ||
621 | out: | 621 | out: |
622 | spin_unlock(&inode->i_lock); | 622 | spin_unlock(&ci->i_ceph_lock); |
623 | return err; | 623 | return err; |
624 | } | 624 | } |
625 | 625 | ||
@@ -739,7 +739,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, | |||
739 | if (!xattr) | 739 | if (!xattr) |
740 | goto out; | 740 | goto out; |
741 | 741 | ||
742 | spin_lock(&inode->i_lock); | 742 | spin_lock(&ci->i_ceph_lock); |
743 | retry: | 743 | retry: |
744 | issued = __ceph_caps_issued(ci, NULL); | 744 | issued = __ceph_caps_issued(ci, NULL); |
745 | if (!(issued & CEPH_CAP_XATTR_EXCL)) | 745 | if (!(issued & CEPH_CAP_XATTR_EXCL)) |
@@ -752,12 +752,12 @@ retry: | |||
752 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { | 752 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { |
753 | struct ceph_buffer *blob = NULL; | 753 | struct ceph_buffer *blob = NULL; |
754 | 754 | ||
755 | spin_unlock(&inode->i_lock); | 755 | spin_unlock(&ci->i_ceph_lock); |
756 | dout(" preaallocating new blob size=%d\n", required_blob_size); | 756 | dout(" preaallocating new blob size=%d\n", required_blob_size); |
757 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); | 757 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); |
758 | if (!blob) | 758 | if (!blob) |
759 | goto out; | 759 | goto out; |
760 | spin_lock(&inode->i_lock); | 760 | spin_lock(&ci->i_ceph_lock); |
761 | if (ci->i_xattrs.prealloc_blob) | 761 | if (ci->i_xattrs.prealloc_blob) |
762 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); | 762 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); |
763 | ci->i_xattrs.prealloc_blob = blob; | 763 | ci->i_xattrs.prealloc_blob = blob; |
@@ -770,13 +770,13 @@ retry: | |||
770 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); | 770 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); |
771 | ci->i_xattrs.dirty = true; | 771 | ci->i_xattrs.dirty = true; |
772 | inode->i_ctime = CURRENT_TIME; | 772 | inode->i_ctime = CURRENT_TIME; |
773 | spin_unlock(&inode->i_lock); | 773 | spin_unlock(&ci->i_ceph_lock); |
774 | if (dirty) | 774 | if (dirty) |
775 | __mark_inode_dirty(inode, dirty); | 775 | __mark_inode_dirty(inode, dirty); |
776 | return err; | 776 | return err; |
777 | 777 | ||
778 | do_sync: | 778 | do_sync: |
779 | spin_unlock(&inode->i_lock); | 779 | spin_unlock(&ci->i_ceph_lock); |
780 | err = ceph_sync_setxattr(dentry, name, value, size, flags); | 780 | err = ceph_sync_setxattr(dentry, name, value, size, flags); |
781 | out: | 781 | out: |
782 | kfree(newname); | 782 | kfree(newname); |
@@ -833,7 +833,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) | |||
833 | return -EOPNOTSUPP; | 833 | return -EOPNOTSUPP; |
834 | } | 834 | } |
835 | 835 | ||
836 | spin_lock(&inode->i_lock); | 836 | spin_lock(&ci->i_ceph_lock); |
837 | __build_xattrs(inode); | 837 | __build_xattrs(inode); |
838 | issued = __ceph_caps_issued(ci, NULL); | 838 | issued = __ceph_caps_issued(ci, NULL); |
839 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); | 839 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); |
@@ -846,12 +846,12 @@ int ceph_removexattr(struct dentry *dentry, const char *name) | |||
846 | ci->i_xattrs.dirty = true; | 846 | ci->i_xattrs.dirty = true; |
847 | inode->i_ctime = CURRENT_TIME; | 847 | inode->i_ctime = CURRENT_TIME; |
848 | 848 | ||
849 | spin_unlock(&inode->i_lock); | 849 | spin_unlock(&ci->i_ceph_lock); |
850 | if (dirty) | 850 | if (dirty) |
851 | __mark_inode_dirty(inode, dirty); | 851 | __mark_inode_dirty(inode, dirty); |
852 | return err; | 852 | return err; |
853 | do_sync: | 853 | do_sync: |
854 | spin_unlock(&inode->i_lock); | 854 | spin_unlock(&ci->i_ceph_lock); |
855 | err = ceph_send_removexattr(dentry, name); | 855 | err = ceph_send_removexattr(dentry, name); |
856 | return err; | 856 | return err; |
857 | } | 857 | } |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d6a972df0338..8cd4b52d4217 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
@@ -441,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, | |||
441 | smb_msg.msg_controllen = 0; | 441 | smb_msg.msg_controllen = 0; |
442 | 442 | ||
443 | for (total_read = 0; to_read; total_read += length, to_read -= length) { | 443 | for (total_read = 0; to_read; total_read += length, to_read -= length) { |
444 | try_to_freeze(); | ||
445 | |||
444 | if (server_unresponsive(server)) { | 446 | if (server_unresponsive(server)) { |
445 | total_read = -EAGAIN; | 447 | total_read = -EAGAIN; |
446 | break; | 448 | break; |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cf0b1539b321..4dd9283885e7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -702,6 +702,13 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, | |||
702 | lock->type, lock->netfid, conf_lock); | 702 | lock->type, lock->netfid, conf_lock); |
703 | } | 703 | } |
704 | 704 | ||
705 | /* | ||
706 | * Check if there is another lock that prevents us from setting the lock | ||
707 | * (mandatory style). If such a lock exists, update the flock structure with | ||
708 | * its properties. Otherwise, set the flock type to F_UNLCK if we can cache | ||
709 | * brlocks or leave it the same if we can't. Returns 0 if we don't need to send | ||
710 | * a request to the server or 1 otherwise. | ||
711 | */ | ||
705 | static int | 712 | static int |
706 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | 713 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, |
707 | __u8 type, __u16 netfid, struct file_lock *flock) | 714 | __u8 type, __u16 netfid, struct file_lock *flock) |
@@ -739,6 +746,12 @@ cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) | |||
739 | mutex_unlock(&cinode->lock_mutex); | 746 | mutex_unlock(&cinode->lock_mutex); |
740 | } | 747 | } |
741 | 748 | ||
749 | /* | ||
750 | * Set the byte-range lock (mandatory style). Returns: | ||
751 | * 1) 0, if we set the lock and don't need to send a request to the server; | ||
752 | * 2) 1, if no locks prevent us but we need to send a request to the server; | ||
753 | * 3) -EACCES, if there is a lock that prevents us and wait is false. | ||
754 | */ | ||
742 | static int | 755 | static int |
743 | cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, | 756 | cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, |
744 | bool wait) | 757 | bool wait) |
@@ -778,6 +791,13 @@ try_again: | |||
778 | return rc; | 791 | return rc; |
779 | } | 792 | } |
780 | 793 | ||
794 | /* | ||
795 | * Check if there is another lock that prevents us from setting the lock | ||
796 | * (posix style). If such a lock exists, update the flock structure with | ||
797 | * its properties. Otherwise, set the flock type to F_UNLCK if we can cache | ||
798 | * brlocks or leave it the same if we can't. Returns 0 if we don't need to send | ||
799 | * a request to the server or 1 otherwise. | ||
800 | */ | ||
781 | static int | 801 | static int |
782 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) | 802 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) |
783 | { | 803 | { |
@@ -800,6 +820,12 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) | |||
800 | return rc; | 820 | return rc; |
801 | } | 821 | } |
802 | 822 | ||
823 | /* | ||
824 | * Set the byte-range lock (posix style). Returns: | ||
825 | * 1) 0, if we set the lock and don't need to request to the server; | ||
826 | * 2) 1, if we need to request to the server; | ||
827 | * 3) <0, if the error occurs while setting the lock. | ||
828 | */ | ||
803 | static int | 829 | static int |
804 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) | 830 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) |
805 | { | 831 | { |
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5de03ec20144..a090bbe6ee29 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, | |||
554 | rc); | 554 | rc); |
555 | return rc; | 555 | return rc; |
556 | } | 556 | } |
557 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); | 557 | /* FindFirst/Next set last_entry to NULL on malformed reply */ |
558 | if (cifsFile->srch_inf.last_entry) | ||
559 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, | ||
560 | cifsFile); | ||
558 | } | 561 | } |
559 | 562 | ||
560 | while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && | 563 | while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && |
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, | |||
562 | cFYI(1, "calling findnext2"); | 565 | cFYI(1, "calling findnext2"); |
563 | rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, | 566 | rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, |
564 | &cifsFile->srch_inf); | 567 | &cifsFile->srch_inf); |
565 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); | 568 | /* FindFirst/Next set last_entry to NULL on malformed reply */ |
569 | if (cifsFile->srch_inf.last_entry) | ||
570 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, | ||
571 | cifsFile); | ||
566 | if (rc) | 572 | if (rc) |
567 | return -ENOENT; | 573 | return -ENOENT; |
568 | } | 574 | } |
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 7cacba12b8f1..80d850881938 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c | |||
@@ -209,7 +209,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, | |||
209 | { | 209 | { |
210 | int rc; | 210 | int rc; |
211 | int len; | 211 | int len; |
212 | __u16 wpwd[129]; | 212 | __le16 wpwd[129]; |
213 | 213 | ||
214 | /* Password cannot be longer than 128 characters */ | 214 | /* Password cannot be longer than 128 characters */ |
215 | if (passwd) /* Password must be converted to NT unicode */ | 215 | if (passwd) /* Password must be converted to NT unicode */ |
@@ -219,8 +219,8 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, | |||
219 | *wpwd = 0; /* Ensure string is null terminated */ | 219 | *wpwd = 0; /* Ensure string is null terminated */ |
220 | } | 220 | } |
221 | 221 | ||
222 | rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16)); | 222 | rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); |
223 | memset(wpwd, 0, 129 * sizeof(__u16)); | 223 | memset(wpwd, 0, 129 * sizeof(__le16)); |
224 | 224 | ||
225 | return rc; | 225 | return rc; |
226 | } | 226 | } |
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index ca418aaf6352..9d8715c45f25 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c | |||
@@ -292,7 +292,7 @@ int __init configfs_inode_init(void) | |||
292 | return bdi_init(&configfs_backing_dev_info); | 292 | return bdi_init(&configfs_backing_dev_info); |
293 | } | 293 | } |
294 | 294 | ||
295 | void __exit configfs_inode_exit(void) | 295 | void configfs_inode_exit(void) |
296 | { | 296 | { |
297 | bdi_destroy(&configfs_backing_dev_info); | 297 | bdi_destroy(&configfs_backing_dev_info); |
298 | } | 298 | } |
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index ecc62178beda..276e15cafd58 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c | |||
@@ -143,28 +143,26 @@ static int __init configfs_init(void) | |||
143 | goto out; | 143 | goto out; |
144 | 144 | ||
145 | config_kobj = kobject_create_and_add("config", kernel_kobj); | 145 | config_kobj = kobject_create_and_add("config", kernel_kobj); |
146 | if (!config_kobj) { | 146 | if (!config_kobj) |
147 | kmem_cache_destroy(configfs_dir_cachep); | 147 | goto out2; |
148 | configfs_dir_cachep = NULL; | 148 | |
149 | goto out; | 149 | err = configfs_inode_init(); |
150 | } | 150 | if (err) |
151 | goto out3; | ||
151 | 152 | ||
152 | err = register_filesystem(&configfs_fs_type); | 153 | err = register_filesystem(&configfs_fs_type); |
153 | if (err) { | 154 | if (err) |
154 | printk(KERN_ERR "configfs: Unable to register filesystem!\n"); | 155 | goto out4; |
155 | kobject_put(config_kobj); | ||
156 | kmem_cache_destroy(configfs_dir_cachep); | ||
157 | configfs_dir_cachep = NULL; | ||
158 | goto out; | ||
159 | } | ||
160 | 156 | ||
161 | err = configfs_inode_init(); | 157 | return 0; |
162 | if (err) { | 158 | out4: |
163 | unregister_filesystem(&configfs_fs_type); | 159 | printk(KERN_ERR "configfs: Unable to register filesystem!\n"); |
164 | kobject_put(config_kobj); | 160 | configfs_inode_exit(); |
165 | kmem_cache_destroy(configfs_dir_cachep); | 161 | out3: |
166 | configfs_dir_cachep = NULL; | 162 | kobject_put(config_kobj); |
167 | } | 163 | out2: |
164 | kmem_cache_destroy(configfs_dir_cachep); | ||
165 | configfs_dir_cachep = NULL; | ||
168 | out: | 166 | out: |
169 | return err; | 167 | return err; |
170 | } | 168 | } |
diff --git a/fs/dcache.c b/fs/dcache.c index a901c6901bce..89509b5a090e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/bit_spinlock.h> | 36 | #include <linux/bit_spinlock.h> |
37 | #include <linux/rculist_bl.h> | 37 | #include <linux/rculist_bl.h> |
38 | #include <linux/prefetch.h> | 38 | #include <linux/prefetch.h> |
39 | #include <linux/ratelimit.h> | ||
39 | #include "internal.h" | 40 | #include "internal.h" |
40 | 41 | ||
41 | /* | 42 | /* |
@@ -2383,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) | |||
2383 | actual = __d_unalias(inode, dentry, alias); | 2384 | actual = __d_unalias(inode, dentry, alias); |
2384 | } | 2385 | } |
2385 | write_sequnlock(&rename_lock); | 2386 | write_sequnlock(&rename_lock); |
2386 | if (IS_ERR(actual)) | 2387 | if (IS_ERR(actual)) { |
2388 | if (PTR_ERR(actual) == -ELOOP) | ||
2389 | pr_warn_ratelimited( | ||
2390 | "VFS: Lookup of '%s' in %s %s" | ||
2391 | " would have caused loop\n", | ||
2392 | dentry->d_name.name, | ||
2393 | inode->i_sb->s_type->name, | ||
2394 | inode->i_sb->s_id); | ||
2387 | dput(alias); | 2395 | dput(alias); |
2396 | } | ||
2388 | goto out_nolock; | 2397 | goto out_nolock; |
2389 | } | 2398 | } |
2390 | } | 2399 | } |
@@ -2430,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) | |||
2430 | /** | 2439 | /** |
2431 | * prepend_path - Prepend path string to a buffer | 2440 | * prepend_path - Prepend path string to a buffer |
2432 | * @path: the dentry/vfsmount to report | 2441 | * @path: the dentry/vfsmount to report |
2433 | * @root: root vfsmnt/dentry (may be modified by this function) | 2442 | * @root: root vfsmnt/dentry |
2434 | * @buffer: pointer to the end of the buffer | 2443 | * @buffer: pointer to the end of the buffer |
2435 | * @buflen: pointer to buffer length | 2444 | * @buflen: pointer to buffer length |
2436 | * | 2445 | * |
2437 | * Caller holds the rename_lock. | 2446 | * Caller holds the rename_lock. |
2438 | * | ||
2439 | * If path is not reachable from the supplied root, then the value of | ||
2440 | * root is changed (without modifying refcounts). | ||
2441 | */ | 2447 | */ |
2442 | static int prepend_path(const struct path *path, struct path *root, | 2448 | static int prepend_path(const struct path *path, |
2449 | const struct path *root, | ||
2443 | char **buffer, int *buflen) | 2450 | char **buffer, int *buflen) |
2444 | { | 2451 | { |
2445 | struct dentry *dentry = path->dentry; | 2452 | struct dentry *dentry = path->dentry; |
@@ -2474,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, | |||
2474 | dentry = parent; | 2481 | dentry = parent; |
2475 | } | 2482 | } |
2476 | 2483 | ||
2477 | out: | ||
2478 | if (!error && !slash) | 2484 | if (!error && !slash) |
2479 | error = prepend(buffer, buflen, "/", 1); | 2485 | error = prepend(buffer, buflen, "/", 1); |
2480 | 2486 | ||
2487 | out: | ||
2481 | br_read_unlock(vfsmount_lock); | 2488 | br_read_unlock(vfsmount_lock); |
2482 | return error; | 2489 | return error; |
2483 | 2490 | ||
@@ -2491,15 +2498,17 @@ global_root: | |||
2491 | WARN(1, "Root dentry has weird name <%.*s>\n", | 2498 | WARN(1, "Root dentry has weird name <%.*s>\n", |
2492 | (int) dentry->d_name.len, dentry->d_name.name); | 2499 | (int) dentry->d_name.len, dentry->d_name.name); |
2493 | } | 2500 | } |
2494 | root->mnt = vfsmnt; | 2501 | if (!slash) |
2495 | root->dentry = dentry; | 2502 | error = prepend(buffer, buflen, "/", 1); |
2503 | if (!error) | ||
2504 | error = vfsmnt->mnt_ns ? 1 : 2; | ||
2496 | goto out; | 2505 | goto out; |
2497 | } | 2506 | } |
2498 | 2507 | ||
2499 | /** | 2508 | /** |
2500 | * __d_path - return the path of a dentry | 2509 | * __d_path - return the path of a dentry |
2501 | * @path: the dentry/vfsmount to report | 2510 | * @path: the dentry/vfsmount to report |
2502 | * @root: root vfsmnt/dentry (may be modified by this function) | 2511 | * @root: root vfsmnt/dentry |
2503 | * @buf: buffer to return value in | 2512 | * @buf: buffer to return value in |
2504 | * @buflen: buffer length | 2513 | * @buflen: buffer length |
2505 | * | 2514 | * |
@@ -2510,10 +2519,10 @@ global_root: | |||
2510 | * | 2519 | * |
2511 | * "buflen" should be positive. | 2520 | * "buflen" should be positive. |
2512 | * | 2521 | * |
2513 | * If path is not reachable from the supplied root, then the value of | 2522 | * If the path is not reachable from the supplied root, return %NULL. |
2514 | * root is changed (without modifying refcounts). | ||
2515 | */ | 2523 | */ |
2516 | char *__d_path(const struct path *path, struct path *root, | 2524 | char *__d_path(const struct path *path, |
2525 | const struct path *root, | ||
2517 | char *buf, int buflen) | 2526 | char *buf, int buflen) |
2518 | { | 2527 | { |
2519 | char *res = buf + buflen; | 2528 | char *res = buf + buflen; |
@@ -2524,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, | |||
2524 | error = prepend_path(path, root, &res, &buflen); | 2533 | error = prepend_path(path, root, &res, &buflen); |
2525 | write_sequnlock(&rename_lock); | 2534 | write_sequnlock(&rename_lock); |
2526 | 2535 | ||
2527 | if (error) | 2536 | if (error < 0) |
2537 | return ERR_PTR(error); | ||
2538 | if (error > 0) | ||
2539 | return NULL; | ||
2540 | return res; | ||
2541 | } | ||
2542 | |||
2543 | char *d_absolute_path(const struct path *path, | ||
2544 | char *buf, int buflen) | ||
2545 | { | ||
2546 | struct path root = {}; | ||
2547 | char *res = buf + buflen; | ||
2548 | int error; | ||
2549 | |||
2550 | prepend(&res, &buflen, "\0", 1); | ||
2551 | write_seqlock(&rename_lock); | ||
2552 | error = prepend_path(path, &root, &res, &buflen); | ||
2553 | write_sequnlock(&rename_lock); | ||
2554 | |||
2555 | if (error > 1) | ||
2556 | error = -EINVAL; | ||
2557 | if (error < 0) | ||
2528 | return ERR_PTR(error); | 2558 | return ERR_PTR(error); |
2529 | return res; | 2559 | return res; |
2530 | } | 2560 | } |
@@ -2532,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, | |||
2532 | /* | 2562 | /* |
2533 | * same as __d_path but appends "(deleted)" for unlinked files. | 2563 | * same as __d_path but appends "(deleted)" for unlinked files. |
2534 | */ | 2564 | */ |
2535 | static int path_with_deleted(const struct path *path, struct path *root, | 2565 | static int path_with_deleted(const struct path *path, |
2536 | char **buf, int *buflen) | 2566 | const struct path *root, |
2567 | char **buf, int *buflen) | ||
2537 | { | 2568 | { |
2538 | prepend(buf, buflen, "\0", 1); | 2569 | prepend(buf, buflen, "\0", 1); |
2539 | if (d_unlinked(path->dentry)) { | 2570 | if (d_unlinked(path->dentry)) { |
@@ -2570,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) | |||
2570 | { | 2601 | { |
2571 | char *res = buf + buflen; | 2602 | char *res = buf + buflen; |
2572 | struct path root; | 2603 | struct path root; |
2573 | struct path tmp; | ||
2574 | int error; | 2604 | int error; |
2575 | 2605 | ||
2576 | /* | 2606 | /* |
@@ -2585,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) | |||
2585 | 2615 | ||
2586 | get_fs_root(current->fs, &root); | 2616 | get_fs_root(current->fs, &root); |
2587 | write_seqlock(&rename_lock); | 2617 | write_seqlock(&rename_lock); |
2588 | tmp = root; | 2618 | error = path_with_deleted(path, &root, &res, &buflen); |
2589 | error = path_with_deleted(path, &tmp, &res, &buflen); | 2619 | if (error < 0) |
2590 | if (error) | ||
2591 | res = ERR_PTR(error); | 2620 | res = ERR_PTR(error); |
2592 | write_sequnlock(&rename_lock); | 2621 | write_sequnlock(&rename_lock); |
2593 | path_put(&root); | 2622 | path_put(&root); |
@@ -2608,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) | |||
2608 | { | 2637 | { |
2609 | char *res = buf + buflen; | 2638 | char *res = buf + buflen; |
2610 | struct path root; | 2639 | struct path root; |
2611 | struct path tmp; | ||
2612 | int error; | 2640 | int error; |
2613 | 2641 | ||
2614 | if (path->dentry->d_op && path->dentry->d_op->d_dname) | 2642 | if (path->dentry->d_op && path->dentry->d_op->d_dname) |
@@ -2616,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) | |||
2616 | 2644 | ||
2617 | get_fs_root(current->fs, &root); | 2645 | get_fs_root(current->fs, &root); |
2618 | write_seqlock(&rename_lock); | 2646 | write_seqlock(&rename_lock); |
2619 | tmp = root; | 2647 | error = path_with_deleted(path, &root, &res, &buflen); |
2620 | error = path_with_deleted(path, &tmp, &res, &buflen); | 2648 | if (error > 0) |
2621 | if (!error && !path_equal(&tmp, &root)) | ||
2622 | error = prepend_unreachable(&res, &buflen); | 2649 | error = prepend_unreachable(&res, &buflen); |
2623 | write_sequnlock(&rename_lock); | 2650 | write_sequnlock(&rename_lock); |
2624 | path_put(&root); | 2651 | path_put(&root); |
@@ -2749,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) | |||
2749 | write_seqlock(&rename_lock); | 2776 | write_seqlock(&rename_lock); |
2750 | if (!d_unlinked(pwd.dentry)) { | 2777 | if (!d_unlinked(pwd.dentry)) { |
2751 | unsigned long len; | 2778 | unsigned long len; |
2752 | struct path tmp = root; | ||
2753 | char *cwd = page + PAGE_SIZE; | 2779 | char *cwd = page + PAGE_SIZE; |
2754 | int buflen = PAGE_SIZE; | 2780 | int buflen = PAGE_SIZE; |
2755 | 2781 | ||
2756 | prepend(&cwd, &buflen, "\0", 1); | 2782 | prepend(&cwd, &buflen, "\0", 1); |
2757 | error = prepend_path(&pwd, &tmp, &cwd, &buflen); | 2783 | error = prepend_path(&pwd, &root, &cwd, &buflen); |
2758 | write_sequnlock(&rename_lock); | 2784 | write_sequnlock(&rename_lock); |
2759 | 2785 | ||
2760 | if (error) | 2786 | if (error < 0) |
2761 | goto out; | 2787 | goto out; |
2762 | 2788 | ||
2763 | /* Unreachable from current root */ | 2789 | /* Unreachable from current root */ |
2764 | if (!path_equal(&tmp, &root)) { | 2790 | if (error > 0) { |
2765 | error = prepend_unreachable(&cwd, &buflen); | 2791 | error = prepend_unreachable(&cwd, &buflen); |
2766 | if (error) | 2792 | if (error) |
2767 | goto out; | 2793 | goto out; |
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bde3b9f..2a834255c75d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c | |||
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( | |||
967 | 967 | ||
968 | /** | 968 | /** |
969 | * ecryptfs_new_file_context | 969 | * ecryptfs_new_file_context |
970 | * @ecryptfs_dentry: The eCryptfs dentry | 970 | * @ecryptfs_inode: The eCryptfs inode |
971 | * | 971 | * |
972 | * If the crypto context for the file has not yet been established, | 972 | * If the crypto context for the file has not yet been established, |
973 | * this is where we do that. Establishing a new crypto context | 973 | * this is where we do that. Establishing a new crypto context |
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( | |||
984 | * | 984 | * |
985 | * Returns zero on success; non-zero otherwise | 985 | * Returns zero on success; non-zero otherwise |
986 | */ | 986 | */ |
987 | int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) | 987 | int ecryptfs_new_file_context(struct inode *ecryptfs_inode) |
988 | { | 988 | { |
989 | struct ecryptfs_crypt_stat *crypt_stat = | 989 | struct ecryptfs_crypt_stat *crypt_stat = |
990 | &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; | 990 | &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; |
991 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | 991 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = |
992 | &ecryptfs_superblock_to_private( | 992 | &ecryptfs_superblock_to_private( |
993 | ecryptfs_dentry->d_sb)->mount_crypt_stat; | 993 | ecryptfs_inode->i_sb)->mount_crypt_stat; |
994 | int cipher_name_len; | 994 | int cipher_name_len; |
995 | int rc = 0; | 995 | int rc = 0; |
996 | 996 | ||
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, | |||
1299 | } | 1299 | } |
1300 | 1300 | ||
1301 | static int | 1301 | static int |
1302 | ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, | 1302 | ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, |
1303 | char *virt, size_t virt_len) | 1303 | char *virt, size_t virt_len) |
1304 | { | 1304 | { |
1305 | int rc; | 1305 | int rc; |
1306 | 1306 | ||
1307 | rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, | 1307 | rc = ecryptfs_write_lower(ecryptfs_inode, virt, |
1308 | 0, virt_len); | 1308 | 0, virt_len); |
1309 | if (rc < 0) | 1309 | if (rc < 0) |
1310 | printk(KERN_ERR "%s: Error attempting to write header " | 1310 | printk(KERN_ERR "%s: Error attempting to write header " |
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, | |||
1338 | 1338 | ||
1339 | /** | 1339 | /** |
1340 | * ecryptfs_write_metadata | 1340 | * ecryptfs_write_metadata |
1341 | * @ecryptfs_dentry: The eCryptfs dentry | 1341 | * @ecryptfs_dentry: The eCryptfs dentry, which should be negative |
1342 | * @ecryptfs_inode: The newly created eCryptfs inode | ||
1342 | * | 1343 | * |
1343 | * Write the file headers out. This will likely involve a userspace | 1344 | * Write the file headers out. This will likely involve a userspace |
1344 | * callout, in which the session key is encrypted with one or more | 1345 | * callout, in which the session key is encrypted with one or more |
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, | |||
1348 | * | 1349 | * |
1349 | * Returns zero on success; non-zero on error | 1350 | * Returns zero on success; non-zero on error |
1350 | */ | 1351 | */ |
1351 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) | 1352 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, |
1353 | struct inode *ecryptfs_inode) | ||
1352 | { | 1354 | { |
1353 | struct ecryptfs_crypt_stat *crypt_stat = | 1355 | struct ecryptfs_crypt_stat *crypt_stat = |
1354 | &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; | 1356 | &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; |
1355 | unsigned int order; | 1357 | unsigned int order; |
1356 | char *virt; | 1358 | char *virt; |
1357 | size_t virt_len; | 1359 | size_t virt_len; |
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) | |||
1391 | rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, | 1393 | rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, |
1392 | size); | 1394 | size); |
1393 | else | 1395 | else |
1394 | rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, | 1396 | rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, |
1395 | virt_len); | 1397 | virt_len); |
1396 | if (rc) { | 1398 | if (rc) { |
1397 | printk(KERN_ERR "%s: Error writing metadata out to lower file; " | 1399 | printk(KERN_ERR "%s: Error writing metadata out to lower file; " |
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" | |||
1943 | 1945 | ||
1944 | /* We could either offset on every reverse map or just pad some 0x00's | 1946 | /* We could either offset on every reverse map or just pad some 0x00's |
1945 | * at the front here */ | 1947 | * at the front here */ |
1946 | static const unsigned char filename_rev_map[] = { | 1948 | static const unsigned char filename_rev_map[256] = { |
1947 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ | 1949 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ |
1948 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ | 1950 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ |
1949 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ | 1951 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ |
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { | |||
1959 | 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ | 1961 | 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ |
1960 | 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ | 1962 | 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ |
1961 | 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ | 1963 | 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ |
1962 | 0x3D, 0x3E, 0x3F | 1964 | 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ |
1963 | }; | 1965 | }; |
1964 | 1966 | ||
1965 | /** | 1967 | /** |
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 54481a3b2c79..a9f29b12fbf2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h | |||
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); | |||
584 | int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); | 584 | int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); |
585 | int ecryptfs_encrypt_page(struct page *page); | 585 | int ecryptfs_encrypt_page(struct page *page); |
586 | int ecryptfs_decrypt_page(struct page *page); | 586 | int ecryptfs_decrypt_page(struct page *page); |
587 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); | 587 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, |
588 | struct inode *ecryptfs_inode); | ||
588 | int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); | 589 | int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); |
589 | int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); | 590 | int ecryptfs_new_file_context(struct inode *ecryptfs_inode); |
590 | void ecryptfs_write_crypt_stat_flags(char *page_virt, | 591 | void ecryptfs_write_crypt_stat_flags(char *page_virt, |
591 | struct ecryptfs_crypt_stat *crypt_stat, | 592 | struct ecryptfs_crypt_stat *crypt_stat, |
592 | size_t *written); | 593 | size_t *written); |
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98cf9baa..d3f95f941c47 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
@@ -139,6 +139,27 @@ out: | |||
139 | return rc; | 139 | return rc; |
140 | } | 140 | } |
141 | 141 | ||
142 | static void ecryptfs_vma_close(struct vm_area_struct *vma) | ||
143 | { | ||
144 | filemap_write_and_wait(vma->vm_file->f_mapping); | ||
145 | } | ||
146 | |||
147 | static const struct vm_operations_struct ecryptfs_file_vm_ops = { | ||
148 | .close = ecryptfs_vma_close, | ||
149 | .fault = filemap_fault, | ||
150 | }; | ||
151 | |||
152 | static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) | ||
153 | { | ||
154 | int rc; | ||
155 | |||
156 | rc = generic_file_mmap(file, vma); | ||
157 | if (!rc) | ||
158 | vma->vm_ops = &ecryptfs_file_vm_ops; | ||
159 | |||
160 | return rc; | ||
161 | } | ||
162 | |||
142 | struct kmem_cache *ecryptfs_file_info_cache; | 163 | struct kmem_cache *ecryptfs_file_info_cache; |
143 | 164 | ||
144 | /** | 165 | /** |
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { | |||
349 | #ifdef CONFIG_COMPAT | 370 | #ifdef CONFIG_COMPAT |
350 | .compat_ioctl = ecryptfs_compat_ioctl, | 371 | .compat_ioctl = ecryptfs_compat_ioctl, |
351 | #endif | 372 | #endif |
352 | .mmap = generic_file_mmap, | 373 | .mmap = ecryptfs_file_mmap, |
353 | .open = ecryptfs_open, | 374 | .open = ecryptfs_open, |
354 | .flush = ecryptfs_flush, | 375 | .flush = ecryptfs_flush, |
355 | .release = ecryptfs_release, | 376 | .release = ecryptfs_release, |
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a36d327f1521..32f90a3ae63e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c | |||
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, | |||
172 | * it. It will also update the eCryptfs directory inode to mimic the | 172 | * it. It will also update the eCryptfs directory inode to mimic the |
173 | * stat of the lower directory inode. | 173 | * stat of the lower directory inode. |
174 | * | 174 | * |
175 | * Returns zero on success; non-zero on error condition | 175 | * Returns the new eCryptfs inode on success; an ERR_PTR on error condition |
176 | */ | 176 | */ |
177 | static int | 177 | static struct inode * |
178 | ecryptfs_do_create(struct inode *directory_inode, | 178 | ecryptfs_do_create(struct inode *directory_inode, |
179 | struct dentry *ecryptfs_dentry, int mode) | 179 | struct dentry *ecryptfs_dentry, int mode) |
180 | { | 180 | { |
181 | int rc; | 181 | int rc; |
182 | struct dentry *lower_dentry; | 182 | struct dentry *lower_dentry; |
183 | struct dentry *lower_dir_dentry; | 183 | struct dentry *lower_dir_dentry; |
184 | struct inode *inode; | ||
184 | 185 | ||
185 | lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); | 186 | lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); |
186 | lower_dir_dentry = lock_parent(lower_dentry); | 187 | lower_dir_dentry = lock_parent(lower_dentry); |
187 | if (IS_ERR(lower_dir_dentry)) { | 188 | if (IS_ERR(lower_dir_dentry)) { |
188 | ecryptfs_printk(KERN_ERR, "Error locking directory of " | 189 | ecryptfs_printk(KERN_ERR, "Error locking directory of " |
189 | "dentry\n"); | 190 | "dentry\n"); |
190 | rc = PTR_ERR(lower_dir_dentry); | 191 | inode = ERR_CAST(lower_dir_dentry); |
191 | goto out; | 192 | goto out; |
192 | } | 193 | } |
193 | rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, | 194 | rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, |
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode, | |||
195 | if (rc) { | 196 | if (rc) { |
196 | printk(KERN_ERR "%s: Failure to create dentry in lower fs; " | 197 | printk(KERN_ERR "%s: Failure to create dentry in lower fs; " |
197 | "rc = [%d]\n", __func__, rc); | 198 | "rc = [%d]\n", __func__, rc); |
199 | inode = ERR_PTR(rc); | ||
198 | goto out_lock; | 200 | goto out_lock; |
199 | } | 201 | } |
200 | rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, | 202 | inode = __ecryptfs_get_inode(lower_dentry->d_inode, |
201 | directory_inode->i_sb); | 203 | directory_inode->i_sb); |
202 | if (rc) { | 204 | if (IS_ERR(inode)) |
203 | ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); | ||
204 | goto out_lock; | 205 | goto out_lock; |
205 | } | ||
206 | fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); | 206 | fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); |
207 | fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); | 207 | fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); |
208 | out_lock: | 208 | out_lock: |
209 | unlock_dir(lower_dir_dentry); | 209 | unlock_dir(lower_dir_dentry); |
210 | out: | 210 | out: |
211 | return rc; | 211 | return inode; |
212 | } | 212 | } |
213 | 213 | ||
214 | /** | 214 | /** |
@@ -219,26 +219,26 @@ out: | |||
219 | * | 219 | * |
220 | * Returns zero on success | 220 | * Returns zero on success |
221 | */ | 221 | */ |
222 | static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) | 222 | static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, |
223 | struct inode *ecryptfs_inode) | ||
223 | { | 224 | { |
224 | struct ecryptfs_crypt_stat *crypt_stat = | 225 | struct ecryptfs_crypt_stat *crypt_stat = |
225 | &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; | 226 | &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; |
226 | int rc = 0; | 227 | int rc = 0; |
227 | 228 | ||
228 | if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { | 229 | if (S_ISDIR(ecryptfs_inode->i_mode)) { |
229 | ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); | 230 | ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); |
230 | crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); | 231 | crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); |
231 | goto out; | 232 | goto out; |
232 | } | 233 | } |
233 | ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); | 234 | ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); |
234 | rc = ecryptfs_new_file_context(ecryptfs_dentry); | 235 | rc = ecryptfs_new_file_context(ecryptfs_inode); |
235 | if (rc) { | 236 | if (rc) { |
236 | ecryptfs_printk(KERN_ERR, "Error creating new file " | 237 | ecryptfs_printk(KERN_ERR, "Error creating new file " |
237 | "context; rc = [%d]\n", rc); | 238 | "context; rc = [%d]\n", rc); |
238 | goto out; | 239 | goto out; |
239 | } | 240 | } |
240 | rc = ecryptfs_get_lower_file(ecryptfs_dentry, | 241 | rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); |
241 | ecryptfs_dentry->d_inode); | ||
242 | if (rc) { | 242 | if (rc) { |
243 | printk(KERN_ERR "%s: Error attempting to initialize " | 243 | printk(KERN_ERR "%s: Error attempting to initialize " |
244 | "the lower file for the dentry with name " | 244 | "the lower file for the dentry with name " |
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) | |||
246 | ecryptfs_dentry->d_name.name, rc); | 246 | ecryptfs_dentry->d_name.name, rc); |
247 | goto out; | 247 | goto out; |
248 | } | 248 | } |
249 | rc = ecryptfs_write_metadata(ecryptfs_dentry); | 249 | rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); |
250 | if (rc) | 250 | if (rc) |
251 | printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); | 251 | printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); |
252 | ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); | 252 | ecryptfs_put_lower_file(ecryptfs_inode); |
253 | out: | 253 | out: |
254 | return rc; | 254 | return rc; |
255 | } | 255 | } |
@@ -269,18 +269,28 @@ static int | |||
269 | ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, | 269 | ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, |
270 | int mode, struct nameidata *nd) | 270 | int mode, struct nameidata *nd) |
271 | { | 271 | { |
272 | struct inode *ecryptfs_inode; | ||
272 | int rc; | 273 | int rc; |
273 | 274 | ||
274 | /* ecryptfs_do_create() calls ecryptfs_interpose() */ | 275 | ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, |
275 | rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); | 276 | mode); |
276 | if (unlikely(rc)) { | 277 | if (unlikely(IS_ERR(ecryptfs_inode))) { |
277 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" | 278 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" |
278 | "lower filesystem\n"); | 279 | "lower filesystem\n"); |
280 | rc = PTR_ERR(ecryptfs_inode); | ||
279 | goto out; | 281 | goto out; |
280 | } | 282 | } |
281 | /* At this point, a file exists on "disk"; we need to make sure | 283 | /* At this point, a file exists on "disk"; we need to make sure |
282 | * that this on disk file is prepared to be an ecryptfs file */ | 284 | * that this on disk file is prepared to be an ecryptfs file */ |
283 | rc = ecryptfs_initialize_file(ecryptfs_dentry); | 285 | rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); |
286 | if (rc) { | ||
287 | drop_nlink(ecryptfs_inode); | ||
288 | unlock_new_inode(ecryptfs_inode); | ||
289 | iput(ecryptfs_inode); | ||
290 | goto out; | ||
291 | } | ||
292 | d_instantiate(ecryptfs_dentry, ecryptfs_inode); | ||
293 | unlock_new_inode(ecryptfs_inode); | ||
284 | out: | 294 | out: |
285 | return rc; | 295 | return rc; |
286 | } | 296 | } |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f6dba4505f1c..12ccacda44e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) | |||
565 | brelse(bitmap_bh); | 565 | brelse(bitmap_bh); |
566 | printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" | 566 | printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" |
567 | ", computed = %llu, %llu\n", | 567 | ", computed = %llu, %llu\n", |
568 | EXT4_B2C(sbi, ext4_free_blocks_count(es)), | 568 | EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), |
569 | desc_count, bitmap_count); | 569 | desc_count, bitmap_count); |
570 | return bitmap_count; | 570 | return bitmap_count; |
571 | #else | 571 | #else |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 61fa9e1614af..607b1557d292 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -1095,7 +1095,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
1095 | le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), | 1095 | le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), |
1096 | ext4_idx_pblock(EXT_FIRST_INDEX(neh))); | 1096 | ext4_idx_pblock(EXT_FIRST_INDEX(neh))); |
1097 | 1097 | ||
1098 | neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); | 1098 | neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); |
1099 | ext4_mark_inode_dirty(handle, inode); | 1099 | ext4_mark_inode_dirty(handle, inode); |
1100 | out: | 1100 | out: |
1101 | brelse(bh); | 1101 | brelse(bh); |
@@ -2955,7 +2955,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
2955 | /* Pre-conditions */ | 2955 | /* Pre-conditions */ |
2956 | BUG_ON(!ext4_ext_is_uninitialized(ex)); | 2956 | BUG_ON(!ext4_ext_is_uninitialized(ex)); |
2957 | BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); | 2957 | BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); |
2958 | BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len); | ||
2959 | 2958 | ||
2960 | /* | 2959 | /* |
2961 | * Attempt to transfer newly initialized blocks from the currently | 2960 | * Attempt to transfer newly initialized blocks from the currently |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 240f6e2dc7ee..92655fd89657 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -1339,8 +1339,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1339 | clear_buffer_unwritten(bh); | 1339 | clear_buffer_unwritten(bh); |
1340 | } | 1340 | } |
1341 | 1341 | ||
1342 | /* skip page if block allocation undone */ | 1342 | /* |
1343 | if (buffer_delay(bh) || buffer_unwritten(bh)) | 1343 | * skip page if block allocation undone and |
1344 | * block is dirty | ||
1345 | */ | ||
1346 | if (ext4_bh_delay_or_unwritten(NULL, bh)) | ||
1344 | skip_page = 1; | 1347 | skip_page = 1; |
1345 | bh = bh->b_this_page; | 1348 | bh = bh->b_this_page; |
1346 | block_start += bh->b_size; | 1349 | block_start += bh->b_size; |
@@ -2270,6 +2273,7 @@ retry: | |||
2270 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2273 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2271 | "%ld pages, ino %lu; err %d", __func__, | 2274 | "%ld pages, ino %lu; err %d", __func__, |
2272 | wbc->nr_to_write, inode->i_ino, ret); | 2275 | wbc->nr_to_write, inode->i_ino, ret); |
2276 | blk_finish_plug(&plug); | ||
2273 | goto out_writepages; | 2277 | goto out_writepages; |
2274 | } | 2278 | } |
2275 | 2279 | ||
@@ -2386,7 +2390,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |||
2386 | pgoff_t index; | 2390 | pgoff_t index; |
2387 | struct inode *inode = mapping->host; | 2391 | struct inode *inode = mapping->host; |
2388 | handle_t *handle; | 2392 | handle_t *handle; |
2389 | loff_t page_len; | ||
2390 | 2393 | ||
2391 | index = pos >> PAGE_CACHE_SHIFT; | 2394 | index = pos >> PAGE_CACHE_SHIFT; |
2392 | 2395 | ||
@@ -2433,13 +2436,6 @@ retry: | |||
2433 | */ | 2436 | */ |
2434 | if (pos + len > inode->i_size) | 2437 | if (pos + len > inode->i_size) |
2435 | ext4_truncate_failed_write(inode); | 2438 | ext4_truncate_failed_write(inode); |
2436 | } else { | ||
2437 | page_len = pos & (PAGE_CACHE_SIZE - 1); | ||
2438 | if (page_len > 0) { | ||
2439 | ret = ext4_discard_partial_page_buffers_no_lock(handle, | ||
2440 | inode, page, pos - page_len, page_len, | ||
2441 | EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); | ||
2442 | } | ||
2443 | } | 2439 | } |
2444 | 2440 | ||
2445 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 2441 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
@@ -2482,7 +2478,6 @@ static int ext4_da_write_end(struct file *file, | |||
2482 | loff_t new_i_size; | 2478 | loff_t new_i_size; |
2483 | unsigned long start, end; | 2479 | unsigned long start, end; |
2484 | int write_mode = (int)(unsigned long)fsdata; | 2480 | int write_mode = (int)(unsigned long)fsdata; |
2485 | loff_t page_len; | ||
2486 | 2481 | ||
2487 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 2482 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { |
2488 | if (ext4_should_order_data(inode)) { | 2483 | if (ext4_should_order_data(inode)) { |
@@ -2507,7 +2502,7 @@ static int ext4_da_write_end(struct file *file, | |||
2507 | */ | 2502 | */ |
2508 | 2503 | ||
2509 | new_i_size = pos + copied; | 2504 | new_i_size = pos + copied; |
2510 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 2505 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { |
2511 | if (ext4_da_should_update_i_disksize(page, end)) { | 2506 | if (ext4_da_should_update_i_disksize(page, end)) { |
2512 | down_write(&EXT4_I(inode)->i_data_sem); | 2507 | down_write(&EXT4_I(inode)->i_data_sem); |
2513 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 2508 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
@@ -2531,16 +2526,6 @@ static int ext4_da_write_end(struct file *file, | |||
2531 | } | 2526 | } |
2532 | ret2 = generic_write_end(file, mapping, pos, len, copied, | 2527 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
2533 | page, fsdata); | 2528 | page, fsdata); |
2534 | |||
2535 | page_len = PAGE_CACHE_SIZE - | ||
2536 | ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); | ||
2537 | |||
2538 | if (page_len > 0) { | ||
2539 | ret = ext4_discard_partial_page_buffers_no_lock(handle, | ||
2540 | inode, page, pos + copied - 1, page_len, | ||
2541 | EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); | ||
2542 | } | ||
2543 | |||
2544 | copied = ret2; | 2529 | copied = ret2; |
2545 | if (ret2 < 0) | 2530 | if (ret2 < 0) |
2546 | ret = ret2; | 2531 | ret = ret2; |
@@ -2780,10 +2765,11 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
2780 | iocb->private, io_end->inode->i_ino, iocb, offset, | 2765 | iocb->private, io_end->inode->i_ino, iocb, offset, |
2781 | size); | 2766 | size); |
2782 | 2767 | ||
2768 | iocb->private = NULL; | ||
2769 | |||
2783 | /* if not aio dio with unwritten extents, just free io and return */ | 2770 | /* if not aio dio with unwritten extents, just free io and return */ |
2784 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 2771 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
2785 | ext4_free_io_end(io_end); | 2772 | ext4_free_io_end(io_end); |
2786 | iocb->private = NULL; | ||
2787 | out: | 2773 | out: |
2788 | if (is_async) | 2774 | if (is_async) |
2789 | aio_complete(iocb, ret, 0); | 2775 | aio_complete(iocb, ret, 0); |
@@ -2807,7 +2793,6 @@ out: | |||
2807 | 2793 | ||
2808 | /* queue the work to convert unwritten extents to written */ | 2794 | /* queue the work to convert unwritten extents to written */ |
2809 | queue_work(wq, &io_end->work); | 2795 | queue_work(wq, &io_end->work); |
2810 | iocb->private = NULL; | ||
2811 | 2796 | ||
2812 | /* XXX: probably should move into the real I/O completion handler */ | 2797 | /* XXX: probably should move into the real I/O completion handler */ |
2813 | inode_dio_done(inode); | 2798 | inode_dio_done(inode); |
@@ -3202,26 +3187,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | |||
3202 | 3187 | ||
3203 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 3188 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); |
3204 | 3189 | ||
3205 | if (!page_has_buffers(page)) { | 3190 | if (!page_has_buffers(page)) |
3206 | /* | 3191 | create_empty_buffers(page, blocksize, 0); |
3207 | * If the range to be discarded covers a partial block | ||
3208 | * we need to get the page buffers. This is because | ||
3209 | * partial blocks cannot be released and the page needs | ||
3210 | * to be updated with the contents of the block before | ||
3211 | * we write the zeros on top of it. | ||
3212 | */ | ||
3213 | if ((from & (blocksize - 1)) || | ||
3214 | ((from + length) & (blocksize - 1))) { | ||
3215 | create_empty_buffers(page, blocksize, 0); | ||
3216 | } else { | ||
3217 | /* | ||
3218 | * If there are no partial blocks, | ||
3219 | * there is nothing to update, | ||
3220 | * so we can return now | ||
3221 | */ | ||
3222 | return 0; | ||
3223 | } | ||
3224 | } | ||
3225 | 3192 | ||
3226 | /* Find the buffer that contains "offset" */ | 3193 | /* Find the buffer that contains "offset" */ |
3227 | bh = page_buffers(page); | 3194 | bh = page_buffers(page); |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7ce1d0b19c94..7e106c810c62 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -385,6 +385,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
385 | 385 | ||
386 | block_end = block_start + blocksize; | 386 | block_end = block_start + blocksize; |
387 | if (block_start >= len) { | 387 | if (block_start >= len) { |
388 | /* | ||
389 | * Comments copied from block_write_full_page_endio: | ||
390 | * | ||
391 | * The page straddles i_size. It must be zeroed out on | ||
392 | * each and every writepage invocation because it may | ||
393 | * be mmapped. "A file is mapped in multiples of the | ||
394 | * page size. For a file that is not a multiple of | ||
395 | * the page size, the remaining memory is zeroed when | ||
396 | * mapped, and writes to that region are not written | ||
397 | * out to the file." | ||
398 | */ | ||
399 | zero_user_segment(page, block_start, block_end); | ||
388 | clear_buffer_dirty(bh); | 400 | clear_buffer_dirty(bh); |
389 | set_buffer_uptodate(bh); | 401 | set_buffer_uptodate(bh); |
390 | continue; | 402 | continue; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9953d80145ad..3e1329e2f826 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -1155,9 +1155,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1155 | seq_puts(seq, ",block_validity"); | 1155 | seq_puts(seq, ",block_validity"); |
1156 | 1156 | ||
1157 | if (!test_opt(sb, INIT_INODE_TABLE)) | 1157 | if (!test_opt(sb, INIT_INODE_TABLE)) |
1158 | seq_puts(seq, ",noinit_inode_table"); | 1158 | seq_puts(seq, ",noinit_itable"); |
1159 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) | 1159 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) |
1160 | seq_printf(seq, ",init_inode_table=%u", | 1160 | seq_printf(seq, ",init_itable=%u", |
1161 | (unsigned) sbi->s_li_wait_mult); | 1161 | (unsigned) sbi->s_li_wait_mult); |
1162 | 1162 | ||
1163 | ext4_show_quota_options(seq, sb); | 1163 | ext4_show_quota_options(seq, sb); |
@@ -1333,8 +1333,7 @@ enum { | |||
1333 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, | 1333 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, |
1334 | Opt_inode_readahead_blks, Opt_journal_ioprio, | 1334 | Opt_inode_readahead_blks, Opt_journal_ioprio, |
1335 | Opt_dioread_nolock, Opt_dioread_lock, | 1335 | Opt_dioread_nolock, Opt_dioread_lock, |
1336 | Opt_discard, Opt_nodiscard, | 1336 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, |
1337 | Opt_init_inode_table, Opt_noinit_inode_table, | ||
1338 | }; | 1337 | }; |
1339 | 1338 | ||
1340 | static const match_table_t tokens = { | 1339 | static const match_table_t tokens = { |
@@ -1407,9 +1406,9 @@ static const match_table_t tokens = { | |||
1407 | {Opt_dioread_lock, "dioread_lock"}, | 1406 | {Opt_dioread_lock, "dioread_lock"}, |
1408 | {Opt_discard, "discard"}, | 1407 | {Opt_discard, "discard"}, |
1409 | {Opt_nodiscard, "nodiscard"}, | 1408 | {Opt_nodiscard, "nodiscard"}, |
1410 | {Opt_init_inode_table, "init_itable=%u"}, | 1409 | {Opt_init_itable, "init_itable=%u"}, |
1411 | {Opt_init_inode_table, "init_itable"}, | 1410 | {Opt_init_itable, "init_itable"}, |
1412 | {Opt_noinit_inode_table, "noinit_itable"}, | 1411 | {Opt_noinit_itable, "noinit_itable"}, |
1413 | {Opt_err, NULL}, | 1412 | {Opt_err, NULL}, |
1414 | }; | 1413 | }; |
1415 | 1414 | ||
@@ -1683,7 +1682,9 @@ static int parse_options(char *options, struct super_block *sb, | |||
1683 | data_opt = EXT4_MOUNT_WRITEBACK_DATA; | 1682 | data_opt = EXT4_MOUNT_WRITEBACK_DATA; |
1684 | datacheck: | 1683 | datacheck: |
1685 | if (is_remount) { | 1684 | if (is_remount) { |
1686 | if (test_opt(sb, DATA_FLAGS) != data_opt) { | 1685 | if (!sbi->s_journal) |
1686 | ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); | ||
1687 | else if (test_opt(sb, DATA_FLAGS) != data_opt) { | ||
1687 | ext4_msg(sb, KERN_ERR, | 1688 | ext4_msg(sb, KERN_ERR, |
1688 | "Cannot change data mode on remount"); | 1689 | "Cannot change data mode on remount"); |
1689 | return 0; | 1690 | return 0; |
@@ -1890,7 +1891,7 @@ set_qf_format: | |||
1890 | case Opt_dioread_lock: | 1891 | case Opt_dioread_lock: |
1891 | clear_opt(sb, DIOREAD_NOLOCK); | 1892 | clear_opt(sb, DIOREAD_NOLOCK); |
1892 | break; | 1893 | break; |
1893 | case Opt_init_inode_table: | 1894 | case Opt_init_itable: |
1894 | set_opt(sb, INIT_INODE_TABLE); | 1895 | set_opt(sb, INIT_INODE_TABLE); |
1895 | if (args[0].from) { | 1896 | if (args[0].from) { |
1896 | if (match_int(&args[0], &option)) | 1897 | if (match_int(&args[0], &option)) |
@@ -1901,7 +1902,7 @@ set_qf_format: | |||
1901 | return 0; | 1902 | return 0; |
1902 | sbi->s_li_wait_mult = option; | 1903 | sbi->s_li_wait_mult = option; |
1903 | break; | 1904 | break; |
1904 | case Opt_noinit_inode_table: | 1905 | case Opt_noinit_itable: |
1905 | clear_opt(sb, INIT_INODE_TABLE); | 1906 | clear_opt(sb, INIT_INODE_TABLE); |
1906 | break; | 1907 | break; |
1907 | default: | 1908 | default: |
@@ -3099,8 +3100,6 @@ static void ext4_destroy_lazyinit_thread(void) | |||
3099 | } | 3100 | } |
3100 | 3101 | ||
3101 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 3102 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
3102 | __releases(kernel_lock) | ||
3103 | __acquires(kernel_lock) | ||
3104 | { | 3103 | { |
3105 | char *orig_data = kstrdup(data, GFP_KERNEL); | 3104 | char *orig_data = kstrdup(data, GFP_KERNEL); |
3106 | struct buffer_head *bh; | 3105 | struct buffer_head *bh; |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73c3992b2bb4..ac86f8b3e3cb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -156,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | |||
156 | * bdi_start_writeback - start writeback | 156 | * bdi_start_writeback - start writeback |
157 | * @bdi: the backing device to write from | 157 | * @bdi: the backing device to write from |
158 | * @nr_pages: the number of pages to write | 158 | * @nr_pages: the number of pages to write |
159 | * @reason: reason why some writeback work was initiated | ||
159 | * | 160 | * |
160 | * Description: | 161 | * Description: |
161 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | 162 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only |
@@ -1223,6 +1224,7 @@ static void wait_sb_inodes(struct super_block *sb) | |||
1223 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block | 1224 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block |
1224 | * @sb: the superblock | 1225 | * @sb: the superblock |
1225 | * @nr: the number of pages to write | 1226 | * @nr: the number of pages to write |
1227 | * @reason: reason why some writeback work initiated | ||
1226 | * | 1228 | * |
1227 | * Start writeback on some inodes on this super_block. No guarantees are made | 1229 | * Start writeback on some inodes on this super_block. No guarantees are made |
1228 | * on how many (if any) will be written, and this function does not wait | 1230 | * on how many (if any) will be written, and this function does not wait |
@@ -1251,6 +1253,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); | |||
1251 | /** | 1253 | /** |
1252 | * writeback_inodes_sb - writeback dirty inodes from given super_block | 1254 | * writeback_inodes_sb - writeback dirty inodes from given super_block |
1253 | * @sb: the superblock | 1255 | * @sb: the superblock |
1256 | * @reason: reason why some writeback work was initiated | ||
1254 | * | 1257 | * |
1255 | * Start writeback on some inodes on this super_block. No guarantees are made | 1258 | * Start writeback on some inodes on this super_block. No guarantees are made |
1256 | * on how many (if any) will be written, and this function does not wait | 1259 | * on how many (if any) will be written, and this function does not wait |
@@ -1265,6 +1268,7 @@ EXPORT_SYMBOL(writeback_inodes_sb); | |||
1265 | /** | 1268 | /** |
1266 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1269 | * writeback_inodes_sb_if_idle - start writeback if none underway |
1267 | * @sb: the superblock | 1270 | * @sb: the superblock |
1271 | * @reason: reason why some writeback work was initiated | ||
1268 | * | 1272 | * |
1269 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1273 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
1270 | * Returns 1 if writeback was started, 0 if not. | 1274 | * Returns 1 if writeback was started, 0 if not. |
@@ -1285,6 +1289,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); | |||
1285 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1289 | * writeback_inodes_sb_if_idle - start writeback if none underway |
1286 | * @sb: the superblock | 1290 | * @sb: the superblock |
1287 | * @nr: the number of pages to write | 1291 | * @nr: the number of pages to write |
1292 | * @reason: reason why some writeback work was initiated | ||
1288 | * | 1293 | * |
1289 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1294 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
1290 | * Returns 1 if writeback was started, 0 if not. | 1295 | * Returns 1 if writeback was started, 0 if not. |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5cb8614508c3..2aaf3eaaf13d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -1512,7 +1512,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
1512 | else if (outarg->offset + num > file_size) | 1512 | else if (outarg->offset + num > file_size) |
1513 | num = file_size - outarg->offset; | 1513 | num = file_size - outarg->offset; |
1514 | 1514 | ||
1515 | while (num) { | 1515 | while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { |
1516 | struct page *page; | 1516 | struct page *page; |
1517 | unsigned int this_num; | 1517 | unsigned int this_num; |
1518 | 1518 | ||
@@ -1526,6 +1526,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
1526 | 1526 | ||
1527 | num -= this_num; | 1527 | num -= this_num; |
1528 | total_len += this_num; | 1528 | total_len += this_num; |
1529 | index++; | ||
1529 | } | 1530 | } |
1530 | req->misc.retrieve_in.offset = outarg->offset; | 1531 | req->misc.retrieve_in.offset = outarg->offset; |
1531 | req->misc.retrieve_in.size = total_len; | 1532 | req->misc.retrieve_in.size = total_len; |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 594f07a81c28..0c84100acd44 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -1556,7 +1556,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) | |||
1556 | struct inode *inode = file->f_path.dentry->d_inode; | 1556 | struct inode *inode = file->f_path.dentry->d_inode; |
1557 | 1557 | ||
1558 | mutex_lock(&inode->i_mutex); | 1558 | mutex_lock(&inode->i_mutex); |
1559 | if (origin != SEEK_CUR || origin != SEEK_SET) { | 1559 | if (origin != SEEK_CUR && origin != SEEK_SET) { |
1560 | retval = fuse_update_attributes(inode, NULL, file, NULL); | 1560 | retval = fuse_update_attributes(inode, NULL, file, NULL); |
1561 | if (retval) | 1561 | if (retval) |
1562 | goto exit; | 1562 | goto exit; |
@@ -1567,6 +1567,10 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) | |||
1567 | offset += i_size_read(inode); | 1567 | offset += i_size_read(inode); |
1568 | break; | 1568 | break; |
1569 | case SEEK_CUR: | 1569 | case SEEK_CUR: |
1570 | if (offset == 0) { | ||
1571 | retval = file->f_pos; | ||
1572 | goto exit; | ||
1573 | } | ||
1570 | offset += file->f_pos; | 1574 | offset += file->f_pos; |
1571 | break; | 1575 | break; |
1572 | case SEEK_DATA: | 1576 | case SEEK_DATA: |
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3e6d72756479..aa83109b9431 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -1138,28 +1138,28 @@ static int __init fuse_fs_init(void) | |||
1138 | { | 1138 | { |
1139 | int err; | 1139 | int err; |
1140 | 1140 | ||
1141 | err = register_filesystem(&fuse_fs_type); | ||
1142 | if (err) | ||
1143 | goto out; | ||
1144 | |||
1145 | err = register_fuseblk(); | ||
1146 | if (err) | ||
1147 | goto out_unreg; | ||
1148 | |||
1149 | fuse_inode_cachep = kmem_cache_create("fuse_inode", | 1141 | fuse_inode_cachep = kmem_cache_create("fuse_inode", |
1150 | sizeof(struct fuse_inode), | 1142 | sizeof(struct fuse_inode), |
1151 | 0, SLAB_HWCACHE_ALIGN, | 1143 | 0, SLAB_HWCACHE_ALIGN, |
1152 | fuse_inode_init_once); | 1144 | fuse_inode_init_once); |
1153 | err = -ENOMEM; | 1145 | err = -ENOMEM; |
1154 | if (!fuse_inode_cachep) | 1146 | if (!fuse_inode_cachep) |
1155 | goto out_unreg2; | 1147 | goto out; |
1148 | |||
1149 | err = register_fuseblk(); | ||
1150 | if (err) | ||
1151 | goto out2; | ||
1152 | |||
1153 | err = register_filesystem(&fuse_fs_type); | ||
1154 | if (err) | ||
1155 | goto out3; | ||
1156 | 1156 | ||
1157 | return 0; | 1157 | return 0; |
1158 | 1158 | ||
1159 | out_unreg2: | 1159 | out3: |
1160 | unregister_fuseblk(); | 1160 | unregister_fuseblk(); |
1161 | out_unreg: | 1161 | out2: |
1162 | unregister_filesystem(&fuse_fs_type); | 1162 | kmem_cache_destroy(fuse_inode_cachep); |
1163 | out: | 1163 | out: |
1164 | return err; | 1164 | return err; |
1165 | } | 1165 | } |
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3f32bcb0d9bd..ef175cb8cfd8 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c | |||
@@ -16,38 +16,26 @@ | |||
16 | #include <linux/bitops.h> | 16 | #include <linux/bitops.h> |
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | 18 | ||
19 | static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; | ||
20 | |||
21 | static DEFINE_SPINLOCK(bitmap_lock); | 19 | static DEFINE_SPINLOCK(bitmap_lock); |
22 | 20 | ||
23 | static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) | 21 | /* |
22 | * bitmap consists of blocks filled with 16bit words | ||
23 | * bit set == busy, bit clear == free | ||
24 | * endianness is a mess, but for counting zero bits it really doesn't matter... | ||
25 | */ | ||
26 | static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) | ||
24 | { | 27 | { |
25 | unsigned i, j, sum = 0; | 28 | __u32 sum = 0; |
26 | struct buffer_head *bh; | 29 | unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); |
27 | |||
28 | for (i=0; i<numblocks-1; i++) { | ||
29 | if (!(bh=map[i])) | ||
30 | return(0); | ||
31 | for (j=0; j<bh->b_size; j++) | ||
32 | sum += nibblemap[bh->b_data[j] & 0xf] | ||
33 | + nibblemap[(bh->b_data[j]>>4) & 0xf]; | ||
34 | } | ||
35 | 30 | ||
36 | if (numblocks==0 || !(bh=map[numblocks-1])) | 31 | while (blocks--) { |
37 | return(0); | 32 | unsigned words = blocksize / 2; |
38 | i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; | 33 | __u16 *p = (__u16 *)(*map++)->b_data; |
39 | for (j=0; j<i; j++) { | 34 | while (words--) |
40 | sum += nibblemap[bh->b_data[j] & 0xf] | 35 | sum += 16 - hweight16(*p++); |
41 | + nibblemap[(bh->b_data[j]>>4) & 0xf]; | ||
42 | } | 36 | } |
43 | 37 | ||
44 | i = numbits%16; | 38 | return sum; |
45 | if (i!=0) { | ||
46 | i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1); | ||
47 | sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf]; | ||
48 | sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; | ||
49 | } | ||
50 | return(sum); | ||
51 | } | 39 | } |
52 | 40 | ||
53 | void minix_free_block(struct inode *inode, unsigned long block) | 41 | void minix_free_block(struct inode *inode, unsigned long block) |
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode) | |||
105 | return 0; | 93 | return 0; |
106 | } | 94 | } |
107 | 95 | ||
108 | unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) | 96 | unsigned long minix_count_free_blocks(struct super_block *sb) |
109 | { | 97 | { |
110 | return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, | 98 | struct minix_sb_info *sbi = minix_sb(sb); |
111 | sbi->s_nzones - sbi->s_firstdatazone + 1) | 99 | u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); |
100 | |||
101 | return (count_free(sbi->s_zmap, sb->s_blocksize, bits) | ||
112 | << sbi->s_log_zone_size); | 102 | << sbi->s_log_zone_size); |
113 | } | 103 | } |
114 | 104 | ||
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) | |||
273 | return inode; | 263 | return inode; |
274 | } | 264 | } |
275 | 265 | ||
276 | unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) | 266 | unsigned long minix_count_free_inodes(struct super_block *sb) |
277 | { | 267 | { |
278 | return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); | 268 | struct minix_sb_info *sbi = minix_sb(sb); |
269 | u32 bits = sbi->s_ninodes + 1; | ||
270 | |||
271 | return count_free(sbi->s_imap, sb->s_blocksize, bits); | ||
279 | } | 272 | } |
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 64cdcd662ffc..1d9e33966db0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c | |||
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) | |||
279 | else if (sbi->s_mount_state & MINIX_ERROR_FS) | 279 | else if (sbi->s_mount_state & MINIX_ERROR_FS) |
280 | printk("MINIX-fs: mounting file system with errors, " | 280 | printk("MINIX-fs: mounting file system with errors, " |
281 | "running fsck is recommended\n"); | 281 | "running fsck is recommended\n"); |
282 | |||
283 | /* Apparently minix can create filesystems that allocate more blocks for | ||
284 | * the bitmaps than needed. We simply ignore that, but verify it didn't | ||
285 | * create one with not enough blocks and bail out if so. | ||
286 | */ | ||
287 | block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); | ||
288 | if (sbi->s_imap_blocks < block) { | ||
289 | printk("MINIX-fs: file system does not have enough " | ||
290 | "imap blocks allocated. Refusing to mount\n"); | ||
291 | goto out_iput; | ||
292 | } | ||
293 | |||
294 | block = minix_blocks_needed( | ||
295 | (sbi->s_nzones - (sbi->s_firstdatazone + 1)), | ||
296 | s->s_blocksize); | ||
297 | if (sbi->s_zmap_blocks < block) { | ||
298 | printk("MINIX-fs: file system does not have enough " | ||
299 | "zmap blocks allocated. Refusing to mount.\n"); | ||
300 | goto out_iput; | ||
301 | } | ||
302 | |||
282 | return 0; | 303 | return 0; |
283 | 304 | ||
284 | out_iput: | 305 | out_iput: |
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
339 | buf->f_type = sb->s_magic; | 360 | buf->f_type = sb->s_magic; |
340 | buf->f_bsize = sb->s_blocksize; | 361 | buf->f_bsize = sb->s_blocksize; |
341 | buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; | 362 | buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; |
342 | buf->f_bfree = minix_count_free_blocks(sbi); | 363 | buf->f_bfree = minix_count_free_blocks(sb); |
343 | buf->f_bavail = buf->f_bfree; | 364 | buf->f_bavail = buf->f_bfree; |
344 | buf->f_files = sbi->s_ninodes; | 365 | buf->f_files = sbi->s_ninodes; |
345 | buf->f_ffree = minix_count_free_inodes(sbi); | 366 | buf->f_ffree = minix_count_free_inodes(sb); |
346 | buf->f_namelen = sbi->s_namelen; | 367 | buf->f_namelen = sbi->s_namelen; |
347 | buf->f_fsid.val[0] = (u32)id; | 368 | buf->f_fsid.val[0] = (u32)id; |
348 | buf->f_fsid.val[1] = (u32)(id >> 32); | 369 | buf->f_fsid.val[1] = (u32)(id >> 32); |
diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 341e2122879a..26bbd55e82ea 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h | |||
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru | |||
48 | extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); | 48 | extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); |
49 | extern struct inode * minix_new_inode(const struct inode *, int, int *); | 49 | extern struct inode * minix_new_inode(const struct inode *, int, int *); |
50 | extern void minix_free_inode(struct inode * inode); | 50 | extern void minix_free_inode(struct inode * inode); |
51 | extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); | 51 | extern unsigned long minix_count_free_inodes(struct super_block *sb); |
52 | extern int minix_new_block(struct inode * inode); | 52 | extern int minix_new_block(struct inode * inode); |
53 | extern void minix_free_block(struct inode *inode, unsigned long block); | 53 | extern void minix_free_block(struct inode *inode, unsigned long block); |
54 | extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); | 54 | extern unsigned long minix_count_free_blocks(struct super_block *sb); |
55 | extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 55 | extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
56 | extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); | 56 | extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); |
57 | 57 | ||
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) | |||
88 | return list_entry(inode, struct minix_inode_info, vfs_inode); | 88 | return list_entry(inode, struct minix_inode_info, vfs_inode); |
89 | } | 89 | } |
90 | 90 | ||
91 | static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) | ||
92 | { | ||
93 | return DIV_ROUND_UP(bits, blocksize * 8); | ||
94 | } | ||
95 | |||
91 | #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ | 96 | #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ |
92 | defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) | 97 | defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) |
93 | 98 | ||
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) | |||
125 | if (!size) | 130 | if (!size) |
126 | return 0; | 131 | return 0; |
127 | 132 | ||
128 | size = (size >> 4) + ((size & 15) > 0); | 133 | size >>= 4; |
129 | while (*p++ == 0xffff) { | 134 | while (*p++ == 0xffff) { |
130 | if (--size == 0) | 135 | if (--size == 0) |
131 | return (p - addr) << 4; | 136 | return (p - addr) << 4; |
diff --git a/fs/namespace.c b/fs/namespace.c index e5e1c7d1839b..cfc6d4448aa5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v) | |||
1048 | if (err) | 1048 | if (err) |
1049 | goto out; | 1049 | goto out; |
1050 | seq_putc(m, ' '); | 1050 | seq_putc(m, ' '); |
1051 | seq_path_root(m, &mnt_path, &root, " \t\n\\"); | 1051 | |
1052 | if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { | 1052 | /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ |
1053 | /* | 1053 | err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); |
1054 | * Mountpoint is outside root, discard that one. Ugly, | 1054 | if (err) |
1055 | * but less so than trying to do that in iterator in a | 1055 | goto out; |
1056 | * race-free way (due to renames). | 1056 | |
1057 | */ | ||
1058 | return SEQ_SKIP; | ||
1059 | } | ||
1060 | seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); | 1057 | seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); |
1061 | show_mnt_opts(m, mnt); | 1058 | show_mnt_opts(m, mnt); |
1062 | 1059 | ||
@@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) | |||
2483 | __mnt_make_longterm(mnt); | 2480 | __mnt_make_longterm(mnt); |
2484 | new_ns->root = mnt; | 2481 | new_ns->root = mnt; |
2485 | list_add(&new_ns->list, &new_ns->root->mnt_list); | 2482 | list_add(&new_ns->list, &new_ns->root->mnt_list); |
2483 | } else { | ||
2484 | mntput(mnt); | ||
2486 | } | 2485 | } |
2487 | return new_ns; | 2486 | return new_ns; |
2488 | } | 2487 | } |
2489 | EXPORT_SYMBOL(create_mnt_ns); | 2488 | EXPORT_SYMBOL(create_mnt_ns); |
2490 | 2489 | ||
2490 | struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) | ||
2491 | { | ||
2492 | struct mnt_namespace *ns; | ||
2493 | struct super_block *s; | ||
2494 | struct path path; | ||
2495 | int err; | ||
2496 | |||
2497 | ns = create_mnt_ns(mnt); | ||
2498 | if (IS_ERR(ns)) | ||
2499 | return ERR_CAST(ns); | ||
2500 | |||
2501 | err = vfs_path_lookup(mnt->mnt_root, mnt, | ||
2502 | name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); | ||
2503 | |||
2504 | put_mnt_ns(ns); | ||
2505 | |||
2506 | if (err) | ||
2507 | return ERR_PTR(err); | ||
2508 | |||
2509 | /* trade a vfsmount reference for active sb one */ | ||
2510 | s = path.mnt->mnt_sb; | ||
2511 | atomic_inc(&s->s_active); | ||
2512 | mntput(path.mnt); | ||
2513 | /* lock the sucker */ | ||
2514 | down_write(&s->s_umount); | ||
2515 | /* ... and return the root of (sub)tree on it */ | ||
2516 | return path.dentry; | ||
2517 | } | ||
2518 | EXPORT_SYMBOL(mount_subtree); | ||
2519 | |||
2491 | SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, | 2520 | SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, |
2492 | char __user *, type, unsigned long, flags, void __user *, data) | 2521 | char __user *, type, unsigned long, flags, void __user *, data) |
2493 | { | 2522 | { |
@@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt) | |||
2744 | } | 2773 | } |
2745 | } | 2774 | } |
2746 | EXPORT_SYMBOL(kern_unmount); | 2775 | EXPORT_SYMBOL(kern_unmount); |
2776 | |||
2777 | bool our_mnt(struct vfsmount *mnt) | ||
2778 | { | ||
2779 | return check_mnt(mnt); | ||
2780 | } | ||
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 5b5fa33b6b9d..cbd1a61c110a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c | |||
@@ -548,7 +548,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
548 | 548 | ||
549 | error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); | 549 | error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); |
550 | if (error) | 550 | if (error) |
551 | goto out_bdi; | 551 | goto out_fput; |
552 | 552 | ||
553 | server->ncp_filp = ncp_filp; | 553 | server->ncp_filp = ncp_filp; |
554 | server->ncp_sock = sock; | 554 | server->ncp_sock = sock; |
@@ -559,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
559 | error = -EBADF; | 559 | error = -EBADF; |
560 | server->info_filp = fget(data.info_fd); | 560 | server->info_filp = fget(data.info_fd); |
561 | if (!server->info_filp) | 561 | if (!server->info_filp) |
562 | goto out_fput; | 562 | goto out_bdi; |
563 | error = -ENOTSOCK; | 563 | error = -ENOTSOCK; |
564 | sock_inode = server->info_filp->f_path.dentry->d_inode; | 564 | sock_inode = server->info_filp->f_path.dentry->d_inode; |
565 | if (!S_ISSOCK(sock_inode->i_mode)) | 565 | if (!S_ISSOCK(sock_inode->i_mode)) |
@@ -746,9 +746,9 @@ out_nls: | |||
746 | out_fput2: | 746 | out_fput2: |
747 | if (server->info_filp) | 747 | if (server->info_filp) |
748 | fput(server->info_filp); | 748 | fput(server->info_filp); |
749 | out_fput: | ||
750 | bdi_destroy(&server->bdi); | ||
751 | out_bdi: | 749 | out_bdi: |
750 | bdi_destroy(&server->bdi); | ||
751 | out_fput: | ||
752 | /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: | 752 | /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: |
753 | * | 753 | * |
754 | * The previously used put_filp(ncp_filp); was bogus, since | 754 | * The previously used put_filp(ncp_filp); was bogus, since |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b238d95ac48c..ac2899098147 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry | |||
1468 | res = NULL; | 1468 | res = NULL; |
1469 | goto out; | 1469 | goto out; |
1470 | /* This turned out not to be a regular file */ | 1470 | /* This turned out not to be a regular file */ |
1471 | case -EISDIR: | ||
1471 | case -ENOTDIR: | 1472 | case -ENOTDIR: |
1472 | goto no_open; | 1473 | goto no_open; |
1473 | case -ELOOP: | 1474 | case -ELOOP: |
1474 | if (!(nd->intent.open.flags & O_NOFOLLOW)) | 1475 | if (!(nd->intent.open.flags & O_NOFOLLOW)) |
1475 | goto no_open; | 1476 | goto no_open; |
1476 | /* case -EISDIR: */ | ||
1477 | /* case -EINVAL: */ | 1477 | /* case -EINVAL: */ |
1478 | default: | 1478 | default: |
1479 | res = ERR_CAST(inode); | 1479 | res = ERR_CAST(inode); |
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0a1f8312b4dc..eca56d4b39c0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -40,48 +40,8 @@ | |||
40 | 40 | ||
41 | #define NFSDBG_FACILITY NFSDBG_FILE | 41 | #define NFSDBG_FACILITY NFSDBG_FILE |
42 | 42 | ||
43 | static int nfs_file_open(struct inode *, struct file *); | ||
44 | static int nfs_file_release(struct inode *, struct file *); | ||
45 | static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); | ||
46 | static int nfs_file_mmap(struct file *, struct vm_area_struct *); | ||
47 | static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, | ||
48 | struct pipe_inode_info *pipe, | ||
49 | size_t count, unsigned int flags); | ||
50 | static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, | ||
51 | unsigned long nr_segs, loff_t pos); | ||
52 | static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, | ||
53 | struct file *filp, loff_t *ppos, | ||
54 | size_t count, unsigned int flags); | ||
55 | static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, | ||
56 | unsigned long nr_segs, loff_t pos); | ||
57 | static int nfs_file_flush(struct file *, fl_owner_t id); | ||
58 | static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); | ||
59 | static int nfs_check_flags(int flags); | ||
60 | static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); | ||
61 | static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); | ||
62 | static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); | ||
63 | |||
64 | static const struct vm_operations_struct nfs_file_vm_ops; | 43 | static const struct vm_operations_struct nfs_file_vm_ops; |
65 | 44 | ||
66 | const struct file_operations nfs_file_operations = { | ||
67 | .llseek = nfs_file_llseek, | ||
68 | .read = do_sync_read, | ||
69 | .write = do_sync_write, | ||
70 | .aio_read = nfs_file_read, | ||
71 | .aio_write = nfs_file_write, | ||
72 | .mmap = nfs_file_mmap, | ||
73 | .open = nfs_file_open, | ||
74 | .flush = nfs_file_flush, | ||
75 | .release = nfs_file_release, | ||
76 | .fsync = nfs_file_fsync, | ||
77 | .lock = nfs_lock, | ||
78 | .flock = nfs_flock, | ||
79 | .splice_read = nfs_file_splice_read, | ||
80 | .splice_write = nfs_file_splice_write, | ||
81 | .check_flags = nfs_check_flags, | ||
82 | .setlease = nfs_setlease, | ||
83 | }; | ||
84 | |||
85 | const struct inode_operations nfs_file_inode_operations = { | 45 | const struct inode_operations nfs_file_inode_operations = { |
86 | .permission = nfs_permission, | 46 | .permission = nfs_permission, |
87 | .getattr = nfs_getattr, | 47 | .getattr = nfs_getattr, |
@@ -886,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) | |||
886 | file->f_path.dentry->d_name.name, arg); | 846 | file->f_path.dentry->d_name.name, arg); |
887 | return -EINVAL; | 847 | return -EINVAL; |
888 | } | 848 | } |
849 | |||
850 | const struct file_operations nfs_file_operations = { | ||
851 | .llseek = nfs_file_llseek, | ||
852 | .read = do_sync_read, | ||
853 | .write = do_sync_write, | ||
854 | .aio_read = nfs_file_read, | ||
855 | .aio_write = nfs_file_write, | ||
856 | .mmap = nfs_file_mmap, | ||
857 | .open = nfs_file_open, | ||
858 | .flush = nfs_file_flush, | ||
859 | .release = nfs_file_release, | ||
860 | .fsync = nfs_file_fsync, | ||
861 | .lock = nfs_lock, | ||
862 | .flock = nfs_flock, | ||
863 | .splice_read = nfs_file_splice_read, | ||
864 | .splice_write = nfs_file_splice_write, | ||
865 | .check_flags = nfs_check_flags, | ||
866 | .setlease = nfs_setlease, | ||
867 | }; | ||
868 | |||
869 | #ifdef CONFIG_NFS_V4 | ||
870 | static int | ||
871 | nfs4_file_open(struct inode *inode, struct file *filp) | ||
872 | { | ||
873 | /* | ||
874 | * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to | ||
875 | * this point, then something is very wrong | ||
876 | */ | ||
877 | dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); | ||
878 | return -ENOTDIR; | ||
879 | } | ||
880 | |||
881 | const struct file_operations nfs4_file_operations = { | ||
882 | .llseek = nfs_file_llseek, | ||
883 | .read = do_sync_read, | ||
884 | .write = do_sync_write, | ||
885 | .aio_read = nfs_file_read, | ||
886 | .aio_write = nfs_file_write, | ||
887 | .mmap = nfs_file_mmap, | ||
888 | .open = nfs4_file_open, | ||
889 | .flush = nfs_file_flush, | ||
890 | .release = nfs_file_release, | ||
891 | .fsync = nfs_file_fsync, | ||
892 | .lock = nfs_lock, | ||
893 | .flock = nfs_flock, | ||
894 | .splice_read = nfs_file_splice_read, | ||
895 | .splice_write = nfs_file_splice_write, | ||
896 | .check_flags = nfs_check_flags, | ||
897 | .setlease = nfs_setlease, | ||
898 | }; | ||
899 | #endif /* CONFIG_NFS_V4 */ | ||
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c07a55aec838..50a15fa8cf98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | |||
291 | */ | 291 | */ |
292 | inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; | 292 | inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; |
293 | if (S_ISREG(inode->i_mode)) { | 293 | if (S_ISREG(inode->i_mode)) { |
294 | inode->i_fop = &nfs_file_operations; | 294 | inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; |
295 | inode->i_data.a_ops = &nfs_file_aops; | 295 | inode->i_data.a_ops = &nfs_file_aops; |
296 | inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; | 296 | inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; |
297 | } else if (S_ISDIR(inode->i_mode)) { | 297 | } else if (S_ISDIR(inode->i_mode)) { |
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c1a1bd8ddf1c..3f4d95751d52 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); | |||
299 | extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, | 299 | extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, |
300 | struct list_head *head); | 300 | struct list_head *head); |
301 | 301 | ||
302 | extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, | ||
303 | struct inode *inode); | ||
302 | extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); | 304 | extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); |
303 | extern void nfs_readdata_release(struct nfs_read_data *rdata); | 305 | extern void nfs_readdata_release(struct nfs_read_data *rdata); |
304 | 306 | ||
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 85f1690ca08c..d4bc9ed91748 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c | |||
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { | |||
853 | .dentry_ops = &nfs_dentry_operations, | 853 | .dentry_ops = &nfs_dentry_operations, |
854 | .dir_inode_ops = &nfs3_dir_inode_operations, | 854 | .dir_inode_ops = &nfs3_dir_inode_operations, |
855 | .file_inode_ops = &nfs3_file_inode_operations, | 855 | .file_inode_ops = &nfs3_file_inode_operations, |
856 | .file_ops = &nfs_file_operations, | ||
856 | .getroot = nfs3_proc_get_root, | 857 | .getroot = nfs3_proc_get_root, |
857 | .getattr = nfs3_proc_getattr, | 858 | .getattr = nfs3_proc_getattr, |
858 | .setattr = nfs3_proc_setattr, | 859 | .setattr = nfs3_proc_setattr, |
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b60fddf606f7..be2bbac13817 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst | |||
2464 | case -NFS4ERR_BADNAME: | 2464 | case -NFS4ERR_BADNAME: |
2465 | return -ENOENT; | 2465 | return -ENOENT; |
2466 | case -NFS4ERR_MOVED: | 2466 | case -NFS4ERR_MOVED: |
2467 | err = nfs4_get_referral(dir, name, fattr, fhandle); | 2467 | return nfs4_get_referral(dir, name, fattr, fhandle); |
2468 | break; | ||
2469 | case -NFS4ERR_WRONGSEC: | 2468 | case -NFS4ERR_WRONGSEC: |
2470 | nfs_fixup_secinfo_attributes(fattr, fhandle); | 2469 | nfs_fixup_secinfo_attributes(fattr, fhandle); |
2471 | } | 2470 | } |
@@ -6253,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { | |||
6253 | .dentry_ops = &nfs4_dentry_operations, | 6252 | .dentry_ops = &nfs4_dentry_operations, |
6254 | .dir_inode_ops = &nfs4_dir_inode_operations, | 6253 | .dir_inode_ops = &nfs4_dir_inode_operations, |
6255 | .file_inode_ops = &nfs4_file_inode_operations, | 6254 | .file_inode_ops = &nfs4_file_inode_operations, |
6255 | .file_ops = &nfs4_file_operations, | ||
6256 | .getroot = nfs4_proc_get_root, | 6256 | .getroot = nfs4_proc_get_root, |
6257 | .getattr = nfs4_proc_getattr, | 6257 | .getattr = nfs4_proc_getattr, |
6258 | .setattr = nfs4_proc_setattr, | 6258 | .setattr = nfs4_proc_setattr, |
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index baf73536bc04..8e672a2b2d69 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c | |||
@@ -1260,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) | |||
1260 | } | 1260 | } |
1261 | EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); | 1261 | EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); |
1262 | 1262 | ||
1263 | static void pnfs_ld_handle_read_error(struct nfs_read_data *data) | ||
1264 | { | ||
1265 | struct nfs_pageio_descriptor pgio; | ||
1266 | |||
1267 | put_lseg(data->lseg); | ||
1268 | data->lseg = NULL; | ||
1269 | dprintk("pnfs write error = %d\n", data->pnfs_error); | ||
1270 | |||
1271 | nfs_pageio_init_read_mds(&pgio, data->inode); | ||
1272 | |||
1273 | while (!list_empty(&data->pages)) { | ||
1274 | struct nfs_page *req = nfs_list_entry(data->pages.next); | ||
1275 | |||
1276 | nfs_list_remove_request(req); | ||
1277 | nfs_pageio_add_request(&pgio, req); | ||
1278 | } | ||
1279 | nfs_pageio_complete(&pgio); | ||
1280 | } | ||
1281 | |||
1263 | /* | 1282 | /* |
1264 | * Called by non rpc-based layout drivers | 1283 | * Called by non rpc-based layout drivers |
1265 | */ | 1284 | */ |
@@ -1268,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data) | |||
1268 | if (likely(!data->pnfs_error)) { | 1287 | if (likely(!data->pnfs_error)) { |
1269 | __nfs4_read_done_cb(data); | 1288 | __nfs4_read_done_cb(data); |
1270 | data->mds_ops->rpc_call_done(&data->task, data); | 1289 | data->mds_ops->rpc_call_done(&data->task, data); |
1271 | } else { | 1290 | } else |
1272 | put_lseg(data->lseg); | 1291 | pnfs_ld_handle_read_error(data); |
1273 | data->lseg = NULL; | ||
1274 | dprintk("pnfs write error = %d\n", data->pnfs_error); | ||
1275 | } | ||
1276 | data->mds_ops->rpc_release(data); | 1292 | data->mds_ops->rpc_release(data); |
1277 | } | 1293 | } |
1278 | EXPORT_SYMBOL_GPL(pnfs_ld_read_done); | 1294 | EXPORT_SYMBOL_GPL(pnfs_ld_read_done); |
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ac40b8535d7e..f48125da198a 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c | |||
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { | |||
710 | .dentry_ops = &nfs_dentry_operations, | 710 | .dentry_ops = &nfs_dentry_operations, |
711 | .dir_inode_ops = &nfs_dir_inode_operations, | 711 | .dir_inode_ops = &nfs_dir_inode_operations, |
712 | .file_inode_ops = &nfs_file_inode_operations, | 712 | .file_inode_ops = &nfs_file_inode_operations, |
713 | .file_ops = &nfs_file_operations, | ||
713 | .getroot = nfs_proc_get_root, | 714 | .getroot = nfs_proc_get_root, |
714 | .getattr = nfs_proc_getattr, | 715 | .getattr = nfs_proc_getattr, |
715 | .setattr = nfs_proc_setattr, | 716 | .setattr = nfs_proc_setattr, |
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8b48ec63f722..cfa175c223dc 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c | |||
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) | |||
109 | } | 109 | } |
110 | } | 110 | } |
111 | 111 | ||
112 | static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, | 112 | void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, |
113 | struct inode *inode) | 113 | struct inode *inode) |
114 | { | 114 | { |
115 | nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, | 115 | nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, |
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) | |||
534 | static void nfs_readpage_release_full(void *calldata) | 534 | static void nfs_readpage_release_full(void *calldata) |
535 | { | 535 | { |
536 | struct nfs_read_data *data = calldata; | 536 | struct nfs_read_data *data = calldata; |
537 | struct nfs_pageio_descriptor pgio; | ||
538 | 537 | ||
539 | if (data->pnfs_error) { | ||
540 | nfs_pageio_init_read_mds(&pgio, data->inode); | ||
541 | pgio.pg_recoalesce = 1; | ||
542 | } | ||
543 | while (!list_empty(&data->pages)) { | 538 | while (!list_empty(&data->pages)) { |
544 | struct nfs_page *req = nfs_list_entry(data->pages.next); | 539 | struct nfs_page *req = nfs_list_entry(data->pages.next); |
545 | 540 | ||
546 | nfs_list_remove_request(req); | 541 | nfs_list_remove_request(req); |
547 | if (!data->pnfs_error) | 542 | nfs_readpage_release(req); |
548 | nfs_readpage_release(req); | ||
549 | else | ||
550 | nfs_pageio_add_request(&pgio, req); | ||
551 | } | 543 | } |
552 | if (data->pnfs_error) | ||
553 | nfs_pageio_complete(&pgio); | ||
554 | nfs_readdata_release(calldata); | 544 | nfs_readdata_release(calldata); |
555 | } | 545 | } |
556 | 546 | ||
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 480b3b6bf71e..134777406ee3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void) | |||
2787 | static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, | 2787 | static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, |
2788 | const char *export_path) | 2788 | const char *export_path) |
2789 | { | 2789 | { |
2790 | struct mnt_namespace *ns_private; | ||
2791 | struct super_block *s; | ||
2792 | struct dentry *dentry; | 2790 | struct dentry *dentry; |
2793 | struct path path; | 2791 | int ret = nfs_referral_loop_protect(); |
2794 | int ret; | ||
2795 | |||
2796 | ns_private = create_mnt_ns(root_mnt); | ||
2797 | ret = PTR_ERR(ns_private); | ||
2798 | if (IS_ERR(ns_private)) | ||
2799 | goto out_mntput; | ||
2800 | |||
2801 | ret = nfs_referral_loop_protect(); | ||
2802 | if (ret != 0) | ||
2803 | goto out_put_mnt_ns; | ||
2804 | 2792 | ||
2805 | ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, | 2793 | if (ret) { |
2806 | export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); | 2794 | mntput(root_mnt); |
2795 | return ERR_PTR(ret); | ||
2796 | } | ||
2807 | 2797 | ||
2798 | dentry = mount_subtree(root_mnt, export_path); | ||
2808 | nfs_referral_loop_unprotect(); | 2799 | nfs_referral_loop_unprotect(); |
2809 | put_mnt_ns(ns_private); | ||
2810 | |||
2811 | if (ret != 0) | ||
2812 | goto out_err; | ||
2813 | |||
2814 | s = path.mnt->mnt_sb; | ||
2815 | atomic_inc(&s->s_active); | ||
2816 | dentry = dget(path.dentry); | ||
2817 | 2800 | ||
2818 | path_put(&path); | ||
2819 | down_write(&s->s_umount); | ||
2820 | return dentry; | 2801 | return dentry; |
2821 | out_put_mnt_ns: | ||
2822 | put_mnt_ns(ns_private); | ||
2823 | out_mntput: | ||
2824 | mntput(root_mnt); | ||
2825 | out_err: | ||
2826 | return ERR_PTR(ret); | ||
2827 | } | 2802 | } |
2828 | 2803 | ||
2829 | static struct dentry *nfs4_try_mount(int flags, const char *dev_name, | 2804 | static struct dentry *nfs4_try_mount(int flags, const char *dev_name, |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ed553c60de82..3165aebb43c8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode, | |||
5699 | OCFS2_JOURNAL_ACCESS_WRITE); | 5699 | OCFS2_JOURNAL_ACCESS_WRITE); |
5700 | if (ret) { | 5700 | if (ret) { |
5701 | mlog_errno(ret); | 5701 | mlog_errno(ret); |
5702 | goto out; | 5702 | goto out_commit; |
5703 | } | 5703 | } |
5704 | 5704 | ||
5705 | dquot_free_space_nodirty(inode, | 5705 | dquot_free_space_nodirty(inode, |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c1efe939c774..78b68af3b0e3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page) | |||
290 | } | 290 | } |
291 | 291 | ||
292 | if (down_read_trylock(&oi->ip_alloc_sem) == 0) { | 292 | if (down_read_trylock(&oi->ip_alloc_sem) == 0) { |
293 | /* | ||
294 | * Unlock the page and cycle ip_alloc_sem so that we don't | ||
295 | * busyloop waiting for ip_alloc_sem to unlock | ||
296 | */ | ||
293 | ret = AOP_TRUNCATED_PAGE; | 297 | ret = AOP_TRUNCATED_PAGE; |
298 | unlock_page(page); | ||
299 | unlock = 0; | ||
300 | down_read(&oi->ip_alloc_sem); | ||
301 | up_read(&oi->ip_alloc_sem); | ||
294 | goto out_inode_unlock; | 302 | goto out_inode_unlock; |
295 | } | 303 | } |
296 | 304 | ||
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
563 | { | 571 | { |
564 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 572 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
565 | int level; | 573 | int level; |
574 | wait_queue_head_t *wq = ocfs2_ioend_wq(inode); | ||
566 | 575 | ||
567 | /* this io's submitter should not have unlocked this before we could */ | 576 | /* this io's submitter should not have unlocked this before we could */ |
568 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 577 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
570 | if (ocfs2_iocb_is_sem_locked(iocb)) | 579 | if (ocfs2_iocb_is_sem_locked(iocb)) |
571 | ocfs2_iocb_clear_sem_locked(iocb); | 580 | ocfs2_iocb_clear_sem_locked(iocb); |
572 | 581 | ||
582 | if (ocfs2_iocb_is_unaligned_aio(iocb)) { | ||
583 | ocfs2_iocb_clear_unaligned_aio(iocb); | ||
584 | |||
585 | if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && | ||
586 | waitqueue_active(wq)) { | ||
587 | wake_up_all(wq); | ||
588 | } | ||
589 | } | ||
590 | |||
573 | ocfs2_iocb_clear_rw_locked(iocb); | 591 | ocfs2_iocb_clear_rw_locked(iocb); |
574 | 592 | ||
575 | level = ocfs2_iocb_rw_locked_level(iocb); | 593 | level = ocfs2_iocb_rw_locked_level(iocb); |
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt { | |||
863 | struct page *w_target_page; | 881 | struct page *w_target_page; |
864 | 882 | ||
865 | /* | 883 | /* |
884 | * w_target_locked is used for page_mkwrite path indicating no unlocking | ||
885 | * against w_target_page in ocfs2_write_end_nolock. | ||
886 | */ | ||
887 | unsigned int w_target_locked:1; | ||
888 | |||
889 | /* | ||
866 | * ocfs2_write_end() uses this to know what the real range to | 890 | * ocfs2_write_end() uses this to know what the real range to |
867 | * write in the target should be. | 891 | * write in the target should be. |
868 | */ | 892 | */ |
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) | |||
895 | 919 | ||
896 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | 920 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) |
897 | { | 921 | { |
922 | int i; | ||
923 | |||
924 | /* | ||
925 | * w_target_locked is only set to true in the page_mkwrite() case. | ||
926 | * The intent is to allow us to lock the target page from write_begin() | ||
927 | * to write_end(). The caller must hold a ref on w_target_page. | ||
928 | */ | ||
929 | if (wc->w_target_locked) { | ||
930 | BUG_ON(!wc->w_target_page); | ||
931 | for (i = 0; i < wc->w_num_pages; i++) { | ||
932 | if (wc->w_target_page == wc->w_pages[i]) { | ||
933 | wc->w_pages[i] = NULL; | ||
934 | break; | ||
935 | } | ||
936 | } | ||
937 | mark_page_accessed(wc->w_target_page); | ||
938 | page_cache_release(wc->w_target_page); | ||
939 | } | ||
898 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); | 940 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); |
899 | 941 | ||
900 | brelse(wc->w_di_bh); | 942 | brelse(wc->w_di_bh); |
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1132 | */ | 1174 | */ |
1133 | lock_page(mmap_page); | 1175 | lock_page(mmap_page); |
1134 | 1176 | ||
1177 | /* Exit and let the caller retry */ | ||
1135 | if (mmap_page->mapping != mapping) { | 1178 | if (mmap_page->mapping != mapping) { |
1179 | WARN_ON(mmap_page->mapping); | ||
1136 | unlock_page(mmap_page); | 1180 | unlock_page(mmap_page); |
1137 | /* | 1181 | ret = -EAGAIN; |
1138 | * Sanity check - the locking in | ||
1139 | * ocfs2_pagemkwrite() should ensure | ||
1140 | * that this code doesn't trigger. | ||
1141 | */ | ||
1142 | ret = -EINVAL; | ||
1143 | mlog_errno(ret); | ||
1144 | goto out; | 1182 | goto out; |
1145 | } | 1183 | } |
1146 | 1184 | ||
1147 | page_cache_get(mmap_page); | 1185 | page_cache_get(mmap_page); |
1148 | wc->w_pages[i] = mmap_page; | 1186 | wc->w_pages[i] = mmap_page; |
1187 | wc->w_target_locked = true; | ||
1149 | } else { | 1188 | } else { |
1150 | wc->w_pages[i] = find_or_create_page(mapping, index, | 1189 | wc->w_pages[i] = find_or_create_page(mapping, index, |
1151 | GFP_NOFS); | 1190 | GFP_NOFS); |
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1160 | wc->w_target_page = wc->w_pages[i]; | 1199 | wc->w_target_page = wc->w_pages[i]; |
1161 | } | 1200 | } |
1162 | out: | 1201 | out: |
1202 | if (ret) | ||
1203 | wc->w_target_locked = false; | ||
1163 | return ret; | 1204 | return ret; |
1164 | } | 1205 | } |
1165 | 1206 | ||
@@ -1817,11 +1858,23 @@ try_again: | |||
1817 | */ | 1858 | */ |
1818 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, | 1859 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, |
1819 | cluster_of_pages, mmap_page); | 1860 | cluster_of_pages, mmap_page); |
1820 | if (ret) { | 1861 | if (ret && ret != -EAGAIN) { |
1821 | mlog_errno(ret); | 1862 | mlog_errno(ret); |
1822 | goto out_quota; | 1863 | goto out_quota; |
1823 | } | 1864 | } |
1824 | 1865 | ||
1866 | /* | ||
1867 | * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock | ||
1868 | * the target page. In this case, we exit with no error and no target | ||
1869 | * page. This will trigger the caller, page_mkwrite(), to re-try | ||
1870 | * the operation. | ||
1871 | */ | ||
1872 | if (ret == -EAGAIN) { | ||
1873 | BUG_ON(wc->w_target_page); | ||
1874 | ret = 0; | ||
1875 | goto out_quota; | ||
1876 | } | ||
1877 | |||
1825 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, | 1878 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, |
1826 | len); | 1879 | len); |
1827 | if (ret) { | 1880 | if (ret) { |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 75cf3ad987a6..ffb2da370a99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits { | |||
78 | OCFS2_IOCB_RW_LOCK = 0, | 78 | OCFS2_IOCB_RW_LOCK = 0, |
79 | OCFS2_IOCB_RW_LOCK_LEVEL, | 79 | OCFS2_IOCB_RW_LOCK_LEVEL, |
80 | OCFS2_IOCB_SEM, | 80 | OCFS2_IOCB_SEM, |
81 | OCFS2_IOCB_UNALIGNED_IO, | ||
81 | OCFS2_IOCB_NUM_LOCKS | 82 | OCFS2_IOCB_NUM_LOCKS |
82 | }; | 83 | }; |
83 | 84 | ||
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits { | |||
91 | clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | 92 | clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) |
92 | #define ocfs2_iocb_is_sem_locked(iocb) \ | 93 | #define ocfs2_iocb_is_sem_locked(iocb) \ |
93 | test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | 94 | test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) |
95 | |||
96 | #define ocfs2_iocb_set_unaligned_aio(iocb) \ | ||
97 | set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
98 | #define ocfs2_iocb_clear_unaligned_aio(iocb) \ | ||
99 | clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
100 | #define ocfs2_iocb_is_unaligned_aio(iocb) \ | ||
101 | test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
102 | |||
103 | #define OCFS2_IOEND_WQ_HASH_SZ 37 | ||
104 | #define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ | ||
105 | OCFS2_IOEND_WQ_HASH_SZ]) | ||
106 | extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; | ||
107 | |||
94 | #endif /* OCFS2_FILE_H */ | 108 | #endif /* OCFS2_FILE_H */ |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9a3e6bbff27b..a4e855e3690e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -216,6 +216,7 @@ struct o2hb_region { | |||
216 | 216 | ||
217 | struct list_head hr_all_item; | 217 | struct list_head hr_all_item; |
218 | unsigned hr_unclean_stop:1, | 218 | unsigned hr_unclean_stop:1, |
219 | hr_aborted_start:1, | ||
219 | hr_item_pinned:1, | 220 | hr_item_pinned:1, |
220 | hr_item_dropped:1; | 221 | hr_item_dropped:1; |
221 | 222 | ||
@@ -254,6 +255,10 @@ struct o2hb_region { | |||
254 | * a more complete api that doesn't lead to this sort of fragility. */ | 255 | * a more complete api that doesn't lead to this sort of fragility. */ |
255 | atomic_t hr_steady_iterations; | 256 | atomic_t hr_steady_iterations; |
256 | 257 | ||
258 | /* terminate o2hb thread if it does not reach steady state | ||
259 | * (hr_steady_iterations == 0) within hr_unsteady_iterations */ | ||
260 | atomic_t hr_unsteady_iterations; | ||
261 | |||
257 | char hr_dev_name[BDEVNAME_SIZE]; | 262 | char hr_dev_name[BDEVNAME_SIZE]; |
258 | 263 | ||
259 | unsigned int hr_timeout_ms; | 264 | unsigned int hr_timeout_ms; |
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work) | |||
324 | 329 | ||
325 | static void o2hb_arm_write_timeout(struct o2hb_region *reg) | 330 | static void o2hb_arm_write_timeout(struct o2hb_region *reg) |
326 | { | 331 | { |
332 | /* Arm writeout only after thread reaches steady state */ | ||
333 | if (atomic_read(®->hr_steady_iterations) != 0) | ||
334 | return; | ||
335 | |||
327 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", | 336 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", |
328 | O2HB_MAX_WRITE_TIMEOUT_MS); | 337 | O2HB_MAX_WRITE_TIMEOUT_MS); |
329 | 338 | ||
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg, | |||
537 | return read == computed; | 546 | return read == computed; |
538 | } | 547 | } |
539 | 548 | ||
540 | /* We want to make sure that nobody is heartbeating on top of us -- | 549 | /* |
541 | * this will help detect an invalid configuration. */ | 550 | * Compare the slot data with what we wrote in the last iteration. |
542 | static void o2hb_check_last_timestamp(struct o2hb_region *reg) | 551 | * If the match fails, print an appropriate error message. This is to |
552 | * detect errors like... another node hearting on the same slot, | ||
553 | * flaky device that is losing writes, etc. | ||
554 | * Returns 1 if check succeeds, 0 otherwise. | ||
555 | */ | ||
556 | static int o2hb_check_own_slot(struct o2hb_region *reg) | ||
543 | { | 557 | { |
544 | struct o2hb_disk_slot *slot; | 558 | struct o2hb_disk_slot *slot; |
545 | struct o2hb_disk_heartbeat_block *hb_block; | 559 | struct o2hb_disk_heartbeat_block *hb_block; |
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) | |||
548 | slot = ®->hr_slots[o2nm_this_node()]; | 562 | slot = ®->hr_slots[o2nm_this_node()]; |
549 | /* Don't check on our 1st timestamp */ | 563 | /* Don't check on our 1st timestamp */ |
550 | if (!slot->ds_last_time) | 564 | if (!slot->ds_last_time) |
551 | return; | 565 | return 0; |
552 | 566 | ||
553 | hb_block = slot->ds_raw_block; | 567 | hb_block = slot->ds_raw_block; |
554 | if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && | 568 | if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && |
555 | le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && | 569 | le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && |
556 | hb_block->hb_node == slot->ds_node_num) | 570 | hb_block->hb_node == slot->ds_node_num) |
557 | return; | 571 | return 1; |
558 | 572 | ||
559 | #define ERRSTR1 "Another node is heartbeating on device" | 573 | #define ERRSTR1 "Another node is heartbeating on device" |
560 | #define ERRSTR2 "Heartbeat generation mismatch on device" | 574 | #define ERRSTR2 "Heartbeat generation mismatch on device" |
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) | |||
574 | (unsigned long long)slot->ds_last_time, hb_block->hb_node, | 588 | (unsigned long long)slot->ds_last_time, hb_block->hb_node, |
575 | (unsigned long long)le64_to_cpu(hb_block->hb_generation), | 589 | (unsigned long long)le64_to_cpu(hb_block->hb_generation), |
576 | (unsigned long long)le64_to_cpu(hb_block->hb_seq)); | 590 | (unsigned long long)le64_to_cpu(hb_block->hb_seq)); |
591 | |||
592 | return 0; | ||
577 | } | 593 | } |
578 | 594 | ||
579 | static inline void o2hb_prepare_block(struct o2hb_region *reg, | 595 | static inline void o2hb_prepare_block(struct o2hb_region *reg, |
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) | |||
719 | o2nm_node_put(node); | 735 | o2nm_node_put(node); |
720 | } | 736 | } |
721 | 737 | ||
722 | static void o2hb_set_quorum_device(struct o2hb_region *reg, | 738 | static void o2hb_set_quorum_device(struct o2hb_region *reg) |
723 | struct o2hb_disk_slot *slot) | ||
724 | { | 739 | { |
725 | assert_spin_locked(&o2hb_live_lock); | ||
726 | |||
727 | if (!o2hb_global_heartbeat_active()) | 740 | if (!o2hb_global_heartbeat_active()) |
728 | return; | 741 | return; |
729 | 742 | ||
730 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | 743 | /* Prevent race with o2hb_heartbeat_group_drop_item() */ |
744 | if (kthread_should_stop()) | ||
745 | return; | ||
746 | |||
747 | /* Tag region as quorum only after thread reaches steady state */ | ||
748 | if (atomic_read(®->hr_steady_iterations) != 0) | ||
731 | return; | 749 | return; |
732 | 750 | ||
751 | spin_lock(&o2hb_live_lock); | ||
752 | |||
753 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
754 | goto unlock; | ||
755 | |||
733 | /* | 756 | /* |
734 | * A region can be added to the quorum only when it sees all | 757 | * A region can be added to the quorum only when it sees all |
735 | * live nodes heartbeat on it. In other words, the region has been | 758 | * live nodes heartbeat on it. In other words, the region has been |
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, | |||
737 | */ | 760 | */ |
738 | if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, | 761 | if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, |
739 | sizeof(o2hb_live_node_bitmap))) | 762 | sizeof(o2hb_live_node_bitmap))) |
740 | return; | 763 | goto unlock; |
741 | |||
742 | if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) | ||
743 | return; | ||
744 | 764 | ||
745 | printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", | 765 | printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", |
746 | config_item_name(®->hr_item)); | 766 | config_item_name(®->hr_item), reg->hr_dev_name); |
747 | 767 | ||
748 | set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | 768 | set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); |
749 | 769 | ||
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, | |||
754 | if (o2hb_pop_count(&o2hb_quorum_region_bitmap, | 774 | if (o2hb_pop_count(&o2hb_quorum_region_bitmap, |
755 | O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) | 775 | O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) |
756 | o2hb_region_unpin(NULL); | 776 | o2hb_region_unpin(NULL); |
777 | unlock: | ||
778 | spin_unlock(&o2hb_live_lock); | ||
757 | } | 779 | } |
758 | 780 | ||
759 | static int o2hb_check_slot(struct o2hb_region *reg, | 781 | static int o2hb_check_slot(struct o2hb_region *reg, |
@@ -925,8 +947,6 @@ fire_callbacks: | |||
925 | slot->ds_equal_samples = 0; | 947 | slot->ds_equal_samples = 0; |
926 | } | 948 | } |
927 | out: | 949 | out: |
928 | o2hb_set_quorum_device(reg, slot); | ||
929 | |||
930 | spin_unlock(&o2hb_live_lock); | 950 | spin_unlock(&o2hb_live_lock); |
931 | 951 | ||
932 | o2hb_run_event_list(&event); | 952 | o2hb_run_event_list(&event); |
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes, | |||
957 | 977 | ||
958 | static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | 978 | static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) |
959 | { | 979 | { |
960 | int i, ret, highest_node, change = 0; | 980 | int i, ret, highest_node; |
981 | int membership_change = 0, own_slot_ok = 0; | ||
961 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 982 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
962 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 983 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
963 | struct o2hb_bio_wait_ctxt write_wc; | 984 | struct o2hb_bio_wait_ctxt write_wc; |
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
966 | sizeof(configured_nodes)); | 987 | sizeof(configured_nodes)); |
967 | if (ret) { | 988 | if (ret) { |
968 | mlog_errno(ret); | 989 | mlog_errno(ret); |
969 | return ret; | 990 | goto bail; |
970 | } | 991 | } |
971 | 992 | ||
972 | /* | 993 | /* |
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
982 | 1003 | ||
983 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); | 1004 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); |
984 | if (highest_node >= O2NM_MAX_NODES) { | 1005 | if (highest_node >= O2NM_MAX_NODES) { |
985 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); | 1006 | mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); |
986 | return -EINVAL; | 1007 | ret = -EINVAL; |
1008 | goto bail; | ||
987 | } | 1009 | } |
988 | 1010 | ||
989 | /* No sense in reading the slots of nodes that don't exist | 1011 | /* No sense in reading the slots of nodes that don't exist |
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
993 | ret = o2hb_read_slots(reg, highest_node + 1); | 1015 | ret = o2hb_read_slots(reg, highest_node + 1); |
994 | if (ret < 0) { | 1016 | if (ret < 0) { |
995 | mlog_errno(ret); | 1017 | mlog_errno(ret); |
996 | return ret; | 1018 | goto bail; |
997 | } | 1019 | } |
998 | 1020 | ||
999 | /* With an up to date view of the slots, we can check that no | 1021 | /* With an up to date view of the slots, we can check that no |
1000 | * other node has been improperly configured to heartbeat in | 1022 | * other node has been improperly configured to heartbeat in |
1001 | * our slot. */ | 1023 | * our slot. */ |
1002 | o2hb_check_last_timestamp(reg); | 1024 | own_slot_ok = o2hb_check_own_slot(reg); |
1003 | 1025 | ||
1004 | /* fill in the proper info for our next heartbeat */ | 1026 | /* fill in the proper info for our next heartbeat */ |
1005 | o2hb_prepare_block(reg, reg->hr_generation); | 1027 | o2hb_prepare_block(reg, reg->hr_generation); |
1006 | 1028 | ||
1007 | /* And fire off the write. Note that we don't wait on this I/O | ||
1008 | * until later. */ | ||
1009 | ret = o2hb_issue_node_write(reg, &write_wc); | 1029 | ret = o2hb_issue_node_write(reg, &write_wc); |
1010 | if (ret < 0) { | 1030 | if (ret < 0) { |
1011 | mlog_errno(ret); | 1031 | mlog_errno(ret); |
1012 | return ret; | 1032 | goto bail; |
1013 | } | 1033 | } |
1014 | 1034 | ||
1015 | i = -1; | 1035 | i = -1; |
1016 | while((i = find_next_bit(configured_nodes, | 1036 | while((i = find_next_bit(configured_nodes, |
1017 | O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { | 1037 | O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { |
1018 | change |= o2hb_check_slot(reg, &reg->hr_slots[i]); | 1038 | membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]); |
1019 | } | 1039 | } |
1020 | 1040 | ||
1021 | /* | 1041 | /* |
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
1030 | * disk */ | 1050 | * disk */ |
1031 | mlog(ML_ERROR, "Write error %d on device \"%s\"\n", | 1051 | mlog(ML_ERROR, "Write error %d on device \"%s\"\n", |
1032 | write_wc.wc_error, reg->hr_dev_name); | 1052 | write_wc.wc_error, reg->hr_dev_name); |
1033 | return write_wc.wc_error; | 1053 | ret = write_wc.wc_error; |
1054 | goto bail; | ||
1034 | } | 1055 | } |
1035 | 1056 | ||
1036 | o2hb_arm_write_timeout(reg); | 1057 | /* Skip disarming the timeout if own slot has stale/bad data */ |
1058 | if (own_slot_ok) { | ||
1059 | o2hb_set_quorum_device(reg); | ||
1060 | o2hb_arm_write_timeout(reg); | ||
1061 | } | ||
1037 | 1062 | ||
1063 | bail: | ||
1038 | /* let the person who launched us know when things are steady */ | 1064 | /* let the person who launched us know when things are steady */ |
1039 | if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { | 1065 | if (atomic_read(&reg->hr_steady_iterations) != 0) { |
1040 | if (atomic_dec_and_test(&reg->hr_steady_iterations)) | 1066 | if (!ret && own_slot_ok && !membership_change) { |
1067 | if (atomic_dec_and_test(&reg->hr_steady_iterations)) | ||
1068 | wake_up(&o2hb_steady_queue); | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | if (atomic_read(&reg->hr_steady_iterations) != 0) { | ||
1073 | if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) { | ||
1074 | printk(KERN_NOTICE "o2hb: Unable to stabilize " | ||
1075 | "heartbeart on region %s (%s)\n", | ||
1076 | config_item_name(&reg->hr_item), | ||
1077 | reg->hr_dev_name); | ||
1078 | atomic_set(&reg->hr_steady_iterations, 0); | ||
1079 | reg->hr_aborted_start = 1; | ||
1041 | wake_up(&o2hb_steady_queue); | 1080 | wake_up(&o2hb_steady_queue); |
1081 | ret = -EIO; | ||
1082 | } | ||
1042 | } | 1083 | } |
1043 | 1084 | ||
1044 | return 0; | 1085 | return ret; |
1045 | } | 1086 | } |
1046 | 1087 | ||
1047 | /* Subtract b from a, storing the result in a. a *must* have a larger | 1088 | /* Subtract b from a, storing the result in a. a *must* have a larger |
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data) | |||
1095 | /* Pin node */ | 1136 | /* Pin node */ |
1096 | o2nm_depend_this_node(); | 1137 | o2nm_depend_this_node(); |
1097 | 1138 | ||
1098 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { | 1139 | while (!kthread_should_stop() && |
1140 | !reg->hr_unclean_stop && !reg->hr_aborted_start) { | ||
1099 | /* We track the time spent inside | 1141 | /* We track the time spent inside |
1100 | * o2hb_do_disk_heartbeat so that we avoid more than | 1142 | * o2hb_do_disk_heartbeat so that we avoid more than |
1101 | * hr_timeout_ms between disk writes. On busy systems | 1143 | * hr_timeout_ms between disk writes. On busy systems |
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data) | |||
1103 | * likely to time itself out. */ | 1145 | * likely to time itself out. */ |
1104 | do_gettimeofday(&before_hb); | 1146 | do_gettimeofday(&before_hb); |
1105 | 1147 | ||
1106 | i = 0; | 1148 | ret = o2hb_do_disk_heartbeat(reg); |
1107 | do { | ||
1108 | ret = o2hb_do_disk_heartbeat(reg); | ||
1109 | } while (ret && ++i < 2); | ||
1110 | 1149 | ||
1111 | do_gettimeofday(&after_hb); | 1150 | do_gettimeofday(&after_hb); |
1112 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); | 1151 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); |
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data) | |||
1117 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, | 1156 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, |
1118 | elapsed_msec); | 1157 | elapsed_msec); |
1119 | 1158 | ||
1120 | if (elapsed_msec < reg->hr_timeout_ms) { | 1159 | if (!kthread_should_stop() && |
1160 | elapsed_msec < reg->hr_timeout_ms) { | ||
1121 | /* the kthread api has blocked signals for us so no | 1161 | /* the kthread api has blocked signals for us so no |
1122 | * need to record the return value. */ | 1162 | * need to record the return value. */ |
1123 | msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); | 1163 | msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); |
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data) | |||
1134 | * to timeout on this region when we could just as easily | 1174 | * to timeout on this region when we could just as easily |
1135 | * write a clear generation - thus indicating to them that | 1175 | * write a clear generation - thus indicating to them that |
1136 | * this node has left this region. | 1176 | * this node has left this region. |
1137 | * | 1177 | */ |
1138 | * XXX: Should we skip this on unclean_stop? */ | 1178 | if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { |
1139 | o2hb_prepare_block(reg, 0); | 1179 | o2hb_prepare_block(reg, 0); |
1140 | ret = o2hb_issue_node_write(reg, &write_wc); | 1180 | ret = o2hb_issue_node_write(reg, &write_wc); |
1141 | if (ret == 0) { | 1181 | if (ret == 0) |
1142 | o2hb_wait_on_io(reg, &write_wc); | 1182 | o2hb_wait_on_io(reg, &write_wc); |
1143 | } else { | 1183 | else |
1144 | mlog_errno(ret); | 1184 | mlog_errno(ret); |
1145 | } | 1185 | } |
1146 | 1186 | ||
1147 | /* Unpin node */ | 1187 | /* Unpin node */ |
1148 | o2nm_undepend_this_node(); | 1188 | o2nm_undepend_this_node(); |
1149 | 1189 | ||
1150 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); | 1190 | mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n"); |
1151 | 1191 | ||
1152 | return 0; | 1192 | return 0; |
1153 | } | 1193 | } |
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) | |||
1158 | struct o2hb_debug_buf *db = inode->i_private; | 1198 | struct o2hb_debug_buf *db = inode->i_private; |
1159 | struct o2hb_region *reg; | 1199 | struct o2hb_region *reg; |
1160 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 1200 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
1201 | unsigned long lts; | ||
1161 | char *buf = NULL; | 1202 | char *buf = NULL; |
1162 | int i = -1; | 1203 | int i = -1; |
1163 | int out = 0; | 1204 | int out = 0; |
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) | |||
1194 | 1235 | ||
1195 | case O2HB_DB_TYPE_REGION_ELAPSED_TIME: | 1236 | case O2HB_DB_TYPE_REGION_ELAPSED_TIME: |
1196 | reg = (struct o2hb_region *)db->db_data; | 1237 | reg = (struct o2hb_region *)db->db_data; |
1197 | out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", | 1238 | lts = reg->hr_last_timeout_start; |
1198 | jiffies_to_msecs(jiffies - | 1239 | /* If 0, it has never been set before */ |
1199 | reg->hr_last_timeout_start)); | 1240 | if (lts) |
1241 | lts = jiffies_to_msecs(jiffies - lts); | ||
1242 | out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); | ||
1200 | goto done; | 1243 | goto done; |
1201 | 1244 | ||
1202 | case O2HB_DB_TYPE_REGION_PINNED: | 1245 | case O2HB_DB_TYPE_REGION_PINNED: |
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item) | |||
1426 | struct page *page; | 1469 | struct page *page; |
1427 | struct o2hb_region *reg = to_o2hb_region(item); | 1470 | struct o2hb_region *reg = to_o2hb_region(item); |
1428 | 1471 | ||
1472 | mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); | ||
1473 | |||
1429 | if (reg->hr_tmp_block) | 1474 | if (reg->hr_tmp_block) |
1430 | kfree(reg->hr_tmp_block); | 1475 | kfree(reg->hr_tmp_block); |
1431 | 1476 | ||
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1792 | live_threshold <<= 1; | 1837 | live_threshold <<= 1; |
1793 | spin_unlock(&o2hb_live_lock); | 1838 | spin_unlock(&o2hb_live_lock); |
1794 | } | 1839 | } |
1795 | atomic_set(&reg->hr_steady_iterations, live_threshold + 1); | 1840 | ++live_threshold; |
1841 | atomic_set(&reg->hr_steady_iterations, live_threshold); | ||
1842 | /* unsteady_iterations is double the steady_iterations */ | ||
1843 | atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1)); | ||
1796 | 1844 | ||
1797 | hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", | 1845 | hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", |
1798 | reg->hr_item.ci_name); | 1846 | reg->hr_item.ci_name); |
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1809 | ret = wait_event_interruptible(o2hb_steady_queue, | 1857 | ret = wait_event_interruptible(o2hb_steady_queue, |
1810 | atomic_read(&reg->hr_steady_iterations) == 0); | 1858 | atomic_read(&reg->hr_steady_iterations) == 0); |
1811 | if (ret) { | 1859 | if (ret) { |
1812 | /* We got interrupted (hello ptrace!). Clean up */ | 1860 | atomic_set(&reg->hr_steady_iterations, 0); |
1813 | spin_lock(&o2hb_live_lock); | 1861 | reg->hr_aborted_start = 1; |
1814 | hb_task = reg->hr_task; | 1862 | } |
1815 | reg->hr_task = NULL; | ||
1816 | spin_unlock(&o2hb_live_lock); | ||
1817 | 1863 | ||
1818 | if (hb_task) | 1864 | if (reg->hr_aborted_start) { |
1819 | kthread_stop(hb_task); | 1865 | ret = -EIO; |
1820 | goto out; | 1866 | goto out; |
1821 | } | 1867 | } |
1822 | 1868 | ||
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1833 | ret = -EIO; | 1879 | ret = -EIO; |
1834 | 1880 | ||
1835 | if (hb_task && o2hb_global_heartbeat_active()) | 1881 | if (hb_task && o2hb_global_heartbeat_active()) |
1836 | printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", | 1882 | printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", |
1837 | config_item_name(&reg->hr_item)); | 1883 | config_item_name(&reg->hr_item), reg->hr_dev_name); |
1838 | 1884 | ||
1839 | out: | 1885 | out: |
1840 | if (filp) | 1886 | if (filp) |
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
2092 | 2138 | ||
2093 | /* stop the thread when the user removes the region dir */ | 2139 | /* stop the thread when the user removes the region dir */ |
2094 | spin_lock(&o2hb_live_lock); | 2140 | spin_lock(&o2hb_live_lock); |
2095 | if (o2hb_global_heartbeat_active()) { | ||
2096 | clear_bit(reg->hr_region_num, o2hb_region_bitmap); | ||
2097 | clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); | ||
2098 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
2099 | quorum_region = 1; | ||
2100 | clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | ||
2101 | } | ||
2102 | hb_task = reg->hr_task; | 2141 | hb_task = reg->hr_task; |
2103 | reg->hr_task = NULL; | 2142 | reg->hr_task = NULL; |
2104 | reg->hr_item_dropped = 1; | 2143 | reg->hr_item_dropped = 1; |
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
2107 | if (hb_task) | 2146 | if (hb_task) |
2108 | kthread_stop(hb_task); | 2147 | kthread_stop(hb_task); |
2109 | 2148 | ||
2149 | if (o2hb_global_heartbeat_active()) { | ||
2150 | spin_lock(&o2hb_live_lock); | ||
2151 | clear_bit(reg->hr_region_num, o2hb_region_bitmap); | ||
2152 | clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); | ||
2153 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
2154 | quorum_region = 1; | ||
2155 | clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | ||
2156 | spin_unlock(&o2hb_live_lock); | ||
2157 | printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", | ||
2158 | ((atomic_read(&reg->hr_steady_iterations) == 0) ? | ||
2159 | "stopped" : "start aborted"), config_item_name(item), | ||
2160 | reg->hr_dev_name); | ||
2161 | } | ||
2162 | |||
2110 | /* | 2163 | /* |
2111 | * If we're racing a dev_write(), we need to wake them. They will | 2164 | * If we're racing a dev_write(), we need to wake them. They will |
2112 | * check reg->hr_task | 2165 | * check reg->hr_task |
2113 | */ | 2166 | */ |
2114 | if (atomic_read(&reg->hr_steady_iterations) != 0) { | 2167 | if (atomic_read(&reg->hr_steady_iterations) != 0) { |
2168 | reg->hr_aborted_start = 1; | ||
2115 | atomic_set(&reg->hr_steady_iterations, 0); | 2169 | atomic_set(&reg->hr_steady_iterations, 0); |
2116 | wake_up(&o2hb_steady_queue); | 2170 | wake_up(&o2hb_steady_queue); |
2117 | } | 2171 | } |
2118 | 2172 | ||
2119 | if (o2hb_global_heartbeat_active()) | ||
2120 | printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", | ||
2121 | config_item_name(&reg->hr_item)); | |||
2122 | |||
2123 | config_item_put(item); | 2173 | config_item_put(item); |
2124 | 2174 | ||
2125 | if (!o2hb_global_heartbeat_active() || !quorum_region) | 2175 | if (!o2hb_global_heartbeat_active() || !quorum_region) |
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 3a5835904b3d..dc45deb19e68 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #define SC_DEBUG_NAME "sock_containers" | 47 | #define SC_DEBUG_NAME "sock_containers" |
48 | #define NST_DEBUG_NAME "send_tracking" | 48 | #define NST_DEBUG_NAME "send_tracking" |
49 | #define STATS_DEBUG_NAME "stats" | 49 | #define STATS_DEBUG_NAME "stats" |
50 | #define NODES_DEBUG_NAME "connected_nodes" | ||
50 | 51 | ||
51 | #define SHOW_SOCK_CONTAINERS 0 | 52 | #define SHOW_SOCK_CONTAINERS 0 |
52 | #define SHOW_SOCK_STATS 1 | 53 | #define SHOW_SOCK_STATS 1 |
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry; | |||
55 | static struct dentry *sc_dentry; | 56 | static struct dentry *sc_dentry; |
56 | static struct dentry *nst_dentry; | 57 | static struct dentry *nst_dentry; |
57 | static struct dentry *stats_dentry; | 58 | static struct dentry *stats_dentry; |
59 | static struct dentry *nodes_dentry; | ||
58 | 60 | ||
59 | static DEFINE_SPINLOCK(o2net_debug_lock); | 61 | static DEFINE_SPINLOCK(o2net_debug_lock); |
60 | 62 | ||
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = { | |||
491 | .release = sc_fop_release, | 493 | .release = sc_fop_release, |
492 | }; | 494 | }; |
493 | 495 | ||
494 | int o2net_debugfs_init(void) | 496 | static int o2net_fill_bitmap(char *buf, int len) |
495 | { | 497 | { |
496 | o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); | 498 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
497 | if (!o2net_dentry) { | 499 | int i = -1, out = 0; |
498 | mlog_errno(-ENOMEM); | ||
499 | goto bail; | ||
500 | } | ||
501 | 500 | ||
502 | nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, | 501 | o2net_fill_node_map(map, sizeof(map)); |
503 | o2net_dentry, NULL, | ||
504 | &nst_seq_fops); | ||
505 | if (!nst_dentry) { | ||
506 | mlog_errno(-ENOMEM); | ||
507 | goto bail; | ||
508 | } | ||
509 | 502 | ||
510 | sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, | 503 | while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) |
511 | o2net_dentry, NULL, | 504 | out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); |
512 | &sc_seq_fops); | 505 | out += snprintf(buf + out, PAGE_SIZE - out, "\n"); |
513 | if (!sc_dentry) { | ||
514 | mlog_errno(-ENOMEM); | ||
515 | goto bail; | ||
516 | } | ||
517 | 506 | ||
518 | stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, | 507 | return out; |
519 | o2net_dentry, NULL, | 508 | } |
520 | &stats_seq_fops); | 509 | |
521 | if (!stats_dentry) { | 510 | static int nodes_fop_open(struct inode *inode, struct file *file) |
522 | mlog_errno(-ENOMEM); | 511 | { |
523 | goto bail; | 512 | char *buf; |
524 | } | 513 | |
514 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
515 | if (!buf) | ||
516 | return -ENOMEM; | ||
517 | |||
518 | i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE)); | ||
519 | |||
520 | file->private_data = buf; | ||
525 | 521 | ||
526 | return 0; | 522 | return 0; |
527 | bail: | ||
528 | debugfs_remove(stats_dentry); | ||
529 | debugfs_remove(sc_dentry); | ||
530 | debugfs_remove(nst_dentry); | ||
531 | debugfs_remove(o2net_dentry); | ||
532 | return -ENOMEM; | ||
533 | } | 523 | } |
534 | 524 | ||
525 | static int o2net_debug_release(struct inode *inode, struct file *file) | ||
526 | { | ||
527 | kfree(file->private_data); | ||
528 | return 0; | ||
529 | } | ||
530 | |||
531 | static ssize_t o2net_debug_read(struct file *file, char __user *buf, | ||
532 | size_t nbytes, loff_t *ppos) | ||
533 | { | ||
534 | return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, | ||
535 | i_size_read(file->f_mapping->host)); | ||
536 | } | ||
537 | |||
538 | static const struct file_operations nodes_fops = { | ||
539 | .open = nodes_fop_open, | ||
540 | .release = o2net_debug_release, | ||
541 | .read = o2net_debug_read, | ||
542 | .llseek = generic_file_llseek, | ||
543 | }; | ||
544 | |||
535 | void o2net_debugfs_exit(void) | 545 | void o2net_debugfs_exit(void) |
536 | { | 546 | { |
547 | debugfs_remove(nodes_dentry); | ||
537 | debugfs_remove(stats_dentry); | 548 | debugfs_remove(stats_dentry); |
538 | debugfs_remove(sc_dentry); | 549 | debugfs_remove(sc_dentry); |
539 | debugfs_remove(nst_dentry); | 550 | debugfs_remove(nst_dentry); |
540 | debugfs_remove(o2net_dentry); | 551 | debugfs_remove(o2net_dentry); |
541 | } | 552 | } |
542 | 553 | ||
554 | int o2net_debugfs_init(void) | ||
555 | { | ||
556 | mode_t mode = S_IFREG|S_IRUSR; | ||
557 | |||
558 | o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); | ||
559 | if (o2net_dentry) | ||
560 | nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, | ||
561 | o2net_dentry, NULL, &nst_seq_fops); | ||
562 | if (nst_dentry) | ||
563 | sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, | ||
564 | o2net_dentry, NULL, &sc_seq_fops); | ||
565 | if (sc_dentry) | ||
566 | stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, | ||
567 | o2net_dentry, NULL, &stats_seq_fops); | ||
568 | if (stats_dentry) | ||
569 | nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, | ||
570 | o2net_dentry, NULL, &nodes_fops); | ||
571 | if (nodes_dentry) | ||
572 | return 0; | ||
573 | |||
574 | o2net_debugfs_exit(); | ||
575 | mlog_errno(-ENOMEM); | ||
576 | return -ENOMEM; | ||
577 | } | ||
578 | |||
543 | #endif /* CONFIG_DEBUG_FS */ | 579 | #endif /* CONFIG_DEBUG_FS */ |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ad7d0c155de4..044e7b58d31c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -546,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
546 | } | 546 | } |
547 | 547 | ||
548 | if (was_valid && !valid) { | 548 | if (was_valid && !valid) { |
549 | printk(KERN_NOTICE "o2net: no longer connected to " | 549 | printk(KERN_NOTICE "o2net: No longer connected to " |
550 | SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); | 550 | SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); |
551 | o2net_complete_nodes_nsw(nn); | 551 | o2net_complete_nodes_nsw(nn); |
552 | } | 552 | } |
@@ -556,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
556 | cancel_delayed_work(&nn->nn_connect_expired); | 556 | cancel_delayed_work(&nn->nn_connect_expired); |
557 | printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", | 557 | printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", |
558 | o2nm_this_node() > sc->sc_node->nd_num ? | 558 | o2nm_this_node() > sc->sc_node->nd_num ? |
559 | "connected to" : "accepted connection from", | 559 | "Connected to" : "Accepted connection from", |
560 | SC_NODEF_ARGS(sc)); | 560 | SC_NODEF_ARGS(sc)); |
561 | } | 561 | } |
562 | 562 | ||
@@ -644,7 +644,7 @@ static void o2net_state_change(struct sock *sk) | |||
644 | o2net_sc_queue_work(sc, &sc->sc_connect_work); | 644 | o2net_sc_queue_work(sc, &sc->sc_connect_work); |
645 | break; | 645 | break; |
646 | default: | 646 | default: |
647 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT | 647 | printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT |
648 | " shutdown, state %d\n", | 648 | " shutdown, state %d\n", |
649 | SC_NODEF_ARGS(sc), sk->sk_state); | 649 | SC_NODEF_ARGS(sc), sk->sk_state); |
650 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | 650 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); |
@@ -1035,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, | |||
1035 | return ret; | 1035 | return ret; |
1036 | } | 1036 | } |
1037 | 1037 | ||
1038 | /* Get a map of all nodes to which this node is currently connected to */ | ||
1039 | void o2net_fill_node_map(unsigned long *map, unsigned bytes) | ||
1040 | { | ||
1041 | struct o2net_sock_container *sc; | ||
1042 | int node, ret; | ||
1043 | |||
1044 | BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); | ||
1045 | |||
1046 | memset(map, 0, bytes); | ||
1047 | for (node = 0; node < O2NM_MAX_NODES; ++node) { | ||
1048 | o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); | ||
1049 | if (!ret) { | ||
1050 | set_bit(node, map); | ||
1051 | sc_put(sc); | ||
1052 | } | ||
1053 | } | ||
1054 | } | ||
1055 | EXPORT_SYMBOL_GPL(o2net_fill_node_map); | ||
1056 | |||
1038 | int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | 1057 | int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, |
1039 | size_t caller_veclen, u8 target_node, int *status) | 1058 | size_t caller_veclen, u8 target_node, int *status) |
1040 | { | 1059 | { |
@@ -1285,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1285 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | 1304 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); |
1286 | 1305 | ||
1287 | if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { | 1306 | if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { |
1288 | mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " | 1307 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " |
1289 | "version %llu but %llu is required, disconnecting\n", | 1308 | "protocol version %llu but %llu is required. " |
1290 | SC_NODEF_ARGS(sc), | 1309 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1291 | (unsigned long long)be64_to_cpu(hand->protocol_version), | 1310 | (unsigned long long)be64_to_cpu(hand->protocol_version), |
1292 | O2NET_PROTOCOL_VERSION); | 1311 | O2NET_PROTOCOL_VERSION); |
1293 | 1312 | ||
1294 | /* don't bother reconnecting if its the wrong version. */ | 1313 | /* don't bother reconnecting if its the wrong version. */ |
1295 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1314 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
@@ -1303,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1303 | */ | 1322 | */ |
1304 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != | 1323 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != |
1305 | o2net_idle_timeout()) { | 1324 | o2net_idle_timeout()) { |
1306 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " | 1325 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " |
1307 | "%u ms, but we use %u ms locally. disconnecting\n", | 1326 | "idle timeout of %u ms, but we use %u ms locally. " |
1308 | SC_NODEF_ARGS(sc), | 1327 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1309 | be32_to_cpu(hand->o2net_idle_timeout_ms), | 1328 | be32_to_cpu(hand->o2net_idle_timeout_ms), |
1310 | o2net_idle_timeout()); | 1329 | o2net_idle_timeout()); |
1311 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1330 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1312 | return -1; | 1331 | return -1; |
1313 | } | 1332 | } |
1314 | 1333 | ||
1315 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != | 1334 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != |
1316 | o2net_keepalive_delay()) { | 1335 | o2net_keepalive_delay()) { |
1317 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " | 1336 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " |
1318 | "%u ms, but we use %u ms locally. disconnecting\n", | 1337 | "delay of %u ms, but we use %u ms locally. " |
1319 | SC_NODEF_ARGS(sc), | 1338 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1320 | be32_to_cpu(hand->o2net_keepalive_delay_ms), | 1339 | be32_to_cpu(hand->o2net_keepalive_delay_ms), |
1321 | o2net_keepalive_delay()); | 1340 | o2net_keepalive_delay()); |
1322 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1341 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1323 | return -1; | 1342 | return -1; |
1324 | } | 1343 | } |
1325 | 1344 | ||
1326 | if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != | 1345 | if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != |
1327 | O2HB_MAX_WRITE_TIMEOUT_MS) { | 1346 | O2HB_MAX_WRITE_TIMEOUT_MS) { |
1328 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " | 1347 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " |
1329 | "%u ms, but we use %u ms locally. disconnecting\n", | 1348 | "timeout of %u ms, but we use %u ms locally. " |
1330 | SC_NODEF_ARGS(sc), | 1349 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1331 | be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), | 1350 | be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), |
1332 | O2HB_MAX_WRITE_TIMEOUT_MS); | 1351 | O2HB_MAX_WRITE_TIMEOUT_MS); |
1333 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1352 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1334 | return -1; | 1353 | return -1; |
1335 | } | 1354 | } |
@@ -1540,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data) | |||
1540 | { | 1559 | { |
1541 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; | 1560 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; |
1542 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | 1561 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); |
1543 | |||
1544 | #ifdef CONFIG_DEBUG_FS | 1562 | #ifdef CONFIG_DEBUG_FS |
1545 | ktime_t now = ktime_get(); | 1563 | unsigned long msecs = ktime_to_ms(ktime_get()) - |
1564 | ktime_to_ms(sc->sc_tv_timer); | ||
1565 | #else | ||
1566 | unsigned long msecs = o2net_idle_timeout(); | ||
1546 | #endif | 1567 | #endif |
1547 | 1568 | ||
1548 | printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " | 1569 | printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " |
1549 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), | 1570 | "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), |
1550 | o2net_idle_timeout() / 1000, | 1571 | msecs / 1000, msecs % 1000); |
1551 | o2net_idle_timeout() % 1000); | ||
1552 | |||
1553 | #ifdef CONFIG_DEBUG_FS | ||
1554 | mlog(ML_NOTICE, "Here are some times that might help debug the " | ||
1555 | "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " | ||
1556 | "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", | ||
1557 | (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), | ||
1558 | (long long)ktime_to_us(sc->sc_tv_data_ready), | ||
1559 | (long long)ktime_to_us(sc->sc_tv_advance_start), | ||
1560 | (long long)ktime_to_us(sc->sc_tv_advance_stop), | ||
1561 | sc->sc_msg_key, sc->sc_msg_type, | ||
1562 | (long long)ktime_to_us(sc->sc_tv_func_start), | ||
1563 | (long long)ktime_to_us(sc->sc_tv_func_stop)); | ||
1564 | #endif | ||
1565 | 1572 | ||
1566 | /* | 1573 | /* |
1567 | * Initialize the nn_timeout so that the next connection attempt | 1574 | * Initialize the nn_timeout so that the next connection attempt |
@@ -1694,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work) | |||
1694 | 1701 | ||
1695 | out: | 1702 | out: |
1696 | if (ret) { | 1703 | if (ret) { |
1697 | mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " | 1704 | printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT |
1698 | "with errno %d\n", SC_NODEF_ARGS(sc), ret); | 1705 | " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); |
1699 | /* 0 err so that another will be queued and attempted | 1706 | /* 0 err so that another will be queued and attempted |
1700 | * from set_nn_state */ | 1707 | * from set_nn_state */ |
1701 | if (sc) | 1708 | if (sc) |
@@ -1718,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work) | |||
1718 | 1725 | ||
1719 | spin_lock(&nn->nn_lock); | 1726 | spin_lock(&nn->nn_lock); |
1720 | if (!nn->nn_sc_valid) { | 1727 | if (!nn->nn_sc_valid) { |
1721 | mlog(ML_ERROR, "no connection established with node %u after " | 1728 | printk(KERN_NOTICE "o2net: No connection established with " |
1722 | "%u.%u seconds, giving up and returning errors.\n", | 1729 | "node %u after %u.%u seconds, giving up.\n", |
1723 | o2net_num_from_nn(nn), | 1730 | o2net_num_from_nn(nn), |
1724 | o2net_idle_timeout() / 1000, | 1731 | o2net_idle_timeout() / 1000, |
1725 | o2net_idle_timeout() % 1000); | 1732 | o2net_idle_timeout() % 1000); |
@@ -1862,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock) | |||
1862 | 1869 | ||
1863 | node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); | 1870 | node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); |
1864 | if (node == NULL) { | 1871 | if (node == NULL) { |
1865 | mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", | 1872 | printk(KERN_NOTICE "o2net: Attempt to connect from unknown " |
1866 | &sin.sin_addr.s_addr, ntohs(sin.sin_port)); | 1873 | "node at %pI4:%d\n", &sin.sin_addr.s_addr, |
1874 | ntohs(sin.sin_port)); | ||
1867 | ret = -EINVAL; | 1875 | ret = -EINVAL; |
1868 | goto out; | 1876 | goto out; |
1869 | } | 1877 | } |
1870 | 1878 | ||
1871 | if (o2nm_this_node() >= node->nd_num) { | 1879 | if (o2nm_this_node() >= node->nd_num) { |
1872 | local_node = o2nm_get_node_by_num(o2nm_this_node()); | 1880 | local_node = o2nm_get_node_by_num(o2nm_this_node()); |
1873 | mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" | 1881 | printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " |
1874 | "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", | 1882 | "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " |
1875 | local_node->nd_name, local_node->nd_num, | 1883 | "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, |
1876 | &(local_node->nd_ipv4_address), | 1884 | &(local_node->nd_ipv4_address), |
1877 | ntohs(local_node->nd_ipv4_port), | 1885 | ntohs(local_node->nd_ipv4_port), node->nd_name, |
1878 | node->nd_name, node->nd_num, &sin.sin_addr.s_addr, | 1886 | node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); |
1879 | ntohs(sin.sin_port)); | ||
1880 | ret = -EINVAL; | 1887 | ret = -EINVAL; |
1881 | goto out; | 1888 | goto out; |
1882 | } | 1889 | } |
@@ -1901,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock) | |||
1901 | ret = 0; | 1908 | ret = 0; |
1902 | spin_unlock(&nn->nn_lock); | 1909 | spin_unlock(&nn->nn_lock); |
1903 | if (ret) { | 1910 | if (ret) { |
1904 | mlog(ML_NOTICE, "attempt to connect from node '%s' at " | 1911 | printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " |
1905 | "%pI4:%d but it already has an open connection\n", | 1912 | "at %pI4:%d but it already has an open connection\n", |
1906 | node->nd_name, &sin.sin_addr.s_addr, | 1913 | node->nd_name, &sin.sin_addr.s_addr, |
1907 | ntohs(sin.sin_port)); | 1914 | ntohs(sin.sin_port)); |
1908 | goto out; | 1915 | goto out; |
1909 | } | 1916 | } |
1910 | 1917 | ||
@@ -1984,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) | |||
1984 | 1991 | ||
1985 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | 1992 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); |
1986 | if (ret < 0) { | 1993 | if (ret < 0) { |
1987 | mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); | 1994 | printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); |
1988 | goto out; | 1995 | goto out; |
1989 | } | 1996 | } |
1990 | 1997 | ||
@@ -2001,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) | |||
2001 | sock->sk->sk_reuse = 1; | 2008 | sock->sk->sk_reuse = 1; |
2002 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); | 2009 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); |
2003 | if (ret < 0) { | 2010 | if (ret < 0) { |
2004 | mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " | 2011 | printk(KERN_ERR "o2net: Error %d while binding socket at " |
2005 | "ret=%d\n", &addr, ntohs(port), ret); | 2012 | "%pI4:%u\n", ret, &addr, ntohs(port)); |
2006 | goto out; | 2013 | goto out; |
2007 | } | 2014 | } |
2008 | 2015 | ||
2009 | ret = sock->ops->listen(sock, 64); | 2016 | ret = sock->ops->listen(sock, 64); |
2010 | if (ret < 0) { | 2017 | if (ret < 0) |
2011 | mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", | 2018 | printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", |
2012 | &addr, ntohs(port), ret); | 2019 | ret, &addr, ntohs(port)); |
2013 | } | ||
2014 | 2020 | ||
2015 | out: | 2021 | out: |
2016 | if (ret) { | 2022 | if (ret) { |
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index fd6179eb26d4..5bada2a69b50 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h | |||
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, | |||
106 | struct list_head *unreg_list); | 106 | struct list_head *unreg_list); |
107 | void o2net_unregister_handler_list(struct list_head *list); | 107 | void o2net_unregister_handler_list(struct list_head *list); |
108 | 108 | ||
109 | void o2net_fill_node_map(unsigned long *map, unsigned bytes); | ||
110 | |||
109 | struct o2nm_node; | 111 | struct o2nm_node; |
110 | int o2net_register_hb_callbacks(void); | 112 | int o2net_register_hb_callbacks(void); |
111 | void o2net_unregister_hb_callbacks(void); | 113 | void o2net_unregister_hb_callbacks(void); |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index e2878b5895fb..8fe4e2892ab9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, | |||
1184 | if (pde) | 1184 | if (pde) |
1185 | le16_add_cpu(&pde->rec_len, | 1185 | le16_add_cpu(&pde->rec_len, |
1186 | le16_to_cpu(de->rec_len)); | 1186 | le16_to_cpu(de->rec_len)); |
1187 | else | 1187 | de->inode = 0; |
1188 | de->inode = 0; | ||
1189 | dir->i_version++; | 1188 | dir->i_version++; |
1190 | ocfs2_journal_dirty(handle, bh); | 1189 | ocfs2_journal_dirty(handle, bh); |
1191 | goto bail; | 1190 | goto bail; |
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d602abb51b61..a5952ceecba5 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); | |||
859 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm); | 859 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm); |
860 | void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); | 860 | void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); |
861 | int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); | 861 | int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); |
862 | int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); | 862 | void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); |
863 | int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); | 863 | void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); |
864 | 864 | ||
865 | void dlm_put(struct dlm_ctxt *dlm); | 865 | void dlm_put(struct dlm_ctxt *dlm); |
866 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); | 866 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); |
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res) | |||
877 | kref_get(&res->refs); | 877 | kref_get(&res->refs); |
878 | } | 878 | } |
879 | void dlm_lockres_put(struct dlm_lock_resource *res); | 879 | void dlm_lockres_put(struct dlm_lock_resource *res); |
880 | void __dlm_unhash_lockres(struct dlm_lock_resource *res); | 880 | void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); |
881 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | 881 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); |
882 | struct dlm_lock_resource *res); | ||
883 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, | 882 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, |
884 | const char *name, | 883 | const char *name, |
885 | unsigned int len, | 884 | unsigned int len, |
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
902 | const char *name, | 901 | const char *name, |
903 | unsigned int namelen); | 902 | unsigned int namelen); |
904 | 903 | ||
905 | #define dlm_lockres_set_refmap_bit(bit,res) \ | 904 | void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, |
906 | __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) | 905 | struct dlm_lock_resource *res, int bit); |
907 | #define dlm_lockres_clear_refmap_bit(bit,res) \ | 906 | void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, |
908 | __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) | 907 | struct dlm_lock_resource *res, int bit); |
909 | 908 | ||
910 | static inline void __dlm_lockres_set_refmap_bit(int bit, | 909 | void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, |
911 | struct dlm_lock_resource *res, | 910 | struct dlm_lock_resource *res); |
912 | const char *file, | 911 | void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, |
913 | int line) | 912 | struct dlm_lock_resource *res); |
914 | { | ||
915 | //printk("%s:%d:%.*s: setting bit %d\n", file, line, | ||
916 | // res->lockname.len, res->lockname.name, bit); | ||
917 | set_bit(bit, res->refmap); | ||
918 | } | ||
919 | |||
920 | static inline void __dlm_lockres_clear_refmap_bit(int bit, | ||
921 | struct dlm_lock_resource *res, | ||
922 | const char *file, | ||
923 | int line) | ||
924 | { | ||
925 | //printk("%s:%d:%.*s: clearing bit %d\n", file, line, | ||
926 | // res->lockname.len, res->lockname.name, bit); | ||
927 | clear_bit(bit, res->refmap); | ||
928 | } | ||
929 | |||
930 | void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, | ||
931 | struct dlm_lock_resource *res, | ||
932 | const char *file, | ||
933 | int line); | ||
934 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
935 | struct dlm_lock_resource *res, | ||
936 | int new_lockres, | ||
937 | const char *file, | ||
938 | int line); | ||
939 | #define dlm_lockres_drop_inflight_ref(d,r) \ | ||
940 | __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) | ||
941 | #define dlm_lockres_grab_inflight_ref(d,r) \ | ||
942 | __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) | ||
943 | #define dlm_lockres_grab_inflight_ref_new(d,r) \ | ||
944 | __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) | ||
945 | 913 | ||
946 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | 914 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); |
947 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | 915 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6ed6b95dcf93..92f2ead0fab6 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing, | |||
157 | 157 | ||
158 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); | 158 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); |
159 | 159 | ||
160 | void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) | 160 | void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
161 | { | 161 | { |
162 | if (!hlist_unhashed(&lockres->hash_node)) { | 162 | if (hlist_unhashed(&res->hash_node)) |
163 | hlist_del_init(&lockres->hash_node); | 163 | return; |
164 | dlm_lockres_put(lockres); | 164 | |
165 | } | 165 | mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len, |
166 | res->lockname.name); | ||
167 | hlist_del_init(&res->hash_node); | ||
168 | dlm_lockres_put(res); | ||
166 | } | 169 | } |
167 | 170 | ||
168 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | 171 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
169 | struct dlm_lock_resource *res) | ||
170 | { | 172 | { |
171 | struct hlist_head *bucket; | 173 | struct hlist_head *bucket; |
172 | struct qstr *q; | 174 | struct qstr *q; |
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, | |||
180 | dlm_lockres_get(res); | 182 | dlm_lockres_get(res); |
181 | 183 | ||
182 | hlist_add_head(&res->hash_node, bucket); | 184 | hlist_add_head(&res->hash_node, bucket); |
185 | |||
186 | mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len, | ||
187 | res->lockname.name); | ||
183 | } | 188 | } |
184 | 189 | ||
185 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, | 190 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, |
@@ -539,17 +544,17 @@ again: | |||
539 | 544 | ||
540 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) | 545 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) |
541 | { | 546 | { |
542 | int node = -1; | 547 | int node = -1, num = 0; |
543 | 548 | ||
544 | assert_spin_locked(&dlm->spinlock); | 549 | assert_spin_locked(&dlm->spinlock); |
545 | 550 | ||
546 | printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); | 551 | printk("( "); |
547 | |||
548 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | 552 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, |
549 | node + 1)) < O2NM_MAX_NODES) { | 553 | node + 1)) < O2NM_MAX_NODES) { |
550 | printk("%d ", node); | 554 | printk("%d ", node); |
555 | ++num; | ||
551 | } | 556 | } |
552 | printk("\n"); | 557 | printk(") %u nodes\n", num); |
553 | } | 558 | } |
554 | 559 | ||
555 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, | 560 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, |
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, | |||
566 | 571 | ||
567 | node = exit_msg->node_idx; | 572 | node = exit_msg->node_idx; |
568 | 573 | ||
569 | printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); | ||
570 | |||
571 | spin_lock(&dlm->spinlock); | 574 | spin_lock(&dlm->spinlock); |
572 | clear_bit(node, dlm->domain_map); | 575 | clear_bit(node, dlm->domain_map); |
573 | clear_bit(node, dlm->exit_domain_map); | 576 | clear_bit(node, dlm->exit_domain_map); |
577 | printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name); | ||
574 | __dlm_print_nodes(dlm); | 578 | __dlm_print_nodes(dlm); |
575 | 579 | ||
576 | /* notify anything attached to the heartbeat events */ | 580 | /* notify anything attached to the heartbeat events */ |
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) | |||
755 | 759 | ||
756 | dlm_mark_domain_leaving(dlm); | 760 | dlm_mark_domain_leaving(dlm); |
757 | dlm_leave_domain(dlm); | 761 | dlm_leave_domain(dlm); |
762 | printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name); | ||
758 | dlm_force_free_mles(dlm); | 763 | dlm_force_free_mles(dlm); |
759 | dlm_complete_dlm_shutdown(dlm); | 764 | dlm_complete_dlm_shutdown(dlm); |
760 | } | 765 | } |
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, | |||
970 | clear_bit(assert->node_idx, dlm->exit_domain_map); | 975 | clear_bit(assert->node_idx, dlm->exit_domain_map); |
971 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | 976 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); |
972 | 977 | ||
973 | printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", | 978 | printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ", |
974 | assert->node_idx, dlm->name); | 979 | assert->node_idx, dlm->name); |
975 | __dlm_print_nodes(dlm); | 980 | __dlm_print_nodes(dlm); |
976 | 981 | ||
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | |||
1701 | bail: | 1706 | bail: |
1702 | spin_lock(&dlm->spinlock); | 1707 | spin_lock(&dlm->spinlock); |
1703 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | 1708 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); |
1704 | if (!status) | 1709 | if (!status) { |
1710 | printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name); | ||
1705 | __dlm_print_nodes(dlm); | 1711 | __dlm_print_nodes(dlm); |
1712 | } | ||
1706 | spin_unlock(&dlm->spinlock); | 1713 | spin_unlock(&dlm->spinlock); |
1707 | 1714 | ||
1708 | if (ctxt) { | 1715 | if (ctxt) { |
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, | |||
2131 | goto leave; | 2138 | goto leave; |
2132 | } | 2139 | } |
2133 | 2140 | ||
2134 | if (!o2hb_check_local_node_heartbeating()) { | ||
2135 | mlog(ML_ERROR, "the local node has not been configured, or is " | ||
2136 | "not heartbeating\n"); | ||
2137 | ret = -EPROTO; | ||
2138 | goto leave; | ||
2139 | } | ||
2140 | |||
2141 | mlog(0, "register called for domain \"%s\"\n", domain); | 2141 | mlog(0, "register called for domain \"%s\"\n", domain); |
2142 | 2142 | ||
2143 | retry: | 2143 | retry: |
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 8d39e0fd66f7..975810b98492 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c | |||
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, | |||
183 | kick_thread = 1; | 183 | kick_thread = 1; |
184 | } | 184 | } |
185 | } | 185 | } |
186 | /* reduce the inflight count, this may result in the lockres | ||
187 | * being purged below during calc_usage */ | ||
188 | if (lock->ml.node == dlm->node_num) | ||
189 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
190 | 186 | ||
191 | spin_unlock(&res->spinlock); | 187 | spin_unlock(&res->spinlock); |
192 | wake_up(&res->wq); | 188 | wake_up(&res->wq); |
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, | |||
231 | lock->ml.type, res->lockname.len, | 227 | lock->ml.type, res->lockname.len, |
232 | res->lockname.name, flags); | 228 | res->lockname.name, flags); |
233 | 229 | ||
230 | /* | ||
231 | * Wait if resource is getting recovered, remastered, etc. | ||
232 | * If the resource was remastered and new owner is self, then exit. | ||
233 | */ | ||
234 | spin_lock(&res->spinlock); | 234 | spin_lock(&res->spinlock); |
235 | |||
236 | /* will exit this call with spinlock held */ | ||
237 | __dlm_wait_on_lockres(res); | 235 | __dlm_wait_on_lockres(res); |
236 | if (res->owner == dlm->node_num) { | ||
237 | spin_unlock(&res->spinlock); | ||
238 | return DLM_RECOVERING; | ||
239 | } | ||
238 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | 240 | res->state |= DLM_LOCK_RES_IN_PROGRESS; |
239 | 241 | ||
240 | /* add lock to local (secondary) queue */ | 242 | /* add lock to local (secondary) queue */ |
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, | |||
319 | tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, | 321 | tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, |
320 | sizeof(create), res->owner, &status); | 322 | sizeof(create), res->owner, &status); |
321 | if (tmpret >= 0) { | 323 | if (tmpret >= 0) { |
322 | // successfully sent and received | 324 | ret = status; |
323 | ret = status; // this is already a dlm_status | ||
324 | if (ret == DLM_REJECTED) { | 325 | if (ret == DLM_REJECTED) { |
325 | mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " | 326 | mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer " |
326 | "no longer owned by %u. that node is coming back " | 327 | "owned by node %u. That node is coming back up " |
327 | "up currently.\n", dlm->name, create.namelen, | 328 | "currently.\n", dlm->name, create.namelen, |
328 | create.name, res->owner); | 329 | create.name, res->owner); |
329 | dlm_print_one_lock_resource(res); | 330 | dlm_print_one_lock_resource(res); |
330 | BUG(); | 331 | BUG(); |
331 | } | 332 | } |
332 | } else { | 333 | } else { |
333 | mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " | 334 | mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to " |
334 | "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, | 335 | "node %u\n", dlm->name, create.namelen, create.name, |
335 | res->owner); | 336 | tmpret, res->owner); |
336 | if (dlm_is_host_down(tmpret)) { | 337 | if (dlm_is_host_down(tmpret)) |
337 | ret = DLM_RECOVERING; | 338 | ret = DLM_RECOVERING; |
338 | mlog(0, "node %u died so returning DLM_RECOVERING " | 339 | else |
339 | "from lock message!\n", res->owner); | ||
340 | } else { | ||
341 | ret = dlm_err_to_dlm_status(tmpret); | 340 | ret = dlm_err_to_dlm_status(tmpret); |
342 | } | ||
343 | } | 341 | } |
344 | 342 | ||
345 | return ret; | 343 | return ret; |
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, | |||
440 | /* zero memory only if kernel-allocated */ | 438 | /* zero memory only if kernel-allocated */ |
441 | lksb = kzalloc(sizeof(*lksb), GFP_NOFS); | 439 | lksb = kzalloc(sizeof(*lksb), GFP_NOFS); |
442 | if (!lksb) { | 440 | if (!lksb) { |
443 | kfree(lock); | 441 | kmem_cache_free(dlm_lock_cache, lock); |
444 | return NULL; | 442 | return NULL; |
445 | } | 443 | } |
446 | kernel_allocated = 1; | 444 | kernel_allocated = 1; |
@@ -718,18 +716,10 @@ retry_lock: | |||
718 | 716 | ||
719 | if (status == DLM_RECOVERING || status == DLM_MIGRATING || | 717 | if (status == DLM_RECOVERING || status == DLM_MIGRATING || |
720 | status == DLM_FORWARD) { | 718 | status == DLM_FORWARD) { |
721 | mlog(0, "retrying lock with migration/" | ||
722 | "recovery/in progress\n"); | ||
723 | msleep(100); | 719 | msleep(100); |
724 | /* no waiting for dlm_reco_thread */ | ||
725 | if (recovery) { | 720 | if (recovery) { |
726 | if (status != DLM_RECOVERING) | 721 | if (status != DLM_RECOVERING) |
727 | goto retry_lock; | 722 | goto retry_lock; |
728 | |||
729 | mlog(0, "%s: got RECOVERING " | ||
730 | "for $RECOVERY lock, master " | ||
731 | "was %u\n", dlm->name, | ||
732 | res->owner); | ||
733 | /* wait to see the node go down, then | 723 | /* wait to see the node go down, then |
734 | * drop down and allow the lockres to | 724 | * drop down and allow the lockres to |
735 | * get cleaned up. need to remaster. */ | 725 | * get cleaned up. need to remaster. */ |
@@ -741,6 +731,14 @@ retry_lock: | |||
741 | } | 731 | } |
742 | } | 732 | } |
743 | 733 | ||
734 | /* Inflight taken in dlm_get_lock_resource() is dropped here */ | ||
735 | spin_lock(&res->spinlock); | ||
736 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
737 | spin_unlock(&res->spinlock); | ||
738 | |||
739 | dlm_lockres_calc_usage(dlm, res); | ||
740 | dlm_kick_thread(dlm, res); | ||
741 | |||
744 | if (status != DLM_NORMAL) { | 742 | if (status != DLM_NORMAL) { |
745 | lock->lksb->flags &= ~DLM_LKSB_GET_LVB; | 743 | lock->lksb->flags &= ~DLM_LKSB_GET_LVB; |
746 | if (status != DLM_NOTQUEUED) | 744 | if (status != DLM_NOTQUEUED) |
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 11eefb8c12e9..005261c333b0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -631,39 +631,54 @@ error: | |||
631 | return NULL; | 631 | return NULL; |
632 | } | 632 | } |
633 | 633 | ||
634 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | 634 | void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, |
635 | struct dlm_lock_resource *res, | 635 | struct dlm_lock_resource *res, int bit) |
636 | int new_lockres, | ||
637 | const char *file, | ||
638 | int line) | ||
639 | { | 636 | { |
640 | if (!new_lockres) | 637 | assert_spin_locked(&res->spinlock); |
641 | assert_spin_locked(&res->spinlock); | 638 | |
639 | mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, | ||
640 | res->lockname.name, bit, __builtin_return_address(0)); | ||
641 | |||
642 | set_bit(bit, res->refmap); | ||
643 | } | ||
644 | |||
645 | void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, | ||
646 | struct dlm_lock_resource *res, int bit) | ||
647 | { | ||
648 | assert_spin_locked(&res->spinlock); | ||
649 | |||
650 | mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, | ||
651 | res->lockname.name, bit, __builtin_return_address(0)); | ||
652 | |||
653 | clear_bit(bit, res->refmap); | ||
654 | } | ||
655 | |||
656 | |||
657 | void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
658 | struct dlm_lock_resource *res) | ||
659 | { | ||
660 | assert_spin_locked(&res->spinlock); | ||
642 | 661 | ||
643 | if (!test_bit(dlm->node_num, res->refmap)) { | ||
644 | BUG_ON(res->inflight_locks != 0); | ||
645 | dlm_lockres_set_refmap_bit(dlm->node_num, res); | ||
646 | } | ||
647 | res->inflight_locks++; | 662 | res->inflight_locks++; |
648 | mlog(0, "%s:%.*s: inflight++: now %u\n", | 663 | |
649 | dlm->name, res->lockname.len, res->lockname.name, | 664 | mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, |
650 | res->inflight_locks); | 665 | res->lockname.len, res->lockname.name, res->inflight_locks, |
666 | __builtin_return_address(0)); | ||
651 | } | 667 | } |
652 | 668 | ||
653 | void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, | 669 | void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, |
654 | struct dlm_lock_resource *res, | 670 | struct dlm_lock_resource *res) |
655 | const char *file, | ||
656 | int line) | ||
657 | { | 671 | { |
658 | assert_spin_locked(&res->spinlock); | 672 | assert_spin_locked(&res->spinlock); |
659 | 673 | ||
660 | BUG_ON(res->inflight_locks == 0); | 674 | BUG_ON(res->inflight_locks == 0); |
675 | |||
661 | res->inflight_locks--; | 676 | res->inflight_locks--; |
662 | mlog(0, "%s:%.*s: inflight--: now %u\n", | 677 | |
663 | dlm->name, res->lockname.len, res->lockname.name, | 678 | mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, |
664 | res->inflight_locks); | 679 | res->lockname.len, res->lockname.name, res->inflight_locks, |
665 | if (res->inflight_locks == 0) | 680 | __builtin_return_address(0)); |
666 | dlm_lockres_clear_refmap_bit(dlm->node_num, res); | 681 | |
667 | wake_up(&res->wq); | 682 | wake_up(&res->wq); |
668 | } | 683 | } |
669 | 684 | ||
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | |||
697 | unsigned int hash; | 712 | unsigned int hash; |
698 | int tries = 0; | 713 | int tries = 0; |
699 | int bit, wait_on_recovery = 0; | 714 | int bit, wait_on_recovery = 0; |
700 | int drop_inflight_if_nonlocal = 0; | ||
701 | 715 | ||
702 | BUG_ON(!lockid); | 716 | BUG_ON(!lockid); |
703 | 717 | ||
@@ -709,36 +723,33 @@ lookup: | |||
709 | spin_lock(&dlm->spinlock); | 723 | spin_lock(&dlm->spinlock); |
710 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); | 724 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); |
711 | if (tmpres) { | 725 | if (tmpres) { |
712 | int dropping_ref = 0; | ||
713 | |||
714 | spin_unlock(&dlm->spinlock); | 726 | spin_unlock(&dlm->spinlock); |
715 | |||
716 | spin_lock(&tmpres->spinlock); | 727 | spin_lock(&tmpres->spinlock); |
717 | /* We wait for the other thread that is mastering the resource */ | 728 | /* Wait on the thread that is mastering the resource */ |
718 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | 729 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { |
719 | __dlm_wait_on_lockres(tmpres); | 730 | __dlm_wait_on_lockres(tmpres); |
720 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); | 731 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); |
732 | spin_unlock(&tmpres->spinlock); | ||
733 | dlm_lockres_put(tmpres); | ||
734 | tmpres = NULL; | ||
735 | goto lookup; | ||
721 | } | 736 | } |
722 | 737 | ||
723 | if (tmpres->owner == dlm->node_num) { | 738 | /* Wait on the resource purge to complete before continuing */ |
724 | BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); | 739 | if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { |
725 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | 740 | BUG_ON(tmpres->owner == dlm->node_num); |
726 | } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) | 741 | __dlm_wait_on_lockres_flags(tmpres, |
727 | dropping_ref = 1; | 742 | DLM_LOCK_RES_DROPPING_REF); |
728 | spin_unlock(&tmpres->spinlock); | ||
729 | |||
730 | /* wait until done messaging the master, drop our ref to allow | ||
731 | * the lockres to be purged, start over. */ | ||
732 | if (dropping_ref) { | ||
733 | spin_lock(&tmpres->spinlock); | ||
734 | __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); | ||
735 | spin_unlock(&tmpres->spinlock); | 743 | spin_unlock(&tmpres->spinlock); |
736 | dlm_lockres_put(tmpres); | 744 | dlm_lockres_put(tmpres); |
737 | tmpres = NULL; | 745 | tmpres = NULL; |
738 | goto lookup; | 746 | goto lookup; |
739 | } | 747 | } |
740 | 748 | ||
741 | mlog(0, "found in hash!\n"); | 749 | /* Grab inflight ref to pin the resource */ |
750 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | ||
751 | |||
752 | spin_unlock(&tmpres->spinlock); | ||
742 | if (res) | 753 | if (res) |
743 | dlm_lockres_put(res); | 754 | dlm_lockres_put(res); |
744 | res = tmpres; | 755 | res = tmpres; |
@@ -829,8 +840,8 @@ lookup: | |||
829 | * but they might own this lockres. wait on them. */ | 840 | * but they might own this lockres. wait on them. */ |
830 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | 841 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); |
831 | if (bit < O2NM_MAX_NODES) { | 842 | if (bit < O2NM_MAX_NODES) { |
832 | mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " | 843 | mlog(0, "%s: res %.*s, At least one node (%d) " |
833 | "recover before lock mastery can begin\n", | 844 | "to recover before lock mastery can begin\n", |
834 | dlm->name, namelen, (char *)lockid, bit); | 845 | dlm->name, namelen, (char *)lockid, bit); |
835 | wait_on_recovery = 1; | 846 | wait_on_recovery = 1; |
836 | } | 847 | } |
@@ -843,12 +854,11 @@ lookup: | |||
843 | 854 | ||
844 | /* finally add the lockres to its hash bucket */ | 855 | /* finally add the lockres to its hash bucket */ |
845 | __dlm_insert_lockres(dlm, res); | 856 | __dlm_insert_lockres(dlm, res); |
846 | /* since this lockres is new it doesn't not require the spinlock */ | ||
847 | dlm_lockres_grab_inflight_ref_new(dlm, res); | ||
848 | 857 | ||
849 | /* if this node does not become the master make sure to drop | 858 | /* Grab inflight ref to pin the resource */ |
850 | * this inflight reference below */ | 859 | spin_lock(&res->spinlock); |
851 | drop_inflight_if_nonlocal = 1; | 860 | dlm_lockres_grab_inflight_ref(dlm, res); |
861 | spin_unlock(&res->spinlock); | ||
852 | 862 | ||
853 | /* get an extra ref on the mle in case this is a BLOCK | 863 | /* get an extra ref on the mle in case this is a BLOCK |
854 | * if so, the creator of the BLOCK may try to put the last | 864 | * if so, the creator of the BLOCK may try to put the last |
@@ -864,8 +874,8 @@ redo_request: | |||
864 | * dlm spinlock would be detectable be a change on the mle, | 874 | * dlm spinlock would be detectable be a change on the mle, |
865 | * so we only need to clear out the recovery map once. */ | 875 | * so we only need to clear out the recovery map once. */ |
866 | if (dlm_is_recovery_lock(lockid, namelen)) { | 876 | if (dlm_is_recovery_lock(lockid, namelen)) { |
867 | mlog(ML_NOTICE, "%s: recovery map is not empty, but " | 877 | mlog(0, "%s: Recovery map is not empty, but must " |
868 | "must master $RECOVERY lock now\n", dlm->name); | 878 | "master $RECOVERY lock now\n", dlm->name); |
869 | if (!dlm_pre_master_reco_lockres(dlm, res)) | 879 | if (!dlm_pre_master_reco_lockres(dlm, res)) |
870 | wait_on_recovery = 0; | 880 | wait_on_recovery = 0; |
871 | else { | 881 | else { |
@@ -883,8 +893,8 @@ redo_request: | |||
883 | spin_lock(&dlm->spinlock); | 893 | spin_lock(&dlm->spinlock); |
884 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | 894 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); |
885 | if (bit < O2NM_MAX_NODES) { | 895 | if (bit < O2NM_MAX_NODES) { |
886 | mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " | 896 | mlog(0, "%s: res %.*s, At least one node (%d) " |
887 | "recover before lock mastery can begin\n", | 897 | "to recover before lock mastery can begin\n", |
888 | dlm->name, namelen, (char *)lockid, bit); | 898 | dlm->name, namelen, (char *)lockid, bit); |
889 | wait_on_recovery = 1; | 899 | wait_on_recovery = 1; |
890 | } else | 900 | } else |
@@ -913,8 +923,8 @@ redo_request: | |||
913 | * yet, keep going until it does. this is how the | 923 | * yet, keep going until it does. this is how the |
914 | * master will know that asserts are needed back to | 924 | * master will know that asserts are needed back to |
915 | * the lower nodes. */ | 925 | * the lower nodes. */ |
916 | mlog(0, "%s:%.*s: requests only up to %u but master " | 926 | mlog(0, "%s: res %.*s, Requests only up to %u but " |
917 | "is %u, keep going\n", dlm->name, namelen, | 927 | "master is %u, keep going\n", dlm->name, namelen, |
918 | lockid, nodenum, mle->master); | 928 | lockid, nodenum, mle->master); |
919 | } | 929 | } |
920 | } | 930 | } |
@@ -924,13 +934,12 @@ wait: | |||
924 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); | 934 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); |
925 | if (ret < 0) { | 935 | if (ret < 0) { |
926 | wait_on_recovery = 1; | 936 | wait_on_recovery = 1; |
927 | mlog(0, "%s:%.*s: node map changed, redo the " | 937 | mlog(0, "%s: res %.*s, Node map changed, redo the master " |
928 | "master request now, blocked=%d\n", | 938 | "request now, blocked=%d\n", dlm->name, res->lockname.len, |
929 | dlm->name, res->lockname.len, | ||
930 | res->lockname.name, blocked); | 939 | res->lockname.name, blocked); |
931 | if (++tries > 20) { | 940 | if (++tries > 20) { |
932 | mlog(ML_ERROR, "%s:%.*s: spinning on " | 941 | mlog(ML_ERROR, "%s: res %.*s, Spinning on " |
933 | "dlm_wait_for_lock_mastery, blocked=%d\n", | 942 | "dlm_wait_for_lock_mastery, blocked = %d\n", |
934 | dlm->name, res->lockname.len, | 943 | dlm->name, res->lockname.len, |
935 | res->lockname.name, blocked); | 944 | res->lockname.name, blocked); |
936 | dlm_print_one_lock_resource(res); | 945 | dlm_print_one_lock_resource(res); |
@@ -940,7 +949,8 @@ wait: | |||
940 | goto redo_request; | 949 | goto redo_request; |
941 | } | 950 | } |
942 | 951 | ||
943 | mlog(0, "lockres mastered by %u\n", res->owner); | 952 | mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, |
953 | res->lockname.name, res->owner); | ||
944 | /* make sure we never continue without this */ | 954 | /* make sure we never continue without this */ |
945 | BUG_ON(res->owner == O2NM_MAX_NODES); | 955 | BUG_ON(res->owner == O2NM_MAX_NODES); |
946 | 956 | ||
@@ -952,8 +962,6 @@ wait: | |||
952 | 962 | ||
953 | wake_waiters: | 963 | wake_waiters: |
954 | spin_lock(&res->spinlock); | 964 | spin_lock(&res->spinlock); |
955 | if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) | ||
956 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
957 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 965 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; |
958 | spin_unlock(&res->spinlock); | 966 | spin_unlock(&res->spinlock); |
959 | wake_up(&res->wq); | 967 | wake_up(&res->wq); |
@@ -1426,9 +1434,7 @@ way_up_top: | |||
1426 | } | 1434 | } |
1427 | 1435 | ||
1428 | if (res->owner == dlm->node_num) { | 1436 | if (res->owner == dlm->node_num) { |
1429 | mlog(0, "%s:%.*s: setting bit %u in refmap\n", | 1437 | dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); |
1430 | dlm->name, namelen, name, request->node_idx); | ||
1431 | dlm_lockres_set_refmap_bit(request->node_idx, res); | ||
1432 | spin_unlock(&res->spinlock); | 1438 | spin_unlock(&res->spinlock); |
1433 | response = DLM_MASTER_RESP_YES; | 1439 | response = DLM_MASTER_RESP_YES; |
1434 | if (mle) | 1440 | if (mle) |
@@ -1493,10 +1499,8 @@ way_up_top: | |||
1493 | * go back and clean the mles on any | 1499 | * go back and clean the mles on any |
1494 | * other nodes */ | 1500 | * other nodes */ |
1495 | dispatch_assert = 1; | 1501 | dispatch_assert = 1; |
1496 | dlm_lockres_set_refmap_bit(request->node_idx, res); | 1502 | dlm_lockres_set_refmap_bit(dlm, res, |
1497 | mlog(0, "%s:%.*s: setting bit %u in refmap\n", | 1503 | request->node_idx); |
1498 | dlm->name, namelen, name, | ||
1499 | request->node_idx); | ||
1500 | } else | 1504 | } else |
1501 | response = DLM_MASTER_RESP_NO; | 1505 | response = DLM_MASTER_RESP_NO; |
1502 | } else { | 1506 | } else { |
@@ -1702,7 +1706,7 @@ again: | |||
1702 | "lockres, set the bit in the refmap\n", | 1706 | "lockres, set the bit in the refmap\n", |
1703 | namelen, lockname, to); | 1707 | namelen, lockname, to); |
1704 | spin_lock(&res->spinlock); | 1708 | spin_lock(&res->spinlock); |
1705 | dlm_lockres_set_refmap_bit(to, res); | 1709 | dlm_lockres_set_refmap_bit(dlm, res, to); |
1706 | spin_unlock(&res->spinlock); | 1710 | spin_unlock(&res->spinlock); |
1707 | } | 1711 | } |
1708 | } | 1712 | } |
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | |||
2187 | namelen = res->lockname.len; | 2191 | namelen = res->lockname.len; |
2188 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | 2192 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
2189 | 2193 | ||
2190 | mlog(0, "%s:%.*s: sending deref to %d\n", | ||
2191 | dlm->name, namelen, lockname, res->owner); | ||
2192 | memset(&deref, 0, sizeof(deref)); | 2194 | memset(&deref, 0, sizeof(deref)); |
2193 | deref.node_idx = dlm->node_num; | 2195 | deref.node_idx = dlm->node_num; |
2194 | deref.namelen = namelen; | 2196 | deref.namelen = namelen; |
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | |||
2197 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, | 2199 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, |
2198 | &deref, sizeof(deref), res->owner, &r); | 2200 | &deref, sizeof(deref), res->owner, &r); |
2199 | if (ret < 0) | 2201 | if (ret < 0) |
2200 | mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " | 2202 | mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", |
2201 | "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, | 2203 | dlm->name, namelen, lockname, ret, res->owner); |
2202 | res->owner); | ||
2203 | else if (r < 0) { | 2204 | else if (r < 0) { |
2204 | /* BAD. other node says I did not have a ref. */ | 2205 | /* BAD. other node says I did not have a ref. */ |
2205 | mlog(ML_ERROR,"while dropping ref on %s:%.*s " | 2206 | mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", |
2206 | "(master=%u) got %d.\n", dlm->name, namelen, | 2207 | dlm->name, namelen, lockname, res->owner, r); |
2207 | lockname, res->owner, r); | ||
2208 | dlm_print_one_lock_resource(res); | 2208 | dlm_print_one_lock_resource(res); |
2209 | BUG(); | 2209 | BUG(); |
2210 | } | 2210 | } |
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, | |||
2260 | else { | 2260 | else { |
2261 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | 2261 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
2262 | if (test_bit(node, res->refmap)) { | 2262 | if (test_bit(node, res->refmap)) { |
2263 | dlm_lockres_clear_refmap_bit(node, res); | 2263 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
2264 | cleared = 1; | 2264 | cleared = 1; |
2265 | } | 2265 | } |
2266 | } | 2266 | } |
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) | |||
2320 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | 2320 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
2321 | if (test_bit(node, res->refmap)) { | 2321 | if (test_bit(node, res->refmap)) { |
2322 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); | 2322 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); |
2323 | dlm_lockres_clear_refmap_bit(node, res); | 2323 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
2324 | cleared = 1; | 2324 | cleared = 1; |
2325 | } | 2325 | } |
2326 | spin_unlock(&res->spinlock); | 2326 | spin_unlock(&res->spinlock); |
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
2802 | BUG_ON(!list_empty(&lock->bast_list)); | 2802 | BUG_ON(!list_empty(&lock->bast_list)); |
2803 | BUG_ON(lock->ast_pending); | 2803 | BUG_ON(lock->ast_pending); |
2804 | BUG_ON(lock->bast_pending); | 2804 | BUG_ON(lock->bast_pending); |
2805 | dlm_lockres_clear_refmap_bit(lock->ml.node, res); | 2805 | dlm_lockres_clear_refmap_bit(dlm, res, |
2806 | lock->ml.node); | ||
2806 | list_del_init(&lock->list); | 2807 | list_del_init(&lock->list); |
2807 | dlm_lock_put(lock); | 2808 | dlm_lock_put(lock); |
2808 | /* In a normal unlock, we would have added a | 2809 | /* In a normal unlock, we would have added a |
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
2823 | mlog(0, "%s:%.*s: node %u had a ref to this " | 2824 | mlog(0, "%s:%.*s: node %u had a ref to this " |
2824 | "migrating lockres, clearing\n", dlm->name, | 2825 | "migrating lockres, clearing\n", dlm->name, |
2825 | res->lockname.len, res->lockname.name, bit); | 2826 | res->lockname.len, res->lockname.name, bit); |
2826 | dlm_lockres_clear_refmap_bit(bit, res); | 2827 | dlm_lockres_clear_refmap_bit(dlm, res, bit); |
2827 | } | 2828 | } |
2828 | bit++; | 2829 | bit++; |
2829 | } | 2830 | } |
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2916 | &migrate, sizeof(migrate), nodenum, | 2917 | &migrate, sizeof(migrate), nodenum, |
2917 | &status); | 2918 | &status); |
2918 | if (ret < 0) { | 2919 | if (ret < 0) { |
2919 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 2920 | mlog(ML_ERROR, "%s: res %.*s, Error %d send " |
2920 | "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, | 2921 | "MIGRATE_REQUEST to node %u\n", dlm->name, |
2921 | dlm->key, nodenum); | 2922 | migrate.namelen, migrate.name, ret, nodenum); |
2922 | if (!dlm_is_host_down(ret)) { | 2923 | if (!dlm_is_host_down(ret)) { |
2923 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); | 2924 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); |
2924 | BUG(); | 2925 | BUG(); |
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2937 | dlm->name, res->lockname.len, res->lockname.name, | 2938 | dlm->name, res->lockname.len, res->lockname.name, |
2938 | nodenum); | 2939 | nodenum); |
2939 | spin_lock(&res->spinlock); | 2940 | spin_lock(&res->spinlock); |
2940 | dlm_lockres_set_refmap_bit(nodenum, res); | 2941 | dlm_lockres_set_refmap_bit(dlm, res, nodenum); |
2941 | spin_unlock(&res->spinlock); | 2942 | spin_unlock(&res->spinlock); |
2942 | } | 2943 | } |
2943 | } | 2944 | } |
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
3271 | * mastery reference here since old_master will briefly have | 3272 | * mastery reference here since old_master will briefly have |
3272 | * a reference after the migration completes */ | 3273 | * a reference after the migration completes */ |
3273 | spin_lock(&res->spinlock); | 3274 | spin_lock(&res->spinlock); |
3274 | dlm_lockres_set_refmap_bit(old_master, res); | 3275 | dlm_lockres_set_refmap_bit(dlm, res, old_master); |
3275 | spin_unlock(&res->spinlock); | 3276 | spin_unlock(&res->spinlock); |
3276 | 3277 | ||
3277 | mlog(0, "now time to do a migrate request to other nodes\n"); | 3278 | mlog(0, "now time to do a migrate request to other nodes\n"); |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7efab6d28a21..01ebfd0bdad7 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) | |||
362 | } | 362 | } |
363 | 363 | ||
364 | 364 | ||
365 | int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) | 365 | void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) |
366 | { | 366 | { |
367 | if (timeout) { | 367 | if (dlm_is_node_dead(dlm, node)) |
368 | mlog(ML_NOTICE, "%s: waiting %dms for notification of " | 368 | return; |
369 | "death of node %u\n", dlm->name, timeout, node); | 369 | |
370 | printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in " | ||
371 | "domain %s\n", node, dlm->name); | ||
372 | |||
373 | if (timeout) | ||
370 | wait_event_timeout(dlm->dlm_reco_thread_wq, | 374 | wait_event_timeout(dlm->dlm_reco_thread_wq, |
371 | dlm_is_node_dead(dlm, node), | 375 | dlm_is_node_dead(dlm, node), |
372 | msecs_to_jiffies(timeout)); | 376 | msecs_to_jiffies(timeout)); |
373 | } else { | 377 | else |
374 | mlog(ML_NOTICE, "%s: waiting indefinitely for notification " | ||
375 | "of death of node %u\n", dlm->name, node); | ||
376 | wait_event(dlm->dlm_reco_thread_wq, | 378 | wait_event(dlm->dlm_reco_thread_wq, |
377 | dlm_is_node_dead(dlm, node)); | 379 | dlm_is_node_dead(dlm, node)); |
378 | } | ||
379 | /* for now, return 0 */ | ||
380 | return 0; | ||
381 | } | 380 | } |
382 | 381 | ||
383 | int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) | 382 | void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) |
384 | { | 383 | { |
385 | if (timeout) { | 384 | if (dlm_is_node_recovered(dlm, node)) |
386 | mlog(0, "%s: waiting %dms for notification of " | 385 | return; |
387 | "recovery of node %u\n", dlm->name, timeout, node); | 386 | |
387 | printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " | ||
388 | "domain %s\n", node, dlm->name); | ||
389 | |||
390 | if (timeout) | ||
388 | wait_event_timeout(dlm->dlm_reco_thread_wq, | 391 | wait_event_timeout(dlm->dlm_reco_thread_wq, |
389 | dlm_is_node_recovered(dlm, node), | 392 | dlm_is_node_recovered(dlm, node), |
390 | msecs_to_jiffies(timeout)); | 393 | msecs_to_jiffies(timeout)); |
391 | } else { | 394 | else |
392 | mlog(0, "%s: waiting indefinitely for notification " | ||
393 | "of recovery of node %u\n", dlm->name, node); | ||
394 | wait_event(dlm->dlm_reco_thread_wq, | 395 | wait_event(dlm->dlm_reco_thread_wq, |
395 | dlm_is_node_recovered(dlm, node)); | 396 | dlm_is_node_recovered(dlm, node)); |
396 | } | ||
397 | /* for now, return 0 */ | ||
398 | return 0; | ||
399 | } | 397 | } |
400 | 398 | ||
401 | /* callers of the top-level api calls (dlmlock/dlmunlock) should | 399 | /* callers of the top-level api calls (dlmlock/dlmunlock) should |
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm) | |||
430 | { | 428 | { |
431 | spin_lock(&dlm->spinlock); | 429 | spin_lock(&dlm->spinlock); |
432 | BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); | 430 | BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); |
431 | printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", | ||
432 | dlm->name, dlm->reco.dead_node); | ||
433 | dlm->reco.state |= DLM_RECO_STATE_ACTIVE; | 433 | dlm->reco.state |= DLM_RECO_STATE_ACTIVE; |
434 | spin_unlock(&dlm->spinlock); | 434 | spin_unlock(&dlm->spinlock); |
435 | } | 435 | } |
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) | |||
440 | BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); | 440 | BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); |
441 | dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; | 441 | dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; |
442 | spin_unlock(&dlm->spinlock); | 442 | spin_unlock(&dlm->spinlock); |
443 | printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); | ||
443 | wake_up(&dlm->reco.event); | 444 | wake_up(&dlm->reco.event); |
444 | } | 445 | } |
445 | 446 | ||
447 | static void dlm_print_recovery_master(struct dlm_ctxt *dlm) | ||
448 | { | ||
449 | printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " | ||
450 | "dead node %u in domain %s\n", dlm->reco.new_master, | ||
451 | (dlm->node_num == dlm->reco.new_master ? "me" : "he"), | ||
452 | dlm->reco.dead_node, dlm->name); | ||
453 | } | ||
454 | |||
446 | static int dlm_do_recovery(struct dlm_ctxt *dlm) | 455 | static int dlm_do_recovery(struct dlm_ctxt *dlm) |
447 | { | 456 | { |
448 | int status = 0; | 457 | int status = 0; |
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
505 | } | 514 | } |
506 | mlog(0, "another node will master this recovery session.\n"); | 515 | mlog(0, "another node will master this recovery session.\n"); |
507 | } | 516 | } |
508 | mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", | 517 | |
509 | dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, | 518 | dlm_print_recovery_master(dlm); |
510 | dlm->node_num, dlm->reco.dead_node); | ||
511 | 519 | ||
512 | /* it is safe to start everything back up here | 520 | /* it is safe to start everything back up here |
513 | * because all of the dead node's lock resources | 521 | * because all of the dead node's lock resources |
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
518 | return 0; | 526 | return 0; |
519 | 527 | ||
520 | master_here: | 528 | master_here: |
521 | mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " | 529 | dlm_print_recovery_master(dlm); |
522 | "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), | ||
523 | dlm->node_num, dlm->reco.dead_node, dlm->name); | ||
524 | 530 | ||
525 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | 531 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); |
526 | if (status < 0) { | 532 | if (status < 0) { |
527 | /* we should never hit this anymore */ | 533 | /* we should never hit this anymore */ |
528 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | 534 | mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, " |
529 | "retrying.\n", status, dlm->reco.dead_node); | 535 | "retrying.\n", dlm->name, status, dlm->reco.dead_node); |
530 | /* yield a bit to allow any final network messages | 536 | /* yield a bit to allow any final network messages |
531 | * to get handled on remaining nodes */ | 537 | * to get handled on remaining nodes */ |
532 | msleep(100); | 538 | msleep(100); |
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
567 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); | 573 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); |
568 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; | 574 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; |
569 | 575 | ||
570 | mlog(0, "requesting lock info from node %u\n", | 576 | mlog(0, "%s: Requesting lock info from node %u\n", dlm->name, |
571 | ndata->node_num); | 577 | ndata->node_num); |
572 | 578 | ||
573 | if (ndata->node_num == dlm->node_num) { | 579 | if (ndata->node_num == dlm->node_num) { |
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
640 | spin_unlock(&dlm_reco_state_lock); | 646 | spin_unlock(&dlm_reco_state_lock); |
641 | } | 647 | } |
642 | 648 | ||
643 | mlog(0, "done requesting all lock info\n"); | 649 | mlog(0, "%s: Done requesting all lock info\n", dlm->name); |
644 | 650 | ||
645 | /* nodes should be sending reco data now | 651 | /* nodes should be sending reco data now |
646 | * just need to wait */ | 652 | * just need to wait */ |
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, | |||
802 | 808 | ||
803 | /* negative status is handled by caller */ | 809 | /* negative status is handled by caller */ |
804 | if (ret < 0) | 810 | if (ret < 0) |
805 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 811 | mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " |
806 | "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, | 812 | "to recover dead node %u\n", dlm->name, ret, |
807 | dlm->key, request_from); | 813 | request_from, dead_node); |
808 | |||
809 | // return from here, then | 814 | // return from here, then |
810 | // sleep until all received or error | 815 | // sleep until all received or error |
811 | return ret; | 816 | return ret; |
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) | |||
956 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, | 961 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, |
957 | sizeof(done_msg), send_to, &tmpret); | 962 | sizeof(done_msg), send_to, &tmpret); |
958 | if (ret < 0) { | 963 | if (ret < 0) { |
959 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 964 | mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u " |
960 | "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, | 965 | "to recover dead node %u\n", dlm->name, ret, send_to, |
961 | dlm->key, send_to); | 966 | dead_node); |
962 | if (!dlm_is_host_down(ret)) { | 967 | if (!dlm_is_host_down(ret)) { |
963 | BUG(); | 968 | BUG(); |
964 | } | 969 | } |
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | |||
1127 | if (ret < 0) { | 1132 | if (ret < 0) { |
1128 | /* XXX: negative status is not handled. | 1133 | /* XXX: negative status is not handled. |
1129 | * this will end up killing this node. */ | 1134 | * this will end up killing this node. */ |
1130 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 1135 | mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to " |
1131 | "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, | 1136 | "node %u (%s)\n", dlm->name, mres->lockname_len, |
1132 | dlm->key, send_to); | 1137 | mres->lockname, ret, send_to, |
1138 | (orig_flags & DLM_MRES_MIGRATION ? | ||
1139 | "migration" : "recovery")); | ||
1133 | } else { | 1140 | } else { |
1134 | /* might get an -ENOMEM back here */ | 1141 | /* might get an -ENOMEM back here */ |
1135 | ret = status; | 1142 | ret = status; |
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1767 | dlm->name, mres->lockname_len, mres->lockname, | 1774 | dlm->name, mres->lockname_len, mres->lockname, |
1768 | from); | 1775 | from); |
1769 | spin_lock(&res->spinlock); | 1776 | spin_lock(&res->spinlock); |
1770 | dlm_lockres_set_refmap_bit(from, res); | 1777 | dlm_lockres_set_refmap_bit(dlm, res, from); |
1771 | spin_unlock(&res->spinlock); | 1778 | spin_unlock(&res->spinlock); |
1772 | added++; | 1779 | added++; |
1773 | break; | 1780 | break; |
@@ -1965,7 +1972,7 @@ skip_lvb: | |||
1965 | mlog(0, "%s:%.*s: added lock for node %u, " | 1972 | mlog(0, "%s:%.*s: added lock for node %u, " |
1966 | "setting refmap bit\n", dlm->name, | 1973 | "setting refmap bit\n", dlm->name, |
1967 | res->lockname.len, res->lockname.name, ml->node); | 1974 | res->lockname.len, res->lockname.name, ml->node); |
1968 | dlm_lockres_set_refmap_bit(ml->node, res); | 1975 | dlm_lockres_set_refmap_bit(dlm, res, ml->node); |
1969 | added++; | 1976 | added++; |
1970 | } | 1977 | } |
1971 | spin_unlock(&res->spinlock); | 1978 | spin_unlock(&res->spinlock); |
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
2084 | 2091 | ||
2085 | list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { | 2092 | list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { |
2086 | if (res->owner == dead_node) { | 2093 | if (res->owner == dead_node) { |
2094 | mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", | ||
2095 | dlm->name, res->lockname.len, res->lockname.name, | ||
2096 | res->owner, new_master); | ||
2087 | list_del_init(&res->recovering); | 2097 | list_del_init(&res->recovering); |
2088 | spin_lock(&res->spinlock); | 2098 | spin_lock(&res->spinlock); |
2089 | /* new_master has our reference from | 2099 | /* new_master has our reference from |
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
2105 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 2115 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { |
2106 | bucket = dlm_lockres_hash(dlm, i); | 2116 | bucket = dlm_lockres_hash(dlm, i); |
2107 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { | 2117 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { |
2108 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 2118 | if (!(res->state & DLM_LOCK_RES_RECOVERING)) |
2109 | if (res->owner == dead_node) { | 2119 | continue; |
2110 | mlog(0, "(this=%u) res %.*s owner=%u " | ||
2111 | "was not on recovering list, but " | ||
2112 | "clearing state anyway\n", | ||
2113 | dlm->node_num, res->lockname.len, | ||
2114 | res->lockname.name, new_master); | ||
2115 | } else if (res->owner == dlm->node_num) { | ||
2116 | mlog(0, "(this=%u) res %.*s owner=%u " | ||
2117 | "was not on recovering list, " | ||
2118 | "owner is THIS node, clearing\n", | ||
2119 | dlm->node_num, res->lockname.len, | ||
2120 | res->lockname.name, new_master); | ||
2121 | } else | ||
2122 | continue; | ||
2123 | 2120 | ||
2124 | if (!list_empty(&res->recovering)) { | 2121 | if (res->owner != dead_node && |
2125 | mlog(0, "%s:%.*s: lockres was " | 2122 | res->owner != dlm->node_num) |
2126 | "marked RECOVERING, owner=%u\n", | 2123 | continue; |
2127 | dlm->name, res->lockname.len, | 2124 | |
2128 | res->lockname.name, res->owner); | 2125 | if (!list_empty(&res->recovering)) { |
2129 | list_del_init(&res->recovering); | 2126 | list_del_init(&res->recovering); |
2130 | dlm_lockres_put(res); | 2127 | dlm_lockres_put(res); |
2131 | } | ||
2132 | spin_lock(&res->spinlock); | ||
2133 | /* new_master has our reference from | ||
2134 | * the lock state sent during recovery */ | ||
2135 | dlm_change_lockres_owner(dlm, res, new_master); | ||
2136 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
2137 | if (__dlm_lockres_has_locks(res)) | ||
2138 | __dlm_dirty_lockres(dlm, res); | ||
2139 | spin_unlock(&res->spinlock); | ||
2140 | wake_up(&res->wq); | ||
2141 | } | 2128 | } |
2129 | |||
2130 | /* new_master has our reference from | ||
2131 | * the lock state sent during recovery */ | ||
2132 | mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", | ||
2133 | dlm->name, res->lockname.len, res->lockname.name, | ||
2134 | res->owner, new_master); | ||
2135 | spin_lock(&res->spinlock); | ||
2136 | dlm_change_lockres_owner(dlm, res, new_master); | ||
2137 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
2138 | if (__dlm_lockres_has_locks(res)) | ||
2139 | __dlm_dirty_lockres(dlm, res); | ||
2140 | spin_unlock(&res->spinlock); | ||
2141 | wake_up(&res->wq); | ||
2142 | } | 2142 | } |
2143 | } | 2143 | } |
2144 | } | 2144 | } |
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | |||
2252 | res->lockname.len, res->lockname.name, freed, dead_node); | 2252 | res->lockname.len, res->lockname.name, freed, dead_node); |
2253 | __dlm_print_one_lock_resource(res); | 2253 | __dlm_print_one_lock_resource(res); |
2254 | } | 2254 | } |
2255 | dlm_lockres_clear_refmap_bit(dead_node, res); | 2255 | dlm_lockres_clear_refmap_bit(dlm, res, dead_node); |
2256 | } else if (test_bit(dead_node, res->refmap)) { | 2256 | } else if (test_bit(dead_node, res->refmap)) { |
2257 | mlog(0, "%s:%.*s: dead node %u had a ref, but had " | 2257 | mlog(0, "%s:%.*s: dead node %u had a ref, but had " |
2258 | "no locks and had not purged before dying\n", dlm->name, | 2258 | "no locks and had not purged before dying\n", dlm->name, |
2259 | res->lockname.len, res->lockname.name, dead_node); | 2259 | res->lockname.len, res->lockname.name, dead_node); |
2260 | dlm_lockres_clear_refmap_bit(dead_node, res); | 2260 | dlm_lockres_clear_refmap_bit(dlm, res, dead_node); |
2261 | } | 2261 | } |
2262 | 2262 | ||
2263 | /* do not kick thread yet */ | 2263 | /* do not kick thread yet */ |
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | |||
2324 | dlm_revalidate_lvb(dlm, res, dead_node); | 2324 | dlm_revalidate_lvb(dlm, res, dead_node); |
2325 | if (res->owner == dead_node) { | 2325 | if (res->owner == dead_node) { |
2326 | if (res->state & DLM_LOCK_RES_DROPPING_REF) { | 2326 | if (res->state & DLM_LOCK_RES_DROPPING_REF) { |
2327 | mlog(ML_NOTICE, "Ignore %.*s for " | 2327 | mlog(ML_NOTICE, "%s: res %.*s, Skip " |
2328 | "recovery as it is being freed\n", | 2328 | "recovery as it is being freed\n", |
2329 | res->lockname.len, | 2329 | dlm->name, res->lockname.len, |
2330 | res->lockname.name); | 2330 | res->lockname.name); |
2331 | } else | 2331 | } else |
2332 | dlm_move_lockres_to_recovery_list(dlm, | 2332 | dlm_move_lockres_to_recovery_list(dlm, |
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 1d6d1d22c471..e73c833fc2a1 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) | |||
94 | { | 94 | { |
95 | int bit; | 95 | int bit; |
96 | 96 | ||
97 | assert_spin_locked(&res->spinlock); | ||
98 | |||
97 | if (__dlm_lockres_has_locks(res)) | 99 | if (__dlm_lockres_has_locks(res)) |
98 | return 0; | 100 | return 0; |
99 | 101 | ||
102 | /* Locks are in the process of being created */ | ||
103 | if (res->inflight_locks) | ||
104 | return 0; | ||
105 | |||
100 | if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) | 106 | if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) |
101 | return 0; | 107 | return 0; |
102 | 108 | ||
103 | if (res->state & DLM_LOCK_RES_RECOVERING) | 109 | if (res->state & DLM_LOCK_RES_RECOVERING) |
104 | return 0; | 110 | return 0; |
105 | 111 | ||
112 | /* Another node has this resource with this node as the master */ | ||
106 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); | 113 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); |
107 | if (bit < O2NM_MAX_NODES) | 114 | if (bit < O2NM_MAX_NODES) |
108 | return 0; | 115 | return 0; |
109 | 116 | ||
110 | /* | ||
111 | * since the bit for dlm->node_num is not set, inflight_locks better | ||
112 | * be zero | ||
113 | */ | ||
114 | BUG_ON(res->inflight_locks != 0); | ||
115 | return 1; | 117 | return 1; |
116 | } | 118 | } |
117 | 119 | ||
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, | |||
185 | /* clear our bit from the master's refmap, ignore errors */ | 187 | /* clear our bit from the master's refmap, ignore errors */ |
186 | ret = dlm_drop_lockres_ref(dlm, res); | 188 | ret = dlm_drop_lockres_ref(dlm, res); |
187 | if (ret < 0) { | 189 | if (ret < 0) { |
188 | mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, | ||
189 | res->lockname.len, res->lockname.name, ret); | ||
190 | if (!dlm_is_host_down(ret)) | 190 | if (!dlm_is_host_down(ret)) |
191 | BUG(); | 191 | BUG(); |
192 | } | 192 | } |
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, | |||
209 | BUG(); | 209 | BUG(); |
210 | } | 210 | } |
211 | 211 | ||
212 | __dlm_unhash_lockres(res); | 212 | __dlm_unhash_lockres(dlm, res); |
213 | 213 | ||
214 | /* lockres is not in the hash now. drop the flag and wake up | 214 | /* lockres is not in the hash now. drop the flag and wake up |
215 | * any processes waiting in dlm_get_lock_resource. */ | 215 | * any processes waiting in dlm_get_lock_resource. */ |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e1ed5e502ff2..81a4cd22f80b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode) | |||
1692 | mlog(0, "inode %llu take PRMODE open lock\n", | 1692 | mlog(0, "inode %llu take PRMODE open lock\n", |
1693 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 1693 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
1694 | 1694 | ||
1695 | if (ocfs2_mount_local(osb)) | 1695 | if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) |
1696 | goto out; | 1696 | goto out; |
1697 | 1697 | ||
1698 | lockres = &OCFS2_I(inode)->ip_open_lockres; | 1698 | lockres = &OCFS2_I(inode)->ip_open_lockres; |
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write) | |||
1718 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1718 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1719 | write ? "EXMODE" : "PRMODE"); | 1719 | write ? "EXMODE" : "PRMODE"); |
1720 | 1720 | ||
1721 | if (ocfs2_is_hard_readonly(osb)) { | ||
1722 | if (write) | ||
1723 | status = -EROFS; | ||
1724 | goto out; | ||
1725 | } | ||
1726 | |||
1721 | if (ocfs2_mount_local(osb)) | 1727 | if (ocfs2_mount_local(osb)) |
1722 | goto out; | 1728 | goto out; |
1723 | 1729 | ||
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, | |||
2298 | if (ocfs2_is_hard_readonly(osb)) { | 2304 | if (ocfs2_is_hard_readonly(osb)) { |
2299 | if (ex) | 2305 | if (ex) |
2300 | status = -EROFS; | 2306 | status = -EROFS; |
2301 | goto bail; | 2307 | goto getbh; |
2302 | } | 2308 | } |
2303 | 2309 | ||
2304 | if (ocfs2_mount_local(osb)) | 2310 | if (ocfs2_mount_local(osb)) |
@@ -2356,7 +2362,7 @@ local: | |||
2356 | mlog_errno(status); | 2362 | mlog_errno(status); |
2357 | goto bail; | 2363 | goto bail; |
2358 | } | 2364 | } |
2359 | 2365 | getbh: | |
2360 | if (ret_bh) { | 2366 | if (ret_bh) { |
2361 | status = ocfs2_assign_bh(inode, ret_bh, local_bh); | 2367 | status = ocfs2_assign_bh(inode, ret_bh, local_bh); |
2362 | if (status < 0) { | 2368 | if (status < 0) { |
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) | |||
2628 | 2634 | ||
2629 | BUG_ON(!dl); | 2635 | BUG_ON(!dl); |
2630 | 2636 | ||
2631 | if (ocfs2_is_hard_readonly(osb)) | 2637 | if (ocfs2_is_hard_readonly(osb)) { |
2632 | return -EROFS; | 2638 | if (ex) |
2639 | return -EROFS; | ||
2640 | return 0; | ||
2641 | } | ||
2633 | 2642 | ||
2634 | if (ocfs2_mount_local(osb)) | 2643 | if (ocfs2_mount_local(osb)) |
2635 | return 0; | 2644 | return 0; |
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex) | |||
2647 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; | 2656 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; |
2648 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); | 2657 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); |
2649 | 2658 | ||
2650 | if (!ocfs2_mount_local(osb)) | 2659 | if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) |
2651 | ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); | 2660 | ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); |
2652 | } | 2661 | } |
2653 | 2662 | ||
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 23457b491e8c..2f5b92ef0e53 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -832,6 +832,102 @@ out: | |||
832 | return ret; | 832 | return ret; |
833 | } | 833 | } |
834 | 834 | ||
835 | int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) | ||
836 | { | ||
837 | struct inode *inode = file->f_mapping->host; | ||
838 | int ret; | ||
839 | unsigned int is_last = 0, is_data = 0; | ||
840 | u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
841 | u32 cpos, cend, clen, hole_size; | ||
842 | u64 extoff, extlen; | ||
843 | struct buffer_head *di_bh = NULL; | ||
844 | struct ocfs2_extent_rec rec; | ||
845 | |||
846 | BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); | ||
847 | |||
848 | ret = ocfs2_inode_lock(inode, &di_bh, 0); | ||
849 | if (ret) { | ||
850 | mlog_errno(ret); | ||
851 | goto out; | ||
852 | } | ||
853 | |||
854 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
855 | |||
856 | if (*offset >= inode->i_size) { | ||
857 | ret = -ENXIO; | ||
858 | goto out_unlock; | ||
859 | } | ||
860 | |||
861 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | ||
862 | if (origin == SEEK_HOLE) | ||
863 | *offset = inode->i_size; | ||
864 | goto out_unlock; | ||
865 | } | ||
866 | |||
867 | clen = 0; | ||
868 | cpos = *offset >> cs_bits; | ||
869 | cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); | ||
870 | |||
871 | while (cpos < cend && !is_last) { | ||
872 | ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, | ||
873 | &rec, &is_last); | ||
874 | if (ret) { | ||
875 | mlog_errno(ret); | ||
876 | goto out_unlock; | ||
877 | } | ||
878 | |||
879 | extoff = cpos; | ||
880 | extoff <<= cs_bits; | ||
881 | |||
882 | if (rec.e_blkno == 0ULL) { | ||
883 | clen = hole_size; | ||
884 | is_data = 0; | ||
885 | } else { | ||
886 | clen = le16_to_cpu(rec.e_leaf_clusters) - | ||
887 | (cpos - le32_to_cpu(rec.e_cpos)); | ||
888 | is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; | ||
889 | } | ||
890 | |||
891 | if ((!is_data && origin == SEEK_HOLE) || | ||
892 | (is_data && origin == SEEK_DATA)) { | ||
893 | if (extoff > *offset) | ||
894 | *offset = extoff; | ||
895 | goto out_unlock; | ||
896 | } | ||
897 | |||
898 | if (!is_last) | ||
899 | cpos += clen; | ||
900 | } | ||
901 | |||
902 | if (origin == SEEK_HOLE) { | ||
903 | extoff = cpos; | ||
904 | extoff <<= cs_bits; | ||
905 | extlen = clen; | ||
906 | extlen <<= cs_bits; | ||
907 | |||
908 | if ((extoff + extlen) > inode->i_size) | ||
909 | extlen = inode->i_size - extoff; | ||
910 | extoff += extlen; | ||
911 | if (extoff > *offset) | ||
912 | *offset = extoff; | ||
913 | goto out_unlock; | ||
914 | } | ||
915 | |||
916 | ret = -ENXIO; | ||
917 | |||
918 | out_unlock: | ||
919 | |||
920 | brelse(di_bh); | ||
921 | |||
922 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
923 | |||
924 | ocfs2_inode_unlock(inode, 0); | ||
925 | out: | ||
926 | if (ret && ret != -ENXIO) | ||
927 | ret = -ENXIO; | ||
928 | return ret; | ||
929 | } | ||
930 | |||
835 | int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, | 931 | int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, |
836 | struct buffer_head *bhs[], int flags, | 932 | struct buffer_head *bhs[], int flags, |
837 | int (*validate)(struct super_block *sb, | 933 | int (*validate)(struct super_block *sb, |
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index e79d41c2c909..67ea57d2fd59 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, | |||
53 | int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 53 | int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
54 | u64 map_start, u64 map_len); | 54 | u64 map_start, u64 map_len); |
55 | 55 | ||
56 | int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); | ||
57 | |||
56 | int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, | 58 | int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, |
57 | u32 *p_cluster, u32 *num_clusters, | 59 | u32 *p_cluster, u32 *num_clusters, |
58 | struct ocfs2_extent_list *el, | 60 | struct ocfs2_extent_list *el, |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index de4ea1af041b..6e396683c3d4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, | |||
1950 | if (ret < 0) | 1950 | if (ret < 0) |
1951 | mlog_errno(ret); | 1951 | mlog_errno(ret); |
1952 | 1952 | ||
1953 | if (file->f_flags & O_SYNC) | ||
1954 | handle->h_sync = 1; | ||
1955 | |||
1953 | ocfs2_commit_trans(osb, handle); | 1956 | ocfs2_commit_trans(osb, handle); |
1954 | 1957 | ||
1955 | out_inode_unlock: | 1958 | out_inode_unlock: |
@@ -2052,6 +2055,23 @@ out: | |||
2052 | return ret; | 2055 | return ret; |
2053 | } | 2056 | } |
2054 | 2057 | ||
2058 | static void ocfs2_aiodio_wait(struct inode *inode) | ||
2059 | { | ||
2060 | wait_queue_head_t *wq = ocfs2_ioend_wq(inode); | ||
2061 | |||
2062 | wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); | ||
2063 | } | ||
2064 | |||
2065 | static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) | ||
2066 | { | ||
2067 | int blockmask = inode->i_sb->s_blocksize - 1; | ||
2068 | loff_t final_size = pos + count; | ||
2069 | |||
2070 | if ((pos & blockmask) || (final_size & blockmask)) | ||
2071 | return 1; | ||
2072 | return 0; | ||
2073 | } | ||
2074 | |||
2055 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, | 2075 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, |
2056 | struct file *file, | 2076 | struct file *file, |
2057 | loff_t pos, size_t count, | 2077 | loff_t pos, size_t count, |
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
2230 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2250 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2231 | int full_coherency = !(osb->s_mount_opt & | 2251 | int full_coherency = !(osb->s_mount_opt & |
2232 | OCFS2_MOUNT_COHERENCY_BUFFERED); | 2252 | OCFS2_MOUNT_COHERENCY_BUFFERED); |
2253 | int unaligned_dio = 0; | ||
2233 | 2254 | ||
2234 | trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, | 2255 | trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, |
2235 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 2256 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
@@ -2297,6 +2318,10 @@ relock: | |||
2297 | goto out; | 2318 | goto out; |
2298 | } | 2319 | } |
2299 | 2320 | ||
2321 | if (direct_io && !is_sync_kiocb(iocb)) | ||
2322 | unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, | ||
2323 | *ppos); | ||
2324 | |||
2300 | /* | 2325 | /* |
2301 | * We can't complete the direct I/O as requested, fall back to | 2326 | * We can't complete the direct I/O as requested, fall back to |
2302 | * buffered I/O. | 2327 | * buffered I/O. |
@@ -2311,6 +2336,18 @@ relock: | |||
2311 | goto relock; | 2336 | goto relock; |
2312 | } | 2337 | } |
2313 | 2338 | ||
2339 | if (unaligned_dio) { | ||
2340 | /* | ||
2341 | * Wait on previous unaligned aio to complete before | ||
2342 | * proceeding. | ||
2343 | */ | ||
2344 | ocfs2_aiodio_wait(inode); | ||
2345 | |||
2346 | /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ | ||
2347 | atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); | ||
2348 | ocfs2_iocb_set_unaligned_aio(iocb); | ||
2349 | } | ||
2350 | |||
2314 | /* | 2351 | /* |
2315 | * To later detect whether a journal commit for sync writes is | 2352 | * To later detect whether a journal commit for sync writes is |
2316 | * necessary, we sample i_size, and cluster count here. | 2353 | * necessary, we sample i_size, and cluster count here. |
@@ -2382,8 +2419,12 @@ out_dio: | |||
2382 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | 2419 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { |
2383 | rw_level = -1; | 2420 | rw_level = -1; |
2384 | have_alloc_sem = 0; | 2421 | have_alloc_sem = 0; |
2422 | unaligned_dio = 0; | ||
2385 | } | 2423 | } |
2386 | 2424 | ||
2425 | if (unaligned_dio) | ||
2426 | atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); | ||
2427 | |||
2387 | out: | 2428 | out: |
2388 | if (rw_level != -1) | 2429 | if (rw_level != -1) |
2389 | ocfs2_rw_unlock(inode, rw_level); | 2430 | ocfs2_rw_unlock(inode, rw_level); |
@@ -2591,6 +2632,57 @@ bail: | |||
2591 | return ret; | 2632 | return ret; |
2592 | } | 2633 | } |
2593 | 2634 | ||
2635 | /* Refer to generic_file_llseek_unlocked() */ | |
2636 | static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) | ||
2637 | { | ||
2638 | struct inode *inode = file->f_mapping->host; | ||
2639 | int ret = 0; | ||
2640 | |||
2641 | mutex_lock(&inode->i_mutex); | ||
2642 | |||
2643 | switch (origin) { | ||
2644 | case SEEK_SET: | ||
2645 | break; | ||
2646 | case SEEK_END: | ||
2647 | offset += inode->i_size; | ||
2648 | break; | ||
2649 | case SEEK_CUR: | ||
2650 | if (offset == 0) { | ||
2651 | offset = file->f_pos; | ||
2652 | goto out; | ||
2653 | } | ||
2654 | offset += file->f_pos; | ||
2655 | break; | ||
2656 | case SEEK_DATA: | ||
2657 | case SEEK_HOLE: | ||
2658 | ret = ocfs2_seek_data_hole_offset(file, &offset, origin); | ||
2659 | if (ret) | ||
2660 | goto out; | ||
2661 | break; | ||
2662 | default: | ||
2663 | ret = -EINVAL; | ||
2664 | goto out; | ||
2665 | } | ||
2666 | |||
2667 | if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) | ||
2668 | ret = -EINVAL; | ||
2669 | if (!ret && offset > inode->i_sb->s_maxbytes) | ||
2670 | ret = -EINVAL; | ||
2671 | if (ret) | ||
2672 | goto out; | ||
2673 | |||
2674 | if (offset != file->f_pos) { | ||
2675 | file->f_pos = offset; | ||
2676 | file->f_version = 0; | ||
2677 | } | ||
2678 | |||
2679 | out: | ||
2680 | mutex_unlock(&inode->i_mutex); | ||
2681 | if (ret) | ||
2682 | return ret; | ||
2683 | return offset; | ||
2684 | } | ||
2685 | |||
2594 | const struct inode_operations ocfs2_file_iops = { | 2686 | const struct inode_operations ocfs2_file_iops = { |
2595 | .setattr = ocfs2_setattr, | 2687 | .setattr = ocfs2_setattr, |
2596 | .getattr = ocfs2_getattr, | 2688 | .getattr = ocfs2_getattr, |
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = { | |||
2615 | * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! | 2707 | * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! |
2616 | */ | 2708 | */ |
2617 | const struct file_operations ocfs2_fops = { | 2709 | const struct file_operations ocfs2_fops = { |
2618 | .llseek = generic_file_llseek, | 2710 | .llseek = ocfs2_file_llseek, |
2619 | .read = do_sync_read, | 2711 | .read = do_sync_read, |
2620 | .write = do_sync_write, | 2712 | .write = do_sync_write, |
2621 | .mmap = ocfs2_mmap, | 2713 | .mmap = ocfs2_mmap, |
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = { | |||
2663 | * the cluster. | 2755 | * the cluster. |
2664 | */ | 2756 | */ |
2665 | const struct file_operations ocfs2_fops_no_plocks = { | 2757 | const struct file_operations ocfs2_fops_no_plocks = { |
2666 | .llseek = generic_file_llseek, | 2758 | .llseek = ocfs2_file_llseek, |
2667 | .read = do_sync_read, | 2759 | .read = do_sync_read, |
2668 | .write = do_sync_write, | 2760 | .write = do_sync_write, |
2669 | .mmap = ocfs2_mmap, | 2761 | .mmap = ocfs2_mmap, |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index a22d2c098890..17454a904d7b 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, | |||
951 | trace_ocfs2_cleanup_delete_inode( | 951 | trace_ocfs2_cleanup_delete_inode( |
952 | (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); | 952 | (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); |
953 | if (sync_data) | 953 | if (sync_data) |
954 | write_inode_now(inode, 1); | 954 | filemap_write_and_wait(inode->i_mapping); |
955 | truncate_inode_pages(&inode->i_data, 0); | 955 | truncate_inode_pages(&inode->i_data, 0); |
956 | } | 956 | } |
957 | 957 | ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1c508b149b3a..88924a3133fa 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -43,6 +43,9 @@ struct ocfs2_inode_info | |||
43 | /* protects extended attribute changes on this inode */ | 43 | /* protects extended attribute changes on this inode */ |
44 | struct rw_semaphore ip_xattr_sem; | 44 | struct rw_semaphore ip_xattr_sem; |
45 | 45 | ||
46 | /* Number of outstanding AIOs which are not block aligned */ | |
47 | atomic_t ip_unaligned_aio; | ||
48 | |||
46 | /* These fields are protected by ip_lock */ | 49 | /* These fields are protected by ip_lock */ |
47 | spinlock_t ip_lock; | 50 | spinlock_t ip_lock; |
48 | u32 ip_open_count; | 51 | u32 ip_open_count; |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index bc91072b7219..726ff265b296 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, | |||
122 | if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & | 122 | if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & |
123 | (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { | 123 | (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { |
124 | if (!capable(CAP_LINUX_IMMUTABLE)) | 124 | if (!capable(CAP_LINUX_IMMUTABLE)) |
125 | goto bail_unlock; | 125 | goto bail_commit; |
126 | } | 126 | } |
127 | 127 | ||
128 | ocfs2_inode->ip_attr = flags; | 128 | ocfs2_inode->ip_attr = flags; |
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, | |||
132 | if (status < 0) | 132 | if (status < 0) |
133 | mlog_errno(status); | 133 | mlog_errno(status); |
134 | 134 | ||
135 | bail_commit: | ||
135 | ocfs2_commit_trans(osb, handle); | 136 | ocfs2_commit_trans(osb, handle); |
136 | bail_unlock: | 137 | bail_unlock: |
137 | ocfs2_inode_unlock(inode, 1); | 138 | ocfs2_inode_unlock(inode, 1); |
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode, | |||
381 | if (!oifi) { | 382 | if (!oifi) { |
382 | status = -ENOMEM; | 383 | status = -ENOMEM; |
383 | mlog_errno(status); | 384 | mlog_errno(status); |
384 | goto bail; | 385 | goto out_err; |
385 | } | 386 | } |
386 | 387 | ||
387 | if (o2info_from_user(*oifi, req)) | 388 | if (o2info_from_user(*oifi, req)) |
@@ -431,7 +432,7 @@ bail: | |||
431 | o2info_set_request_error(&oifi->ifi_req, req); | 432 | o2info_set_request_error(&oifi->ifi_req, req); |
432 | 433 | ||
433 | kfree(oifi); | 434 | kfree(oifi); |
434 | 435 | out_err: | |
435 | return status; | 436 | return status; |
436 | } | 437 | } |
437 | 438 | ||
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode, | |||
666 | if (!oiff) { | 667 | if (!oiff) { |
667 | status = -ENOMEM; | 668 | status = -ENOMEM; |
668 | mlog_errno(status); | 669 | mlog_errno(status); |
669 | goto bail; | 670 | goto out_err; |
670 | } | 671 | } |
671 | 672 | ||
672 | if (o2info_from_user(*oiff, req)) | 673 | if (o2info_from_user(*oiff, req)) |
@@ -716,7 +717,7 @@ bail: | |||
716 | o2info_set_request_error(&oiff->iff_req, req); | 717 | o2info_set_request_error(&oiff->iff_req, req); |
717 | 718 | ||
718 | kfree(oiff); | 719 | kfree(oiff); |
719 | 720 | out_err: | |
720 | return status; | 721 | return status; |
721 | } | 722 | } |
722 | 723 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 295d56454e8b..0a42ae96dca7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1544 | /* we need to run complete recovery for offline orphan slots */ | 1544 | /* we need to run complete recovery for offline orphan slots */ |
1545 | ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); | 1545 | ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); |
1546 | 1546 | ||
1547 | mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", | 1547 | printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\ |
1548 | node_num, slot_num, | 1548 | "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), |
1549 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 1549 | MINOR(osb->sb->s_dev)); |
1550 | 1550 | ||
1551 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 1551 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
1552 | 1552 | ||
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1601 | 1601 | ||
1602 | jbd2_journal_destroy(journal); | 1602 | jbd2_journal_destroy(journal); |
1603 | 1603 | ||
1604 | printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\ | ||
1605 | "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), | ||
1606 | MINOR(osb->sb->s_dev)); | ||
1604 | done: | 1607 | done: |
1605 | /* drop the lock on this node's journal */ | 1608 | /* drop the lock on this node's journal */ |
1606 | if (got_lock) | 1609 | if (got_lock) |
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) | |||
1808 | * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This | 1811 | * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This |
1809 | * is done to catch any orphans that are left over in orphan directories. | 1812 | * is done to catch any orphans that are left over in orphan directories. |
1810 | * | 1813 | * |
1814 | * It scans all slots, even ones that are in use. It does so to handle the | ||
1815 | * case described below: | ||
1816 | * | ||
1817 | * Node 1 has an inode it was using. The dentry went away due to memory | ||
1818 | * pressure. Node 1 closes the inode, but it's on the free list. The node | ||
1819 | * has the open lock. | ||
1820 | * Node 2 unlinks the inode. It grabs the dentry lock to notify others, | ||
1821 | * but node 1 has no dentry and doesn't get the message. It trylocks the | ||
1822 | * open lock, sees that another node has a PR, and does nothing. | ||
1823 | * Later node 2 runs its orphan dir. It igets the inode, trylocks the | ||
1824 | * open lock, sees the PR still, and does nothing. | ||
1825 | * Basically, we have to trigger an orphan iput on node 1. The only way | ||
1826 | * for this to happen is if node 1 runs node 2's orphan dir. | ||
1827 | * | ||
1811 | * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT | 1828 | * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT |
1812 | * seconds. It gets an EX lock on os_lockres and checks sequence number | 1829 | * seconds. It gets an EX lock on os_lockres and checks sequence number |
1813 | * stored in LVB. If the sequence number has changed, it means some other | 1830 | * stored in LVB. If the sequence number has changed, it means some other |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 68cf2f6d3c6a..a3385b63ff5e 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, | |||
441 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) | 441 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) |
442 | 442 | ||
443 | /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota | 443 | /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota |
444 | * update on dir + index leaf + dx root update for free list */ | 444 | * update on dir + index leaf + dx root update for free list + |
445 | * previous dirblock update in the free list */ | ||
445 | static inline int ocfs2_link_credits(struct super_block *sb) | 446 | static inline int ocfs2_link_credits(struct super_block *sb) |
446 | { | 447 | { |
447 | return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + | 448 | return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + |
448 | ocfs2_quota_trans_credits(sb); | 449 | ocfs2_quota_trans_credits(sb); |
449 | } | 450 | } |
450 | 451 | ||
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 3e9393ca39eb..9cd41083e991 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) | |||
61 | static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | 61 | static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, |
62 | struct page *page) | 62 | struct page *page) |
63 | { | 63 | { |
64 | int ret; | 64 | int ret = VM_FAULT_NOPAGE; |
65 | struct inode *inode = file->f_path.dentry->d_inode; | 65 | struct inode *inode = file->f_path.dentry->d_inode; |
66 | struct address_space *mapping = inode->i_mapping; | 66 | struct address_space *mapping = inode->i_mapping; |
67 | loff_t pos = page_offset(page); | 67 | loff_t pos = page_offset(page); |
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | |||
71 | void *fsdata; | 71 | void *fsdata; |
72 | loff_t size = i_size_read(inode); | 72 | loff_t size = i_size_read(inode); |
73 | 73 | ||
74 | /* | ||
75 | * Another node might have truncated while we were waiting on | ||
76 | * cluster locks. | ||
77 | * We don't check size == 0 before the shift. This is borrowed | ||
78 | * from do_generic_file_read. | ||
79 | */ | ||
80 | last_index = (size - 1) >> PAGE_CACHE_SHIFT; | 74 | last_index = (size - 1) >> PAGE_CACHE_SHIFT; |
81 | if (unlikely(!size || page->index > last_index)) { | ||
82 | ret = -EINVAL; | ||
83 | goto out; | ||
84 | } | ||
85 | 75 | ||
86 | /* | 76 | /* |
87 | * The i_size check above doesn't catch the case where nodes | 77 | * There are cases that lead to the page no longer belonging to the |
88 | * truncated and then re-extended the file. We'll re-check the | 78 | * mapping. |
89 | * page mapping after taking the page lock inside of | 79 | * 1) pagecache truncates locally due to memory pressure. |
90 | * ocfs2_write_begin_nolock(). | 80 | * 2) pagecache truncates when another node is taking EX lock against |
81 | * inode lock. see ocfs2_data_convert_worker. | ||
82 | * | ||
83 | * The i_size check doesn't catch the case where nodes truncated and | ||
84 | * then re-extended the file. We'll re-check the page mapping after | ||
85 | * taking the page lock inside of ocfs2_write_begin_nolock(). | ||
86 | * | ||
87 | * Let VM retry with these cases. | ||
91 | */ | 88 | */ |
92 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | 89 | if ((page->mapping != inode->i_mapping) || |
93 | /* | 90 | (!PageUptodate(page)) || |
94 | * the page has been umapped in ocfs2_data_downconvert_worker. | 91 | (page_offset(page) >= size)) |
95 | * So return 0 here and let VFS retry. | ||
96 | */ | ||
97 | ret = 0; | ||
98 | goto out; | 92 | goto out; |
99 | } | ||
100 | 93 | ||
101 | /* | 94 | /* |
102 | * Call ocfs2_write_begin() and ocfs2_write_end() to take | 95 | * Call ocfs2_write_begin() and ocfs2_write_end() to take |
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | |||
116 | if (ret) { | 109 | if (ret) { |
117 | if (ret != -ENOSPC) | 110 | if (ret != -ENOSPC) |
118 | mlog_errno(ret); | 111 | mlog_errno(ret); |
112 | if (ret == -ENOMEM) | ||
113 | ret = VM_FAULT_OOM; | ||
114 | else | ||
115 | ret = VM_FAULT_SIGBUS; | ||
119 | goto out; | 116 | goto out; |
120 | } | 117 | } |
121 | 118 | ||
122 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | 119 | if (!locked_page) { |
123 | fsdata); | 120 | ret = VM_FAULT_NOPAGE; |
124 | if (ret < 0) { | ||
125 | mlog_errno(ret); | ||
126 | goto out; | 121 | goto out; |
127 | } | 122 | } |
123 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | ||
124 | fsdata); | ||
128 | BUG_ON(ret != len); | 125 | BUG_ON(ret != len); |
129 | ret = 0; | 126 | ret = VM_FAULT_LOCKED; |
130 | out: | 127 | out: |
131 | return ret; | 128 | return ret; |
132 | } | 129 | } |
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
168 | 165 | ||
169 | out: | 166 | out: |
170 | ocfs2_unblock_signals(&oldset); | 167 | ocfs2_unblock_signals(&oldset); |
171 | if (ret) | ||
172 | ret = VM_FAULT_SIGBUS; | ||
173 | return ret; | 168 | return ret; |
174 | } | 169 | } |
175 | 170 | ||
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index d53cb706f14c..184c76b8c293 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, | |||
745 | */ | 745 | */ |
746 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, | 746 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, |
747 | new_phys_cpos); | 747 | new_phys_cpos); |
748 | if (!new_phys_cpos) { | 748 | if (!*new_phys_cpos) { |
749 | ret = -ENOSPC; | 749 | ret = -ENOSPC; |
750 | goto out_commit; | 750 | goto out_commit; |
751 | } | 751 | } |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 409285854f64..d355e6e36b36 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, | |||
836 | 836 | ||
837 | static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) | 837 | static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) |
838 | { | 838 | { |
839 | __test_and_set_bit_le(bit, bitmap); | 839 | __set_bit_le(bit, bitmap); |
840 | } | 840 | } |
841 | #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) | 841 | #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) |
842 | 842 | ||
843 | static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) | 843 | static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) |
844 | { | 844 | { |
845 | __test_and_clear_bit_le(bit, bitmap); | 845 | __clear_bit_le(bit, bitmap); |
846 | } | 846 | } |
847 | #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) | 847 | #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) |
848 | 848 | ||
849 | #define ocfs2_test_bit test_bit_le | 849 | #define ocfs2_test_bit test_bit_le |
850 | #define ocfs2_find_next_zero_bit find_next_zero_bit_le | 850 | #define ocfs2_find_next_zero_bit find_next_zero_bit_le |
851 | #define ocfs2_find_next_bit find_next_bit_le | 851 | #define ocfs2_find_next_bit find_next_bit_le |
852 | |||
853 | static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) | ||
854 | { | ||
855 | #if BITS_PER_LONG == 64 | ||
856 | *bit += ((unsigned long) addr & 7UL) << 3; | ||
857 | addr = (void *) ((unsigned long) addr & ~7UL); | ||
858 | #elif BITS_PER_LONG == 32 | ||
859 | *bit += ((unsigned long) addr & 3UL) << 3; | ||
860 | addr = (void *) ((unsigned long) addr & ~3UL); | ||
861 | #else | ||
862 | #error "how many bits you are?!" | ||
863 | #endif | ||
864 | return addr; | ||
865 | } | ||
866 | |||
867 | static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) | ||
868 | { | ||
869 | bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); | ||
870 | ocfs2_set_bit(bit, bitmap); | ||
871 | } | ||
872 | |||
873 | static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) | ||
874 | { | ||
875 | bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); | ||
876 | ocfs2_clear_bit(bit, bitmap); | ||
877 | } | ||
878 | |||
879 | static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) | ||
880 | { | ||
881 | bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); | ||
882 | return ocfs2_test_bit(bit, bitmap); | ||
883 | } | ||
884 | |||
885 | static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, | ||
886 | int start) | ||
887 | { | ||
888 | int fix = 0, ret, tmpmax; | ||
889 | bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); | ||
890 | tmpmax = max + fix; | ||
891 | start += fix; | ||
892 | |||
893 | ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; | ||
894 | if (ret > max) | ||
895 | return max; | ||
896 | return ret; | ||
897 | } | ||
898 | |||
852 | #endif /* OCFS2_H */ | 899 | #endif /* OCFS2_H */ |
853 | 900 | ||
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dc8007fc9247..f100bf70a906 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c | |||
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( | |||
404 | int status = 0; | 404 | int status = 0; |
405 | struct ocfs2_quota_recovery *rec; | 405 | struct ocfs2_quota_recovery *rec; |
406 | 406 | ||
407 | mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); | 407 | printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " |
408 | "slot %u\n", osb->dev_str, slot_num); | ||
409 | |||
408 | rec = ocfs2_alloc_quota_recovery(); | 410 | rec = ocfs2_alloc_quota_recovery(); |
409 | if (!rec) | 411 | if (!rec) |
410 | return ERR_PTR(-ENOMEM); | 412 | return ERR_PTR(-ENOMEM); |
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, | |||
549 | goto out_commit; | 551 | goto out_commit; |
550 | } | 552 | } |
551 | lock_buffer(qbh); | 553 | lock_buffer(qbh); |
552 | WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); | 554 | WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); |
553 | ocfs2_clear_bit(bit, dchunk->dqc_bitmap); | 555 | ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); |
554 | le32_add_cpu(&dchunk->dqc_free, 1); | 556 | le32_add_cpu(&dchunk->dqc_free, 1); |
555 | unlock_buffer(qbh); | 557 | unlock_buffer(qbh); |
556 | ocfs2_journal_dirty(handle, qbh); | 558 | ocfs2_journal_dirty(handle, qbh); |
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | |||
596 | struct inode *lqinode; | 598 | struct inode *lqinode; |
597 | unsigned int flags; | 599 | unsigned int flags; |
598 | 600 | ||
599 | mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); | 601 | printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " |
602 | "slot %u\n", osb->dev_str, slot_num); | ||
603 | |||
600 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | 604 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); |
601 | for (type = 0; type < MAXQUOTAS; type++) { | 605 | for (type = 0; type < MAXQUOTAS; type++) { |
602 | if (list_empty(&(rec->r_list[type]))) | 606 | if (list_empty(&(rec->r_list[type]))) |
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | |||
612 | /* Someone else is holding the lock? Then he must be | 616 | /* Someone else is holding the lock? Then he must be |
613 | * doing the recovery. Just skip the file... */ | 617 | * doing the recovery. Just skip the file... */ |
614 | if (status == -EAGAIN) { | 618 | if (status == -EAGAIN) { |
615 | mlog(ML_NOTICE, "skipping quota recovery for slot %d " | 619 | printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " |
616 | "because quota file is locked.\n", slot_num); | 620 | "device (%s) for slot %d because quota file is " |
621 | "locked.\n", osb->dev_str, slot_num); | ||
617 | status = 0; | 622 | status = 0; |
618 | goto out_put; | 623 | goto out_put; |
619 | } else if (status < 0) { | 624 | } else if (status < 0) { |
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, | |||
944 | * ol_quota_entries_per_block(sb); | 949 | * ol_quota_entries_per_block(sb); |
945 | } | 950 | } |
946 | 951 | ||
947 | found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); | 952 | found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); |
948 | /* We failed? */ | 953 | /* We failed? */ |
949 | if (found == len) { | 954 | if (found == len) { |
950 | mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" | 955 | mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" |
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private) | |||
1208 | struct ocfs2_local_disk_chunk *dchunk; | 1213 | struct ocfs2_local_disk_chunk *dchunk; |
1209 | 1214 | ||
1210 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; | 1215 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; |
1211 | ocfs2_set_bit(*offset, dchunk->dqc_bitmap); | 1216 | ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); |
1212 | le32_add_cpu(&dchunk->dqc_free, -1); | 1217 | le32_add_cpu(&dchunk->dqc_free, -1); |
1213 | } | 1218 | } |
1214 | 1219 | ||
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) | |||
1289 | (od->dq_chunk->qc_headerbh->b_data); | 1294 | (od->dq_chunk->qc_headerbh->b_data); |
1290 | /* Mark structure as freed */ | 1295 | /* Mark structure as freed */ |
1291 | lock_buffer(od->dq_chunk->qc_headerbh); | 1296 | lock_buffer(od->dq_chunk->qc_headerbh); |
1292 | ocfs2_clear_bit(offset, dchunk->dqc_bitmap); | 1297 | ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); |
1293 | le32_add_cpu(&dchunk->dqc_free, 1); | 1298 | le32_add_cpu(&dchunk->dqc_free, 1); |
1294 | unlock_buffer(od->dq_chunk->qc_headerbh); | 1299 | unlock_buffer(od->dq_chunk->qc_headerbh); |
1295 | ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); | 1300 | ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 26fc0014d509..1424c151cccc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb) | |||
493 | goto bail; | 493 | goto bail; |
494 | } | 494 | } |
495 | } else | 495 | } else |
496 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", | 496 | printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " |
497 | slot); | 497 | "allocated to this node!\n", slot, osb->dev_str); |
498 | 498 | ||
499 | ocfs2_set_slot(si, slot, osb->node_num); | 499 | ocfs2_set_slot(si, slot, osb->node_num); |
500 | osb->slot_num = slot; | 500 | osb->slot_num = slot; |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 19965b00c43c..94368017edb3 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "cluster/masklog.h" | 28 | #include "cluster/masklog.h" |
29 | #include "cluster/nodemanager.h" | 29 | #include "cluster/nodemanager.h" |
30 | #include "cluster/heartbeat.h" | 30 | #include "cluster/heartbeat.h" |
31 | #include "cluster/tcp.h" | ||
31 | 32 | ||
32 | #include "stackglue.h" | 33 | #include "stackglue.h" |
33 | 34 | ||
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) | |||
256 | } | 257 | } |
257 | 258 | ||
258 | /* | 259 | /* |
260 | * Check if this node is heartbeating and is connected to all other | ||
261 | * heartbeating nodes. | ||
262 | */ | ||
263 | static int o2cb_cluster_check(void) | ||
264 | { | ||
265 | u8 node_num; | ||
266 | int i; | ||
267 | unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
268 | unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
269 | |||
270 | node_num = o2nm_this_node(); | ||
271 | if (node_num == O2NM_MAX_NODES) { | ||
272 | printk(KERN_ERR "o2cb: This node has not been configured.\n"); | ||
273 | return -EINVAL; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * o2dlm expects o2net sockets to be created. If not, then | ||
278 | * dlm_join_domain() fails with a stack of errors which are both cryptic | ||
279 | * and incomplete. The idea here is to detect upfront whether we have | ||
280 | * managed to connect to all nodes or not. If not, then list the nodes | ||
281 | * to allow the user to check the configuration (incorrect IP, firewall, | ||
282 | * etc.) Yes, this is racy. But its not the end of the world. | ||
283 | */ | ||
284 | #define O2CB_MAP_STABILIZE_COUNT 60 | ||
285 | for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { | ||
286 | o2hb_fill_node_map(hbmap, sizeof(hbmap)); | ||
287 | if (!test_bit(node_num, hbmap)) { | ||
288 | printk(KERN_ERR "o2cb: %s heartbeat has not been " | ||
289 | "started.\n", (o2hb_global_heartbeat_active() ? | ||
290 | "Global" : "Local")); | ||
291 | return -EINVAL; | ||
292 | } | ||
293 | o2net_fill_node_map(netmap, sizeof(netmap)); | ||
294 | /* Force set the current node to allow easy compare */ | ||
295 | set_bit(node_num, netmap); | ||
296 | if (!memcmp(hbmap, netmap, sizeof(hbmap))) | ||
297 | return 0; | ||
298 | if (i < O2CB_MAP_STABILIZE_COUNT) | ||
299 | msleep(1000); | ||
300 | } | ||
301 | |||
302 | printk(KERN_ERR "o2cb: This node could not connect to nodes:"); | ||
303 | i = -1; | ||
304 | while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, | ||
305 | i + 1)) < O2NM_MAX_NODES) { | ||
306 | if (!test_bit(i, netmap)) | ||
307 | printk(" %u", i); | ||
308 | } | ||
309 | printk(".\n"); | ||
310 | |||
311 | return -ENOTCONN; | ||
312 | } | ||
313 | |||
314 | /* | ||
259 | * Called from the dlm when it's about to evict a node. This is how the | 315 | * Called from the dlm when it's about to evict a node. This is how the |
260 | * classic stack signals node death. | 316 | * classic stack signals node death. |
261 | */ | 317 | */ |
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data) | |||
263 | { | 319 | { |
264 | struct ocfs2_cluster_connection *conn = data; | 320 | struct ocfs2_cluster_connection *conn = data; |
265 | 321 | ||
266 | mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", | 322 | printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", |
267 | node_num, conn->cc_namelen, conn->cc_name); | 323 | node_num, conn->cc_namelen, conn->cc_name); |
268 | 324 | ||
269 | conn->cc_recovery_handler(node_num, conn->cc_recovery_data); | 325 | conn->cc_recovery_handler(node_num, conn->cc_recovery_data); |
270 | } | 326 | } |
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
280 | BUG_ON(conn == NULL); | 336 | BUG_ON(conn == NULL); |
281 | BUG_ON(conn->cc_proto == NULL); | 337 | BUG_ON(conn->cc_proto == NULL); |
282 | 338 | ||
283 | /* for now we only have one cluster/node, make sure we see it | 339 | /* Ensure cluster stack is up and all nodes are connected */ |
284 | * in the heartbeat universe */ | 340 | rc = o2cb_cluster_check(); |
285 | if (!o2hb_check_local_node_heartbeating()) { | 341 | if (rc) { |
286 | if (o2hb_global_heartbeat_active()) | 342 | printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " |
287 | mlog(ML_ERROR, "Global heartbeat not started\n"); | 343 | "before retrying.\n"); |
288 | rc = -EINVAL; | ||
289 | goto out; | 344 | goto out; |
290 | } | 345 | } |
291 | 346 | ||
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 56f61027236b..4994f8b0e604 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include "ocfs1_fs_compat.h" | 54 | #include "ocfs1_fs_compat.h" |
55 | 55 | ||
56 | #include "alloc.h" | 56 | #include "alloc.h" |
57 | #include "aops.h" | ||
57 | #include "blockcheck.h" | 58 | #include "blockcheck.h" |
58 | #include "dlmglue.h" | 59 | #include "dlmglue.h" |
59 | #include "export.h" | 60 | #include "export.h" |
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
1107 | 1108 | ||
1108 | ocfs2_set_ro_flag(osb, 1); | 1109 | ocfs2_set_ro_flag(osb, 1); |
1109 | 1110 | ||
1110 | printk(KERN_NOTICE "Readonly device detected. No cluster " | 1111 | printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " |
1111 | "services will be utilized for this mount. Recovery " | 1112 | "Cluster services will not be used for this mount. " |
1112 | "will be skipped.\n"); | 1113 | "Recovery will be skipped.\n", osb->dev_str); |
1113 | } | 1114 | } |
1114 | 1115 | ||
1115 | if (!ocfs2_is_hard_readonly(osb)) { | 1116 | if (!ocfs2_is_hard_readonly(osb)) { |
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1616 | return 0; | 1617 | return 0; |
1617 | } | 1618 | } |
1618 | 1619 | ||
1620 | wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; | ||
1621 | |||
1619 | static int __init ocfs2_init(void) | 1622 | static int __init ocfs2_init(void) |
1620 | { | 1623 | { |
1621 | int status; | 1624 | int status, i; |
1622 | 1625 | ||
1623 | ocfs2_print_version(); | 1626 | ocfs2_print_version(); |
1624 | 1627 | ||
1628 | for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) | ||
1629 | init_waitqueue_head(&ocfs2__ioend_wq[i]); | ||
1630 | |||
1625 | status = init_ocfs2_uptodate_cache(); | 1631 | status = init_ocfs2_uptodate_cache(); |
1626 | if (status < 0) { | 1632 | if (status < 0) { |
1627 | mlog_errno(status); | 1633 | mlog_errno(status); |
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data) | |||
1760 | ocfs2_extent_map_init(&oi->vfs_inode); | 1766 | ocfs2_extent_map_init(&oi->vfs_inode); |
1761 | INIT_LIST_HEAD(&oi->ip_io_markers); | 1767 | INIT_LIST_HEAD(&oi->ip_io_markers); |
1762 | oi->ip_dir_start_lookup = 0; | 1768 | oi->ip_dir_start_lookup = 0; |
1763 | 1769 | atomic_set(&oi->ip_unaligned_aio, 0); | |
1764 | init_rwsem(&oi->ip_alloc_sem); | 1770 | init_rwsem(&oi->ip_alloc_sem); |
1765 | init_rwsem(&oi->ip_xattr_sem); | 1771 | init_rwsem(&oi->ip_xattr_sem); |
1766 | mutex_init(&oi->ip_io_mutex); | 1772 | mutex_init(&oi->ip_io_mutex); |
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1974 | * If we failed before we got a uuid_str yet, we can't stop | 1980 | * If we failed before we got a uuid_str yet, we can't stop |
1975 | * heartbeat. Otherwise, do it. | 1981 | * heartbeat. Otherwise, do it. |
1976 | */ | 1982 | */ |
1977 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) | 1983 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && |
1984 | !ocfs2_is_hard_readonly(osb)) | ||
1978 | hangup_needed = 1; | 1985 | hangup_needed = 1; |
1979 | 1986 | ||
1980 | if (osb->cconn) | 1987 | if (osb->cconn) |
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2353 | mlog_errno(status); | 2360 | mlog_errno(status); |
2354 | goto bail; | 2361 | goto bail; |
2355 | } | 2362 | } |
2356 | cleancache_init_shared_fs((char *)&uuid_net_key, sb); | 2363 | cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); |
2357 | 2364 | ||
2358 | bail: | 2365 | bail: |
2359 | return status; | 2366 | return status; |
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) | |||
2462 | goto finally; | 2469 | goto finally; |
2463 | } | 2470 | } |
2464 | } else { | 2471 | } else { |
2465 | mlog(ML_NOTICE, "File system was not unmounted cleanly, " | 2472 | printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " |
2466 | "recovering volume.\n"); | 2473 | "unmounted cleanly, recovering it.\n", osb->dev_str); |
2467 | } | 2474 | } |
2468 | 2475 | ||
2469 | local = ocfs2_mount_local(osb); | 2476 | local = ocfs2_mount_local(osb); |
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 194fb22ef79d..aa9e8777b09a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode, | |||
2376 | } | 2376 | } |
2377 | 2377 | ||
2378 | ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); | 2378 | ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); |
2379 | if (ret < 0) { | ||
2380 | mlog_errno(ret); | ||
2381 | break; | ||
2382 | } | ||
2383 | 2379 | ||
2384 | ocfs2_commit_trans(osb, ctxt.handle); | 2380 | ocfs2_commit_trans(osb, ctxt.handle); |
2385 | if (ctxt.meta_ac) { | 2381 | if (ctxt.meta_ac) { |
2386 | ocfs2_free_alloc_context(ctxt.meta_ac); | 2382 | ocfs2_free_alloc_context(ctxt.meta_ac); |
2387 | ctxt.meta_ac = NULL; | 2383 | ctxt.meta_ac = NULL; |
2388 | } | 2384 | } |
2385 | |||
2386 | if (ret < 0) { | ||
2387 | mlog_errno(ret); | ||
2388 | break; | ||
2389 | } | ||
2390 | |||
2389 | } | 2391 | } |
2390 | 2392 | ||
2391 | if (ctxt.meta_ac) | 2393 | if (ctxt.meta_ac) |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 586174168e2a..80e4645f7990 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
131 | K(i.freeswap), | 131 | K(i.freeswap), |
132 | K(global_page_state(NR_FILE_DIRTY)), | 132 | K(global_page_state(NR_FILE_DIRTY)), |
133 | K(global_page_state(NR_WRITEBACK)), | 133 | K(global_page_state(NR_WRITEBACK)), |
134 | K(global_page_state(NR_ANON_PAGES) | ||
135 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 134 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
135 | K(global_page_state(NR_ANON_PAGES) | ||
136 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | 136 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * |
137 | HPAGE_PMD_NR | 137 | HPAGE_PMD_NR), |
138 | #else | ||
139 | K(global_page_state(NR_ANON_PAGES)), | ||
138 | #endif | 140 | #endif |
139 | ), | ||
140 | K(global_page_state(NR_FILE_MAPPED)), | 141 | K(global_page_state(NR_FILE_MAPPED)), |
141 | K(global_page_state(NR_SHMEM)), | 142 | K(global_page_state(NR_SHMEM)), |
142 | K(global_page_state(NR_SLAB_RECLAIMABLE) + | 143 | K(global_page_state(NR_SLAB_RECLAIMABLE) + |
diff --git a/fs/proc/root.c b/fs/proc/root.c index 9a8a2b77b874..03102d978180 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
@@ -91,20 +91,18 @@ static struct file_system_type proc_fs_type = { | |||
91 | 91 | ||
92 | void __init proc_root_init(void) | 92 | void __init proc_root_init(void) |
93 | { | 93 | { |
94 | struct vfsmount *mnt; | ||
95 | int err; | 94 | int err; |
96 | 95 | ||
97 | proc_init_inodecache(); | 96 | proc_init_inodecache(); |
98 | err = register_filesystem(&proc_fs_type); | 97 | err = register_filesystem(&proc_fs_type); |
99 | if (err) | 98 | if (err) |
100 | return; | 99 | return; |
101 | mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); | 100 | err = pid_ns_prepare_proc(&init_pid_ns); |
102 | if (IS_ERR(mnt)) { | 101 | if (err) { |
103 | unregister_filesystem(&proc_fs_type); | 102 | unregister_filesystem(&proc_fs_type); |
104 | return; | 103 | return; |
105 | } | 104 | } |
106 | 105 | ||
107 | init_pid_ns.proc_mnt = mnt; | ||
108 | proc_symlink("mounts", NULL, "self/mounts"); | 106 | proc_symlink("mounts", NULL, "self/mounts"); |
109 | 107 | ||
110 | proc_net_init(); | 108 | proc_net_init(); |
@@ -209,5 +207,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) | |||
209 | 207 | ||
210 | void pid_ns_release_proc(struct pid_namespace *ns) | 208 | void pid_ns_release_proc(struct pid_namespace *ns) |
211 | { | 209 | { |
212 | mntput(ns->proc_mnt); | 210 | kern_unmount(ns->proc_mnt); |
213 | } | 211 | } |
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 42b274da92c3..2a30d67dd6b8 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c | |||
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) | |||
32 | idle = kstat_cpu(cpu).cpustat.idle; | 32 | idle = kstat_cpu(cpu).cpustat.idle; |
33 | idle = cputime64_add(idle, arch_idle_time(cpu)); | 33 | idle = cputime64_add(idle, arch_idle_time(cpu)); |
34 | } else | 34 | } else |
35 | idle = usecs_to_cputime(idle_time); | 35 | idle = nsecs_to_jiffies64(1000 * idle_time); |
36 | 36 | ||
37 | return idle; | 37 | return idle; |
38 | } | 38 | } |
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) | |||
46 | /* !NO_HZ so we can rely on cpustat.iowait */ | 46 | /* !NO_HZ so we can rely on cpustat.iowait */ |
47 | iowait = kstat_cpu(cpu).cpustat.iowait; | 47 | iowait = kstat_cpu(cpu).cpustat.iowait; |
48 | else | 48 | else |
49 | iowait = usecs_to_cputime(iowait_time); | 49 | iowait = nsecs_to_jiffies64(1000 * iowait_time); |
50 | 50 | ||
51 | return iowait; | 51 | return iowait; |
52 | } | 52 | } |
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2bd620f0d796..57bbf9078ac8 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c | |||
@@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi) | |||
167 | } | 167 | } |
168 | 168 | ||
169 | psinfo = psi; | 169 | psinfo = psi; |
170 | mutex_init(&psinfo->read_mutex); | ||
170 | spin_unlock(&pstore_lock); | 171 | spin_unlock(&pstore_lock); |
171 | 172 | ||
172 | if (owner && !try_module_get(owner)) { | 173 | if (owner && !try_module_get(owner)) { |
@@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register); | |||
195 | void pstore_get_records(int quiet) | 196 | void pstore_get_records(int quiet) |
196 | { | 197 | { |
197 | struct pstore_info *psi = psinfo; | 198 | struct pstore_info *psi = psinfo; |
199 | char *buf = NULL; | ||
198 | ssize_t size; | 200 | ssize_t size; |
199 | u64 id; | 201 | u64 id; |
200 | enum pstore_type_id type; | 202 | enum pstore_type_id type; |
201 | struct timespec time; | 203 | struct timespec time; |
202 | int failed = 0, rc; | 204 | int failed = 0, rc; |
203 | unsigned long flags; | ||
204 | 205 | ||
205 | if (!psi) | 206 | if (!psi) |
206 | return; | 207 | return; |
207 | 208 | ||
208 | spin_lock_irqsave(&psinfo->buf_lock, flags); | 209 | mutex_lock(&psi->read_mutex); |
209 | rc = psi->open(psi); | 210 | rc = psi->open(psi); |
210 | if (rc) | 211 | if (rc) |
211 | goto out; | 212 | goto out; |
212 | 213 | ||
213 | while ((size = psi->read(&id, &type, &time, psi)) > 0) { | 214 | while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { |
214 | rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, | 215 | rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, |
215 | time, psi); | 216 | time, psi); |
217 | kfree(buf); | ||
218 | buf = NULL; | ||
216 | if (rc && (rc != -EEXIST || !quiet)) | 219 | if (rc && (rc != -EEXIST || !quiet)) |
217 | failed++; | 220 | failed++; |
218 | } | 221 | } |
219 | psi->close(psi); | 222 | psi->close(psi); |
220 | out: | 223 | out: |
221 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); | 224 | mutex_unlock(&psi->read_mutex); |
222 | 225 | ||
223 | if (failed) | 226 | if (failed) |
224 | printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", | 227 | printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", |
diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c95..dba43c3ea3af 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path); | |||
449 | 449 | ||
450 | /* | 450 | /* |
451 | * Same as seq_path, but relative to supplied root. | 451 | * Same as seq_path, but relative to supplied root. |
452 | * | ||
453 | * root may be changed, see __d_path(). | ||
454 | */ | 452 | */ |
455 | int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | 453 | int seq_path_root(struct seq_file *m, struct path *path, struct path *root, |
456 | char *esc) | 454 | char *esc) |
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | |||
463 | char *p; | 461 | char *p; |
464 | 462 | ||
465 | p = __d_path(path, root, buf, size); | 463 | p = __d_path(path, root, buf, size); |
464 | if (!p) | ||
465 | return SEQ_SKIP; | ||
466 | res = PTR_ERR(p); | 466 | res = PTR_ERR(p); |
467 | if (!IS_ERR(p)) { | 467 | if (!IS_ERR(p)) { |
468 | char *end = mangle_path(buf, p, esc); | 468 | char *end = mangle_path(buf, p, esc); |
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | |||
474 | } | 474 | } |
475 | seq_commit(m, res); | 475 | seq_commit(m, res); |
476 | 476 | ||
477 | return res < 0 ? res : 0; | 477 | return res < 0 && res != -ENAMETOOLONG ? res : 0; |
478 | } | 478 | } |
479 | 479 | ||
480 | /* | 480 | /* |
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 20403dc5d437..ae0e76bb6ebf 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
@@ -2264,19 +2264,12 @@ static int __init ubifs_init(void) | |||
2264 | return -EINVAL; | 2264 | return -EINVAL; |
2265 | } | 2265 | } |
2266 | 2266 | ||
2267 | err = register_filesystem(&ubifs_fs_type); | ||
2268 | if (err) { | ||
2269 | ubifs_err("cannot register file system, error %d", err); | ||
2270 | return err; | ||
2271 | } | ||
2272 | |||
2273 | err = -ENOMEM; | ||
2274 | ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", | 2267 | ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", |
2275 | sizeof(struct ubifs_inode), 0, | 2268 | sizeof(struct ubifs_inode), 0, |
2276 | SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, | 2269 | SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, |
2277 | &inode_slab_ctor); | 2270 | &inode_slab_ctor); |
2278 | if (!ubifs_inode_slab) | 2271 | if (!ubifs_inode_slab) |
2279 | goto out_reg; | 2272 | return -ENOMEM; |
2280 | 2273 | ||
2281 | register_shrinker(&ubifs_shrinker_info); | 2274 | register_shrinker(&ubifs_shrinker_info); |
2282 | 2275 | ||
@@ -2288,15 +2281,20 @@ static int __init ubifs_init(void) | |||
2288 | if (err) | 2281 | if (err) |
2289 | goto out_compr; | 2282 | goto out_compr; |
2290 | 2283 | ||
2284 | err = register_filesystem(&ubifs_fs_type); | ||
2285 | if (err) { | ||
2286 | ubifs_err("cannot register file system, error %d", err); | ||
2287 | goto out_dbg; | ||
2288 | } | ||
2291 | return 0; | 2289 | return 0; |
2292 | 2290 | ||
2291 | out_dbg: | ||
2292 | dbg_debugfs_exit(); | ||
2293 | out_compr: | 2293 | out_compr: |
2294 | ubifs_compressors_exit(); | 2294 | ubifs_compressors_exit(); |
2295 | out_shrinker: | 2295 | out_shrinker: |
2296 | unregister_shrinker(&ubifs_shrinker_info); | 2296 | unregister_shrinker(&ubifs_shrinker_info); |
2297 | kmem_cache_destroy(ubifs_inode_slab); | 2297 | kmem_cache_destroy(ubifs_inode_slab); |
2298 | out_reg: | ||
2299 | unregister_filesystem(&ubifs_fs_type); | ||
2300 | return err; | 2298 | return err; |
2301 | } | 2299 | } |
2302 | /* late_initcall to let compressors initialize first */ | 2300 | /* late_initcall to let compressors initialize first */ |
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6c4b3795c4a..76e4266d2e7e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c | |||
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp) | |||
42 | int count, i; | 42 | int count, i; |
43 | 43 | ||
44 | count = be32_to_cpu(aclp->acl_cnt); | 44 | count = be32_to_cpu(aclp->acl_cnt); |
45 | if (count > XFS_ACL_MAX_ENTRIES) | ||
46 | return ERR_PTR(-EFSCORRUPTED); | ||
45 | 47 | ||
46 | acl = posix_acl_alloc(count, GFP_KERNEL); | 48 | acl = posix_acl_alloc(count, GFP_KERNEL); |
47 | if (!acl) | 49 | if (!acl) |
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d4906e7c9787..c1b55e596551 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) | |||
110 | /* | 110 | /* |
111 | * Query whether the requested number of additional bytes of extended | 111 | * Query whether the requested number of additional bytes of extended |
112 | * attribute space will be able to fit inline. | 112 | * attribute space will be able to fit inline. |
113 | * | ||
113 | * Returns zero if not, else the di_forkoff fork offset to be used in the | 114 | * Returns zero if not, else the di_forkoff fork offset to be used in the |
114 | * literal area for attribute data once the new bytes have been added. | 115 | * literal area for attribute data once the new bytes have been added. |
115 | * | 116 | * |
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
122 | int offset; | 123 | int offset; |
123 | int minforkoff; /* lower limit on valid forkoff locations */ | 124 | int minforkoff; /* lower limit on valid forkoff locations */ |
124 | int maxforkoff; /* upper limit on valid forkoff locations */ | 125 | int maxforkoff; /* upper limit on valid forkoff locations */ |
125 | int dsize; | 126 | int dsize; |
126 | xfs_mount_t *mp = dp->i_mount; | 127 | xfs_mount_t *mp = dp->i_mount; |
127 | 128 | ||
128 | offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ | 129 | offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ |
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
136 | return (offset >= minforkoff) ? minforkoff : 0; | 137 | return (offset >= minforkoff) ? minforkoff : 0; |
137 | } | 138 | } |
138 | 139 | ||
139 | if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { | 140 | /* |
140 | if (bytes <= XFS_IFORK_ASIZE(dp)) | 141 | * If the requested numbers of bytes is smaller or equal to the |
141 | return dp->i_d.di_forkoff; | 142 | * current attribute fork size we can always proceed. |
143 | * | ||
144 | * Note that if_bytes in the data fork might actually be larger than | ||
145 | * the current data fork size is due to delalloc extents. In that | ||
146 | * case either the extent count will go down when they are converted | ||
147 | * to real extents, or the delalloc conversion will take care of the | ||
148 | * literal area rebalancing. | ||
149 | */ | ||
150 | if (bytes <= XFS_IFORK_ASIZE(dp)) | ||
151 | return dp->i_d.di_forkoff; | ||
152 | |||
153 | /* | ||
154 | * For attr2 we can try to move the forkoff if there is space in the | ||
155 | * literal area, but for the old format we are done if there is no | ||
156 | * space in the fixed attribute fork. | ||
157 | */ | ||
158 | if (!(mp->m_flags & XFS_MOUNT_ATTR2)) | ||
142 | return 0; | 159 | return 0; |
143 | } | ||
144 | 160 | ||
145 | dsize = dp->i_df.if_bytes; | 161 | dsize = dp->i_df.if_bytes; |
146 | 162 | ||
147 | switch (dp->i_d.di_format) { | 163 | switch (dp->i_d.di_format) { |
148 | case XFS_DINODE_FMT_EXTENTS: | 164 | case XFS_DINODE_FMT_EXTENTS: |
149 | /* | 165 | /* |
150 | * If there is no attr fork and the data fork is extents, | 166 | * If there is no attr fork and the data fork is extents, |
151 | * determine if creating the default attr fork will result | 167 | * determine if creating the default attr fork will result |
152 | * in the extents form migrating to btree. If so, the | 168 | * in the extents form migrating to btree. If so, the |
153 | * minimum offset only needs to be the space required for | 169 | * minimum offset only needs to be the space required for |
154 | * the btree root. | 170 | * the btree root. |
155 | */ | 171 | */ |
156 | if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > | 172 | if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > |
157 | xfs_default_attroffset(dp)) | 173 | xfs_default_attroffset(dp)) |
158 | dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); | 174 | dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); |
159 | break; | 175 | break; |
160 | |||
161 | case XFS_DINODE_FMT_BTREE: | 176 | case XFS_DINODE_FMT_BTREE: |
162 | /* | 177 | /* |
163 | * If have data btree then keep forkoff if we have one, | 178 | * If we have a data btree then keep forkoff if we have one, |
164 | * otherwise we are adding a new attr, so then we set | 179 | * otherwise we are adding a new attr, so then we set |
165 | * minforkoff to where the btree root can finish so we have | 180 | * minforkoff to where the btree root can finish so we have |
166 | * plenty of room for attrs | 181 | * plenty of room for attrs |
167 | */ | 182 | */ |
168 | if (dp->i_d.di_forkoff) { | 183 | if (dp->i_d.di_forkoff) { |
169 | if (offset < dp->i_d.di_forkoff) | 184 | if (offset < dp->i_d.di_forkoff) |
170 | return 0; | 185 | return 0; |
171 | else | 186 | return dp->i_d.di_forkoff; |
172 | return dp->i_d.di_forkoff; | 187 | } |
173 | } else | 188 | dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); |
174 | dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); | ||
175 | break; | 189 | break; |
176 | } | 190 | } |
177 | 191 | ||
178 | /* | 192 | /* |
179 | * A data fork btree root must have space for at least | 193 | * A data fork btree root must have space for at least |
180 | * MINDBTPTRS key/ptr pairs if the data fork is small or empty. | 194 | * MINDBTPTRS key/ptr pairs if the data fork is small or empty. |
181 | */ | 195 | */ |
182 | minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); | 196 | minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); |
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
186 | maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); | 200 | maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); |
187 | maxforkoff = maxforkoff >> 3; /* rounded down */ | 201 | maxforkoff = maxforkoff >> 3; /* rounded down */ |
188 | 202 | ||
189 | if (offset >= minforkoff && offset < maxforkoff) | ||
190 | return offset; | ||
191 | if (offset >= maxforkoff) | 203 | if (offset >= maxforkoff) |
192 | return maxforkoff; | 204 | return maxforkoff; |
205 | if (offset >= minforkoff) | ||
206 | return offset; | ||
193 | return 0; | 207 | return 0; |
194 | } | 208 | } |
195 | 209 | ||
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c68baeb0974a..d0ab78837057 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -2383,6 +2383,8 @@ xfs_bmap_btalloc( | |||
2383 | int tryagain; | 2383 | int tryagain; |
2384 | int error; | 2384 | int error; |
2385 | 2385 | ||
2386 | ASSERT(ap->length); | ||
2387 | |||
2386 | mp = ap->ip->i_mount; | 2388 | mp = ap->ip->i_mount; |
2387 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; | 2389 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; |
2388 | if (unlikely(align)) { | 2390 | if (unlikely(align)) { |
@@ -4629,6 +4631,8 @@ xfs_bmapi_allocate( | |||
4629 | int error; | 4631 | int error; |
4630 | int rt; | 4632 | int rt; |
4631 | 4633 | ||
4634 | ASSERT(bma->length > 0); | ||
4635 | |||
4632 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); | 4636 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); |
4633 | 4637 | ||
4634 | /* | 4638 | /* |
@@ -4849,6 +4853,7 @@ xfs_bmapi_write( | |||
4849 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); | 4853 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); |
4850 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); | 4854 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); |
4851 | ASSERT(tp != NULL); | 4855 | ASSERT(tp != NULL); |
4856 | ASSERT(len > 0); | ||
4852 | 4857 | ||
4853 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | 4858 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? |
4854 | XFS_ATTR_FORK : XFS_DATA_FORK; | 4859 | XFS_ATTR_FORK : XFS_DATA_FORK; |
@@ -4918,9 +4923,22 @@ xfs_bmapi_write( | |||
4918 | bma.eof = eof; | 4923 | bma.eof = eof; |
4919 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); | 4924 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); |
4920 | bma.wasdel = wasdelay; | 4925 | bma.wasdel = wasdelay; |
4921 | bma.length = len; | ||
4922 | bma.offset = bno; | 4926 | bma.offset = bno; |
4923 | 4927 | ||
4928 | /* | ||
4929 | * There's a 32/64 bit type mismatch between the | ||
4930 | * allocation length request (which can be 64 bits in | ||
4931 | * length) and the bma length request, which is | ||
4932 | * xfs_extlen_t and therefore 32 bits. Hence we have to | ||
4933 | * check for 32-bit overflows and handle them here. | ||
4934 | */ | ||
4935 | if (len > (xfs_filblks_t)MAXEXTLEN) | ||
4936 | bma.length = MAXEXTLEN; | ||
4937 | else | ||
4938 | bma.length = len; | ||
4939 | |||
4940 | ASSERT(len > 0); | ||
4941 | ASSERT(bma.length > 0); | ||
4924 | error = xfs_bmapi_allocate(&bma, flags); | 4942 | error = xfs_bmapi_allocate(&bma, flags); |
4925 | if (error) | 4943 | if (error) |
4926 | goto error0; | 4944 | goto error0; |
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index da108977b21f..558910f5e3c0 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c | |||
@@ -98,22 +98,22 @@ xfs_fs_encode_fh( | |||
98 | switch (fileid_type) { | 98 | switch (fileid_type) { |
99 | case FILEID_INO32_GEN_PARENT: | 99 | case FILEID_INO32_GEN_PARENT: |
100 | spin_lock(&dentry->d_lock); | 100 | spin_lock(&dentry->d_lock); |
101 | fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; | 101 | fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; |
102 | fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; | 102 | fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; |
103 | spin_unlock(&dentry->d_lock); | 103 | spin_unlock(&dentry->d_lock); |
104 | /*FALLTHRU*/ | 104 | /*FALLTHRU*/ |
105 | case FILEID_INO32_GEN: | 105 | case FILEID_INO32_GEN: |
106 | fid->i32.ino = inode->i_ino; | 106 | fid->i32.ino = XFS_I(inode)->i_ino; |
107 | fid->i32.gen = inode->i_generation; | 107 | fid->i32.gen = inode->i_generation; |
108 | break; | 108 | break; |
109 | case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: | 109 | case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: |
110 | spin_lock(&dentry->d_lock); | 110 | spin_lock(&dentry->d_lock); |
111 | fid64->parent_ino = dentry->d_parent->d_inode->i_ino; | 111 | fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; |
112 | fid64->parent_gen = dentry->d_parent->d_inode->i_generation; | 112 | fid64->parent_gen = dentry->d_parent->d_inode->i_generation; |
113 | spin_unlock(&dentry->d_lock); | 113 | spin_unlock(&dentry->d_lock); |
114 | /*FALLTHRU*/ | 114 | /*FALLTHRU*/ |
115 | case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: | 115 | case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: |
116 | fid64->ino = inode->i_ino; | 116 | fid64->ino = XFS_I(inode)->i_ino; |
117 | fid64->gen = inode->i_generation; | 117 | fid64->gen = inode->i_generation; |
118 | break; | 118 | break; |
119 | } | 119 | } |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0237c602f11..755ee8164880 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -2835,6 +2835,27 @@ corrupt_out: | |||
2835 | return XFS_ERROR(EFSCORRUPTED); | 2835 | return XFS_ERROR(EFSCORRUPTED); |
2836 | } | 2836 | } |
2837 | 2837 | ||
2838 | void | ||
2839 | xfs_promote_inode( | ||
2840 | struct xfs_inode *ip) | ||
2841 | { | ||
2842 | struct xfs_buf *bp; | ||
2843 | |||
2844 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | ||
2845 | |||
2846 | bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, | ||
2847 | ip->i_imap.im_len, XBF_TRYLOCK); | ||
2848 | if (!bp) | ||
2849 | return; | ||
2850 | |||
2851 | if (XFS_BUF_ISDELAYWRITE(bp)) { | ||
2852 | xfs_buf_delwri_promote(bp); | ||
2853 | wake_up_process(ip->i_mount->m_ddev_targp->bt_task); | ||
2854 | } | ||
2855 | |||
2856 | xfs_buf_relse(bp); | ||
2857 | } | ||
2858 | |||
2838 | /* | 2859 | /* |
2839 | * Return a pointer to the extent record at file index idx. | 2860 | * Return a pointer to the extent record at file index idx. |
2840 | */ | 2861 | */ |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 760140d1dd66..b4cd4739f98e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); | |||
498 | void xfs_iext_realloc(xfs_inode_t *, int, int); | 498 | void xfs_iext_realloc(xfs_inode_t *, int, int); |
499 | void xfs_iunpin_wait(xfs_inode_t *); | 499 | void xfs_iunpin_wait(xfs_inode_t *); |
500 | int xfs_iflush(xfs_inode_t *, uint); | 500 | int xfs_iflush(xfs_inode_t *, uint); |
501 | void xfs_promote_inode(struct xfs_inode *); | ||
501 | void xfs_lock_inodes(xfs_inode_t **, int, uint); | 502 | void xfs_lock_inodes(xfs_inode_t **, int, uint); |
502 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); | 503 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); |
503 | 504 | ||
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a14cd89fe465..34817adf4b9e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -150,6 +150,117 @@ xlog_grant_add_space( | |||
150 | } while (head_val != old); | 150 | } while (head_val != old); |
151 | } | 151 | } |
152 | 152 | ||
153 | STATIC bool | ||
154 | xlog_reserveq_wake( | ||
155 | struct log *log, | ||
156 | int *free_bytes) | ||
157 | { | ||
158 | struct xlog_ticket *tic; | ||
159 | int need_bytes; | ||
160 | |||
161 | list_for_each_entry(tic, &log->l_reserveq, t_queue) { | ||
162 | if (tic->t_flags & XLOG_TIC_PERM_RESERV) | ||
163 | need_bytes = tic->t_unit_res * tic->t_cnt; | ||
164 | else | ||
165 | need_bytes = tic->t_unit_res; | ||
166 | |||
167 | if (*free_bytes < need_bytes) | ||
168 | return false; | ||
169 | *free_bytes -= need_bytes; | ||
170 | |||
171 | trace_xfs_log_grant_wake_up(log, tic); | ||
172 | wake_up(&tic->t_wait); | ||
173 | } | ||
174 | |||
175 | return true; | ||
176 | } | ||
177 | |||
178 | STATIC bool | ||
179 | xlog_writeq_wake( | ||
180 | struct log *log, | ||
181 | int *free_bytes) | ||
182 | { | ||
183 | struct xlog_ticket *tic; | ||
184 | int need_bytes; | ||
185 | |||
186 | list_for_each_entry(tic, &log->l_writeq, t_queue) { | ||
187 | ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); | ||
188 | |||
189 | need_bytes = tic->t_unit_res; | ||
190 | |||
191 | if (*free_bytes < need_bytes) | ||
192 | return false; | ||
193 | *free_bytes -= need_bytes; | ||
194 | |||
195 | trace_xfs_log_regrant_write_wake_up(log, tic); | ||
196 | wake_up(&tic->t_wait); | ||
197 | } | ||
198 | |||
199 | return true; | ||
200 | } | ||
201 | |||
202 | STATIC int | ||
203 | xlog_reserveq_wait( | ||
204 | struct log *log, | ||
205 | struct xlog_ticket *tic, | ||
206 | int need_bytes) | ||
207 | { | ||
208 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
209 | |||
210 | do { | ||
211 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
212 | goto shutdown; | ||
213 | xlog_grant_push_ail(log, need_bytes); | ||
214 | |||
215 | XFS_STATS_INC(xs_sleep_logspace); | ||
216 | trace_xfs_log_grant_sleep(log, tic); | ||
217 | |||
218 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
219 | trace_xfs_log_grant_wake(log, tic); | ||
220 | |||
221 | spin_lock(&log->l_grant_reserve_lock); | ||
222 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
223 | goto shutdown; | ||
224 | } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); | ||
225 | |||
226 | list_del_init(&tic->t_queue); | ||
227 | return 0; | ||
228 | shutdown: | ||
229 | list_del_init(&tic->t_queue); | ||
230 | return XFS_ERROR(EIO); | ||
231 | } | ||
232 | |||
233 | STATIC int | ||
234 | xlog_writeq_wait( | ||
235 | struct log *log, | ||
236 | struct xlog_ticket *tic, | ||
237 | int need_bytes) | ||
238 | { | ||
239 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
240 | |||
241 | do { | ||
242 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
243 | goto shutdown; | ||
244 | xlog_grant_push_ail(log, need_bytes); | ||
245 | |||
246 | XFS_STATS_INC(xs_sleep_logspace); | ||
247 | trace_xfs_log_regrant_write_sleep(log, tic); | ||
248 | |||
249 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
250 | trace_xfs_log_regrant_write_wake(log, tic); | ||
251 | |||
252 | spin_lock(&log->l_grant_write_lock); | ||
253 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
254 | goto shutdown; | ||
255 | } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); | ||
256 | |||
257 | list_del_init(&tic->t_queue); | ||
258 | return 0; | ||
259 | shutdown: | ||
260 | list_del_init(&tic->t_queue); | ||
261 | return XFS_ERROR(EIO); | ||
262 | } | ||
263 | |||
153 | static void | 264 | static void |
154 | xlog_tic_reset_res(xlog_ticket_t *tic) | 265 | xlog_tic_reset_res(xlog_ticket_t *tic) |
155 | { | 266 | { |
@@ -350,8 +461,19 @@ xfs_log_reserve( | |||
350 | retval = xlog_grant_log_space(log, internal_ticket); | 461 | retval = xlog_grant_log_space(log, internal_ticket); |
351 | } | 462 | } |
352 | 463 | ||
464 | if (unlikely(retval)) { | ||
465 | /* | ||
466 | * If we are failing, make sure the ticket doesn't have any | ||
467 | * current reservations. We don't want to add this back | ||
468 | * when the ticket/ transaction gets cancelled. | ||
469 | */ | ||
470 | internal_ticket->t_curr_res = 0; | ||
471 | /* ungrant will give back unit_res * t_cnt. */ | ||
472 | internal_ticket->t_cnt = 0; | ||
473 | } | ||
474 | |||
353 | return retval; | 475 | return retval; |
354 | } /* xfs_log_reserve */ | 476 | } |
355 | 477 | ||
356 | 478 | ||
357 | /* | 479 | /* |
@@ -2481,8 +2603,8 @@ restart: | |||
2481 | /* | 2603 | /* |
2482 | * Atomically get the log space required for a log ticket. | 2604 | * Atomically get the log space required for a log ticket. |
2483 | * | 2605 | * |
2484 | * Once a ticket gets put onto the reserveq, it will only return after | 2606 | * Once a ticket gets put onto the reserveq, it will only return after the |
2485 | * the needed reservation is satisfied. | 2607 | * needed reservation is satisfied. |
2486 | * | 2608 | * |
2487 | * This function is structured so that it has a lock free fast path. This is | 2609 | * This function is structured so that it has a lock free fast path. This is |
2488 | * necessary because every new transaction reservation will come through this | 2610 | * necessary because every new transaction reservation will come through this |
@@ -2490,113 +2612,53 @@ restart: | |||
2490 | * every pass. | 2612 | * every pass. |
2491 | * | 2613 | * |
2492 | * As tickets are only ever moved on and off the reserveq under the | 2614 | * As tickets are only ever moved on and off the reserveq under the |
2493 | * l_grant_reserve_lock, we only need to take that lock if we are going | 2615 | * l_grant_reserve_lock, we only need to take that lock if we are going to add |
2494 | * to add the ticket to the queue and sleep. We can avoid taking the lock if the | 2616 | * the ticket to the queue and sleep. We can avoid taking the lock if the ticket |
2495 | * ticket was never added to the reserveq because the t_queue list head will be | 2617 | * was never added to the reserveq because the t_queue list head will be empty |
2496 | * empty and we hold the only reference to it so it can safely be checked | 2618 | * and we hold the only reference to it so it can safely be checked unlocked. |
2497 | * unlocked. | ||
2498 | */ | 2619 | */ |
2499 | STATIC int | 2620 | STATIC int |
2500 | xlog_grant_log_space(xlog_t *log, | 2621 | xlog_grant_log_space( |
2501 | xlog_ticket_t *tic) | 2622 | struct log *log, |
2623 | struct xlog_ticket *tic) | ||
2502 | { | 2624 | { |
2503 | int free_bytes; | 2625 | int free_bytes, need_bytes; |
2504 | int need_bytes; | 2626 | int error = 0; |
2505 | 2627 | ||
2506 | #ifdef DEBUG | 2628 | ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); |
2507 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | ||
2508 | panic("grant Recovery problem"); | ||
2509 | #endif | ||
2510 | 2629 | ||
2511 | trace_xfs_log_grant_enter(log, tic); | 2630 | trace_xfs_log_grant_enter(log, tic); |
2512 | 2631 | ||
2632 | /* | ||
2633 | * If there are other waiters on the queue then give them a chance at | ||
2634 | * logspace before us. Wake up the first waiters, if we do not wake | ||
2635 | * up all the waiters then go to sleep waiting for more free space, | ||
2636 | * otherwise try to get some space for this transaction. | ||
2637 | */ | ||
2513 | need_bytes = tic->t_unit_res; | 2638 | need_bytes = tic->t_unit_res; |
2514 | if (tic->t_flags & XFS_LOG_PERM_RESERV) | 2639 | if (tic->t_flags & XFS_LOG_PERM_RESERV) |
2515 | need_bytes *= tic->t_ocnt; | 2640 | need_bytes *= tic->t_ocnt; |
2516 | |||
2517 | /* something is already sleeping; insert new transaction at end */ | ||
2518 | if (!list_empty_careful(&log->l_reserveq)) { | ||
2519 | spin_lock(&log->l_grant_reserve_lock); | ||
2520 | /* recheck the queue now we are locked */ | ||
2521 | if (list_empty(&log->l_reserveq)) { | ||
2522 | spin_unlock(&log->l_grant_reserve_lock); | ||
2523 | goto redo; | ||
2524 | } | ||
2525 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
2526 | |||
2527 | trace_xfs_log_grant_sleep1(log, tic); | ||
2528 | |||
2529 | /* | ||
2530 | * Gotta check this before going to sleep, while we're | ||
2531 | * holding the grant lock. | ||
2532 | */ | ||
2533 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2534 | goto error_return; | ||
2535 | |||
2536 | XFS_STATS_INC(xs_sleep_logspace); | ||
2537 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
2538 | |||
2539 | /* | ||
2540 | * If we got an error, and the filesystem is shutting down, | ||
2541 | * we'll catch it down below. So just continue... | ||
2542 | */ | ||
2543 | trace_xfs_log_grant_wake1(log, tic); | ||
2544 | } | ||
2545 | |||
2546 | redo: | ||
2547 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2548 | goto error_return_unlocked; | ||
2549 | |||
2550 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); | 2641 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); |
2551 | if (free_bytes < need_bytes) { | 2642 | if (!list_empty_careful(&log->l_reserveq)) { |
2552 | spin_lock(&log->l_grant_reserve_lock); | 2643 | spin_lock(&log->l_grant_reserve_lock); |
2553 | if (list_empty(&tic->t_queue)) | 2644 | if (!xlog_reserveq_wake(log, &free_bytes) || |
2554 | list_add_tail(&tic->t_queue, &log->l_reserveq); | 2645 | free_bytes < need_bytes) |
2555 | 2646 | error = xlog_reserveq_wait(log, tic, need_bytes); | |
2556 | trace_xfs_log_grant_sleep2(log, tic); | 2647 | spin_unlock(&log->l_grant_reserve_lock); |
2557 | 2648 | } else if (free_bytes < need_bytes) { | |
2558 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2559 | goto error_return; | ||
2560 | |||
2561 | xlog_grant_push_ail(log, need_bytes); | ||
2562 | |||
2563 | XFS_STATS_INC(xs_sleep_logspace); | ||
2564 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
2565 | |||
2566 | trace_xfs_log_grant_wake2(log, tic); | ||
2567 | goto redo; | ||
2568 | } | ||
2569 | |||
2570 | if (!list_empty(&tic->t_queue)) { | ||
2571 | spin_lock(&log->l_grant_reserve_lock); | 2649 | spin_lock(&log->l_grant_reserve_lock); |
2572 | list_del_init(&tic->t_queue); | 2650 | error = xlog_reserveq_wait(log, tic, need_bytes); |
2573 | spin_unlock(&log->l_grant_reserve_lock); | 2651 | spin_unlock(&log->l_grant_reserve_lock); |
2574 | } | 2652 | } |
2653 | if (error) | ||
2654 | return error; | ||
2575 | 2655 | ||
2576 | /* we've got enough space */ | ||
2577 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); | 2656 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); |
2578 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | 2657 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); |
2579 | trace_xfs_log_grant_exit(log, tic); | 2658 | trace_xfs_log_grant_exit(log, tic); |
2580 | xlog_verify_grant_tail(log); | 2659 | xlog_verify_grant_tail(log); |
2581 | return 0; | 2660 | return 0; |
2582 | 2661 | } | |
2583 | error_return_unlocked: | ||
2584 | spin_lock(&log->l_grant_reserve_lock); | ||
2585 | error_return: | ||
2586 | list_del_init(&tic->t_queue); | ||
2587 | spin_unlock(&log->l_grant_reserve_lock); | ||
2588 | trace_xfs_log_grant_error(log, tic); | ||
2589 | |||
2590 | /* | ||
2591 | * If we are failing, make sure the ticket doesn't have any | ||
2592 | * current reservations. We don't want to add this back when | ||
2593 | * the ticket/transaction gets cancelled. | ||
2594 | */ | ||
2595 | tic->t_curr_res = 0; | ||
2596 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | ||
2597 | return XFS_ERROR(EIO); | ||
2598 | } /* xlog_grant_log_space */ | ||
2599 | |||
2600 | 2662 | ||
2601 | /* | 2663 | /* |
2602 | * Replenish the byte reservation required by moving the grant write head. | 2664 | * Replenish the byte reservation required by moving the grant write head. |
@@ -2605,10 +2667,12 @@ error_return: | |||
2605 | * free fast path. | 2667 | * free fast path. |
2606 | */ | 2668 | */ |
2607 | STATIC int | 2669 | STATIC int |
2608 | xlog_regrant_write_log_space(xlog_t *log, | 2670 | xlog_regrant_write_log_space( |
2609 | xlog_ticket_t *tic) | 2671 | struct log *log, |
2672 | struct xlog_ticket *tic) | ||
2610 | { | 2673 | { |
2611 | int free_bytes, need_bytes; | 2674 | int free_bytes, need_bytes; |
2675 | int error = 0; | ||
2612 | 2676 | ||
2613 | tic->t_curr_res = tic->t_unit_res; | 2677 | tic->t_curr_res = tic->t_unit_res; |
2614 | xlog_tic_reset_res(tic); | 2678 | xlog_tic_reset_res(tic); |
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
2616 | if (tic->t_cnt > 0) | 2680 | if (tic->t_cnt > 0) |
2617 | return 0; | 2681 | return 0; |
2618 | 2682 | ||
2619 | #ifdef DEBUG | 2683 | ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); |
2620 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | ||
2621 | panic("regrant Recovery problem"); | ||
2622 | #endif | ||
2623 | 2684 | ||
2624 | trace_xfs_log_regrant_write_enter(log, tic); | 2685 | trace_xfs_log_regrant_write_enter(log, tic); |
2625 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2626 | goto error_return_unlocked; | ||
2627 | 2686 | ||
2628 | /* If there are other waiters on the queue then give them a | 2687 | /* |
2629 | * chance at logspace before us. Wake up the first waiters, | 2688 | * If there are other waiters on the queue then give them a chance at |
2630 | * if we do not wake up all the waiters then go to sleep waiting | 2689 | * logspace before us. Wake up the first waiters, if we do not wake |
2631 | * for more free space, otherwise try to get some space for | 2690 | * up all the waiters then go to sleep waiting for more free space, |
2632 | * this transaction. | 2691 | * otherwise try to get some space for this transaction. |
2633 | */ | 2692 | */ |
2634 | need_bytes = tic->t_unit_res; | 2693 | need_bytes = tic->t_unit_res; |
2635 | if (!list_empty_careful(&log->l_writeq)) { | ||
2636 | struct xlog_ticket *ntic; | ||
2637 | |||
2638 | spin_lock(&log->l_grant_write_lock); | ||
2639 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | ||
2640 | list_for_each_entry(ntic, &log->l_writeq, t_queue) { | ||
2641 | ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); | ||
2642 | |||
2643 | if (free_bytes < ntic->t_unit_res) | ||
2644 | break; | ||
2645 | free_bytes -= ntic->t_unit_res; | ||
2646 | wake_up(&ntic->t_wait); | ||
2647 | } | ||
2648 | |||
2649 | if (ntic != list_first_entry(&log->l_writeq, | ||
2650 | struct xlog_ticket, t_queue)) { | ||
2651 | if (list_empty(&tic->t_queue)) | ||
2652 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
2653 | trace_xfs_log_regrant_write_sleep1(log, tic); | ||
2654 | |||
2655 | xlog_grant_push_ail(log, need_bytes); | ||
2656 | |||
2657 | XFS_STATS_INC(xs_sleep_logspace); | ||
2658 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
2659 | trace_xfs_log_regrant_write_wake1(log, tic); | ||
2660 | } else | ||
2661 | spin_unlock(&log->l_grant_write_lock); | ||
2662 | } | ||
2663 | |||
2664 | redo: | ||
2665 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2666 | goto error_return_unlocked; | ||
2667 | |||
2668 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | 2694 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); |
2669 | if (free_bytes < need_bytes) { | 2695 | if (!list_empty_careful(&log->l_writeq)) { |
2670 | spin_lock(&log->l_grant_write_lock); | 2696 | spin_lock(&log->l_grant_write_lock); |
2671 | if (list_empty(&tic->t_queue)) | 2697 | if (!xlog_writeq_wake(log, &free_bytes) || |
2672 | list_add_tail(&tic->t_queue, &log->l_writeq); | 2698 | free_bytes < need_bytes) |
2673 | 2699 | error = xlog_writeq_wait(log, tic, need_bytes); | |
2674 | if (XLOG_FORCED_SHUTDOWN(log)) | 2700 | spin_unlock(&log->l_grant_write_lock); |
2675 | goto error_return; | 2701 | } else if (free_bytes < need_bytes) { |
2676 | |||
2677 | xlog_grant_push_ail(log, need_bytes); | ||
2678 | |||
2679 | XFS_STATS_INC(xs_sleep_logspace); | ||
2680 | trace_xfs_log_regrant_write_sleep2(log, tic); | ||
2681 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
2682 | |||
2683 | trace_xfs_log_regrant_write_wake2(log, tic); | ||
2684 | goto redo; | ||
2685 | } | ||
2686 | |||
2687 | if (!list_empty(&tic->t_queue)) { | ||
2688 | spin_lock(&log->l_grant_write_lock); | 2702 | spin_lock(&log->l_grant_write_lock); |
2689 | list_del_init(&tic->t_queue); | 2703 | error = xlog_writeq_wait(log, tic, need_bytes); |
2690 | spin_unlock(&log->l_grant_write_lock); | 2704 | spin_unlock(&log->l_grant_write_lock); |
2691 | } | 2705 | } |
2692 | 2706 | ||
2693 | /* we've got enough space */ | 2707 | if (error) |
2708 | return error; | ||
2709 | |||
2694 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | 2710 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); |
2695 | trace_xfs_log_regrant_write_exit(log, tic); | 2711 | trace_xfs_log_regrant_write_exit(log, tic); |
2696 | xlog_verify_grant_tail(log); | 2712 | xlog_verify_grant_tail(log); |
2697 | return 0; | 2713 | return 0; |
2698 | 2714 | } | |
2699 | |||
2700 | error_return_unlocked: | ||
2701 | spin_lock(&log->l_grant_write_lock); | ||
2702 | error_return: | ||
2703 | list_del_init(&tic->t_queue); | ||
2704 | spin_unlock(&log->l_grant_write_lock); | ||
2705 | trace_xfs_log_regrant_write_error(log, tic); | ||
2706 | |||
2707 | /* | ||
2708 | * If we are failing, make sure the ticket doesn't have any | ||
2709 | * current reservations. We don't want to add this back when | ||
2710 | * the ticket/transaction gets cancelled. | ||
2711 | */ | ||
2712 | tic->t_curr_res = 0; | ||
2713 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | ||
2714 | return XFS_ERROR(EIO); | ||
2715 | } /* xlog_regrant_write_log_space */ | ||
2716 | |||
2717 | 2715 | ||
2718 | /* The first cnt-1 times through here we don't need to | 2716 | /* The first cnt-1 times through here we don't need to |
2719 | * move the grant write head because the permanent | 2717 | * move the grant write head because the permanent |
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5cff443f6cdb..0bbb1a41998b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c | |||
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one( | |||
674 | * disk and we didn't ask it to allocate; | 674 | * disk and we didn't ask it to allocate; |
675 | * ESRCH if quotas got turned off suddenly. | 675 | * ESRCH if quotas got turned off suddenly. |
676 | */ | 676 | */ |
677 | error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); | 677 | error = xfs_qm_dqget(ip->i_mount, ip, id, type, |
678 | doalloc | XFS_QMOPT_DOWARN, &dqp); | ||
678 | if (error) | 679 | if (error) |
679 | return error; | 680 | return error; |
680 | 681 | ||
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index aa3dc1a4d53d..be5c51d8f757 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c | |||
@@ -770,6 +770,17 @@ restart: | |||
770 | if (!xfs_iflock_nowait(ip)) { | 770 | if (!xfs_iflock_nowait(ip)) { |
771 | if (!(sync_mode & SYNC_WAIT)) | 771 | if (!(sync_mode & SYNC_WAIT)) |
772 | goto out; | 772 | goto out; |
773 | |||
774 | /* | ||
775 | * If we only have a single dirty inode in a cluster there is | ||
776 | * a fair chance that the AIL push may have pushed it into | ||
777 | * the buffer, but xfsbufd won't touch it until 30 seconds | ||
778 | * from now, and thus we will lock up here. | ||
779 | * | ||
780 | * Promote the inode buffer to the front of the delwri list | ||
781 | * and wake up xfsbufd now. | ||
782 | */ | ||
783 | xfs_promote_inode(ip); | ||
773 | xfs_iflock(ip); | 784 | xfs_iflock(ip); |
774 | } | 785 | } |
775 | 786 | ||
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f1d2802b2f07..494035798873 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); | |||
834 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); | 834 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); |
835 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); | 835 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); |
836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); | 836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); |
837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); | 837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); |
838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); | 838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); |
839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); | ||
840 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); | ||
841 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); | 839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); |
842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); | 840 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); |
843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); | 841 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); |
844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); | 842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); |
845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); | 843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); |
846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); | 844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); |
847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); | ||
848 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); | ||
849 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); | 845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); |
850 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); | 846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); |
851 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); | 847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); |