Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/async-thread.c        117
-rw-r--r--  fs/btrfs/async-thread.h          4
-rw-r--r--  fs/btrfs/backref.c               2
-rw-r--r--  fs/btrfs/ctree.c                17
-rw-r--r--  fs/btrfs/ctree.h                11
-rw-r--r--  fs/btrfs/delayed-inode.c         4
-rw-r--r--  fs/btrfs/disk-io.c             181
-rw-r--r--  fs/btrfs/extent-tree.c         290
-rw-r--r--  fs/btrfs/extent_io.c            60
-rw-r--r--  fs/btrfs/extent_io.h             2
-rw-r--r--  fs/btrfs/file.c                  8
-rw-r--r--  fs/btrfs/free-space-cache.c     65
-rw-r--r--  fs/btrfs/inode.c               188
-rw-r--r--  fs/btrfs/ioctl.c                23
-rw-r--r--  fs/btrfs/relocation.c            2
-rw-r--r--  fs/btrfs/scrub.c                15
-rw-r--r--  fs/btrfs/super.c                76
-rw-r--r--  fs/btrfs/transaction.c           8
-rw-r--r--  fs/btrfs/volumes.c              10
-rw-r--r--  fs/btrfs/volumes.h               6
20 files changed, 727 insertions, 362 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..cb97174e2366 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
64 int idle; 64 int idle;
65}; 65};
66 66
67static int __btrfs_start_workers(struct btrfs_workers *workers);
68
67/* 69/*
68 * btrfs_start_workers uses kthread_run, which can block waiting for memory 70 * btrfs_start_workers uses kthread_run, which can block waiting for memory
69 * for a very long time. It will actually throttle on page writeback, 71 * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
88{ 90{
89 struct worker_start *start; 91 struct worker_start *start;
90 start = container_of(work, struct worker_start, work); 92 start = container_of(work, struct worker_start, work);
91 btrfs_start_workers(start->queue, 1); 93 __btrfs_start_workers(start->queue);
92 kfree(start); 94 kfree(start);
93} 95}
94 96
95static int start_new_worker(struct btrfs_workers *queue)
96{
97 struct worker_start *start;
98 int ret;
99
100 start = kzalloc(sizeof(*start), GFP_NOFS);
101 if (!start)
102 return -ENOMEM;
103
104 start->work.func = start_new_worker_func;
105 start->queue = queue;
106 ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
107 if (ret)
108 kfree(start);
109 return ret;
110}
111
112/* 97/*
113 * helper function to move a thread onto the idle list after it 98 * helper function to move a thread onto the idle list after it
114 * has finished some requests. 99 * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
153static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 138static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
154{ 139{
155 struct btrfs_workers *workers = worker->workers; 140 struct btrfs_workers *workers = worker->workers;
141 struct worker_start *start;
156 unsigned long flags; 142 unsigned long flags;
157 143
158 rmb(); 144 rmb();
159 if (!workers->atomic_start_pending) 145 if (!workers->atomic_start_pending)
160 return; 146 return;
161 147
148 start = kzalloc(sizeof(*start), GFP_NOFS);
149 if (!start)
150 return;
151
152 start->work.func = start_new_worker_func;
153 start->queue = workers;
154
162 spin_lock_irqsave(&workers->lock, flags); 155 spin_lock_irqsave(&workers->lock, flags);
163 if (!workers->atomic_start_pending) 156 if (!workers->atomic_start_pending)
164 goto out; 157 goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
170 163
171 workers->num_workers_starting += 1; 164 workers->num_workers_starting += 1;
172 spin_unlock_irqrestore(&workers->lock, flags); 165 spin_unlock_irqrestore(&workers->lock, flags);
173 start_new_worker(workers); 166 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
174 return; 167 return;
175 168
176out: 169out:
170 kfree(start);
177 spin_unlock_irqrestore(&workers->lock, flags); 171 spin_unlock_irqrestore(&workers->lock, flags);
178} 172}
179 173
@@ -331,7 +325,7 @@ again:
331 run_ordered_completions(worker->workers, work); 325 run_ordered_completions(worker->workers, work);
332 326
333 check_pending_worker_creates(worker); 327 check_pending_worker_creates(worker);
334 328 cond_resched();
335 } 329 }
336 330
337 spin_lock_irq(&worker->lock); 331 spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
462 * starts new worker threads. This does not enforce the max worker 456 * starts new worker threads. This does not enforce the max worker
463 * count in case you need to temporarily go past it. 457 * count in case you need to temporarily go past it.
464 */ 458 */
465static int __btrfs_start_workers(struct btrfs_workers *workers, 459static int __btrfs_start_workers(struct btrfs_workers *workers)
466 int num_workers)
467{ 460{
468 struct btrfs_worker_thread *worker; 461 struct btrfs_worker_thread *worker;
469 int ret = 0; 462 int ret = 0;
470 int i;
471 463
472 for (i = 0; i < num_workers; i++) { 464 worker = kzalloc(sizeof(*worker), GFP_NOFS);
473 worker = kzalloc(sizeof(*worker), GFP_NOFS); 465 if (!worker) {
474 if (!worker) { 466 ret = -ENOMEM;
475 ret = -ENOMEM; 467 goto fail;
476 goto fail; 468 }
477 }
478 469
479 INIT_LIST_HEAD(&worker->pending); 470 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending); 471 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list); 472 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock); 473 spin_lock_init(&worker->lock);
483 474
484 atomic_set(&worker->num_pending, 0); 475 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1); 476 atomic_set(&worker->refs, 1);
486 worker->workers = workers; 477 worker->workers = workers;
487 worker->task = kthread_run(worker_loop, worker, 478 worker->task = kthread_run(worker_loop, worker,
488 "btrfs-%s-%d", workers->name, 479 "btrfs-%s-%d", workers->name,
489 workers->num_workers + i); 480 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) { 481 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task); 482 ret = PTR_ERR(worker->task);
492 kfree(worker); 483 kfree(worker);
493 goto fail; 484 goto fail;
494 }
495 spin_lock_irq(&workers->lock);
496 list_add_tail(&worker->worker_list, &workers->idle_list);
497 worker->idle = 1;
498 workers->num_workers++;
499 workers->num_workers_starting--;
500 WARN_ON(workers->num_workers_starting < 0);
501 spin_unlock_irq(&workers->lock);
502 } 485 }
486 spin_lock_irq(&workers->lock);
487 list_add_tail(&worker->worker_list, &workers->idle_list);
488 worker->idle = 1;
489 workers->num_workers++;
490 workers->num_workers_starting--;
491 WARN_ON(workers->num_workers_starting < 0);
492 spin_unlock_irq(&workers->lock);
493
503 return 0; 494 return 0;
504fail: 495fail:
505 btrfs_stop_workers(workers); 496 spin_lock_irq(&workers->lock);
497 workers->num_workers_starting--;
498 spin_unlock_irq(&workers->lock);
506 return ret; 499 return ret;
507} 500}
508 501
509int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) 502int btrfs_start_workers(struct btrfs_workers *workers)
510{ 503{
511 spin_lock_irq(&workers->lock); 504 spin_lock_irq(&workers->lock);
512 workers->num_workers_starting += num_workers; 505 workers->num_workers_starting++;
513 spin_unlock_irq(&workers->lock); 506 spin_unlock_irq(&workers->lock);
514 return __btrfs_start_workers(workers, num_workers); 507 return __btrfs_start_workers(workers);
515} 508}
516 509
517/* 510/*
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
568 struct btrfs_worker_thread *worker; 561 struct btrfs_worker_thread *worker;
569 unsigned long flags; 562 unsigned long flags;
570 struct list_head *fallback; 563 struct list_head *fallback;
564 int ret;
571 565
572again: 566again:
573 spin_lock_irqsave(&workers->lock, flags); 567 spin_lock_irqsave(&workers->lock, flags);
@@ -584,7 +578,9 @@ again:
584 workers->num_workers_starting++; 578 workers->num_workers_starting++;
585 spin_unlock_irqrestore(&workers->lock, flags); 579 spin_unlock_irqrestore(&workers->lock, flags);
586 /* we're below the limit, start another worker */ 580 /* we're below the limit, start another worker */
587 __btrfs_start_workers(workers, 1); 581 ret = __btrfs_start_workers(workers);
582 if (ret)
583 goto fallback;
588 goto again; 584 goto again;
589 } 585 }
590 } 586 }
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
665/* 661/*
666 * places a struct btrfs_work into the pending queue of one of the kthreads 662 * places a struct btrfs_work into the pending queue of one of the kthreads
667 */ 663 */
668int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) 664void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
669{ 665{
670 struct btrfs_worker_thread *worker; 666 struct btrfs_worker_thread *worker;
671 unsigned long flags; 667 unsigned long flags;
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
673 669
674 /* don't requeue something already on a list */ 670 /* don't requeue something already on a list */
675 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 671 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
676 goto out; 672 return;
677 673
678 worker = find_worker(workers); 674 worker = find_worker(workers);
679 if (workers->ordered) { 675 if (workers->ordered) {
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
712 if (wake) 708 if (wake)
713 wake_up_process(worker->task); 709 wake_up_process(worker->task);
714 spin_unlock_irqrestore(&worker->lock, flags); 710 spin_unlock_irqrestore(&worker->lock, flags);
715
716out:
717 return 0;
718} 711}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
109 char *name; 109 char *name;
110}; 110};
111 111
112int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 112void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
113int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); 113int btrfs_start_workers(struct btrfs_workers *workers);
114int btrfs_stop_workers(struct btrfs_workers *workers); 114int btrfs_stop_workers(struct btrfs_workers *workers);
115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
116 struct btrfs_workers *async_starter); 116 struct btrfs_workers *async_starter);
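The API change above makes btrfs_start_workers() start a single worker and return an error the caller must handle. A minimal caller-side sketch (names taken from the disk-io.c hunk further down; per the comment added there, a failure is effectively -ENOMEM):

	ret = btrfs_start_workers(&fs_info->workers);
	if (ret) {
		/* worker allocation or kthread creation failed: unwind setup */
		ret = -ENOMEM;
		goto fail_sb_buffer;
	}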
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929c..22c64fff1bd5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
683 return PTR_ERR(fspath); 683 return PTR_ERR(fspath);
684 684
685 if (fspath > fspath_min) { 685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)fspath; 686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt; 687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min; 688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else { 689 } else {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
514 struct btrfs_root *root, 514 struct btrfs_root *root,
515 struct extent_buffer *buf) 515 struct extent_buffer *buf)
516{ 516{
517 /* ensure we can see the force_cow */
518 smp_rmb();
519
520 /*
521 * We do not need to cow a block if
522 * 1) this block is not created or changed in this transaction;
523 * 2) this block does not belong to TREE_RELOC tree;
524 * 3) the root is not forced COW.
525 *
526 * What is forced COW:
527 * when we create a snapshot during committing the transaction,
528 * after we've finished copying the src root, we must COW the shared
529 * block to ensure metadata consistency.
530 */
517 if (btrfs_header_generation(buf) == trans->transid && 531 if (btrfs_header_generation(buf) == trans->transid &&
518 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && 532 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
519 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 533 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
520 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) 534 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
535 !root->force_cow)
521 return 0; 536 return 0;
522 return 1; 537 return 1;
523} 538}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
848enum btrfs_caching_type { 848enum btrfs_caching_type {
849 BTRFS_CACHE_NO = 0, 849 BTRFS_CACHE_NO = 0,
850 BTRFS_CACHE_STARTED = 1, 850 BTRFS_CACHE_STARTED = 1,
851 BTRFS_CACHE_FINISHED = 2, 851 BTRFS_CACHE_FAST = 2,
852 BTRFS_CACHE_FINISHED = 3,
852}; 853};
853 854
854enum btrfs_disk_cache_state { 855enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
1271 * for stat. It may be used for more later 1272 * for stat. It may be used for more later
1272 */ 1273 */
1273 dev_t anon_dev; 1274 dev_t anon_dev;
1275
1276 int force_cow;
1274}; 1277};
1275 1278
1276struct btrfs_ioctl_defrag_range_args { 1279struct btrfs_ioctl_defrag_range_args {
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
2366int btrfs_block_rsv_refill(struct btrfs_root *root, 2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2367 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2368 u64 min_reserved); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2369int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2370 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2371 u64 num_bytes); 2377 u64 num_bytes);
@@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2686int btrfs_readpage(struct file *file, struct page *page); 2692int btrfs_readpage(struct file *file, struct page *page);
2687void btrfs_evict_inode(struct inode *inode); 2693void btrfs_evict_inode(struct inode *inode);
2688int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2694int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2689void btrfs_dirty_inode(struct inode *inode, int flags); 2695int btrfs_dirty_inode(struct inode *inode);
2696int btrfs_update_time(struct file *file);
2690struct inode *btrfs_alloc_inode(struct super_block *sb); 2697struct inode *btrfs_alloc_inode(struct super_block *sb);
2691void btrfs_destroy_inode(struct inode *inode); 2698void btrfs_destroy_inode(struct inode *inode);
2692int btrfs_drop_inode(struct inode *inode); 2699int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5b163572e0ca..9c1eccc2c503 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -640,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for. 641 * we're accounted for.
642 */ 642 */
643 if (!trans->bytes_reserved && 643 if (!src_rsv || (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) { 644 src_rsv != &root->fs_info->delalloc_block_rsv)) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /* 646 /*
647 * Since we're under a transaction reserve_metadata_bytes could 647 * Since we're under a transaction reserve_metadata_bytes could
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62afe5c5694e..f44b3928dc2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
620 620
621static int btree_io_failed_hook(struct bio *failed_bio, 621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end, 622 struct page *page, u64 start, u64 end,
623 u64 mirror_num, struct extent_state *state) 623 int mirror_num, struct extent_state *state)
624{ 624{
625 struct extent_io_tree *tree; 625 struct extent_io_tree *tree;
626 unsigned long len; 626 unsigned long len;
@@ -2194,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2194 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2; 2195 fs_info->readahead_workers.idle_thresh = 2;
2196 2196
2197 btrfs_start_workers(&fs_info->workers, 1); 2197 /*
2198 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 * btrfs_start_workers can really only fail because of ENOMEM so just
2199 btrfs_start_workers(&fs_info->submit_workers, 1); 2199 * return -ENOMEM if any of these fail.
2200 btrfs_start_workers(&fs_info->delalloc_workers, 1); 2200 */
2201 btrfs_start_workers(&fs_info->fixup_workers, 1); 2201 ret = btrfs_start_workers(&fs_info->workers);
2202 btrfs_start_workers(&fs_info->endio_workers, 1); 2202 ret |= btrfs_start_workers(&fs_info->generic_worker);
2203 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 2203 ret |= btrfs_start_workers(&fs_info->submit_workers);
2204 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 2204 ret |= btrfs_start_workers(&fs_info->delalloc_workers);
2205 btrfs_start_workers(&fs_info->endio_write_workers, 1); 2205 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 ret |= btrfs_start_workers(&fs_info->endio_workers);
2207 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2208 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1); 2209 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2210 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
2211 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2212 ret |= btrfs_start_workers(&fs_info->caching_workers);
2213 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2214 if (ret) {
2215 ret = -ENOMEM;
2216 goto fail_sb_buffer;
2217 }
2210 2218
2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2219 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2220 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2573,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
2573 int errors = 0; 2581 int errors = 0;
2574 u32 crc; 2582 u32 crc;
2575 u64 bytenr; 2583 u64 bytenr;
2576 int last_barrier = 0;
2577 2584
2578 if (max_mirrors == 0) 2585 if (max_mirrors == 0)
2579 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2586 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2580 2587
2581 /* make sure only the last submit_bh does a barrier */
2582 if (do_barriers) {
2583 for (i = 0; i < max_mirrors; i++) {
2584 bytenr = btrfs_sb_offset(i);
2585 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2586 device->total_bytes)
2587 break;
2588 last_barrier = i;
2589 }
2590 }
2591
2592 for (i = 0; i < max_mirrors; i++) { 2588 for (i = 0; i < max_mirrors; i++) {
2593 bytenr = btrfs_sb_offset(i); 2589 bytenr = btrfs_sb_offset(i);
2594 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2590 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2634,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
2634 bh->b_end_io = btrfs_end_buffer_write_sync; 2630 bh->b_end_io = btrfs_end_buffer_write_sync;
2635 } 2631 }
2636 2632
2637 if (i == last_barrier && do_barriers) 2633 /*
2638 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2634 * we fua the first super. The others we allow
2639 else 2635 * to go down lazy.
2640 ret = submit_bh(WRITE_SYNC, bh); 2636 */
2641 2637 ret = submit_bh(WRITE_FUA, bh);
2642 if (ret) 2638 if (ret)
2643 errors++; 2639 errors++;
2644 } 2640 }
2645 return errors < i ? 0 : -1; 2641 return errors < i ? 0 : -1;
2646} 2642}
2647 2643
2644/*
2645 * endio for the write_dev_flush, this will wake anyone waiting
2646 * for the barrier when it is done
2647 */
2648static void btrfs_end_empty_barrier(struct bio *bio, int err)
2649{
2650 if (err) {
2651 if (err == -EOPNOTSUPP)
2652 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2653 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2654 }
2655 if (bio->bi_private)
2656 complete(bio->bi_private);
2657 bio_put(bio);
2658}
2659
2660/*
2661 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2662 * sent down. With wait == 1, it waits for the previous flush.
2663 *
2664 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2665 * capable
2666 */
2667static int write_dev_flush(struct btrfs_device *device, int wait)
2668{
2669 struct bio *bio;
2670 int ret = 0;
2671
2672 if (device->nobarriers)
2673 return 0;
2674
2675 if (wait) {
2676 bio = device->flush_bio;
2677 if (!bio)
2678 return 0;
2679
2680 wait_for_completion(&device->flush_wait);
2681
2682 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2683 printk("btrfs: disabling barriers on dev %s\n",
2684 device->name);
2685 device->nobarriers = 1;
2686 }
2687 if (!bio_flagged(bio, BIO_UPTODATE)) {
2688 ret = -EIO;
2689 }
2690
2691 /* drop the reference from the wait == 0 run */
2692 bio_put(bio);
2693 device->flush_bio = NULL;
2694
2695 return ret;
2696 }
2697
2698 /*
2699 * one reference for us, and we leave it for the
2700 * caller
2701 */
2702 device->flush_bio = NULL;
2703 bio = bio_alloc(GFP_NOFS, 0);
2704 if (!bio)
2705 return -ENOMEM;
2706
2707 bio->bi_end_io = btrfs_end_empty_barrier;
2708 bio->bi_bdev = device->bdev;
2709 init_completion(&device->flush_wait);
2710 bio->bi_private = &device->flush_wait;
2711 device->flush_bio = bio;
2712
2713 bio_get(bio);
2714 submit_bio(WRITE_FLUSH, bio);
2715
2716 return 0;
2717}
2718
2719/*
2720 * send an empty flush down to each device in parallel,
2721 * then wait for them
2722 */
2723static int barrier_all_devices(struct btrfs_fs_info *info)
2724{
2725 struct list_head *head;
2726 struct btrfs_device *dev;
2727 int errors = 0;
2728 int ret;
2729
2730 /* send down all the barriers */
2731 head = &info->fs_devices->devices;
2732 list_for_each_entry_rcu(dev, head, dev_list) {
2733 if (!dev->bdev) {
2734 errors++;
2735 continue;
2736 }
2737 if (!dev->in_fs_metadata || !dev->writeable)
2738 continue;
2739
2740 ret = write_dev_flush(dev, 0);
2741 if (ret)
2742 errors++;
2743 }
2744
2745 /* wait for all the barriers */
2746 list_for_each_entry_rcu(dev, head, dev_list) {
2747 if (!dev->bdev) {
2748 errors++;
2749 continue;
2750 }
2751 if (!dev->in_fs_metadata || !dev->writeable)
2752 continue;
2753
2754 ret = write_dev_flush(dev, 1);
2755 if (ret)
2756 errors++;
2757 }
2758 if (errors)
2759 return -EIO;
2760 return 0;
2761}
2762
2648int write_all_supers(struct btrfs_root *root, int max_mirrors) 2763int write_all_supers(struct btrfs_root *root, int max_mirrors)
2649{ 2764{
2650 struct list_head *head; 2765 struct list_head *head;
@@ -2666,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2666 2781
2667 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2782 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2668 head = &root->fs_info->fs_devices->devices; 2783 head = &root->fs_info->fs_devices->devices;
2784
2785 if (do_barriers)
2786 barrier_all_devices(root->fs_info);
2787
2669 list_for_each_entry_rcu(dev, head, dev_list) { 2788 list_for_each_entry_rcu(dev, head, dev_list) {
2670 if (!dev->bdev) { 2789 if (!dev->bdev) {
2671 total_errors++; 2790 total_errors++;
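The net effect of the barrier rework on super-block writeout is a two-phase ordering; a condensed sketch using only the helpers added above (call site simplified from write_all_supers()):

	/* phase 1: queue an empty WRITE_FLUSH bio on every writeable device,
	 * then wait for each completion (barrier_all_devices() above) */
	if (do_barriers)
		barrier_all_devices(root->fs_info);

	/* phase 2: write the super block copies; write_dev_supers() submits
	 * only the first copy on each device with WRITE_FUA and lets the
	 * remaining mirrors go down lazily */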
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b232150b5b6b..f5fbe576d2ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 struct btrfs_root *root, 467 struct btrfs_root *root,
468 int load_cache_only) 468 int load_cache_only)
469{ 469{
470 DEFINE_WAIT(wait);
470 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
471 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
472 int ret = 0; 473 int ret = 0;
473 474
474 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
475 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
476 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
477 523
478 /* 524 /*
479 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
484 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
485 (root && root != root->fs_info->tree_root) && 531 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) { 532 btrfs_test_opt(root, SPACE_CACHE)) {
487 spin_lock(&cache->lock);
488 if (cache->cached != BTRFS_CACHE_NO) {
489 spin_unlock(&cache->lock);
490 return 0;
491 }
492 cache->cached = BTRFS_CACHE_STARTED;
493 spin_unlock(&cache->lock);
494
495 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
496 534
497 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
498 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
499 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
500 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
501 } else { 540 } else {
502 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
503 } 547 }
504 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
505 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
506 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
507 return 0; 553 return 0;
508 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wakeup any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
509 } 569 }
510 570
511 if (load_cache_only) 571 if (load_cache_only) {
512 return 0; 572 put_caching_control(caching_ctl);
513
514 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
515 BUG_ON(!caching_ctl);
516
517 INIT_LIST_HEAD(&caching_ctl->list);
518 mutex_init(&caching_ctl->mutex);
519 init_waitqueue_head(&caching_ctl->wait);
520 caching_ctl->block_group = cache;
521 caching_ctl->progress = cache->key.objectid;
522 /* one for caching kthread, one for caching block group list */
523 atomic_set(&caching_ctl->count, 2);
524 caching_ctl->work.func = caching_thread;
525
526 spin_lock(&cache->lock);
527 if (cache->cached != BTRFS_CACHE_NO) {
528 spin_unlock(&cache->lock);
529 kfree(caching_ctl);
530 return 0; 573 return 0;
531 } 574 }
532 cache->caching_ctl = caching_ctl;
533 cache->cached = BTRFS_CACHE_STARTED;
534 spin_unlock(&cache->lock);
535 575
536 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
537 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
538 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
539 580
@@ -2781,7 +2822,7 @@ out_free:
2781 btrfs_release_path(path); 2822 btrfs_release_path(path);
2782out: 2823out:
2783 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2784 if (!ret) 2825 if (!ret && dcs == BTRFS_DC_SETUP)
2785 block_group->cache_generation = trans->transid; 2826 block_group->cache_generation = trans->transid;
2786 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2787 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
@@ -3847,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
3847 return ret; 3888 return ret;
3848} 3889}
3849 3890
3850int btrfs_block_rsv_refill(struct btrfs_root *root, 3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3851 struct btrfs_block_rsv *block_rsv, 3892 struct btrfs_block_rsv *block_rsv,
3852 u64 min_reserved) 3893 u64 min_reserved, int flush)
3853{ 3894{
3854 u64 num_bytes = 0; 3895 u64 num_bytes = 0;
3855 int ret = -ENOSPC; 3896 int ret = -ENOSPC;
@@ -3868,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3868 if (!ret) 3909 if (!ret)
3869 return 0; 3910 return 0;
3870 3911
3871 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); 3912 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3872 if (!ret) { 3913 if (!ret) {
3873 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3914 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3874 return 0; 3915 return 0;
@@ -3877,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3877 return ret; 3918 return ret;
3878} 3919}
3879 3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922 struct btrfs_block_rsv *block_rsv,
3923 u64 min_reserved)
3924{
3925 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929 struct btrfs_block_rsv *block_rsv,
3930 u64 min_reserved)
3931{
3932 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3933}
3934
3880int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3881 struct btrfs_block_rsv *dst_rsv, 3936 struct btrfs_block_rsv *dst_rsv,
3882 u64 num_bytes) 3937 u64 num_bytes)
@@ -4149,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4149 struct btrfs_root *root = BTRFS_I(inode)->root; 4204 struct btrfs_root *root = BTRFS_I(inode)->root;
4150 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4205 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4151 u64 to_reserve = 0; 4206 u64 to_reserve = 0;
4207 u64 csum_bytes;
4152 unsigned nr_extents = 0; 4208 unsigned nr_extents = 0;
4209 int extra_reserve = 0;
4153 int flush = 1; 4210 int flush = 1;
4154 int ret; 4211 int ret;
4155 4212
4213 /* Need to be holding the i_mutex here if we aren't free space cache */
4156 if (btrfs_is_free_space_inode(root, inode)) 4214 if (btrfs_is_free_space_inode(root, inode))
4157 flush = 0; 4215 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4158 4218
4159 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4219 if (flush && btrfs_transaction_in_commit(root->fs_info))
4160 schedule_timeout(1); 4220 schedule_timeout(1);
@@ -4165,11 +4225,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4165 BTRFS_I(inode)->outstanding_extents++; 4225 BTRFS_I(inode)->outstanding_extents++;
4166 4226
4167 if (BTRFS_I(inode)->outstanding_extents > 4227 if (BTRFS_I(inode)->outstanding_extents >
4168 BTRFS_I(inode)->reserved_extents) { 4228 BTRFS_I(inode)->reserved_extents)
4169 nr_extents = BTRFS_I(inode)->outstanding_extents - 4229 nr_extents = BTRFS_I(inode)->outstanding_extents -
4170 BTRFS_I(inode)->reserved_extents; 4230 BTRFS_I(inode)->reserved_extents;
4171 BTRFS_I(inode)->reserved_extents += nr_extents;
4172 }
4173 4231
4174 /* 4232 /*
4175 * Add an item to reserve for updating the inode when we complete the 4233 * Add an item to reserve for updating the inode when we complete the
@@ -4177,11 +4235,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4177 */ 4235 */
4178 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4236 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4179 nr_extents++; 4237 nr_extents++;
4180 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4238 extra_reserve = 1;
4181 } 4239 }
4182 4240
4183 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4241 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4184 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4242 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4243 csum_bytes = BTRFS_I(inode)->csum_bytes;
4185 spin_unlock(&BTRFS_I(inode)->lock); 4244 spin_unlock(&BTRFS_I(inode)->lock);
4186 4245
4187 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4246 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4191,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4191 4250
4192 spin_lock(&BTRFS_I(inode)->lock); 4251 spin_lock(&BTRFS_I(inode)->lock);
4193 dropped = drop_outstanding_extent(inode); 4252 dropped = drop_outstanding_extent(inode);
4194 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4195 spin_unlock(&BTRFS_I(inode)->lock);
4196 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4197
4198 /* 4253 /*
4199 * Somebody could have come in and twiddled with the 4254 * If the inodes csum_bytes is the same as the original
4200 * reservation, so if we have to free more than we would have 4255 * csum_bytes then we know we haven't raced with any free()ers
4201 * reserved from this reservation go ahead and release those 4256 * so we can just reduce our inodes csum bytes and carry on.
4202 * bytes. 4257 * Otherwise we have to do the normal free thing to account for
4258 * the case that the free side didn't free up its reserve
4259 * because of this outstanding reservation.
4203 */ 4260 */
4204 to_free -= to_reserve; 4261 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4262 calc_csum_metadata_size(inode, num_bytes, 0);
4263 else
4264 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4265 spin_unlock(&BTRFS_I(inode)->lock);
4266 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268
4205 if (to_free) 4269 if (to_free)
4206 btrfs_block_rsv_release(root, block_rsv, to_free); 4270 btrfs_block_rsv_release(root, block_rsv, to_free);
4207 return ret; 4271 return ret;
4208 } 4272 }
4209 4273
4274 spin_lock(&BTRFS_I(inode)->lock);
4275 if (extra_reserve) {
4276 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4277 nr_extents--;
4278 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock);
4281
4210 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4282 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4211 4283
4212 return 0; 4284 return 0;
@@ -5052,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5052 struct btrfs_root *root = orig_root->fs_info->extent_root; 5124 struct btrfs_root *root = orig_root->fs_info->extent_root;
5053 struct btrfs_free_cluster *last_ptr = NULL; 5125 struct btrfs_free_cluster *last_ptr = NULL;
5054 struct btrfs_block_group_cache *block_group = NULL; 5126 struct btrfs_block_group_cache *block_group = NULL;
5127 struct btrfs_block_group_cache *used_block_group;
5055 int empty_cluster = 2 * 1024 * 1024; 5128 int empty_cluster = 2 * 1024 * 1024;
5056 int allowed_chunk_alloc = 0; 5129 int allowed_chunk_alloc = 0;
5057 int done_chunk_alloc = 0; 5130 int done_chunk_alloc = 0;
5058 struct btrfs_space_info *space_info; 5131 struct btrfs_space_info *space_info;
5059 int last_ptr_loop = 0;
5060 int loop = 0; 5132 int loop = 0;
5061 int index = 0; 5133 int index = 0;
5062 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 5134 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5118,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5118ideal_cache: 5190ideal_cache:
5119 block_group = btrfs_lookup_block_group(root->fs_info, 5191 block_group = btrfs_lookup_block_group(root->fs_info,
5120 search_start); 5192 search_start);
5193 used_block_group = block_group;
5121 /* 5194 /*
5122 * we don't want to use the block group if it doesn't match our 5195 * we don't want to use the block group if it doesn't match our
5123 * allocation bits, or if its not cached. 5196 * allocation bits, or if its not cached.
@@ -5155,6 +5228,7 @@ search:
5155 u64 offset; 5228 u64 offset;
5156 int cached; 5229 int cached;
5157 5230
5231 used_block_group = block_group;
5158 btrfs_get_block_group(block_group); 5232 btrfs_get_block_group(block_group);
5159 search_start = block_group->key.objectid; 5233 search_start = block_group->key.objectid;
5160 5234
@@ -5178,13 +5252,15 @@ search:
5178 } 5252 }
5179 5253
5180have_block_group: 5254have_block_group:
5181 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5255 cached = block_group_cache_done(block_group);
5256 if (unlikely(!cached)) {
5182 u64 free_percent; 5257 u64 free_percent;
5183 5258
5259 found_uncached_bg = true;
5184 ret = cache_block_group(block_group, trans, 5260 ret = cache_block_group(block_group, trans,
5185 orig_root, 1); 5261 orig_root, 1);
5186 if (block_group->cached == BTRFS_CACHE_FINISHED) 5262 if (block_group->cached == BTRFS_CACHE_FINISHED)
5187 goto have_block_group; 5263 goto alloc;
5188 5264
5189 free_percent = btrfs_block_group_used(&block_group->item); 5265 free_percent = btrfs_block_group_used(&block_group->item);
5190 free_percent *= 100; 5266 free_percent *= 100;
@@ -5206,7 +5282,6 @@ have_block_group:
5206 orig_root, 0); 5282 orig_root, 0);
5207 BUG_ON(ret); 5283 BUG_ON(ret);
5208 } 5284 }
5209 found_uncached_bg = true;
5210 5285
5211 /* 5286 /*
5212 * If loop is set for cached only, try the next block 5287 * If loop is set for cached only, try the next block
@@ -5216,94 +5291,80 @@ have_block_group:
5216 goto loop; 5291 goto loop;
5217 } 5292 }
5218 5293
5219 cached = block_group_cache_done(block_group); 5294alloc:
5220 if (unlikely(!cached))
5221 found_uncached_bg = true;
5222
5223 if (unlikely(block_group->ro)) 5295 if (unlikely(block_group->ro))
5224 goto loop; 5296 goto loop;
5225 5297
5226 spin_lock(&block_group->free_space_ctl->tree_lock); 5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5227 if (cached && 5299 if (cached &&
5228 block_group->free_space_ctl->free_space < 5300 block_group->free_space_ctl->free_space <
5229 num_bytes + empty_size) { 5301 num_bytes + empty_cluster + empty_size) {
5230 spin_unlock(&block_group->free_space_ctl->tree_lock); 5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5231 goto loop; 5303 goto loop;
5232 } 5304 }
5233 spin_unlock(&block_group->free_space_ctl->tree_lock); 5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5234 5306
5235 /* 5307 /*
5236 * Ok we want to try and use the cluster allocator, so lets look 5308 * Ok we want to try and use the cluster allocator, so
5237 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5309 * lets look there
5238 * have tried the cluster allocator plenty of times at this
5239 * point and not have found anything, so we are likely way too
5240 * fragmented for the clustering stuff to find anything, so lets
5241 * just skip it and let the allocator find whatever block it can
5242 * find
5243 */ 5310 */
5244 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5311 if (last_ptr) {
5245 /* 5312 /*
5246 * the refill lock keeps out other 5313 * the refill lock keeps out other
5247 * people trying to start a new cluster 5314 * people trying to start a new cluster
5248 */ 5315 */
5249 spin_lock(&last_ptr->refill_lock); 5316 spin_lock(&last_ptr->refill_lock);
5250 if (last_ptr->block_group && 5317 used_block_group = last_ptr->block_group;
5251 (last_ptr->block_group->ro || 5318 if (used_block_group != block_group &&
5252 !block_group_bits(last_ptr->block_group, data))) { 5319 (!used_block_group ||
5253 offset = 0; 5320 used_block_group->ro ||
5321 !block_group_bits(used_block_group, data))) {
5322 used_block_group = block_group;
5254 goto refill_cluster; 5323 goto refill_cluster;
5255 } 5324 }
5256 5325
5257 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5326 if (used_block_group != block_group)
5258 num_bytes, search_start); 5327 btrfs_get_block_group(used_block_group);
5328
5329 offset = btrfs_alloc_from_cluster(used_block_group,
5330 last_ptr, num_bytes, used_block_group->key.objectid);
5259 if (offset) { 5331 if (offset) {
5260 /* we have a block, we're done */ 5332 /* we have a block, we're done */
5261 spin_unlock(&last_ptr->refill_lock); 5333 spin_unlock(&last_ptr->refill_lock);
5262 goto checks; 5334 goto checks;
5263 } 5335 }
5264 5336
5265 spin_lock(&last_ptr->lock); 5337 WARN_ON(last_ptr->block_group != used_block_group);
5266 /* 5338 if (used_block_group != block_group) {
5267 * whoops, this cluster doesn't actually point to 5339 btrfs_put_block_group(used_block_group);
5268 * this block group. Get a ref on the block 5340 used_block_group = block_group;
5269 * group is does point to and try again
5270 */
5271 if (!last_ptr_loop && last_ptr->block_group &&
5272 last_ptr->block_group != block_group &&
5273 index <=
5274 get_block_group_index(last_ptr->block_group)) {
5275
5276 btrfs_put_block_group(block_group);
5277 block_group = last_ptr->block_group;
5278 btrfs_get_block_group(block_group);
5279 spin_unlock(&last_ptr->lock);
5280 spin_unlock(&last_ptr->refill_lock);
5281
5282 last_ptr_loop = 1;
5283 search_start = block_group->key.objectid;
5284 /*
5285 * we know this block group is properly
5286 * in the list because
5287 * btrfs_remove_block_group, drops the
5288 * cluster before it removes the block
5289 * group from the list
5290 */
5291 goto have_block_group;
5292 } 5341 }
5293 spin_unlock(&last_ptr->lock);
5294refill_cluster: 5342refill_cluster:
5343 BUG_ON(used_block_group != block_group);
5344 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5345 * set up new clusters, so let's just skip it
5346 * and let the allocator find whatever block
5347 * it can find. If we reach this point, we
5348 * will have tried the cluster allocator
5349 * plenty of times and not have found
5350 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find
5352 * anything. */
5353 if (loop >= LOOP_NO_EMPTY_SIZE) {
5354 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc;
5356 }
5357
5295 /* 5358 /*
5296 * this cluster didn't work out, free it and 5359 * this cluster didn't work out, free it and
5297 * start over 5360 * start over
5298 */ 5361 */
5299 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5362 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5300 5363
5301 last_ptr_loop = 0;
5302
5303 /* allocate a cluster in this block group */ 5364 /* allocate a cluster in this block group */
5304 ret = btrfs_find_space_cluster(trans, root, 5365 ret = btrfs_find_space_cluster(trans, root,
5305 block_group, last_ptr, 5366 block_group, last_ptr,
5306 offset, num_bytes, 5367 search_start, num_bytes,
5307 empty_cluster + empty_size); 5368 empty_cluster + empty_size);
5308 if (ret == 0) { 5369 if (ret == 0) {
5309 /* 5370 /*
@@ -5339,6 +5400,7 @@ refill_cluster:
5339 goto loop; 5400 goto loop;
5340 } 5401 }
5341 5402
5403unclustered_alloc:
5342 offset = btrfs_find_space_for_alloc(block_group, search_start, 5404 offset = btrfs_find_space_for_alloc(block_group, search_start,
5343 num_bytes, empty_size); 5405 num_bytes, empty_size);
5344 /* 5406 /*
@@ -5365,14 +5427,14 @@ checks:
5365 search_start = stripe_align(root, offset); 5427 search_start = stripe_align(root, offset);
5366 /* move on to the next group */ 5428 /* move on to the next group */
5367 if (search_start + num_bytes >= search_end) { 5429 if (search_start + num_bytes >= search_end) {
5368 btrfs_add_free_space(block_group, offset, num_bytes); 5430 btrfs_add_free_space(used_block_group, offset, num_bytes);
5369 goto loop; 5431 goto loop;
5370 } 5432 }
5371 5433
5372 /* move on to the next group */ 5434 /* move on to the next group */
5373 if (search_start + num_bytes > 5435 if (search_start + num_bytes >
5374 block_group->key.objectid + block_group->key.offset) { 5436 used_block_group->key.objectid + used_block_group->key.offset) {
5375 btrfs_add_free_space(block_group, offset, num_bytes); 5437 btrfs_add_free_space(used_block_group, offset, num_bytes);
5376 goto loop; 5438 goto loop;
5377 } 5439 }
5378 5440
@@ -5380,14 +5442,14 @@ checks:
5380 ins->offset = num_bytes; 5442 ins->offset = num_bytes;
5381 5443
5382 if (offset < search_start) 5444 if (offset < search_start)
5383 btrfs_add_free_space(block_group, offset, 5445 btrfs_add_free_space(used_block_group, offset,
5384 search_start - offset); 5446 search_start - offset);
5385 BUG_ON(offset > search_start); 5447 BUG_ON(offset > search_start);
5386 5448
5387 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 5449 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5388 alloc_type); 5450 alloc_type);
5389 if (ret == -EAGAIN) { 5451 if (ret == -EAGAIN) {
5390 btrfs_add_free_space(block_group, offset, num_bytes); 5452 btrfs_add_free_space(used_block_group, offset, num_bytes);
5391 goto loop; 5453 goto loop;
5392 } 5454 }
5393 5455
@@ -5396,15 +5458,19 @@ checks:
5396 ins->offset = num_bytes; 5458 ins->offset = num_bytes;
5397 5459
5398 if (offset < search_start) 5460 if (offset < search_start)
5399 btrfs_add_free_space(block_group, offset, 5461 btrfs_add_free_space(used_block_group, offset,
5400 search_start - offset); 5462 search_start - offset);
5401 BUG_ON(offset > search_start); 5463 BUG_ON(offset > search_start);
5464 if (used_block_group != block_group)
5465 btrfs_put_block_group(used_block_group);
5402 btrfs_put_block_group(block_group); 5466 btrfs_put_block_group(block_group);
5403 break; 5467 break;
5404loop: 5468loop:
5405 failed_cluster_refill = false; 5469 failed_cluster_refill = false;
5406 failed_alloc = false; 5470 failed_alloc = false;
5407 BUG_ON(index != get_block_group_index(block_group)); 5471 BUG_ON(index != get_block_group_index(block_group));
5472 if (used_block_group != block_group)
5473 btrfs_put_block_group(used_block_group);
5408 btrfs_put_block_group(block_group); 5474 btrfs_put_block_group(block_group);
5409 } 5475 }
5410 up_read(&space_info->groups_sem); 5476 up_read(&space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f87c4d0e7a0..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
935 node = tree_search(tree, start); 935 node = tree_search(tree, start);
936 if (!node) { 936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc); 937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) 938 if (!prealloc) {
939 return -ENOMEM; 939 err = -ENOMEM;
940 goto out;
941 }
940 err = insert_state(tree, prealloc, start, end, &bits); 942 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL; 943 prealloc = NULL;
942 BUG_ON(err == -EEXIST); 944 BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
992 */ 994 */
993 if (state->start < start) { 995 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc); 996 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc) 997 if (!prealloc) {
996 return -ENOMEM; 998 err = -ENOMEM;
999 goto out;
1000 }
997 err = split_state(tree, state, prealloc, start); 1001 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST); 1002 BUG_ON(err == -EEXIST);
999 prealloc = NULL; 1003 prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
1024 this_end = last_start - 1; 1028 this_end = last_start - 1;
1025 1029
1026 prealloc = alloc_extent_state_atomic(prealloc); 1030 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc) 1031 if (!prealloc) {
1028 return -ENOMEM; 1032 err = -ENOMEM;
1033 goto out;
1034 }
1029 1035
1030 /* 1036 /*
1031 * Avoid to free 'prealloc' if it can be merged with 1037 * Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
1051 */ 1057 */
1052 if (state->start <= end && state->end > end) { 1058 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc); 1059 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc) 1060 if (!prealloc) {
1055 return -ENOMEM; 1061 err = -ENOMEM;
1062 goto out;
1063 }
1056 1064
1057 err = split_state(tree, state, prealloc, end + 1); 1065 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST); 1066 BUG_ON(err == -EEXIST);
@@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2285 clean_io_failure(start, page); 2293 clean_io_failure(start, page);
2286 } 2294 }
2287 if (!uptodate) { 2295 if (!uptodate) {
2288 u64 failed_mirror; 2296 int failed_mirror;
2289 failed_mirror = (u64)bio->bi_bdev; 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
2290 if (tree->ops && tree->ops->readpage_io_failed_hook) 2298 /*
2291 ret = tree->ops->readpage_io_failed_hook( 2299 * The generic bio_readpage_error handles errors the
2292 bio, page, start, end, 2300 * following way: If possible, new read requests are
2293 failed_mirror, state); 2301 * created and submitted and will end up in
2294 else 2302 * end_bio_extent_readpage as well (if we're lucky, not
2295 ret = bio_readpage_error(bio, page, start, end, 2303 * in the !uptodate case). In that case it returns 0 and
2296 failed_mirror, NULL); 2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
2297 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
2298 uptodate = 2312 uptodate =
2299 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
2300 if (err) 2314 if (err)
@@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2302 uncache_state(&cached); 2316 uncache_state(&cached);
2303 continue; 2317 continue;
2304 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
2305 } 2326 }
2306 2327
2307 if (uptodate) { 2328 if (uptodate) {
@@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3366 return -ENOMEM; 3387 return -ENOMEM;
3367 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
3368 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
3369 /* 3393 /*
3370 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
3371 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3413 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3414 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
3415 3439
3416 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
3417 get_extent); 3441 get_extent);
3418 if (!em) 3442 if (!em)
3419 goto out; 3443 goto out;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23bc..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
70 unsigned long bio_flags); 70 unsigned long bio_flags);
71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, u64 failed_mirror, 73 u64 start, u64 end, int failed_mirror,
74 struct extent_state *state); 74 struct extent_state *state);
75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
76 u64 start, u64 end, 76 u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dafdfa059bf6..97fbe939c050 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1169 (sizeof(struct page *))); 1169 (sizeof(struct page *)));
1170 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1171 nrptrs = max(nrptrs, 8);
1170 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1172 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1171 if (!pages) 1173 if (!pages)
1172 return -ENOMEM; 1174 return -ENOMEM;
@@ -1387,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1387 goto out; 1389 goto out;
1388 } 1390 }
1389 1391
1390 file_update_time(file); 1392 err = btrfs_update_time(file);
1393 if (err) {
1394 mutex_unlock(&inode->i_mutex);
1395 goto out;
1396 }
1391 BTRFS_I(inode)->sequence++; 1397 BTRFS_I(inode)->sequence++;
1392 1398
1393 start_pos = round_down(pos, root->sectorsize); 1399 start_pos = round_down(pos, root->sectorsize);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 181760f9d2ab..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
351 } 351 }
352 } 352 }
353 353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
354 return 0; 359 return 0;
355} 360}
356 361
@@ -1465,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1465{ 1470{
1466 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1467 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1468 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1469 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1470 1476
@@ -1844,7 +1850,13 @@ again:
1844 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1845 1, 0); 1851 1, 0);
1846 if (!info) { 1852 if (!info) {
1847 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1848 goto out_lock; 1860 goto out_lock;
1849 } 1861 }
1850 } 1862 }
@@ -2308,6 +2320,7 @@ again:
2308 2320
2309 if (!found) { 2321 if (!found) {
2310 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2311 found = true; 2324 found = true;
2312 } 2325 }
2313 2326
@@ -2451,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2451{ 2464{
2452 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2453 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2454 struct rb_node *node;
2455 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2456 2469
2457 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2458 return -ENOSPC; 2471 return -ENOSPC;
2459 2472
2460 /* 2473 /*
2461 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2462 * here that will work. 2475 * is just its start offset.
2463 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2464 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2465 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2466 continue; 2486 continue;
@@ -2471,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2471 } 2491 }
2472 2492
2473 /* 2493 /*
2474 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2475 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2476 * this list and start the search from there.
2477 */ 2496 */
2478 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2479 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2480 list);
2481 node = rb_next(&entry->offset_index);
2482 if (!node)
2483 return -ENOSPC;
2484 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2485 goto search;
2486 }
2487
2488 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2489 if (!entry)
2490 return -ENOSPC;
2491
2492search:
2493 node = &entry->offset_index;
2494 do {
2495 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2496 node = rb_next(&entry->offset_index);
2497 if (!entry->bitmap)
2498 continue;
2499 if (entry->bytes < min_bytes)
2500 continue;
2501 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2502 bytes, min_bytes);
2503 } while (ret && node);
2504
2505 return ret;
2506} 2498}
2507 2499
2508/* 2500/*
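
A fragment-level sketch of the lookup the rewritten setup_cluster_bitmap() relies on: before walking the caller's bitmap list, make sure the bitmap entry that actually covers the requested offset is on it, since the list only holds bitmaps whose start offset was passed over earlier. Types and helpers are the ones visible in the hunk; the extra empty-list guard is a defensive addition in this sketch, not part of the patch:

/* Sketch: pull the covering bitmap onto the list before searching it. */
static void add_covering_bitmap(struct btrfs_free_space_ctl *ctl,
				struct list_head *bitmaps, u64 offset)
{
	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
	struct btrfs_free_space *entry;

	if (list_empty(bitmaps) ||
	    list_first_entry(bitmaps, struct btrfs_free_space,
			     list)->offset != bitmap_offset) {
		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
		if (entry && list_empty(&entry->list))
			list_add(&entry->list, bitmaps);
	}
}
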
@@ -2520,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2520 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2521{ 2513{
2522 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2523 struct list_head bitmaps;
2524 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2525 u64 min_bytes; 2517 u64 min_bytes;
2526 int ret; 2518 int ret;
2527 2519
@@ -2560,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2560 goto out; 2552 goto out;
2561 } 2553 }
2562 2554
2563 INIT_LIST_HEAD(&bitmaps);
2564 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2565 bytes, min_bytes); 2556 bytes, min_bytes);
2566 if (ret) 2557 if (ret)
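
The LIST_HEAD() change above is purely an idiom: declaring the on-stack list head with LIST_HEAD() initializes it at definition time, so the separate INIT_LIST_HEAD() call can be dropped. A tiny illustration of the two forms:

#include <linux/list.h>

static void list_head_idioms(void)
{
	struct list_head other;
	LIST_HEAD(bitmaps);		/* declared and initialized in one step */

	INIT_LIST_HEAD(&other);		/* the two-step form the patch removes */

	/* both heads start out empty and usable */
	WARN_ON(!list_empty(&bitmaps) || !list_empty(&other));
}
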
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 116ab67a06df..0a6b928813a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -2031,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2031 /* insert an orphan item to track this unlinked/truncated file */ 2032 /* insert an orphan item to track this unlinked/truncated file */
2032 if (insert >= 1) { 2033 if (insert >= 1) {
2033 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2034 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2034 BUG_ON(ret); 2035 BUG_ON(ret && ret != -EEXIST);
2035 } 2036 }
2036 2037
2037 /* insert an orphan item to track subvolume contains orphan files */ 2038 /* insert an orphan item to track subvolume contains orphan files */
@@ -2158,6 +2159,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2158 if (ret && ret != -ESTALE) 2159 if (ret && ret != -ESTALE)
2159 goto out; 2160 goto out;
2160 2161
2162 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2163 struct btrfs_root *dead_root;
2164 struct btrfs_fs_info *fs_info = root->fs_info;
2165 int is_dead_root = 0;
2166
2167 /*
2168 * this is an orphan in the tree root. Currently these
2169 * could come from 2 sources:
2170 * a) a snapshot deletion in progress
2171 * b) a free space cache inode
2172 * We need to distinguish those two, as the snapshot
2173 * orphan must not get deleted.
2174 * find_dead_roots already ran before us, so if this
2175 * is a snapshot deletion, we should find the root
2176 * in the dead_roots list
2177 */
2178 spin_lock(&fs_info->trans_lock);
2179 list_for_each_entry(dead_root, &fs_info->dead_roots,
2180 root_list) {
2181 if (dead_root->root_key.objectid ==
2182 found_key.objectid) {
2183 is_dead_root = 1;
2184 break;
2185 }
2186 }
2187 spin_unlock(&fs_info->trans_lock);
2188 if (is_dead_root) {
2189 /* prevent this orphan from being found again */
2190 key.offset = found_key.objectid - 1;
2191 continue;
2192 }
2193 }
2161 /* 2194 /*
2162 * Inode is already gone but the orphan item is still there, 2195 * Inode is already gone but the orphan item is still there,
2163 * kill the orphan item. 2196 * kill the orphan item.
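
The added block above separates the two kinds of tree-root orphans by checking fs_info->dead_roots under trans_lock: a snapshot whose deletion is in progress shows up there (find_dead_roots already ran), while a stale free-space-cache inode does not and can safely be cleaned. A hedged sketch of that membership test, using only fields visible in the hunk; the helper name is made up:

/* Sketch: is objectid a root that find_dead_roots() queued for deletion? */
static int objectid_is_dead_root(struct btrfs_fs_info *fs_info, u64 objectid)
{
	struct btrfs_root *dead_root;
	int found = 0;

	spin_lock(&fs_info->trans_lock);
	list_for_each_entry(dead_root, &fs_info->dead_roots, root_list) {
		if (dead_root->root_key.objectid == objectid) {
			found = 1;
			break;
		}
	}
	spin_unlock(&fs_info->trans_lock);

	return found;
}
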
@@ -2191,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2191 continue; 2224 continue;
2192 } 2225 }
2193 nr_truncate++; 2226 nr_truncate++;
2227 /*
2228 * Need to hold the i_mutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2194 ret = btrfs_truncate(inode); 2233 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2195 } else { 2235 } else {
2196 nr_unlink++; 2236 nr_unlink++;
2197 } 2237 }
@@ -3327,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3327 u64 hint_byte = 0; 3367 u64 hint_byte = 0;
3328 hole_size = last_byte - cur_offset; 3368 hole_size = last_byte - cur_offset;
3329 3369
3330 trans = btrfs_start_transaction(root, 2); 3370 trans = btrfs_start_transaction(root, 3);
3331 if (IS_ERR(trans)) { 3371 if (IS_ERR(trans)) {
3332 err = PTR_ERR(trans); 3372 err = PTR_ERR(trans);
3333 break; 3373 break;
@@ -3337,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3337 cur_offset + hole_size, 3377 cur_offset + hole_size,
3338 &hint_byte, 1); 3378 &hint_byte, 1);
3339 if (err) { 3379 if (err) {
3380 btrfs_update_inode(trans, root, inode);
3340 btrfs_end_transaction(trans, root); 3381 btrfs_end_transaction(trans, root);
3341 break; 3382 break;
3342 } 3383 }
@@ -3346,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3346 0, hole_size, 0, hole_size, 3387 0, hole_size, 0, hole_size,
3347 0, 0, 0); 3388 0, 0, 0);
3348 if (err) { 3389 if (err) {
3390 btrfs_update_inode(trans, root, inode);
3349 btrfs_end_transaction(trans, root); 3391 btrfs_end_transaction(trans, root);
3350 break; 3392 break;
3351 } 3393 }
@@ -3353,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3353 btrfs_drop_extent_cache(inode, hole_start, 3395 btrfs_drop_extent_cache(inode, hole_start,
3354 last_byte - 1, 0); 3396 last_byte - 1, 0);
3355 3397
3398 btrfs_update_inode(trans, root, inode);
3356 btrfs_end_transaction(trans, root); 3399 btrfs_end_transaction(trans, root);
3357 } 3400 }
3358 free_extent_map(em); 3401 free_extent_map(em);
@@ -3370,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3370 3413
3371static int btrfs_setsize(struct inode *inode, loff_t newsize) 3414static int btrfs_setsize(struct inode *inode, loff_t newsize)
3372{ 3415{
3416 struct btrfs_root *root = BTRFS_I(inode)->root;
3417 struct btrfs_trans_handle *trans;
3373 loff_t oldsize = i_size_read(inode); 3418 loff_t oldsize = i_size_read(inode);
3374 int ret; 3419 int ret;
3375 3420
@@ -3377,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3377 return 0; 3422 return 0;
3378 3423
3379 if (newsize > oldsize) { 3424 if (newsize > oldsize) {
3380 i_size_write(inode, newsize);
3381 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3382 truncate_pagecache(inode, oldsize, newsize); 3425 truncate_pagecache(inode, oldsize, newsize);
3383 ret = btrfs_cont_expand(inode, oldsize, newsize); 3426 ret = btrfs_cont_expand(inode, oldsize, newsize);
3384 if (ret) { 3427 if (ret)
3385 btrfs_setsize(inode, oldsize);
3386 return ret; 3428 return ret;
3387 }
3388 3429
3389 mark_inode_dirty(inode); 3430 trans = btrfs_start_transaction(root, 1);
3431 if (IS_ERR(trans))
3432 return PTR_ERR(trans);
3433
3434 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root);
3390 } else { 3438 } else {
3391 3439
3392 /* 3440 /*
@@ -3426,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3426 3474
3427 if (attr->ia_valid) { 3475 if (attr->ia_valid) {
3428 setattr_copy(inode, attr); 3476 setattr_copy(inode, attr);
3429 mark_inode_dirty(inode); 3477 err = btrfs_dirty_inode(inode);
3430 3478
3431 if (attr->ia_valid & ATTR_MODE) 3479 if (!err && attr->ia_valid & ATTR_MODE)
3432 err = btrfs_acl_chmod(inode); 3480 err = btrfs_acl_chmod(inode);
3433 } 3481 }
3434 3482
@@ -3490,7 +3538,7 @@ void btrfs_evict_inode(struct inode *inode)
3490 * doing the truncate. 3538 * doing the truncate.
3491 */ 3539 */
3492 while (1) { 3540 while (1) {
3493 ret = btrfs_block_rsv_refill(root, rsv, min_size); 3541 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3494 3542
3495 /* 3543 /*
3496 * Try and steal from the global reserve since we will 3544 * Try and steal from the global reserve since we will
@@ -4204,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4204 * FIXME, needs more benchmarking...there are no reasons other than performance 4252 * FIXME, needs more benchmarking...there are no reasons other than performance
4205 * to keep or drop this code. 4253 * to keep or drop this code.
4206 */ 4254 */
4207void btrfs_dirty_inode(struct inode *inode, int flags) 4255int btrfs_dirty_inode(struct inode *inode)
4208{ 4256{
4209 struct btrfs_root *root = BTRFS_I(inode)->root; 4257 struct btrfs_root *root = BTRFS_I(inode)->root;
4210 struct btrfs_trans_handle *trans; 4258 struct btrfs_trans_handle *trans;
4211 int ret; 4259 int ret;
4212 4260
4213 if (BTRFS_I(inode)->dummy_inode) 4261 if (BTRFS_I(inode)->dummy_inode)
4214 return; 4262 return 0;
4215 4263
4216 trans = btrfs_join_transaction(root); 4264 trans = btrfs_join_transaction(root);
4217 BUG_ON(IS_ERR(trans)); 4265 if (IS_ERR(trans))
4266 return PTR_ERR(trans);
4218 4267
4219 ret = btrfs_update_inode(trans, root, inode); 4268 ret = btrfs_update_inode(trans, root, inode);
4220 if (ret && ret == -ENOSPC) { 4269 if (ret && ret == -ENOSPC) {
4221 /* whoops, lets try again with the full transaction */ 4270 /* whoops, lets try again with the full transaction */
4222 btrfs_end_transaction(trans, root); 4271 btrfs_end_transaction(trans, root);
4223 trans = btrfs_start_transaction(root, 1); 4272 trans = btrfs_start_transaction(root, 1);
4224 if (IS_ERR(trans)) { 4273 if (IS_ERR(trans))
4225 printk_ratelimited(KERN_ERR "btrfs: fail to " 4274 return PTR_ERR(trans);
4226 "dirty inode %llu error %ld\n",
4227 (unsigned long long)btrfs_ino(inode),
4228 PTR_ERR(trans));
4229 return;
4230 }
4231 4275
4232 ret = btrfs_update_inode(trans, root, inode); 4276 ret = btrfs_update_inode(trans, root, inode);
4233 if (ret) {
4234 printk_ratelimited(KERN_ERR "btrfs: fail to "
4235 "dirty inode %llu error %d\n",
4236 (unsigned long long)btrfs_ino(inode),
4237 ret);
4238 }
4239 } 4277 }
4240 btrfs_end_transaction(trans, root); 4278 btrfs_end_transaction(trans, root);
4241 if (BTRFS_I(inode)->delayed_node) 4279 if (BTRFS_I(inode)->delayed_node)
4242 btrfs_balance_delayed_items(root); 4280 btrfs_balance_delayed_items(root);
4281
4282 return ret;
4283}
4284
4285/*
4286 * This is a copy of file_update_time. We need this so we can return error on
4287 * ENOSPC for updating the inode in the case of file write and mmap writes.
4288 */
4289int btrfs_update_time(struct file *file)
4290{
4291 struct inode *inode = file->f_path.dentry->d_inode;
4292 struct timespec now;
4293 int ret;
4294 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4295
4296 /* First try to exhaust all avenues to not sync */
4297 if (IS_NOCMTIME(inode))
4298 return 0;
4299
4300 now = current_fs_time(inode->i_sb);
4301 if (!timespec_equal(&inode->i_mtime, &now))
4302 sync_it = S_MTIME;
4303
4304 if (!timespec_equal(&inode->i_ctime, &now))
4305 sync_it |= S_CTIME;
4306
4307 if (IS_I_VERSION(inode))
4308 sync_it |= S_VERSION;
4309
4310 if (!sync_it)
4311 return 0;
4312
4313 /* Finally allowed to write? Takes lock. */
4314 if (mnt_want_write_file(file))
4315 return 0;
4316
4317 /* Only change inode inside the lock region */
4318 if (sync_it & S_VERSION)
4319 inode_inc_iversion(inode);
4320 if (sync_it & S_CTIME)
4321 inode->i_ctime = now;
4322 if (sync_it & S_MTIME)
4323 inode->i_mtime = now;
4324 ret = btrfs_dirty_inode(inode);
4325 if (!ret)
4326 mark_inode_dirty_sync(inode);
4327 mnt_drop_write(file->f_path.mnt);
4328 return ret;
4243} 4329}
4244 4330
4245/* 4331/*
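
The pattern these two functions implement: try the cheap path first (join the running transaction), and only on -ENOSPC fall back to btrfs_start_transaction(root, 1), which reserves metadata space and can succeed where the join-based update could not. Returning int lets callers such as btrfs_update_time() and btrfs_setattr() propagate a failure instead of silently losing a timestamp. A condensed sketch of the retry shape, using the btrfs calls exactly as they appear in the hunk (the helper name is illustrative):

/* Sketch of the join-then-reserve retry used by btrfs_dirty_inode(). */
static int update_inode_with_fallback(struct btrfs_root *root,
				      struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	trans = btrfs_join_transaction(root);	/* no reservation, cheap */
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_update_inode(trans, root, inode);
	if (ret == -ENOSPC) {
		/* retry with a transaction that reserves space for one item */
		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		ret = btrfs_update_inode(trans, root, inode);
	}
	btrfs_end_transaction(trans, root);

	return ret;
}
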
@@ -4555,11 +4641,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4555 goto out_unlock; 4641 goto out_unlock;
4556 } 4642 }
4557 4643
4644 /*
4645 * If the active LSM wants to access the inode during
4646 * d_instantiate it needs these. Smack checks to see
4647 * if the filesystem supports xattrs by looking at the
4648 * ops vector.
4649 */
4650
4651 inode->i_op = &btrfs_special_inode_operations;
4558 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4652 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4559 if (err) 4653 if (err)
4560 drop_inode = 1; 4654 drop_inode = 1;
4561 else { 4655 else {
4562 inode->i_op = &btrfs_special_inode_operations;
4563 init_special_inode(inode, inode->i_mode, rdev); 4656 init_special_inode(inode, inode->i_mode, rdev);
4564 btrfs_update_inode(trans, root, inode); 4657 btrfs_update_inode(trans, root, inode);
4565 } 4658 }
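
The reasoning in the added comment applies to mknod, create and symlink alike: Smack and other LSMs probe inode->i_op during d_instantiate() to decide whether the filesystem supports xattrs, so the ops vectors must be published before btrfs_add_nondir() instantiates the dentry, not in the success branch afterwards. A fragment-level sketch of that ordering; the wrapper function is hypothetical:

/* Sketch: the ops must be visible before d_instantiate() runs. */
static int new_inode_publish_ops(struct btrfs_trans_handle *trans,
				 struct inode *dir, struct dentry *dentry,
				 struct inode *inode, u64 index)
{
	/* set before instantiation so security_d_instantiate() sees them */
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	/* btrfs_add_nondir() ends up calling d_instantiate(dentry, inode) */
	return btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
}
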
@@ -4613,14 +4706,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4613 goto out_unlock; 4706 goto out_unlock;
4614 } 4707 }
4615 4708
4709 /*
4710 * If the active LSM wants to access the inode during
4711 * d_instantiate it needs these. Smack checks to see
4712 * if the filesystem supports xattrs by looking at the
4713 * ops vector.
4714 */
4715 inode->i_fop = &btrfs_file_operations;
4716 inode->i_op = &btrfs_file_inode_operations;
4717
4616 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4718 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4617 if (err) 4719 if (err)
4618 drop_inode = 1; 4720 drop_inode = 1;
4619 else { 4721 else {
4620 inode->i_mapping->a_ops = &btrfs_aops; 4722 inode->i_mapping->a_ops = &btrfs_aops;
4621 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4723 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4622 inode->i_fop = &btrfs_file_operations;
4623 inode->i_op = &btrfs_file_inode_operations;
4624 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4724 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4625 } 4725 }
4626out_unlock: 4726out_unlock:
@@ -6303,7 +6403,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6303 u64 page_start; 6403 u64 page_start;
6304 u64 page_end; 6404 u64 page_end;
6305 6405
6406 /* Need this to keep space reservations serialized */
6407 mutex_lock(&inode->i_mutex);
6306 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6408 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6409 mutex_unlock(&inode->i_mutex);
6410 if (!ret)
6411 ret = btrfs_update_time(vma->vm_file);
6307 if (ret) { 6412 if (ret) {
6308 if (ret == -ENOMEM) 6413 if (ret == -ENOMEM)
6309 ret = VM_FAULT_OOM; 6414 ret = VM_FAULT_OOM;
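
The comment "Need this to keep space reservations serialized" is the same reasoning behind the i_mutex additions in the defrag and relocation hunks further down: the delalloc reservation must not race with other reservers on the inode, so it is taken under i_mutex. The shape of that pattern, as a small illustrative helper:

/* Sketch: take the delalloc reservation under i_mutex (illustrative). */
static int reserve_page_space_locked(struct inode *inode)
{
	int ret;

	mutex_lock(&inode->i_mutex);
	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	mutex_unlock(&inode->i_mutex);

	return ret;
}
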
@@ -6515,8 +6620,9 @@ static int btrfs_truncate(struct inode *inode)
6515 /* Just need the 1 for updating the inode */ 6620 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1); 6621 trans = btrfs_start_transaction(root, 1);
6517 if (IS_ERR(trans)) { 6622 if (IS_ERR(trans)) {
6518 err = PTR_ERR(trans); 6623 ret = err = PTR_ERR(trans);
6519 goto out; 6624 trans = NULL;
6625 break;
6520 } 6626 }
6521 } 6627 }
6522 6628
@@ -6794,11 +6900,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6794 struct dentry *dentry, struct kstat *stat) 6900 struct dentry *dentry, struct kstat *stat)
6795{ 6901{
6796 struct inode *inode = dentry->d_inode; 6902 struct inode *inode = dentry->d_inode;
6903 u32 blocksize = inode->i_sb->s_blocksize;
6904
6797 generic_fillattr(inode, stat); 6905 generic_fillattr(inode, stat);
6798 stat->dev = BTRFS_I(inode)->root->anon_dev; 6906 stat->dev = BTRFS_I(inode)->root->anon_dev;
6799 stat->blksize = PAGE_CACHE_SIZE; 6907 stat->blksize = PAGE_CACHE_SIZE;
6800 stat->blocks = (inode_get_bytes(inode) + 6908 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6801 BTRFS_I(inode)->delalloc_bytes) >> 9; 6909 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6802 return 0; 6910 return 0;
6803} 6911}
6804 6912
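
The new st_blocks computation rounds both the on-disk byte count and the outstanding delalloc bytes up to the filesystem block size before converting to 512-byte sectors, so a 100-byte file on a 4 KiB-block filesystem reports 8 sectors rather than 0. A small worked sketch of that arithmetic; the helper name is illustrative:

#include <linux/kernel.h>

/* Sketch: stat blocks = bytes rounded up to the fs blocksize, in 512-byte units */
static u64 stat_blocks(u64 disk_bytes, u64 delalloc_bytes, u32 blocksize)
{
	return (ALIGN(disk_bytes, blocksize) +
		ALIGN(delalloc_bytes, blocksize)) >> 9;
}

/* e.g. stat_blocks(100, 0, 4096) == 4096 >> 9 == 8 */
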
@@ -7074,14 +7182,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7074 goto out_unlock; 7182 goto out_unlock;
7075 } 7183 }
7076 7184
7185 /*
7186 * If the active LSM wants to access the inode during
7187 * d_instantiate it needs these. Smack checks to see
7188 * if the filesystem supports xattrs by looking at the
7189 * ops vector.
7190 */
7191 inode->i_fop = &btrfs_file_operations;
7192 inode->i_op = &btrfs_file_inode_operations;
7193
7077 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7194 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7078 if (err) 7195 if (err)
7079 drop_inode = 1; 7196 drop_inode = 1;
7080 else { 7197 else {
7081 inode->i_mapping->a_ops = &btrfs_aops; 7198 inode->i_mapping->a_ops = &btrfs_aops;
7082 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7199 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7083 inode->i_fop = &btrfs_file_operations;
7084 inode->i_op = &btrfs_file_inode_operations;
7085 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7200 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7086 } 7201 }
7087 if (drop_inode) 7202 if (drop_inode)
@@ -7351,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7351 .follow_link = page_follow_link_light, 7466 .follow_link = page_follow_link_light,
7352 .put_link = page_put_link, 7467 .put_link = page_put_link,
7353 .getattr = btrfs_getattr, 7468 .getattr = btrfs_getattr,
7469 .setattr = btrfs_setattr,
7354 .permission = btrfs_permission, 7470 .permission = btrfs_permission,
7355 .setxattr = btrfs_setxattr, 7471 .setxattr = btrfs_setxattr,
7356 .getxattr = btrfs_getxattr, 7472 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4a34c472f126..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
252 trans = btrfs_join_transaction(root); 252 trans = btrfs_join_transaction(root);
253 BUG_ON(IS_ERR(trans)); 253 BUG_ON(IS_ERR(trans));
254 254
255 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME;
255 ret = btrfs_update_inode(trans, root, inode); 257 ret = btrfs_update_inode(trans, root, inode);
256 BUG_ON(ret); 258 BUG_ON(ret);
257 259
258 btrfs_update_iflags(inode);
259 inode->i_ctime = CURRENT_TIME;
260 btrfs_end_transaction(trans, root); 260 btrfs_end_transaction(trans, root);
261 261
262 mnt_drop_write(file->f_path.mnt); 262 mnt_drop_write(file->f_path.mnt);
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 858 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 860
861 mutex_lock(&inode->i_mutex);
861 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
862 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
863 if (ret) 865 if (ret)
864 return ret; 866 return ret;
865again: 867again:
@@ -1216,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1216 *devstr = '\0'; 1218 *devstr = '\0';
1217 devstr = vol_args->name; 1219 devstr = vol_args->name;
1218 devid = simple_strtoull(devstr, &end, 10); 1220 devid = simple_strtoull(devstr, &end, 10);
1219 printk(KERN_INFO "resizing devid %llu\n", 1221 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1220 (unsigned long long)devid); 1222 (unsigned long long)devid);
1221 } 1223 }
1222 device = btrfs_find_device(root, devid, NULL, NULL); 1224 device = btrfs_find_device(root, devid, NULL, NULL);
1223 if (!device) { 1225 if (!device) {
1224 printk(KERN_INFO "resizer unable to find device %llu\n", 1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1225 (unsigned long long)devid); 1227 (unsigned long long)devid);
1226 ret = -EINVAL; 1228 ret = -EINVAL;
1227 goto out_unlock; 1229 goto out_unlock;
@@ -1267,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1267 do_div(new_size, root->sectorsize); 1269 do_div(new_size, root->sectorsize);
1268 new_size *= root->sectorsize; 1270 new_size *= root->sectorsize;
1269 1271
1270 printk(KERN_INFO "new size for %s is %llu\n", 1272 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1271 device->name, (unsigned long long)new_size); 1273 device->name, (unsigned long long)new_size);
1272 1274
1273 if (new_size > old_size) { 1275 if (new_size > old_size) {
@@ -1278,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1278 } 1280 }
1279 ret = btrfs_grow_device(trans, device, new_size); 1281 ret = btrfs_grow_device(trans, device, new_size);
1280 btrfs_commit_transaction(trans, root); 1282 btrfs_commit_transaction(trans, root);
1281 } else { 1283 } else if (new_size < old_size) {
1282 ret = btrfs_shrink_device(device, new_size); 1284 ret = btrfs_shrink_device(device, new_size);
1283 } 1285 }
1284 1286
@@ -2930,11 +2932,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2930 goto out; 2932 goto out;
2931 2933
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 2934 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; 2935 rel_ptr = ipath->fspath->val[i] -
2936 (u64)(unsigned long)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr; 2937 ipath->fspath->val[i] = rel_ptr;
2935 } 2938 }
2936 2939
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); 2940 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2941 (void *)(unsigned long)ipath->fspath, size);
2938 if (ret) { 2942 if (ret) {
2939 ret = -EFAULT; 2943 ret = -EFAULT;
2940 goto out; 2944 goto out;
@@ -3017,7 +3021,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3017 if (ret < 0) 3021 if (ret < 0)
3018 goto out; 3022 goto out;
3019 3023
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); 3024 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3025 (void *)(unsigned long)inodes, size);
3021 if (ret) 3026 if (ret)
3022 ret = -EFAULT; 3027 ret = -EFAULT;
3023 3028
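
Both copy_to_user() fixes address the same 32-bit build problem: the ioctl structs carry user pointers in u64 fields, and casting a u64 straight to a pointer truncates (and warns) where pointers are 32 bits wide, so the value is narrowed through unsigned long first. A hedged sketch of the idiom with a made-up args struct:

#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_ioctl_args {
	__u64 buf;	/* user pointer carried as a 64-bit integer */
	__u64 size;
};

/* Sketch: go u64 -> unsigned long -> pointer so 32-bit builds stay clean. */
static int copy_result_to_user(struct demo_ioctl_args *args,
			       const void *kernel_buf, size_t len)
{
	void __user *dst = (void __user *)(unsigned long)args->buf;

	if (len > args->size)
		return -EOVERFLOW;
	if (copy_to_user(dst, kernel_buf, len))
		return -EFAULT;

	return 0;
}
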
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dff29d5e151a..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2950 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2951 if (ret) 2953 if (ret)
2952 goto out; 2954 goto out;
2953 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f4190f22edfb..ddf2c90d3fc0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
256 btrfs_release_path(swarn->path); 256 btrfs_release_path(swarn->path);
257 257
258 ipath = init_ipath(4096, local_root, swarn->path); 258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
259 ret = paths_from_inode(inum, ipath); 264 ret = paths_from_inode(inum, ipath);
260 265
261 if (ret < 0) 266 if (ret < 0)
@@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
272 swarn->logical, swarn->dev->name, 277 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset, 278 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink, 279 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]); 280 (char *)(unsigned long)ipath->fspath->val[i]);
276 281
277 free_ipath(ipath); 282 free_ipath(ipath);
278 return 0; 283 return 0;
@@ -1530,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1530static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 1535static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1531{ 1536{
1532 struct btrfs_fs_info *fs_info = root->fs_info; 1537 struct btrfs_fs_info *fs_info = root->fs_info;
1538 int ret = 0;
1533 1539
1534 mutex_lock(&fs_info->scrub_lock); 1540 mutex_lock(&fs_info->scrub_lock);
1535 if (fs_info->scrub_workers_refcnt == 0) { 1541 if (fs_info->scrub_workers_refcnt == 0) {
1536 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1537 fs_info->thread_pool_size, &fs_info->generic_worker); 1543 fs_info->thread_pool_size, &fs_info->generic_worker);
1538 fs_info->scrub_workers.idle_thresh = 4; 1544 fs_info->scrub_workers.idle_thresh = 4;
1539 btrfs_start_workers(&fs_info->scrub_workers, 1); 1545 ret = btrfs_start_workers(&fs_info->scrub_workers);
1546 if (ret)
1547 goto out;
1540 } 1548 }
1541 ++fs_info->scrub_workers_refcnt; 1549 ++fs_info->scrub_workers_refcnt;
1550out:
1542 mutex_unlock(&fs_info->scrub_lock); 1551 mutex_unlock(&fs_info->scrub_lock);
1543 1552
1544 return 0; 1553 return ret;
1545} 1554}
1546 1555
1547static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 1556static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8bd9d6d0e07a..200f63bc6675 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h> 43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
@@ -825,13 +826,9 @@ static char *setup_root_args(char *args)
825static struct dentry *mount_subvol(const char *subvol_name, int flags, 826static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data) 827 const char *device_name, char *data)
827{ 828{
828 struct super_block *s;
829 struct dentry *root; 829 struct dentry *root;
830 struct vfsmount *mnt; 830 struct vfsmount *mnt;
831 struct mnt_namespace *ns_private;
832 char *newargs; 831 char *newargs;
833 struct path path;
834 int error;
835 832
836 newargs = setup_root_args(data); 833 newargs = setup_root_args(data);
837 if (!newargs) 834 if (!newargs)
@@ -842,39 +839,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
842 if (IS_ERR(mnt)) 839 if (IS_ERR(mnt))
843 return ERR_CAST(mnt); 840 return ERR_CAST(mnt);
844 841
845 ns_private = create_mnt_ns(mnt); 842 root = mount_subtree(mnt, subvol_name);
846 if (IS_ERR(ns_private)) {
847 mntput(mnt);
848 return ERR_CAST(ns_private);
849 }
850 843
851 /* 844 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
852 * This will trigger the automount of the subvol so we can just 845 struct super_block *s = root->d_sb;
853 * drop the mnt we have here and return the dentry that we 846 dput(root);
854 * found. 847 root = ERR_PTR(-EINVAL);
855 */ 848 deactivate_locked_super(s);
856 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
857 LOOKUP_FOLLOW, &path);
858 put_mnt_ns(ns_private);
859 if (error)
860 return ERR_PTR(error);
861
862 if (!is_subvolume_inode(path.dentry->d_inode)) {
863 path_put(&path);
864 mntput(mnt);
865 error = -EINVAL;
866 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", 849 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
867 subvol_name); 850 subvol_name);
868 return ERR_PTR(-EINVAL);
869 } 851 }
870 852
871 /* Get a ref to the sb and the dentry we found and return it */
872 s = path.mnt->mnt_sb;
873 atomic_inc(&s->s_active);
874 root = dget(path.dentry);
875 path_put(&path);
876 down_write(&s->s_umount);
877
878 return root; 853 return root;
879} 854}
880 855
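
mount_subtree(), added to the VFS around this time for exactly this case, takes over everything the removed code did by hand: it builds a private mount namespace, walks subvol_name from the vfsmount's root so the subvolume automount triggers, takes an active reference on the superblock with s_umount held, and hands back the dentry. The caller only has to verify the result really is a subvolume root. A sketch of that calling convention as the hunk uses it (the wrapper name is made up):

/* Sketch of the mount_subtree() calling convention used above. */
static struct dentry *get_subvol_dentry(struct vfsmount *mnt,
					const char *subvol_name)
{
	struct dentry *root = mount_subtree(mnt, subvol_name);

	/* mount_subtree() consumed the mnt reference either way */
	if (IS_ERR(root))
		return root;

	if (!is_subvolume_inode(root->d_inode)) {
		struct super_block *s = root->d_sb;

		dput(root);
		/* drops the active ref and the s_umount hold we were handed */
		deactivate_locked_super(s);
		return ERR_PTR(-EINVAL);
	}
	return root;
}
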
@@ -1079,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1079 u64 avail_space; 1054 u64 avail_space;
1080 u64 used_space; 1055 u64 used_space;
1081 u64 min_stripe_size; 1056 u64 min_stripe_size;
1082 int min_stripes = 1; 1057 int min_stripes = 1, num_stripes = 1;
1083 int i = 0, nr_devices; 1058 int i = 0, nr_devices;
1084 int ret; 1059 int ret;
1085 1060
1086 nr_devices = fs_info->fs_devices->rw_devices; 1061 nr_devices = fs_info->fs_devices->open_devices;
1087 BUG_ON(!nr_devices); 1062 BUG_ON(!nr_devices);
1088 1063
1089 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1064 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1093,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1093 1068
1094 /* calc min stripe number for data space alloction */ 1069 /* calc min stripe number for data space alloction */
1095 type = btrfs_get_alloc_profile(root, 1); 1070 type = btrfs_get_alloc_profile(root, 1);
1096 if (type & BTRFS_BLOCK_GROUP_RAID0) 1071 if (type & BTRFS_BLOCK_GROUP_RAID0) {
1097 min_stripes = 2; 1072 min_stripes = 2;
1098 else if (type & BTRFS_BLOCK_GROUP_RAID1) 1073 num_stripes = nr_devices;
1074 } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
1099 min_stripes = 2; 1075 min_stripes = 2;
1100 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1076 num_stripes = 2;
1077 } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
1101 min_stripes = 4; 1078 min_stripes = 4;
1079 num_stripes = 4;
1080 }
1102 1081
1103 if (type & BTRFS_BLOCK_GROUP_DUP) 1082 if (type & BTRFS_BLOCK_GROUP_DUP)
1104 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1083 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1105 else 1084 else
1106 min_stripe_size = BTRFS_STRIPE_LEN; 1085 min_stripe_size = BTRFS_STRIPE_LEN;
1107 1086
1108 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1087 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1109 if (!device->in_fs_metadata) 1088 if (!device->in_fs_metadata || !device->bdev)
1110 continue; 1089 continue;
1111 1090
1112 avail_space = device->total_bytes - device->bytes_used; 1091 avail_space = device->total_bytes - device->bytes_used;
@@ -1167,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1167 i = nr_devices - 1; 1146 i = nr_devices - 1;
1168 avail_space = 0; 1147 avail_space = 0;
1169 while (nr_devices >= min_stripes) { 1148 while (nr_devices >= min_stripes) {
1149 if (num_stripes > nr_devices)
1150 num_stripes = nr_devices;
1151
1170 if (devices_info[i].max_avail >= min_stripe_size) { 1152 if (devices_info[i].max_avail >= min_stripe_size) {
1171 int j; 1153 int j;
1172 u64 alloc_size; 1154 u64 alloc_size;
1173 1155
1174 avail_space += devices_info[i].max_avail * min_stripes; 1156 avail_space += devices_info[i].max_avail * num_stripes;
1175 alloc_size = devices_info[i].max_avail; 1157 alloc_size = devices_info[i].max_avail;
1176 for (j = i + 1 - min_stripes; j <= i; j++) 1158 for (j = i + 1 - num_stripes; j <= i; j++)
1177 devices_info[j].max_avail -= alloc_size; 1159 devices_info[j].max_avail -= alloc_size;
1178 } 1160 }
1179 i--; 1161 i--;
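
The statfs estimate now distinguishes the minimum stripes a profile needs (min_stripes) from the stripes an allocation would actually use (num_stripes): RAID0 stripes across every usable device, RAID1 always writes two copies, RAID10 four, so per-device max_avail must be multiplied by num_stripes, not min_stripes, when summing free space. A sketch of the profile-to-stripe mapping under the same assumptions as the hunk:

/* Sketch: map an allocation profile to stripe counts (illustrative). */
static void profile_stripe_counts(u64 type, int nr_devices,
				  int *min_stripes, int *num_stripes)
{
	*min_stripes = 1;
	*num_stripes = 1;

	if (type & BTRFS_BLOCK_GROUP_RAID0) {
		*min_stripes = 2;
		*num_stripes = nr_devices;	/* striped across everything */
	} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
		*min_stripes = 2;
		*num_stripes = 2;		/* two mirrored copies */
	} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
		*min_stripes = 4;
		*num_stripes = 4;
	}
}
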
@@ -1290,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb)
1290 return 0; 1272 return 0;
1291} 1273}
1292 1274
1275static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1276{
1277 int ret;
1278
1279 ret = btrfs_dirty_inode(inode);
1280 if (ret)
1281 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1282 "error %d\n", btrfs_ino(inode), ret);
1283}
1284
1293static const struct super_operations btrfs_super_ops = { 1285static const struct super_operations btrfs_super_ops = {
1294 .drop_inode = btrfs_drop_inode, 1286 .drop_inode = btrfs_drop_inode,
1295 .evict_inode = btrfs_evict_inode, 1287 .evict_inode = btrfs_evict_inode,
@@ -1297,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = {
1297 .sync_fs = btrfs_sync_fs, 1289 .sync_fs = btrfs_sync_fs,
1298 .show_options = btrfs_show_options, 1290 .show_options = btrfs_show_options,
1299 .write_inode = btrfs_write_inode, 1291 .write_inode = btrfs_write_inode,
1300 .dirty_inode = btrfs_dirty_inode, 1292 .dirty_inode = btrfs_fs_dirty_inode,
1301 .alloc_inode = btrfs_alloc_inode, 1293 .alloc_inode = btrfs_alloc_inode,
1302 .destroy_inode = btrfs_destroy_inode, 1294 .destroy_inode = btrfs_destroy_inode,
1303 .statfs = btrfs_statfs, 1295 .statfs = btrfs_statfs,
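
Because ->dirty_inode in struct super_operations returns void, the now error-returning btrfs_dirty_inode() can no longer be wired up directly; the btrfs_fs_dirty_inode() wrapper keeps the VFS callback signature and reduces a failure to a rate-limited log line, while paths that can act on the error (write, mmap fault, setattr) call btrfs_dirty_inode() themselves. The wrapper shape, condensed into an illustrative sketch:

/* Sketch: adapt an int-returning helper to the void ->dirty_inode hook. */
static void demo_fs_dirty_inode(struct inode *inode, int flags)
{
	int ret = btrfs_dirty_inode(inode);

	if (ret)
		printk_ratelimited(KERN_ERR
			"btrfs: fail to dirty inode %llu error %d\n",
			(unsigned long long)btrfs_ino(inode), ret);
}
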
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6a0574e923bc..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
785 785
786 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
787 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
788 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
789 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
790 switch_commit_root(root); 794 switch_commit_root(root);
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
947 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
948 free_extent_buffer(old); 952 free_extent_buffer(old);
949 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
950 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
951 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
952 key.offset = trans->transid; 960 key.offset = trans->transid;
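
Both hunks pair a plain store to root->force_cow with smp_wmb(): snapshot creation raises the flag once the snapshot root has been written so that should_cow_block() (which pairs this with a read barrier before checking the flag) forces COW of blocks the snapshot may still share, and commit clears it again once the roots are committed. A fragment-level sketch of that publish/observe pairing, mirroring the ordering in the hunks; the reader helper is hypothetical, the real check lives in should_cow_block():

/* Writer side (sketch): store the flag, then fence before later stores. */
static void publish_force_cow(struct btrfs_root *root, int val)
{
	root->force_cow = val;
	smp_wmb();
}

/* Reader side (sketch): pair with a read barrier before looking at it. */
static int observe_force_cow(struct btrfs_root *root)
{
	smp_rmb();
	return root->force_cow;
}
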
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c37433d3cd82..f4b839fd3c9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
295 btrfs_requeue_work(&device->work); 295 btrfs_requeue_work(&device->work);
296 goto done; 296 goto done;
297 } 297 }
298 /* unplug every 64 requests just for good measure */
299 if (batch_run % 64 == 0) {
300 blk_finish_plug(&plug);
301 blk_start_plug(&plug);
302 sync_pending = 0;
303 }
298 } 304 }
299 305
300 cond_resched(); 306 cond_resched();
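
The added block bounds how much I/O a single plug accumulates on the worker thread: every 64 submissions the plug is flushed and re-armed, so requests actually reach the device instead of piling up behind one very long submission loop. The plug lifecycle it relies on, in sketch form with a hypothetical pending list:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Sketch: batch submissions under a plug, flushing every 64 requests. */
static void submit_batched(struct bio_list *pending)
{
	struct blk_plug plug;
	struct bio *bio;
	int batch_run = 0;

	blk_start_plug(&plug);
	while ((bio = bio_list_pop(pending))) {
		submit_bio(bio->bi_rw, bio);
		batch_run++;

		/* unplug periodically so the device sees steady work */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
		}
	}
	blk_finish_plug(&plug);
}
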
@@ -1611,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1611 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1617 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1612 return -EINVAL; 1618 return -EINVAL;
1613 1619
1614 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1620 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1615 root->fs_info->bdev_holder); 1621 root->fs_info->bdev_holder);
1616 if (IS_ERR(bdev)) 1622 if (IS_ERR(bdev))
1617 return PTR_ERR(bdev); 1623 return PTR_ERR(bdev);
@@ -3258,7 +3264,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
3258 */ 3264 */
3259 if (atomic_read(&bbio->error) > bbio->max_errors) { 3265 if (atomic_read(&bbio->error) > bbio->max_errors) {
3260 err = -EIO; 3266 err = -EIO;
3261 } else if (err) { 3267 } else {
3262 /* 3268 /*
3263 * this bio is actually up to date, we didn't 3269 * this bio is actually up to date, we didn't
3264 * go over the max number of errors 3270 * go over the max number of errors
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f352..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
100 struct reada_zone *reada_curr_zone; 100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones; 101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents; 102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
103}; 109};
104 110
105struct btrfs_fs_devices { 111struct btrfs_fs_devices {
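
The three new btrfs_device fields carry the state for per-device flush barriers: flush_bio is the bio submitted with the flush flag, flush_wait is completed from its end_io handler so the writer can wait on it, and nobarriers remembers devices whose queue rejected the flush so it is not retried. A hedged sketch of the send-and-wait pattern under 3.2-era block APIs; the helper names are made up and the real code keeps the bio preallocated in these fields rather than allocating one per call:

/* Sketch: issue an empty flush bio to one device and wait for it. */
static void flush_done(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

static int issue_device_flush(struct btrfs_device *device)
{
	struct completion done;
	struct bio *bio;

	if (device->nobarriers)
		return 0;	/* this device already told us flushes are no-ops */

	bio = bio_alloc(GFP_NOFS, 0);
	if (!bio)
		return -ENOMEM;

	init_completion(&done);
	bio->bi_bdev = device->bdev;
	bio->bi_private = &done;
	bio->bi_end_io = flush_done;

	submit_bio(WRITE_FLUSH, bio);
	wait_for_completion(&done);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		device->nobarriers = 1;	/* don't bother next time */

	bio_put(bio);
	return 0;
}
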