author		Dave Airlie <airlied@redhat.com>	2011-12-20 09:43:53 -0500
committer	Dave Airlie <airlied@redhat.com>	2011-12-20 09:43:53 -0500
commit		1fbe6f625f69e48c4001051dc1431afc704acfaa (patch)
tree		826b741201a2e09a627ed350c6ff36935f5cff79 /fs/btrfs
parent		0cecdd818cd79d092e36e70dfe3a71f2878d6b96 (diff)
parent		384703b8e6cd4c8ef08512e596024e028c91c339 (diff)
Merge tag 'v3.2-rc6' of /home/airlied/devel/kernel/linux-2.6 into drm-core-next
Merge in the upstream tree to bring in the mainline fixes.

Conflicts:
	drivers/gpu/drm/exynos/exynos_drm_fbdev.c
	drivers/gpu/drm/nouveau/nouveau_sgdma.c
Diffstat (limited to 'fs/btrfs')
 fs/btrfs/async-thread.c     | 117
 fs/btrfs/async-thread.h     |   4
 fs/btrfs/backref.c          |   2
 fs/btrfs/btrfs_inode.h      |   4
 fs/btrfs/ctree.c            |  17
 fs/btrfs/ctree.h            |  11
 fs/btrfs/delayed-inode.c    |  62
 fs/btrfs/disk-io.c          | 223
 fs/btrfs/extent-tree.c      | 336
 fs/btrfs/extent_io.c        |  60
 fs/btrfs/extent_io.h        |   2
 fs/btrfs/file.c             |   8
 fs/btrfs/free-space-cache.c |  82
 fs/btrfs/inode-map.c        |  28
 fs/btrfs/inode.c            | 272
 fs/btrfs/ioctl.c            |  23
 fs/btrfs/relocation.c       |   4
 fs/btrfs/scrub.c            |  79
 fs/btrfs/super.c            | 125
 fs/btrfs/transaction.c      |  12
 fs/btrfs/volumes.c          |  15
 fs/btrfs/volumes.h          |   6
 22 files changed, 987 insertions(+), 505 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..cb97174e2366 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
 	int idle;
 };
 
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
 /*
  * btrfs_start_workers uses kthread_run, which can block waiting for memory
  * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
 {
 	struct worker_start *start;
 	start = container_of(work, struct worker_start, work);
-	btrfs_start_workers(start->queue, 1);
+	__btrfs_start_workers(start->queue);
 	kfree(start);
 }
 
-static int start_new_worker(struct btrfs_workers *queue)
-{
-	struct worker_start *start;
-	int ret;
-
-	start = kzalloc(sizeof(*start), GFP_NOFS);
-	if (!start)
-		return -ENOMEM;
-
-	start->work.func = start_new_worker_func;
-	start->queue = queue;
-	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
-	if (ret)
-		kfree(start);
-	return ret;
-}
-
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
 	struct btrfs_workers *workers = worker->workers;
+	struct worker_start *start;
 	unsigned long flags;
 
 	rmb();
 	if (!workers->atomic_start_pending)
 		return;
 
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return;
+
+	start->work.func = start_new_worker_func;
+	start->queue = workers;
+
 	spin_lock_irqsave(&workers->lock, flags);
 	if (!workers->atomic_start_pending)
 		goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 
 	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	start_new_worker(workers);
+	btrfs_queue_worker(workers->atomic_worker_start, &start->work);
 	return;
 
 out:
+	kfree(start);
 	spin_unlock_irqrestore(&workers->lock, flags);
 }
 
@@ -331,7 +325,7 @@ again:
 			run_ordered_completions(worker->workers, work);
 
 			check_pending_worker_creates(worker);
-
+			cond_resched();
 		}
 
 		spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
  * starts new worker threads. This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-static int __btrfs_start_workers(struct btrfs_workers *workers,
-				 int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
-	int i;
 
-	for (i = 0; i < num_workers; i++) {
-		worker = kzalloc(sizeof(*worker), GFP_NOFS);
-		if (!worker) {
-			ret = -ENOMEM;
-			goto fail;
-		}
+	worker = kzalloc(sizeof(*worker), GFP_NOFS);
+	if (!worker) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 
-		INIT_LIST_HEAD(&worker->pending);
-		INIT_LIST_HEAD(&worker->prio_pending);
-		INIT_LIST_HEAD(&worker->worker_list);
-		spin_lock_init(&worker->lock);
+	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
+	INIT_LIST_HEAD(&worker->worker_list);
+	spin_lock_init(&worker->lock);
 
-		atomic_set(&worker->num_pending, 0);
-		atomic_set(&worker->refs, 1);
-		worker->workers = workers;
-		worker->task = kthread_run(worker_loop, worker,
-					   "btrfs-%s-%d", workers->name,
-					   workers->num_workers + i);
-		if (IS_ERR(worker->task)) {
-			ret = PTR_ERR(worker->task);
-			kfree(worker);
-			goto fail;
-		}
-		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->idle_list);
-		worker->idle = 1;
-		workers->num_workers++;
-		workers->num_workers_starting--;
-		WARN_ON(workers->num_workers_starting < 0);
-		spin_unlock_irq(&workers->lock);
+	atomic_set(&worker->num_pending, 0);
+	atomic_set(&worker->refs, 1);
+	worker->workers = workers;
+	worker->task = kthread_run(worker_loop, worker,
+				   "btrfs-%s-%d", workers->name,
+				   workers->num_workers + 1);
+	if (IS_ERR(worker->task)) {
+		ret = PTR_ERR(worker->task);
+		kfree(worker);
+		goto fail;
 	}
+	spin_lock_irq(&workers->lock);
+	list_add_tail(&worker->worker_list, &workers->idle_list);
+	worker->idle = 1;
+	workers->num_workers++;
+	workers->num_workers_starting--;
+	WARN_ON(workers->num_workers_starting < 0);
+	spin_unlock_irq(&workers->lock);
+
 	return 0;
 fail:
-	btrfs_stop_workers(workers);
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting--;
+	spin_unlock_irq(&workers->lock);
 	return ret;
 }
 
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
 {
 	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting += num_workers;
+	workers->num_workers_starting++;
 	spin_unlock_irq(&workers->lock);
-	return __btrfs_start_workers(workers, num_workers);
+	return __btrfs_start_workers(workers);
 }
 
 /*
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
 	struct list_head *fallback;
+	int ret;
 
 again:
 	spin_lock_irqsave(&workers->lock, flags);
@@ -584,7 +578,9 @@ again:
 			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			__btrfs_start_workers(workers, 1);
+			ret = __btrfs_start_workers(workers);
+			if (ret)
+				goto fallback;
 			goto again;
 		}
 	}
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	/* don't requeue something already on a list */
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		goto out;
+		return;
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	if (wake)
 		wake_up_process(worker->task);
 	spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
-	return 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
 	char *name;
 };
 
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
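For orientation, a minimal caller-side sketch of the reworked worker API (not part of the commit): btrfs_start_workers() now starts exactly one thread per call and can fail, while btrfs_queue_worker() can no longer fail and returns void. The function and parameter names below are hypothetical; the real usage is the open_ctree() hunk in disk-io.c later in this diff.

```c
/* Illustrative sketch only, assuming the prototypes declared above. */
static int setup_and_queue(struct btrfs_workers *pool, struct btrfs_work *work)
{
	int ret;

	/* btrfs_start_workers() starts one thread and may return -ENOMEM */
	ret = btrfs_start_workers(pool);
	if (ret)
		return ret;

	/* queueing itself cannot fail any more, so nothing to check here */
	btrfs_queue_worker(pool, work);
	return 0;
}
```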
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929c..22c64fff1bd5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		ipath->fspath->val[i] = (u64)fspath;
+		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a3935..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
 	BTRFS_CACHE_NO		= 0,
 	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FINISHED	= 2,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
 };
 
 enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 int btrfs_block_rsv_refill(struct btrfs_root *root,
 			   struct btrfs_block_rsv *block_rsv,
 			   u64 min_reserved);
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+				   struct btrfs_block_rsv *block_rsv,
+				   u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
@@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode, int flags);
+int btrfs_dirty_inode(struct inode *inode);
+int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 3a1b939c9ae2..9c1eccc2c503 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 static int btrfs_delayed_inode_reserve_metadata(
 					struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
+					struct inode *inode,
 					struct btrfs_delayed_node *node)
 {
 	struct btrfs_block_rsv *src_rsv;
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
+	int release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -638,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
 	 * we're accounted for.
 	 */
-	if (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+	if (!src_rsv || (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
 		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
@@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata(
 		if (!ret)
 			node->bytes_reserved = num_bytes;
 		return ret;
+	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (BTRFS_I(inode)->delalloc_meta_reserved) {
+			BTRFS_I(inode)->delalloc_meta_reserved = 0;
+			spin_unlock(&BTRFS_I(inode)->lock);
+			release = true;
+			goto migrate;
+		}
+		spin_unlock(&BTRFS_I(inode)->lock);
+
+		/* Ok we didn't have space pre-reserved. This shouldn't happen
+		 * too often but it can happen if we do delalloc to an existing
+		 * inode which gets dirtied because of the time update, and then
+		 * isn't touched again until after the transaction commits and
+		 * then we try to write out the data. First try to be nice and
+		 * reserve something strictly for us. If not be a pain and try
+		 * to steal from the delalloc block rsv.
+		 */
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		/*
+		 * Ok this is a problem, let's just steal from the global rsv
+		 * since this really shouldn't happen that often.
+		 */
+		WARN_ON(1);
+		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+					      dst_rsv, num_bytes);
+		goto out;
 	}
 
+migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+	/*
+	 * Migrate only takes a reservation, it doesn't touch the size of the
+	 * block_rsv. This is to simplify people who don't normally have things
+	 * migrated from their block rsv. If they go to release their
+	 * reservation, that will decrease the size as well, so if migrate
+	 * reduced size we'd end up with a negative size. But for the
+	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+	 * but we could in fact do this reserve/migrate dance several times
+	 * between the time we did the original reservation and we'd clean it
+	 * up. So to take care of this, release the space for the meta
+	 * reservation here. I think it may be time for a documentation page on
+	 * how block rsvs. work.
+	 */
 	if (!ret)
 		node->bytes_reserved = num_bytes;
 
+	if (release)
+		btrfs_block_rsv_release(root, src_rsv, num_bytes);
+
 	return ret;
 }
 
@@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 		goto release_node;
 	}
 
-	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+						   delayed_node);
 	if (ret)
 		goto release_node;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 102c176fc29c..f44b3928dc2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
 
 static int btree_io_failed_hook(struct bio *failed_bio,
 				struct page *page, u64 start, u64 end,
-				u64 mirror_num, struct extent_state *state)
+				int mirror_num, struct extent_state *state)
 {
 	struct extent_io_tree *tree;
 	unsigned long len;
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u64 features;
 	struct btrfs_key location;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
-	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
-					       GFP_NOFS);
+	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = NULL;
-	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
-	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
-					      GFP_NOFS);
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_root *extent_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct btrfs_root *log_tree_root;
-
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
-	struct btrfs_super_block *disk_super;
+	extent_root = fs_info->extent_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	csum_root = fs_info->csum_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	chunk_root = fs_info->chunk_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	dev_root = fs_info->dev_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 
-	if (!extent_root || !tree_root || !tree_root->fs_info ||
-	    !chunk_root || !dev_root || !csum_root) {
+	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	fs_info = tree_root->fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
-	fs_info->tree_root = tree_root;
-	fs_info->extent_root = extent_root;
-	fs_info->csum_root = csum_root;
-	fs_info->chunk_root = chunk_root;
-	fs_info->dev_root = dev_root;
-	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
@@ -2199,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
 	fs_info->readahead_workers.idle_thresh = 2;
 
-	btrfs_start_workers(&fs_info->workers, 1);
-	btrfs_start_workers(&fs_info->generic_worker, 1);
-	btrfs_start_workers(&fs_info->submit_workers, 1);
-	btrfs_start_workers(&fs_info->delalloc_workers, 1);
-	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
-	btrfs_start_workers(&fs_info->delayed_workers, 1);
-	btrfs_start_workers(&fs_info->caching_workers, 1);
-	btrfs_start_workers(&fs_info->readahead_workers, 1);
+	/*
+	 * btrfs_start_workers can really only fail because of ENOMEM so just
+	 * return -ENOMEM if any of these fail.
+	 */
+	ret = btrfs_start_workers(&fs_info->workers);
+	ret |= btrfs_start_workers(&fs_info->generic_worker);
+	ret |= btrfs_start_workers(&fs_info->submit_workers);
+	ret |= btrfs_start_workers(&fs_info->delalloc_workers);
+	ret |= btrfs_start_workers(&fs_info->fixup_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
+	ret |= btrfs_start_workers(&fs_info->delayed_workers);
+	ret |= btrfs_start_workers(&fs_info->caching_workers);
+	ret |= btrfs_start_workers(&fs_info->readahead_workers);
+	if (ret) {
+		ret = -ENOMEM;
+		goto fail_sb_buffer;
+	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2465,21 +2468,20 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
 fail_iput:
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-
-	btrfs_close_devices(fs_info->fs_devices);
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_close_devices(fs_info->fs_devices);
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
 
 recovery_tree_root:
-
 	if (!btrfs_test_opt(tree_root, RECOVERY))
 		goto fail_tree_roots;
 
@@ -2579,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
 	int errors = 0;
 	u32 crc;
 	u64 bytenr;
-	int last_barrier = 0;
 
 	if (max_mirrors == 0)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
-	/* make sure only the last submit_bh does a barrier */
-	if (do_barriers) {
-		for (i = 0; i < max_mirrors; i++) {
-			bytenr = btrfs_sb_offset(i);
-			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-			    device->total_bytes)
-				break;
-			last_barrier = i;
-		}
-	}
-
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2640,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers)
-			ret = submit_bh(WRITE_FLUSH_FUA, bh);
-		else
-			ret = submit_bh(WRITE_SYNC, bh);
-
+		/*
+		 * we fua the first super. The others we allow
+		 * to go down lazy.
+		 */
+		ret = submit_bh(WRITE_FUA, bh);
 		if (ret)
 			errors++;
 	}
 	return errors < i ? 0 : -1;
 }
 
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	if (device->nobarriers)
+		return 0;
+
+	if (wait) {
+		bio = device->flush_bio;
+		if (!bio)
+			return 0;
+
+		wait_for_completion(&device->flush_wait);
+
+		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+			printk("btrfs: disabling barriers on dev %s\n",
+			       device->name);
+			device->nobarriers = 1;
+		}
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
+			ret = -EIO;
+		}
+
+		/* drop the reference from the wait == 0 run */
+		bio_put(bio);
+		device->flush_bio = NULL;
+
+		return ret;
+	}
+
+	/*
+	 * one reference for us, and we leave it for the
+	 * caller
+	 */
+	device->flush_bio = NULL;;
+	bio = bio_alloc(GFP_NOFS, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = btrfs_end_empty_barrier;
+	bio->bi_bdev = device->bdev;
+	init_completion(&device->flush_wait);
+	bio->bi_private = &device->flush_wait;
+	device->flush_bio = bio;
+
+	bio_get(bio);
+	submit_bio(WRITE_FLUSH, bio);
+
+	return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+	struct list_head *head;
+	struct btrfs_device *dev;
+	int errors = 0;
+	int ret;
+
+	/* send down all the barriers */
+	head = &info->fs_devices->devices;
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 0);
+		if (ret)
+			errors++;
+	}
+
+	/* wait for all the barriers */
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 1);
+		if (ret)
+			errors++;
+	}
+	if (errors)
+		return -EIO;
+	return 0;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2672,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
+
+	if (do_barriers)
+		barrier_all_devices(root->fs_info);
+
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
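For context, a condensed sketch (not part of the commit) of the ordering the disk-io.c hunks above establish: every writeable device first receives an empty WRITE_FLUSH bio, all of those flushes are waited on, and only then are the super block copies submitted, with the first copy going down via WRITE_FUA. The helper name below is hypothetical; the real logic lives in barrier_all_devices() and write_dev_supers() shown above.

```c
/* Illustrative sketch only, using the helpers introduced in this diff. */
static int flush_devices_then_write_supers(struct btrfs_fs_info *info,
					   int max_mirrors)
{
	int ret;

	/* pass 1 + pass 2: send an empty flush per device, then wait for all */
	ret = barrier_all_devices(info);
	if (ret)
		return ret;	/* -EIO if any device failed its flush */

	/*
	 * Supers are written afterwards; write_dev_supers() submits the first
	 * copy with WRITE_FUA and lets the remaining mirrors go down lazily.
	 */
	return 0;
}
```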
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9879bd474632..f5fbe576d2ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			     struct btrfs_root *root,
 			     int load_cache_only)
 {
+	DEFINE_WAIT(wait);
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
 	int ret = 0;
 
-	smp_mb();
-	if (cache->cached != BTRFS_CACHE_NO)
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+	BUG_ON(!caching_ctl);
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	atomic_set(&caching_ctl->count, 1);
+	caching_ctl->work.func = caching_thread;
+
+	spin_lock(&cache->lock);
+	/*
+	 * This should be a rare occasion, but this could happen I think in the
+	 * case where one thread starts to load the space cache info, and then
+	 * some other thread starts a transaction commit which tries to do an
+	 * allocation while the other thread is still loading the space cache
+	 * info. The previous loop should have kept us from choosing this block
+	 * group, but if we've moved to the state where we will wait on caching
+	 * block groups we need to first check if we're doing a fast load here,
+	 * so we can wait for it to finish, otherwise we could end up allocating
+	 * from a block group who's cache gets evicted for one reason or
+	 * another.
+	 */
+	while (cache->cached == BTRFS_CACHE_FAST) {
+		struct btrfs_caching_control *ctl;
+
+		ctl = cache->caching_ctl;
+		atomic_inc(&ctl->count);
+		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&cache->lock);
+
+		schedule();
+
+		finish_wait(&ctl->wait, &wait);
+		put_caching_control(ctl);
+		spin_lock(&cache->lock);
+	}
+
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		kfree(caching_ctl);
 		return 0;
+	}
+	WARN_ON(cache->caching_ctl);
+	cache->caching_ctl = caching_ctl;
+	cache->cached = BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
 
 	/*
 	 * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	if (trans && (!trans->transaction->in_commit) &&
 	    (root && root != root->fs_info->tree_root) &&
 	    btrfs_test_opt(root, SPACE_CACHE)) {
-		spin_lock(&cache->lock);
-		if (cache->cached != BTRFS_CACHE_NO) {
-			spin_unlock(&cache->lock);
-			return 0;
-		}
-		cache->cached = BTRFS_CACHE_STARTED;
-		spin_unlock(&cache->lock);
-
 		ret = load_free_space_cache(fs_info, cache);
 
 		spin_lock(&cache->lock);
 		if (ret == 1) {
+			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			cache->last_byte_to_unpin = (u64)-1;
 		} else {
-			cache->cached = BTRFS_CACHE_NO;
+			if (load_cache_only) {
+				cache->caching_ctl = NULL;
+				cache->cached = BTRFS_CACHE_NO;
+			} else {
+				cache->cached = BTRFS_CACHE_STARTED;
+			}
 		}
 		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
 		if (ret == 1) {
+			put_caching_control(caching_ctl);
 			free_excluded_extents(fs_info->extent_root, cache);
 			return 0;
 		}
+	} else {
+		/*
+		 * We are not going to do the fast caching, set cached to the
+		 * appropriate value and wakeup any waiters.
+		 */
+		spin_lock(&cache->lock);
+		if (load_cache_only) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_NO;
+		} else {
+			cache->cached = BTRFS_CACHE_STARTED;
+		}
+		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
 	}
 
-	if (load_cache_only)
-		return 0;
-
-	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-	BUG_ON(!caching_ctl);
-
-	INIT_LIST_HEAD(&caching_ctl->list);
-	mutex_init(&caching_ctl->mutex);
-	init_waitqueue_head(&caching_ctl->wait);
-	caching_ctl->block_group = cache;
-	caching_ctl->progress = cache->key.objectid;
-	/* one for caching kthread, one for caching block group list */
-	atomic_set(&caching_ctl->count, 2);
-	caching_ctl->work.func = caching_thread;
-
-	spin_lock(&cache->lock);
-	if (cache->cached != BTRFS_CACHE_NO) {
-		spin_unlock(&cache->lock);
-		kfree(caching_ctl);
+	if (load_cache_only) {
+		put_caching_control(caching_ctl);
 		return 0;
 	}
-	cache->caching_ctl = caching_ctl;
-	cache->cached = BTRFS_CACHE_STARTED;
-	spin_unlock(&cache->lock);
 
 	down_write(&fs_info->extent_commit_sem);
+	atomic_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 	up_write(&fs_info->extent_commit_sem);
 
@@ -2781,7 +2822,7 @@ out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
-	if (!ret)
+	if (!ret && dcs == BTRFS_DC_SETUP)
 		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
@@ -3797,16 +3838,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 num_bytes, int flush)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3815,22 +3856,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes)
+{
+	return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
 				struct btrfs_block_rsv *block_rsv,
 				u64 num_bytes)
 {
-	int ret;
-
-	if (num_bytes == 0)
-		return 0;
-
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 1);
-		return 0;
-	}
-
-	return ret;
+	return __block_rsv_add(root, block_rsv, num_bytes, 0);
 }
 
 int btrfs_block_rsv_check(struct btrfs_root *root,
@@ -3851,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 	return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv,
-			   u64 min_reserved)
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+					   struct btrfs_block_rsv *block_rsv,
+					   u64 min_reserved, int flush)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -3872,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -3881,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv,
+			   u64 min_reserved)
+{
+	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+				   struct btrfs_block_rsv *block_rsv,
+				   u64 min_reserved)
+{
+	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes)
@@ -4064,23 +4115,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
  */
static unsigned drop_outstanding_extent(struct inode *inode)
 {
+	unsigned drop_inode_space = 0;
 	unsigned dropped_extents = 0;
 
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
+	if (BTRFS_I(inode)->outstanding_extents == 0 &&
+	    BTRFS_I(inode)->delalloc_meta_reserved) {
+		drop_inode_space = 1;
+		BTRFS_I(inode)->delalloc_meta_reserved = 0;
+	}
+
 	/*
 	 * If we have more or the same amount of outsanding extents than we have
 	 * reserved then we need to leave the reserved extents count alone.
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		return 0;
+		return drop_inode_space;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-	return dropped_extents;
+	return dropped_extents + drop_inode_space;
 }
 
 /**
@@ -4146,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
+	u64 csum_bytes;
 	unsigned nr_extents = 0;
+	int extra_reserve = 0;
 	int flush = 1;
 	int ret;
 
+	/* Need to be holding the i_mutex here if we aren't free space cache */
 	if (btrfs_is_free_space_inode(root, inode))
 		flush = 0;
+	else
+		WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
 	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
@@ -4162,14 +4225,22 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	BTRFS_I(inode)->outstanding_extents++;
 
 	if (BTRFS_I(inode)->outstanding_extents >
-	    BTRFS_I(inode)->reserved_extents) {
+	    BTRFS_I(inode)->reserved_extents)
 		nr_extents = BTRFS_I(inode)->outstanding_extents -
 			BTRFS_I(inode)->reserved_extents;
-		BTRFS_I(inode)->reserved_extents += nr_extents;
 
-		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+	/*
+	 * Add an item to reserve for updating the inode when we complete the
+	 * delalloc io.
+	 */
+	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+		nr_extents++;
+		extra_reserve = 1;
 	}
+
+	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4179,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		spin_lock(&BTRFS_I(inode)->lock);
 		dropped = drop_outstanding_extent(inode);
-		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-		spin_unlock(&BTRFS_I(inode)->lock);
-		to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
 		/*
-		 * Somebody could have come in and twiddled with the
-		 * reservation, so if we have to free more than we would have
-		 * reserved from this reservation go ahead and release those
-		 * bytes.
+		 * If the inodes csum_bytes is the same as the original
+		 * csum_bytes then we know we haven't raced with any free()ers
+		 * so we can just reduce our inodes csum bytes and carry on.
+		 * Otherwise we have to do the normal free thing to account for
+		 * the case that the free side didn't free up its reserve
+		 * because of this outstanding reservation.
 		 */
-		to_free -= to_reserve;
+		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+			calc_csum_metadata_size(inode, num_bytes, 0);
+		else
+			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+		spin_unlock(&BTRFS_I(inode)->lock);
+		if (dropped)
+			to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
 		if (to_free)
 			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
+	spin_lock(&BTRFS_I(inode)->lock);
+	if (extra_reserve) {
+		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		nr_extents--;
+	}
+	BTRFS_I(inode)->reserved_extents += nr_extents;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
@@ -5040,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
+	struct btrfs_block_group_cache *used_block_group;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
 	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
-	int last_ptr_loop = 0;
 	int loop = 0;
 	int index = 0;
 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5106,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 ideal_cache:
 	block_group = btrfs_lookup_block_group(root->fs_info,
 					       search_start);
+	used_block_group = block_group;
 	/*
 	 * we don't want to use the block group if it doesn't match our
 	 * allocation bits, or if its not cached.
@@ -5143,6 +5228,7 @@ search:
 		u64 offset;
 		int cached;
 
+		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -5166,13 +5252,15 @@ search:
 		}
 
 have_block_group:
-		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+		cached = block_group_cache_done(block_group);
+		if (unlikely(!cached)) {
 			u64 free_percent;
 
+			found_uncached_bg = true;
 			ret = cache_block_group(block_group, trans,
 						orig_root, 1);
 			if (block_group->cached == BTRFS_CACHE_FINISHED)
-				goto have_block_group;
+				goto alloc;
 
 			free_percent = btrfs_block_group_used(&block_group->item);
 			free_percent *= 100;
@@ -5194,7 +5282,6 @@ have_block_group:
 						orig_root, 0);
 			BUG_ON(ret);
 		}
-		found_uncached_bg = true;
 
 		/*
 		 * If loop is set for cached only, try the next block
@@ -5204,94 +5291,80 @@ have_block_group:
 			goto loop;
 		}
 
-		cached = block_group_cache_done(block_group);
-		if (unlikely(!cached))
-			found_uncached_bg = true;
-
+alloc:
 		if (unlikely(block_group->ro))
 			goto loop;
 
 		spin_lock(&block_group->free_space_ctl->tree_lock);
 		if (cached &&
 		    block_group->free_space_ctl->free_space <
-		    num_bytes + empty_size) {
+		    num_bytes + empty_cluster + empty_size) {
 			spin_unlock(&block_group->free_space_ctl->tree_lock);
 			goto loop;
 		}
 		spin_unlock(&block_group->free_space_ctl->tree_lock);
 
 		/*
-		 * Ok we want to try and use the cluster allocator, so lets look
-		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
-		 * have tried the cluster allocator plenty of times at this
-		 * point and not have found anything, so we are likely way too
-		 * fragmented for the clustering stuff to find anything, so lets
-		 * just skip it and let the allocator find whatever block it can
-		 * find
+		 * Ok we want to try and use the cluster allocator, so
+		 * lets look there
 		 */
-		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+		if (last_ptr) {
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
 			 */
 			spin_lock(&last_ptr->refill_lock);
-			if (last_ptr->block_group &&
-			    (last_ptr->block_group->ro ||
-			    !block_group_bits(last_ptr->block_group, data))) {
-				offset = 0;
+			used_block_group = last_ptr->block_group;
+			if (used_block_group != block_group &&
+			    (!used_block_group ||
+			     used_block_group->ro ||
+			     !block_group_bits(used_block_group, data))) {
+				used_block_group = block_group;
 				goto refill_cluster;
 			}
 
-			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
-							  num_bytes, search_start);
+			if (used_block_group != block_group)
+				btrfs_get_block_group(used_block_group);
+
+			offset = btrfs_alloc_from_cluster(used_block_group,
+				last_ptr, num_bytes, used_block_group->key.objectid);
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
 				goto checks;
 			}
 
-			spin_lock(&last_ptr->lock);
-			/*
-			 * whoops, this cluster doesn't actually point to
-			 * this block group. Get a ref on the block
-			 * group is does point to and try again
-			 */
-			if (!last_ptr_loop && last_ptr->block_group &&
-			    last_ptr->block_group != block_group &&
-			    index <=
-				 get_block_group_index(last_ptr->block_group)) {
-
-				btrfs_put_block_group(block_group);
-				block_group = last_ptr->block_group;
-				btrfs_get_block_group(block_group);
-				spin_unlock(&last_ptr->lock);
-				spin_unlock(&last_ptr->refill_lock);
-
-				last_ptr_loop = 1;
-				search_start = block_group->key.objectid;
-				/*
-				 * we know this block group is properly
-				 * in the list because
-				 * btrfs_remove_block_group, drops the
-				 * cluster before it removes the block
-				 * group from the list
-				 */
-				goto have_block_group;
+			WARN_ON(last_ptr->block_group != used_block_group);
+			if (used_block_group != block_group) {
+				btrfs_put_block_group(used_block_group);
+				used_block_group = block_group;
 			}
-			spin_unlock(&last_ptr->lock);
 refill_cluster:
+			BUG_ON(used_block_group != block_group);
+			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
+			 * set up a new clusters, so lets just skip it
+			 * and let the allocator find whatever block
+			 * it can find. If we reach this point, we
+			 * will have tried the cluster allocator
+			 * plenty of times and not have found
+			 * anything, so we are likely way too
+			 * fragmented for the clustering stuff to find
+			 * anything. */
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
 			/*
 			 * this cluster didn't work out, free it and
 			 * start over
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
-			last_ptr_loop = 0;
-
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
-					       offset, num_bytes,
+					       search_start, num_bytes,
 					       empty_cluster + empty_size);
 			if (ret == 0) {
 				/*
@@ -5327,6 +5400,7 @@ refill_cluster:
 			goto loop;
 		}
 
+unclustered_alloc:
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
 		/*
@@ -5353,14 +5427,14 @@ checks:
 		search_start = stripe_align(root, offset);
 		/* move on to the next group */
 		if (search_start + num_bytes >= search_end) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
-		    block_group->key.objectid + block_group->key.offset) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+		    used_block_group->key.objectid + used_block_group->key.offset) {
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5368,14 +5442,14 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
-		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
 						  alloc_type);
 		if (ret == -EAGAIN) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5384,15 +5458,19 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
5388 search_start - offset); 5462 search_start - offset);
5389 BUG_ON(offset > search_start); 5463 BUG_ON(offset > search_start);
5464 if (used_block_group != block_group)
5465 btrfs_put_block_group(used_block_group);
5390 btrfs_put_block_group(block_group); 5466 btrfs_put_block_group(block_group);
5391 break; 5467 break;
5392loop: 5468loop:
5393 failed_cluster_refill = false; 5469 failed_cluster_refill = false;
5394 failed_alloc = false; 5470 failed_alloc = false;
5395 BUG_ON(index != get_block_group_index(block_group)); 5471 BUG_ON(index != get_block_group_index(block_group));
5472 if (used_block_group != block_group)
5473 btrfs_put_block_group(used_block_group);
5396 btrfs_put_block_group(block_group); 5474 btrfs_put_block_group(block_group);
5397 } 5475 }
5398 up_read(&space_info->groups_sem); 5476 up_read(&space_info->groups_sem);
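
The extent-tree.c change above separates the block group being scanned (block_group) from the one the allocation cluster currently points at (used_block_group). When the two differ, an extra reference is taken with btrfs_get_block_group() and dropped again on every exit path (before refill_cluster, hence the BUG_ON there, at the loop label, and before the final break), and the free-space and reservation accounting under checks: now targets used_block_group. Below is a minimal userspace sketch of that get/put pairing; the struct and helper names are stand-ins, not the btrfs definitions.

/* Illustrative sketch only: a userspace model of the get/put discipline the
 * extent-tree.c hunks follow for used_block_group. */
#include <assert.h>
#include <stdio.h>

struct group {
        int refs;
        const char *name;
};

static void get_group(struct group *g) { g->refs++; }
static void put_group(struct group *g) { assert(g->refs > 0); g->refs--; }

static int alloc_one(struct group *bg, struct group *cluster_group)
{
        struct group *used = cluster_group ? cluster_group : bg;

        if (used != bg)
                get_group(used);        /* extra ref only when they differ */

        /* ... try the cluster first, fall back to the unclustered path,
         * always accounting free space against 'used' ... */

        if (used != bg)
                put_group(used);        /* dropped on every exit path */
        return 0;
}

int main(void)
{
        struct group a = { 1, "bg A" };
        struct group b = { 1, "bg B" };

        alloc_one(&a, &b);
        printf("%s refs=%d, %s refs=%d\n", a.name, a.refs, b.name, b.refs);
        return 0;
}
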
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f87c4d0e7a0..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
935 node = tree_search(tree, start); 935 node = tree_search(tree, start);
936 if (!node) { 936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc); 937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) 938 if (!prealloc) {
939 return -ENOMEM; 939 err = -ENOMEM;
940 goto out;
941 }
940 err = insert_state(tree, prealloc, start, end, &bits); 942 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL; 943 prealloc = NULL;
942 BUG_ON(err == -EEXIST); 944 BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
992 */ 994 */
993 if (state->start < start) { 995 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc); 996 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc) 997 if (!prealloc) {
996 return -ENOMEM; 998 err = -ENOMEM;
999 goto out;
1000 }
997 err = split_state(tree, state, prealloc, start); 1001 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST); 1002 BUG_ON(err == -EEXIST);
999 prealloc = NULL; 1003 prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
1024 this_end = last_start - 1; 1028 this_end = last_start - 1;
1025 1029
1026 prealloc = alloc_extent_state_atomic(prealloc); 1030 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc) 1031 if (!prealloc) {
1028 return -ENOMEM; 1032 err = -ENOMEM;
1033 goto out;
1034 }
1029 1035
1030 /* 1036 /*
1031 * Avoid to free 'prealloc' if it can be merged with 1037 * Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
1051 */ 1057 */
1052 if (state->start <= end && state->end > end) { 1058 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc); 1059 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc) 1060 if (!prealloc) {
1055 return -ENOMEM; 1061 err = -ENOMEM;
1062 goto out;
1063 }
1056 1064
1057 err = split_state(tree, state, prealloc, end + 1); 1065 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST); 1066 BUG_ON(err == -EEXIST);
@@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2285 clean_io_failure(start, page); 2293 clean_io_failure(start, page);
2286 } 2294 }
2287 if (!uptodate) { 2295 if (!uptodate) {
2288 u64 failed_mirror; 2296 int failed_mirror;
2289 failed_mirror = (u64)bio->bi_bdev; 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
2290 if (tree->ops && tree->ops->readpage_io_failed_hook) 2298 /*
2291 ret = tree->ops->readpage_io_failed_hook( 2299 * The generic bio_readpage_error handles errors the
2292 bio, page, start, end, 2300 * following way: If possible, new read requests are
2293 failed_mirror, state); 2301 * created and submitted and will end up in
2294 else 2302 * end_bio_extent_readpage as well (if we're lucky, not
2295 ret = bio_readpage_error(bio, page, start, end, 2303 * in the !uptodate case). In that case it returns 0 and
2296 failed_mirror, NULL); 2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
2297 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
2298 uptodate = 2312 uptodate =
2299 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
2300 if (err) 2314 if (err)
@@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2302 uncache_state(&cached); 2316 uncache_state(&cached);
2303 continue; 2317 continue;
2304 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
2305 } 2326 }
2306 2327
2307 if (uptodate) { 2328 if (uptodate) {
@@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3366 return -ENOMEM; 3387 return -ENOMEM;
3367 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
3368 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
3369 /* 3393 /*
3370 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
3371 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3413 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3414 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
3415 3439
3416 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
3417 get_extent); 3441 get_extent);
3418 if (!em) 3442 if (!em)
3419 goto out; 3443 goto out;
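
Two things change in extent_io.c. The allocation-failure paths in the extent-bit code stop returning -ENOMEM directly and instead set err and jump to the shared out label, letting the function's normal unwind (dropping the tree lock and freeing the unused preallocation) run. In end_bio_extent_readpage the retry order is swapped: the generic bio_readpage_error runs first, as the new comment explains, and the per-tree readpage_io_failed_hook is only consulted if that fails, rejoining the success path through error_handled; failed_mirror also becomes an int, matching the prototype change in extent_io.h below. A compact userspace sketch of the goto-out error handling follows, using stand-in types and a pthread mutex in place of the tree lock.

/* Sketch of the error-path change: on allocation failure the function no
 * longer returns directly but jumps to a common exit that releases the
 * lock and frees the preallocation. Simplified stand-in code. */
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

struct state { int dummy; };

static int set_bits(pthread_mutex_t *tree_lock)
{
        struct state *prealloc = NULL;
        int err = 0;

        pthread_mutex_lock(tree_lock);

        prealloc = malloc(sizeof(*prealloc));
        if (!prealloc) {
                /* jump to the common exit instead of returning -ENOMEM
                 * with the lock still held and prealloc leaked */
                err = -ENOMEM;
                goto out;
        }

        /* ... insert / split extent states using prealloc ... */

out:
        pthread_mutex_unlock(tree_lock);
        free(prealloc);
        return err;
}

int main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

        return set_bits(&lock) ? 1 : 0;
}
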
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23bc..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
70 unsigned long bio_flags); 70 unsigned long bio_flags);
71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, u64 failed_mirror, 73 u64 start, u64 end, int failed_mirror,
74 struct extent_state *state); 74 struct extent_state *state);
75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
76 u64 start, u64 end, 76 u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dafdfa059bf6..97fbe939c050 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1169 (sizeof(struct page *))); 1169 (sizeof(struct page *)));
1170 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1171 nrptrs = max(nrptrs, 8);
1170 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1172 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1171 if (!pages) 1173 if (!pages)
1172 return -ENOMEM; 1174 return -ENOMEM;
@@ -1387,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1387 goto out; 1389 goto out;
1388 } 1390 }
1389 1391
1390 file_update_time(file); 1392 err = btrfs_update_time(file);
1393 if (err) {
1394 mutex_unlock(&inode->i_mutex);
1395 goto out;
1396 }
1391 BTRFS_I(inode)->sequence++; 1397 BTRFS_I(inode)->sequence++;
1392 1398
1393 start_pos = round_down(pos, root->sectorsize); 1399 start_pos = round_down(pos, root->sectorsize);
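
The file.c hunks cap the number of pages __btrfs_buffered_write pins per pass at the writer's remaining dirty-page allowance (current->nr_dirtied_pause - current->nr_dirtied), with a floor of 8, and switch btrfs_file_aio_write from file_update_time() to btrfs_update_time() so a failed timestamp update (for example on ENOSPC) aborts the write instead of being ignored. A small sketch of the clamping arithmetic, with made-up inputs:

/* Sketch of the nrptrs clamp: the page batch per iteration is limited by
 * the iov size, the pointer-array size, and the task's remaining dirty
 * budget, with a floor of 8. The inputs below are example values. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static long batch_pages(unsigned long bytes_left, long nr_dirtied_pause,
                        long nr_dirtied)
{
        long nrptrs = (bytes_left + PAGE_SIZE - 1) / PAGE_SIZE;

        if (nrptrs > (long)(PAGE_SIZE / sizeof(void *)))
                nrptrs = PAGE_SIZE / sizeof(void *);
        if (nrptrs > nr_dirtied_pause - nr_dirtied)
                nrptrs = nr_dirtied_pause - nr_dirtied;
        if (nrptrs < 8)
                nrptrs = 8;
        return nrptrs;
}

int main(void)
{
        printf("%ld\n", batch_pages(1 << 20, 32, 30));   /* clamped, then floored to 8 */
        printf("%ld\n", batch_pages(1 << 20, 512, 0));   /* 256 */
        return 0;
}

The floor of 8 keeps writes from degenerating into tiny single-page batches when the task is already close to its dirty threshold.
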
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 7a15fcfb3e1f..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
351 } 351 }
352 } 352 }
353 353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
354 return 0; 359 return 0;
355} 360}
356 361
@@ -537,6 +542,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type) 542 struct btrfs_free_space *entry, u8 *type)
538{ 543{
539 struct btrfs_free_space_entry *e; 544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
540 552
541 e = io_ctl->cur; 553 e = io_ctl->cur;
542 entry->offset = le64_to_cpu(e->offset); 554 entry->offset = le64_to_cpu(e->offset);
@@ -550,10 +562,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
550 562
551 io_ctl_unmap_page(io_ctl); 563 io_ctl_unmap_page(io_ctl);
552 564
553 if (io_ctl->index >= io_ctl->num_pages) 565 return 0;
554 return 0;
555
556 return io_ctl_check_crc(io_ctl, io_ctl->index);
557} 566}
558 567
559static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
@@ -561,9 +570,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
561{ 570{
562 int ret; 571 int ret;
563 572
564 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
565 io_ctl_unmap_page(io_ctl);
566
567 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
568 if (ret) 574 if (ret)
569 return ret; 575 return ret;
@@ -699,6 +705,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
699 num_entries--; 705 num_entries--;
700 } 706 }
701 707
708 io_ctl_unmap_page(&io_ctl);
709
702 /* 710 /*
703 * We add the bitmaps at the end of the entries in order that 711 * We add the bitmaps at the end of the entries in order that
704 * the bitmap entries are added to the cache. 712 * the bitmap entries are added to the cache.
@@ -1462,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1462{ 1470{
1463 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1464 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1465 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1466 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1467 1476
@@ -1841,7 +1850,13 @@ again:
1841 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1842 1, 0); 1851 1, 0);
1843 if (!info) { 1852 if (!info) {
1844 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1845 goto out_lock; 1860 goto out_lock;
1846 } 1861 }
1847 } 1862 }
@@ -2305,6 +2320,7 @@ again:
2305 2320
2306 if (!found) { 2321 if (!found) {
2307 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2308 found = true; 2324 found = true;
2309 } 2325 }
2310 2326
@@ -2448,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2448{ 2464{
2449 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2450 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2451 struct rb_node *node;
2452 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2453 2469
2454 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2455 return -ENOSPC; 2471 return -ENOSPC;
2456 2472
2457 /* 2473 /*
2458 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2459 * here that will work. 2475 * is just its start offset.
2460 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2461 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2462 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2463 continue; 2486 continue;
@@ -2468,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2468 } 2491 }
2469 2492
2470 /* 2493 /*
2471 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2472 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2473 * this list and start the search from there.
2474 */ 2496 */
2475 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2476 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2477 list);
2478 node = rb_next(&entry->offset_index);
2479 if (!node)
2480 return -ENOSPC;
2481 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2482 goto search;
2483 }
2484
2485 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2486 if (!entry)
2487 return -ENOSPC;
2488
2489search:
2490 node = &entry->offset_index;
2491 do {
2492 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2493 node = rb_next(&entry->offset_index);
2494 if (!entry->bitmap)
2495 continue;
2496 if (entry->bytes < min_bytes)
2497 continue;
2498 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2499 bytes, min_bytes);
2500 } while (ret && node);
2501
2502 return ret;
2503} 2498}
2504 2499
2505/* 2500/*
@@ -2517,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2517 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2518{ 2513{
2519 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2520 struct list_head bitmaps;
2521 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2522 u64 min_bytes; 2517 u64 min_bytes;
2523 int ret; 2518 int ret;
2524 2519
@@ -2557,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2557 goto out; 2552 goto out;
2558 } 2553 }
2559 2554
2560 INIT_LIST_HEAD(&bitmaps);
2561 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2562 bytes, min_bytes); 2556 bytes, min_bytes);
2563 if (ret) 2557 if (ret)
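
In free-space-cache.c the io_ctl read path now CRC-checks a page the first time an entry is read from it, the cache pages are cleared dirty and marked extent-mapped up front, and setup_cluster_bitmap no longer walks the rbtree: it seeds the candidate list with the bitmap that covers offset (if it is not already listed) and otherwise relies on the bitmaps list collected by setup_cluster_no_bitmap. That is also why add_new_bitmap gains INIT_LIST_HEAD(&info->list); list_empty() is only a meaningful "not on a list yet" test when the node starts out pointing at itself. A minimal intrusive-list model of that invariant (my own tiny re-implementation, not the kernel list API):

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

struct free_space { struct list_head list; unsigned long offset; };

int main(void)
{
        struct list_head bitmaps;
        struct free_space info = { .offset = 4096 };

        INIT_LIST_HEAD(&bitmaps);
        INIT_LIST_HEAD(&info.list);          /* the added initialisation */

        if (list_empty(&info.list))          /* only safe after the init */
                list_add(&info.list, &bitmaps);

        printf("on list: %d\n", !list_empty(&bitmaps));
        return 0;
}
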
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 53dcbdf446cd..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if need
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -477,11 +494,14 @@ again:
477 } 494 }
478 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
479 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
480out_put: 498out_put:
481 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
482out: 502out:
483 if (ret == 0) 503 trans->block_rsv = rsv;
484 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
485 505
486 btrfs_free_path(path); 506 btrfs_free_path(path);
487 return ret; 507 return ret;
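
btrfs_save_ino_cache now runs under its own metadata reservation: it saves the caller's trans->block_rsv and bytes_reserved, switches to the fs-wide trans_block_rsv sized for the eight items listed in the new comment, releases the reservation at out_release, and restores the saved values on every exit; the cache write-out also moves ahead of out_put so it shares that release path. A sketch of the save, swap and restore pattern, with illustrative types and an arbitrary per-item size:

/* Illustrative only: stash the caller's reservation, switch to a private
 * one sized for the worst case, and put both back whatever happens. */
#include <stdio.h>

struct rsv { const char *name; };

struct trans {
        struct rsv *block_rsv;
        unsigned long long bytes_reserved;
};

static int do_cache_work(struct trans *t) { (void)t; return 0; }

static int save_cache(struct trans *trans, struct rsv *trans_rsv)
{
        struct rsv *saved_rsv = trans->block_rsv;
        unsigned long long saved_bytes = trans->bytes_reserved;
        int ret;

        trans->block_rsv = trans_rsv;
        /* 1 insert + 3 update + 1 free-space object + 3 prealloc items */
        trans->bytes_reserved = 8ULL * 16384;

        ret = do_cache_work(trans);

        /* restore unconditionally, success or failure */
        trans->block_rsv = saved_rsv;
        trans->bytes_reserved = saved_bytes;
        return ret;
}

int main(void)
{
        struct rsv caller = { "caller" }, tr = { "trans" };
        struct trans t = { &caller, 123 };

        save_cache(&t, &tr);
        printf("%s %llu\n", t.block_rsv->name, t.bytes_reserved);
        return 0;
}
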
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 966ddcc4c63d..0a6b928813a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 94 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 96 unsigned long *nr_written, int unlock);
97static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
98 struct btrfs_root *root, struct inode *inode);
96 99
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 100static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 101 struct inode *inode, struct inode *dir,
@@ -1741,7 +1744,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1741 trans = btrfs_join_transaction(root); 1744 trans = btrfs_join_transaction(root);
1742 BUG_ON(IS_ERR(trans)); 1745 BUG_ON(IS_ERR(trans));
1743 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1746 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1744 ret = btrfs_update_inode(trans, root, inode); 1747 ret = btrfs_update_inode_fallback(trans, root, inode);
1745 BUG_ON(ret); 1748 BUG_ON(ret);
1746 } 1749 }
1747 goto out; 1750 goto out;
@@ -1791,7 +1794,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1791 1794
1792 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1795 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1793 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1796 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1794 ret = btrfs_update_inode(trans, root, inode); 1797 ret = btrfs_update_inode_fallback(trans, root, inode);
1795 BUG_ON(ret); 1798 BUG_ON(ret);
1796 } 1799 }
1797 ret = 0; 1800 ret = 0;
@@ -2029,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2029 /* insert an orphan item to track this unlinked/truncated file */ 2032 /* insert an orphan item to track this unlinked/truncated file */
2030 if (insert >= 1) { 2033 if (insert >= 1) {
2031 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2034 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2032 BUG_ON(ret); 2035 BUG_ON(ret && ret != -EEXIST);
2033 } 2036 }
2034 2037
2035 /* insert an orphan item to track subvolume contains orphan files */ 2038 /* insert an orphan item to track subvolume contains orphan files */
@@ -2156,6 +2159,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2156 if (ret && ret != -ESTALE) 2159 if (ret && ret != -ESTALE)
2157 goto out; 2160 goto out;
2158 2161
2162 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2163 struct btrfs_root *dead_root;
2164 struct btrfs_fs_info *fs_info = root->fs_info;
2165 int is_dead_root = 0;
2166
2167 /*
2168 * this is an orphan in the tree root. Currently these
2169 * could come from 2 sources:
2170 * a) a snapshot deletion in progress
2171 * b) a free space cache inode
2172 * We need to distinguish those two, as the snapshot
2173 * orphan must not get deleted.
2174 * find_dead_roots already ran before us, so if this
2175 * is a snapshot deletion, we should find the root
2176 * in the dead_roots list
2177 */
2178 spin_lock(&fs_info->trans_lock);
2179 list_for_each_entry(dead_root, &fs_info->dead_roots,
2180 root_list) {
2181 if (dead_root->root_key.objectid ==
2182 found_key.objectid) {
2183 is_dead_root = 1;
2184 break;
2185 }
2186 }
2187 spin_unlock(&fs_info->trans_lock);
2188 if (is_dead_root) {
2189 /* prevent this orphan from being found again */
2190 key.offset = found_key.objectid - 1;
2191 continue;
2192 }
2193 }
2159 /* 2194 /*
2160 * Inode is already gone but the orphan item is still there, 2195 * Inode is already gone but the orphan item is still there,
2161 * kill the orphan item. 2196 * kill the orphan item.
@@ -2189,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2189 continue; 2224 continue;
2190 } 2225 }
2191 nr_truncate++; 2226 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2192 ret = btrfs_truncate(inode); 2233 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2193 } else { 2235 } else {
2194 nr_unlink++; 2236 nr_unlink++;
2195 } 2237 }
@@ -2199,6 +2241,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2199 if (ret) 2241 if (ret)
2200 goto out; 2242 goto out;
2201 } 2243 }
2244 /* release the path since we're done with it */
2245 btrfs_release_path(path);
2246
2202 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2247 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2203 2248
2204 if (root->orphan_block_rsv) 2249 if (root->orphan_block_rsv)
@@ -2426,7 +2471,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2426/* 2471/*
2427 * copy everything in the in-memory inode into the btree. 2472 * copy everything in the in-memory inode into the btree.
2428 */ 2473 */
2429noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2474static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2430 struct btrfs_root *root, struct inode *inode) 2475 struct btrfs_root *root, struct inode *inode)
2431{ 2476{
2432 struct btrfs_inode_item *inode_item; 2477 struct btrfs_inode_item *inode_item;
@@ -2434,21 +2479,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2434 struct extent_buffer *leaf; 2479 struct extent_buffer *leaf;
2435 int ret; 2480 int ret;
2436 2481
2437 /*
2438 * If the inode is a free space inode, we can deadlock during commit
2439 * if we put it into the delayed code.
2440 *
2441 * The data relocation inode should also be directly updated
2442 * without delay
2443 */
2444 if (!btrfs_is_free_space_inode(root, inode)
2445 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2446 ret = btrfs_delayed_update_inode(trans, root, inode);
2447 if (!ret)
2448 btrfs_set_inode_last_trans(trans, inode);
2449 return ret;
2450 }
2451
2452 path = btrfs_alloc_path(); 2482 path = btrfs_alloc_path();
2453 if (!path) 2483 if (!path)
2454 return -ENOMEM; 2484 return -ENOMEM;
@@ -2477,6 +2507,43 @@ failed:
2477} 2507}
2478 2508
2479/* 2509/*
2510 * copy everything in the in-memory inode into the btree.
2511 */
2512noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2513 struct btrfs_root *root, struct inode *inode)
2514{
2515 int ret;
2516
2517 /*
2518 * If the inode is a free space inode, we can deadlock during commit
2519 * if we put it into the delayed code.
2520 *
2521 * The data relocation inode should also be directly updated
2522 * without delay
2523 */
2524 if (!btrfs_is_free_space_inode(root, inode)
2525 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2526 ret = btrfs_delayed_update_inode(trans, root, inode);
2527 if (!ret)
2528 btrfs_set_inode_last_trans(trans, inode);
2529 return ret;
2530 }
2531
2532 return btrfs_update_inode_item(trans, root, inode);
2533}
2534
2535static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2536 struct btrfs_root *root, struct inode *inode)
2537{
2538 int ret;
2539
2540 ret = btrfs_update_inode(trans, root, inode);
2541 if (ret == -ENOSPC)
2542 return btrfs_update_inode_item(trans, root, inode);
2543 return ret;
2544}
2545
2546/*
2480 * unlink helper that gets used here in inode.c and in the tree logging 2547 * unlink helper that gets used here in inode.c and in the tree logging
2481 * recovery code. It remove a link in a directory with a given name, and 2548 * recovery code. It remove a link in a directory with a given name, and
2482 * also drops the back refs in the inode to the directory 2549 * also drops the back refs in the inode to the directory
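
The hunks above split inode updates in two: btrfs_update_inode_item performs the direct btree update, btrfs_update_inode keeps the delayed-item fast path (still skipped for free-space and data-relocation inodes), and the new btrfs_update_inode_fallback retries the direct item update when the first attempt fails with -ENOSPC; the ordered-extent completion paths switch to the fallback variant. A minimal sketch of that try-the-fast-path-then-fall-back shape, with dummy functions standing in for the two update routines:

#include <errno.h>
#include <stdio.h>

static int update_delayed(int fail_enospc)
{
        return fail_enospc ? -ENOSPC : 0;   /* the cheap, delayed path */
}

static int update_item(void)
{
        return 0;                           /* direct btree update */
}

static int update_fallback(int fail_enospc)
{
        int ret = update_delayed(fail_enospc);

        if (ret == -ENOSPC)
                ret = update_item();        /* retry directly on ENOSPC */
        return ret;
}

int main(void)
{
        printf("fast path: %d\n", update_fallback(0));
        printf("fallback : %d\n", update_fallback(1));
        return 0;
}
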
@@ -3300,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3300 u64 hint_byte = 0; 3367 u64 hint_byte = 0;
3301 hole_size = last_byte - cur_offset; 3368 hole_size = last_byte - cur_offset;
3302 3369
3303 trans = btrfs_start_transaction(root, 2); 3370 trans = btrfs_start_transaction(root, 3);
3304 if (IS_ERR(trans)) { 3371 if (IS_ERR(trans)) {
3305 err = PTR_ERR(trans); 3372 err = PTR_ERR(trans);
3306 break; 3373 break;
@@ -3310,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3310 cur_offset + hole_size, 3377 cur_offset + hole_size,
3311 &hint_byte, 1); 3378 &hint_byte, 1);
3312 if (err) { 3379 if (err) {
3380 btrfs_update_inode(trans, root, inode);
3313 btrfs_end_transaction(trans, root); 3381 btrfs_end_transaction(trans, root);
3314 break; 3382 break;
3315 } 3383 }
@@ -3319,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3319 0, hole_size, 0, hole_size, 3387 0, hole_size, 0, hole_size,
3320 0, 0, 0); 3388 0, 0, 0);
3321 if (err) { 3389 if (err) {
3390 btrfs_update_inode(trans, root, inode);
3322 btrfs_end_transaction(trans, root); 3391 btrfs_end_transaction(trans, root);
3323 break; 3392 break;
3324 } 3393 }
@@ -3326,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3326 btrfs_drop_extent_cache(inode, hole_start, 3395 btrfs_drop_extent_cache(inode, hole_start,
3327 last_byte - 1, 0); 3396 last_byte - 1, 0);
3328 3397
3398 btrfs_update_inode(trans, root, inode);
3329 btrfs_end_transaction(trans, root); 3399 btrfs_end_transaction(trans, root);
3330 } 3400 }
3331 free_extent_map(em); 3401 free_extent_map(em);
@@ -3343,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3343 3413
3344static int btrfs_setsize(struct inode *inode, loff_t newsize) 3414static int btrfs_setsize(struct inode *inode, loff_t newsize)
3345{ 3415{
3416 struct btrfs_root *root = BTRFS_I(inode)->root;
3417 struct btrfs_trans_handle *trans;
3346 loff_t oldsize = i_size_read(inode); 3418 loff_t oldsize = i_size_read(inode);
3347 int ret; 3419 int ret;
3348 3420
@@ -3350,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3350 return 0; 3422 return 0;
3351 3423
3352 if (newsize > oldsize) { 3424 if (newsize > oldsize) {
3353 i_size_write(inode, newsize);
3354 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3355 truncate_pagecache(inode, oldsize, newsize); 3425 truncate_pagecache(inode, oldsize, newsize);
3356 ret = btrfs_cont_expand(inode, oldsize, newsize); 3426 ret = btrfs_cont_expand(inode, oldsize, newsize);
3357 if (ret) { 3427 if (ret)
3358 btrfs_setsize(inode, oldsize);
3359 return ret; 3428 return ret;
3360 }
3361 3429
3362 mark_inode_dirty(inode); 3430 trans = btrfs_start_transaction(root, 1);
3431 if (IS_ERR(trans))
3432 return PTR_ERR(trans);
3433
3434 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root);
3363 } else { 3438 } else {
3364 3439
3365 /* 3440 /*
@@ -3399,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3399 3474
3400 if (attr->ia_valid) { 3475 if (attr->ia_valid) {
3401 setattr_copy(inode, attr); 3476 setattr_copy(inode, attr);
3402 mark_inode_dirty(inode); 3477 err = btrfs_dirty_inode(inode);
3403 3478
3404 if (attr->ia_valid & ATTR_MODE) 3479 if (!err && attr->ia_valid & ATTR_MODE)
3405 err = btrfs_acl_chmod(inode); 3480 err = btrfs_acl_chmod(inode);
3406 } 3481 }
3407 3482
@@ -3463,7 +3538,7 @@ void btrfs_evict_inode(struct inode *inode)
3463 * doing the truncate. 3538 * doing the truncate.
3464 */ 3539 */
3465 while (1) { 3540 while (1) {
3466 ret = btrfs_block_rsv_refill(root, rsv, min_size); 3541 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3467 3542
3468 /* 3543 /*
3469 * Try and steal from the global reserve since we will 3544 * Try and steal from the global reserve since we will
@@ -4177,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4177 * FIXME, needs more benchmarking...there are no reasons other than performance 4252 * FIXME, needs more benchmarking...there are no reasons other than performance
4178 * to keep or drop this code. 4253 * to keep or drop this code.
4179 */ 4254 */
4180void btrfs_dirty_inode(struct inode *inode, int flags) 4255int btrfs_dirty_inode(struct inode *inode)
4181{ 4256{
4182 struct btrfs_root *root = BTRFS_I(inode)->root; 4257 struct btrfs_root *root = BTRFS_I(inode)->root;
4183 struct btrfs_trans_handle *trans; 4258 struct btrfs_trans_handle *trans;
4184 int ret; 4259 int ret;
4185 4260
4186 if (BTRFS_I(inode)->dummy_inode) 4261 if (BTRFS_I(inode)->dummy_inode)
4187 return; 4262 return 0;
4188 4263
4189 trans = btrfs_join_transaction(root); 4264 trans = btrfs_join_transaction(root);
4190 BUG_ON(IS_ERR(trans)); 4265 if (IS_ERR(trans))
4266 return PTR_ERR(trans);
4191 4267
4192 ret = btrfs_update_inode(trans, root, inode); 4268 ret = btrfs_update_inode(trans, root, inode);
4193 if (ret && ret == -ENOSPC) { 4269 if (ret && ret == -ENOSPC) {
4194 /* whoops, lets try again with the full transaction */ 4270 /* whoops, lets try again with the full transaction */
4195 btrfs_end_transaction(trans, root); 4271 btrfs_end_transaction(trans, root);
4196 trans = btrfs_start_transaction(root, 1); 4272 trans = btrfs_start_transaction(root, 1);
4197 if (IS_ERR(trans)) { 4273 if (IS_ERR(trans))
4198 printk_ratelimited(KERN_ERR "btrfs: fail to " 4274 return PTR_ERR(trans);
4199 "dirty inode %llu error %ld\n",
4200 (unsigned long long)btrfs_ino(inode),
4201 PTR_ERR(trans));
4202 return;
4203 }
4204 4275
4205 ret = btrfs_update_inode(trans, root, inode); 4276 ret = btrfs_update_inode(trans, root, inode);
4206 if (ret) {
4207 printk_ratelimited(KERN_ERR "btrfs: fail to "
4208 "dirty inode %llu error %d\n",
4209 (unsigned long long)btrfs_ino(inode),
4210 ret);
4211 }
4212 } 4277 }
4213 btrfs_end_transaction(trans, root); 4278 btrfs_end_transaction(trans, root);
4214 if (BTRFS_I(inode)->delayed_node) 4279 if (BTRFS_I(inode)->delayed_node)
4215 btrfs_balance_delayed_items(root); 4280 btrfs_balance_delayed_items(root);
4281
4282 return ret;
4283}
4284
4285/*
4286 * This is a copy of file_update_time. We need this so we can return error on
4287 * ENOSPC for updating the inode in the case of file write and mmap writes.
4288 */
4289int btrfs_update_time(struct file *file)
4290{
4291 struct inode *inode = file->f_path.dentry->d_inode;
4292 struct timespec now;
4293 int ret;
4294 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4295
4296 /* First try to exhaust all avenues to not sync */
4297 if (IS_NOCMTIME(inode))
4298 return 0;
4299
4300 now = current_fs_time(inode->i_sb);
4301 if (!timespec_equal(&inode->i_mtime, &now))
4302 sync_it = S_MTIME;
4303
4304 if (!timespec_equal(&inode->i_ctime, &now))
4305 sync_it |= S_CTIME;
4306
4307 if (IS_I_VERSION(inode))
4308 sync_it |= S_VERSION;
4309
4310 if (!sync_it)
4311 return 0;
4312
4313 /* Finally allowed to write? Takes lock. */
4314 if (mnt_want_write_file(file))
4315 return 0;
4316
4317 /* Only change inode inside the lock region */
4318 if (sync_it & S_VERSION)
4319 inode_inc_iversion(inode);
4320 if (sync_it & S_CTIME)
4321 inode->i_ctime = now;
4322 if (sync_it & S_MTIME)
4323 inode->i_mtime = now;
4324 ret = btrfs_dirty_inode(inode);
4325 if (!ret)
4326 mark_inode_dirty_sync(inode);
4327 mnt_drop_write(file->f_path.mnt);
4328 return ret;
4216} 4329}
4217 4330
4218/* 4331/*
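
btrfs_dirty_inode now reports failure to its caller instead of logging and swallowing it, and btrfs_update_time is added as a btrfs-local copy of file_update_time whose point, per its comment, is to propagate that error (typically -ENOSPC) back to the write and mmap paths. A small sketch of the flow, deciding which timestamps need syncing, returning early when none do, and surfacing the dirty-inode result; the types and helpers below are simplified stand-ins, not VFS code:

#include <errno.h>
#include <stdio.h>
#include <time.h>

enum { SYNC_MTIME = 1, SYNC_CTIME = 2 };

struct inode { time_t mtime, ctime; int out_of_space; };

static int dirty_inode(struct inode *in)
{
        return in->out_of_space ? -ENOSPC : 0;
}

static int update_time(struct inode *in)
{
        time_t now = time(NULL);
        int sync_it = 0;

        if (in->mtime != now)
                sync_it |= SYNC_MTIME;
        if (in->ctime != now)
                sync_it |= SYNC_CTIME;
        if (!sync_it)
                return 0;               /* nothing to do, no transaction */

        if (sync_it & SYNC_CTIME)
                in->ctime = now;
        if (sync_it & SYNC_MTIME)
                in->mtime = now;

        return dirty_inode(in);         /* may fail, e.g. -ENOSPC */
}

int main(void)
{
        struct inode ok = { 0, 0, 0 }, full = { 0, 0, 1 };

        printf("%d %d\n", update_time(&ok), update_time(&full));
        return 0;
}
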
@@ -4528,11 +4641,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4528 goto out_unlock; 4641 goto out_unlock;
4529 } 4642 }
4530 4643
4644 /*
4645 * If the active LSM wants to access the inode during
4646 * d_instantiate it needs these. Smack checks to see
4647 * if the filesystem supports xattrs by looking at the
4648 * ops vector.
4649 */
4650
4651 inode->i_op = &btrfs_special_inode_operations;
4531 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4652 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4532 if (err) 4653 if (err)
4533 drop_inode = 1; 4654 drop_inode = 1;
4534 else { 4655 else {
4535 inode->i_op = &btrfs_special_inode_operations;
4536 init_special_inode(inode, inode->i_mode, rdev); 4656 init_special_inode(inode, inode->i_mode, rdev);
4537 btrfs_update_inode(trans, root, inode); 4657 btrfs_update_inode(trans, root, inode);
4538 } 4658 }
@@ -4586,14 +4706,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4586 goto out_unlock; 4706 goto out_unlock;
4587 } 4707 }
4588 4708
4709 /*
4710 * If the active LSM wants to access the inode during
4711 * d_instantiate it needs these. Smack checks to see
4712 * if the filesystem supports xattrs by looking at the
4713 * ops vector.
4714 */
4715 inode->i_fop = &btrfs_file_operations;
4716 inode->i_op = &btrfs_file_inode_operations;
4717
4589 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4718 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4590 if (err) 4719 if (err)
4591 drop_inode = 1; 4720 drop_inode = 1;
4592 else { 4721 else {
4593 inode->i_mapping->a_ops = &btrfs_aops; 4722 inode->i_mapping->a_ops = &btrfs_aops;
4594 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4723 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4595 inode->i_fop = &btrfs_file_operations;
4596 inode->i_op = &btrfs_file_inode_operations;
4597 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4724 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4598 } 4725 }
4599out_unlock: 4726out_unlock:
@@ -5632,7 +5759,7 @@ again:
5632 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5759 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5633 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5760 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5634 if (!ret) 5761 if (!ret)
5635 err = btrfs_update_inode(trans, root, inode); 5762 err = btrfs_update_inode_fallback(trans, root, inode);
5636 goto out; 5763 goto out;
5637 } 5764 }
5638 5765
@@ -5670,7 +5797,7 @@ again:
5670 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5797 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5671 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5798 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5672 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5799 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5673 btrfs_update_inode(trans, root, inode); 5800 btrfs_update_inode_fallback(trans, root, inode);
5674 ret = 0; 5801 ret = 0;
5675out_unlock: 5802out_unlock:
5676 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5803 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6276,7 +6403,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6276 u64 page_start; 6403 u64 page_start;
6277 u64 page_end; 6404 u64 page_end;
6278 6405
6406 /* Need this to keep space reservations serialized */
6407 mutex_lock(&inode->i_mutex);
6279 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6408 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6409 mutex_unlock(&inode->i_mutex);
6410 if (!ret)
6411 ret = btrfs_update_time(vma->vm_file);
6280 if (ret) { 6412 if (ret) {
6281 if (ret == -ENOMEM) 6413 if (ret == -ENOMEM)
6282 ret = VM_FAULT_OOM; 6414 ret = VM_FAULT_OOM;
@@ -6488,8 +6620,9 @@ static int btrfs_truncate(struct inode *inode)
6488 /* Just need the 1 for updating the inode */ 6620 /* Just need the 1 for updating the inode */
6489 trans = btrfs_start_transaction(root, 1); 6621 trans = btrfs_start_transaction(root, 1);
6490 if (IS_ERR(trans)) { 6622 if (IS_ERR(trans)) {
6491 err = PTR_ERR(trans); 6623 ret = err = PTR_ERR(trans);
6492 goto out; 6624 trans = NULL;
6625 break;
6493 } 6626 }
6494 } 6627 }
6495 6628
@@ -6529,14 +6662,16 @@ end_trans:
6529 ret = btrfs_orphan_del(NULL, inode); 6662 ret = btrfs_orphan_del(NULL, inode);
6530 } 6663 }
6531 6664
6532 trans->block_rsv = &root->fs_info->trans_block_rsv; 6665 if (trans) {
6533 ret = btrfs_update_inode(trans, root, inode); 6666 trans->block_rsv = &root->fs_info->trans_block_rsv;
6534 if (ret && !err) 6667 ret = btrfs_update_inode(trans, root, inode);
6535 err = ret; 6668 if (ret && !err)
6669 err = ret;
6536 6670
6537 nr = trans->blocks_used; 6671 nr = trans->blocks_used;
6538 ret = btrfs_end_transaction_throttle(trans, root); 6672 ret = btrfs_end_transaction_throttle(trans, root);
6539 btrfs_btree_balance_dirty(root, nr); 6673 btrfs_btree_balance_dirty(root, nr);
6674 }
6540 6675
6541out: 6676out:
6542 btrfs_free_block_rsv(root, rsv); 6677 btrfs_free_block_rsv(root, rsv);
@@ -6605,6 +6740,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6605 ei->orphan_meta_reserved = 0; 6740 ei->orphan_meta_reserved = 0;
6606 ei->dummy_inode = 0; 6741 ei->dummy_inode = 0;
6607 ei->in_defrag = 0; 6742 ei->in_defrag = 0;
6743 ei->delalloc_meta_reserved = 0;
6608 ei->force_compress = BTRFS_COMPRESS_NONE; 6744 ei->force_compress = BTRFS_COMPRESS_NONE;
6609 6745
6610 ei->delayed_node = NULL; 6746 ei->delayed_node = NULL;
@@ -6764,11 +6900,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6764 struct dentry *dentry, struct kstat *stat) 6900 struct dentry *dentry, struct kstat *stat)
6765{ 6901{
6766 struct inode *inode = dentry->d_inode; 6902 struct inode *inode = dentry->d_inode;
6903 u32 blocksize = inode->i_sb->s_blocksize;
6904
6767 generic_fillattr(inode, stat); 6905 generic_fillattr(inode, stat);
6768 stat->dev = BTRFS_I(inode)->root->anon_dev; 6906 stat->dev = BTRFS_I(inode)->root->anon_dev;
6769 stat->blksize = PAGE_CACHE_SIZE; 6907 stat->blksize = PAGE_CACHE_SIZE;
6770 stat->blocks = (inode_get_bytes(inode) + 6908 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6771 BTRFS_I(inode)->delalloc_bytes) >> 9; 6909 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6772 return 0; 6910 return 0;
6773} 6911}
6774 6912
@@ -7044,14 +7182,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7044 goto out_unlock; 7182 goto out_unlock;
7045 } 7183 }
7046 7184
7185 /*
7186 * If the active LSM wants to access the inode during
7187 * d_instantiate it needs these. Smack checks to see
7188 * if the filesystem supports xattrs by looking at the
7189 * ops vector.
7190 */
7191 inode->i_fop = &btrfs_file_operations;
7192 inode->i_op = &btrfs_file_inode_operations;
7193
7047 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7194 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7048 if (err) 7195 if (err)
7049 drop_inode = 1; 7196 drop_inode = 1;
7050 else { 7197 else {
7051 inode->i_mapping->a_ops = &btrfs_aops; 7198 inode->i_mapping->a_ops = &btrfs_aops;
7052 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7199 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7053 inode->i_fop = &btrfs_file_operations;
7054 inode->i_op = &btrfs_file_inode_operations;
7055 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7200 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7056 } 7201 }
7057 if (drop_inode) 7202 if (drop_inode)
@@ -7321,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7321 .follow_link = page_follow_link_light, 7466 .follow_link = page_follow_link_light,
7322 .put_link = page_put_link, 7467 .put_link = page_put_link,
7323 .getattr = btrfs_getattr, 7468 .getattr = btrfs_getattr,
7469 .setattr = btrfs_setattr,
7324 .permission = btrfs_permission, 7470 .permission = btrfs_permission,
7325 .setxattr = btrfs_setxattr, 7471 .setxattr = btrfs_setxattr,
7326 .getxattr = btrfs_getxattr, 7472 .getxattr = btrfs_getxattr,
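
Among the remaining inode.c changes: the i_op/i_fop assignments in mknod, create and symlink move ahead of btrfs_add_nondir so an LSM (the comment names Smack) can probe the xattr ops during d_instantiate; page_mkwrite and the orphan-cleanup truncate take i_mutex around their space reservations; the symlink inode_operations gain .setattr; and btrfs_getattr rounds both byte counts up to the block size before converting to 512-byte st_blocks, so a partially used tail block is reported as allocated. A worked sketch of that rounding, with an assumed 4 KiB block size and example byte counts:

#include <stdio.h>

static unsigned long long align_up(unsigned long long x, unsigned long long a)
{
        return (x + a - 1) / a * a;
}

int main(void)
{
        unsigned long long blocksize = 4096;
        unsigned long long inode_bytes = 6000;      /* spans two blocks */
        unsigned long long delalloc_bytes = 100;    /* one block pending */

        unsigned long long blocks =
                (align_up(inode_bytes, blocksize) +
                 align_up(delalloc_bytes, blocksize)) >> 9;

        printf("st_blocks = %llu\n", blocks);       /* 24 = 12 KiB / 512 */
        return 0;
}
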
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4a34c472f126..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
252 trans = btrfs_join_transaction(root); 252 trans = btrfs_join_transaction(root);
253 BUG_ON(IS_ERR(trans)); 253 BUG_ON(IS_ERR(trans));
254 254
255 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME;
255 ret = btrfs_update_inode(trans, root, inode); 257 ret = btrfs_update_inode(trans, root, inode);
256 BUG_ON(ret); 258 BUG_ON(ret);
257 259
258 btrfs_update_iflags(inode);
259 inode->i_ctime = CURRENT_TIME;
260 btrfs_end_transaction(trans, root); 260 btrfs_end_transaction(trans, root);
261 261
262 mnt_drop_write(file->f_path.mnt); 262 mnt_drop_write(file->f_path.mnt);
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 858 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 860
861 mutex_lock(&inode->i_mutex);
861 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
862 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
863 if (ret) 865 if (ret)
864 return ret; 866 return ret;
865again: 867again:
@@ -1216,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1216 *devstr = '\0'; 1218 *devstr = '\0';
1217 devstr = vol_args->name; 1219 devstr = vol_args->name;
1218 devid = simple_strtoull(devstr, &end, 10); 1220 devid = simple_strtoull(devstr, &end, 10);
1219 printk(KERN_INFO "resizing devid %llu\n", 1221 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1220 (unsigned long long)devid); 1222 (unsigned long long)devid);
1221 } 1223 }
1222 device = btrfs_find_device(root, devid, NULL, NULL); 1224 device = btrfs_find_device(root, devid, NULL, NULL);
1223 if (!device) { 1225 if (!device) {
1224 printk(KERN_INFO "resizer unable to find device %llu\n", 1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1225 (unsigned long long)devid); 1227 (unsigned long long)devid);
1226 ret = -EINVAL; 1228 ret = -EINVAL;
1227 goto out_unlock; 1229 goto out_unlock;
@@ -1267,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1267 do_div(new_size, root->sectorsize); 1269 do_div(new_size, root->sectorsize);
1268 new_size *= root->sectorsize; 1270 new_size *= root->sectorsize;
1269 1271
1270 printk(KERN_INFO "new size for %s is %llu\n", 1272 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1271 device->name, (unsigned long long)new_size); 1273 device->name, (unsigned long long)new_size);
1272 1274
1273 if (new_size > old_size) { 1275 if (new_size > old_size) {
@@ -1278,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1278 } 1280 }
1279 ret = btrfs_grow_device(trans, device, new_size); 1281 ret = btrfs_grow_device(trans, device, new_size);
1280 btrfs_commit_transaction(trans, root); 1282 btrfs_commit_transaction(trans, root);
1281 } else { 1283 } else if (new_size < old_size) {
1282 ret = btrfs_shrink_device(device, new_size); 1284 ret = btrfs_shrink_device(device, new_size);
1283 } 1285 }
1284 1286
@@ -2930,11 +2932,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2930 goto out; 2932 goto out;
2931 2933
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 2934 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; 2935 rel_ptr = ipath->fspath->val[i] -
2936 (u64)(unsigned long)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr; 2937 ipath->fspath->val[i] = rel_ptr;
2935 } 2938 }
2936 2939
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); 2940 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2941 (void *)(unsigned long)ipath->fspath, size);
2938 if (ret) { 2942 if (ret) {
2939 ret = -EFAULT; 2943 ret = -EFAULT;
2940 goto out; 2944 goto out;
@@ -3017,7 +3021,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3017 if (ret < 0) 3021 if (ret < 0)
3018 goto out; 3022 goto out;
3019 3023
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); 3024 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3025 (void *)(unsigned long)inodes, size);
3021 if (ret) 3026 if (ret)
3022 ret = -EFAULT; 3027 ret = -EFAULT;
3023 3028
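
The ioctl.c hunks move the iflags/ctime update ahead of btrfs_update_inode so it lands in the same transaction, take i_mutex around the defrag space reservation, prefix the resize messages with "btrfs:", make resize a no-op when the size is unchanged, and, for the ino-path and logical-to-ino ioctls, cast the u64-carried user addresses through unsigned long before using them as pointers, which avoids the different-size cast problems on 32-bit builds. A plain userspace illustration of that pointer/u64 round-trip (not the btrfs ioctl ABI):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct args { uint64_t fspath; };       /* address smuggled through a u64 */

int main(void)
{
        char buf[16] = "hello";
        struct args a = { .fspath = (uint64_t)(unsigned long)buf };

        /* cast back the same way before dereferencing */
        char *p = (char *)(unsigned long)a.fspath;

        printf("%s (0x%" PRIx64 ")\n", p, a.fspath);
        return 0;
}
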
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 24d654ce7a06..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2945,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2945 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2946 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2947 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2948 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2949 if (ret) 2953 if (ret)
2950 goto out; 2954 goto out;
2951 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ed11d3866afd..ddf2c90d3fc0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
256 btrfs_release_path(swarn->path); 256 btrfs_release_path(swarn->path);
257 257
258 ipath = init_ipath(4096, local_root, swarn->path); 258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
259 ret = paths_from_inode(inum, ipath); 264 ret = paths_from_inode(inum, ipath);
260 265
261 if (ret < 0) 266 if (ret < 0)
@@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
272 swarn->logical, swarn->dev->name, 277 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset, 278 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink, 279 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]); 280 (char *)(unsigned long)ipath->fspath->val[i]);
276 281
277 free_ipath(ipath); 282 free_ipath(ipath);
278 return 0; 283 return 0;
@@ -944,50 +949,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
944static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
945{ 950{
946 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
947 struct bio *bio;
948 int i;
949 952
950 if (sdev->curr == -1) 953 if (sdev->curr == -1)
951 return 0; 954 return 0;
952 955
953 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
954
955 bio = bio_alloc(GFP_NOFS, sbio->count);
956 if (!bio)
957 goto nomem;
958
959 bio->bi_private = sbio;
960 bio->bi_end_io = scrub_bio_end_io;
961 bio->bi_bdev = sdev->dev->bdev;
962 bio->bi_sector = sbio->physical >> 9;
963
964 for (i = 0; i < sbio->count; ++i) {
965 struct page *page;
966 int ret;
967
968 page = alloc_page(GFP_NOFS);
969 if (!page)
970 goto nomem;
971
972 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
973 if (!ret) {
974 __free_page(page);
975 goto nomem;
976 }
977 }
978
979 sbio->err = 0; 957 sbio->err = 0;
980 sdev->curr = -1; 958 sdev->curr = -1;
981 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
982 960
983 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
984 962
985 return 0; 963 return 0;
986
987nomem:
988 scrub_free_bio(bio);
989
990 return -ENOMEM;
991} 964}
992 965
993static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -995,6 +968,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
995 u8 *csum, int force) 968 u8 *csum, int force)
996{ 969{
997 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
998 973
999again: 974again:
1000 /* 975 /*
@@ -1015,12 +990,22 @@ again:
1015 } 990 }
1016 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
1017 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
1018 sbio->physical = physical; 995 sbio->physical = physical;
1019 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
1020 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
1021 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
1022 int ret;
1023
1024 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
1025 if (ret) 1010 if (ret)
1026 return ret; 1011 return ret;
@@ -1030,6 +1015,20 @@ again:
1030 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
1031 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
1032 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
1033 if (csum) { 1032 if (csum) {
1034 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
1035 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -1536,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1536static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 1535static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1537{ 1536{
1538 struct btrfs_fs_info *fs_info = root->fs_info; 1537 struct btrfs_fs_info *fs_info = root->fs_info;
1538 int ret = 0;
1539 1539
1540 mutex_lock(&fs_info->scrub_lock); 1540 mutex_lock(&fs_info->scrub_lock);
1541 if (fs_info->scrub_workers_refcnt == 0) { 1541 if (fs_info->scrub_workers_refcnt == 0) {
1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1543 fs_info->thread_pool_size, &fs_info->generic_worker); 1543 fs_info->thread_pool_size, &fs_info->generic_worker);
1544 fs_info->scrub_workers.idle_thresh = 4; 1544 fs_info->scrub_workers.idle_thresh = 4;
1545 btrfs_start_workers(&fs_info->scrub_workers, 1); 1545 ret = btrfs_start_workers(&fs_info->scrub_workers);
1546 if (ret)
1547 goto out;
1546 } 1548 }
1547 ++fs_info->scrub_workers_refcnt; 1549 ++fs_info->scrub_workers_refcnt;
1550out:
1548 mutex_unlock(&fs_info->scrub_lock); 1551 mutex_unlock(&fs_info->scrub_lock);
1549 1552
1550 return 0; 1553 return ret;
1551} 1554}
1552 1555
1553static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 1556static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
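
scrub_workers_get() now checks the return value of btrfs_start_workers() (which no longer takes a thread count) and bails out without bumping the refcount when startup fails. A rough userspace sketch of that refcount-guarded, start-once-under-a-lock pattern, with a pthread mutex standing in for scrub_lock and start_pool() standing in for btrfs_start_workers():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcnt;

static int start_pool(void)
{
	return 0;			/* pretend worker startup succeeded */
}

static int workers_get(void)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (refcnt == 0) {
		ret = start_pool();
		if (ret)
			goto out;	/* leave refcnt untouched on failure */
	}
	++refcnt;
out:
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	printf("workers_get() = %d, refcnt = %d\n", workers_get(), refcnt);
	return 0;
}
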
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 57080dffdfc6..200f63bc6675 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h> 43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
@@ -197,7 +198,7 @@ static match_table_t tokens = {
197 {Opt_subvolrootid, "subvolrootid=%d"}, 198 {Opt_subvolrootid, "subvolrootid=%d"},
198 {Opt_defrag, "autodefrag"}, 199 {Opt_defrag, "autodefrag"},
199 {Opt_inode_cache, "inode_cache"}, 200 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"}, 201 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 202 {Opt_recovery, "recovery"},
202 {Opt_err, NULL}, 203 {Opt_err, NULL},
203}; 204};
@@ -448,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
448 token = match_token(p, tokens, args); 449 token = match_token(p, tokens, args);
449 switch (token) { 450 switch (token) {
450 case Opt_subvol: 451 case Opt_subvol:
452 kfree(*subvol_name);
451 *subvol_name = match_strdup(&args[0]); 453 *subvol_name = match_strdup(&args[0]);
452 break; 454 break;
453 case Opt_subvolid: 455 case Opt_subvolid:
@@ -710,7 +712,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
710 if (btrfs_test_opt(root, SPACE_CACHE)) 712 if (btrfs_test_opt(root, SPACE_CACHE))
711 seq_puts(seq, ",space_cache"); 713 seq_puts(seq, ",space_cache");
712 else 714 else
713 seq_puts(seq, ",no_space_cache"); 715 seq_puts(seq, ",nospace_cache");
714 if (btrfs_test_opt(root, CLEAR_CACHE)) 716 if (btrfs_test_opt(root, CLEAR_CACHE))
715 seq_puts(seq, ",clear_cache"); 717 seq_puts(seq, ",clear_cache");
716 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 718 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -824,13 +826,9 @@ static char *setup_root_args(char *args)
824static struct dentry *mount_subvol(const char *subvol_name, int flags, 826static struct dentry *mount_subvol(const char *subvol_name, int flags,
825 const char *device_name, char *data) 827 const char *device_name, char *data)
826{ 828{
827 struct super_block *s;
828 struct dentry *root; 829 struct dentry *root;
829 struct vfsmount *mnt; 830 struct vfsmount *mnt;
830 struct mnt_namespace *ns_private;
831 char *newargs; 831 char *newargs;
832 struct path path;
833 int error;
834 832
835 newargs = setup_root_args(data); 833 newargs = setup_root_args(data);
836 if (!newargs) 834 if (!newargs)
@@ -841,39 +839,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
841 if (IS_ERR(mnt)) 839 if (IS_ERR(mnt))
842 return ERR_CAST(mnt); 840 return ERR_CAST(mnt);
843 841
844 ns_private = create_mnt_ns(mnt); 842 root = mount_subtree(mnt, subvol_name);
845 if (IS_ERR(ns_private)) {
846 mntput(mnt);
847 return ERR_CAST(ns_private);
848 }
849
850 /*
851 * This will trigger the automount of the subvol so we can just
852 * drop the mnt we have here and return the dentry that we
853 * found.
854 */
855 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
856 LOOKUP_FOLLOW, &path);
857 put_mnt_ns(ns_private);
858 if (error)
859 return ERR_PTR(error);
860 843
861 if (!is_subvolume_inode(path.dentry->d_inode)) { 844 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
862 path_put(&path); 845 struct super_block *s = root->d_sb;
863 mntput(mnt); 846 dput(root);
864 error = -EINVAL; 847 root = ERR_PTR(-EINVAL);
848 deactivate_locked_super(s);
865 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", 849 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
866 subvol_name); 850 subvol_name);
867 return ERR_PTR(-EINVAL);
868 } 851 }
869 852
870 /* Get a ref to the sb and the dentry we found and return it */
871 s = path.mnt->mnt_sb;
872 atomic_inc(&s->s_active);
873 root = dget(path.dentry);
874 path_put(&path);
875 down_write(&s->s_umount);
876
877 return root; 853 return root;
878} 854}
879 855
@@ -890,7 +866,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
890 struct super_block *s; 866 struct super_block *s;
891 struct dentry *root; 867 struct dentry *root;
892 struct btrfs_fs_devices *fs_devices = NULL; 868 struct btrfs_fs_devices *fs_devices = NULL;
893 struct btrfs_root *tree_root = NULL;
894 struct btrfs_fs_info *fs_info = NULL; 869 struct btrfs_fs_info *fs_info = NULL;
895 fmode_t mode = FMODE_READ; 870 fmode_t mode = FMODE_READ;
896 char *subvol_name = NULL; 871 char *subvol_name = NULL;
@@ -904,8 +879,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
904 error = btrfs_parse_early_options(data, mode, fs_type, 879 error = btrfs_parse_early_options(data, mode, fs_type,
905 &subvol_name, &subvol_objectid, 880 &subvol_name, &subvol_objectid,
906 &subvol_rootid, &fs_devices); 881 &subvol_rootid, &fs_devices);
907 if (error) 882 if (error) {
883 kfree(subvol_name);
908 return ERR_PTR(error); 884 return ERR_PTR(error);
885 }
909 886
910 if (subvol_name) { 887 if (subvol_name) {
911 root = mount_subvol(subvol_name, flags, device_name, data); 888 root = mount_subvol(subvol_name, flags, device_name, data);
@@ -917,15 +894,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
917 if (error) 894 if (error)
918 return ERR_PTR(error); 895 return ERR_PTR(error);
919 896
920 error = btrfs_open_devices(fs_devices, mode, fs_type);
921 if (error)
922 return ERR_PTR(error);
923
924 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
925 error = -EACCES;
926 goto error_close_devices;
927 }
928
929 /* 897 /*
930 * Setup a dummy root and fs_info for test/set super. This is because 898 * Setup a dummy root and fs_info for test/set super. This is because
931 * we don't actually fill this stuff out until open_ctree, but we need 899 * we don't actually fill this stuff out until open_ctree, but we need
@@ -933,24 +901,36 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
933 * then open_ctree will properly initialize everything later. 901 * then open_ctree will properly initialize everything later.
934 */ 902 */
935 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 903 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
936 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 904 if (!fs_info)
937 if (!fs_info || !tree_root) { 905 return ERR_PTR(-ENOMEM);
906
907 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
908 if (!fs_info->tree_root) {
938 error = -ENOMEM; 909 error = -ENOMEM;
939 goto error_close_devices; 910 goto error_fs_info;
940 } 911 }
941 fs_info->tree_root = tree_root; 912 fs_info->tree_root->fs_info = fs_info;
942 fs_info->fs_devices = fs_devices; 913 fs_info->fs_devices = fs_devices;
943 tree_root->fs_info = fs_info;
944 914
945 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 915 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
946 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 916 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
947 if (!fs_info->super_copy || !fs_info->super_for_commit) { 917 if (!fs_info->super_copy || !fs_info->super_for_commit) {
948 error = -ENOMEM; 918 error = -ENOMEM;
919 goto error_fs_info;
920 }
921
922 error = btrfs_open_devices(fs_devices, mode, fs_type);
923 if (error)
924 goto error_fs_info;
925
926 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
927 error = -EACCES;
949 goto error_close_devices; 928 goto error_close_devices;
950 } 929 }
951 930
952 bdev = fs_devices->latest_bdev; 931 bdev = fs_devices->latest_bdev;
953 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 932 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
933 fs_info->tree_root);
954 if (IS_ERR(s)) { 934 if (IS_ERR(s)) {
955 error = PTR_ERR(s); 935 error = PTR_ERR(s);
956 goto error_close_devices; 936 goto error_close_devices;
@@ -959,12 +939,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
959 if (s->s_root) { 939 if (s->s_root) {
960 if ((flags ^ s->s_flags) & MS_RDONLY) { 940 if ((flags ^ s->s_flags) & MS_RDONLY) {
961 deactivate_locked_super(s); 941 deactivate_locked_super(s);
962 return ERR_PTR(-EBUSY); 942 error = -EBUSY;
943 goto error_close_devices;
963 } 944 }
964 945
965 btrfs_close_devices(fs_devices); 946 btrfs_close_devices(fs_devices);
966 free_fs_info(fs_info); 947 free_fs_info(fs_info);
967 kfree(tree_root);
968 } else { 948 } else {
969 char b[BDEVNAME_SIZE]; 949 char b[BDEVNAME_SIZE];
970 950
@@ -991,8 +971,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
991 971
992error_close_devices: 972error_close_devices:
993 btrfs_close_devices(fs_devices); 973 btrfs_close_devices(fs_devices);
974error_fs_info:
994 free_fs_info(fs_info); 975 free_fs_info(fs_info);
995 kfree(tree_root);
996 return ERR_PTR(error); 976 return ERR_PTR(error);
997} 977}
998 978
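
With btrfs_open_devices() moved below the fs_info allocations, the error paths in btrfs_mount() gain a second label: failures that happen before the devices are opened jump to error_fs_info and skip btrfs_close_devices(), while later failures reach error_close_devices and fall through into the same freeing code. A small userspace sketch of that ordered-label cleanup; the helpers here are placeholders, not btrfs code:

#include <stdio.h>
#include <stdlib.h>

static int open_devices(void)   { puts("open devices");  return 0; }
static void close_devices(void) { puts("close devices"); }
static int later_step(void)     { return -1; }	/* pretend sget() failed */

static int mount_like(void)
{
	int error;
	void *fs_info = calloc(1, 64);

	if (!fs_info)
		return -12;		/* nothing opened yet, just return */

	error = open_devices();
	if (error)
		goto error_fs_info;	/* devices never opened, skip closing */

	error = later_step();
	if (error)
		goto error_close_devices;

	free(fs_info);
	return 0;

error_close_devices:
	close_devices();
error_fs_info:
	free(fs_info);
	return error;
}

int main(void)
{
	printf("mount_like() = %d\n", mount_like());
	return 0;
}
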
@@ -1074,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1074 u64 avail_space; 1054 u64 avail_space;
1075 u64 used_space; 1055 u64 used_space;
1076 u64 min_stripe_size; 1056 u64 min_stripe_size;
1077 int min_stripes = 1; 1057 int min_stripes = 1, num_stripes = 1;
1078 int i = 0, nr_devices; 1058 int i = 0, nr_devices;
1079 int ret; 1059 int ret;
1080 1060
1081 nr_devices = fs_info->fs_devices->rw_devices; 1061 nr_devices = fs_info->fs_devices->open_devices;
1082 BUG_ON(!nr_devices); 1062 BUG_ON(!nr_devices);
1083 1063
1084 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1064 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1088,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1088 1068
1089 /* calc min stripe number for data space alloction */ 1069 /* calc min stripe number for data space alloction */
1090 type = btrfs_get_alloc_profile(root, 1); 1070 type = btrfs_get_alloc_profile(root, 1);
1091 if (type & BTRFS_BLOCK_GROUP_RAID0) 1071 if (type & BTRFS_BLOCK_GROUP_RAID0) {
1092 min_stripes = 2; 1072 min_stripes = 2;
1093 else if (type & BTRFS_BLOCK_GROUP_RAID1) 1073 num_stripes = nr_devices;
1074 } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
1094 min_stripes = 2; 1075 min_stripes = 2;
1095 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1076 num_stripes = 2;
1077 } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
1096 min_stripes = 4; 1078 min_stripes = 4;
1079 num_stripes = 4;
1080 }
1097 1081
1098 if (type & BTRFS_BLOCK_GROUP_DUP) 1082 if (type & BTRFS_BLOCK_GROUP_DUP)
1099 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1083 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1100 else 1084 else
1101 min_stripe_size = BTRFS_STRIPE_LEN; 1085 min_stripe_size = BTRFS_STRIPE_LEN;
1102 1086
1103 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1087 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1104 if (!device->in_fs_metadata) 1088 if (!device->in_fs_metadata || !device->bdev)
1105 continue; 1089 continue;
1106 1090
1107 avail_space = device->total_bytes - device->bytes_used; 1091 avail_space = device->total_bytes - device->bytes_used;
@@ -1162,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1162 i = nr_devices - 1; 1146 i = nr_devices - 1;
1163 avail_space = 0; 1147 avail_space = 0;
1164 while (nr_devices >= min_stripes) { 1148 while (nr_devices >= min_stripes) {
1149 if (num_stripes > nr_devices)
1150 num_stripes = nr_devices;
1151
1165 if (devices_info[i].max_avail >= min_stripe_size) { 1152 if (devices_info[i].max_avail >= min_stripe_size) {
1166 int j; 1153 int j;
1167 u64 alloc_size; 1154 u64 alloc_size;
1168 1155
1169 avail_space += devices_info[i].max_avail * min_stripes; 1156 avail_space += devices_info[i].max_avail * num_stripes;
1170 alloc_size = devices_info[i].max_avail; 1157 alloc_size = devices_info[i].max_avail;
1171 for (j = i + 1 - min_stripes; j <= i; j++) 1158 for (j = i + 1 - num_stripes; j <= i; j++)
1172 devices_info[j].max_avail -= alloc_size; 1159 devices_info[j].max_avail -= alloc_size;
1173 } 1160 }
1174 i--; 1161 i--;
@@ -1285,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb)
1285 return 0; 1272 return 0;
1286} 1273}
1287 1274
1275static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1276{
1277 int ret;
1278
1279 ret = btrfs_dirty_inode(inode);
1280 if (ret)
1281 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1282 "error %d\n", btrfs_ino(inode), ret);
1283}
1284
1288static const struct super_operations btrfs_super_ops = { 1285static const struct super_operations btrfs_super_ops = {
1289 .drop_inode = btrfs_drop_inode, 1286 .drop_inode = btrfs_drop_inode,
1290 .evict_inode = btrfs_evict_inode, 1287 .evict_inode = btrfs_evict_inode,
@@ -1292,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = {
1292 .sync_fs = btrfs_sync_fs, 1289 .sync_fs = btrfs_sync_fs,
1293 .show_options = btrfs_show_options, 1290 .show_options = btrfs_show_options,
1294 .write_inode = btrfs_write_inode, 1291 .write_inode = btrfs_write_inode,
1295 .dirty_inode = btrfs_dirty_inode, 1292 .dirty_inode = btrfs_fs_dirty_inode,
1296 .alloc_inode = btrfs_alloc_inode, 1293 .alloc_inode = btrfs_alloc_inode,
1297 .destroy_inode = btrfs_destroy_inode, 1294 .destroy_inode = btrfs_destroy_inode,
1298 .statfs = btrfs_statfs, 1295 .statfs = btrfs_statfs,
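
Because the VFS ->dirty_inode hook returns void, btrfs_fs_dirty_inode() can only report a failure from btrfs_dirty_inode() by logging it, rate-limited. A userspace approximation of wrapping an int-returning helper behind a void callback, with a crude once-per-second limiter in place of printk_ratelimited(); the names and error values are illustrative:

#include <stdio.h>
#include <time.h>

static int do_dirty(long ino)
{
	return ino == 42 ? -5 : 0;	/* pretend inode 42 hits an I/O error */
}

static void dirty_inode_cb(long ino)	/* void, like ->dirty_inode */
{
	static time_t last;
	int ret = do_dirty(ino);

	if (ret) {
		time_t now = time(NULL);

		if (now != last) {	/* at most one message per second */
			fprintf(stderr, "fail to dirty inode %ld error %d\n",
				ino, ret);
			last = now;
		}
	}
}

int main(void)
{
	dirty_inode_cb(42);
	dirty_inode_cb(42);	/* suppressed within the same second */
	return 0;
}
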
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 960835eaf4da..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
785 785
786 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
787 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
788 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
789 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
790 switch_commit_root(root); 794 switch_commit_root(root);
@@ -882,8 +886,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
882 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
883 887
884 if (to_reserve > 0) { 888 if (to_reserve > 0) {
885 ret = btrfs_block_rsv_add(root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
886 to_reserve); 890 to_reserve);
887 if (ret) { 891 if (ret) {
888 pending->error = ret; 892 pending->error = ret;
889 goto fail; 893 goto fail;
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
947 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
948 free_extent_buffer(old); 952 free_extent_buffer(old);
949 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
950 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
951 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
952 key.offset = trans->transid; 960 key.offset = trans->transid;
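
The force_cow updates above are each followed by smp_wmb() so that readers in should_cow_block() observe the new value once the snapshot state has changed. A tiny C11 sketch of the same publish-then-observe idea, using release/acquire atomics in place of the kernel barrier; force_cow here is just an illustrative global, not the btrfs field:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int force_cow;

static void take_snapshot(void)
{
	/* publish the flag; release ordering mirrors the smp_wmb() pairing */
	atomic_store_explicit(&force_cow, 1, memory_order_release);
}

static int should_cow_block(void)
{
	return atomic_load_explicit(&force_cow, memory_order_acquire);
}

int main(void)
{
	take_snapshot();
	printf("should_cow_block() = %d\n", should_cow_block());
	return 0;
}
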
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f8e2943101a1..f4b839fd3c9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
295 btrfs_requeue_work(&device->work); 295 btrfs_requeue_work(&device->work);
296 goto done; 296 goto done;
297 } 297 }
298 /* unplug every 64 requests just for good measure */
299 if (batch_run % 64 == 0) {
300 blk_finish_plug(&plug);
301 blk_start_plug(&plug);
302 sync_pending = 0;
303 }
298 } 304 }
299 305
300 cond_resched(); 306 cond_resched();
@@ -999,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
999 key.objectid = device->devid; 1005 key.objectid = device->devid;
1000 key.offset = start; 1006 key.offset = start;
1001 key.type = BTRFS_DEV_EXTENT_KEY; 1007 key.type = BTRFS_DEV_EXTENT_KEY;
1002 1008again:
1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1009 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1004 if (ret > 0) { 1010 if (ret > 0) {
1005 ret = btrfs_previous_item(root, path, key.objectid, 1011 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1012,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1012 struct btrfs_dev_extent); 1018 struct btrfs_dev_extent);
1013 BUG_ON(found_key.offset > start || found_key.offset + 1019 BUG_ON(found_key.offset > start || found_key.offset +
1014 btrfs_dev_extent_length(leaf, extent) < start); 1020 btrfs_dev_extent_length(leaf, extent) < start);
1021 key = found_key;
1022 btrfs_release_path(path);
1023 goto again;
1015 } else if (ret == 0) { 1024 } else if (ret == 0) {
1016 leaf = path->nodes[0]; 1025 leaf = path->nodes[0];
1017 extent = btrfs_item_ptr(leaf, path->slots[0], 1026 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1608,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1608 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1617 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1609 return -EINVAL; 1618 return -EINVAL;
1610 1619
1611 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1620 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1612 root->fs_info->bdev_holder); 1621 root->fs_info->bdev_holder);
1613 if (IS_ERR(bdev)) 1622 if (IS_ERR(bdev))
1614 return PTR_ERR(bdev); 1623 return PTR_ERR(bdev);
@@ -3255,7 +3264,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
3255 */ 3264 */
3256 if (atomic_read(&bbio->error) > bbio->max_errors) { 3265 if (atomic_read(&bbio->error) > bbio->max_errors) {
3257 err = -EIO; 3266 err = -EIO;
3258 } else if (err) { 3267 } else {
3259 /* 3268 /*
3260 * this bio is actually up to date, we didn't 3269 * this bio is actually up to date, we didn't
3261 * go over the max number of errors 3270 * go over the max number of errors
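
The btrfs_end_bio() tweak drops the err check from the else branch, so the success path runs whenever the total error count stays within bbio->max_errors, regardless of what the last completion reported. A toy sketch of deciding the final status from the aggregate count; the constants and names are illustrative, not the kernel code:

#include <stdio.h>

static int final_status(int total_errors, int max_errors)
{
	if (total_errors > max_errors)
		return -5;	/* too many mirrors failed, report an error */
	return 0;		/* within tolerance: the data is up to date */
}

int main(void)
{
	printf("%d\n", final_status(1, 1));	/* one bad mirror, tolerated */
	printf("%d\n", final_status(2, 1));	/* too many failures */
	return 0;
}
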
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f352..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
100 struct reada_zone *reada_curr_zone; 100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones; 101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents; 102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
103}; 109};
104 110
105struct btrfs_fs_devices { 111struct btrfs_fs_devices {
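
The new btrfs_device fields give each device a dedicated bio and a completion for sending flush barriers, plus a nobarriers flag for devices that do not need them. Purely as an analogy, a userspace sketch in which a semaphore plays the part of the flush_wait completion and a worker thread plays the end_io callback; none of these names are btrfs APIs:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

struct dev_flush {
	sem_t done;		/* stand-in for the flush_wait completion */
	int nobarriers;		/* skip flushes entirely when set */
};

static void *flush_endio(void *arg)	/* plays the bi_end_io callback */
{
	struct dev_flush *f = arg;

	/* ...pretend the device has finished the flush here... */
	sem_post(&f->done);
	return NULL;
}

int main(void)
{
	struct dev_flush f = { .nobarriers = 0 };
	pthread_t t;

	sem_init(&f.done, 0, 0);
	if (!f.nobarriers) {
		pthread_create(&t, NULL, flush_endio, &f);
		sem_wait(&f.done);	/* wait_for_completion() analogue */
		pthread_join(t, NULL);
	}
	puts("flush complete");
	sem_destroy(&f.done);
	return 0;
}
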