author		Dave Airlie <airlied@redhat.com>	2011-12-20 09:43:53 -0500
committer	Dave Airlie <airlied@redhat.com>	2011-12-20 09:43:53 -0500
commit		1fbe6f625f69e48c4001051dc1431afc704acfaa (patch)
tree		826b741201a2e09a627ed350c6ff36935f5cff79 /fs/btrfs
parent		0cecdd818cd79d092e36e70dfe3a71f2878d6b96 (diff)
parent		384703b8e6cd4c8ef08512e596024e028c91c339 (diff)
Merge tag 'v3.2-rc6' of /home/airlied/devel/kernel/linux-2.6 into drm-core-next
Merge in the upstream tree to bring in the mainline fixes.

Conflicts:
	drivers/gpu/drm/exynos/exynos_drm_fbdev.c
	drivers/gpu/drm/nouveau/nouveau_sgdma.c
Diffstat (limited to 'fs/btrfs')
 fs/btrfs/async-thread.c     | 117
 fs/btrfs/async-thread.h     |   4
 fs/btrfs/backref.c          |   2
 fs/btrfs/btrfs_inode.h      |   4
 fs/btrfs/ctree.c            |  17
 fs/btrfs/ctree.h            |  11
 fs/btrfs/delayed-inode.c    |  62
 fs/btrfs/disk-io.c          | 223
 fs/btrfs/extent-tree.c      | 336
 fs/btrfs/extent_io.c        |  60
 fs/btrfs/extent_io.h        |   2
 fs/btrfs/file.c             |   8
 fs/btrfs/free-space-cache.c |  82
 fs/btrfs/inode-map.c        |  28
 fs/btrfs/inode.c            | 272
 fs/btrfs/ioctl.c            |  23
 fs/btrfs/relocation.c       |   4
 fs/btrfs/scrub.c            |  79
 fs/btrfs/super.c            | 125
 fs/btrfs/transaction.c      |  12
 fs/btrfs/volumes.c          |  15
 fs/btrfs/volumes.h          |   6
 22 files changed, 987 insertions(+), 505 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..cb97174e2366 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
 	int idle;
 };
 
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
 /*
  * btrfs_start_workers uses kthread_run, which can block waiting for memory
  * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
 {
 	struct worker_start *start;
 	start = container_of(work, struct worker_start, work);
-	btrfs_start_workers(start->queue, 1);
+	__btrfs_start_workers(start->queue);
 	kfree(start);
 }
 
-static int start_new_worker(struct btrfs_workers *queue)
-{
-	struct worker_start *start;
-	int ret;
-
-	start = kzalloc(sizeof(*start), GFP_NOFS);
-	if (!start)
-		return -ENOMEM;
-
-	start->work.func = start_new_worker_func;
-	start->queue = queue;
-	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
-	if (ret)
-		kfree(start);
-	return ret;
-}
-
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
 	struct btrfs_workers *workers = worker->workers;
+	struct worker_start *start;
 	unsigned long flags;
 
 	rmb();
 	if (!workers->atomic_start_pending)
 		return;
 
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return;
+
+	start->work.func = start_new_worker_func;
+	start->queue = workers;
+
 	spin_lock_irqsave(&workers->lock, flags);
 	if (!workers->atomic_start_pending)
 		goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 
 	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	start_new_worker(workers);
+	btrfs_queue_worker(workers->atomic_worker_start, &start->work);
 	return;
 
 out:
+	kfree(start);
 	spin_unlock_irqrestore(&workers->lock, flags);
 }
 
@@ -331,7 +325,7 @@ again:
 			run_ordered_completions(worker->workers, work);
 
 			check_pending_worker_creates(worker);
-
+			cond_resched();
 		}
 
 		spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
  * starts new worker threads. This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-static int __btrfs_start_workers(struct btrfs_workers *workers,
-				 int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
-	int i;
 
-	for (i = 0; i < num_workers; i++) {
-		worker = kzalloc(sizeof(*worker), GFP_NOFS);
-		if (!worker) {
-			ret = -ENOMEM;
-			goto fail;
-		}
+	worker = kzalloc(sizeof(*worker), GFP_NOFS);
+	if (!worker) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 
-		INIT_LIST_HEAD(&worker->pending);
-		INIT_LIST_HEAD(&worker->prio_pending);
-		INIT_LIST_HEAD(&worker->worker_list);
-		spin_lock_init(&worker->lock);
+	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
+	INIT_LIST_HEAD(&worker->worker_list);
+	spin_lock_init(&worker->lock);
 
-		atomic_set(&worker->num_pending, 0);
-		atomic_set(&worker->refs, 1);
-		worker->workers = workers;
-		worker->task = kthread_run(worker_loop, worker,
-					   "btrfs-%s-%d", workers->name,
-					   workers->num_workers + i);
-		if (IS_ERR(worker->task)) {
-			ret = PTR_ERR(worker->task);
-			kfree(worker);
-			goto fail;
-		}
-		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->idle_list);
-		worker->idle = 1;
-		workers->num_workers++;
-		workers->num_workers_starting--;
-		WARN_ON(workers->num_workers_starting < 0);
-		spin_unlock_irq(&workers->lock);
+	atomic_set(&worker->num_pending, 0);
+	atomic_set(&worker->refs, 1);
+	worker->workers = workers;
+	worker->task = kthread_run(worker_loop, worker,
+				   "btrfs-%s-%d", workers->name,
+				   workers->num_workers + 1);
+	if (IS_ERR(worker->task)) {
+		ret = PTR_ERR(worker->task);
+		kfree(worker);
+		goto fail;
 	}
+	spin_lock_irq(&workers->lock);
+	list_add_tail(&worker->worker_list, &workers->idle_list);
+	worker->idle = 1;
+	workers->num_workers++;
+	workers->num_workers_starting--;
+	WARN_ON(workers->num_workers_starting < 0);
+	spin_unlock_irq(&workers->lock);
+
 	return 0;
 fail:
-	btrfs_stop_workers(workers);
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting--;
+	spin_unlock_irq(&workers->lock);
 	return ret;
 }
 
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
 {
 	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting += num_workers;
+	workers->num_workers_starting++;
 	spin_unlock_irq(&workers->lock);
-	return __btrfs_start_workers(workers, num_workers);
+	return __btrfs_start_workers(workers);
 }
 
 /*
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
 	struct list_head *fallback;
+	int ret;
 
 again:
 	spin_lock_irqsave(&workers->lock, flags);
@@ -584,7 +578,9 @@ again:
 			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			__btrfs_start_workers(workers, 1);
+			ret = __btrfs_start_workers(workers);
+			if (ret)
+				goto fallback;
 			goto again;
 		}
 	}
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	/* don't requeue something already on a list */
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		goto out;
+		return;
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	if (wake)
 		wake_up_process(worker->task);
 	spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
-	return 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
 	char *name;
 };
 
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
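For orientation, a minimal caller-side sketch of the reworked worker API (not part of the commit): btrfs_start_workers() now starts exactly one thread per call and can fail, while btrfs_queue_worker() can no longer fail and returns void. The function and parameter names below are hypothetical; the real usage is the open_ctree() hunk in disk-io.c later in this diff.

```c
/* Illustrative sketch only, assuming the prototypes declared above. */
static int setup_and_queue(struct btrfs_workers *pool, struct btrfs_work *work)
{
	int ret;

	/* btrfs_start_workers() starts one thread and may return -ENOMEM */
	ret = btrfs_start_workers(pool);
	if (ret)
		return ret;

	/* queueing itself cannot fail any more, so nothing to check here */
	btrfs_queue_worker(pool, work);
	return 0;
}
```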
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929c..22c64fff1bd5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		ipath->fspath->val[i] = (u64)fspath;
+		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a3935..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
 	BTRFS_CACHE_NO		= 0,
 	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FINISHED	= 2,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
 };
 
 enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 int btrfs_block_rsv_refill(struct btrfs_root *root,
 			   struct btrfs_block_rsv *block_rsv,
 			   u64 min_reserved);
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+				   struct btrfs_block_rsv *block_rsv,
+				   u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
@@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode, int flags);
+int btrfs_dirty_inode(struct inode *inode);
+int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 3a1b939c9ae2..9c1eccc2c503 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 static int btrfs_delayed_inode_reserve_metadata(
 					struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
+					struct inode *inode,
 					struct btrfs_delayed_node *node)
 {
 	struct btrfs_block_rsv *src_rsv;
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
+	int release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -638,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
 	 * we're accounted for.
 	 */
-	if (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+	if (!src_rsv || (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
 		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
@@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata(
 		if (!ret)
 			node->bytes_reserved = num_bytes;
 		return ret;
+	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (BTRFS_I(inode)->delalloc_meta_reserved) {
+			BTRFS_I(inode)->delalloc_meta_reserved = 0;
+			spin_unlock(&BTRFS_I(inode)->lock);
+			release = true;
+			goto migrate;
+		}
+		spin_unlock(&BTRFS_I(inode)->lock);
+
+		/* Ok we didn't have space pre-reserved. This shouldn't happen
+		 * too often but it can happen if we do delalloc to an existing
+		 * inode which gets dirtied because of the time update, and then
+		 * isn't touched again until after the transaction commits and
+		 * then we try to write out the data. First try to be nice and
+		 * reserve something strictly for us. If not be a pain and try
+		 * to steal from the delalloc block rsv.
+		 */
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		/*
+		 * Ok this is a problem, let's just steal from the global rsv
+		 * since this really shouldn't happen that often.
+		 */
+		WARN_ON(1);
+		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+					      dst_rsv, num_bytes);
+		goto out;
 	}
 
+migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+	/*
+	 * Migrate only takes a reservation, it doesn't touch the size of the
+	 * block_rsv. This is to simplify people who don't normally have things
+	 * migrated from their block rsv. If they go to release their
+	 * reservation, that will decrease the size as well, so if migrate
+	 * reduced size we'd end up with a negative size. But for the
+	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+	 * but we could in fact do this reserve/migrate dance several times
+	 * between the time we did the original reservation and we'd clean it
+	 * up. So to take care of this, release the space for the meta
+	 * reservation here. I think it may be time for a documentation page on
+	 * how block rsvs. work.
+	 */
 	if (!ret)
 		node->bytes_reserved = num_bytes;
 
+	if (release)
+		btrfs_block_rsv_release(root, src_rsv, num_bytes);
+
 	return ret;
 }
 
@@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 		goto release_node;
 	}
 
-	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+						   delayed_node);
 	if (ret)
 		goto release_node;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 102c176fc29c..f44b3928dc2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
 
 static int btree_io_failed_hook(struct bio *failed_bio,
 				struct page *page, u64 start, u64 end,
-				u64 mirror_num, struct extent_state *state)
+				int mirror_num, struct extent_state *state)
 {
 	struct extent_io_tree *tree;
 	unsigned long len;
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u64 features;
 	struct btrfs_key location;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
-	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
-					       GFP_NOFS);
+	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = NULL;
-	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
-	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
-					      GFP_NOFS);
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_root *extent_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct btrfs_root *log_tree_root;
-
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
-	struct btrfs_super_block *disk_super;
+	extent_root = fs_info->extent_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	csum_root = fs_info->csum_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	chunk_root = fs_info->chunk_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	dev_root = fs_info->dev_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 
-	if (!extent_root || !tree_root || !tree_root->fs_info ||
-	    !chunk_root || !dev_root || !csum_root) {
+	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	fs_info = tree_root->fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
-	fs_info->tree_root = tree_root;
-	fs_info->extent_root = extent_root;
-	fs_info->csum_root = csum_root;
-	fs_info->chunk_root = chunk_root;
-	fs_info->dev_root = dev_root;
-	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
@@ -2199,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
 	fs_info->readahead_workers.idle_thresh = 2;
 
-	btrfs_start_workers(&fs_info->workers, 1);
-	btrfs_start_workers(&fs_info->generic_worker, 1);
-	btrfs_start_workers(&fs_info->submit_workers, 1);
-	btrfs_start_workers(&fs_info->delalloc_workers, 1);
-	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
-	btrfs_start_workers(&fs_info->delayed_workers, 1);
-	btrfs_start_workers(&fs_info->caching_workers, 1);
-	btrfs_start_workers(&fs_info->readahead_workers, 1);
+	/*
+	 * btrfs_start_workers can really only fail because of ENOMEM so just
+	 * return -ENOMEM if any of these fail.
+	 */
+	ret = btrfs_start_workers(&fs_info->workers);
+	ret |= btrfs_start_workers(&fs_info->generic_worker);
+	ret |= btrfs_start_workers(&fs_info->submit_workers);
+	ret |= btrfs_start_workers(&fs_info->delalloc_workers);
+	ret |= btrfs_start_workers(&fs_info->fixup_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
+	ret |= btrfs_start_workers(&fs_info->delayed_workers);
+	ret |= btrfs_start_workers(&fs_info->caching_workers);
+	ret |= btrfs_start_workers(&fs_info->readahead_workers);
+	if (ret) {
+		ret = -ENOMEM;
+		goto fail_sb_buffer;
+	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2465,21 +2468,20 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
 fail_iput:
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-
-	btrfs_close_devices(fs_info->fs_devices);
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_close_devices(fs_info->fs_devices);
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
 
 recovery_tree_root:
-
 	if (!btrfs_test_opt(tree_root, RECOVERY))
 		goto fail_tree_roots;
 
@@ -2579,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
 	int errors = 0;
 	u32 crc;
 	u64 bytenr;
-	int last_barrier = 0;
 
 	if (max_mirrors == 0)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
-	/* make sure only the last submit_bh does a barrier */
-	if (do_barriers) {
-		for (i = 0; i < max_mirrors; i++) {
-			bytenr = btrfs_sb_offset(i);
-			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-			    device->total_bytes)
-				break;
-			last_barrier = i;
-		}
-	}
-
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2640,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers)
-			ret = submit_bh(WRITE_FLUSH_FUA, bh);
-		else
-			ret = submit_bh(WRITE_SYNC, bh);
-
+		/*
+		 * we fua the first super. The others we allow
+		 * to go down lazy.
+		 */
+		ret = submit_bh(WRITE_FUA, bh);
 		if (ret)
 			errors++;
 	}
 	return errors < i ? 0 : -1;
 }
 
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	if (device->nobarriers)
+		return 0;
+
+	if (wait) {
+		bio = device->flush_bio;
+		if (!bio)
+			return 0;
+
+		wait_for_completion(&device->flush_wait);
+
+		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+			printk("btrfs: disabling barriers on dev %s\n",
+			       device->name);
+			device->nobarriers = 1;
+		}
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
+			ret = -EIO;
+		}
+
+		/* drop the reference from the wait == 0 run */
+		bio_put(bio);
+		device->flush_bio = NULL;
+
+		return ret;
+	}
+
+	/*
+	 * one reference for us, and we leave it for the
+	 * caller
+	 */
+	device->flush_bio = NULL;;
+	bio = bio_alloc(GFP_NOFS, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = btrfs_end_empty_barrier;
+	bio->bi_bdev = device->bdev;
+	init_completion(&device->flush_wait);
+	bio->bi_private = &device->flush_wait;
+	device->flush_bio = bio;
+
+	bio_get(bio);
+	submit_bio(WRITE_FLUSH, bio);
+
+	return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+	struct list_head *head;
+	struct btrfs_device *dev;
+	int errors = 0;
+	int ret;
+
+	/* send down all the barriers */
+	head = &info->fs_devices->devices;
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 0);
+		if (ret)
+			errors++;
+	}
+
+	/* wait for all the barriers */
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 1);
+		if (ret)
+			errors++;
+	}
+	if (errors)
+		return -EIO;
+	return 0;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2672,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
+
+	if (do_barriers)
+		barrier_all_devices(root->fs_info);
+
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
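For context, a condensed sketch (not part of the commit) of the ordering the disk-io.c hunks above establish: every writeable device first receives an empty WRITE_FLUSH bio, all of those flushes are waited on, and only then are the super block copies submitted, with the first copy going down via WRITE_FUA. The helper name below is hypothetical; the real logic lives in barrier_all_devices() and write_dev_supers() shown above.

```c
/* Illustrative sketch only, using the helpers introduced in this diff. */
static int flush_devices_then_write_supers(struct btrfs_fs_info *info,
					   int max_mirrors)
{
	int ret;

	/* pass 1 + pass 2: send an empty flush per device, then wait for all */
	ret = barrier_all_devices(info);
	if (ret)
		return ret;	/* -EIO if any device failed its flush */

	/*
	 * Supers are written afterwards; write_dev_supers() submits the first
	 * copy with WRITE_FUA and lets the remaining mirrors go down lazily.
	 */
	return 0;
}
```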
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9879bd474632..f5fbe576d2ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			     struct btrfs_root *root,
 			     int load_cache_only)
 {
+	DEFINE_WAIT(wait);
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
 	int ret = 0;
 
-	smp_mb();
-	if (cache->cached != BTRFS_CACHE_NO)
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+	BUG_ON(!caching_ctl);
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	atomic_set(&caching_ctl->count, 1);
+	caching_ctl->work.func = caching_thread;
+
+	spin_lock(&cache->lock);
+	/*
+	 * This should be a rare occasion, but this could happen I think in the
+	 * case where one thread starts to load the space cache info, and then
+	 * some other thread starts a transaction commit which tries to do an
+	 * allocation while the other thread is still loading the space cache
+	 * info. The previous loop should have kept us from choosing this block
+	 * group, but if we've moved to the state where we will wait on caching
+	 * block groups we need to first check if we're doing a fast load here,
+	 * so we can wait for it to finish, otherwise we could end up allocating
+	 * from a block group who's cache gets evicted for one reason or
+	 * another.
+	 */
+	while (cache->cached == BTRFS_CACHE_FAST) {
+		struct btrfs_caching_control *ctl;
+
+		ctl = cache->caching_ctl;
+		atomic_inc(&ctl->count);
+		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&cache->lock);
+
+		schedule();
+
+		finish_wait(&ctl->wait, &wait);
+		put_caching_control(ctl);
+		spin_lock(&cache->lock);
+	}
+
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		kfree(caching_ctl);
 		return 0;
+	}
+	WARN_ON(cache->caching_ctl);
+	cache->caching_ctl = caching_ctl;
+	cache->cached = BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
 
 	/*
 	 * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	if (trans && (!trans->transaction->in_commit) &&
 	    (root && root != root->fs_info->tree_root) &&
 	    btrfs_test_opt(root, SPACE_CACHE)) {
-		spin_lock(&cache->lock);
-		if (cache->cached != BTRFS_CACHE_NO) {
-			spin_unlock(&cache->lock);
-			return 0;
-		}
-		cache->cached = BTRFS_CACHE_STARTED;
-		spin_unlock(&cache->lock);
-
 		ret = load_free_space_cache(fs_info, cache);
 
 		spin_lock(&cache->lock);
 		if (ret == 1) {
+			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			cache->last_byte_to_unpin = (u64)-1;
 		} else {
-			cache->cached = BTRFS_CACHE_NO;
+			if (load_cache_only) {
+				cache->caching_ctl = NULL;
+				cache->cached = BTRFS_CACHE_NO;
+			} else {
+				cache->cached = BTRFS_CACHE_STARTED;
+			}
 		}
 		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
 		if (ret == 1) {
+			put_caching_control(caching_ctl);
 			free_excluded_extents(fs_info->extent_root, cache);
 			return 0;
 		}
+	} else {
+		/*
+		 * We are not going to do the fast caching, set cached to the
+		 * appropriate value and wakeup any waiters.
+		 */
+		spin_lock(&cache->lock);
+		if (load_cache_only) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_NO;
+		} else {
+			cache->cached = BTRFS_CACHE_STARTED;
+		}
+		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
 	}
 
-	if (load_cache_only)
-		return 0;
-
-	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-	BUG_ON(!caching_ctl);
-
-	INIT_LIST_HEAD(&caching_ctl->list);
-	mutex_init(&caching_ctl->mutex);
-	init_waitqueue_head(&caching_ctl->wait);
-	caching_ctl->block_group = cache;
-	caching_ctl->progress = cache->key.objectid;
-	/* one for caching kthread, one for caching block group list */
-	atomic_set(&caching_ctl->count, 2);
-	caching_ctl->work.func = caching_thread;
-
-	spin_lock(&cache->lock);
-	if (cache->cached != BTRFS_CACHE_NO) {
-		spin_unlock(&cache->lock);
-		kfree(caching_ctl);
+	if (load_cache_only) {
+		put_caching_control(caching_ctl);
 		return 0;
 	}
-	cache->caching_ctl = caching_ctl;
-	cache->cached = BTRFS_CACHE_STARTED;
-	spin_unlock(&cache->lock);
 
 	down_write(&fs_info->extent_commit_sem);
+	atomic_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 	up_write(&fs_info->extent_commit_sem);
 
@@ -2781,7 +2822,7 @@ out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
-	if (!ret)
+	if (!ret && dcs == BTRFS_DC_SETUP)
 		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
@@ -3797,16 +3838,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 num_bytes, int flush)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3815,22 +3856,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes)
+{
+	return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
 				struct btrfs_block_rsv *block_rsv,
 				u64 num_bytes)
 {
-	int ret;
-
-	if (num_bytes == 0)
-		return 0;
-
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 1);
-		return 0;
-	}
-
-	return ret;
+	return __block_rsv_add(root, block_rsv, num_bytes, 0);
 }
 
 int btrfs_block_rsv_check(struct btrfs_root *root,
@@ -3851,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 	return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv,
-			   u64 min_reserved)
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+					   struct btrfs_block_rsv *block_rsv,
+					   u64 min_reserved, int flush)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -3872,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -3881,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv,
+			   u64 min_reserved)
+{
+	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+				   struct btrfs_block_rsv *block_rsv,
+				   u64 min_reserved)
+{
+	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes)
@@ -4064,23 +4115,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
  */
static unsigned drop_outstanding_extent(struct inode *inode)
 {
+	unsigned drop_inode_space = 0;
 	unsigned dropped_extents = 0;
 
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
+	if (BTRFS_I(inode)->outstanding_extents == 0 &&
+	    BTRFS_I(inode)->delalloc_meta_reserved) {
+		drop_inode_space = 1;
+		BTRFS_I(inode)->delalloc_meta_reserved = 0;
+	}
+
 	/*
 	 * If we have more or the same amount of outsanding extents than we have
 	 * reserved then we need to leave the reserved extents count alone.
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		return 0;
+		return drop_inode_space;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-	return dropped_extents;
+	return dropped_extents + drop_inode_space;
 }
 
 /**
@@ -4146,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
+	u64 csum_bytes;
 	unsigned nr_extents = 0;
+	int extra_reserve = 0;
 	int flush = 1;
 	int ret;
 
+	/* Need to be holding the i_mutex here if we aren't free space cache */
 	if (btrfs_is_free_space_inode(root, inode))
 		flush = 0;
+	else
+		WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
 	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
@@ -4162,14 +4225,22 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	BTRFS_I(inode)->outstanding_extents++;
 
 	if (BTRFS_I(inode)->outstanding_extents >
-	    BTRFS_I(inode)->reserved_extents) {
+	    BTRFS_I(inode)->reserved_extents)
 		nr_extents = BTRFS_I(inode)->outstanding_extents -
 			BTRFS_I(inode)->reserved_extents;
-		BTRFS_I(inode)->reserved_extents += nr_extents;
 
-		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+	/*
+	 * Add an item to reserve for updating the inode when we complete the
+	 * delalloc io.
+	 */
+	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+		nr_extents++;
+		extra_reserve = 1;
 	}
+
+	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4179,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		spin_lock(&BTRFS_I(inode)->lock);
 		dropped = drop_outstanding_extent(inode);
-		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-		spin_unlock(&BTRFS_I(inode)->lock);
-		to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
 		/*
-		 * Somebody could have come in and twiddled with the
-		 * reservation, so if we have to free more than we would have
-		 * reserved from this reservation go ahead and release those
-		 * bytes.
+		 * If the inodes csum_bytes is the same as the original
+		 * csum_bytes then we know we haven't raced with any free()ers
+		 * so we can just reduce our inodes csum bytes and carry on.
+		 * Otherwise we have to do the normal free thing to account for
+		 * the case that the free side didn't free up its reserve
+		 * because of this outstanding reservation.
 		 */
-		to_free -= to_reserve;
+		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+			calc_csum_metadata_size(inode, num_bytes, 0);
+		else
+			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+		spin_unlock(&BTRFS_I(inode)->lock);
+		if (dropped)
+			to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
 		if (to_free)
 			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
+	spin_lock(&BTRFS_I(inode)->lock);
+	if (extra_reserve) {
+		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		nr_extents--;
+	}
+	BTRFS_I(inode)->reserved_extents += nr_extents;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
@@ -5040,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
+	struct btrfs_block_group_cache *used_block_group;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
 	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
-	int last_ptr_loop = 0;
 	int loop = 0;
 	int index = 0;
 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5106,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 ideal_cache:
 	block_group = btrfs_lookup_block_group(root->fs_info,
 					       search_start);
+	used_block_group = block_group;
 	/*
 	 * we don't want to use the block group if it doesn't match our
 	 * allocation bits, or if its not cached.
@@ -5143,6 +5228,7 @@ search:
 		u64 offset;
 		int cached;
 
+		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -5166,13 +5252,15 @@ search:
 		}
 
 have_block_group:
-		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+		cached = block_group_cache_done(block_group);
+		if (unlikely(!cached)) {
 			u64 free_percent;
 
+			found_uncached_bg = true;
 			ret = cache_block_group(block_group, trans,
 						orig_root, 1);
 			if (block_group->cached == BTRFS_CACHE_FINISHED)
-				goto have_block_group;
+				goto alloc;
 
 			free_percent = btrfs_block_group_used(&block_group->item);
 			free_percent *= 100;
@@ -5194,7 +5282,6 @@ have_block_group:
 						orig_root, 0);
 			BUG_ON(ret);
 		}
-		found_uncached_bg = true;
 
 		/*
 		 * If loop is set for cached only, try the next block
@@ -5204,94 +5291,80 @@ have_block_group:
 			goto loop;
 		}
 
-		cached = block_group_cache_done(block_group);
-		if (unlikely(!cached))
-			found_uncached_bg = true;
-
+alloc:
 		if (unlikely(block_group->ro))
 			goto loop;
 
 		spin_lock(&block_group->free_space_ctl->tree_lock);
 		if (cached &&
 		    block_group->free_space_ctl->free_space <
-		    num_bytes + empty_size) {
+		    num_bytes + empty_cluster + empty_size) {
 			spin_unlock(&block_group->free_space_ctl->tree_lock);
 			goto loop;
 		}
 		spin_unlock(&block_group->free_space_ctl->tree_lock);
 
 		/*
-		 * Ok we want to try and use the cluster allocator, so lets look
-		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
-		 * have tried the cluster allocator plenty of times at this
-		 * point and not have found anything, so we are likely way too
-		 * fragmented for the clustering stuff to find anything, so lets
-		 * just skip it and let the allocator find whatever block it can
-		 * find
+		 * Ok we want to try and use the cluster allocator, so
+		 * lets look there
 		 */
-		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+		if (last_ptr) {
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
 			 */
 			spin_lock(&last_ptr->refill_lock);
-			if (last_ptr->block_group &&
-			    (last_ptr->block_group->ro ||
-			    !block_group_bits(last_ptr->block_group, data))) {
-				offset = 0;
+			used_block_group = last_ptr->block_group;
+			if (used_block_group != block_group &&
+			    (!used_block_group ||
+			     used_block_group->ro ||
+			     !block_group_bits(used_block_group, data))) {
+				used_block_group = block_group;
 				goto refill_cluster;
 			}
 
-			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
-							  num_bytes, search_start);
+			if (used_block_group != block_group)
+				btrfs_get_block_group(used_block_group);
+
+			offset = btrfs_alloc_from_cluster(used_block_group,
+				last_ptr, num_bytes, used_block_group->key.objectid);
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
 				goto checks;
 			}
 
-			spin_lock(&last_ptr->lock);
-			/*
-			 * whoops, this cluster doesn't actually point to
-			 * this block group. Get a ref on the block
-			 * group is does point to and try again
-			 */
-			if (!last_ptr_loop && last_ptr->block_group &&
-			    last_ptr->block_group != block_group &&
-			    index <=
-				 get_block_group_index(last_ptr->block_group)) {
-
-				btrfs_put_block_group(block_group);
-				block_group = last_ptr->block_group;
-				btrfs_get_block_group(block_group);
-				spin_unlock(&last_ptr->lock);
-				spin_unlock(&last_ptr->refill_lock);
-
-				last_ptr_loop = 1;
-				search_start = block_group->key.objectid;
-				/*
-				 * we know this block group is properly
-				 * in the list because
-				 * btrfs_remove_block_group, drops the
-				 * cluster before it removes the block
-				 * group from the list
-				 */
-				goto have_block_group;
+			WARN_ON(last_ptr->block_group != used_block_group);
+			if (used_block_group != block_group) {
+				btrfs_put_block_group(used_block_group);
+				used_block_group = block_group;
 			}
-			spin_unlock(&last_ptr->lock);
 refill_cluster:
+			BUG_ON(used_block_group != block_group);
+			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
+			 * set up a new clusters, so lets just skip it
+			 * and let the allocator find whatever block
+			 * it can find. If we reach this point, we
+			 * will have tried the cluster allocator
+			 * plenty of times and not have found
+			 * anything, so we are likely way too
+			 * fragmented for the clustering stuff to find
+			 * anything. */
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
 			/*
 			 * this cluster didn't work out, free it and
 			 * start over
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
-			last_ptr_loop = 0;
-
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
-					       offset, num_bytes,
+					       search_start, num_bytes,
 					       empty_cluster + empty_size);
 			if (ret == 0) {
 				/*
@@ -5327,6 +5400,7 @@ refill_cluster:
 			goto loop;
 		}
 
+unclustered_alloc:
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
 		/*
@@ -5353,14 +5427,14 @@ checks:
 		search_start = stripe_align(root, offset);
 		/* move on to the next group */
 		if (search_start + num_bytes >= search_end) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
-		    block_group->key.objectid + block_group->key.offset) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+		    used_block_group->key.objectid + used_block_group->key.offset) {
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5368,14 +5442,14 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
-		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
 						  alloc_type);
 		if (ret == -EAGAIN) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5384,15 +5458,19 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
5388 search_start - offset); 5462 search_start - offset);
5389 BUG_ON(offset > search_start); 5463 BUG_ON(offset > search_start);
5464 if (used_block_group != block_group)
5465 btrfs_put_block_group(used_block_group);
5390 btrfs_put_block_group(block_group); 5466 btrfs_put_block_group(block_group);
5391 break; 5467 break;
5392loop: 5468loop:
5393 failed_cluster_refill = false; 5469 failed_cluster_refill = false;
5394 failed_alloc = false; 5470 failed_alloc = false;
5395 BUG_ON(index != get_block_group_index(block_group)); 5471 BUG_ON(index != get_block_group_index(block_group));
5472 if (used_block_group != block_group)
5473 btrfs_put_block_group(used_block_group);
5396 btrfs_put_block_group(block_group); 5474 btrfs_put_block_group(block_group);
5397 } 5475 }
5398 up_read(&space_info->groups_sem); 5476 up_read(&space_info->groups_sem);
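
The extent-tree.c change above separates the block group being scanned (block_group) from the one the allocation cluster currently points at (used_block_group). When the two differ, an extra reference is taken with btrfs_get_block_group() and dropped again on every exit path (before refill_cluster, hence the BUG_ON there, at the loop label, and before the final break), and the free-space and reservation accounting under checks: now targets used_block_group. Below is a minimal userspace sketch of that get/put pairing; the struct and helper names are stand-ins, not the btrfs definitions.

/* Illustrative sketch only: a userspace model of the get/put discipline the
 * extent-tree.c hunks follow for used_block_group. */
#include <assert.h>
#include <stdio.h>

struct group {
        int refs;
        const char *name;
};

static void get_group(struct group *g) { g->refs++; }
static void put_group(struct group *g) { assert(g->refs > 0); g->refs--; }

static int alloc_one(struct group *bg, struct group *cluster_group)
{
        struct group *used = cluster_group ? cluster_group : bg;

        if (used != bg)
                get_group(used);        /* extra ref only when they differ */

        /* ... try the cluster first, fall back to the unclustered path,
         * always accounting free space against 'used' ... */

        if (used != bg)
                put_group(used);        /* dropped on every exit path */
        return 0;
}

int main(void)
{
        struct group a = { 1, "bg A" };
        struct group b = { 1, "bg B" };

        alloc_one(&a, &b);
        printf("%s refs=%d, %s refs=%d\n", a.name, a.refs, b.name, b.refs);
        return 0;
}
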
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f87c4d0e7a0..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
935 node = tree_search(tree, start); 935 node = tree_search(tree, start);
936 if (!node) { 936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc); 937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) 938 if (!prealloc) {
939 return -ENOMEM; 939 err = -ENOMEM;
940 goto out;
941 }
940 err = insert_state(tree, prealloc, start, end, &bits); 942 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL; 943 prealloc = NULL;
942 BUG_ON(err == -EEXIST); 944 BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
992 */ 994 */
993 if (state->start < start) { 995 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc); 996 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc) 997 if (!prealloc) {
996 return -ENOMEM; 998 err = -ENOMEM;
999 goto out;
1000 }
997 err = split_state(tree, state, prealloc, start); 1001 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST); 1002 BUG_ON(err == -EEXIST);
999 prealloc = NULL; 1003 prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
1024 this_end = last_start - 1; 1028 this_end = last_start - 1;
1025 1029
1026 prealloc = alloc_extent_state_atomic(prealloc); 1030 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc) 1031 if (!prealloc) {
1028 return -ENOMEM; 1032 err = -ENOMEM;
1033 goto out;
1034 }
1029 1035
1030 /* 1036 /*
1031 * Avoid to free 'prealloc' if it can be merged with 1037 * Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
1051 */ 1057 */
1052 if (state->start <= end && state->end > end) { 1058 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc); 1059 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc) 1060 if (!prealloc) {
1055 return -ENOMEM; 1061 err = -ENOMEM;
1062 goto out;
1063 }
1056 1064
1057 err = split_state(tree, state, prealloc, end + 1); 1065 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST); 1066 BUG_ON(err == -EEXIST);
@@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2285 clean_io_failure(start, page); 2293 clean_io_failure(start, page);
2286 } 2294 }
2287 if (!uptodate) { 2295 if (!uptodate) {
2288 u64 failed_mirror; 2296 int failed_mirror;
2289 failed_mirror = (u64)bio->bi_bdev; 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
2290 if (tree->ops && tree->ops->readpage_io_failed_hook) 2298 /*
2291 ret = tree->ops->readpage_io_failed_hook( 2299 * The generic bio_readpage_error handles errors the
2292 bio, page, start, end, 2300 * following way: If possible, new read requests are
2293 failed_mirror, state); 2301 * created and submitted and will end up in
2294 else 2302 * end_bio_extent_readpage as well (if we're lucky, not
2295 ret = bio_readpage_error(bio, page, start, end, 2303 * in the !uptodate case). In that case it returns 0 and
2296 failed_mirror, NULL); 2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
2297 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
2298 uptodate = 2312 uptodate =
2299 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
2300 if (err) 2314 if (err)
@@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2302 uncache_state(&cached); 2316 uncache_state(&cached);
2303 continue; 2317 continue;
2304 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
2305 } 2326 }
2306 2327
2307 if (uptodate) { 2328 if (uptodate) {
@@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3366 return -ENOMEM; 3387 return -ENOMEM;
3367 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
3368 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
3369 /* 3393 /*
3370 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
3371 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3413 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3414 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
3415 3439
3416 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
3417 get_extent); 3441 get_extent);
3418 if (!em) 3442 if (!em)
3419 goto out; 3443 goto out;
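
Two things change in extent_io.c. The allocation-failure paths in the extent-bit code stop returning -ENOMEM directly and instead set err and jump to the shared out label, letting the function's normal unwind (dropping the tree lock and freeing the unused preallocation) run. In end_bio_extent_readpage the retry order is swapped: the generic bio_readpage_error runs first, as the new comment explains, and the per-tree readpage_io_failed_hook is only consulted if that fails, rejoining the success path through error_handled; failed_mirror also becomes an int, matching the prototype change in extent_io.h below. A compact userspace sketch of the goto-out error handling follows, using stand-in types and a pthread mutex in place of the tree lock.

/* Sketch of the error-path change: on allocation failure the function no
 * longer returns directly but jumps to a common exit that releases the
 * lock and frees the preallocation. Simplified stand-in code. */
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

struct state { int dummy; };

static int set_bits(pthread_mutex_t *tree_lock)
{
        struct state *prealloc = NULL;
        int err = 0;

        pthread_mutex_lock(tree_lock);

        prealloc = malloc(sizeof(*prealloc));
        if (!prealloc) {
                /* jump to the common exit instead of returning -ENOMEM
                 * with the lock still held and prealloc leaked */
                err = -ENOMEM;
                goto out;
        }

        /* ... insert / split extent states using prealloc ... */

out:
        pthread_mutex_unlock(tree_lock);
        free(prealloc);
        return err;
}

int main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

        return set_bits(&lock) ? 1 : 0;
}
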
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23bc..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
70 unsigned long bio_flags); 70 unsigned long bio_flags);
71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, u64 failed_mirror, 73 u64 start, u64 end, int failed_mirror,
74 struct extent_state *state); 74 struct extent_state *state);
75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
76 u64 start, u64 end, 76 u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dafdfa059bf6..97fbe939c050 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1169 (sizeof(struct page *))); 1169 (sizeof(struct page *)));
1170 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1171 nrptrs = max(nrptrs, 8);
1170 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1172 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1171 if (!pages) 1173 if (!pages)
1172 return -ENOMEM; 1174 return -ENOMEM;
@@ -1387,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1387 goto out; 1389 goto out;
1388 } 1390 }
1389 1391
1390 file_update_time(file); 1392 err = btrfs_update_time(file);
1393 if (err) {
1394 mutex_unlock(&inode->i_mutex);
1395 goto out;
1396 }
1391 BTRFS_I(inode)->sequence++; 1397 BTRFS_I(inode)->sequence++;
1392 1398
1393 start_pos = round_down(pos, root->sectorsize); 1399 start_pos = round_down(pos, root->sectorsize);
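
The file.c hunks cap the number of pages __btrfs_buffered_write pins per pass at the writer's remaining dirty-page allowance (current->nr_dirtied_pause - current->nr_dirtied), with a floor of 8, and switch btrfs_file_aio_write from file_update_time() to btrfs_update_time() so a failed timestamp update (for example on ENOSPC) aborts the write instead of being ignored. A small sketch of the clamping arithmetic, with made-up inputs:

/* Sketch of the nrptrs clamp: the page batch per iteration is limited by
 * the iov size, the pointer-array size, and the task's remaining dirty
 * budget, with a floor of 8. The inputs below are example values. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static long batch_pages(unsigned long bytes_left, long nr_dirtied_pause,
                        long nr_dirtied)
{
        long nrptrs = (bytes_left + PAGE_SIZE - 1) / PAGE_SIZE;

        if (nrptrs > (long)(PAGE_SIZE / sizeof(void *)))
                nrptrs = PAGE_SIZE / sizeof(void *);
        if (nrptrs > nr_dirtied_pause - nr_dirtied)
                nrptrs = nr_dirtied_pause - nr_dirtied;
        if (nrptrs < 8)
                nrptrs = 8;
        return nrptrs;
}

int main(void)
{
        printf("%ld\n", batch_pages(1 << 20, 32, 30));   /* clamped, then floored to 8 */
        printf("%ld\n", batch_pages(1 << 20, 512, 0));   /* 256 */
        return 0;
}

The floor of 8 keeps writes from degenerating into tiny single-page batches when the task is already close to its dirty threshold.
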
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 7a15fcfb3e1f..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
351 } 351 }
352 } 352 }
353 353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
354 return 0; 359 return 0;
355} 360}
356 361
@@ -537,6 +542,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type) 542 struct btrfs_free_space *entry, u8 *type)
538{ 543{
539 struct btrfs_free_space_entry *e; 544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
540 552
541 e = io_ctl->cur; 553 e = io_ctl->cur;
542 entry->offset = le64_to_cpu(e->offset); 554 entry->offset = le64_to_cpu(e->offset);
@@ -550,10 +562,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
550 562
551 io_ctl_unmap_page(io_ctl); 563 io_ctl_unmap_page(io_ctl);
552 564
553 if (io_ctl->index >= io_ctl->num_pages) 565 return 0;
554 return 0;
555
556 return io_ctl_check_crc(io_ctl, io_ctl->index);
557} 566}
558 567
559static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
@@ -561,9 +570,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
561{ 570{
562 int ret; 571 int ret;
563 572
564 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
565 io_ctl_unmap_page(io_ctl);
566
567 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
568 if (ret) 574 if (ret)
569 return ret; 575 return ret;
@@ -699,6 +705,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
699 num_entries--; 705 num_entries--;
700 } 706 }
701 707
708 io_ctl_unmap_page(&io_ctl);
709
702 /* 710 /*
703 * We add the bitmaps at the end of the entries in order that 711 * We add the bitmaps at the end of the entries in order that
704 * the bitmap entries are added to the cache. 712 * the bitmap entries are added to the cache.
@@ -1462,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1462{ 1470{
1463 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1464 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1465 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1466 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1467 1476
@@ -1841,7 +1850,13 @@ again:
1841 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1842 1, 0); 1851 1, 0);
1843 if (!info) { 1852 if (!info) {
1844 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1845 goto out_lock; 1860 goto out_lock;
1846 } 1861 }
1847 } 1862 }
@@ -2305,6 +2320,7 @@ again:
2305 2320
2306 if (!found) { 2321 if (!found) {
2307 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2308 found = true; 2324 found = true;
2309 } 2325 }
2310 2326
@@ -2448,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2448{ 2464{
2449 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2450 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2451 struct rb_node *node;
2452 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2453 2469
2454 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2455 return -ENOSPC; 2471 return -ENOSPC;
2456 2472
2457 /* 2473 /*
2458 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2459 * here that will work. 2475 * is just its start offset.
2460 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2461 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2462 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2463 continue; 2486 continue;
@@ -2468,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2468 } 2491 }
2469 2492
2470 /* 2493 /*
2471 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2472 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2473 * this list and start the search from there.
2474 */ 2496 */
2475 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2476 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2477 list);
2478 node = rb_next(&entry->offset_index);
2479 if (!node)
2480 return -ENOSPC;
2481 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2482 goto search;
2483 }
2484
2485 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2486 if (!entry)
2487 return -ENOSPC;
2488
2489search:
2490 node = &entry->offset_index;
2491 do {
2492 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2493 node = rb_next(&entry->offset_index);
2494 if (!entry->bitmap)
2495 continue;
2496 if (entry->bytes < min_bytes)
2497 continue;
2498 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2499 bytes, min_bytes);
2500 } while (ret && node);
2501
2502 return ret;
2503} 2498}
2504 2499
2505/* 2500/*
@@ -2517,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2517 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2518{ 2513{
2519 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2520 struct list_head bitmaps;
2521 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2522 u64 min_bytes; 2517 u64 min_bytes;
2523 int ret; 2518 int ret;
2524 2519
@@ -2557,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2557 goto out; 2552 goto out;
2558 } 2553 }
2559 2554
2560 INIT_LIST_HEAD(&bitmaps);
2561 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2562 bytes, min_bytes); 2556 bytes, min_bytes);
2563 if (ret) 2557 if (ret)
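
In free-space-cache.c the io_ctl read path now CRC-checks a page the first time an entry is read from it, the cache pages are cleared dirty and marked extent-mapped up front, and setup_cluster_bitmap no longer walks the rbtree: it seeds the candidate list with the bitmap that covers offset (if it is not already listed) and otherwise relies on the bitmaps list collected by setup_cluster_no_bitmap. That is also why add_new_bitmap gains INIT_LIST_HEAD(&info->list); list_empty() is only a meaningful "not on a list yet" test when the node starts out pointing at itself. A minimal intrusive-list model of that invariant (my own tiny re-implementation, not the kernel list API):

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

struct free_space { struct list_head list; unsigned long offset; };

int main(void)
{
        struct list_head bitmaps;
        struct free_space info = { .offset = 4096 };

        INIT_LIST_HEAD(&bitmaps);
        INIT_LIST_HEAD(&info.list);          /* the added initialisation */

        if (list_empty(&info.list))          /* only safe after the init */
                list_add(&info.list, &bitmaps);

        printf("on list: %d\n", !list_empty(&bitmaps));
        return 0;
}
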
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 53dcbdf446cd..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if need
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -477,11 +494,14 @@ again:
477 } 494 }
478 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
479 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
480out_put: 498out_put:
481 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
482out: 502out:
483 if (ret == 0) 503 trans->block_rsv = rsv;
484 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
485 505
486 btrfs_free_path(path); 506 btrfs_free_path(path);
487 return ret; 507 return ret;
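
btrfs_save_ino_cache now runs under its own metadata reservation: it saves the caller's trans->block_rsv and bytes_reserved, switches to the fs-wide trans_block_rsv sized for the eight items listed in the new comment, releases the reservation at out_release, and restores the saved values on every exit; the cache write-out also moves ahead of out_put so it shares that release path. A sketch of the save, swap and restore pattern, with illustrative types and an arbitrary per-item size:

/* Illustrative only: stash the caller's reservation, switch to a private
 * one sized for the worst case, and put both back whatever happens. */
#include <stdio.h>

struct rsv { const char *name; };

struct trans {
        struct rsv *block_rsv;
        unsigned long long bytes_reserved;
};

static int do_cache_work(struct trans *t) { (void)t; return 0; }

static int save_cache(struct trans *trans, struct rsv *trans_rsv)
{
        struct rsv *saved_rsv = trans->block_rsv;
        unsigned long long saved_bytes = trans->bytes_reserved;
        int ret;

        trans->block_rsv = trans_rsv;
        /* 1 insert + 3 update + 1 free-space object + 3 prealloc items */
        trans->bytes_reserved = 8ULL * 16384;

        ret = do_cache_work(trans);

        /* restore unconditionally, success or failure */
        trans->block_rsv = saved_rsv;
        trans->bytes_reserved = saved_bytes;
        return ret;
}

int main(void)
{
        struct rsv caller = { "caller" }, tr = { "trans" };
        struct trans t = { &caller, 123 };

        save_cache(&t, &tr);
        printf("%s %llu\n", t.block_rsv->name, t.bytes_reserved);
        return 0;
}
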
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 966ddcc4c63d..0a6b928813a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 94 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 96 unsigned long *nr_written, int unlock);
97static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
98 struct btrfs_root *root, struct inode *inode);
96 99
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 100static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 101 struct inode *inode, struct inode *dir,
@@ -1741,7 +1744,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1741 trans = btrfs_join_transaction(root); 1744 trans = btrfs_join_transaction(root);
1742 BUG_ON(IS_ERR(trans)); 1745 BUG_ON(IS_ERR(trans));
1743 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1746 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1744 ret = btrfs_update_inode(trans, root, inode); 1747 ret = btrfs_update_inode_fallback(trans, root, inode);
1745 BUG_ON(ret); 1748 BUG_ON(ret);
1746 } 1749 }
1747 goto out; 1750 goto out;
@@ -1791,7 +1794,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1791 1794
1792 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1795 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1793 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1796 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1794 ret = btrfs_update_inode(trans, root, inode); 1797 ret = btrfs_update_inode_fallback(trans, root, inode);
1795 BUG_ON(ret); 1798 BUG_ON(ret);
1796 } 1799 }
1797 ret = 0; 1800 ret = 0;
@@ -2029,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2029 /* insert an orphan item to track this unlinked/truncated file */ 2032 /* insert an orphan item to track this unlinked/truncated file */
2030 if (insert >= 1) { 2033 if (insert >= 1) {
2031 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2034 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2032 BUG_ON(ret); 2035 BUG_ON(ret && ret != -EEXIST);
2033 } 2036 }
2034 2037
2035 /* insert an orphan item to track subvolume contains orphan files */ 2038 /* insert an orphan item to track subvolume contains orphan files */
@@ -2156,6 +2159,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2156 if (ret && ret != -ESTALE) 2159 if (ret && ret != -ESTALE)
2157 goto out; 2160 goto out;
2158 2161
2162 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2163 struct btrfs_root *dead_root;
2164 struct btrfs_fs_info *fs_info = root->fs_info;
2165 int is_dead_root = 0;
2166
2167 /*
2168 * this is an orphan in the tree root. Currently these
2169 * could come from 2 sources:
2170 * a) a snapshot deletion in progress
2171 * b) a free space cache inode
2172 * We need to distinguish those two, as the snapshot
2173 * orphan must not get deleted.
2174 * find_dead_roots already ran before us, so if this
2175 * is a snapshot deletion, we should find the root
2176 * in the dead_roots list
2177 */
2178 spin_lock(&fs_info->trans_lock);
2179 list_for_each_entry(dead_root, &fs_info->dead_roots,
2180 root_list) {
2181 if (dead_root->root_key.objectid ==
2182 found_key.objectid) {
2183 is_dead_root = 1;
2184 break;
2185 }
2186 }
2187 spin_unlock(&fs_info->trans_lock);
2188 if (is_dead_root) {
2189 /* prevent this orphan from being found again */
2190 key.offset = found_key.objectid - 1;
2191 continue;
2192 }
2193 }
2159 /* 2194 /*
2160 * Inode is already gone but the orphan item is still there, 2195 * Inode is already gone but the orphan item is still there,
2161 * kill the orphan item. 2196 * kill the orphan item.
@@ -2189,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2189 continue; 2224 continue;
2190 } 2225 }
2191 nr_truncate++; 2226 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2192 ret = btrfs_truncate(inode); 2233 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2193 } else { 2235 } else {
2194 nr_unlink++; 2236 nr_unlink++;
2195 } 2237 }
@@ -2199,6 +2241,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2199 if (ret) 2241 if (ret)
2200 goto out; 2242 goto out;
2201 } 2243 }
2244 /* release the path since we're done with it */
2245 btrfs_release_path(path);
2246
2202 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2247 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2203 2248
2204 if (root->orphan_block_rsv) 2249 if (root->orphan_block_rsv)
@@ -2426,7 +2471,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2426/* 2471/*
2427 * copy everything in the in-memory inode into the btree. 2472 * copy everything in the in-memory inode into the btree.
2428 */ 2473 */
2429noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2474static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2430 struct btrfs_root *root, struct inode *inode) 2475 struct btrfs_root *root, struct inode *inode)
2431{ 2476{
2432 struct btrfs_inode_item *inode_item; 2477 struct btrfs_inode_item *inode_item;
@@ -2434,21 +2479,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2434 struct extent_buffer *leaf; 2479 struct extent_buffer *leaf;
2435 int ret; 2480 int ret;
2436 2481
2437 /*
2438 * If the inode is a free space inode, we can deadlock during commit
2439 * if we put it into the delayed code.
2440 *
2441 * The data relocation inode should also be directly updated
2442 * without delay
2443 */
2444 if (!btrfs_is_free_space_inode(root, inode)
2445 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2446 ret = btrfs_delayed_update_inode(trans, root, inode);
2447 if (!ret)
2448 btrfs_set_inode_last_trans(trans, inode);
2449 return ret;
2450 }
2451
2452 path = btrfs_alloc_path(); 2482 path = btrfs_alloc_path();
2453 if (!path) 2483 if (!path)
2454 return -ENOMEM; 2484 return -ENOMEM;
@@ -2477,6 +2507,43 @@ failed:
2477} 2507}
2478 2508
2479/* 2509/*
2510 * copy everything in the in-memory inode into the btree.
2511 */
2512noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2513 struct btrfs_root *root, struct inode *inode)
2514{
2515 int ret;
2516
2517 /*
2518 * If the inode is a free space inode, we can deadlock during commit
2519 * if we put it into the delayed code.
2520 *
2521 * The data relocation inode should also be directly updated
2522 * without delay
2523 */
2524 if (!btrfs_is_free_space_inode(root, inode)
2525 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2526 ret = btrfs_delayed_update_inode(trans, root, inode);
2527 if (!ret)
2528 btrfs_set_inode_last_trans(trans, inode);
2529 return ret;
2530 }
2531
2532 return btrfs_update_inode_item(trans, root, inode);
2533}
2534
2535static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2536 struct btrfs_root *root, struct inode *inode)
2537{
2538 int ret;
2539
2540 ret = btrfs_update_inode(trans, root, inode);
2541 if (ret == -ENOSPC)
2542 return btrfs_update_inode_item(trans, root, inode);
2543 return ret;
2544}
2545
2546/*
2480 * unlink helper that gets used here in inode.c and in the tree logging 2547 * unlink helper that gets used here in inode.c and in the tree logging
2481 * recovery code. It remove a link in a directory with a given name, and 2548 * recovery code. It remove a link in a directory with a given name, and
2482 * also drops the back refs in the inode to the directory 2549 * also drops the back refs in the inode to the directory
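
The hunks above split inode updates in two: btrfs_update_inode_item performs the direct btree update, btrfs_update_inode keeps the delayed-item fast path (still skipped for free-space and data-relocation inodes), and the new btrfs_update_inode_fallback retries the direct item update when the first attempt fails with -ENOSPC; the ordered-extent completion paths switch to the fallback variant. A minimal sketch of that try-the-fast-path-then-fall-back shape, with dummy functions standing in for the two update routines:

#include <errno.h>
#include <stdio.h>

static int update_delayed(int fail_enospc)
{
        return fail_enospc ? -ENOSPC : 0;   /* the cheap, delayed path */
}

static int update_item(void)
{
        return 0;                           /* direct btree update */
}

static int update_fallback(int fail_enospc)
{
        int ret = update_delayed(fail_enospc);

        if (ret == -ENOSPC)
                ret = update_item();        /* retry directly on ENOSPC */
        return ret;
}

int main(void)
{
        printf("fast path: %d\n", update_fallback(0));
        printf("fallback : %d\n", update_fallback(1));
        return 0;
}
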
@@ -3300,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3300 u64 hint_byte = 0; 3367 u64 hint_byte = 0;
3301 hole_size = last_byte - cur_offset; 3368 hole_size = last_byte - cur_offset;
3302 3369
3303 trans = btrfs_start_transaction(root, 2); 3370 trans = btrfs_start_transaction(root, 3);
3304 if (IS_ERR(trans)) { 3371 if (IS_ERR(trans)) {
3305 err = PTR_ERR(trans); 3372 err = PTR_ERR(trans);
3306 break; 3373 break;
@@ -3310,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3310 cur_offset + hole_size, 3377 cur_offset + hole_size,
3311 &hint_byte, 1); 3378 &hint_byte, 1);
3312 if (err) { 3379 if (err) {
3380 btrfs_update_inode(trans, root, inode);
3313 btrfs_end_transaction(trans, root); 3381 btrfs_end_transaction(trans, root);
3314 break; 3382 break;
3315 } 3383 }
@@ -3319,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3319 0, hole_size, 0, hole_size, 3387 0, hole_size, 0, hole_size,
3320 0, 0, 0); 3388 0, 0, 0);
3321 if (err) { 3389 if (err) {
3390 btrfs_update_inode(trans, root, inode);
3322 btrfs_end_transaction(trans, root); 3391 btrfs_end_transaction(trans, root);
3323 break; 3392 break;
3324 } 3393 }
@@ -3326,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3326 btrfs_drop_extent_cache(inode, hole_start, 3395 btrfs_drop_extent_cache(inode, hole_start,
3327 last_byte - 1, 0); 3396 last_byte - 1, 0);
3328 3397
3398 btrfs_update_inode(trans, root, inode);
3329 btrfs_end_transaction(trans, root); 3399 btrfs_end_transaction(trans, root);
3330 } 3400 }
3331 free_extent_map(em); 3401 free_extent_map(em);
@@ -3343,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3343 3413
3344static int btrfs_setsize(struct inode *inode, loff_t newsize) 3414static int btrfs_setsize(struct inode *inode, loff_t newsize)
3345{ 3415{
3416 struct btrfs_root *root = BTRFS_I(inode)->root;
3417 struct btrfs_trans_handle *trans;
3346 loff_t oldsize = i_size_read(inode); 3418 loff_t oldsize = i_size_read(inode);
3347 int ret; 3419 int ret;
3348 3420
@@ -3350,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3350 return 0; 3422 return 0;
3351 3423
3352 if (newsize > oldsize) { 3424 if (newsize > oldsize) {
3353 i_size_write(inode, newsize);
3354 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3355 truncate_pagecache(inode, oldsize, newsize); 3425 truncate_pagecache(inode, oldsize, newsize);
3356 ret = btrfs_cont_expand(inode, oldsize, newsize); 3426 ret = btrfs_cont_expand(inode, oldsize, newsize);
3357 if (ret) { 3427 if (ret)
3358 btrfs_setsize(inode, oldsize);
3359 return ret; 3428 return ret;
3360 }
3361 3429
3362 mark_inode_dirty(inode); 3430 trans = btrfs_start_transaction(root, 1);
3431 if (IS_ERR(trans))
3432 return PTR_ERR(trans);
3433
3434 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root);
3363 } else { 3438 } else {
3364 3439
3365 /* 3440 /*
@@ -3399,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3399 3474
3400 if (attr->ia_valid) { 3475 if (attr->ia_valid) {
3401 setattr_copy(inode, attr); 3476 setattr_copy(inode, attr);
3402 mark_inode_dirty(inode); 3477 err = btrfs_dirty_inode(inode);
3403 3478
3404 if (attr->ia_valid & ATTR_MODE) 3479 if (!err && attr->ia_valid & ATTR_MODE)
3405 err = btrfs_acl_chmod(inode); 3480 err = btrfs_acl_chmod(inode);
3406 } 3481 }
3407 3482
@@ -3463,7 +3538,7 @@ void btrfs_evict_inode(struct inode *inode)
3463 * doing the truncate. 3538 * doing the truncate.
3464 */ 3539 */
3465 while (1) { 3540 while (1) {
3466 ret = btrfs_block_rsv_refill(root, rsv, min_size); 3541 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3467 3542
3468 /* 3543 /*
3469 * Try and steal from the global reserve since we will 3544 * Try and steal from the global reserve since we will
@@ -4177,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4177 * FIXME, needs more benchmarking...there are no reasons other than performance 4252 * FIXME, needs more benchmarking...there are no reasons other than performance
4178 * to keep or drop this code. 4253 * to keep or drop this code.
4179 */ 4254 */
4180void btrfs_dirty_inode(struct inode *inode, int flags) 4255int btrfs_dirty_inode(struct inode *inode)
4181{ 4256{
4182 struct btrfs_root *root = BTRFS_I(inode)->root; 4257 struct btrfs_root *root = BTRFS_I(inode)->root;
4183 struct btrfs_trans_handle *trans; 4258 struct btrfs_trans_handle *trans;
4184 int ret; 4259 int ret;
4185 4260
4186 if (BTRFS_I(inode)->dummy_inode) 4261 if (BTRFS_I(inode)->dummy_inode)
4187 return; 4262 return 0;
4188 4263
4189 trans = btrfs_join_transaction(root); 4264 trans = btrfs_join_transaction(root);
4190 BUG_ON(IS_ERR(trans)); 4265 if (IS_ERR(trans))
4266 return PTR_ERR(trans);
4191 4267
4192 ret = btrfs_update_inode(trans, root, inode); 4268 ret = btrfs_update_inode(trans, root, inode);
4193 if (ret && ret == -ENOSPC) { 4269 if (ret && ret == -ENOSPC) {
4194 /* whoops, lets try again with the full transaction */ 4270 /* whoops, lets try again with the full transaction */
4195 btrfs_end_transaction(trans, root); 4271 btrfs_end_transaction(trans, root);
4196 trans = btrfs_start_transaction(root, 1); 4272 trans = btrfs_start_transaction(root, 1);
4197 if (IS_ERR(trans)) { 4273 if (IS_ERR(trans))
4198 printk_ratelimited(KERN_ERR "btrfs: fail to " 4274 return PTR_ERR(trans);
4199 "dirty inode %llu error %ld\n",
4200 (unsigned long long)btrfs_ino(inode),
4201 PTR_ERR(trans));
4202 return;
4203 }
4204 4275
4205 ret = btrfs_update_inode(trans, root, inode); 4276 ret = btrfs_update_inode(trans, root, inode);
4206 if (ret) {
4207 printk_ratelimited(KERN_ERR "btrfs: fail to "
4208 "dirty inode %llu error %d\n",
4209 (unsigned long long)btrfs_ino(inode),
4210 ret);
4211 }
4212 } 4277 }
4213 btrfs_end_transaction(trans, root); 4278 btrfs_end_transaction(trans, root);
4214 if (BTRFS_I(inode)->delayed_node) 4279 if (BTRFS_I(inode)->delayed_node)
4215 btrfs_balance_delayed_items(root); 4280 btrfs_balance_delayed_items(root);
4281
4282 return ret;
4283}
4284
4285/*
4286 * This is a copy of file_update_time. We need this so we can return error on
4287 * ENOSPC for updating the inode in the case of file write and mmap writes.
4288 */
4289int btrfs_update_time(struct file *file)
4290{
4291 struct inode *inode = file->f_path.dentry->d_inode;
4292 struct timespec now;
4293 int ret;
4294 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4295
4296 /* First try to exhaust all avenues to not sync */
4297 if (IS_NOCMTIME(inode))
4298 return 0;
4299
4300 now = current_fs_time(inode->i_sb);
4301 if (!timespec_equal(&inode->i_mtime, &now))
4302 sync_it = S_MTIME;
4303
4304 if (!timespec_equal(&inode->i_ctime, &now))
4305 sync_it |= S_CTIME;
4306
4307 if (IS_I_VERSION(inode))
4308 sync_it |= S_VERSION;
4309
4310 if (!sync_it)
4311 return 0;
4312
4313 /* Finally allowed to write? Takes lock. */
4314 if (mnt_want_write_file(file))
4315 return 0;
4316
4317 /* Only change inode inside the lock region */
4318 if (sync_it & S_VERSION)
4319 inode_inc_iversion(inode);
4320 if (sync_it & S_CTIME)
4321 inode->i_ctime = now;
4322 if (sync_it & S_MTIME)
4323 inode->i_mtime = now;
4324 ret = btrfs_dirty_inode(inode);
4325 if (!ret)
4326 mark_inode_dirty_sync(inode);
4327 mnt_drop_write(file->f_path.mnt);
4328 return ret;
4216} 4329}
4217 4330
4218/* 4331/*
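
btrfs_dirty_inode now reports failure to its caller instead of logging and swallowing it, and btrfs_update_time is added as a btrfs-local copy of file_update_time whose point, per its comment, is to propagate that error (typically -ENOSPC) back to the write and mmap paths. A small sketch of the flow, deciding which timestamps need syncing, returning early when none do, and surfacing the dirty-inode result; the types and helpers below are simplified stand-ins, not VFS code:

#include <errno.h>
#include <stdio.h>
#include <time.h>

enum { SYNC_MTIME = 1, SYNC_CTIME = 2 };

struct inode { time_t mtime, ctime; int out_of_space; };

static int dirty_inode(struct inode *in)
{
        return in->out_of_space ? -ENOSPC : 0;
}

static int update_time(struct inode *in)
{
        time_t now = time(NULL);
        int sync_it = 0;

        if (in->mtime != now)
                sync_it |= SYNC_MTIME;
        if (in->ctime != now)
                sync_it |= SYNC_CTIME;
        if (!sync_it)
                return 0;               /* nothing to do, no transaction */

        if (sync_it & SYNC_CTIME)
                in->ctime = now;
        if (sync_it & SYNC_MTIME)
                in->mtime = now;

        return dirty_inode(in);         /* may fail, e.g. -ENOSPC */
}

int main(void)
{
        struct inode ok = { 0, 0, 0 }, full = { 0, 0, 1 };

        printf("%d %d\n", update_time(&ok), update_time(&full));
        return 0;
}
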
@@ -4528,11 +4641,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4528 goto out_unlock; 4641 goto out_unlock;
4529 } 4642 }
4530 4643
4644 /*
4645 * If the active LSM wants to access the inode during
4646 * d_instantiate it needs these. Smack checks to see
4647 * if the filesystem supports xattrs by looking at the
4648 * ops vector.
4649 */
4650
4651 inode->i_op = &btrfs_special_inode_operations;
4531 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4652 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4532 if (err) 4653 if (err)
4533 drop_inode = 1; 4654 drop_inode = 1;
4534 else { 4655 else {
4535 inode->i_op = &btrfs_special_inode_operations;
4536 init_special_inode(inode, inode->i_mode, rdev); 4656 init_special_inode(inode, inode->i_mode, rdev);
4537 btrfs_update_inode(trans, root, inode); 4657 btrfs_update_inode(trans, root, inode);
4538 } 4658 }
@@ -4586,14 +4706,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4586 goto out_unlock; 4706 goto out_unlock;
4587 } 4707 }
4588 4708
4709 /*
4710 * If the active LSM wants to access the inode during
4711 * d_instantiate it needs these. Smack checks to see
4712 * if the filesystem supports xattrs by looking at the
4713 * ops vector.
4714 */
4715 inode->i_fop = &btrfs_file_operations;
4716 inode->i_op = &btrfs_file_inode_operations;
4717
4589 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4718 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4590 if (err) 4719 if (err)
4591 drop_inode = 1; 4720 drop_inode = 1;
4592 else { 4721 else {
4593 inode->i_mapping->a_ops = &btrfs_aops; 4722 inode->i_mapping->a_ops = &btrfs_aops;
4594 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4723 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4595 inode->i_fop = &btrfs_file_operations;
4596 inode->i_op = &btrfs_file_inode_operations;
4597 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4724 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4598 } 4725 }
4599out_unlock: 4726out_unlock:
@@ -5632,7 +5759,7 @@ again:
5632 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5759 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5633 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5760 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5634 if (!ret) 5761 if (!ret)
5635 err = btrfs_update_inode(trans, root, inode); 5762 err = btrfs_update_inode_fallback(trans, root, inode);
5636 goto out; 5763 goto out;
5637 } 5764 }
5638 5765
@@ -5670,7 +5797,7 @@ again:
5670 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5797 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5671 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5798 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5672 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5799 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5673 btrfs_update_inode(trans, root, inode); 5800 btrfs_update_inode_fallback(trans, root, inode);
5674 ret = 0; 5801 ret = 0;
5675out_unlock: 5802out_unlock:
5676 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5803 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6276,7 +6403,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6276 u64 page_start; 6403 u64 page_start;
6277 u64 page_end; 6404 u64 page_end;
6278 6405
6406 /* Need this to keep space reservations serialized */
6407 mutex_lock(&inode->i_mutex);
6279 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6408 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6409 mutex_unlock(&inode->i_mutex);
6410 if (!ret)
6411 ret = btrfs_update_time(vma->vm_file);
6280 if (ret) { 6412 if (ret) {
6281 if (ret == -ENOMEM) 6413 if (ret == -ENOMEM)
6282 ret = VM_FAULT_OOM; 6414 ret = VM_FAULT_OOM;
@@ -6488,8 +6620,9 @@ static int btrfs_truncate(struct inode *inode)
6488 /* Just need the 1 for updating the inode */ 6620 /* Just need the 1 for updating the inode */
6489 trans = btrfs_start_transaction(root, 1); 6621 trans = btrfs_start_transaction(root, 1);
6490 if (IS_ERR(trans)) { 6622 if (IS_ERR(trans)) {
6491 err = PTR_ERR(trans); 6623 ret = err = PTR_ERR(trans);
6492 goto out; 6624 trans = NULL;
6625 break;
6493 } 6626 }
6494 } 6627 }
6495 6628
@@ -6529,14 +6662,16 @@ end_trans:
6529 ret = btrfs_orphan_del(NULL, inode); 6662 ret = btrfs_orphan_del(NULL, inode);
6530 } 6663 }
6531 6664
6532 trans->block_rsv = &root->fs_info->trans_block_rsv; 6665 if (trans) {
6533 ret = btrfs_update_inode(trans, root, inode); 6666 trans->block_rsv = &root->fs_info->trans_block_rsv;
6534 if (ret && !err) 6667 ret = btrfs_update_inode(trans, root, inode);
6535 err = ret; 6668 if (ret && !err)
6669 err = ret;
6536 6670
6537 nr = trans->blocks_used; 6671 nr = trans->blocks_used;
6538 ret = btrfs_end_transaction_throttle(trans, root); 6672 ret = btrfs_end_transaction_throttle(trans, root);
6539 btrfs_btree_balance_dirty(root, nr); 6673 btrfs_btree_balance_dirty(root, nr);
6674 }
6540 6675
6541out: 6676out:
6542 btrfs_free_block_rsv(root, rsv); 6677 btrfs_free_block_rsv(root, rsv);
@@ -6605,6 +6740,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6605 ei->orphan_meta_reserved = 0; 6740 ei->orphan_meta_reserved = 0;
6606 ei->dummy_inode = 0; 6741 ei->dummy_inode = 0;
6607 ei->in_defrag = 0; 6742 ei->in_defrag = 0;
6743 ei->delalloc_meta_reserved = 0;
6608 ei->force_compress = BTRFS_COMPRESS_NONE; 6744 ei->force_compress = BTRFS_COMPRESS_NONE;
6609 6745
6610 ei->delayed_node = NULL; 6746 ei->delayed_node = NULL;
@@ -6764,11 +6900,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6764 struct dentry *dentry, struct kstat *stat) 6900 struct dentry *dentry, struct kstat *stat)
6765{ 6901{
6766 struct inode *inode = dentry->d_inode; 6902 struct inode *inode = dentry->d_inode;
6903 u32 blocksize = inode->i_sb->s_blocksize;
6904
6767 generic_fillattr(inode, stat); 6905 generic_fillattr(inode, stat);
6768 stat->dev = BTRFS_I(inode)->root->anon_dev; 6906 stat->dev = BTRFS_I(inode)->root->anon_dev;
6769 stat->blksize = PAGE_CACHE_SIZE; 6907 stat->blksize = PAGE_CACHE_SIZE;
6770 stat->blocks = (inode_get_bytes(inode) + 6908 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6771 BTRFS_I(inode)->delalloc_bytes) >> 9; 6909 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6772 return 0; 6910 return 0;
6773} 6911}
6774 6912
@@ -7044,14 +7182,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7044 goto out_unlock; 7182 goto out_unlock;
7045 } 7183 }
7046 7184
7185 /*
7186 * If the active LSM wants to access the inode during
7187 * d_instantiate it needs these. Smack checks to see
7188 * if the filesystem supports xattrs by looking at the
7189 * ops vector.
7190 */
7191 inode->i_fop = &btrfs_file_operations;
7192 inode->i_op = &btrfs_file_inode_operations;
7193
7047 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7194 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7048 if (err) 7195 if (err)
7049 drop_inode = 1; 7196 drop_inode = 1;
7050 else { 7197 else {
7051 inode->i_mapping->a_ops = &btrfs_aops; 7198 inode->i_mapping->a_ops = &btrfs_aops;
7052 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7199 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7053 inode->i_fop = &btrfs_file_operations;
7054 inode->i_op = &btrfs_file_inode_operations;
7055 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7200 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7056 } 7201 }
7057 if (drop_inode) 7202 if (drop_inode)
@@ -7321,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7321 .follow_link = page_follow_link_light, 7466 .follow_link = page_follow_link_light,
7322 .put_link = page_put_link, 7467 .put_link = page_put_link,
7323 .getattr = btrfs_getattr, 7468 .getattr = btrfs_getattr,
7469 .setattr = btrfs_setattr,
7324 .permission = btrfs_permission, 7470 .permission = btrfs_permission,
7325 .setxattr = btrfs_setxattr, 7471 .setxattr = btrfs_setxattr,
7326 .getxattr = btrfs_getxattr, 7472 .getxattr = btrfs_getxattr,
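
Among the remaining inode.c changes: the i_op/i_fop assignments in mknod, create and symlink move ahead of btrfs_add_nondir so an LSM (the comment names Smack) can probe the xattr ops during d_instantiate; page_mkwrite and the orphan-cleanup truncate take i_mutex around their space reservations; the symlink inode_operations gain .setattr; and btrfs_getattr rounds both byte counts up to the block size before converting to 512-byte st_blocks, so a partially used tail block is reported as allocated. A worked sketch of that rounding, with an assumed 4 KiB block size and example byte counts:

#include <stdio.h>

static unsigned long long align_up(unsigned long long x, unsigned long long a)
{
        return (x + a - 1) / a * a;
}

int main(void)
{
        unsigned long long blocksize = 4096;
        unsigned long long inode_bytes = 6000;      /* spans two blocks */
        unsigned long long delalloc_bytes = 100;    /* one block pending */

        unsigned long long blocks =
                (align_up(inode_bytes, blocksize) +
                 align_up(delalloc_bytes, blocksize)) >> 9;

        printf("st_blocks = %llu\n", blocks);       /* 24 = 12 KiB / 512 */
        return 0;
}
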
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4a34c472f126..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
252 trans = btrfs_join_transaction(root); 252 trans = btrfs_join_transaction(root);
253 BUG_ON(IS_ERR(trans)); 253 BUG_ON(IS_ERR(trans));
254 254
255 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME;
255 ret = btrfs_update_inode(trans, root, inode); 257 ret = btrfs_update_inode(trans, root, inode);
256 BUG_ON(ret); 258 BUG_ON(ret);
257 259
258 btrfs_update_iflags(inode);
259 inode->i_ctime = CURRENT_TIME;
260 btrfs_end_transaction(trans, root); 260 btrfs_end_transaction(trans, root);
261 261
262 mnt_drop_write(file->f_path.mnt); 262 mnt_drop_write(file->f_path.mnt);
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 858 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 860
861 mutex_lock(&inode->i_mutex);
861 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
862 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
863 if (ret) 865 if (ret)
864 return ret; 866 return ret;
865again: 867again:
@@ -1216,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1216 *devstr = '\0'; 1218 *devstr = '\0';
1217 devstr = vol_args->name; 1219 devstr = vol_args->name;
1218 devid = simple_strtoull(devstr, &end, 10); 1220 devid = simple_strtoull(devstr, &end, 10);
1219 printk(KERN_INFO "resizing devid %llu\n", 1221 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1220 (unsigned long long)devid); 1222 (unsigned long long)devid);
1221 } 1223 }
1222 device = btrfs_find_device(root, devid, NULL, NULL); 1224 device = btrfs_find_device(root, devid, NULL, NULL);
1223 if (!device) { 1225 if (!device) {
1224 printk(KERN_INFO "resizer unable to find device %llu\n", 1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1225 (unsigned long long)devid); 1227 (unsigned long long)devid);
1226 ret = -EINVAL; 1228 ret = -EINVAL;
1227 goto out_unlock; 1229 goto out_unlock;
@@ -1267,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1267 do_div(new_size, root->sectorsize); 1269 do_div(new_size, root->sectorsize);
1268 new_size *= root->sectorsize; 1270 new_size *= root->sectorsize;
1269 1271
1270 printk(KERN_INFO "new size for %s is %llu\n", 1272 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1271 device->name, (unsigned long long)new_size); 1273 device->name, (unsigned long long)new_size);
1272 1274
1273 if (new_size > old_size) { 1275 if (new_size > old_size) {
@@ -1278,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1278 } 1280 }
1279 ret = btrfs_grow_device(trans, device, new_size); 1281 ret = btrfs_grow_device(trans, device, new_size);
1280 btrfs_commit_transaction(trans, root); 1282 btrfs_commit_transaction(trans, root);
1281 } else { 1283 } else if (new_size < old_size) {
1282 ret = btrfs_shrink_device(device, new_size); 1284 ret = btrfs_shrink_device(device, new_size);
1283 } 1285 }
1284 1286
@@ -2930,11 +2932,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2930 goto out; 2932 goto out;
2931 2933
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 2934 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; 2935 rel_ptr = ipath->fspath->val[i] -
2936 (u64)(unsigned long)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr; 2937 ipath->fspath->val[i] = rel_ptr;
2935 } 2938 }
2936 2939
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); 2940 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2941 (void *)(unsigned long)ipath->fspath, size);
2938 if (ret) { 2942 if (ret) {
2939 ret = -EFAULT; 2943 ret = -EFAULT;
2940 goto out; 2944 goto out;
@@ -3017,7 +3021,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3017 if (ret < 0) 3021 if (ret < 0)
3018 goto out; 3022 goto out;
3019 3023
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); 3024 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3025 (void *)(unsigned long)inodes, size);
3021 if (ret) 3026 if (ret)
3022 ret = -EFAULT; 3027 ret = -EFAULT;
3023 3028
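
The ioctl.c hunks move the iflags/ctime update ahead of btrfs_update_inode so it lands in the same transaction, take i_mutex around the defrag space reservation, prefix the resize messages with "btrfs:", make resize a no-op when the size is unchanged, and, for the ino-path and logical-to-ino ioctls, cast the u64-carried user addresses through unsigned long before using them as pointers, which avoids the different-size cast problems on 32-bit builds. A plain userspace illustration of that pointer/u64 round-trip (not the btrfs ioctl ABI):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct args { uint64_t fspath; };       /* address smuggled through a u64 */

int main(void)
{
        char buf[16] = "hello";
        struct args a = { .fspath = (uint64_t)(unsigned long)buf };

        /* cast back the same way before dereferencing */
        char *p = (char *)(unsigned long)a.fspath;

        printf("%s (0x%" PRIx64 ")\n", p, a.fspath);
        return 0;
}
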
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 24d654ce7a06..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2945,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2945 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2946 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2947 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2948 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2949 if (ret) 2953 if (ret)
2950 goto out; 2954 goto out;
2951 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ed11d3866afd..ddf2c90d3fc0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
256 btrfs_release_path(swarn->path); 256 btrfs_release_path(swarn->path);
257 257
258 ipath = init_ipath(4096, local_root, swarn->path); 258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
259 ret = paths_from_inode(inum, ipath); 264 ret = paths_from_inode(inum, ipath);
260 265
261 if (ret < 0) 266 if (ret < 0)
@@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
272 swarn->logical, swarn->dev->name, 277 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset, 278 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink, 279 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]); 280 (char *)(unsigned long)ipath->fspath->val[i]);
276 281
277 free_ipath(ipath); 282 free_ipath(ipath);
278 return 0; 283 return 0;
@@ -944,50 +949,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
944static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
945{ 950{
946 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
947 struct bio *bio;
948 int i;
949 952
950 if (sdev->curr == -1) 953 if (sdev->curr == -1)
951 return 0; 954 return 0;
952 955
953 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
954
955 bio = bio_alloc(GFP_NOFS, sbio->count);
956 if (!bio)
957 goto nomem;
958
959 bio->bi_private = sbio;
960 bio->bi_end_io = scrub_bio_end_io;
961 bio->bi_bdev = sdev->dev->bdev;
962 bio->bi_sector = sbio->physical >> 9;
963
964 for (i = 0; i < sbio->count; ++i) {
965 struct page *page;
966 int ret;
967
968 page = alloc_page(GFP_NOFS);
969 if (!page)
970 goto nomem;
971
972 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
973 if (!ret) {
974 __free_page(page);
975 goto nomem;
976 }
977 }
978
979 sbio->err = 0; 957 sbio->err = 0;
980 sdev->curr = -1; 958 sdev->curr = -1;
981 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
982 960
983 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
984 962
985 return 0; 963 return 0;
986
987nomem:
988 scrub_free_bio(bio);
989
990 return -ENOMEM;
991} 964}
992 965
993static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -995,6 +968,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
995 u8 *csum, int force) 968 u8 *csum, int force)
996{ 969{
997 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
998 973
999again: 974again:
1000 /* 975 /*
@@ -1015,12 +990,22 @@ again:
1015 } 990 }
1016 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
1017 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
1018 sbio->physical = physical; 995 sbio->physical = physical;
1019 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
1020 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
1021 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
1022 int ret;
1023
1024 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
1025 if (ret) 1010 if (ret)
1026 return ret; 1011 return ret;
@@ -1030,6 +1015,20 @@ again:
1030 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
1031 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
1032 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
1033 if (csum) { 1032 if (csum) {
1034 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
1035 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -1536,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1536static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 1535static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1537{ 1536{
1538 struct btrfs_fs_info *fs_info = root->fs_info; 1537 struct btrfs_fs_info *fs_info = root->fs_info;
1538 int ret = 0;
1539 1539
1540 mutex_lock(&fs_info->scrub_lock); 1540 mutex_lock(&fs_info->scrub_lock);
1541 if (fs_info->scrub_workers_refcnt == 0) { 1541 if (fs_info->scrub_workers_refcnt == 0) {
1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1543 fs_info->thread_pool_size, &fs_info->generic_worker); 1543 fs_info->thread_pool_size, &fs_info->generic_worker);
1544 fs_info->scrub_workers.idle_thresh = 4; 1544 fs_info->scrub_workers.idle_thresh = 4;
1545 btrfs_start_workers(&fs_info->scrub_workers, 1); 1545 ret = btrfs_start_workers(&fs_info->scrub_workers);
1546 if (ret)
1547 goto out;
1546 } 1548 }
1547 ++fs_info->scrub_workers_refcnt; 1549 ++fs_info->scrub_workers_refcnt;
1550out:
1548 mutex_unlock(&fs_info->scrub_lock); 1551 mutex_unlock(&fs_info->scrub_lock);
1549 1552
1550 return 0; 1553 return ret;
1551} 1554}
1552 1555
1553static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 1556static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
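
scrub_workers_get() now checks the return value of btrfs_start_workers() (which no longer takes a thread count) and bails out without bumping the refcount when startup fails. A rough userspace sketch of that refcount-guarded, start-once-under-a-lock pattern, with a pthread mutex standing in for scrub_lock and start_pool() standing in for btrfs_start_workers():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcnt;

static int start_pool(void)
{
	return 0;			/* pretend worker startup succeeded */
}

static int workers_get(void)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (refcnt == 0) {
		ret = start_pool();
		if (ret)
			goto out;	/* leave refcnt untouched on failure */
	}
	++refcnt;
out:
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	printf("workers_get() = %d, refcnt = %d\n", workers_get(), refcnt);
	return 0;
}
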
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 57080dffdfc6..200f63bc6675 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h> 43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h>
44#include "compat.h" 45#include "compat.h"
45#include "delayed-inode.h" 46#include "delayed-inode.h"
46#include "ctree.h" 47#include "ctree.h"
@@ -197,7 +198,7 @@ static match_table_t tokens = {
197 {Opt_subvolrootid, "subvolrootid=%d"}, 198 {Opt_subvolrootid, "subvolrootid=%d"},
198 {Opt_defrag, "autodefrag"}, 199 {Opt_defrag, "autodefrag"},
199 {Opt_inode_cache, "inode_cache"}, 200 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"}, 201 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 202 {Opt_recovery, "recovery"},
202 {Opt_err, NULL}, 203 {Opt_err, NULL},
203}; 204};
@@ -448,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
448 token = match_token(p, tokens, args); 449 token = match_token(p, tokens, args);
449 switch (token) { 450 switch (token) {
450 case Opt_subvol: 451 case Opt_subvol:
452 kfree(*subvol_name);
451 *subvol_name = match_strdup(&args[0]); 453 *subvol_name = match_strdup(&args[0]);
452 break; 454 break;
453 case Opt_subvolid: 455 case Opt_subvolid:
@@ -710,7 +712,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
710 if (btrfs_test_opt(root, SPACE_CACHE)) 712 if (btrfs_test_opt(root, SPACE_CACHE))
711 seq_puts(seq, ",space_cache"); 713 seq_puts(seq, ",space_cache");
712 else 714 else
713 seq_puts(seq, ",no_space_cache"); 715 seq_puts(seq, ",nospace_cache");
714 if (btrfs_test_opt(root, CLEAR_CACHE)) 716 if (btrfs_test_opt(root, CLEAR_CACHE))
715 seq_puts(seq, ",clear_cache"); 717 seq_puts(seq, ",clear_cache");
716 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 718 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -824,13 +826,9 @@ static char *setup_root_args(char *args)
824static struct dentry *mount_subvol(const char *subvol_name, int flags, 826static struct dentry *mount_subvol(const char *subvol_name, int flags,
825 const char *device_name, char *data) 827 const char *device_name, char *data)
826{ 828{
827 struct super_block *s;
828 struct dentry *root; 829 struct dentry *root;
829 struct vfsmount *mnt; 830 struct vfsmount *mnt;
830 struct mnt_namespace *ns_private;
831 char *newargs; 831 char *newargs;
832 struct path path;
833 int error;
834 832
835 newargs = setup_root_args(data); 833 newargs = setup_root_args(data);
836 if (!newargs) 834 if (!newargs)
@@ -841,39 +839,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
841 if (IS_ERR(mnt)) 839 if (IS_ERR(mnt))
842 return ERR_CAST(mnt); 840 return ERR_CAST(mnt);
843 841
844 ns_private = create_mnt_ns(mnt); 842 root = mount_subtree(mnt, subvol_name);
845 if (IS_ERR(ns_private)) {
846 mntput(mnt);
847 return ERR_CAST(ns_private);
848 }
849
850 /*
851 * This will trigger the automount of the subvol so we can just
852 * drop the mnt we have here and return the dentry that we
853 * found.
854 */
855 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
856 LOOKUP_FOLLOW, &path);
857 put_mnt_ns(ns_private);
858 if (error)
859 return ERR_PTR(error);
860 843
861 if (!is_subvolume_inode(path.dentry->d_inode)) { 844 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
862 path_put(&path); 845 struct super_block *s = root->d_sb;
863 mntput(mnt); 846 dput(root);
864 error = -EINVAL; 847 root = ERR_PTR(-EINVAL);
848 deactivate_locked_super(s);
865 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", 849 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
866 subvol_name); 850 subvol_name);
867 return ERR_PTR(-EINVAL);
868 } 851 }
869 852
870 /* Get a ref to the sb and the dentry we found and return it */
871 s = path.mnt->mnt_sb;
872 atomic_inc(&s->s_active);
873 root = dget(path.dentry);
874 path_put(&path);
875 down_write(&s->s_umount);
876
877 return root; 853 return root;
878} 854}
879 855
@@ -890,7 +866,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
890 struct super_block *s; 866 struct super_block *s;
891 struct dentry *root; 867 struct dentry *root;
892 struct btrfs_fs_devices *fs_devices = NULL; 868 struct btrfs_fs_devices *fs_devices = NULL;
893 struct btrfs_root *tree_root = NULL;
894 struct btrfs_fs_info *fs_info = NULL; 869 struct btrfs_fs_info *fs_info = NULL;
895 fmode_t mode = FMODE_READ; 870 fmode_t mode = FMODE_READ;
896 char *subvol_name = NULL; 871 char *subvol_name = NULL;
@@ -904,8 +879,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
904 error = btrfs_parse_early_options(data, mode, fs_type, 879 error = btrfs_parse_early_options(data, mode, fs_type,
905 &subvol_name, &subvol_objectid, 880 &subvol_name, &subvol_objectid,
906 &subvol_rootid, &fs_devices); 881 &subvol_rootid, &fs_devices);
907 if (error) 882 if (error) {
883 kfree(subvol_name);
908 return ERR_PTR(error); 884 return ERR_PTR(error);
885 }
909 886
910 if (subvol_name) { 887 if (subvol_name) {
911 root = mount_subvol(subvol_name, flags, device_name, data); 888 root = mount_subvol(subvol_name, flags, device_name, data);
@@ -917,15 +894,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
917 if (error) 894 if (error)
918 return ERR_PTR(error); 895 return ERR_PTR(error);
919 896
920 error = btrfs_open_devices(fs_devices, mode, fs_type);
921 if (error)
922 return ERR_PTR(error);
923
924 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
925 error = -EACCES;
926 goto error_close_devices;
927 }
928
929 /* 897 /*
930 * Setup a dummy root and fs_info for test/set super. This is because 898 * Setup a dummy root and fs_info for test/set super. This is because
931 * we don't actually fill this stuff out until open_ctree, but we need 899 * we don't actually fill this stuff out until open_ctree, but we need
@@ -933,24 +901,36 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
933 * then open_ctree will properly initialize everything later. 901 * then open_ctree will properly initialize everything later.
934 */ 902 */
935 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 903 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
936 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 904 if (!fs_info)
937 if (!fs_info || !tree_root) { 905 return ERR_PTR(-ENOMEM);
906
907 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
908 if (!fs_info->tree_root) {
938 error = -ENOMEM; 909 error = -ENOMEM;
939 goto error_close_devices; 910 goto error_fs_info;
940 } 911 }
941 fs_info->tree_root = tree_root; 912 fs_info->tree_root->fs_info = fs_info;
942 fs_info->fs_devices = fs_devices; 913 fs_info->fs_devices = fs_devices;
943 tree_root->fs_info = fs_info;
944 914
945 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 915 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
946 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 916 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
947 if (!fs_info->super_copy || !fs_info->super_for_commit) { 917 if (!fs_info->super_copy || !fs_info->super_for_commit) {
948 error = -ENOMEM; 918 error = -ENOMEM;
919 goto error_fs_info;
920 }
921
922 error = btrfs_open_devices(fs_devices, mode, fs_type);
923 if (error)
924 goto error_fs_info;
925
926 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
927 error = -EACCES;
949 goto error_close_devices; 928 goto error_close_devices;
950 } 929 }
951 930
952 bdev = fs_devices->latest_bdev; 931 bdev = fs_devices->latest_bdev;
953 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 932 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
933 fs_info->tree_root);
954 if (IS_ERR(s)) { 934 if (IS_ERR(s)) {
955 error = PTR_ERR(s); 935 error = PTR_ERR(s);
956 goto error_close_devices; 936 goto error_close_devices;
@@ -959,12 +939,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
959 if (s->s_root) { 939 if (s->s_root) {
960 if ((flags ^ s->s_flags) & MS_RDONLY) { 940 if ((flags ^ s->s_flags) & MS_RDONLY) {
961 deactivate_locked_super(s); 941 deactivate_locked_super(s);
962 return ERR_PTR(-EBUSY); 942 error = -EBUSY;
943 goto error_close_devices;
963 } 944 }
964 945
965 btrfs_close_devices(fs_devices); 946 btrfs_close_devices(fs_devices);
966 free_fs_info(fs_info); 947 free_fs_info(fs_info);
967 kfree(tree_root);
968 } else { 948 } else {
969 char b[BDEVNAME_SIZE]; 949 char b[BDEVNAME_SIZE];
970 950
@@ -991,8 +971,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
991 971
992error_close_devices: 972error_close_devices:
993 btrfs_close_devices(fs_devices); 973 btrfs_close_devices(fs_devices);
974error_fs_info:
994 free_fs_info(fs_info); 975 free_fs_info(fs_info);
995 kfree(tree_root);
996 return ERR_PTR(error); 976 return ERR_PTR(error);
997} 977}
998 978
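
With btrfs_open_devices() moved below the fs_info allocations, the error paths in btrfs_mount() gain a second label: failures that happen before the devices are opened jump to error_fs_info and skip btrfs_close_devices(), while later failures reach error_close_devices and fall through into the same freeing code. A small userspace sketch of that ordered-label cleanup; the helpers here are placeholders, not btrfs code:

#include <stdio.h>
#include <stdlib.h>

static int open_devices(void)   { puts("open devices");  return 0; }
static void close_devices(void) { puts("close devices"); }
static int later_step(void)     { return -1; }	/* pretend sget() failed */

static int mount_like(void)
{
	int error;
	void *fs_info = calloc(1, 64);

	if (!fs_info)
		return -12;		/* nothing opened yet, just return */

	error = open_devices();
	if (error)
		goto error_fs_info;	/* devices never opened, skip closing */

	error = later_step();
	if (error)
		goto error_close_devices;

	free(fs_info);
	return 0;

error_close_devices:
	close_devices();
error_fs_info:
	free(fs_info);
	return error;
}

int main(void)
{
	printf("mount_like() = %d\n", mount_like());
	return 0;
}
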
@@ -1074,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1074 u64 avail_space; 1054 u64 avail_space;
1075 u64 used_space; 1055 u64 used_space;
1076 u64 min_stripe_size; 1056 u64 min_stripe_size;
1077 int min_stripes = 1; 1057 int min_stripes = 1, num_stripes = 1;
1078 int i = 0, nr_devices; 1058 int i = 0, nr_devices;
1079 int ret; 1059 int ret;
1080 1060
1081 nr_devices = fs_info->fs_devices->rw_devices; 1061 nr_devices = fs_info->fs_devices->open_devices;
1082 BUG_ON(!nr_devices); 1062 BUG_ON(!nr_devices);
1083 1063
1084 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1064 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1088,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1088 1068
1089 /* calc min stripe number for data space alloction */ 1069 /* calc min stripe number for data space alloction */
1090 type = btrfs_get_alloc_profile(root, 1); 1070 type = btrfs_get_alloc_profile(root, 1);
1091 if (type & BTRFS_BLOCK_GROUP_RAID0) 1071 if (type & BTRFS_BLOCK_GROUP_RAID0) {
1092 min_stripes = 2; 1072 min_stripes = 2;
1093 else if (type & BTRFS_BLOCK_GROUP_RAID1) 1073 num_stripes = nr_devices;
1074 } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
1094 min_stripes = 2; 1075 min_stripes = 2;
1095 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1076 num_stripes = 2;
1077 } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
1096 min_stripes = 4; 1078 min_stripes = 4;
1079 num_stripes = 4;
1080 }
1097 1081
1098 if (type & BTRFS_BLOCK_GROUP_DUP) 1082 if (type & BTRFS_BLOCK_GROUP_DUP)
1099 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1083 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1100 else 1084 else
1101 min_stripe_size = BTRFS_STRIPE_LEN; 1085 min_stripe_size = BTRFS_STRIPE_LEN;
1102 1086
1103 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1087 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1104 if (!device->in_fs_metadata) 1088 if (!device->in_fs_metadata || !device->bdev)
1105 continue; 1089 continue;
1106 1090
1107 avail_space = device->total_bytes - device->bytes_used; 1091 avail_space = device->total_bytes - device->bytes_used;
@@ -1162,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1162 i = nr_devices - 1; 1146 i = nr_devices - 1;
1163 avail_space = 0; 1147 avail_space = 0;
1164 while (nr_devices >= min_stripes) { 1148 while (nr_devices >= min_stripes) {
1149 if (num_stripes > nr_devices)
1150 num_stripes = nr_devices;
1151
1165 if (devices_info[i].max_avail >= min_stripe_size) { 1152 if (devices_info[i].max_avail >= min_stripe_size) {
1166 int j; 1153 int j;
1167 u64 alloc_size; 1154 u64 alloc_size;
1168 1155
1169 avail_space += devices_info[i].max_avail * min_stripes; 1156 avail_space += devices_info[i].max_avail * num_stripes;
1170 alloc_size = devices_info[i].max_avail; 1157 alloc_size = devices_info[i].max_avail;
1171 for (j = i + 1 - min_stripes; j <= i; j++) 1158 for (j = i + 1 - num_stripes; j <= i; j++)
1172 devices_info[j].max_avail -= alloc_size; 1159 devices_info[j].max_avail -= alloc_size;
1173 } 1160 }
1174 i--; 1161 i--;
@@ -1285,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb)
1285 return 0; 1272 return 0;
1286} 1273}
1287 1274
1275static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1276{
1277 int ret;
1278
1279 ret = btrfs_dirty_inode(inode);
1280 if (ret)
1281 printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
1282 "error %d\n", btrfs_ino(inode), ret);
1283}
1284
1288static const struct super_operations btrfs_super_ops = { 1285static const struct super_operations btrfs_super_ops = {
1289 .drop_inode = btrfs_drop_inode, 1286 .drop_inode = btrfs_drop_inode,
1290 .evict_inode = btrfs_evict_inode, 1287 .evict_inode = btrfs_evict_inode,
@@ -1292,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = {
1292 .sync_fs = btrfs_sync_fs, 1289 .sync_fs = btrfs_sync_fs,
1293 .show_options = btrfs_show_options, 1290 .show_options = btrfs_show_options,
1294 .write_inode = btrfs_write_inode, 1291 .write_inode = btrfs_write_inode,
1295 .dirty_inode = btrfs_dirty_inode, 1292 .dirty_inode = btrfs_fs_dirty_inode,
1296 .alloc_inode = btrfs_alloc_inode, 1293 .alloc_inode = btrfs_alloc_inode,
1297 .destroy_inode = btrfs_destroy_inode, 1294 .destroy_inode = btrfs_destroy_inode,
1298 .statfs = btrfs_statfs, 1295 .statfs = btrfs_statfs,
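
Because the VFS ->dirty_inode hook returns void, btrfs_fs_dirty_inode() can only report a failure from btrfs_dirty_inode() by logging it, rate-limited. A userspace approximation of wrapping an int-returning helper behind a void callback, with a crude once-per-second limiter in place of printk_ratelimited(); the names and error values are illustrative:

#include <stdio.h>
#include <time.h>

static int do_dirty(long ino)
{
	return ino == 42 ? -5 : 0;	/* pretend inode 42 hits an I/O error */
}

static void dirty_inode_cb(long ino)	/* void, like ->dirty_inode */
{
	static time_t last;
	int ret = do_dirty(ino);

	if (ret) {
		time_t now = time(NULL);

		if (now != last) {	/* at most one message per second */
			fprintf(stderr, "fail to dirty inode %ld error %d\n",
				ino, ret);
			last = now;
		}
	}
}

int main(void)
{
	dirty_inode_cb(42);
	dirty_inode_cb(42);	/* suppressed within the same second */
	return 0;
}
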
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 960835eaf4da..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
785 785
786 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
787 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
788 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
789 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
790 switch_commit_root(root); 794 switch_commit_root(root);
@@ -882,8 +886,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
882 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
883 887
884 if (to_reserve > 0) { 888 if (to_reserve > 0) {
885 ret = btrfs_block_rsv_add(root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
886 to_reserve); 890 to_reserve);
887 if (ret) { 891 if (ret) {
888 pending->error = ret; 892 pending->error = ret;
889 goto fail; 893 goto fail;
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
947 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
948 free_extent_buffer(old); 952 free_extent_buffer(old);
949 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
950 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
951 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
952 key.offset = trans->transid; 960 key.offset = trans->transid;
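
The force_cow updates above are each followed by smp_wmb() so that readers in should_cow_block() observe the new value once the snapshot state has changed. A tiny C11 sketch of the same publish-then-observe idea, using release/acquire atomics in place of the kernel barrier; force_cow here is just an illustrative global, not the btrfs field:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int force_cow;

static void take_snapshot(void)
{
	/* publish the flag; release ordering mirrors the smp_wmb() pairing */
	atomic_store_explicit(&force_cow, 1, memory_order_release);
}

static int should_cow_block(void)
{
	return atomic_load_explicit(&force_cow, memory_order_acquire);
}

int main(void)
{
	take_snapshot();
	printf("should_cow_block() = %d\n", should_cow_block());
	return 0;
}
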
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f8e2943101a1..f4b839fd3c9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
295 btrfs_requeue_work(&device->work); 295 btrfs_requeue_work(&device->work);
296 goto done; 296 goto done;
297 } 297 }
298 /* unplug every 64 requests just for good measure */
299 if (batch_run % 64 == 0) {
300 blk_finish_plug(&plug);
301 blk_start_plug(&plug);
302 sync_pending = 0;
303 }
298 } 304 }
299 305
300 cond_resched(); 306 cond_resched();
@@ -999,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
999 key.objectid = device->devid; 1005 key.objectid = device->devid;
1000 key.offset = start; 1006 key.offset = start;
1001 key.type = BTRFS_DEV_EXTENT_KEY; 1007 key.type = BTRFS_DEV_EXTENT_KEY;
1002 1008again:
1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1009 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1004 if (ret > 0) { 1010 if (ret > 0) {
1005 ret = btrfs_previous_item(root, path, key.objectid, 1011 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1012,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1012 struct btrfs_dev_extent); 1018 struct btrfs_dev_extent);
1013 BUG_ON(found_key.offset > start || found_key.offset + 1019 BUG_ON(found_key.offset > start || found_key.offset +
1014 btrfs_dev_extent_length(leaf, extent) < start); 1020 btrfs_dev_extent_length(leaf, extent) < start);
1021 key = found_key;
1022 btrfs_release_path(path);
1023 goto again;
1015 } else if (ret == 0) { 1024 } else if (ret == 0) {
1016 leaf = path->nodes[0]; 1025 leaf = path->nodes[0];
1017 extent = btrfs_item_ptr(leaf, path->slots[0], 1026 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1608,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1608 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1617 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1609 return -EINVAL; 1618 return -EINVAL;
1610 1619
1611 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1620 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1612 root->fs_info->bdev_holder); 1621 root->fs_info->bdev_holder);
1613 if (IS_ERR(bdev)) 1622 if (IS_ERR(bdev))
1614 return PTR_ERR(bdev); 1623 return PTR_ERR(bdev);
@@ -3255,7 +3264,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
3255 */ 3264 */
3256 if (atomic_read(&bbio->error) > bbio->max_errors) { 3265 if (atomic_read(&bbio->error) > bbio->max_errors) {
3257 err = -EIO; 3266 err = -EIO;
3258 } else if (err) { 3267 } else {
3259 /* 3268 /*
3260 * this bio is actually up to date, we didn't 3269 * this bio is actually up to date, we didn't
3261 * go over the max number of errors 3270 * go over the max number of errors
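
The btrfs_end_bio() tweak drops the err check from the else branch, so the success path runs whenever the total error count stays within bbio->max_errors, regardless of what the last completion reported. A toy sketch of deciding the final status from the aggregate count; the constants and names are illustrative, not the kernel code:

#include <stdio.h>

static int final_status(int total_errors, int max_errors)
{
	if (total_errors > max_errors)
		return -5;	/* too many mirrors failed, report an error */
	return 0;		/* within tolerance: the data is up to date */
}

int main(void)
{
	printf("%d\n", final_status(1, 1));	/* one bad mirror, tolerated */
	printf("%d\n", final_status(2, 1));	/* too many failures */
	return 0;
}
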
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f352..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
100 struct reada_zone *reada_curr_zone; 100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones; 101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents; 102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
103}; 109};
104 110
105struct btrfs_fs_devices { 111struct btrfs_fs_devices {
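
The new btrfs_device fields give each device a dedicated bio and a completion for sending flush barriers, plus a nobarriers flag for devices that do not need them. Purely as an analogy, a userspace sketch in which a semaphore plays the part of the flush_wait completion and a worker thread plays the end_io callback; none of these names are btrfs APIs:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

struct dev_flush {
	sem_t done;		/* stand-in for the flush_wait completion */
	int nobarriers;		/* skip flushes entirely when set */
};

static void *flush_endio(void *arg)	/* plays the bi_end_io callback */
{
	struct dev_flush *f = arg;

	/* ...pretend the device has finished the flush here... */
	sem_post(&f->done);
	return NULL;
}

int main(void)
{
	struct dev_flush f = { .nobarriers = 0 };
	pthread_t t;

	sem_init(&f.done, 0, 0);
	if (!f.nobarriers) {
		pthread_create(&t, NULL, flush_endio, &f);
		sem_wait(&f.done);	/* wait_for_completion() analogue */
		pthread_join(t, NULL);
	}
	puts("flush complete");
	sem_destroy(&f.done);
	return 0;
}
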