Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into for-3.18

This is to receive 0a30288da1ae ("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe") which implements __percpu_ref_kill_expedited() to work around the SCSI blk-mq stall. The commit will be reverted and patches implementing the proper fix will be added. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de>
author: Tejun Heo <tj@kernel.org> 2014-09-24 13:00:21 -0400
committer: Tejun Heo <tj@kernel.org> 2014-09-24 13:00:21 -0400
commit: d06efebf0c37d438fcf07057be00dd40fcfce08d (patch)
tree: 31a0786d132aadf4cbb9725f3f444ef6e1052128 /fs/btrfs
parent: bb2e226b3bef596dd56be97df655d857b4603923 (diff)
parent: 0a30288da1aec914e158c2d7a3482a85f632750f (diff)
29 files changed, 1046 insertions, 533 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..fbd76ded9a34 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/workqueue.h>
 #include "async-thread.h"
 #include "ctree.h"
@@ -55,8 +54,39 @@ struct btrfs_workqueue {
        struct __btrfs_workqueue *high;
 };
-static inline struct __btrfs_workqueue
+static void normal_work_helper(struct btrfs_work *work);
-*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+#define BTRFS_WORK_HELPER(name)                                 \
+void btrfs_##name(struct work_struct *arg)                              \
+{                                                                       \
+        struct btrfs_work *work = container_of(arg, struct btrfs_work,  \
+                                               normal_work);            \
+        normal_work_helper(work);                                       \
+}
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
                         int thresh)
 {
        struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
        spin_unlock_irqrestore(lock, flags);
 }
-static void normal_work_helper(struct work_struct *arg)
+static void normal_work_helper(struct btrfs_work *work)
 {
-        struct btrfs_work *work;
        struct __btrfs_workqueue *wq;
        int need_order = 0;
-        work = container_of(arg, struct btrfs_work, normal_work);
        /*
         * We should not touch things inside work in the following cases:
         * 1) after work->func() if it has no ordered_free
@@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg)
                trace_btrfs_all_work_done(work);
 }
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
                     btrfs_func_t func,
                     btrfs_func_t ordered_func,
                     btrfs_func_t ordered_free)
@@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work,
        work->func = func;
        work->ordered_func = ordered_func;
        work->ordered_free = ordered_free;
-        INIT_WORK(&work->normal_work, normal_work_helper);
+        INIT_WORK(&work->normal_work, uniq_func);
        INIT_LIST_HEAD(&work->ordered_list);
        work->flags = 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e9e31c94758f 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
 struct btrfs_workqueue;
 /* Internal use only */
 struct __btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
 struct btrfs_work {
        btrfs_func_t func;
@@ -38,11 +40,35 @@ struct btrfs_work {
        unsigned long flags;
 };
+#define BTRFS_WORK_HELPER_PROTO(name)                                   \
+void btrfs_##name(struct work_struct *arg)
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                                              int flags,
                                              int max_active,
                                              int thresh);
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
                     btrfs_func_t func,
                     btrfs_func_t ordered_func,
                     btrfs_func_t ordered_free);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..54a201dac7f9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
                        }
                        if (ret > 0)
                                goto next;
-                        ret = ulist_add_merge(parents, eb->start,
+                        ret = ulist_add_merge_ptr(parents, eb->start,
-                                              (uintptr_t)eie,
+                                                  eie, (void **)&old, GFP_NOFS);
-                                              (u64 *)&old, GFP_NOFS);
                        if (ret < 0)
                                break;
                        if (!ret && extent_item_pos) {
@@ -1001,16 +1000,19 @@ again:
                                        ret = -EIO;
                                        goto out;
                                }
+                                btrfs_tree_read_lock(eb);
+                                btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                                ret = find_extent_in_eb(eb, bytenr,
                                                        *extent_item_pos, &eie);
+                                btrfs_tree_read_unlock_blocking(eb);
                                free_extent_buffer(eb);
                                if (ret < 0)
                                        goto out;
                                ref->inode_list = eie;
                        }
-                        ret = ulist_add_merge(refs, ref->parent,
+                        ret = ulist_add_merge_ptr(refs, ref->parent,
-                                              (uintptr_t)ref->inode_list,
+                                                  ref->inode_list,
-                                              (u64 *)&eie, GFP_NOFS);
+                                                  (void **)&eie, GFP_NOFS);
                        if (ret < 0)
                                goto out;
                        if (!ret && extent_item_pos) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..56b8522d5767 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -84,12 +84,6 @@ struct btrfs_inode {
         */
        struct list_head delalloc_inodes;
-        /*
-         * list for tracking inodes that must be sent to disk before a
-         * rename or truncate commit
-         */
-        struct list_head ordered_operations;
        /* node for the red-black tree that links inodes in subvolume root */
        struct rb_node rb_node;
@@ -240,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
            BTRFS_I(inode)->last_sub_trans <=
            BTRFS_I(inode)->last_log_commit &&
            BTRFS_I(inode)->last_sub_trans <=
-            BTRFS_I(inode)->root->last_log_commit)
+            BTRFS_I(inode)->root->last_log_commit) {
-                return 1;
+                /*
+                 * After a ranged fsync we might have left some extent maps
+                 * (that fall outside the fsync's range). So return false
+                 * here if the list isn't empty, to make sure btrfs_log_inode()
+                 * will be called and process those extent maps.
+                 */
+                smp_mb();
+                if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+                        return 1;
+        }
        return 0;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..44ee5d2e52a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                ret = btrfs_inc_ref(trans, root, cow, 1);
        else
-                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+                ret = btrfs_inc_ref(trans, root, cow, 0);
        if (ret)
                return ret;
@@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-                        ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        BUG_ON(ret); /* -ENOMEM */
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
-                                ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                BUG_ON(ret); /* -ENOMEM */
-                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
-                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        BUG_ON(ret); /* -ENOMEM */
                }
                if (new_flags != 0) {
@@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
-                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        BUG_ON(ret); /* -ENOMEM */
-                        ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
                clean_tree_block(trans, root, buf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be91397f4e92..8e29b614fe93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
                         u64 min_alloc_size, u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct extent_buffer *buf, int full_backref, int no_quota);
+                  struct extent_buffer *buf, int full_backref);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct extent_buffer *buf, int full_backref, int no_quota);
+                  struct extent_buffer *buf, int full_backref);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 flags,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..a2e90f855d7d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
                return -ENOMEM;
        async_work->delayed_root = delayed_root;
-        btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+        btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
-                        NULL, NULL);
+                        btrfs_async_run_delayed_root, NULL, NULL);
        async_work->nr = nr;
        btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 61dae01788d7..d0d78dc07792 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "print-tree.h"
-#include "async-thread.h"
 #include "locking.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
@@ -60,8 +59,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                                    int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
-                                             struct btrfs_root *root);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_root *root);
@@ -695,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err)
 {
        struct end_io_wq *end_io_wq = bio->bi_private;
        struct btrfs_fs_info *fs_info;
+        struct btrfs_workqueue *wq;
+        btrfs_work_func_t func;
        fs_info = end_io_wq->info;
        end_io_wq->error = err;
-        btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
        if (bio->bi_rw & REQ_WRITE) {
-                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
+                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
-                        btrfs_queue_work(fs_info->endio_meta_write_workers,
+                        wq = fs_info->endio_meta_write_workers;
-                                         &end_io_wq->work);
+                        func = btrfs_endio_meta_write_helper;
-                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
+                } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
-                        btrfs_queue_work(fs_info->endio_freespace_worker,
+                        wq = fs_info->endio_freespace_worker;
-                                         &end_io_wq->work);
+                        func = btrfs_freespace_write_helper;
-                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+                } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
-                        btrfs_queue_work(fs_info->endio_raid56_workers,
+                        wq = fs_info->endio_raid56_workers;
-                                         &end_io_wq->work);
+                        func = btrfs_endio_raid56_helper;
-                else
+                } else {
-                        btrfs_queue_work(fs_info->endio_write_workers,
+                        wq = fs_info->endio_write_workers;
-                                         &end_io_wq->work);
+                        func = btrfs_endio_write_helper;
+                }
        } else {
-                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
-                        btrfs_queue_work(fs_info->endio_raid56_workers,
+                        wq = fs_info->endio_raid56_workers;
-                                         &end_io_wq->work);
+                        func = btrfs_endio_raid56_helper;
-                else if (end_io_wq->metadata)
+                } else if (end_io_wq->metadata) {
-                        btrfs_queue_work(fs_info->endio_meta_workers,
+                        wq = fs_info->endio_meta_workers;
-                                         &end_io_wq->work);
+                        func = btrfs_endio_meta_helper;
-                else
+                } else {
-                        btrfs_queue_work(fs_info->endio_workers,
+                        wq = fs_info->endio_workers;
-                                         &end_io_wq->work);
+                        func = btrfs_endio_helper;
+                }
        }
+        btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+        btrfs_queue_work(wq, &end_io_wq->work);
 }
 /*
@@ -830,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->submit_bio_start = submit_bio_start;
        async->submit_bio_done = submit_bio_done;
-        btrfs_init_work(&async->work, run_one_async_start,
+        btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
                        run_one_async_done, run_one_async_free);
        async->bio_flags = bio_flags;
@@ -3452,7 +3455,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
                btrfs_set_stack_device_generation(dev_item, 0);
                btrfs_set_stack_device_type(dev_item, dev->type);
                btrfs_set_stack_device_id(dev_item, dev->devid);
-                btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+                btrfs_set_stack_device_total_bytes(dev_item,
+                                                   dev->disk_total_bytes);
                btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
@@ -3829,34 +3833,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
        btrfs_cleanup_transaction(root);
 }
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
-                                             struct btrfs_root *root)
-{
-        struct btrfs_inode *btrfs_inode;
-        struct list_head splice;
-        INIT_LIST_HEAD(&splice);
-        mutex_lock(&root->fs_info->ordered_operations_mutex);
-        spin_lock(&root->fs_info->ordered_root_lock);
-        list_splice_init(&t->ordered_operations, &splice);
-        while (!list_empty(&splice)) {
-                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                         ordered_operations);
-                list_del_init(&btrfs_inode->ordered_operations);
-                spin_unlock(&root->fs_info->ordered_root_lock);
-                btrfs_invalidate_inodes(btrfs_inode->root);
-                spin_lock(&root->fs_info->ordered_root_lock);
-        }
-        spin_unlock(&root->fs_info->ordered_root_lock);
-        mutex_unlock(&root->fs_info->ordered_operations_mutex);
-}
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
        struct btrfs_ordered_extent *ordered;
@@ -4093,8 +4069,6 @@ again:
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
 {
-        btrfs_destroy_ordered_operations(cur_trans, root);
        btrfs_destroy_delayed_refs(cur_trans, root);
        cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 94ec71eda86b..caaf015d6e4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        atomic_set(&caching_ctl->count, 1);
-        btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+        btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+                        caching_thread, NULL, NULL);
        spin_lock(&cache->lock);
        /*
@@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
                async->sync = 0;
        init_completion(&async->wait);
-        btrfs_init_work(&async->work, delayed_ref_async_start,
+        btrfs_init_work(&async->work, btrfs_extent_refs_helper,
-                        NULL, NULL);
+                        delayed_ref_async_start, NULL, NULL);
        btrfs_queue_work(root->fs_info->extent_workers, &async->work);
@@ -3057,7 +3058,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                           int full_backref, int inc, int no_quota)
+                           int full_backref, int inc)
 {
        u64 bytenr;
        u64 num_bytes;
@@ -3111,7 +3112,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                           key.offset, no_quota);
+                                           key.offset, 1);
                        if (ret)
                                goto fail;
                } else {
@@ -3119,7 +3120,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        num_bytes = btrfs_level_size(root, level - 1);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, level - 1, 0,
-                                           no_quota);
+                                           1);
                        if (ret)
                                goto fail;
                }
@@ -3130,15 +3131,15 @@ fail:
 }
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct extent_buffer *buf, int full_backref, int no_quota)
+                  struct extent_buffer *buf, int full_backref)
 {
-        return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
+        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
 }
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct extent_buffer *buf, int full_backref, int no_quota)
+                  struct extent_buffer *buf, int full_backref)
 {
-        return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
+        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 }
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 */
 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-        /*
+        u64 num_devices = root->fs_info->fs_devices->rw_devices;
-         * we add in the count of missing devices because we want
-         * to make sure that any RAID levels on a degraded FS
-         * continue to be honored.
-         */
-        u64 num_devices = root->fs_info->fs_devices->rw_devices +
-                root->fs_info->fs_devices->missing_devices;
        u64 target;
        u64 tmp;
@@ -7478,6 +7473,220 @@ reada:
        wc->reada_slot = slot;
 }
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct extent_buffer *eb)
+{
+        int nr = btrfs_header_nritems(eb);
+        int i, extent_type, ret;
+        struct btrfs_key key;
+        struct btrfs_file_extent_item *fi;
+        u64 bytenr, num_bytes;
+        for (i = 0; i < nr; i++) {
+                btrfs_item_key_to_cpu(eb, &key, i);
+                if (key.type != BTRFS_EXTENT_DATA_KEY)
+                        continue;
+                fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+                /* filter out non qgroup-accountable extents  */
+                extent_type = btrfs_file_extent_type(eb, fi);
+                if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                        continue;
+                bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+                if (!bytenr)
+                        continue;
+                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+                ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+                                              root->objectid,
+                                              bytenr, num_bytes,
+                                              BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+                if (ret)
+                        return ret;
+        }
+        return 0;
+}
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have its slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root node's slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+                                struct btrfs_path *path, int root_level)
+{
+        int level = 0;
+        int nr, slot;
+        struct extent_buffer *eb;
+        if (root_level == 0)
+                return 1;
+        while (level <= root_level) {
+                eb = path->nodes[level];
+                nr = btrfs_header_nritems(eb);
+                path->slots[level]++;
+                slot = path->slots[level];
+                if (slot >= nr || level == 0) {
+                        /*
+                         * Don't free the root -  we will detect this
+                         * condition after our loop and return a
+                         * positive value for caller to stop walking the tree.
+                         */
+                        if (level != root_level) {
+                                btrfs_tree_unlock_rw(eb, path->locks[level]);
+                                path->locks[level] = 0;
+                                free_extent_buffer(eb);
+                                path->nodes[level] = NULL;
+                                path->slots[level] = 0;
+                        }
+                } else {
+                        /*
+                         * We have a valid slot to walk back down
+                         * from. Stop here so caller can process these
+                         * new nodes.
+                         */
+                        break;
+                }
+                level++;
+        }
+        eb = path->nodes[root_level];
+        if (path->slots[root_level] >= btrfs_header_nritems(eb))
+                return 1;
+        return 0;
+}
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct extent_buffer *root_eb,
+                                  u64 root_gen,
+                                  int root_level)
+{
+        int ret = 0;
+        int level;
+        struct extent_buffer *eb = root_eb;
+        struct btrfs_path *path = NULL;
+        BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+        BUG_ON(root_eb == NULL);
+        if (!root->fs_info->quota_enabled)
+                return 0;
+        if (!extent_buffer_uptodate(root_eb)) {
+                ret = btrfs_read_buffer(root_eb, root_gen);
+                if (ret)
+                        goto out;
+        }
+        if (root_level == 0) {
+                ret = account_leaf_items(trans, root, root_eb);
+                goto out;
+        }
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /*
+         * Walk down the tree.  Missing extent blocks are filled in as
+         * we go. Metadata is accounted every time we read a new
+         * extent block.
+         *
+         * When we reach a leaf, we account for file extent items in it,
+         * walk back up the tree (adjusting slot pointers as we go)
+         * and restart the search process.
+         */
+        extent_buffer_get(root_eb); /* For path */
+        path->nodes[root_level] = root_eb;
+        path->slots[root_level] = 0;
+        path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+        level = root_level;
+        while (level >= 0) {
+                if (path->nodes[level] == NULL) {
+                        int child_bsize = root->nodesize;
+                        int parent_slot;
+                        u64 child_gen;
+                        u64 child_bytenr;
+                        /* We need to get child blockptr/gen from
+                         * parent before we can read it. */
+                        eb = path->nodes[level + 1];
+                        parent_slot = path->slots[level + 1];
+                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+                        eb = read_tree_block(root, child_bytenr, child_bsize,
+                                             child_gen);
+                        if (!eb || !extent_buffer_uptodate(eb)) {
+                                ret = -EIO;
+                                goto out;
+                        }
+                        path->nodes[level] = eb;
+                        path->slots[level] = 0;
+                        btrfs_tree_read_lock(eb);
+                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+                        ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+                                                root->objectid,
+                                                child_bytenr,
+                                                child_bsize,
+                                                BTRFS_QGROUP_OPER_SUB_SUBTREE,
+                                                0);
+                        if (ret)
+                                goto out;
+                }
+                if (level == 0) {
+                        ret = account_leaf_items(trans, root, path->nodes[level]);
+                        if (ret)
+                                goto out;
+                        /* Nonzero return here means we completed our search */
+                        ret = adjust_slots_upwards(root, path, root_level);
+                        if (ret)
+                                break;
+                        /* Restart search with new slots */
+                        goto walk_down;
+                }
+                level--;
+        }
+        ret = 0;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
 /*
 * helper to process tree block while walking down the tree.
 *
@@ -7532,9 +7741,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
        /* wc->stage == UPDATE_BACKREF */
        if (!(wc->flags[level] & flag)) {
                BUG_ON(!path->locks[level]);
-                ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+                ret = btrfs_inc_ref(trans, root, eb, 1);
                BUG_ON(ret); /* -ENOMEM */
-                ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+                ret = btrfs_dec_ref(trans, root, eb, 0);
                BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
                                                  eb->len, flag,
@@ -7581,6 +7790,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        int level = wc->level;
        int reada = 0;
        int ret = 0;
+        bool need_account = false;
        generation = btrfs_node_ptr_generation(path->nodes[level],
                                               path->slots[level]);
@@ -7626,6 +7836,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        if (wc->stage == DROP_REFERENCE) {
                if (wc->refs[level - 1] > 1) {
+                        need_account = true;
                        if (level == 1 &&
                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                                goto skip;
@@ -7689,6 +7900,16 @@ skip:
                        parent = 0;
                }
+                if (need_account) {
+                        ret = account_shared_subtree(trans, root, next,
+                                                     generation, level - 1);
+                        if (ret) {
+                                printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                                        "%d accounting shared subtree. Quota "
+                                        "is out of sync, rescan required.\n",
+                                        root->fs_info->sb->s_id, ret);
+                        }
+                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
                                root->root_key.objectid, level - 1, 0, 0);
                BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +7990,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
        if (wc->refs[level] == 1) {
                if (level == 0) {
                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-                                ret = btrfs_dec_ref(trans, root, eb, 1,
+                                ret = btrfs_dec_ref(trans, root, eb, 1);
-                                                    wc->for_reloc);
                        else
-                                ret = btrfs_dec_ref(trans, root, eb, 0,
+                                ret = btrfs_dec_ref(trans, root, eb, 0);
-                                                    wc->for_reloc);
                        BUG_ON(ret); /* -ENOMEM */
+                        ret = account_leaf_items(trans, root, eb);
+                        if (ret) {
+                                printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                                        "%d accounting leaf items. Quota "
+                                        "is out of sync, rescan required.\n",
+                                        root->fs_info->sb->s_id, ret);
+                        }
                }
                /* make block locked assertion in clean_tree_block happy */
                if (!path->locks[level] &&
@@ -7900,6 +8126,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        int level;
        bool root_dropped = false;
+        btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
@@ -8025,6 +8253,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                goto out_end_trans;
                        }
+                        /*
+                         * Qgroup update accounting is run from
+                         * delayed ref handling. This usually works
+                         * out because delayed refs are normally the
+                         * only way qgroup updates are added. However,
+                         * we may have added updates during our tree
+                         * walk so run qgroups here to make sure we
+                         * don't lose any updates.
+                         */
+                        ret = btrfs_delayed_qgroup_accounting(trans,
+                                                              root->fs_info);
+                        if (ret)
+                                printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+                                                   "running qgroup updates "
+                                                   "during snapshot delete. "
+                                                   "Quota is out of sync, "
+                                                   "rescan required.\n", ret);
                        btrfs_end_transaction_throttle(trans, tree_root);
                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
                                pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8324,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
        root_dropped = true;
 out_end_trans:
+        ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+        if (ret)
+                printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+                                   "running qgroup updates "
+                                   "during snapshot delete. "
+                                   "Quota is out of sync, "
+                                   "rescan required.\n", ret);
        btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
        kfree(wc);
@@ -8181,13 +8435,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        if (stripped)
                return extended_to_chunk(stripped);
-        /*
+        num_devices = root->fs_info->fs_devices->rw_devices;
-         * we add in the count of missing devices because we want
-         * to make sure that any RAID levels on a degraded FS
-         * continue to be honored.
-         */
-        num_devices = root->fs_info->fs_devices->rw_devices +
-                root->fs_info->fs_devices->missing_devices;
        stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e11aab9f391..af0359dcf337 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2532,6 +2532,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
                                        uptodate = 0;
+                                offset += len;
                                continue;
                        }
                }
@@ -4207,8 +4208,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                return -ENOMEM;
        path->leave_spinning = 1;
-        start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
+        start = round_down(start, BTRFS_I(inode)->root->sectorsize);
-        len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+        len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
        /*
         * lookup the last file extent.  We're not using i_size here
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..54c84daec9b5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -756,7 +756,7 @@ again:
                                found_next = 1;
                        if (ret != 0)
                                goto insert;
-                        slot = 0;
+                        slot = path->slots[0];
                }
                btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
                if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..ff1cc0399b9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1838,6 +1838,8 @@ out:
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+        if (filp->private_data)
+                btrfs_ioctl_trans_end(filp);
        /*
         * ordered_data_close is set by setattr when we are about to truncate
         * a file from a non-zero size to a zero size.  This tries to
@@ -1845,26 +1847,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
         * application were using truncate to replace a file in place.
         */
        if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-                               &BTRFS_I(inode)->runtime_flags)) {
+                               &BTRFS_I(inode)->runtime_flags))
-                struct btrfs_trans_handle *trans;
-                struct btrfs_root *root = BTRFS_I(inode)->root;
-                /*
-                 * We need to block on a committing transaction to keep us from
-                 * throwing a ordered operation on to the list and causing
-                 * something like sync to deadlock trying to flush out this
-                 * inode.
-                 */
-                trans = btrfs_start_transaction(root, 0);
-                if (IS_ERR(trans))
-                        return PTR_ERR(trans);
-                btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
-                btrfs_end_transaction(trans, root);
-                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                        filemap_flush(inode->i_mapping);
-        }
-        if (filp->private_data)
-                btrfs_ioctl_trans_end(filp);
        return 0;
 }
@@ -1982,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        btrfs_init_log_ctx(&ctx);
-        ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+        ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
        if (ret < 0) {
                /* Fallthrough and commit/free transaction. */
                ret = 1;
@@ -2112,10 +2096,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
                goto out;
        }
-        if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+        if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
                u64 num_bytes;
-                path->slots[0]++;
                key.offset = offset;
                btrfs_set_item_key_safe(root, path, &key);
                fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2240,7 +2223,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                goto out_only_mutex;
        }
-        lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+        lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
        lockend = round_down(offset + len,
                             BTRFS_I(inode)->root->sectorsize) - 1;
        same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2301,7 +2284,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                                                tail_start + tail_len, 0, 1);
                                if (ret)
                                        goto out_only_mutex;
-                                }
+                        }
                }
        }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3183742d6f0d..016c403bfe7e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -709,6 +709,18 @@ retry:
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);
+                                /*
+                                 * we need to redirty the pages if we decide to
+                                 * fallback to uncompressed IO, otherwise we
+                                 * will not submit these pages down to lower
+                                 * layers.
+                                 */
+                                extent_range_redirty_for_io(inode,
+                                                async_extent->start,
+                                                async_extent->start +
+                                                async_extent->ram_size - 1);
                                goto retry;
                        }
                        goto out_free;
@@ -766,8 +778,12 @@ retry:
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
-                if (ret)
+                if (ret) {
+                        btrfs_drop_extent_cache(inode, async_extent->start,
+                                                async_extent->start +
+                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
+                }
                /*
                 * clear dirty, set writeback and unlock the pages.
@@ -959,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode,
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
-                        goto out_reserve;
+                        goto out_drop_extent_cache;
                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret)
-                                goto out_reserve;
+                                goto out_drop_extent_cache;
                }
                if (disk_num_bytes < cur_alloc_size)
@@ -994,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode,
 out:
        return ret;
+out_drop_extent_cache:
+        btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
 out_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
@@ -1084,8 +1102,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);
-                btrfs_init_work(&async_cow->work, async_cow_start,
+                btrfs_init_work(&async_cow->work,
-                                async_cow_submit, async_cow_free);
+                                btrfs_delalloc_helper,
+                                async_cow_start, async_cow_submit,
+                                async_cow_free);
                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
@@ -1869,7 +1889,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
        SetPageChecked(page);
        page_cache_get(page);
-        btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+                        btrfs_writepage_fixup_worker, NULL, NULL);
        fixup->page = page;
        btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
        return -EBUSY;
@@ -2810,7 +2831,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered_extent = NULL;
-        struct btrfs_workqueue *workers;
+        struct btrfs_workqueue *wq;
+        btrfs_work_func_t func;
        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2819,13 +2841,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                            end - start + 1, uptodate))
                return 0;
-        btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+        if (btrfs_is_free_space_inode(inode)) {
+                wq = root->fs_info->endio_freespace_worker;
+                func = btrfs_freespace_write_helper;
+        } else {
+                wq = root->fs_info->endio_write_workers;
+                func = btrfs_endio_write_helper;
+        }
-        if (btrfs_is_free_space_inode(inode))
+        btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
-                workers = root->fs_info->endio_freespace_worker;
+                        NULL);
-        else
+        btrfs_queue_work(wq, &ordered_extent->work);
-                workers = root->fs_info->endio_write_workers;
-        btrfs_queue_work(workers, &ordered_extent->work);
        return 0;
 }
@@ -4222,7 +4248,8 @@ out:
                        btrfs_abort_transaction(trans, root, ret);
        }
 error:
-        if (last_size != (u64)-1)
+        if (last_size != (u64)-1 &&
+            root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                btrfs_ordered_update_i_size(inode, last_size, NULL);
        btrfs_free_path(path);
        return err;
@@ -4662,6 +4689,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
                remove_extent_mapping(map_tree, em);
                free_extent_map(em);
+                if (need_resched()) {
+                        write_unlock(&map_tree->lock);
+                        cond_resched();
+                        write_lock(&map_tree->lock);
+                }
        }
        write_unlock(&map_tree->lock);
@@ -4684,6 +4716,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
                                 &cached_state, GFP_NOFS);
                free_extent_state(state);
+                cond_resched();
                spin_lock(&io_tree->lock);
        }
        spin_unlock(&io_tree->lock);
@@ -5169,6 +5202,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                        iput(inode);
                        inode = ERR_PTR(ret);
                }
+                /*
+                 * If orphan cleanup did remove any orphans, it means the tree
+                 * was modified and therefore the commit root is not the same as
+                 * the current root anymore. This is a problem, because send
+                 * uses the commit root and therefore can see inode items that
+                 * don't exist in the current root anymore, and for example make
+                 * calls to btrfs_iget, which will do tree lookups based on the
+                 * current root and not on the commit root. Those lookups will
+                 * fail, returning a -ESTALE error, and making send fail with
+                 * that error. So make sure a send does not see any orphans we
+                 * have just removed, and that it will see the same inodes
+                 * regardless of whether a transaction commit happened before
+                 * it started (meaning that the commit root will be the same as
+                 * the current root) or not.
+                 */
+                if (sub_root->node != sub_root->commit_root) {
+                        u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
+                        if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
+                                struct extent_buffer *eb;
+                                /*
+                                 * Assert we can't have races between dentry
+                                 * lookup called through the snapshot creation
+                                 * ioctl and the VFS.
+                                 */
+                                ASSERT(mutex_is_locked(&dir->i_mutex));
+                                down_write(&root->fs_info->commit_root_sem);
+                                eb = sub_root->commit_root;
+                                sub_root->commit_root =
+                                        btrfs_root_node(sub_root);
+                                up_write(&root->fs_info->commit_root_sem);
+                                free_extent_buffer(eb);
+                        }
+                }
        }
        return inode;
@@ -5565,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
        return ret;
 }
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+        struct btrfs_iget_args args;
+        args.location = &BTRFS_I(inode)->location;
+        args.root = BTRFS_I(inode)->root;
+        return insert_inode_locked4(inode,
+                   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+                   btrfs_find_actor, &args);
+}
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct inode *dir,
@@ -5594,6 +5674,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        }
        /*
+         * O_TMPFILE, set link count to 0, so that after this point,
+         * we fill in an inode item with the correct link count.
+         */
+        if (!name)
+                set_nlink(inode, 0);
+        /*
         * we have to initialize this early, so we can reclaim the inode
         * number if we fail afterwards in this function.
         */
@@ -5650,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                sizes[1] = name_len + sizeof(*ref);
        }
+        location = &BTRFS_I(inode)->location;
+        location->objectid = objectid;
+        location->offset = 0;
+        btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+        ret = btrfs_insert_inode_locked(inode);
+        if (ret < 0)
+                goto fail;
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
        if (ret != 0)
-                goto fail;
+                goto fail_unlock;
        inode_init_owner(inode, dir, mode);
        inode_set_bytes(inode, 0);
@@ -5676,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
-        location = &BTRFS_I(inode)->location;
-        location->objectid = objectid;
-        location->offset = 0;
-        btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
        btrfs_inherit_iflags(inode, dir);
        if (S_ISREG(mode)) {
@@ -5691,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                BTRFS_INODE_NODATASUM;
        }
-        btrfs_insert_inode_hash(inode);
        inode_tree_add(inode);
        trace_btrfs_inode_new(inode);
@@ -5706,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                          btrfs_ino(inode), root->root_key.objectid, ret);
        return inode;
+fail_unlock:
+        unlock_new_inode(inode);
 fail:
        if (dir && name)
                BTRFS_I(dir)->index_cnt--;
@@ -5840,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-        if (err) {
-                drop_inode = 1;
-                goto out_unlock;
-        }
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
        * if the filesystem supports xattrs by looking at the
        * ops vector.
        */
        inode->i_op = &btrfs_special_inode_operations;
-        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+        init_special_inode(inode, inode->i_mode, rdev);
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
-                drop_inode = 1;
+                goto out_unlock_inode;
-        else {
-                init_special_inode(inode, inode->i_mode, rdev);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+        if (err) {
+                goto out_unlock_inode;
+        } else {
                btrfs_update_inode(trans, root, inode);
+                unlock_new_inode(inode);
                d_instantiate(dentry, inode);
        }
 out_unlock:
        btrfs_end_transaction(trans, root);
        btrfs_balance_delayed_items(root);
@@ -5871,6 +5964,12 @@ out_unlock:
                iput(inode);
        }
        return err;
+out_unlock_inode:
+        drop_inode = 1;
+        unlock_new_inode(inode);
+        goto out_unlock;
 }
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5905,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
        drop_inode_on_err = 1;
-        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-        if (err)
-                goto out_unlock;
-        err = btrfs_update_inode(trans, root, inode);
-        if (err)
-                goto out_unlock;
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -5922,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        */
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
+        inode->i_mapping->a_ops = &btrfs_aops;
+        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+        if (err)
+                goto out_unlock_inode;
+        err = btrfs_update_inode(trans, root, inode);
+        if (err)
+                goto out_unlock_inode;
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
-                goto out_unlock;
+                goto out_unlock_inode;
-        inode->i_mapping->a_ops = &btrfs_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        unlock_new_inode(inode);
        d_instantiate(dentry, inode);
 out_unlock:
@@ -5941,6 +6040,11 @@ out_unlock:
        btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return err;
+out_unlock_inode:
+        unlock_new_inode(inode);
+        goto out_unlock;
 }
 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6048,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        }
        drop_on_err = 1;
+        /* these must be set before we unlock the inode */
+        inode->i_op = &btrfs_dir_inode_operations;
+        inode->i_fop = &btrfs_dir_file_operations;
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
-                goto out_fail;
+                goto out_fail_inode;
-        inode->i_op = &btrfs_dir_inode_operations;
-        inode->i_fop = &btrfs_dir_file_operations;
        btrfs_i_size_write(inode, 0);
        err = btrfs_update_inode(trans, root, inode);
        if (err)
-                goto out_fail;
+                goto out_fail_inode;
        err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
                             dentry->d_name.len, 0, index);
        if (err)
-                goto out_fail;
+                goto out_fail_inode;
        d_instantiate(dentry, inode);
+        /*
+         * mkdir is special.  We're unlocking after we call d_instantiate
+         * to avoid a race with nfsd calling d_instantiate.
+         */
+        unlock_new_inode(inode);
        drop_on_err = 0;
 out_fail:
@@ -6076,6 +6185,10 @@ out_fail:
        btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return err;
+out_fail_inode:
+        unlock_new_inode(inode);
+        goto out_fail;
 }
 /* helper for btrfs_get_extent.  Given an existing extent in the tree,
@@ -6085,14 +6198,14 @@ out_fail:
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                struct extent_map *existing,
                                struct extent_map *em,
-                                u64 map_start, u64 map_len)
+                                u64 map_start)
 {
        u64 start_diff;
        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
        start_diff = map_start - em->start;
        em->start = map_start;
-        em->len = map_len;
+        em->len = existing->start - em->start;
        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                em->block_start += start_diff;
@@ -6263,6 +6376,8 @@ next:
                        goto not_found;
                if (start + len <= found_key.offset)
                        goto not_found;
+                if (start > found_key.offset)
+                        goto next;
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
@@ -6378,8 +6493,7 @@ insert:
                                                         em->len);
                        if (existing) {
                                err = merge_extent_mapping(em_tree, existing,
-                                                           em, start,
+                                                           em, start);
-                                                           root->sectorsize);
                                free_extent_map(existing);
                                if (err) {
                                        free_extent_map(em);
@@ -7146,7 +7260,8 @@ again:
        if (!ret)
                goto out_test;
-        btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+        btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+                        finish_ordered_fn, NULL, NULL);
        btrfs_queue_work(root->fs_info->endio_write_workers,
                         &ordered->work);
 out_test:
@@ -7294,10 +7409,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        map_length = orig_bio->bi_iter.bi_size;
        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                              &map_length, NULL, 0);
-        if (ret) {
+        if (ret)
-                bio_put(orig_bio);
                return -EIO;
-        }
        if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
@@ -7314,6 +7427,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
        if (!bio)
                return -ENOMEM;
        bio->bi_private = dip;
        bio->bi_end_io = btrfs_end_dio_bio;
        atomic_inc(&dip->pending_bios);
@@ -7522,7 +7636,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        count = iov_iter_count(iter);
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags))
-                filemap_fdatawrite_range(inode->i_mapping, offset, count);
+                filemap_fdatawrite_range(inode->i_mapping, offset,
+                                         offset + count - 1);
        if (rw & WRITE) {
                /*
@@ -7939,27 +8054,6 @@ static int btrfs_truncate(struct inode *inode)
        BUG_ON(ret);
        /*
-         * setattr is responsible for setting the ordered_data_close flag,
-         * but that is only tested during the last file release.  That
-         * could happen well after the next commit, leaving a great big
-         * window where new writes may get lost if someone chooses to write
-         * to this file after truncating to zero
-         *
-         * The inode doesn't have any dirty data here, and so if we commit
-         * this is a noop.  If someone immediately starts writing to the inode
-         * it is very likely we'll catch some of their writes in this
-         * transaction, and the commit will find this file on the ordered
-         * data list with good things to send down.
-         *
-         * This is a best effort solution, there is still a window where
-         * using truncate to replace the contents of the file will
-         * end up with a zero length file after a crash.
-         */
-        if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-                                           &BTRFS_I(inode)->runtime_flags))
-                btrfs_add_ordered_operation(trans, root, inode);
-        /*
         * So if we truncate and then write and fsync we normally would just
         * write the extents that changed, which is a problem if we need to
         * first truncate that entire inode.  So set this flag so we write out
@@ -8050,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
        set_nlink(inode, 1);
        btrfs_i_size_write(inode, 0);
+        unlock_new_inode(inode);
        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
        if (err)
@@ -8106,7 +8201,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
-        INIT_LIST_HEAD(&ei->ordered_operations);
        RB_CLEAR_NODE(&ei->rb_node);
        return inode;
@@ -8146,17 +8240,6 @@ void btrfs_destroy_inode(struct inode *inode)
        if (!root)
                goto free;
-        /*
-         * Make sure we're properly removed from the ordered operation
-         * lists.
-         */
-        smp_mb();
-        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-                spin_lock(&root->fs_info->ordered_root_lock);
-                list_del_init(&BTRFS_I(inode)->ordered_operations);
-                spin_unlock(&root->fs_info->ordered_root_lock);
-        }
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                     &BTRFS_I(inode)->runtime_flags)) {
                btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8421,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        ret = 0;
        /*
-         * we're using rename to replace one file with another.
+         * we're using rename to replace one file with another.  Start IO on it
-         * and the replacement file is large.  Start IO on it now so
+         * now so we don't add too much work to the end of the transaction
-         * we don't add too much work to the end of the transaction
         */
-        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
+        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
-            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                filemap_flush(old_inode->i_mapping);
        /* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8472,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 */
                btrfs_pin_log_trans(root);
        }
-        /*
-         * make sure the inode gets flushed if it is replacing
-         * something.
-         */
-        if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
-                btrfs_add_ordered_operation(trans, root, old_inode);
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
@@ -8524,7 +8599,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
        work->inode = inode;
        work->wait = wait;
        work->delay_iput = delay_iput;
-        btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+        WARN_ON_ONCE(!inode);
+        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+                        btrfs_run_delalloc_work, NULL, NULL);
        return work;
 }
@@ -8728,12 +8805,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-        if (err) {
-                drop_inode = 1;
-                goto out_unlock;
-        }
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -8742,23 +8813,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        */
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
+        inode->i_mapping->a_ops = &btrfs_aops;
+        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+        if (err)
+                goto out_unlock_inode;
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
-                drop_inode = 1;
+                goto out_unlock_inode;
-        else {
-                inode->i_mapping->a_ops = &btrfs_aops;
-                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-        }
-        if (drop_inode)
-                goto out_unlock;
        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
-                drop_inode = 1;
+                goto out_unlock_inode;
-                goto out_unlock;
        }
        key.objectid = btrfs_ino(inode);
        key.offset = 0;
@@ -8767,9 +8837,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        err = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
        if (err) {
-                drop_inode = 1;
                btrfs_free_path(path);
-                goto out_unlock;
+                goto out_unlock_inode;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8793,12 +8862,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode_set_bytes(inode, name_len);
        btrfs_i_size_write(inode, name_len);
        err = btrfs_update_inode(trans, root, inode);
-        if (err)
+        if (err) {
                drop_inode = 1;
+                goto out_unlock_inode;
+        }
+        unlock_new_inode(inode);
+        d_instantiate(dentry, inode);
 out_unlock:
-        if (!err)
-                d_instantiate(dentry, inode);
        btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
@@ -8806,6 +8878,11 @@ out_unlock:
        }
        btrfs_btree_balance_dirty(root);
        return err;
+out_unlock_inode:
+        drop_inode = 1;
+        unlock_new_inode(inode);
+        goto out_unlock;
 }
 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -8989,14 +9066,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
                goto out;
        }
-        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
-        if (ret)
-                goto out;
-        ret = btrfs_update_inode(trans, root, inode);
-        if (ret)
-                goto out;
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
@@ -9004,10 +9073,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+        if (ret)
+                goto out_inode;
+        ret = btrfs_update_inode(trans, root, inode);
+        if (ret)
+                goto out_inode;
        ret = btrfs_orphan_add(trans, inode);
        if (ret)
-                goto out;
+                goto out_inode;
+        /*
+         * We set number of links to 0 in btrfs_new_inode(), and here we set
+         * it to 1 because d_tmpfile() will issue a warning if the count is 0,
+         * through:
+         *
+         *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+         */
+        set_nlink(inode, 1);
+        unlock_new_inode(inode);
        d_tmpfile(dentry, inode);
        mark_inode_dirty(inode);
@@ -9017,8 +9102,12 @@ out:
                iput(inode);
        btrfs_balance_delayed_items(root);
        btrfs_btree_balance_dirty(root);
        return ret;
+out_inode:
+        unlock_new_inode(inode);
+        goto out;
 }
 static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..8a8e29878c34 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        if (ret)
                goto fail;
-        ret = btrfs_orphan_cleanup(pending_snapshot->snap);
-        if (ret)
-                goto fail;
-        /*
-         * If orphan cleanup did remove any orphans, it means the tree was
-         * modified and therefore the commit root is not the same as the
-         * current root anymore. This is a problem, because send uses the
-         * commit root and therefore can see inode items that don't exist
-         * in the current root anymore, and for example make calls to
-         * btrfs_iget, which will do tree lookups based on the current root
-         * and not on the commit root. Those lookups will fail, returning a
-         * -ESTALE error, and making send fail with that error. So make sure
-         * a send does not see any orphans we have just removed, and that it
-         * will see the same inodes regardless of whether a transaction
-         * commit happened before it started (meaning that the commit root
-         * will be the same as the current root) or not.
-         */
-        if (readonly && pending_snapshot->snap->node !=
-            pending_snapshot->snap->commit_root) {
-                trans = btrfs_join_transaction(pending_snapshot->snap);
-                if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
-                        ret = PTR_ERR(trans);
-                        goto fail;
-                }
-                if (!IS_ERR(trans)) {
-                        ret = btrfs_commit_transaction(trans,
-                                                       pending_snapshot->snap);
-                        if (ret)
-                                goto fail;
-                }
-        }
        inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
@@ -1052,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
                return false;
        next = defrag_lookup_extent(inode, em->start + em->len);
-        if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+        if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
-            (em->block_start + em->block_len == next->block_start))
+                ret = false;
+        else if ((em->block_start + em->block_len == next->block_start) &&
+                 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
                ret = false;
        free_extent_map(next);
@@ -1088,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
        }
        next_mergeable = defrag_check_next_extent(inode, em);
        /*
         * we hit a real extent, if it is big or the next extent is not a
         * real extent, don't bother defragging it
@@ -1735,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
            ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
              BTRFS_SUBVOL_QGROUP_INHERIT)) {
                ret = -EOPNOTSUPP;
-                goto out;
+                goto free_args;
        }
        if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
        if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
                if (vol_args->size > PAGE_CACHE_SIZE) {
                        ret = -EINVAL;
-                        goto out;
+                        goto free_args;
                }
                inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
                if (IS_ERR(inherit)) {
                        ret = PTR_ERR(inherit);
-                        goto out;
+                        goto free_args;
                }
        }
        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
                                              vol_args->fd, subvol, ptr,
                                              readonly, inherit);
+        if (ret)
+                goto free_inherit;
-        if (ret == 0 && ptr &&
+        if (ptr && copy_to_user(arg +
-            copy_to_user(arg +
+                                offsetof(struct btrfs_ioctl_vol_args_v2,
-                         offsetof(struct btrfs_ioctl_vol_args_v2,
+                                        transid),
-                                  transid), ptr, sizeof(*ptr)))
+                                ptr, sizeof(*ptr)))
                ret = -EFAULT;
-out:
-        kfree(vol_args);
+free_inherit:
        kfree(inherit);
+free_args:
+        kfree(vol_args);
        return ret;
 }
@@ -2685,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
-                goto out;
+                goto err_drop;
        }
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2703,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 out:
        kfree(vol_args);
+err_drop:
        mnt_drop_write_file(file);
        return ret;
 }
@@ -3527,7 +3500,8 @@ process_slot:
                        btrfs_mark_buffer_dirty(leaf);
                        btrfs_release_path(path);
-                        last_dest_end = new_key.offset + datal;
+                        last_dest_end = ALIGN(new_key.offset + datal,
+                                              root->sectorsize);
                        ret = clone_finish_inode_update(trans, inode,
                                                        last_dest_end,
                                                        destoff, olen);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
        trace_btrfs_ordered_extent_remove(inode, entry);
-        /*
-         * we have no more ordered extents for this inode and
-         * no dirty pages.  We can safely remove it from the
-         * list of ordered extents
-         */
-        if (RB_EMPTY_ROOT(&tree->tree) &&
-            !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
-                spin_lock(&root->fs_info->ordered_root_lock);
-                list_del_init(&BTRFS_I(inode)->ordered_operations);
-                spin_unlock(&root->fs_info->ordered_root_lock);
-        }
        if (!root->nr_ordered_extents) {
                spin_lock(&root->fs_info->ordered_root_lock);
                BUG_ON(list_empty(&root->ordered_root));
@@ -627,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
                spin_unlock(&root->ordered_extent_lock);
                btrfs_init_work(&ordered->flush_work,
+                                btrfs_flush_delalloc_helper,
                                btrfs_run_ordered_extent_work, NULL, NULL);
                list_add_tail(&ordered->work_list, &works);
                btrfs_queue_work(root->fs_info->flush_workers,
@@ -687,81 +676,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 }
 /*
- * this is used during transaction commit to write all the inodes
- * added to the ordered operation list.  These files must be fully on
- * disk before the transaction commits.
- *
- * we have two modes here, one is to just start the IO via filemap_flush
- * and the other is to wait for all the io.  When we wait, we have an
- * extra check to make sure the ordered operation list really is empty
- * before we return
- */
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root, int wait)
-{
-        struct btrfs_inode *btrfs_inode;
-        struct inode *inode;
-        struct btrfs_transaction *cur_trans = trans->transaction;
-        struct list_head splice;
-        struct list_head works;
-        struct btrfs_delalloc_work *work, *next;
-        int ret = 0;
-        INIT_LIST_HEAD(&splice);
-        INIT_LIST_HEAD(&works);
-        mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
-        spin_lock(&root->fs_info->ordered_root_lock);
-        list_splice_init(&cur_trans->ordered_operations, &splice);
-        while (!list_empty(&splice)) {
-                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                   ordered_operations);
-                inode = &btrfs_inode->vfs_inode;
-                list_del_init(&btrfs_inode->ordered_operations);
-                /*
-                 * the inode may be getting freed (in sys_unlink path).
-                 */
-                inode = igrab(inode);
-                if (!inode)
-                        continue;
-                if (!wait)
-                        list_add_tail(&BTRFS_I(inode)->ordered_operations,
-                                      &cur_trans->ordered_operations);
-                spin_unlock(&root->fs_info->ordered_root_lock);
-                work = btrfs_alloc_delalloc_work(inode, wait, 1);
-                if (!work) {
-                        spin_lock(&root->fs_info->ordered_root_lock);
-                        if (list_empty(&BTRFS_I(inode)->ordered_operations))
-                                list_add_tail(&btrfs_inode->ordered_operations,
-                                              &splice);
-                        list_splice_tail(&splice,
-                                         &cur_trans->ordered_operations);
-                        spin_unlock(&root->fs_info->ordered_root_lock);
-                        ret = -ENOMEM;
-                        goto out;
-                }
-                list_add_tail(&work->list, &works);
-                btrfs_queue_work(root->fs_info->flush_workers,
-                                 &work->work);
-                cond_resched();
-                spin_lock(&root->fs_info->ordered_root_lock);
-        }
-        spin_unlock(&root->fs_info->ordered_root_lock);
-out:
-        list_for_each_entry_safe(work, next, &works, list) {
-                list_del_init(&work->list);
-                btrfs_wait_and_free_delalloc_work(work);
-        }
-        mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
-        return ret;
-}
-/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
@@ -1120,42 +1034,6 @@ out:
        return index;
 }
-/*
- * add a given inode to the list of inodes that must be fully on
- * disk before a transaction commit finishes.
- *
- * This basically gives us the ext3 style data=ordered mode, and it is mostly
- * used to make sure renamed files are fully on disk.
- *
- * It is a noop if the inode is already fully on disk.
- *
- * If trans is not null, we'll do a friendly check for a transaction that
- * is already flushing things and force the IO down ourselves.
- */
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root, struct inode *inode)
-{
-        struct btrfs_transaction *cur_trans = trans->transaction;
-        u64 last_mod;
-        last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
-        /*
-         * if this file hasn't been changed since the last transaction
-         * commit, we can safely return without doing anything
-         */
-        if (last_mod <= root->fs_info->last_trans_committed)
-                return;
-        spin_lock(&root->fs_info->ordered_root_lock);
-        if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
-                list_add_tail(&BTRFS_I(inode)->ordered_operations,
-                              &cur_trans->ordered_operations);
-        }
-        spin_unlock(&root->fs_info->ordered_root_lock);
-}
 int __init ordered_data_init(void)
 {
        btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                           u32 *sum, int len);
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root, int wait);
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root,
-                                 struct inode *inode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..ded5c601d916 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1201,6 +1201,50 @@ out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
 }
+static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
+                           struct btrfs_qgroup_operation *oper2)
+{
+        /*
+         * Three-way comparison of two qgroup operations, keyed on
+         * bytenr first and ref_root second.  Returns -1 when oper1
+         * sorts before oper2, 1 when it sorts after, and 0 when both
+         * fields are equal.
+         *
+         * Ignore seq and type here, we're looking for any operation
+         * at all related to this extent on that root.
+         */
+        if (oper1->bytenr < oper2->bytenr)
+                return -1;
+        if (oper1->bytenr > oper2->bytenr)
+                return 1;
+        if (oper1->ref_root < oper2->ref_root)
+                return -1;
+        if (oper1->ref_root > oper2->ref_root)
+                return 1;
+        return 0;
+}
+static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
+                              struct btrfs_qgroup_operation *oper)
+{       /*
+         * Walk fs_info->qgroup_op_tree looking for an entry that
+         * comp_oper_exist() reports as equal to oper (same bytenr and
+         * ref_root).  Returns -EEXIST if one is found and 0 if the
+         * walk runs off the tree without a match.
+         * fs_info->qgroup_op_lock is taken before the walk and
+         * released before either return.
+         */
+        struct rb_node *n;
+        struct btrfs_qgroup_operation *cur;
+        int cmp;
+        spin_lock(&fs_info->qgroup_op_lock);
+        n = fs_info->qgroup_op_tree.rb_node;    /* start at the tree root */
+        while (n) {
+                cur = rb_entry(n, struct btrfs_qgroup_operation, n);
+                cmp = comp_oper_exist(cur, oper);
+                if (cmp < 0) {          /* cur sorts before oper: go right */
+                        n = n->rb_right;
+                } else if (cmp) {       /* cur sorts after oper: go left */
+                        n = n->rb_left;
+                } else {                /* equal keys: an operation exists */
+                        spin_unlock(&fs_info->qgroup_op_lock);
+                        return -EEXIST;
+                }
+        }
+        spin_unlock(&fs_info->qgroup_op_lock);
+        return 0;
+}
 static int comp_oper(struct btrfs_qgroup_operation *oper1,
                     struct btrfs_qgroup_operation *oper2)
 {
@@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
        oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
        INIT_LIST_HEAD(&oper->elem.list);
        oper->elem.seq = 0;
+        if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
+                /*
+                 * If any operation for this bytenr/ref_root combo
+                 * exists, then we know it's not exclusively owned and
+                 * shouldn't be queued up.
+                 *
+                 * This also catches the case where we have a cloned
+                 * extent that gets queued up multiple times during
+                 * drop snapshot.
+                 */
+                if (qgroup_oper_exists(fs_info, oper)) {
+                        kfree(oper);
+                        return 0;
+                }
+        }
        ret = insert_qgroup_oper(fs_info, oper);
        if (ret) {
                /* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1945,111 @@ out:
 }
 /*
+ * Process a reference to a shared subtree. This type of operation is
+ * queued during snapshot removal when we encounter extents which are
+ * shared between more than one root.
+ *
+ * All roots still referencing oper->bytenr are looked up.  Only when
+ * exactly one root is left, and that root is not oper->ref_root, are any
+ * counters touched: oper->num_bytes is then added to excl and excl_cmpr
+ * of that root's qgroup (if find_qgroup_rb() finds one) and of every
+ * qgroup reachable from it through ->groups.
+ *
+ * Returns -ENOMEM if the parents ulist cannot be allocated, a negative
+ * error from btrfs_find_all_roots() or ulist_add(), and otherwise the
+ * non-negative result of btrfs_find_all_roots() (presumably 0 -- confirm
+ * against that helper).
+ */
+static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
+                                     struct btrfs_fs_info *fs_info,
+                                     struct btrfs_qgroup_operation *oper)
+{
+        struct ulist *roots = NULL;
+        struct ulist_node *unode;
+        struct ulist_iterator uiter;
+        struct btrfs_qgroup_list *glist;
+        struct ulist *parents;
+        int ret = 0;
+        int err;
+        struct btrfs_qgroup *qg;
+        u64 root_obj = 0;
+        struct seq_list elem = {};
+        parents = ulist_alloc(GFP_NOFS);
+        if (!parents)
+                return -ENOMEM;
+        btrfs_get_tree_mod_seq(fs_info, &elem);
+        ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+                                   elem.seq, &roots);
+        btrfs_put_tree_mod_seq(fs_info, &elem);
+        if (ret < 0)
+                goto out;
+        if (roots->nnodes != 1) /* zero or several roots left: nothing to do */
+                goto out;
+        ULIST_ITER_INIT(&uiter);
+        unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
+        /*
+         * If we find our ref root then that means all refs
+         * this extent has to the root have not yet been
+         * deleted. In that case, we do nothing and let the
+         * last ref for this bytenr drive our update.
+         *
+         * This can happen for example if an extent is
+         * referenced multiple times in a snapshot (clone,
+         * etc). If we are in the middle of snapshot removal,
+         * queued updates for such an extent will find the
+         * root if we have not yet finished removing the
+         * snapshot.
+         */
+        if (unode->val == oper->ref_root)
+                goto out;
+        root_obj = unode->val;
+        BUG_ON(!root_obj);
+        spin_lock(&fs_info->qgroup_lock);
+        qg = find_qgroup_rb(fs_info, root_obj);
+        if (!qg)        /* no qgroup for the remaining root: leave with ret as is */
+                goto out_unlock;
+        qg->excl += oper->num_bytes;
+        qg->excl_cmpr += oper->num_bytes;
+        qgroup_dirty(fs_info, qg);
+        /*
+         * Adjust counts for parent groups. First we find all
+         * parents, then in the 2nd loop we do the adjustment
+         * while adding parents of the parents to our ulist.
+         *
+         * Both loops call ulist_add() with GFP_ATOMIC and run with
+         * fs_info->qgroup_lock held.
+         *
+         * NOTE(review): if ulist_add() fails we bail out with the
+         * error, but the qgroups adjusted before that point keep
+         * their new excl/excl_cmpr values.
+         */
+        list_for_each_entry(glist, &qg->groups, next_group) {
+                err = ulist_add(parents, glist->group->qgroupid,
+                                ptr_to_u64(glist->group), GFP_ATOMIC);
+                if (err < 0) {
+                        ret = err;
+                        goto out_unlock;
+                }
+        }
+        ULIST_ITER_INIT(&uiter);
+        while ((unode = ulist_next(parents, &uiter))) {
+                qg = u64_to_ptr(unode->aux);    /* aux holds the qgroup pointer stored by ulist_add() above */
+                qg->excl += oper->num_bytes;
+                qg->excl_cmpr += oper->num_bytes;
+                qgroup_dirty(fs_info, qg);
+                /* Add any parents of the parents */
+                list_for_each_entry(glist, &qg->groups, next_group) {
+                        err = ulist_add(parents, glist->group->qgroupid,
+                                        ptr_to_u64(glist->group), GFP_ATOMIC);
+                        if (err < 0) {
+                                ret = err;
+                                goto out_unlock;
+                        }
+                }
+        }
+out_unlock:
+        spin_unlock(&fs_info->qgroup_lock);
+out:
+        ulist_free(roots);      /* roots is still NULL here if the lookup never filled it in */
+        ulist_free(parents);
+        return ret;
+}
+/*
 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
 * from the fs. First, all roots referencing the extent are searched, and
 * then the space is accounted accordingly to the different roots. The
@@ -1920,6 +2086,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
        case BTRFS_QGROUP_OPER_SUB_SHARED:
                ret = qgroup_shared_accounting(trans, fs_info, oper);
                break;
+        case BTRFS_QGROUP_OPER_SUB_SUBTREE:
+                ret = qgroup_subtree_accounting(trans, fs_info, oper);
+                break;
        default:
                ASSERT(0);
        }
@@ -2551,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
        memset(&fs_info->qgroup_rescan_work, 0,
               sizeof(fs_info->qgroup_rescan_work));
        btrfs_init_work(&fs_info->qgroup_rescan_work,
+                        btrfs_qgroup_rescan_helper,
                        btrfs_qgroup_rescan_worker, NULL, NULL);
        if (ret) {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
        BTRFS_QGROUP_OPER_ADD_SHARED,
        BTRFS_QGROUP_OPER_SUB_EXCL,
        BTRFS_QGROUP_OPER_SUB_SHARED,
+        BTRFS_QGROUP_OPER_SUB_SUBTREE,
 };
 struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..0a6b6e4bcbb9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,7 +1416,8 @@ cleanup:
 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
 {
-        btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
+        btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+                        rmw_work, NULL, NULL);
        btrfs_queue_work(rbio->fs_info->rmw_workers,
                         &rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
 {
-        btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
+        btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+                        read_rebuild_work, NULL, NULL);
        btrfs_queue_work(rbio->fs_info->rmw_workers,
                         &rbio->work);
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
        plug = container_of(cb, struct btrfs_plug_cb, cb);
        if (from_schedule) {
-                btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+                btrfs_init_work(&plug->work, btrfs_rmw_helper,
+                                unplug_work, NULL, NULL);
                btrfs_queue_work(plug->info->rmw_workers,
                                 &plug->work);
                return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..20408c6b665a 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
                /* FIXME we cannot handle this properly right now */
                BUG();
        }
-        btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
+        btrfs_init_work(&rmw->work, btrfs_readahead_helper,
+                        reada_start_machine_worker, NULL, NULL);
        rmw->fs_info = fs_info;
        btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..f4a41f37be22 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
                sbio->index = i;
                sbio->sctx = sctx;
                sbio->page_count = 0;
-                btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+                btrfs_init_work(&sbio->work, btrfs_scrub_helper,
-                                NULL, NULL);
+                                scrub_bio_end_io_worker, NULL, NULL);
                if (i != SCRUB_BIOS_PER_SCTX - 1)
                        sctx->bios[i]->next_free = i + 1;
@@ -999,8 +999,8 @@ nodatasum_case:
                fixup_nodatasum->root = fs_info->extent_root;
                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                scrub_pending_trans_workers_inc(sctx);
-                btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+                btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
-                                NULL, NULL);
+                                scrub_fixup_nodatasum, NULL, NULL);
                btrfs_queue_work(fs_info->scrub_workers,
                                 &fixup_nodatasum->work);
                goto out;
@@ -1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
        sbio->err = err;
        sbio->bio = bio;
-        btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+        btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
+                         scrub_wr_bio_end_io_worker, NULL, NULL);
        btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
 }
@@ -2904,6 +2905,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
        struct scrub_ctx *sctx;
        int ret;
        struct btrfs_device *dev;
+        struct rcu_string *name;
        if (btrfs_fs_closing(fs_info))
                return -EINVAL;
@@ -2965,6 +2967,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                return -ENODEV;
        }
+        if (!is_dev_replace && !readonly && !dev->writeable) {
+                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+                rcu_read_lock();
+                name = rcu_dereference(dev->name);
+                btrfs_err(fs_info, "scrub: device %s is not writable",
+                          name->str);
+                rcu_read_unlock();
+                return -EROFS;
+        }
        mutex_lock(&fs_info->scrub_lock);
        if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
                mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
        nocow_ctx->len = len;
        nocow_ctx->mirror_num = mirror_num;
        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
-        btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
+        btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
+                        copy_nocow_pages_worker, NULL, NULL);
        INIT_LIST_HEAD(&nocow_ctx->inodes);
        btrfs_queue_work(fs_info->scrub_nocow_workers,
                         &nocow_ctx->work);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67b48b9a03e0..c4124de4435b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1665,6 +1665,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
        return 0;
 }
+/*
+ * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
+ *
+ * If there's a redundant raid level at DATA block groups, use the respective
+ * multiplier to scale the sizes.
+ *
+ * Unused device space usage is based on simulating the chunk allocator
+ * algorithm that respects the device sizes, order of allocations and the
+ * 'alloc_start' value, this is a close approximation of the actual use but
+ * there are other factors that may change the result (like a new metadata
+ * chunk).
+ *
+ * FIXME: not accurate for mixed block groups, total and free/used are ok,
+ * available appears slightly larger.
+ */
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1675,6 +1690,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)fs_info->fsid;
+        unsigned factor = 1;
+        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
        int ret;
        /* holding chunk_mutex to avoid allocating new chunks */
@@ -1682,30 +1699,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+                        int i;
                        total_free_data += found->disk_total - found->disk_used;
                        total_free_data -=
                                btrfs_account_ro_block_groups_free_space(found);
+                        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+                                if (!list_empty(&found->block_groups[i])) {
+                                        switch (i) {
+                                        case BTRFS_RAID_DUP:
+                                        case BTRFS_RAID_RAID1:
+                                        case BTRFS_RAID_RAID10:
+                                                factor = 2;
+                                        }
+                                }
+                        }
                }
                total_used += found->disk_used;
        }
        rcu_read_unlock();
-        buf->f_namelen = BTRFS_NAME_LEN;
+        buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
-        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
+        buf->f_blocks >>= bits;
-        buf->f_bfree = buf->f_blocks - (total_used >> bits);
+        buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
-        buf->f_bsize = dentry->d_sb->s_blocksize;
-        buf->f_type = BTRFS_SUPER_MAGIC;
+        /* Account global block reserve as used, it's in logical size already */
+        spin_lock(&block_rsv->lock);
+        buf->f_bfree -= block_rsv->size >> bits;
+        spin_unlock(&block_rsv->lock);
        buf->f_bavail = total_free_data;
        ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
        if (ret) {
                mutex_unlock(&fs_info->chunk_mutex);
                return ret;
        }
-        buf->f_bavail += total_free_data;
+        buf->f_bavail += div_u64(total_free_data, factor);
        buf->f_bavail = buf->f_bavail >> bits;
        mutex_unlock(&fs_info->chunk_mutex);
+        buf->f_type = BTRFS_SUPER_MAGIC;
+        buf->f_bsize = dentry->d_sb->s_blocksize;
+        buf->f_namelen = BTRFS_NAME_LEN;
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
           on a big-endian or little-endian host */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..12e53556e214 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -614,7 +614,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
        if (!fs_info->device_dir_kobj)
                return -EINVAL;
-        if (one_device) {
+        if (one_device && one_device->bdev) {
                disk = one_device->bdev->bd_part;
                disk_kobj = &part_to_dev(disk)->kobj;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5f379affdf23..d89c6d3542ca 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
        spin_lock_init(&cur_trans->delayed_refs.lock);
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-        INIT_LIST_HEAD(&cur_trans->ordered_operations);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
-static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
-                                          struct btrfs_root *root)
-{
-        int ret;
-        ret = btrfs_run_delayed_items(trans, root);
-        if (ret)
-                return ret;
-        /*
-         * rename don't use btrfs_join_transaction, so, once we
-         * set the transaction to blocked above, we aren't going
-         * to get any new ordered operations.  We can safely run
-         * it here and no for sure that nothing new will be added
-         * to the list
-         */
-        ret = btrfs_run_ordered_operations(trans, root, 1);
-        return ret;
-}
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
        if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_transaction *prev_trans = NULL;
        int ret;
-        ret = btrfs_run_ordered_operations(trans, root, 0);
-        if (ret) {
-                btrfs_abort_transaction(trans, root, ret);
-                btrfs_end_transaction(trans, root);
-                return ret;
-        }
        /* Stop the commit early if ->aborted is set */
        if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
                ret = cur_trans->aborted;
@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        if (ret)
                goto cleanup_transaction;
-        ret = btrfs_flush_all_pending_stuffs(trans, root);
+        ret = btrfs_run_delayed_items(trans, root);
        if (ret)
                goto cleanup_transaction;
@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                   extwriter_counter_read(cur_trans) == 0);
        /* some pending stuffs might be added after the previous flush. */
-        ret = btrfs_flush_all_pending_stuffs(trans, root);
+        ret = btrfs_run_delayed_items(trans, root);
        if (ret)
                goto cleanup_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..579be51b27e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
-        struct list_head ordered_operations;
        struct list_head pending_chunks;
        struct list_head switch_commits;
        struct btrfs_delayed_ref_root delayed_refs;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9e1f2cd5e67a..1d1ba083ca6e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, struct inode *inode,
+                           struct btrfs_root *root, struct inode *inode,
-                             int inode_only);
+                           int inode_only,
+                           const loff_t start,
+                           const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
        struct list_head ordered_sums;
        int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
        bool has_extents = false;
-        bool need_find_last_extent = (*last_extent == 0);
+        bool need_find_last_extent = true;
        bool done = false;
        INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                 */
                if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
                        has_extents = true;
-                        if (need_find_last_extent &&
+                        if (first_key.objectid == (u64)-1)
-                            first_key.objectid == (u64)-1)
                                first_key = ins_keys[i];
                } else {
                        need_find_last_extent = false;
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
        if (!has_extents)
                return ret;
+        if (need_find_last_extent && *last_extent == first_key.offset) {
+                /*
+                 * We don't have any leafs between our current one and the one
+                 * we processed before that can have file extent items for our
+                 * inode (and have a generation number smaller than our current
+                 * transaction id).
+                 */
+                need_find_last_extent = false;
+        }
        /*
         * Because we use btrfs_search_forward we could skip leaves that were
         * not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
                                               0, 0);
                if (ret)
                        break;
-                *last_extent = offset + len;
+                *last_extent = extent_end;
        }
        /*
         * Need to let the callers know we dropped the path so they should
@@ -3849,8 +3860,10 @@ process:
 * This handles both files and directories.
 */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, struct inode *inode,
+                           struct btrfs_root *root, struct inode *inode,
-                             int inode_only)
+                           int inode_only,
+                           const loff_t start,
+                           const loff_t end)
 {
        struct btrfs_path *path;
        struct btrfs_path *dst_path;
@@ -3867,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        int ins_nr;
        bool fast_search = false;
        u64 ino = btrfs_ino(inode);
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        path = btrfs_alloc_path();
        if (!path)
@@ -4040,13 +4054,35 @@ log_extents:
                        goto out_unlock;
                }
        } else if (inode_only == LOG_INODE_ALL) {
-                struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
                struct extent_map *em, *n;
-                write_lock(&tree->lock);
+                write_lock(&em_tree->lock);
-                list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+                /*
-                        list_del_init(&em->list);
+                 * We can't just remove every em if we're called for a ranged
-                write_unlock(&tree->lock);
+                 * fsync - that is, one that doesn't cover the whole possible
+                 * file range (0 to LLONG_MAX). This is because we can have
+                 * em's that fall outside the range we're logging and therefore
+                 * their ordered operations haven't completed yet
+                 * (btrfs_finish_ordered_io() not invoked yet). This means we
+                 * didn't get their respective file extent item in the fs/subvol
+                 * tree yet, and need to let the next fast fsync (one which
+                 * consults the list of modified extent maps) find the em so
+                 * that it logs a matching file extent item and waits for the
+                 * respective ordered operation to complete (if it's still
+                 * running).
+                 *
+                 * Removing every em outside the range we're logging would make
+                 * the next fast fsync not log their matching file extent items,
+                 * therefore making us lose data after a log replay.
+                 */
+                list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+                                         list) {
+                        const u64 mod_end = em->mod_start + em->mod_len - 1;
+                        if (em->mod_start >= start && mod_end <= end)
+                                list_del_init(&em->list);
+                }
+                write_unlock(&em_tree->lock);
        }
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4056,6 +4092,7 @@ log_extents:
                        goto out_unlock;
                }
        }
        BTRFS_I(inode)->logged_trans = trans->transid;
        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
@@ -4152,7 +4189,10 @@ out:
 */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root, struct inode *inode,
-                                  struct dentry *parent, int exists_only,
+                                  struct dentry *parent,
+                                  const loff_t start,
+                                  const loff_t end,
+                                  int exists_only,
                                  struct btrfs_log_ctx *ctx)
 {
        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4198,7 +4238,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (ret)
                goto end_no_trans;
-        ret = btrfs_log_inode(trans, root, inode, inode_only);
+        ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
        if (ret)
                goto end_trans;
@@ -4226,7 +4266,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (BTRFS_I(inode)->generation >
                    root->fs_info->last_trans_committed) {
-                        ret = btrfs_log_inode(trans, root, inode, inode_only);
+                        ret = btrfs_log_inode(trans, root, inode, inode_only,
+                                              0, LLONG_MAX);
                        if (ret)
                                goto end_trans;
                }
@@ -4260,13 +4301,15 @@ end_no_trans:
 */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry,
+                          const loff_t start,
+                          const loff_t end,
                          struct btrfs_log_ctx *ctx)
 {
        struct dentry *parent = dget_parent(dentry);
        int ret;
        ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
-                                     0, ctx);
+                                     start, end, 0, ctx);
        dput(parent);
        return ret;
@@ -4503,6 +4546,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
                    root->fs_info->last_trans_committed))
                return 0;
-        return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+        return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+                                      LLONG_MAX, 1, NULL);
 }
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..e2e798ae7cd7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry,
+                          const loff_t start,
+                          const loff_t end,
                          struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
 int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
 int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
                    u64 *old_aux, gfp_t gfp_mask);
+/* just like ulist_add_merge() but take a pointer for the aux data */
+static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+                                      void **old_aux, gfp_t gfp_mask)
+{
+#if BITS_PER_LONG == 32
+        u64 old64 = (uintptr_t)*old_aux;
+        int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
+        *old_aux = (void *)((uintptr_t)old64);
+        return ret;
+#else
+        return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
+#endif
+}
 struct ulist_node *ulist_next(struct ulist *ulist,
                              struct ulist_iterator *uiter);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..2c2d6d1d8eee 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -508,6 +508,43 @@ static noinline int device_list_add(const char *path,
                ret = 1;
                device->fs_devices = fs_devices;
        } else if (!device->name || strcmp(device->name->str, path)) {
+                /*
+                 * When FS is already mounted.
+                 * 1. If you are here and if the device->name is NULL that
+                 *    means this device was missing at time of FS mount.
+                 * 2. If you are here and if the device->name is different
+                 *    from 'path' that means either
+                 *      a. The same device disappeared and reappeared with
+                 *         different name. or
+                 *      b. The missing-disk-which-was-replaced, has
+                 *         reappeared now.
+                 *
+                 * We must allow 1 and 2a above. But 2b would be spurious
+                 * and unintentional.
+                 *
+                 * Further, in cases 1 and 2a above, the disk at 'path'
+                 * would have missed some transactions while it was away, and
+                 * in case 2a the stale bdev has to be updated as well.
+                 * 2b must not be allowed at any time.
+                 */
+                /*
+                 * For now, we do allow update to btrfs_fs_device through the
+                 * btrfs dev scan cli after FS has been mounted.  We're still
+                 * tracking a problem where systems fail mount by subvolume id
+                 * when we reject replacement on a mounted FS.
+                 */
+                if (!fs_devices->opened && found_transid < device->generation) {
+                        /*
+                         * That is if the FS is _not_ mounted and if you
+                         * are here, that means there is more than one
+                         * disk with same uuid and devid. We keep the one
+                         * with larger generation number or the last-in if
+                         * generation are equal.
+                         */
+                        return -EEXIST;
+                }
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
@@ -519,6 +556,15 @@ static noinline int device_list_add(const char *path,
                }
        }
+        /*
+         * Unmount does not free the btrfs_device struct but would zero
+         * generation along with most of the other members. So just update
+         * it back. We need it to pick the disk with largest generation
+         * (as above).
+         */
+        if (!fs_devices->opened)
+                device->generation = found_transid;
        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
@@ -1436,7 +1482,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+        btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
@@ -1671,7 +1717,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        device->fs_devices->total_devices--;
        if (device->missing)
-                root->fs_info->fs_devices->missing_devices--;
+                device->fs_devices->missing_devices--;
        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
@@ -1801,8 +1847,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
        if (srcdev->bdev) {
                fs_info->fs_devices->open_devices--;
-                /* zero out the old super */
+                /*
-                btrfs_scratch_superblock(srcdev);
+                 * zero out the old super only if the device is writable
+                 * (a seed device, for example, is not)
+                 */
+                if (srcdev->writeable)
+                        btrfs_scratch_superblock(srcdev);
        }
        call_rcu(&srcdev->rcu, free_device);
@@ -1941,6 +1991,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
+        fs_devices->missing_devices = 0;
+        fs_devices->num_can_discard = 0;
+        fs_devices->rotating = 0;
        fs_devices->seed = seed_devices;
        generate_random_uuid(fs_devices->fsid);
@@ -5800,7 +5853,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
        else
                generate_random_uuid(dev->uuid);
-        btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
+        btrfs_init_work(&dev->work, btrfs_submit_helper,
+                        pending_bios_fn, NULL, NULL);
        return dev;
 }
author	Tejun Heo <tj@kernel.org>	2014-09-24 13:00:21 -0400
committer	Tejun Heo <tj@kernel.org>	2014-09-24 13:00:21 -0400
commit	d06efebf0c37d438fcf07057be00dd40fcfce08d (patch)
tree	31a0786d132aadf4cbb9725f3f444ef6e1052128 /fs/btrfs
parent	bb2e226b3bef596dd56be97df655d857b4603923 (diff)
parent	0a30288da1aec914e158c2d7a3482a85f632750f (diff)